xref: /qemu/target/i386/whpx/whpx-all.c (revision 12d1a768bdfea6e27a3a829228840d72507613a1)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "system/address-spaces.h"
14 #include "system/ioport.h"
15 #include "gdbstub/helpers.h"
16 #include "qemu/accel.h"
17 #include "system/whpx.h"
18 #include "system/cpus.h"
19 #include "system/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/intc/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
29 #include <winerror.h>
30 
31 #include "whpx-internal.h"
32 #include "whpx-accel-ops.h"
33 
34 #include <winhvplatform.h>
35 #include <winhvemulation.h>
36 
37 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
38 
/*
 * Ordered list of every vCPU register exchanged with the hypervisor in a
 * single WHvGet/SetVirtualProcessorRegisters() call.
 *
 * The order is significant: whpx_set_registers() and whpx_get_registers()
 * walk this table with a running index and assert the expected register
 * name at each step, so entries must not be reordered or added without
 * updating both functions.  Commented-out entries are intentionally not
 * synchronized.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};
146 
/*
 * Value buffer parallel to whpx_register_names[]; entry i holds the value
 * for whpx_register_names[i] when talking to the hypervisor.
 */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
150 
151 /*
152  * The current implementation of instruction stepping sets the TF flag
153  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
154  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
155  *
156  * This approach has a few limitations:
157  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
158  *        along with the other flags, possibly restoring it later. It would
159  *        result in another INT1 when the flags are restored, triggering
160  *        a stop in gdb that could be cleared by doing another step.
161  *
162  *        Stepping over a POPF/LAHF instruction will let it overwrite the
163  *        TF flags, ending the stepping mode.
164  *
165  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
166  *        or anything that could result in a page fault) will save the flags
167  *        to the stack, clear the TF flag, and let the guest execute the
168  *        handler. Normally, the guest will restore the original flags,
169  *        that will continue single-stepping.
170  *
171  *     3. Debuggers running on the guest may wish to set TF to do instruction
172  *        stepping. INT1 events generated by it would be intercepted by us,
173  *        as long as the gdb is connected to QEMU.
174  *
175  * In practice this means that:
176  *     1. Stepping through flags-modifying instructions may cause gdb to
177  *        continue or stop in unexpected places. This will be fully recoverable
178  *        and will not crash the target.
179  *
180  *     2. Stepping over an instruction that triggers an exception will step
181  *        over the exception handler, not into it.
182  *
183  *     3. Debugging the guest via gdb, while running debugger on the guest
184  *        at the same time may lead to unexpected effects. Removing all
185  *        breakpoints set via QEMU will prevent any further interference
186  *        with the guest-level debuggers.
187  *
188  * The limitations can be addressed as shown below:
189  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
190  *        stepping through them. The exact semantics of the instructions is
191  *        defined in the "Combined Volume Set of Intel 64 and IA-32
192  *        Architectures Software Developer's Manuals", however it involves a
193  *        fair amount of corner cases due to compatibility with real mode,
194  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
195  *
196  *     2. We could step into the guest's exception handlers using the following
197  *        sequence:
198  *          a. Temporarily enable catching of all exception types via
199  *             whpx_set_exception_exit_bitmap().
200  *          b. Once an exception is intercepted, read the IDT/GDT and locate
201  *             the original handler.
202  *          c. Patch the original handler, injecting an INT3 at the beginning.
203  *          d. Update the exception exit bitmap to only catch the
204  *             WHvX64ExceptionTypeBreakpointTrap exception.
205  *          e. Let the affected CPU run in the exclusive mode.
206  *          f. Restore the original handler and the exception exit bitmap.
207  *        Note that handling all corner cases related to IDT/GDT is harder
208  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
209  *        rough idea.
210  *
211  *     3. In order to properly support guest-level debugging in parallel with
212  *        the QEMU-level debugging, we would need to be able to pass some INT1
213  *        events to the guest. This could be done via the following methods:
214  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
215  *             it seems to only work for interrupts and not software
216  *             exceptions.
217  *          b. Locating and patching the original handler by parsing IDT/GDT.
218  *             This involves relatively complex logic outlined in the previous
219  *             paragraph.
220  *          c. Emulating the exception invocation (i.e. manually updating RIP,
221  *             RFLAGS, and pushing the old values to stack). This is even more
222  *             complicated than the previous option, since it involves checking
223  *             CPL, gate attributes, and doing various adjustments depending
224  *             on the current CPU mode, whether the CPL is changing, etc.
225  */
/*
 * Single-instruction stepping strategy.  See the large comment above for
 * how TF-based stepping works and its limitations.
 */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
231 
/* Per-vCPU WHPX accelerator state, reachable via CPUState::accel. */
struct AccelCPUState {
    /* Instruction emulator instance used for MMIO/PIO exits. */
    WHV_EMULATOR_HANDLE emulator;
    /* NOTE(review): presumably tracks whether an interrupt-window exit was
     * requested — confirm against the run-loop code outside this chunk. */
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    /* Cached CR8-encoded TPR last synchronized with the hypervisor. */
    uint64_t tpr;
    /* Cached APIC base MSR last synchronized with the hypervisor. */
    uint64_t apic_base;
    bool interruption_pending;
    /* QEMU-side register state is newer than the hypervisor's; cleared by
     * whpx_emu_setreg_callback() after the emulator writes registers. */
    bool dirty;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
245 
/* NOTE(review): set during accelerator init — confirm exact semantics in
 * the init code outside this chunk. */
static bool whpx_allowed;
/* Guards one-time population of whp_dispatch (presumably from the
 * WinHvPlatform/WinHvEmulation DLL handles below). */
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
/* XSAVE capabilities reported by the hypervisor; see whpx_has_xsave(). */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;
254 
whpx_has_xsave(void)255 static bool whpx_has_xsave(void)
256 {
257     return whpx_xsave_cap.XsaveSupport;
258 }
259 
whpx_seg_q2h(const SegmentCache * qs,int v86,int r86)260 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
261                                              int r86)
262 {
263     WHV_X64_SEGMENT_REGISTER hs;
264     unsigned flags = qs->flags;
265 
266     hs.Base = qs->base;
267     hs.Limit = qs->limit;
268     hs.Selector = qs->selector;
269 
270     if (v86) {
271         hs.Attributes = 0;
272         hs.SegmentType = 3;
273         hs.Present = 1;
274         hs.DescriptorPrivilegeLevel = 3;
275         hs.NonSystemSegment = 1;
276 
277     } else {
278         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
279 
280         if (r86) {
281             /* hs.Base &= 0xfffff; */
282         }
283     }
284 
285     return hs;
286 }
287 
whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER * hs)288 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
289 {
290     SegmentCache qs;
291 
292     qs.base = hs->Base;
293     qs.limit = hs->Limit;
294     qs.selector = hs->Selector;
295 
296     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
297 
298     return qs;
299 }
300 
301 /* X64 Extended Control Registers */
whpx_set_xcrs(CPUState * cpu)302 static void whpx_set_xcrs(CPUState *cpu)
303 {
304     HRESULT hr;
305     struct whpx_state *whpx = &whpx_global;
306     WHV_REGISTER_VALUE xcr0;
307     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
308 
309     if (!whpx_has_xsave()) {
310         return;
311     }
312 
313     /* Only xcr0 is supported by the hypervisor currently */
314     xcr0.Reg64 = cpu_env(cpu)->xcr0;
315     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
316         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
317     if (FAILED(hr)) {
318         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
319     }
320 }
321 
/*
 * Write env->tsc into the vCPU.  Returns 0 on success, -1 on failure.
 */
static int whpx_set_tsc(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE val;
    HRESULT hr;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {
        /*
         * Failing to suspend here is non-fatal: it only increases the
         * likelihood of TSC skew between vCPUs, which many guest OSes
         * handle gracefully.
         */
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    val.Reg64 = cpu_env(cpu)->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &reg, 1, &val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
        return -1;
    }

    return 0;
}
357 
358 /*
359  * The CR8 register in the CPU is mapped to the TPR register of the APIC,
360  * however, they use a slightly different encoding. Specifically:
361  *
362  *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
363  *
364  * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
365  * and IA-32 Architectures Software Developer's Manual.
366  *
367  * The functions below translate the value of CR8 to TPR and vice versa.
368  */
369 
/* APIC.TPR[7:4] maps to CR8[3:0]; drop the low nibble. */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr >> 4;
}
374 
/* CR8[3:0] maps to APIC.TPR[7:4]; shift back up into the high nibble. */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 << 4;
}
379 
/*
 * Push QEMU's CPU state (env) into the hypervisor vCPU in one
 * WHvSetVirtualProcessorRegisters() call covering whpx_register_names[].
 *
 * The running 'idx' must track that table exactly; the asserts below verify
 * the expected register name (or, where HV enum values coincide with the
 * index, the enum value itself) at every step.
 *
 * 'level' selects the depth of the transfer: MSRs with guest side effects
 * (currently only the TSC) are written only for level >= WHPX_SET_RESET_STATE.
 */
static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    /* Must run on the vCPU's own thread unless the vCPU is stopped. */
    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Following MSRs have side effects on the guest or are too heavy for
     * runtime. Limit them to full state update.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    /* Segment translation needs to know about vm86 and real mode. */
    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches  */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    /* CR8 carries the TPR in its CR8 encoding (see converters above). */
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    /* Merge the x87 TOP (fpstt) back into bits 13:11 of the status word. */
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    /* QEMU's fptags[] uses 1 = empty; HV's FpTag bit uses 1 = valid. */
    for (i = 0; i < 8; ++i) {
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    /* The walk above must have consumed the whole register table. */
    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }
}
553 
/*
 * Read the vCPU's TSC into env->tsc.  Returns 0 on success, -1 on failure.
 */
static int whpx_get_tsc(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE val;
    HRESULT hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &reg, 1, &val);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
        return -1;
    }

    cpu_env(cpu)->tsc = val.Reg64;
    return 0;
}
571 
572 /* X64 Extended Control Registers */
whpx_get_xcrs(CPUState * cpu)573 static void whpx_get_xcrs(CPUState *cpu)
574 {
575     HRESULT hr;
576     struct whpx_state *whpx = &whpx_global;
577     WHV_REGISTER_VALUE xcr0;
578     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
579 
580     if (!whpx_has_xsave()) {
581         return;
582     }
583 
584     /* Only xcr0 is supported by the hypervisor currently */
585     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
586         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
587     if (FAILED(hr)) {
588         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
589         return;
590     }
591 
592     cpu_env(cpu)->xcr0 = xcr0.Reg64;
593 }
594 
/*
 * Pull the vCPU state from the hypervisor into QEMU's env, mirroring
 * whpx_set_registers(): one WHvGetVirtualProcessorRegisters() call over
 * whpx_register_names[], walked with a running 'idx' that the asserts
 * verify at each step.  Also refreshes the TSC (unless cached), XCRs,
 * TPR/APIC base, and finally recomputes env->hflags.
 */
static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    /* Must run on the vCPU's own thread unless the vCPU is stopped. */
    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /* While the VM runs the cached TSC goes stale immediately. */
    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches  */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    tpr = vcxt.values[idx++].Reg64;
    /* Propagate a CR8 change back to the emulated APIC. */
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    /* Split the x87 status word into TOP (fpstt) and the remaining bits. */
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    /* HV's FpTag bit uses 1 = valid; QEMU's fptags[] uses 1 = empty. */
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    /* Propagate an APIC base change back to the emulated APIC. */
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    /* The walk above must have consumed the whole register table. */
    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);
}
773 
whpx_emu_ioport_callback(void * ctx,WHV_EMULATOR_IO_ACCESS_INFO * IoAccess)774 static HRESULT CALLBACK whpx_emu_ioport_callback(
775     void *ctx,
776     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
777 {
778     MemTxAttrs attrs = { 0 };
779     address_space_rw(&address_space_io, IoAccess->Port, attrs,
780                      &IoAccess->Data, IoAccess->AccessSize,
781                      IoAccess->Direction);
782     return S_OK;
783 }
784 
whpx_emu_mmio_callback(void * ctx,WHV_EMULATOR_MEMORY_ACCESS_INFO * ma)785 static HRESULT CALLBACK whpx_emu_mmio_callback(
786     void *ctx,
787     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
788 {
789     cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
790                            ma->Direction);
791     return S_OK;
792 }
793 
whpx_emu_getreg_callback(void * ctx,const WHV_REGISTER_NAME * RegisterNames,UINT32 RegisterCount,WHV_REGISTER_VALUE * RegisterValues)794 static HRESULT CALLBACK whpx_emu_getreg_callback(
795     void *ctx,
796     const WHV_REGISTER_NAME *RegisterNames,
797     UINT32 RegisterCount,
798     WHV_REGISTER_VALUE *RegisterValues)
799 {
800     HRESULT hr;
801     struct whpx_state *whpx = &whpx_global;
802     CPUState *cpu = (CPUState *)ctx;
803 
804     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
805         whpx->partition, cpu->cpu_index,
806         RegisterNames, RegisterCount,
807         RegisterValues);
808     if (FAILED(hr)) {
809         error_report("WHPX: Failed to get virtual processor registers,"
810                      " hr=%08lx", hr);
811     }
812 
813     return hr;
814 }
815 
/* Emulator callback: write vCPU registers on the emulator's behalf.
 * 'ctx' is the CPUState of the vCPU being emulated. */
static HRESULT CALLBACK whpx_emu_setreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    const WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor registers,"
                     " hr=%08lx", hr);
    }

    /*
     * The emulator just successfully wrote the register state. We clear the
     * dirty state so we avoid the double write on resume of the VP.
     * NOTE(review): the flag is cleared even when the write above failed —
     * confirm whether that is intentional.
     */
    cpu->accel->dirty = false;

    return hr;
}
843 
whpx_emu_translate_callback(void * ctx,WHV_GUEST_VIRTUAL_ADDRESS Gva,WHV_TRANSLATE_GVA_FLAGS TranslateFlags,WHV_TRANSLATE_GVA_RESULT_CODE * TranslationResult,WHV_GUEST_PHYSICAL_ADDRESS * Gpa)844 static HRESULT CALLBACK whpx_emu_translate_callback(
845     void *ctx,
846     WHV_GUEST_VIRTUAL_ADDRESS Gva,
847     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
848     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
849     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
850 {
851     HRESULT hr;
852     struct whpx_state *whpx = &whpx_global;
853     CPUState *cpu = (CPUState *)ctx;
854     WHV_TRANSLATE_GVA_RESULT res;
855 
856     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
857                                       Gva, TranslateFlags, &res, Gpa);
858     if (FAILED(hr)) {
859         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
860     } else {
861         *TranslationResult = res.ResultCode;
862     }
863 
864     return hr;
865 }
866 
/*
 * Callback table for the WHPX instruction emulator: routes port I/O,
 * MMIO, register access and GVA translation requests back into QEMU.
 */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};
875 
/*
 * Handles a memory-access VM exit by emulating the faulting instruction.
 * Returns 0 on success, -1 when parsing or emulation failed.
 */
static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS status;
    HRESULT hr;

    hr = whp_dispatch.WHvEmulatorTryMmioEmulation(vcpu->emulator, cpu,
                                                  &vcpu->exit_ctx.VpContext,
                                                  ctx, &status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
        return -1;
    }
    if (!status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate MMIO access with"
                     " EmulatorReturnStatus: %u", status.AsUINT32);
        return -1;
    }
    return 0;
}
899 
/*
 * Handles an I/O-port VM exit by emulating the intercepted IN/OUT
 * instruction. Returns 0 on success, -1 when parsing or emulation failed.
 */
static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS status;
    HRESULT hr;

    hr = whp_dispatch.WHvEmulatorTryIoEmulation(vcpu->emulator, cpu,
                                                &vcpu->exit_ctx.VpContext,
                                                ctx, &status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
        return -1;
    }
    if (!status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate PortIO access with"
                     " EmulatorReturnStatus: %u", status.AsUINT32);
        return -1;
    }
    return 0;
}
924 
925 /*
926  * Controls whether we should intercept various exceptions on the guest,
927  * namely breakpoint/single-step events.
928  *
929  * The 'exceptions' argument accepts a bitmask, e.g:
930  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
931  */
whpx_set_exception_exit_bitmap(UINT64 exceptions)932 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
933 {
934     struct whpx_state *whpx = &whpx_global;
935     WHV_PARTITION_PROPERTY prop = { 0, };
936     HRESULT hr;
937 
938     if (exceptions == whpx->exception_exit_bitmap) {
939         return S_OK;
940     }
941 
942     prop.ExceptionExitBitmap = exceptions;
943 
944     hr = whp_dispatch.WHvSetPartitionProperty(
945         whpx->partition,
946         WHvPartitionPropertyCodeExceptionExitBitmap,
947         &prop,
948         sizeof(WHV_PARTITION_PROPERTY));
949 
950     if (SUCCEEDED(hr)) {
951         whpx->exception_exit_bitmap = exceptions;
952     }
953 
954     return hr;
955 }
956 
957 
958 /*
959  * This function is called before/after stepping over a single instruction.
960  * It will update the CPU registers to arm/disarm the instruction stepping
961  * accordingly.
962  */
whpx_vcpu_configure_single_stepping(CPUState * cpu,bool set,uint64_t * exit_context_rflags)963 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
964     bool set,
965     uint64_t *exit_context_rflags)
966 {
967     WHV_REGISTER_NAME reg_name;
968     WHV_REGISTER_VALUE reg_value;
969     HRESULT hr;
970     struct whpx_state *whpx = &whpx_global;
971 
972     /*
973      * If we are trying to step over a single instruction, we need to set the
974      * TF bit in rflags. Otherwise, clear it.
975      */
976     reg_name = WHvX64RegisterRflags;
977     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
978         whpx->partition,
979         cpu->cpu_index,
980         &reg_name,
981         1,
982         &reg_value);
983 
984     if (FAILED(hr)) {
985         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
986         return hr;
987     }
988 
989     if (exit_context_rflags) {
990         assert(*exit_context_rflags == reg_value.Reg64);
991     }
992 
993     if (set) {
994         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
995         reg_value.Reg64 |= TF_MASK;
996     } else {
997         reg_value.Reg64 &= ~TF_MASK;
998     }
999 
1000     if (exit_context_rflags) {
1001         *exit_context_rflags = reg_value.Reg64;
1002     }
1003 
1004     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1005         whpx->partition,
1006         cpu->cpu_index,
1007         &reg_name,
1008         1,
1009         &reg_value);
1010 
1011     if (FAILED(hr)) {
1012         error_report("WHPX: Failed to set rflags,"
1013             " hr=%08lx",
1014             hr);
1015         return hr;
1016     }
1017 
1018     reg_name = WHvRegisterInterruptState;
1019     reg_value.Reg64 = 0;
1020 
1021     /* Suspend delivery of hardware interrupts during single-stepping. */
1022     reg_value.InterruptState.InterruptShadow = set != 0;
1023 
1024     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1025     whpx->partition,
1026         cpu->cpu_index,
1027         &reg_name,
1028         1,
1029         &reg_value);
1030 
1031     if (FAILED(hr)) {
1032         error_report("WHPX: Failed to set InterruptState,"
1033             " hr=%08lx",
1034             hr);
1035         return hr;
1036     }
1037 
1038     if (!set) {
1039         /*
1040          * We have just finished stepping over a single instruction,
1041          * and intercepted the INT1 generated by it.
1042          * We need to now hide the INT1 from the guest,
1043          * as it would not be expecting it.
1044          */
1045 
1046         reg_name = WHvX64RegisterPendingDebugException;
1047         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1048         whpx->partition,
1049             cpu->cpu_index,
1050             &reg_name,
1051             1,
1052             &reg_value);
1053 
1054         if (FAILED(hr)) {
1055             error_report("WHPX: Failed to get pending debug exceptions,"
1056                          "hr=%08lx", hr);
1057             return hr;
1058         }
1059 
1060         if (reg_value.PendingDebugException.SingleStep) {
1061             reg_value.PendingDebugException.SingleStep = 0;
1062 
1063             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1064                 whpx->partition,
1065                 cpu->cpu_index,
1066                 &reg_name,
1067                 1,
1068                 &reg_value);
1069 
1070             if (FAILED(hr)) {
1071                 error_report("WHPX: Failed to clear pending debug exceptions,"
1072                              "hr=%08lx", hr);
1073              return hr;
1074             }
1075         }
1076 
1077     }
1078 
1079     return S_OK;
1080 }
1081 
1082 /* Tries to find a breakpoint at the specified address. */
whpx_lookup_breakpoint_by_addr(uint64_t address)1083 static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
1084 {
1085     struct whpx_state *whpx = &whpx_global;
1086     int i;
1087 
1088     if (whpx->breakpoints.breakpoints) {
1089         for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
1090             if (address == whpx->breakpoints.breakpoints->data[i].address) {
1091                 return &whpx->breakpoints.breakpoints->data[i];
1092             }
1093         }
1094     }
1095 
1096     return NULL;
1097 }
1098 
1099 /*
1100  * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
1101  * debugging user-mode applications. Since the WHPX API does not offer
1102  * an easy way to pass the intercepted exception back to the guest, we
1103  * resort to using INT1 instead, and let the guest always handle INT3.
1104  */
1105 static const uint8_t whpx_breakpoint_instruction = 0xF1;
1106 
1107 /*
1108  * The WHPX QEMU backend implements breakpoints by writing the INT1
1109  * instruction into memory (ignoring the DRx registers). This raises a few
1110  * issues that need to be carefully handled:
1111  *
1112  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1113  *    at the same location, and later remove them in arbitrary order.
1114  *    This should not cause memory corruption, and should only remove the
1115  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1116  *
1117  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1118  *    physical location. Hence, physically adding/removing a breakpoint can
1119  *    theoretically fail at any time. We need to keep track of it.
1120  *
1121  * The function below rebuilds a list of low-level breakpoints (one per
1122  * address, tracking the original instruction and any errors) from the list of
1123  * high-level breakpoints (set via cpu_breakpoint_insert()).
1124  *
1125  * In order to optimize performance, this function stores the list of
1126  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1127  * low-level ones, so that it won't be re-invoked until these breakpoints
1128  * change.
1129  *
1130  * Note that this function decides which breakpoints should be inserted into,
1131  * memory, but doesn't actually do it. The memory accessing is done in
1132  * whpx_apply_breakpoints().
1133  */
/*
 * Rebuilds the low-level WHPX breakpoint list from the CPU breakpoint
 * list, caching the CPU breakpoint addresses for change detection.
 */
static void whpx_translate_cpu_breakpoints(
    struct whpx_breakpoints *breakpoints,
    CPUState *cpu,
    int cpu_breakpoint_count)
{
    CPUBreakpoint *bp;
    int cpu_bp_index = 0;

    /* Cache the current CPU breakpoint addresses; a later call compares
     * against this list to detect whether a re-translation is needed. */
    breakpoints->original_addresses =
        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);

    breakpoints->original_address_count = cpu_breakpoint_count;

    /* Worst case: every CPU breakpoint is new and every old one is kept. */
    int max_breakpoints = cpu_breakpoint_count +
        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);

    /* The collection carries its data in a flexible trailing array. */
    struct whpx_breakpoint_collection *new_breakpoints =
        g_malloc0(sizeof(struct whpx_breakpoint_collection)
                  + max_breakpoints * sizeof(struct whpx_breakpoint));

    new_breakpoints->allocated = max_breakpoints;
    new_breakpoints->used = 0;

    /*
     * 1. Preserve all old breakpoints that could not be automatically
     * cleared when the CPU got stopped.
     */
    if (breakpoints->breakpoints) {
        int i;
        for (i = 0; i < breakpoints->breakpoints->used; i++) {
            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
                new_breakpoints->data[new_breakpoints->used++] =
                    breakpoints->breakpoints->data[i];
            }
        }
    }

    /* 2. Map all CPU breakpoints to WHPX breakpoints */
    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        int i;
        bool found = false;

        /* This will be used to detect changed CPU breakpoints later. */
        breakpoints->original_addresses[cpu_bp_index++] = bp->pc;

        for (i = 0; i < new_breakpoints->used; i++) {
            /*
             * WARNING: This loop has O(N^2) complexity, where N is the
             * number of breakpoints. It should not be a bottleneck in
             * real-world scenarios, since it only needs to run once after
             * the breakpoints have been modified.
             * If this ever becomes a concern, it can be optimized by storing
             * high-level breakpoint objects in a tree or hash map.
             */

            if (new_breakpoints->data[i].address == bp->pc) {
                /* There was already a breakpoint at this address. */
                if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
                    new_breakpoints->data[i].state = WHPX_BP_SET;
                } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
                    new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
                }

                found = true;
                break;
            }
        }

        if (!found && new_breakpoints->used < new_breakpoints->allocated) {
            /* No WHPX breakpoint at this address. Create one. */
            new_breakpoints->data[new_breakpoints->used].address = bp->pc;
            new_breakpoints->data[new_breakpoints->used].state =
                WHPX_BP_SET_PENDING;
            new_breakpoints->used++;
        }
    }

    /*
     * Free the previous breakpoint list. This can be optimized by keeping
     * it as shadow buffer for the next computation instead of freeing
     * it immediately.
     */
    g_free(breakpoints->breakpoints);

    breakpoints->breakpoints = new_breakpoints;
}
1220 
1221 /*
1222  * Physically inserts/removes the breakpoints by reading and writing the
1223  * physical memory, keeping a track of the failed attempts.
1224  *
1225  * Passing resuming=true  will try to set all previously unset breakpoints.
1226  * Passing resuming=false will remove all inserted ones.
1227  */
whpx_apply_breakpoints(struct whpx_breakpoint_collection * breakpoints,CPUState * cpu,bool resuming)1228 static void whpx_apply_breakpoints(
1229     struct whpx_breakpoint_collection *breakpoints,
1230     CPUState *cpu,
1231     bool resuming)
1232 {
1233     int i, rc;
1234     if (!breakpoints) {
1235         return;
1236     }
1237 
1238     for (i = 0; i < breakpoints->used; i++) {
1239         /* Decide what to do right now based on the last known state. */
1240         WhpxBreakpointState state = breakpoints->data[i].state;
1241         switch (state) {
1242         case WHPX_BP_CLEARED:
1243             if (resuming) {
1244                 state = WHPX_BP_SET_PENDING;
1245             }
1246             break;
1247         case WHPX_BP_SET_PENDING:
1248             if (!resuming) {
1249                 state = WHPX_BP_CLEARED;
1250             }
1251             break;
1252         case WHPX_BP_SET:
1253             if (!resuming) {
1254                 state = WHPX_BP_CLEAR_PENDING;
1255             }
1256             break;
1257         case WHPX_BP_CLEAR_PENDING:
1258             if (resuming) {
1259                 state = WHPX_BP_SET;
1260             }
1261             break;
1262         }
1263 
1264         if (state == WHPX_BP_SET_PENDING) {
1265             /* Remember the original instruction. */
1266             rc = cpu_memory_rw_debug(cpu,
1267                 breakpoints->data[i].address,
1268                 &breakpoints->data[i].original_instruction,
1269                 1,
1270                 false);
1271 
1272             if (!rc) {
1273                 /* Write the breakpoint instruction. */
1274                 rc = cpu_memory_rw_debug(cpu,
1275                     breakpoints->data[i].address,
1276                     (void *)&whpx_breakpoint_instruction,
1277                     1,
1278                     true);
1279             }
1280 
1281             if (!rc) {
1282                 state = WHPX_BP_SET;
1283             }
1284 
1285         }
1286 
1287         if (state == WHPX_BP_CLEAR_PENDING) {
1288             /* Restore the original instruction. */
1289             rc = cpu_memory_rw_debug(cpu,
1290                 breakpoints->data[i].address,
1291                 &breakpoints->data[i].original_instruction,
1292                 1,
1293                 true);
1294 
1295             if (!rc) {
1296                 state = WHPX_BP_CLEARED;
1297             }
1298         }
1299 
1300         breakpoints->data[i].state = state;
1301     }
1302 }
1303 
1304 /*
1305  * This function is called when the a VCPU is about to start and no other
1306  * VCPUs have been started so far. Since the VCPU start order could be
1307  * arbitrary, it doesn't have to be VCPU#0.
1308  *
1309  * It is used to commit the breakpoints into memory, and configure WHPX
1310  * to intercept debug exceptions.
1311  *
1312  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1313  * more VCPUs are already running, so this is the best place to do it.
1314  */
whpx_first_vcpu_starting(CPUState * cpu)1315 static int whpx_first_vcpu_starting(CPUState *cpu)
1316 {
1317     struct whpx_state *whpx = &whpx_global;
1318     HRESULT hr;
1319 
1320     g_assert(bql_locked());
1321 
1322     if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
1323             (whpx->breakpoints.breakpoints &&
1324              whpx->breakpoints.breakpoints->used)) {
1325         CPUBreakpoint *bp;
1326         int i = 0;
1327         bool update_pending = false;
1328 
1329         QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
1330             if (i >= whpx->breakpoints.original_address_count ||
1331                 bp->pc != whpx->breakpoints.original_addresses[i]) {
1332                 update_pending = true;
1333             }
1334 
1335             i++;
1336         }
1337 
1338         if (i != whpx->breakpoints.original_address_count) {
1339             update_pending = true;
1340         }
1341 
1342         if (update_pending) {
1343             /*
1344              * The CPU breakpoints have changed since the last call to
1345              * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
1346              * now be recomputed.
1347              */
1348             whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
1349         }
1350 
1351         /* Actually insert the breakpoints into the memory. */
1352         whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
1353     }
1354 
1355     uint64_t exception_mask;
1356     if (whpx->step_pending ||
1357         (whpx->breakpoints.breakpoints &&
1358          whpx->breakpoints.breakpoints->used)) {
1359         /*
1360          * We are either attempting to single-step one or more CPUs, or
1361          * have one or more breakpoints enabled. Both require intercepting
1362          * the WHvX64ExceptionTypeBreakpointTrap exception.
1363          */
1364 
1365         exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
1366     } else {
1367         /* Let the guest handle all exceptions. */
1368         exception_mask = 0;
1369     }
1370 
1371     hr = whpx_set_exception_exit_bitmap(exception_mask);
1372     if (!SUCCEEDED(hr)) {
1373         error_report("WHPX: Failed to update exception exit mask,"
1374                      "hr=%08lx.", hr);
1375         return 1;
1376     }
1377 
1378     return 0;
1379 }
1380 
1381 /*
1382  * This function is called when the last VCPU has finished running.
1383  * It is used to remove any previously set breakpoints from memory.
1384  */
whpx_last_vcpu_stopping(CPUState * cpu)1385 static int whpx_last_vcpu_stopping(CPUState *cpu)
1386 {
1387     whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
1388     return 0;
1389 }
1390 
1391 /* Returns the address of the next instruction that is about to be executed. */
whpx_vcpu_get_pc(CPUState * cpu,bool exit_context_valid)1392 static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
1393 {
1394     if (cpu->accel->dirty) {
1395         /* The CPU registers have been modified by other parts of QEMU. */
1396         return cpu_env(cpu)->eip;
1397     } else if (exit_context_valid) {
1398         /*
1399          * The CPU registers have not been modified by neither other parts
1400          * of QEMU, nor this port by calling WHvSetVirtualProcessorRegisters().
1401          * This is the most common case.
1402          */
1403         AccelCPUState *vcpu = cpu->accel;
1404         return vcpu->exit_ctx.VpContext.Rip;
1405     } else {
1406         /*
1407          * The CPU registers have been modified by a call to
1408          * WHvSetVirtualProcessorRegisters() and must be re-queried from
1409          * the target.
1410          */
1411         WHV_REGISTER_VALUE reg_value;
1412         WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
1413         HRESULT hr;
1414         struct whpx_state *whpx = &whpx_global;
1415 
1416         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1417             whpx->partition,
1418             cpu->cpu_index,
1419             &reg_name,
1420             1,
1421             &reg_value);
1422 
1423         if (FAILED(hr)) {
1424             error_report("WHPX: Failed to get PC, hr=%08lx", hr);
1425             return 0;
1426         }
1427 
1428         return reg_value.Reg64;
1429     }
1430 }
1431 
/*
 * Handles a HLT exit. Puts the CPU into the halted state unless an
 * interrupt or NMI is already deliverable. Returns 1 when halted, 0
 * when execution should continue.
 */
static int whpx_handle_halt(CPUState *cpu)
{
    int ret = 0;
    bool irq_deliverable;

    bql_lock();
    irq_deliverable =
        ((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (cpu_env(cpu)->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI);
    if (!irq_deliverable) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        ret = 1;
    }
    bql_unlock();

    return ret;
}
1448 
/*
 * Prepares the VCPU for the next WHvRunVirtualProcessor() call: injects a
 * pending NMI or PIC interrupt, syncs the TPR into CR8, and registers for
 * an interrupt-window notification when a hard interrupt cannot be
 * delivered yet. Up to three registers are batched into a single
 * WHvSetVirtualProcessorRegisters() call at the end.
 */
static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    /* Room for PendingInterruption/PendingEvent, Cr8 and
     * DeliverabilityNotifications — at most three registers per run. */
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    bql_lock();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        /* SMI is acknowledged but not delivered (no SMM emulation here). */
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        /* The in-kernel APIC case delivers PIC interrupts as ExtInt events. */
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
     }

    /* Sync the TPR to the CR8 if was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    bql_unlock();
    vcpu->ready_for_pic_interrupt = false;

    /* Commit all queued register updates in one hypervisor call. */
    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }
}
1570 
/*
 * Pulls state back from the exit context after WHvRunVirtualProcessor():
 * mirrors RFLAGS into the QEMU CPU state, propagates a CR8/TPR change to
 * the APIC, and records the interruptibility of the VCPU.
 */
static void whpx_vcpu_post_run(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    uint64_t new_tpr = vcpu->exit_ctx.VpContext.Cr8;

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    if (vcpu->tpr != new_tpr) {
        vcpu->tpr = new_tpr;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        bql_unlock();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
}
1593 
/*
 * Processes interrupt requests that must be handled outside of
 * WHvRunVirtualProcessor(): INIT, APIC poll, SIPI and TPR-access
 * reporting. Also wakes the CPU from halt when an interrupt or NMI
 * became deliverable.
 */
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;

    /* Perform an INIT-triggered reset, unless the CPU is in SMM. */
    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    /* Leave the halted state if an interrupt or NMI can now be taken. */
    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    /* Report a pending TPR access back to the APIC. */
    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }
}
1630 
whpx_vcpu_run(CPUState * cpu)1631 static int whpx_vcpu_run(CPUState *cpu)
1632 {
1633     HRESULT hr;
1634     struct whpx_state *whpx = &whpx_global;
1635     AccelCPUState *vcpu = cpu->accel;
1636     struct whpx_breakpoint *stepped_over_bp = NULL;
1637     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1638     int ret;
1639 
1640     g_assert(bql_locked());
1641 
1642     if (whpx->running_cpus++ == 0) {
1643         /* Insert breakpoints into memory, update exception exit bitmap. */
1644         ret = whpx_first_vcpu_starting(cpu);
1645         if (ret != 0) {
1646             return ret;
1647         }
1648     }
1649 
1650     if (whpx->breakpoints.breakpoints &&
1651         whpx->breakpoints.breakpoints->used > 0)
1652     {
1653         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1654         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1655         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1656             stepped_over_bp = NULL;
1657         }
1658 
1659         if (stepped_over_bp) {
1660             /*
1661              * We are trying to run the instruction overwritten by an active
1662              * breakpoint. We will temporarily disable the breakpoint, suspend
1663              * other CPUs, and step over the instruction.
1664              */
1665             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1666         }
1667     }
1668 
1669     if (exclusive_step_mode == WHPX_STEP_NONE) {
1670         whpx_vcpu_process_async_events(cpu);
1671         if (cpu->halted && !whpx_apic_in_platform()) {
1672             cpu->exception_index = EXCP_HLT;
1673             qatomic_set(&cpu->exit_request, false);
1674             return 0;
1675         }
1676     }
1677 
1678     bql_unlock();
1679 
1680     if (exclusive_step_mode != WHPX_STEP_NONE) {
1681         start_exclusive();
1682         g_assert(cpu == current_cpu);
1683         g_assert(!cpu->running);
1684         cpu->running = true;
1685 
1686         hr = whpx_set_exception_exit_bitmap(
1687             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1688         if (!SUCCEEDED(hr)) {
1689             error_report("WHPX: Failed to update exception exit mask, "
1690                          "hr=%08lx.", hr);
1691             return 1;
1692         }
1693 
1694         if (stepped_over_bp) {
1695             /* Temporarily disable the triggered breakpoint. */
1696             cpu_memory_rw_debug(cpu,
1697                 stepped_over_bp->address,
1698                 &stepped_over_bp->original_instruction,
1699                 1,
1700                 true);
1701         }
1702     } else {
1703         cpu_exec_start(cpu);
1704     }
1705 
1706     do {
1707         if (cpu->accel->dirty) {
1708             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1709             cpu->accel->dirty = false;
1710         }
1711 
1712         if (exclusive_step_mode == WHPX_STEP_NONE) {
1713             whpx_vcpu_pre_run(cpu);
1714 
1715             if (qatomic_read(&cpu->exit_request)) {
1716                 whpx_vcpu_kick(cpu);
1717             }
1718         }
1719 
1720         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1721             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1722         }
1723 
1724         hr = whp_dispatch.WHvRunVirtualProcessor(
1725             whpx->partition, cpu->cpu_index,
1726             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1727 
1728         if (FAILED(hr)) {
1729             error_report("WHPX: Failed to exec a virtual processor,"
1730                          " hr=%08lx", hr);
1731             ret = -1;
1732             break;
1733         }
1734 
1735         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1736             whpx_vcpu_configure_single_stepping(cpu,
1737                 false,
1738                 &vcpu->exit_ctx.VpContext.Rflags);
1739         }
1740 
1741         whpx_vcpu_post_run(cpu);
1742 
1743         switch (vcpu->exit_ctx.ExitReason) {
1744         case WHvRunVpExitReasonMemoryAccess:
1745             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1746             break;
1747 
1748         case WHvRunVpExitReasonX64IoPortAccess:
1749             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1750             break;
1751 
1752         case WHvRunVpExitReasonX64InterruptWindow:
1753             vcpu->ready_for_pic_interrupt = 1;
1754             vcpu->window_registered = 0;
1755             ret = 0;
1756             break;
1757 
1758         case WHvRunVpExitReasonX64ApicEoi:
1759             assert(whpx_apic_in_platform());
1760             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1761             break;
1762 
1763         case WHvRunVpExitReasonX64Halt:
1764             /*
1765              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1766              * longer used.
1767              */
1768             ret = whpx_handle_halt(cpu);
1769             break;
1770 
1771         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1772             WHV_INTERRUPT_CONTROL ipi = {0};
1773             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1774             uint32_t delivery_mode =
1775                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1776             int dest_shorthand =
1777                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1778             bool broadcast = false;
1779             bool include_self = false;
1780             uint32_t i;
1781 
1782             /* We only registered for INIT and SIPI exits. */
1783             if ((delivery_mode != APIC_DM_INIT) &&
1784                 (delivery_mode != APIC_DM_SIPI)) {
1785                 error_report(
1786                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1787                 break;
1788             }
1789 
1790             if (delivery_mode == APIC_DM_INIT) {
1791                 ipi.Type = WHvX64InterruptTypeInit;
1792             } else {
1793                 ipi.Type = WHvX64InterruptTypeSipi;
1794             }
1795 
1796             ipi.DestinationMode =
1797                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1798                     WHvX64InterruptDestinationModeLogical :
1799                     WHvX64InterruptDestinationModePhysical;
1800 
1801             ipi.TriggerMode =
1802                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1803                     WHvX64InterruptTriggerModeLevel :
1804                     WHvX64InterruptTriggerModeEdge;
1805 
1806             ipi.Vector = icr & APIC_VECTOR_MASK;
1807             switch (dest_shorthand) {
1808             /* no shorthand. Bits 56-63 contain the destination. */
1809             case 0:
1810                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1811                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1812                         &ipi, sizeof(ipi));
1813                 if (FAILED(hr)) {
1814                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
1815                         hr);
1816                 }
1817 
1818                 break;
1819 
1820             /* self */
1821             case 1:
1822                 include_self = true;
1823                 break;
1824 
1825             /* broadcast, including self */
1826             case 2:
1827                 broadcast = true;
1828                 include_self = true;
1829                 break;
1830 
1831             /* broadcast, excluding self */
1832             case 3:
1833                 broadcast = true;
1834                 break;
1835             }
1836 
1837             if (!broadcast && !include_self) {
1838                 break;
1839             }
1840 
1841             for (i = 0; i <= max_vcpu_index; i++) {
1842                 if (i == cpu->cpu_index && !include_self) {
1843                     continue;
1844                 }
1845 
1846                 /*
1847                  * Assuming that APIC Ids are identity mapped since
1848                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1849                  * are not handled yet and the hypervisor doesn't allow the
1850                  * guest to modify the APIC ID.
1851                  */
1852                 ipi.Destination = i;
1853                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1854                         &ipi, sizeof(ipi));
1855                 if (FAILED(hr)) {
1856                     error_report(
1857                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1858                         i, hr);
1859                 }
1860             }
1861 
1862             break;
1863         }
1864 
1865         case WHvRunVpExitReasonCanceled:
1866             if (exclusive_step_mode != WHPX_STEP_NONE) {
1867                 /*
1868                  * We are trying to step over a single instruction, and
1869                  * likely got a request to stop from another thread.
1870                  * Delay it until we are done stepping
1871                  * over.
1872                  */
1873                 ret = 0;
1874             } else {
1875                 cpu->exception_index = EXCP_INTERRUPT;
1876                 ret = 1;
1877             }
1878             break;
1879         case WHvRunVpExitReasonX64MsrAccess: {
1880             WHV_REGISTER_VALUE reg_values[3] = {0};
1881             WHV_REGISTER_NAME reg_names[3];
1882             UINT32 reg_count;
1883 
1884             reg_names[0] = WHvX64RegisterRip;
1885             reg_names[1] = WHvX64RegisterRax;
1886             reg_names[2] = WHvX64RegisterRdx;
1887 
1888             reg_values[0].Reg64 =
1889                 vcpu->exit_ctx.VpContext.Rip +
1890                 vcpu->exit_ctx.VpContext.InstructionLength;
1891 
1892             /*
1893              * For all unsupported MSR access we:
1894              *     ignore writes
1895              *     return 0 on read.
1896              */
1897             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1898                         1 : 3;
1899 
1900             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1901                 whpx->partition,
1902                 cpu->cpu_index,
1903                 reg_names, reg_count,
1904                 reg_values);
1905 
1906             if (FAILED(hr)) {
1907                 error_report("WHPX: Failed to set MsrAccess state "
1908                              " registers, hr=%08lx", hr);
1909             }
1910             ret = 0;
1911             break;
1912         }
1913         case WHvRunVpExitReasonX64Cpuid: {
1914             WHV_REGISTER_VALUE reg_values[5];
1915             WHV_REGISTER_NAME reg_names[5];
1916             UINT32 reg_count = 5;
1917             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1918             X86CPU *x86_cpu = X86_CPU(cpu);
1919             CPUX86State *env = &x86_cpu->env;
1920 
1921             memset(reg_values, 0, sizeof(reg_values));
1922 
1923             rip = vcpu->exit_ctx.VpContext.Rip +
1924                   vcpu->exit_ctx.VpContext.InstructionLength;
1925             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1926 
1927             /*
1928              * Ideally, these should be supplied to the hypervisor during VCPU
1929              * initialization and it should be able to satisfy this request.
1930              * But, currently, WHPX doesn't support setting CPUID values in the
1931              * hypervisor once the partition has been setup, which is too late
1932              * since VCPUs are realized later. For now, use the values from
1933              * QEMU to satisfy these requests, until WHPX adds support for
1934              * being able to set these values in the hypervisor at runtime.
1935              */
1936             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1937                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1938             switch (cpuid_fn) {
1939             case 0x40000000:
1940                 /* Expose the vmware cpu frequency cpuid leaf */
1941                 rax = 0x40000010;
1942                 rbx = rcx = rdx = 0;
1943                 break;
1944 
1945             case 0x40000010:
1946                 rax = env->tsc_khz;
1947                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1948                 rcx = rdx = 0;
1949                 break;
1950 
1951             case 0x80000001:
1952                 /* Remove any support of OSVW */
1953                 rcx &= ~CPUID_EXT3_OSVW;
1954                 break;
1955             }
1956 
1957             reg_names[0] = WHvX64RegisterRip;
1958             reg_names[1] = WHvX64RegisterRax;
1959             reg_names[2] = WHvX64RegisterRcx;
1960             reg_names[3] = WHvX64RegisterRdx;
1961             reg_names[4] = WHvX64RegisterRbx;
1962 
1963             reg_values[0].Reg64 = rip;
1964             reg_values[1].Reg64 = rax;
1965             reg_values[2].Reg64 = rcx;
1966             reg_values[3].Reg64 = rdx;
1967             reg_values[4].Reg64 = rbx;
1968 
1969             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1970                 whpx->partition, cpu->cpu_index,
1971                 reg_names,
1972                 reg_count,
1973                 reg_values);
1974 
1975             if (FAILED(hr)) {
1976                 error_report("WHPX: Failed to set CpuidAccess state registers,"
1977                              " hr=%08lx", hr);
1978             }
1979             ret = 0;
1980             break;
1981         }
1982         case WHvRunVpExitReasonException:
1983             whpx_get_registers(cpu);
1984 
1985             if ((vcpu->exit_ctx.VpException.ExceptionType ==
1986                  WHvX64ExceptionTypeDebugTrapOrFault) &&
1987                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
1988                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
1989                  whpx_breakpoint_instruction)) {
1990                 /* Stopped at a software breakpoint. */
1991                 cpu->exception_index = EXCP_DEBUG;
1992             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
1993                         WHvX64ExceptionTypeDebugTrapOrFault) &&
1994                        !cpu->singlestep_enabled) {
1995                 /*
1996                  * Just finished stepping over a breakpoint, but the
1997                  * gdb does not expect us to do single-stepping.
1998                  * Don't do anything special.
1999                  */
2000                 cpu->exception_index = EXCP_INTERRUPT;
2001             } else {
2002                 /* Another exception or debug event. Report it to GDB. */
2003                 cpu->exception_index = EXCP_DEBUG;
2004             }
2005 
2006             ret = 1;
2007             break;
2008         case WHvRunVpExitReasonNone:
2009         case WHvRunVpExitReasonUnrecoverableException:
2010         case WHvRunVpExitReasonInvalidVpRegisterValue:
2011         case WHvRunVpExitReasonUnsupportedFeature:
2012         default:
2013             error_report("WHPX: Unexpected VP exit code %d",
2014                          vcpu->exit_ctx.ExitReason);
2015             whpx_get_registers(cpu);
2016             bql_lock();
2017             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2018             bql_unlock();
2019             break;
2020         }
2021 
2022     } while (!ret);
2023 
2024     if (stepped_over_bp) {
2025         /* Restore the breakpoint we stepped over */
2026         cpu_memory_rw_debug(cpu,
2027             stepped_over_bp->address,
2028             (void *)&whpx_breakpoint_instruction,
2029             1,
2030             true);
2031     }
2032 
2033     if (exclusive_step_mode != WHPX_STEP_NONE) {
2034         g_assert(cpu_in_exclusive_context(cpu));
2035         cpu->running = false;
2036         end_exclusive();
2037 
2038         exclusive_step_mode = WHPX_STEP_NONE;
2039     } else {
2040         cpu_exec_end(cpu);
2041     }
2042 
2043     bql_lock();
2044     current_cpu = cpu;
2045 
2046     if (--whpx->running_cpus == 0) {
2047         whpx_last_vcpu_stopping(cpu);
2048     }
2049 
2050     qatomic_set(&cpu->exit_request, false);
2051 
2052     return ret < 0;
2053 }
2054 
/*
 * Pull the vCPU register state out of the hypervisor into CPUX86State,
 * unless QEMU's copy is already the authoritative one (dirty).  Runs on
 * the vCPU thread via run_on_cpu().
 */
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (cpu->accel->dirty) {
        return;
    }

    whpx_get_registers(cpu);
    cpu->accel->dirty = true;
}
2062 
/*
 * Push QEMU's register state (reset subset) into the hypervisor after a
 * CPU reset; the hypervisor copy becomes authoritative again.
 */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    cpu->accel->dirty = false;
}
2069 
/*
 * Push the complete QEMU register state into the hypervisor after vCPU
 * initialization; the hypervisor copy becomes authoritative again.
 */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    cpu->accel->dirty = false;
}
2076 
/*
 * Mark QEMU's copy of the register state as authoritative before loading
 * a VM snapshot; the hypervisor state will be overwritten on next entry.
 */
static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->accel->dirty = true;
}
2082 
2083 /*
2084  * CPU support.
2085  */
2086 
/*
 * Ensure CPUX86State reflects the current hypervisor register state.
 * No-op if QEMU's copy is already dirty (i.e. authoritative).
 */
void whpx_cpu_synchronize_state(CPUState *cpu)
{
    if (!cpu->accel->dirty) {
        run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
    }
}
2093 
/* Schedule a reset-state register push to the hypervisor on the vCPU thread. */
void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
2098 
/* Schedule a full-state register push to the hypervisor on the vCPU thread. */
void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
2103 
/* Mark QEMU state authoritative before loadvm, on the vCPU thread. */
void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
2108 
/*
 * Record whether single-stepping is expected when vCPUs resume.
 * NOTE(review): presumably set by the gdbstub path before resuming and
 * consumed by whpx_vcpu_run() — the consumer is outside this chunk; confirm.
 */
void whpx_cpu_synchronize_pre_resume(bool step_pending)
{
    whpx_global.step_pending = step_pending;
}
2113 
2114 /*
2115  * Vcpu support.
2116  */
2117 
2118 static Error *whpx_migration_blocker;
2119 
/*
 * VM run-state change hook: whenever the VM (re)enters the running state,
 * invalidate the cached TSC value so it is re-read from the hypervisor.
 */
static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (!running) {
        return;
    }

    env->tsc_valid = false;
}
2128 
/*
 * Create and initialize the WHPX per-vCPU state: the instruction emulator,
 * the hypervisor virtual processor, TSC/APIC-bus frequency discovery, and
 * (optionally) the vmware-cpuid-freq CPUID exit traps.
 *
 * Returns 0 on success, a negative errno value on failure.  On failure,
 * any partially-created per-vCPU resources are torn down again.
 */
int whpx_init_vcpu(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = NULL;
    Error *local_error = NULL;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    UINT64 freq = 0;
    int ret;

    /* Add migration blockers for all unsupported features of the
     * Windows Hypervisor Platform
     */
    if (whpx_migration_blocker == NULL) {
        error_setg(&whpx_migration_blocker,
               "State blocked due to non-migratable CPUID feature support, "
               "dirty memory tracking support, and XSAVE/XRSTOR support");

        if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
            error_report_err(local_error);
            ret = -EINVAL;
            goto error;
        }
    }

    vcpu = g_new0(AccelCPUState, 1);

    hr = whp_dispatch.WHvEmulatorCreateEmulator(
        &whpx_emu_callbacks,
        &vcpu->emulator);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup instruction completion support,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreateVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create a virtual processor,"
                     " hr=%08lx", hr);
        whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
        ret = -EINVAL;
        goto error;
    }

    /*
     * vcpu's TSC frequency is either specified by user, or use the value
     * provided by Hyper-V if the former is not present. In the latter case, we
     * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
     * frequency can be migrated later via this field.
     */
    if (!env->tsc_khz) {
        hr = whp_dispatch.WHvGetCapability(
            WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
                NULL);
        /* Hosts without the capability are tolerated; only report failures. */
        if (hr != WHV_E_UNKNOWN_CAPABILITY) {
            if (FAILED(hr)) {
                printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
            } else {
                env->tsc_khz = freq / 1000; /* Hz to KHz */
            }
        }
    }

    env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
    if (hr != WHV_E_UNKNOWN_CAPABILITY) {
        if (FAILED(hr)) {
            printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
        } else {
            env->apic_bus_freq = freq;
        }
    }

    /*
     * If the vmware cpuid frequency leaf option is set, and we have a valid
     * tsc value, trap the corresponding cpuid's.
     */
    if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
        UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};

        hr = whp_dispatch.WHvSetPartitionProperty(
                whpx->partition,
                WHvPartitionPropertyCodeCpuidExitList,
                cpuidExitList,
                RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                        hr);
            ret = -EINVAL;
            /*
             * Unlike earlier failures, the virtual processor and the
             * emulator already exist here and must be torn down too.
             */
            goto error_destroy_vcpu;
        }
    }

    vcpu->interruptable = true;
    vcpu->dirty = true; /* force a full state push on first vCPU entry */
    cpu->accel = vcpu;
    max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
    qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);

    return 0;

error_destroy_vcpu:
    /* Undo WHvCreateVirtualProcessor() and WHvEmulatorCreateEmulator(). */
    whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
error:
    g_free(vcpu);

    return ret;
}
2241 
/*
 * Outer vCPU execution loop: keep re-entering the hypervisor until a
 * QEMU-level exception (>= EXCP_INTERRUPT) is pending, then hand its
 * value back to the caller.  A fatal error from whpx_vcpu_run() aborts
 * the process.
 */
int whpx_vcpu_exec(CPUState *cpu)
{
    for (;;) {
        if (cpu->exception_index >= EXCP_INTERRUPT) {
            int ret = cpu->exception_index;

            cpu->exception_index = -1;
            return ret;
        }

        if (whpx_vcpu_run(cpu)) {
            error_report("WHPX: Failed to exec a virtual processor");
            abort();
        }
    }
}
2264 
/*
 * Tear down the per-vCPU WHPX state created by whpx_init_vcpu(): delete
 * the hypervisor's virtual processor, destroy the instruction emulator,
 * and release the AccelCPUState allocation.
 */
void whpx_destroy_vcpu(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;

    whp_dispatch.WHvDeleteVirtualProcessor(whpx_global.partition,
                                           cpu->cpu_index);
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
    g_free(vcpu);
}
2274 
/* Force the vCPU out of WHvRunVirtualProcessor() so it re-checks state. */
void whpx_vcpu_kick(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    whp_dispatch.WHvCancelRunVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
}
2281 
2282 /*
2283  * Memory support.
2284  */
2285 
/*
 * Map or unmap one guest-physical range in the WHPX partition.
 *
 * @start_pa: guest physical start address
 * @size:     length of the range in bytes
 * @host_va:  host virtual address backing the range (used when mapping)
 * @add:      nonzero to map the range, zero to unmap it
 * @rom:      nonzero maps read-only (write flag omitted)
 * @name:     region name, used only in the error message
 *
 * Failures are reported but not propagated to the caller.
 */
static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                void *host_va, int add, int rom,
                                const char *name)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    if (!add) {
        hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
                                           start_pa,
                                           size);
    } else {
        /* Read and execute always; write only for non-ROM regions. */
        hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
                                         host_va,
                                         start_pa,
                                         size,
                                         (WHvMapGpaRangeFlagRead |
                                          WHvMapGpaRangeFlagExecute |
                                          (rom ? 0 : WHvMapGpaRangeFlagWrite)));
    }

    if (FAILED(hr)) {
        error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                     " Host:%p, hr=%08lx",
                     (add ? "MAP" : "UNMAP"), name,
                     (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
    }
}
2325 
/*
 * Translate a memory listener section into a WHPX GPA mapping update.
 * Only RAM-backed regions are considered; the range is trimmed to whole
 * host pages and skipped entirely if nothing page-aligned remains.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr gpa = section->offset_within_address_space;
    ram_addr_t len = int128_get64(section->size);
    unsigned int head;
    uintptr_t hva;

    if (!memory_region_is_ram(mr)) {
        return;
    }

    /* Bytes from gpa up to the next host page boundary (0 if aligned). */
    head = qemu_real_host_page_size() - (gpa & ~qemu_real_host_page_mask());
    head &= ~qemu_real_host_page_mask();
    if (head > len) {
        return;
    }

    /* Drop the leading partial page, then clip the tail to whole pages. */
    gpa += head;
    len -= head;
    len &= qemu_real_host_page_mask();
    if (!len || (gpa & ~qemu_real_host_page_mask())) {
        return;
    }

    hva = (uintptr_t)memory_region_get_ram_ptr(mr)
        + section->offset_within_region + head;

    whpx_update_mapping(gpa, len, (void *)hva, add,
                        memory_region_is_rom(mr), mr->name);
}
2356 
/*
 * Memory listener hook: a region became visible — reference it so it
 * stays alive, then map it into the partition.
 */
static void whpx_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}
2363 
/*
 * Memory listener hook: a region disappeared — unmap it from the
 * partition, then drop the reference taken in whpx_region_add().
 */
static void whpx_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}
2370 
/* Mapping updates are applied eagerly; nothing to do at transaction begin. */
static void whpx_transaction_begin(MemoryListener *listener)
{
}
2374 
/* Mapping updates are applied eagerly; nothing to do at transaction commit. */
static void whpx_transaction_commit(MemoryListener *listener)
{
}
2378 
/*
 * Dirty-log sync hook.  No per-page dirty tracking is done here, so
 * conservatively mark the entire RAM section dirty on every sync.
 */
static void whpx_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (memory_region_is_ram(mr)) {
        memory_region_set_dirty(mr, 0, int128_get64(section->size));
    }
}
2390 
/* Hooks wiring QEMU's memory API changes into WHPX partition mappings. */
static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
};
2400 
/* Register the WHPX memory listener on the system memory address space. */
static void whpx_memory_init(void)
{
    memory_listener_register(&whpx_memory_listener, &address_space_memory);
}
2405 
2406 /*
2407  * Load the functions from the given library, using the given handle. If a
2408  * handle is provided, it is used, otherwise the library is opened. The
2409  * handle will be updated on return with the opened one.
2410  */
static bool load_whp_dispatch_fns(HMODULE *handle,
    WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

    #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
    #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
    /* Resolve one entry point; a missing symbol is tolerated (left NULL). */
    #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \

    /* Resolve one entry point; a missing symbol fails the whole load. */
    #define WHP_LOAD_FIELD(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \
        if (!whp_dispatch.function_name) { \
            error_report("Could not load function %s", #function_name); \
            goto error; \
        } \

    /* Open the DLL unless the caller already supplied a handle to it. */
    #define WHP_LOAD_LIB(lib_name, handle_lib) \
    if (!handle_lib) { \
        handle_lib = LoadLibrary(lib_name); \
        if (!handle_lib) { \
            error_report("Could not load library %s.", lib_name); \
            goto error; \
        } \
    } \

    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    *handle = hLib;
    return true;

error:
    /*
     * NOTE(review): on failure the library is freed but *handle is not
     * cleared; a caller that passed in a pre-opened handle is left with a
     * stale value.  Confirm all callers treat a false return as fatal.
     */
    if (hLib) {
        FreeLibrary(hLib);
    }

    return false;
}
2466 
/*
 * Property setter for the accelerator's "kernel-irqchip" option: records
 * whether hypervisor-side APIC emulation is allowed and/or required.
 * "split" mode is rejected as unsupported.
 */
static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    struct whpx_state *whpx = &whpx_global;
    OnOffSplit mode;

    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
        return;
    }

    if (mode == ON_OFF_SPLIT_ON) {
        whpx->kernel_irqchip_allowed = true;
        whpx->kernel_irqchip_required = true;
    } else if (mode == ON_OFF_SPLIT_OFF) {
        whpx->kernel_irqchip_allowed = false;
        whpx->kernel_irqchip_required = false;
    } else if (mode == ON_OFF_SPLIT_SPLIT) {
        error_setg(errp, "WHPX: split irqchip currently not supported");
        error_append_hint(errp,
            "Try without kernel-irqchip or with kernel-irqchip=on|off");
    } else {
        /*
         * The value was checked in visit_type_OnOffSplit() above. If
         * we get here, then something is wrong in QEMU.
         */
        abort();
    }
}
2503 
2504 /*
2505  * Partition support
2506  */
2507 
whpx_accel_init(MachineState * ms)2508 static int whpx_accel_init(MachineState *ms)
2509 {
2510     struct whpx_state *whpx;
2511     int ret;
2512     HRESULT hr;
2513     WHV_CAPABILITY whpx_cap;
2514     UINT32 whpx_cap_size;
2515     WHV_PARTITION_PROPERTY prop;
2516     UINT32 cpuidExitList[] = {1, 0x80000001};
2517     WHV_CAPABILITY_FEATURES features = {0};
2518 
2519     whpx = &whpx_global;
2520 
2521     if (!init_whp_dispatch()) {
2522         ret = -ENOSYS;
2523         goto error;
2524     }
2525 
2526     whpx->mem_quota = ms->ram_size;
2527 
2528     hr = whp_dispatch.WHvGetCapability(
2529         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2530         sizeof(whpx_cap), &whpx_cap_size);
2531     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2532         error_report("WHPX: No accelerator found, hr=%08lx", hr);
2533         ret = -ENOSPC;
2534         goto error;
2535     }
2536 
2537     hr = whp_dispatch.WHvGetCapability(
2538         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2539     if (FAILED(hr)) {
2540         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2541         ret = -EINVAL;
2542         goto error;
2543     }
2544 
2545     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2546     if (FAILED(hr)) {
2547         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2548         ret = -EINVAL;
2549         goto error;
2550     }
2551 
2552     /*
2553      * Query the XSAVE capability of the partition. Any error here is not
2554      * considered fatal.
2555      */
2556     hr = whp_dispatch.WHvGetPartitionProperty(
2557         whpx->partition,
2558         WHvPartitionPropertyCodeProcessorXsaveFeatures,
2559         &whpx_xsave_cap,
2560         sizeof(whpx_xsave_cap),
2561         &whpx_cap_size);
2562 
2563     /*
2564      * Windows version which don't support this property will return with the
2565      * specific error code.
2566      */
2567     if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
2568         error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
2569     }
2570 
2571     if (!whpx_has_xsave()) {
2572         printf("WHPX: Partition is not XSAVE capable\n");
2573     }
2574 
2575     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2576     prop.ProcessorCount = ms->smp.cpus;
2577     hr = whp_dispatch.WHvSetPartitionProperty(
2578         whpx->partition,
2579         WHvPartitionPropertyCodeProcessorCount,
2580         &prop,
2581         sizeof(WHV_PARTITION_PROPERTY));
2582 
2583     if (FAILED(hr)) {
2584         error_report("WHPX: Failed to set partition processor count to %u,"
2585                      " hr=%08lx", prop.ProcessorCount, hr);
2586         ret = -EINVAL;
2587         goto error;
2588     }
2589 
2590     /*
2591      * Error out if WHP doesn't support apic emulation and user is requiring
2592      * it.
2593      */
2594     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2595             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2596         error_report("WHPX: kernel irqchip requested, but unavailable. "
2597             "Try without kernel-irqchip or with kernel-irqchip=off");
2598         ret = -EINVAL;
2599         goto error;
2600     }
2601 
2602     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2603         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2604         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2605             WHvX64LocalApicEmulationModeXApic;
2606         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2607         hr = whp_dispatch.WHvSetPartitionProperty(
2608             whpx->partition,
2609             WHvPartitionPropertyCodeLocalApicEmulationMode,
2610             &mode,
2611             sizeof(mode));
2612         if (FAILED(hr)) {
2613             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2614             if (whpx->kernel_irqchip_required) {
2615                 error_report("WHPX: kernel irqchip requested, but unavailable");
2616                 ret = -EINVAL;
2617                 goto error;
2618             }
2619         } else {
2620             whpx->apic_in_platform = true;
2621         }
2622     }
2623 
2624     /* Register for MSR and CPUID exits */
2625     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2626     prop.ExtendedVmExits.X64MsrExit = 1;
2627     prop.ExtendedVmExits.X64CpuidExit = 1;
2628     prop.ExtendedVmExits.ExceptionExit = 1;
2629     if (whpx_apic_in_platform()) {
2630         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2631     }
2632 
2633     hr = whp_dispatch.WHvSetPartitionProperty(
2634             whpx->partition,
2635             WHvPartitionPropertyCodeExtendedVmExits,
2636             &prop,
2637             sizeof(WHV_PARTITION_PROPERTY));
2638     if (FAILED(hr)) {
2639         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
2640         ret = -EINVAL;
2641         goto error;
2642     }
2643 
2644     hr = whp_dispatch.WHvSetPartitionProperty(
2645         whpx->partition,
2646         WHvPartitionPropertyCodeCpuidExitList,
2647         cpuidExitList,
2648         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2649 
2650     if (FAILED(hr)) {
2651         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2652                      hr);
2653         ret = -EINVAL;
2654         goto error;
2655     }
2656 
2657     /*
2658      * We do not want to intercept any exceptions from the guest,
2659      * until we actually start debugging with gdb.
2660      */
2661     whpx->exception_exit_bitmap = -1;
2662     hr = whpx_set_exception_exit_bitmap(0);
2663 
2664     if (FAILED(hr)) {
2665         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2666         ret = -EINVAL;
2667         goto error;
2668     }
2669 
2670     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2671     if (FAILED(hr)) {
2672         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
2673         ret = -EINVAL;
2674         goto error;
2675     }
2676 
2677     whpx_memory_init();
2678 
2679     printf("Windows Hypervisor Platform accelerator is operational\n");
2680     return 0;
2681 
2682 error:
2683 
2684     if (NULL != whpx->partition) {
2685         whp_dispatch.WHvDeletePartition(whpx->partition);
2686         whpx->partition = NULL;
2687     }
2688 
2689     return ret;
2690 }
2691 
whpx_enabled(void)2692 int whpx_enabled(void)
2693 {
2694     return whpx_allowed;
2695 }
2696 
whpx_apic_in_platform(void)2697 bool whpx_apic_in_platform(void) {
2698     return whpx_global.apic_in_platform;
2699 }
2700 
/*
 * QOM class initializer for the WHPX accelerator: wire up the accel
 * callbacks and expose the "kernel-irqchip" class property.
 */
static void whpx_accel_class_init(ObjectClass *oc, const void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);

    ac->name = "WHPX";
    ac->init_machine = whpx_accel_init;
    ac->allowed = &whpx_allowed;

    /* Write-only property; parsed by whpx_set_kernel_irqchip(). */
    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
                              NULL, whpx_set_kernel_irqchip,
                              NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
        "Configure WHPX in-kernel irqchip");
}
2714 
/*
 * QOM instance initializer: reset the global WHPX state and apply
 * defaults before any property setters run.
 */
static void whpx_accel_instance_init(Object *obj)
{
    struct whpx_state *whpx = &whpx_global;

    memset(whpx, 0, sizeof(*whpx));

    /* Kernel irqchip is allowed (but not required) by default. */
    whpx->kernel_irqchip_allowed = true;
}
2723 
2724 static const TypeInfo whpx_accel_type = {
2725     .name = ACCEL_CLASS_NAME("whpx"),
2726     .parent = TYPE_ACCEL,
2727     .instance_init = whpx_accel_instance_init,
2728     .class_init = whpx_accel_class_init,
2729 };
2730 
whpx_type_init(void)2731 static void whpx_type_init(void)
2732 {
2733     type_register_static(&whpx_accel_type);
2734 }
2735 
init_whp_dispatch(void)2736 bool init_whp_dispatch(void)
2737 {
2738     if (whp_dispatch_initialized) {
2739         return true;
2740     }
2741 
2742     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2743         goto error;
2744     }
2745 
2746     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2747         goto error;
2748     }
2749 
2750     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2751         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2752     whp_dispatch_initialized = true;
2753 
2754     return true;
2755 error:
2756     if (hWinHvPlatform) {
2757         FreeLibrary(hWinHvPlatform);
2758     }
2759 
2760     if (hWinHvEmulation) {
2761         FreeLibrary(hWinHvEmulation);
2762     }
2763 
2764     return false;
2765 }
2766 
/* Run whpx_type_init() at module-load time to register the type. */
type_init(whpx_type_init);
2768