xref: /qemu/target/i386/whpx/whpx-all.c (revision 989dd906ed5556563a57b32ae7abf9db5e1f38ba)
1 /*
2  * QEMU Windows Hypervisor Platform accelerator (WHPX)
3  *
4  * Copyright Microsoft Corp. 2017
5  *
6  * This work is licensed under the terms of the GNU GPL, version 2 or later.
7  * See the COPYING file in the top-level directory.
8  *
9  */
10 
11 #include "qemu/osdep.h"
12 #include "cpu.h"
13 #include "system/address-spaces.h"
14 #include "system/ioport.h"
15 #include "gdbstub/helpers.h"
16 #include "qemu/accel.h"
17 #include "system/whpx.h"
18 #include "system/cpus.h"
19 #include "system/runstate.h"
20 #include "qemu/main-loop.h"
21 #include "hw/boards.h"
22 #include "hw/intc/ioapic.h"
23 #include "hw/i386/apic_internal.h"
24 #include "qemu/error-report.h"
25 #include "qapi/error.h"
26 #include "qapi/qapi-types-common.h"
27 #include "qapi/qapi-visit-common.h"
28 #include "migration/blocker.h"
29 #include <winerror.h>
30 
31 #include "whpx-internal.h"
32 #include "whpx-accel-ops.h"
33 
34 #include <winhvplatform.h>
35 #include <winhvemulation.h>
36 
37 #define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
38 
/*
 * Names of all virtual processor registers that QEMU synchronizes with
 * WHPX via WHvGet/SetVirtualProcessorRegisters().  The order of this
 * array is significant: whpx_set_registers() and whpx_get_registers()
 * walk it with a running index and assert the expected layout at each
 * step, so entries must not be reordered without updating both
 * functions.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers - intentionally not synchronized */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers - intentionally not synchronized */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};

/* Value buffer matching whpx_register_names entry-for-entry. */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
150 
151 /*
152  * The current implementation of instruction stepping sets the TF flag
153  * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
154  * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
155  *
156  * This approach has a few limitations:
157  *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
158  *        along with the other flags, possibly restoring it later. It would
159  *        result in another INT1 when the flags are restored, triggering
160  *        a stop in gdb that could be cleared by doing another step.
161  *
162  *        Stepping over a POPF/LAHF instruction will let it overwrite the
163  *        TF flags, ending the stepping mode.
164  *
165  *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
166  *        or anything that could result in a page fault) will save the flags
167  *        to the stack, clear the TF flag, and let the guest execute the
168  *        handler. Normally, the guest will restore the original flags,
169  *        that will continue single-stepping.
170  *
171  *     3. Debuggers running on the guest may wish to set TF to do instruction
172  *        stepping. INT1 events generated by it would be intercepted by us,
173  *        as long as the gdb is connected to QEMU.
174  *
175  * In practice this means that:
176  *     1. Stepping through flags-modifying instructions may cause gdb to
177  *        continue or stop in unexpected places. This will be fully recoverable
178  *        and will not crash the target.
179  *
180  *     2. Stepping over an instruction that triggers an exception will step
181  *        over the exception handler, not into it.
182  *
183  *     3. Debugging the guest via gdb, while running debugger on the guest
184  *        at the same time may lead to unexpected effects. Removing all
185  *        breakpoints set via QEMU will prevent any further interference
186  *        with the guest-level debuggers.
187  *
188  * The limitations can be addressed as shown below:
189  *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
190  *        stepping through them. The exact semantics of the instructions is
191  *        defined in the "Combined Volume Set of Intel 64 and IA-32
192  *        Architectures Software Developer's Manuals", however it involves a
193  *        fair amount of corner cases due to compatibility with real mode,
194  *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
195  *
196  *     2. We could step into the guest's exception handlers using the following
197  *        sequence:
198  *          a. Temporarily enable catching of all exception types via
199  *             whpx_set_exception_exit_bitmap().
200  *          b. Once an exception is intercepted, read the IDT/GDT and locate
201  *             the original handler.
202  *          c. Patch the original handler, injecting an INT3 at the beginning.
203  *          d. Update the exception exit bitmap to only catch the
204  *             WHvX64ExceptionTypeBreakpointTrap exception.
205  *          e. Let the affected CPU run in the exclusive mode.
206  *          f. Restore the original handler and the exception exit bitmap.
207  *        Note that handling all corner cases related to IDT/GDT is harder
208  *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
209  *        rough idea.
210  *
211  *     3. In order to properly support guest-level debugging in parallel with
212  *        the QEMU-level debugging, we would need to be able to pass some INT1
213  *        events to the guest. This could be done via the following methods:
214  *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
215  *             it seems to only work for interrupts and not software
216  *             exceptions.
217  *          b. Locating and patching the original handler by parsing IDT/GDT.
218  *             This involves relatively complex logic outlined in the previous
219  *             paragraph.
220  *          c. Emulating the exception invocation (i.e. manually updating RIP,
221  *             RFLAGS, and pushing the old values to stack). This is even more
222  *             complicated than the previous option, since it involves checking
223  *             CPL, gate attributes, and doing various adjustments depending
224  *             on the current CPU mode, whether the CPL is changing, etc.
225  */
/*
 * Single-stepping mode used by the gdbstub integration; the limitations
 * of the TF-based stepping approach are discussed in the comment above.
 */
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;
231 
/* Per-vCPU WHPX accelerator state, reached via CPUState::accel. */
struct AccelCPUState {
    WHV_EMULATOR_HANDLE emulator;   /* instruction emulator instance */
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    uint64_t tpr;                   /* last synced TPR, in CR8 encoding */
    uint64_t apic_base;             /* last synced APIC base MSR value */
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};
244 
/* True when the WHPX accelerator is selected; exported via system/whpx.h. */
bool whpx_allowed;
/*
 * NOTE(review): presumably set once whp_dispatch below has been
 * populated from the DLLs — confirm in the initialization code.
 */
static bool whp_dispatch_initialized;
/* Handles of the dynamically loaded WinHv platform/emulation DLLs. */
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
/* XSAVE capability reported by the hypervisor; see whpx_has_xsave(). */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;
253 
whpx_has_xsave(void)254 static bool whpx_has_xsave(void)
255 {
256     return whpx_xsave_cap.XsaveSupport;
257 }
258 
whpx_seg_q2h(const SegmentCache * qs,int v86,int r86)259 static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
260                                              int r86)
261 {
262     WHV_X64_SEGMENT_REGISTER hs;
263     unsigned flags = qs->flags;
264 
265     hs.Base = qs->base;
266     hs.Limit = qs->limit;
267     hs.Selector = qs->selector;
268 
269     if (v86) {
270         hs.Attributes = 0;
271         hs.SegmentType = 3;
272         hs.Present = 1;
273         hs.DescriptorPrivilegeLevel = 3;
274         hs.NonSystemSegment = 1;
275 
276     } else {
277         hs.Attributes = (flags >> DESC_TYPE_SHIFT);
278 
279         if (r86) {
280             /* hs.Base &= 0xfffff; */
281         }
282     }
283 
284     return hs;
285 }
286 
whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER * hs)287 static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
288 {
289     SegmentCache qs;
290 
291     qs.base = hs->Base;
292     qs.limit = hs->Limit;
293     qs.selector = hs->Selector;
294 
295     qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;
296 
297     return qs;
298 }
299 
300 /* X64 Extended Control Registers */
whpx_set_xcrs(CPUState * cpu)301 static void whpx_set_xcrs(CPUState *cpu)
302 {
303     HRESULT hr;
304     struct whpx_state *whpx = &whpx_global;
305     WHV_REGISTER_VALUE xcr0;
306     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
307 
308     if (!whpx_has_xsave()) {
309         return;
310     }
311 
312     /* Only xcr0 is supported by the hypervisor currently */
313     xcr0.Reg64 = cpu_env(cpu)->xcr0;
314     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
315         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
316     if (FAILED(hr)) {
317         error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
318     }
319 }
320 
/*
 * Write the guest TSC to the virtual processor.
 *
 * When the API is available, partition time is suspended first so that
 * all vCPUs observe as little TSC skew as possible; the partition
 * resumes automatically when the first vCPU runs again afterwards.
 *
 * Returns 0 on success, -1 if the hypervisor rejected the TSC write.
 */
static int whpx_set_tsc(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    const WHV_REGISTER_NAME name = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE value;
    HRESULT hr;

    /*
     * Failure to suspend the partition is not fatal: it merely raises
     * the chance of TSC variance between vCPUs, and some guest OSes
     * handle that just fine.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    value.Reg64 = cpu_env(cpu)->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
        return -1;
    }

    return 0;
}
356 
/*
 * The CR8 register in the CPU is mapped to the TPR register of the APIC,
 * however, they use a slightly different encoding. Specifically:
 *
 *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
 *
 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
 * and IA-32 Architectures Software Developer's Manual.
 *
 * The functions below translate the value of CR8 to TPR and vice versa.
 */

/* Translate an APIC TPR value into its CR8 encoding (drop bits 3:0). */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr / 16;
}
373 
/* Translate a CR8 value into its APIC TPR encoding (shift into bits 7:4). */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 * 16;
}
378 
/*
 * Push the QEMU CPU state into the WHPX virtual processor.
 *
 * Walks whpx_register_names in order, filling the parallel value array
 * and asserting at every step that the running index still matches the
 * expected register name.  Registers with heavy side effects (currently
 * the TSC) are only written for full state updates
 * (level >= WHPX_SET_RESET_STATE).  Must run on the vCPU's own thread
 * or with the vCPU stopped.
 */
static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Following MSRs have side effects on the guest or are too heavy for
     * runtime. Limit them to full state update.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    /* Segment translation below depends on virtual-8086 and real mode. */
    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches  */
    /*
     * NOTE(review): the next asserts compare the array index against
     * WHvX64Register* enum values; they only hold because those enum
     * values happen to coincide with the positions in
     * whpx_register_names at this point.
     */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    /* Re-encode the status word with QEMU's TOP-of-stack (fpstt) field. */
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        /* QEMU's fptags are inverted relative to the abridged tag byte. */
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    /* Write the whole batch to the virtual processor in one call. */
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }
}
552 
/*
 * Read the guest TSC from the virtual processor into env->tsc.
 * Returns 0 on success, -1 on hypervisor failure.
 */
static int whpx_get_tsc(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    const WHV_REGISTER_NAME name = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE value;
    HRESULT hr;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
        return -1;
    }

    cpu_env(cpu)->tsc = value.Reg64;
    return 0;
}
570 
571 /* X64 Extended Control Registers */
whpx_get_xcrs(CPUState * cpu)572 static void whpx_get_xcrs(CPUState *cpu)
573 {
574     HRESULT hr;
575     struct whpx_state *whpx = &whpx_global;
576     WHV_REGISTER_VALUE xcr0;
577     WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;
578 
579     if (!whpx_has_xsave()) {
580         return;
581     }
582 
583     /* Only xcr0 is supported by the hypervisor currently */
584     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
585         whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
586     if (FAILED(hr)) {
587         error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
588         return;
589     }
590 
591     cpu_env(cpu)->xcr0 = xcr0.Reg64;
592 }
593 
/*
 * Pull the full virtual processor state from WHPX into the QEMU CPU
 * structures.  Mirrors whpx_set_registers(): whpx_register_names is
 * walked in order with asserts validating the expected layout at every
 * step.  Must run on the vCPU's own thread or with the vCPU stopped.
 */
static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /* Refresh the cached TSC; it stays valid while the VM is stopped. */
    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches  */
    /*
     * NOTE(review): the next asserts compare the array index against
     * WHvX64Register* enum values; they only hold because those enum
     * values happen to coincide with the positions in
     * whpx_register_names at this point.
     */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    /* Propagate CR8 back into the APIC TPR only when it changed. */
    tpr = vcxt.values[idx++].Reg64;
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    /* Split the status word into QEMU's TOP-of-stack (fpstt) and fpus. */
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    for (i = 0; i < 8; ++i) {
        /* QEMU's fptags are inverted relative to the abridged tag byte. */
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    /* Propagate the APIC base MSR only when it changed. */
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    /* Recompute hflags derived from the freshly loaded state. */
    x86_update_hflags(env);
}
772 
whpx_emu_ioport_callback(void * ctx,WHV_EMULATOR_IO_ACCESS_INFO * IoAccess)773 static HRESULT CALLBACK whpx_emu_ioport_callback(
774     void *ctx,
775     WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
776 {
777     MemTxAttrs attrs = { 0 };
778     address_space_rw(&address_space_io, IoAccess->Port, attrs,
779                      &IoAccess->Data, IoAccess->AccessSize,
780                      IoAccess->Direction);
781     return S_OK;
782 }
783 
whpx_emu_mmio_callback(void * ctx,WHV_EMULATOR_MEMORY_ACCESS_INFO * ma)784 static HRESULT CALLBACK whpx_emu_mmio_callback(
785     void *ctx,
786     WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
787 {
788     cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
789                            ma->Direction);
790     return S_OK;
791 }
792 
whpx_emu_getreg_callback(void * ctx,const WHV_REGISTER_NAME * RegisterNames,UINT32 RegisterCount,WHV_REGISTER_VALUE * RegisterValues)793 static HRESULT CALLBACK whpx_emu_getreg_callback(
794     void *ctx,
795     const WHV_REGISTER_NAME *RegisterNames,
796     UINT32 RegisterCount,
797     WHV_REGISTER_VALUE *RegisterValues)
798 {
799     HRESULT hr;
800     struct whpx_state *whpx = &whpx_global;
801     CPUState *cpu = (CPUState *)ctx;
802 
803     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
804         whpx->partition, cpu->cpu_index,
805         RegisterNames, RegisterCount,
806         RegisterValues);
807     if (FAILED(hr)) {
808         error_report("WHPX: Failed to get virtual processor registers,"
809                      " hr=%08lx", hr);
810     }
811 
812     return hr;
813 }
814 
whpx_emu_setreg_callback(void * ctx,const WHV_REGISTER_NAME * RegisterNames,UINT32 RegisterCount,const WHV_REGISTER_VALUE * RegisterValues)815 static HRESULT CALLBACK whpx_emu_setreg_callback(
816     void *ctx,
817     const WHV_REGISTER_NAME *RegisterNames,
818     UINT32 RegisterCount,
819     const WHV_REGISTER_VALUE *RegisterValues)
820 {
821     HRESULT hr;
822     struct whpx_state *whpx = &whpx_global;
823     CPUState *cpu = (CPUState *)ctx;
824 
825     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
826         whpx->partition, cpu->cpu_index,
827         RegisterNames, RegisterCount,
828         RegisterValues);
829     if (FAILED(hr)) {
830         error_report("WHPX: Failed to set virtual processor registers,"
831                      " hr=%08lx", hr);
832     }
833 
834     /*
835      * The emulator just successfully wrote the register state. We clear the
836      * dirty state so we avoid the double write on resume of the VP.
837      */
838     cpu->vcpu_dirty = false;
839 
840     return hr;
841 }
842 
whpx_emu_translate_callback(void * ctx,WHV_GUEST_VIRTUAL_ADDRESS Gva,WHV_TRANSLATE_GVA_FLAGS TranslateFlags,WHV_TRANSLATE_GVA_RESULT_CODE * TranslationResult,WHV_GUEST_PHYSICAL_ADDRESS * Gpa)843 static HRESULT CALLBACK whpx_emu_translate_callback(
844     void *ctx,
845     WHV_GUEST_VIRTUAL_ADDRESS Gva,
846     WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
847     WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
848     WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
849 {
850     HRESULT hr;
851     struct whpx_state *whpx = &whpx_global;
852     CPUState *cpu = (CPUState *)ctx;
853     WHV_TRANSLATE_GVA_RESULT res;
854 
855     hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
856                                       Gva, TranslateFlags, &res, Gpa);
857     if (FAILED(hr)) {
858         error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
859     } else {
860         *TranslationResult = res.ResultCode;
861     }
862 
863     return hr;
864 }
865 
/*
 * Callback table for the WHPX instruction emulator; it invokes these to
 * perform port I/O, MMIO, VP register access and GVA translation while
 * emulating an intercepted instruction.
 */
static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};
874 
/*
 * Emulates an intercepted MMIO access via the WHPX instruction emulator.
 * Returns 0 on success, -1 on any parse or emulation failure.
 */
static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;
    HRESULT hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
        return -1;
    }

    if (emu_status.EmulationSuccessful) {
        return 0;
    }

    error_report("WHPX: Failed to emulate MMIO access with"
                 " EmulatorReturnStatus: %u", emu_status.AsUINT32);
    return -1;
}
898 
/*
 * Emulates an intercepted port I/O access via the WHPX instruction emulator.
 * Returns 0 on success, -1 on any parse or emulation failure.
 */
static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;
    HRESULT hr = whp_dispatch.WHvEmulatorTryIoEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
        return -1;
    }

    if (emu_status.EmulationSuccessful) {
        return 0;
    }

    error_report("WHPX: Failed to emulate PortIO access with"
                 " EmulatorReturnStatus: %u", emu_status.AsUINT32);
    return -1;
}
923 
924 /*
925  * Controls whether we should intercept various exceptions on the guest,
926  * namely breakpoint/single-step events.
927  *
928  * The 'exceptions' argument accepts a bitmask, e.g:
929  * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
930  */
whpx_set_exception_exit_bitmap(UINT64 exceptions)931 static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
932 {
933     struct whpx_state *whpx = &whpx_global;
934     WHV_PARTITION_PROPERTY prop = { 0, };
935     HRESULT hr;
936 
937     if (exceptions == whpx->exception_exit_bitmap) {
938         return S_OK;
939     }
940 
941     prop.ExceptionExitBitmap = exceptions;
942 
943     hr = whp_dispatch.WHvSetPartitionProperty(
944         whpx->partition,
945         WHvPartitionPropertyCodeExceptionExitBitmap,
946         &prop,
947         sizeof(WHV_PARTITION_PROPERTY));
948 
949     if (SUCCEEDED(hr)) {
950         whpx->exception_exit_bitmap = exceptions;
951     }
952 
953     return hr;
954 }
955 
956 
957 /*
958  * This function is called before/after stepping over a single instruction.
959  * It will update the CPU registers to arm/disarm the instruction stepping
960  * accordingly.
961  */
whpx_vcpu_configure_single_stepping(CPUState * cpu,bool set,uint64_t * exit_context_rflags)962 static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
963     bool set,
964     uint64_t *exit_context_rflags)
965 {
966     WHV_REGISTER_NAME reg_name;
967     WHV_REGISTER_VALUE reg_value;
968     HRESULT hr;
969     struct whpx_state *whpx = &whpx_global;
970 
971     /*
972      * If we are trying to step over a single instruction, we need to set the
973      * TF bit in rflags. Otherwise, clear it.
974      */
975     reg_name = WHvX64RegisterRflags;
976     hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
977         whpx->partition,
978         cpu->cpu_index,
979         &reg_name,
980         1,
981         &reg_value);
982 
983     if (FAILED(hr)) {
984         error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
985         return hr;
986     }
987 
988     if (exit_context_rflags) {
989         assert(*exit_context_rflags == reg_value.Reg64);
990     }
991 
992     if (set) {
993         /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
994         reg_value.Reg64 |= TF_MASK;
995     } else {
996         reg_value.Reg64 &= ~TF_MASK;
997     }
998 
999     if (exit_context_rflags) {
1000         *exit_context_rflags = reg_value.Reg64;
1001     }
1002 
1003     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1004         whpx->partition,
1005         cpu->cpu_index,
1006         &reg_name,
1007         1,
1008         &reg_value);
1009 
1010     if (FAILED(hr)) {
1011         error_report("WHPX: Failed to set rflags,"
1012             " hr=%08lx",
1013             hr);
1014         return hr;
1015     }
1016 
1017     reg_name = WHvRegisterInterruptState;
1018     reg_value.Reg64 = 0;
1019 
1020     /* Suspend delivery of hardware interrupts during single-stepping. */
1021     reg_value.InterruptState.InterruptShadow = set != 0;
1022 
1023     hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1024     whpx->partition,
1025         cpu->cpu_index,
1026         &reg_name,
1027         1,
1028         &reg_value);
1029 
1030     if (FAILED(hr)) {
1031         error_report("WHPX: Failed to set InterruptState,"
1032             " hr=%08lx",
1033             hr);
1034         return hr;
1035     }
1036 
1037     if (!set) {
1038         /*
1039          * We have just finished stepping over a single instruction,
1040          * and intercepted the INT1 generated by it.
1041          * We need to now hide the INT1 from the guest,
1042          * as it would not be expecting it.
1043          */
1044 
1045         reg_name = WHvX64RegisterPendingDebugException;
1046         hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
1047         whpx->partition,
1048             cpu->cpu_index,
1049             &reg_name,
1050             1,
1051             &reg_value);
1052 
1053         if (FAILED(hr)) {
1054             error_report("WHPX: Failed to get pending debug exceptions,"
1055                          "hr=%08lx", hr);
1056             return hr;
1057         }
1058 
1059         if (reg_value.PendingDebugException.SingleStep) {
1060             reg_value.PendingDebugException.SingleStep = 0;
1061 
1062             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1063                 whpx->partition,
1064                 cpu->cpu_index,
1065                 &reg_name,
1066                 1,
1067                 &reg_value);
1068 
1069             if (FAILED(hr)) {
1070                 error_report("WHPX: Failed to clear pending debug exceptions,"
1071                              "hr=%08lx", hr);
1072              return hr;
1073             }
1074         }
1075 
1076     }
1077 
1078     return S_OK;
1079 }
1080 
1081 /* Tries to find a breakpoint at the specified address. */
static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
{
    struct whpx_breakpoint_collection *coll =
        whpx_global.breakpoints.breakpoints;

    /* No collection allocated yet means no breakpoints to find. */
    if (!coll) {
        return NULL;
    }

    for (int idx = 0; idx < coll->used; idx++) {
        struct whpx_breakpoint *bp = &coll->data[idx];

        if (bp->address == address) {
            return bp;
        }
    }

    return NULL;
}
1097 
1098 /*
1099  * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
1100  * debugging user-mode applications. Since the WHPX API does not offer
1101  * an easy way to pass the intercepted exception back to the guest, we
1102  * resort to using INT1 instead, and let the guest always handle INT3.
1103  */
static const uint8_t whpx_breakpoint_instruction = 0xF1; /* INT1 (ICEBP) */
1105 
1106 /*
1107  * The WHPX QEMU backend implements breakpoints by writing the INT1
1108  * instruction into memory (ignoring the DRx registers). This raises a few
1109  * issues that need to be carefully handled:
1110  *
1111  * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
1112  *    at the same location, and later remove them in arbitrary order.
1113  *    This should not cause memory corruption, and should only remove the
1114  *    physical breakpoint instruction when the last QEMU breakpoint is gone.
1115  *
1116  * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
1117  *    physical location. Hence, physically adding/removing a breakpoint can
1118  *    theoretically fail at any time. We need to keep track of it.
1119  *
1120  * The function below rebuilds a list of low-level breakpoints (one per
1121  * address, tracking the original instruction and any errors) from the list of
1122  * high-level breakpoints (set via cpu_breakpoint_insert()).
1123  *
1124  * In order to optimize performance, this function stores the list of
1125  * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
1126  * low-level ones, so that it won't be re-invoked until these breakpoints
1127  * change.
1128  *
 * Note that this function decides which breakpoints should be inserted into
 * memory, but doesn't actually do it. The memory accessing is done in
1131  * whpx_apply_breakpoints().
1132  */
static void whpx_translate_cpu_breakpoints(
    struct whpx_breakpoints *breakpoints,
    CPUState *cpu,
    int cpu_breakpoint_count)
{
    CPUBreakpoint *bp;
    int cpu_bp_index = 0;

    /* Snapshot the CPU breakpoint addresses, so future calls can cheaply
     * detect whether the high-level list changed. */
    breakpoints->original_addresses =
        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);

    breakpoints->original_address_count = cpu_breakpoint_count;

    /*
     * Worst case capacity: every CPU breakpoint is new, and every old WHPX
     * breakpoint must be preserved (because it is still set in memory).
     */
    int max_breakpoints = cpu_breakpoint_count +
        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);

    struct whpx_breakpoint_collection *new_breakpoints =
        g_malloc0(sizeof(struct whpx_breakpoint_collection)
                  + max_breakpoints * sizeof(struct whpx_breakpoint));

    new_breakpoints->allocated = max_breakpoints;
    new_breakpoints->used = 0;

    /*
     * 1. Preserve all old breakpoints that could not be automatically
     * cleared when the CPU got stopped.
     */
    if (breakpoints->breakpoints) {
        int i;
        for (i = 0; i < breakpoints->breakpoints->used; i++) {
            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
                new_breakpoints->data[new_breakpoints->used++] =
                    breakpoints->breakpoints->data[i];
            }
        }
    }

    /* 2. Map all CPU breakpoints to WHPX breakpoints */
    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        int i;
        bool found = false;

        /* This will be used to detect changed CPU breakpoints later. */
        breakpoints->original_addresses[cpu_bp_index++] = bp->pc;

        for (i = 0; i < new_breakpoints->used; i++) {
            /*
             * WARNING: This loop has O(N^2) complexity, where N is the
             * number of breakpoints. It should not be a bottleneck in
             * real-world scenarios, since it only needs to run once after
             * the breakpoints have been modified.
             * If this ever becomes a concern, it can be optimized by storing
             * high-level breakpoint objects in a tree or hash map.
             */

            if (new_breakpoints->data[i].address == bp->pc) {
                /* There was already a breakpoint at this address. */
                if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
                    /* Cancel the pending clear: the breakpoint is wanted. */
                    new_breakpoints->data[i].state = WHPX_BP_SET;
                } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
                    /* Re-arm, so whpx_apply_breakpoints() re-writes it. */
                    new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
                }

                found = true;
                break;
            }
        }

        if (!found && new_breakpoints->used < new_breakpoints->allocated) {
            /* No WHPX breakpoint at this address. Create one. */
            new_breakpoints->data[new_breakpoints->used].address = bp->pc;
            new_breakpoints->data[new_breakpoints->used].state =
                WHPX_BP_SET_PENDING;
            new_breakpoints->used++;
        }
    }

    /*
     * Free the previous breakpoint list. This can be optimized by keeping
     * it as shadow buffer for the next computation instead of freeing
     * it immediately.
     */
    g_free(breakpoints->breakpoints);

    breakpoints->breakpoints = new_breakpoints;
}
1219 
1220 /*
1221  * Physically inserts/removes the breakpoints by reading and writing the
1222  * physical memory, keeping a track of the failed attempts.
1223  *
1224  * Passing resuming=true  will try to set all previously unset breakpoints.
1225  * Passing resuming=false will remove all inserted ones.
1226  */
static void whpx_apply_breakpoints(
    struct whpx_breakpoint_collection *breakpoints,
    CPUState *cpu,
    bool resuming)
{
    int i, rc;
    if (!breakpoints) {
        return;
    }

    for (i = 0; i < breakpoints->used; i++) {
        /* Decide what to do right now based on the last known state. */
        WhpxBreakpointState state = breakpoints->data[i].state;
        switch (state) {
        case WHPX_BP_CLEARED:
            if (resuming) {
                state = WHPX_BP_SET_PENDING;
            }
            break;
        case WHPX_BP_SET_PENDING:
            if (!resuming) {
                /* Never written to memory, so nothing to undo. */
                state = WHPX_BP_CLEARED;
            }
            break;
        case WHPX_BP_SET:
            if (!resuming) {
                state = WHPX_BP_CLEAR_PENDING;
            }
            break;
        case WHPX_BP_CLEAR_PENDING:
            if (resuming) {
                /* Still physically present in memory; keep it set. */
                state = WHPX_BP_SET;
            }
            break;
        }

        if (state == WHPX_BP_SET_PENDING) {
            /* Remember the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                false);

            if (!rc) {
                /* Write the breakpoint instruction. */
                rc = cpu_memory_rw_debug(cpu,
                    breakpoints->data[i].address,
                    (void *)&whpx_breakpoint_instruction,
                    1,
                    true);
            }

            /* On any failure, the state stays pending for a later retry. */
            if (!rc) {
                state = WHPX_BP_SET;
            }

        }

        if (state == WHPX_BP_CLEAR_PENDING) {
            /* Restore the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                true);

            if (!rc) {
                state = WHPX_BP_CLEARED;
            }
        }

        /* Record the outcome for the next invocation. */
        breakpoints->data[i].state = state;
    }
}
1302 
1303 /*
 * This function is called when a VCPU is about to start and no other
1305  * VCPUs have been started so far. Since the VCPU start order could be
1306  * arbitrary, it doesn't have to be VCPU#0.
1307  *
1308  * It is used to commit the breakpoints into memory, and configure WHPX
1309  * to intercept debug exceptions.
1310  *
1311  * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
1312  * more VCPUs are already running, so this is the best place to do it.
1313  */
static int whpx_first_vcpu_starting(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    g_assert(bql_locked());

    if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
            (whpx->breakpoints.breakpoints &&
             whpx->breakpoints.breakpoints->used)) {
        CPUBreakpoint *bp;
        int i = 0;
        bool update_pending = false;

        /* Compare the CPU breakpoint list against the last snapshot. */
        QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
            if (i >= whpx->breakpoints.original_address_count ||
                bp->pc != whpx->breakpoints.original_addresses[i]) {
                update_pending = true;
            }

            i++;
        }

        /* A shorter list than the snapshot also counts as a change. */
        if (i != whpx->breakpoints.original_address_count) {
            update_pending = true;
        }

        if (update_pending) {
            /*
             * The CPU breakpoints have changed since the last call to
             * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
             * now be recomputed.
             */
            whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
        }

        /* Actually insert the breakpoints into the memory. */
        whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
    }

    uint64_t exception_mask;
    if (whpx->step_pending ||
        (whpx->breakpoints.breakpoints &&
         whpx->breakpoints.breakpoints->used)) {
        /*
         * We are either attempting to single-step one or more CPUs, or
         * have one or more breakpoints enabled. Both require intercepting
         * the WHvX64ExceptionTypeDebugTrapOrFault exception.
         */

        exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
    } else {
        /* Let the guest handle all exceptions. */
        exception_mask = 0;
    }

    hr = whpx_set_exception_exit_bitmap(exception_mask);
    if (!SUCCEEDED(hr)) {
        error_report("WHPX: Failed to update exception exit mask,"
                     " hr=%08lx.", hr);
        return 1;
    }

    return 0;
}
1379 
1380 /*
1381  * This function is called when the last VCPU has finished running.
1382  * It is used to remove any previously set breakpoints from memory.
1383  */
static int whpx_last_vcpu_stopping(CPUState *cpu)
{
    /* Restore the original instructions overwritten by INT1 breakpoints. */
    whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
    return 0;
}
1389 
1390 /* Returns the address of the next instruction that is about to be executed. */
static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
{
    if (cpu->vcpu_dirty) {
        /* The CPU registers have been modified by other parts of QEMU. */
        return cpu_env(cpu)->eip;
    } else if (exit_context_valid) {
        /*
         * The CPU registers have been modified neither by other parts
         * of QEMU, nor by this port via WHvSetVirtualProcessorRegisters().
         * This is the most common case.
         */
        AccelCPUState *vcpu = cpu->accel;
        return vcpu->exit_ctx.VpContext.Rip;
    } else {
        /*
         * The CPU registers have been modified by a call to
         * WHvSetVirtualProcessorRegisters() and must be re-queried from
         * the target.
         */
        WHV_REGISTER_VALUE reg_value;
        WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
        HRESULT hr;
        struct whpx_state *whpx = &whpx_global;

        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            /* NOTE(review): 0 is returned as a fallback PC on failure. */
            error_report("WHPX: Failed to get PC, hr=%08lx", hr);
            return 0;
        }

        return reg_value.Reg64;
    }
}
1430 
/*
 * Handles a HLT exit: if no wake-up event (unmasked hard interrupt or NMI)
 * is pending, the VCPU is marked halted and 1 is returned; otherwise 0.
 */
static int whpx_handle_halt(CPUState *cpu)
{
    bool wake_event;
    int ret = 0;

    bql_lock();
    wake_event = ((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
                  (cpu_env(cpu)->eflags & IF_MASK)) ||
                 (cpu->interrupt_request & CPU_INTERRUPT_NMI);
    if (!wake_event) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        ret = 1;
    }
    bql_unlock();

    return ret;
}
1447 
/*
 * Prepares the VCPU state before (re-)entering WHvRunVirtualProcessor():
 * injects pending NMI/interrupt events, syncs the TPR to CR8, and registers
 * for an interrupt-window notification when one is needed. Up to three
 * registers are batched into a single WHvSetVirtualProcessorRegisters call.
 */
static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    /* At most 3 registers are written below: event/interruption, CR8,
     * and the deliverability-notification register. */
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    bql_lock();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        /* NOTE(review): SMI requests are acknowledged but not delivered. */
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        /* QEMU-emulated APIC: deliver PIC interrupts ourselves. */
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        /* In-hypervisor APIC: post the PIC interrupt as an ExtInt event. */
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
    }

    /* Sync the TPR to the CR8 if was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    bql_unlock();
    vcpu->ready_for_pic_interrupt = false;

    /* Commit all collected register writes in one hypervisor call. */
    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }
}
1569 
/*
 * Imports the per-exit VP state reported in the exit context back into
 * QEMU's view of the CPU: rflags, TPR (via CR8), and interruptibility.
 */
static void whpx_vcpu_post_run(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;
    uint64_t new_tpr;

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    /* If the guest changed CR8 while running, propagate it to the APIC. */
    new_tpr = vcpu->exit_ctx.VpContext.Cr8;
    if (new_tpr != vcpu->tpr) {
        vcpu->tpr = new_tpr;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        bql_unlock();
    }

    /* Cache interruptibility state for the next whpx_vcpu_pre_run(). */
    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;
    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
}
1592 
/*
 * Services asynchronous per-CPU events (INIT, APIC poll, wake-from-halt,
 * SIPI, TPR access reporting) before the VCPU is run.
 */
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;

    /* INIT is ignored while in SMM. */
    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    /* Wake a halted CPU on an unmasked hard interrupt or an NMI. */
    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    /* Report a pending TPR access back to the APIC. */
    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }
}
1629 
whpx_vcpu_run(CPUState * cpu)1630 static int whpx_vcpu_run(CPUState *cpu)
1631 {
1632     HRESULT hr;
1633     struct whpx_state *whpx = &whpx_global;
1634     AccelCPUState *vcpu = cpu->accel;
1635     struct whpx_breakpoint *stepped_over_bp = NULL;
1636     WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
1637     int ret;
1638 
1639     g_assert(bql_locked());
1640 
1641     if (whpx->running_cpus++ == 0) {
1642         /* Insert breakpoints into memory, update exception exit bitmap. */
1643         ret = whpx_first_vcpu_starting(cpu);
1644         if (ret != 0) {
1645             return ret;
1646         }
1647     }
1648 
1649     if (whpx->breakpoints.breakpoints &&
1650         whpx->breakpoints.breakpoints->used > 0)
1651     {
1652         uint64_t pc = whpx_vcpu_get_pc(cpu, true);
1653         stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
1654         if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
1655             stepped_over_bp = NULL;
1656         }
1657 
1658         if (stepped_over_bp) {
1659             /*
1660              * We are trying to run the instruction overwritten by an active
1661              * breakpoint. We will temporarily disable the breakpoint, suspend
1662              * other CPUs, and step over the instruction.
1663              */
1664             exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
1665         }
1666     }
1667 
1668     if (exclusive_step_mode == WHPX_STEP_NONE) {
1669         whpx_vcpu_process_async_events(cpu);
1670         if (cpu->halted && !whpx_apic_in_platform()) {
1671             cpu->exception_index = EXCP_HLT;
1672             qatomic_set(&cpu->exit_request, false);
1673             return 0;
1674         }
1675     }
1676 
1677     bql_unlock();
1678 
1679     if (exclusive_step_mode != WHPX_STEP_NONE) {
1680         start_exclusive();
1681         g_assert(cpu == current_cpu);
1682         g_assert(!cpu->running);
1683         cpu->running = true;
1684 
1685         hr = whpx_set_exception_exit_bitmap(
1686             1UL << WHvX64ExceptionTypeDebugTrapOrFault);
1687         if (!SUCCEEDED(hr)) {
1688             error_report("WHPX: Failed to update exception exit mask, "
1689                          "hr=%08lx.", hr);
1690             return 1;
1691         }
1692 
1693         if (stepped_over_bp) {
1694             /* Temporarily disable the triggered breakpoint. */
1695             cpu_memory_rw_debug(cpu,
1696                 stepped_over_bp->address,
1697                 &stepped_over_bp->original_instruction,
1698                 1,
1699                 true);
1700         }
1701     } else {
1702         cpu_exec_start(cpu);
1703     }
1704 
1705     do {
1706         if (cpu->vcpu_dirty) {
1707             whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
1708             cpu->vcpu_dirty = false;
1709         }
1710 
1711         if (exclusive_step_mode == WHPX_STEP_NONE) {
1712             whpx_vcpu_pre_run(cpu);
1713 
1714             if (qatomic_read(&cpu->exit_request)) {
1715                 whpx_vcpu_kick(cpu);
1716             }
1717         }
1718 
1719         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1720             whpx_vcpu_configure_single_stepping(cpu, true, NULL);
1721         }
1722 
1723         hr = whp_dispatch.WHvRunVirtualProcessor(
1724             whpx->partition, cpu->cpu_index,
1725             &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));
1726 
1727         if (FAILED(hr)) {
1728             error_report("WHPX: Failed to exec a virtual processor,"
1729                          " hr=%08lx", hr);
1730             ret = -1;
1731             break;
1732         }
1733 
1734         if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
1735             whpx_vcpu_configure_single_stepping(cpu,
1736                 false,
1737                 &vcpu->exit_ctx.VpContext.Rflags);
1738         }
1739 
1740         whpx_vcpu_post_run(cpu);
1741 
1742         switch (vcpu->exit_ctx.ExitReason) {
1743         case WHvRunVpExitReasonMemoryAccess:
1744             ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
1745             break;
1746 
1747         case WHvRunVpExitReasonX64IoPortAccess:
1748             ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
1749             break;
1750 
1751         case WHvRunVpExitReasonX64InterruptWindow:
1752             vcpu->ready_for_pic_interrupt = 1;
1753             vcpu->window_registered = 0;
1754             ret = 0;
1755             break;
1756 
1757         case WHvRunVpExitReasonX64ApicEoi:
1758             assert(whpx_apic_in_platform());
1759             ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
1760             break;
1761 
1762         case WHvRunVpExitReasonX64Halt:
1763             /*
1764              * WARNING: as of build 19043.1526 (21H1), this exit reason is no
1765              * longer used.
1766              */
1767             ret = whpx_handle_halt(cpu);
1768             break;
1769 
1770         case WHvRunVpExitReasonX64ApicInitSipiTrap: {
1771             WHV_INTERRUPT_CONTROL ipi = {0};
1772             uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
1773             uint32_t delivery_mode =
1774                 (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
1775             int dest_shorthand =
1776                 (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
1777             bool broadcast = false;
1778             bool include_self = false;
1779             uint32_t i;
1780 
1781             /* We only registered for INIT and SIPI exits. */
1782             if ((delivery_mode != APIC_DM_INIT) &&
1783                 (delivery_mode != APIC_DM_SIPI)) {
1784                 error_report(
1785                     "WHPX: Unexpected APIC exit that is not a INIT or SIPI");
1786                 break;
1787             }
1788 
1789             if (delivery_mode == APIC_DM_INIT) {
1790                 ipi.Type = WHvX64InterruptTypeInit;
1791             } else {
1792                 ipi.Type = WHvX64InterruptTypeSipi;
1793             }
1794 
1795             ipi.DestinationMode =
1796                 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1797                     WHvX64InterruptDestinationModeLogical :
1798                     WHvX64InterruptDestinationModePhysical;
1799 
1800             ipi.TriggerMode =
1801                 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1802                     WHvX64InterruptTriggerModeLevel :
1803                     WHvX64InterruptTriggerModeEdge;
1804 
1805             ipi.Vector = icr & APIC_VECTOR_MASK;
1806             switch (dest_shorthand) {
1807             /* no shorthand. Bits 56-63 contain the destination. */
1808             case 0:
1809                 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1810                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1811                         &ipi, sizeof(ipi));
1812                 if (FAILED(hr)) {
1813                     error_report("WHPX: Failed to request interrupt  hr=%08lx",
1814                         hr);
1815                 }
1816 
1817                 break;
1818 
1819             /* self */
1820             case 1:
1821                 include_self = true;
1822                 break;
1823 
1824             /* broadcast, including self */
1825             case 2:
1826                 broadcast = true;
1827                 include_self = true;
1828                 break;
1829 
1830             /* broadcast, excluding self */
1831             case 3:
1832                 broadcast = true;
1833                 break;
1834             }
1835 
1836             if (!broadcast && !include_self) {
1837                 break;
1838             }
1839 
1840             for (i = 0; i <= max_vcpu_index; i++) {
1841                 if (i == cpu->cpu_index && !include_self) {
1842                     continue;
1843                 }
1844 
1845                 /*
1846                  * Assuming that APIC Ids are identity mapped since
1847                  * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1848                  * are not handled yet and the hypervisor doesn't allow the
1849                  * guest to modify the APIC ID.
1850                  */
1851                 ipi.Destination = i;
1852                 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1853                         &ipi, sizeof(ipi));
1854                 if (FAILED(hr)) {
1855                     error_report(
1856                         "WHPX: Failed to request SIPI for %d,  hr=%08lx",
1857                         i, hr);
1858                 }
1859             }
1860 
1861             break;
1862         }
1863 
1864         case WHvRunVpExitReasonCanceled:
1865             if (exclusive_step_mode != WHPX_STEP_NONE) {
1866                 /*
1867                  * We are trying to step over a single instruction, and
1868                  * likely got a request to stop from another thread.
1869                  * Delay it until we are done stepping
1870                  * over.
1871                  */
1872                 ret = 0;
1873             } else {
1874                 cpu->exception_index = EXCP_INTERRUPT;
1875                 ret = 1;
1876             }
1877             break;
1878         case WHvRunVpExitReasonX64MsrAccess: {
1879             WHV_REGISTER_VALUE reg_values[3] = {0};
1880             WHV_REGISTER_NAME reg_names[3];
1881             UINT32 reg_count;
1882 
1883             reg_names[0] = WHvX64RegisterRip;
1884             reg_names[1] = WHvX64RegisterRax;
1885             reg_names[2] = WHvX64RegisterRdx;
1886 
1887             reg_values[0].Reg64 =
1888                 vcpu->exit_ctx.VpContext.Rip +
1889                 vcpu->exit_ctx.VpContext.InstructionLength;
1890 
1891             /*
1892              * For all unsupported MSR access we:
1893              *     ignore writes
1894              *     return 0 on read.
1895              */
1896             reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1897                         1 : 3;
1898 
1899             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1900                 whpx->partition,
1901                 cpu->cpu_index,
1902                 reg_names, reg_count,
1903                 reg_values);
1904 
1905             if (FAILED(hr)) {
1906                 error_report("WHPX: Failed to set MsrAccess state "
1907                              " registers, hr=%08lx", hr);
1908             }
1909             ret = 0;
1910             break;
1911         }
1912         case WHvRunVpExitReasonX64Cpuid: {
1913             WHV_REGISTER_VALUE reg_values[5];
1914             WHV_REGISTER_NAME reg_names[5];
1915             UINT32 reg_count = 5;
1916             UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1917             X86CPU *x86_cpu = X86_CPU(cpu);
1918             CPUX86State *env = &x86_cpu->env;
1919 
1920             memset(reg_values, 0, sizeof(reg_values));
1921 
1922             rip = vcpu->exit_ctx.VpContext.Rip +
1923                   vcpu->exit_ctx.VpContext.InstructionLength;
1924             cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1925 
1926             /*
1927              * Ideally, these should be supplied to the hypervisor during VCPU
1928              * initialization and it should be able to satisfy this request.
1929              * But, currently, WHPX doesn't support setting CPUID values in the
1930              * hypervisor once the partition has been setup, which is too late
1931              * since VCPUs are realized later. For now, use the values from
1932              * QEMU to satisfy these requests, until WHPX adds support for
1933              * being able to set these values in the hypervisor at runtime.
1934              */
1935             cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1936                 (UINT32 *)&rcx, (UINT32 *)&rdx);
1937             switch (cpuid_fn) {
1938             case 0x40000000:
1939                 /* Expose the vmware cpu frequency cpuid leaf */
1940                 rax = 0x40000010;
1941                 rbx = rcx = rdx = 0;
1942                 break;
1943 
1944             case 0x40000010:
1945                 rax = env->tsc_khz;
1946                 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1947                 rcx = rdx = 0;
1948                 break;
1949 
1950             case 0x80000001:
1951                 /* Remove any support of OSVW */
1952                 rcx &= ~CPUID_EXT3_OSVW;
1953                 break;
1954             }
1955 
1956             reg_names[0] = WHvX64RegisterRip;
1957             reg_names[1] = WHvX64RegisterRax;
1958             reg_names[2] = WHvX64RegisterRcx;
1959             reg_names[3] = WHvX64RegisterRdx;
1960             reg_names[4] = WHvX64RegisterRbx;
1961 
1962             reg_values[0].Reg64 = rip;
1963             reg_values[1].Reg64 = rax;
1964             reg_values[2].Reg64 = rcx;
1965             reg_values[3].Reg64 = rdx;
1966             reg_values[4].Reg64 = rbx;
1967 
1968             hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1969                 whpx->partition, cpu->cpu_index,
1970                 reg_names,
1971                 reg_count,
1972                 reg_values);
1973 
1974             if (FAILED(hr)) {
1975                 error_report("WHPX: Failed to set CpuidAccess state registers,"
1976                              " hr=%08lx", hr);
1977             }
1978             ret = 0;
1979             break;
1980         }
1981         case WHvRunVpExitReasonException:
1982             whpx_get_registers(cpu);
1983 
1984             if ((vcpu->exit_ctx.VpException.ExceptionType ==
1985                  WHvX64ExceptionTypeDebugTrapOrFault) &&
1986                 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
1987                 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
1988                  whpx_breakpoint_instruction)) {
1989                 /* Stopped at a software breakpoint. */
1990                 cpu->exception_index = EXCP_DEBUG;
1991             } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
1992                         WHvX64ExceptionTypeDebugTrapOrFault) &&
1993                        !cpu->singlestep_enabled) {
1994                 /*
1995                  * Just finished stepping over a breakpoint, but the
1996                  * gdb does not expect us to do single-stepping.
1997                  * Don't do anything special.
1998                  */
1999                 cpu->exception_index = EXCP_INTERRUPT;
2000             } else {
2001                 /* Another exception or debug event. Report it to GDB. */
2002                 cpu->exception_index = EXCP_DEBUG;
2003             }
2004 
2005             ret = 1;
2006             break;
2007         case WHvRunVpExitReasonNone:
2008         case WHvRunVpExitReasonUnrecoverableException:
2009         case WHvRunVpExitReasonInvalidVpRegisterValue:
2010         case WHvRunVpExitReasonUnsupportedFeature:
2011         default:
2012             error_report("WHPX: Unexpected VP exit code %d",
2013                          vcpu->exit_ctx.ExitReason);
2014             whpx_get_registers(cpu);
2015             bql_lock();
2016             qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2017             bql_unlock();
2018             break;
2019         }
2020 
2021     } while (!ret);
2022 
2023     if (stepped_over_bp) {
2024         /* Restore the breakpoint we stepped over */
2025         cpu_memory_rw_debug(cpu,
2026             stepped_over_bp->address,
2027             (void *)&whpx_breakpoint_instruction,
2028             1,
2029             true);
2030     }
2031 
2032     if (exclusive_step_mode != WHPX_STEP_NONE) {
2033         g_assert(cpu_in_exclusive_context(cpu));
2034         cpu->running = false;
2035         end_exclusive();
2036 
2037         exclusive_step_mode = WHPX_STEP_NONE;
2038     } else {
2039         cpu_exec_end(cpu);
2040     }
2041 
2042     bql_lock();
2043     current_cpu = cpu;
2044 
2045     if (--whpx->running_cpus == 0) {
2046         whpx_last_vcpu_stopping(cpu);
2047     }
2048 
2049     qatomic_set(&cpu->exit_request, false);
2050 
2051     return ret < 0;
2052 }
2053 
/*
 * Pull the vCPU register state out of WHPX into QEMU's CPUState, unless
 * the QEMU-side copy is already the authoritative one.
 */
static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
{
    if (cpu->vcpu_dirty) {
        return;
    }

    whpx_get_registers(cpu);
    cpu->vcpu_dirty = true;
}
2061 
/* Push QEMU's reset-time register state back into the hypervisor vCPU. */
static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
    /* Hypervisor copy is now current again. */
    cpu->vcpu_dirty = false;
}
2068 
/* Push the complete register state into the hypervisor after vCPU init. */
static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
                                              run_on_cpu_data arg)
{
    whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
    /* Hypervisor copy is now current again. */
    cpu->vcpu_dirty = false;
}
2075 
/*
 * Mark the QEMU-side register copy as authoritative before loading a VM
 * snapshot, so the incoming state is pushed to WHPX on the next run.
 */
static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
                                               run_on_cpu_data arg)
{
    cpu->vcpu_dirty = true;
}
2081 
2082 /*
2083  * CPU support.
2084  */
2085 
/* Synchronize registers from WHPX on the vCPU's own thread, if needed. */
void whpx_cpu_synchronize_state(CPUState *cpu)
{
    if (cpu->vcpu_dirty) {
        /* QEMU already holds the authoritative copy. */
        return;
    }

    run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
}
2092 
/* Run the post-reset register push on the vCPU's own thread. */
void whpx_cpu_synchronize_post_reset(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
}
2097 
/* Run the post-init full-state push on the vCPU's own thread. */
void whpx_cpu_synchronize_post_init(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
}
2102 
/* Mark state dirty ahead of loadvm, on the vCPU's own thread. */
void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
{
    run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
}
2107 
/*
 * Record whether the debugger wants single-stepping once vCPUs resume;
 * consumed later by the run loop via whpx_global.step_pending.
 */
void whpx_cpu_synchronize_pre_resume(bool step_pending)
{
    whpx_global.step_pending = step_pending;
}
2112 
2113 /*
2114  * Vcpu support.
2115  */
2116 
2117 static Error *whpx_migration_blocker;
2118 
/* VM run-state change handler: invalidate the cached TSC on (re)start. */
static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *env = opaque;

    if (!running) {
        return;
    }

    env->tsc_valid = false;
}
2127 
/*
 * Create and initialize the WHPX virtual processor backing @cpu.
 *
 * Registers a one-time migration blocker, sets up the instruction
 * emulator, creates the hypervisor vCPU, queries the TSC and APIC-bus
 * frequencies, and optionally arranges for the vmware cpuid frequency
 * leaves to be intercepted.
 *
 * Returns 0 on success, a negative errno value on failure.  On failure,
 * all partially-acquired resources are released.
 */
int whpx_init_vcpu(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = NULL;
    Error *local_error = NULL;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    UINT64 freq = 0;
    int ret;

    /*
     * Add migration blockers for all unsupported features of the
     * Windows Hypervisor Platform.  Registered once, on the first vCPU.
     */
    if (whpx_migration_blocker == NULL) {
        error_setg(&whpx_migration_blocker,
               "State blocked due to non-migratable CPUID feature support,"
               "dirty memory tracking support, and XSAVE/XRSTOR support");

        if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
            error_report_err(local_error);
            ret = -EINVAL;
            goto error;
        }
    }

    vcpu = g_new0(AccelCPUState, 1);

    hr = whp_dispatch.WHvEmulatorCreateEmulator(
        &whpx_emu_callbacks,
        &vcpu->emulator);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup instruction completion support,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreateVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create a virtual processor,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error_destroy_emulator;
    }

    /*
     * vcpu's TSC frequency is either specified by user, or use the value
     * provided by Hyper-V if the former is not present. In the latter case, we
     * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
     * frequency can be migrated later via this field.
     */
    if (!env->tsc_khz) {
        hr = whp_dispatch.WHvGetCapability(
            WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
                NULL);
        /* Older Windows builds simply don't expose this capability. */
        if (hr != WHV_E_UNKNOWN_CAPABILITY) {
            if (FAILED(hr)) {
                printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
            } else {
                env->tsc_khz = freq / 1000; /* Hz to KHz */
            }
        }
    }

    /* Fall back to a fixed APIC bus frequency if the query is unsupported. */
    env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
    if (hr != WHV_E_UNKNOWN_CAPABILITY) {
        if (FAILED(hr)) {
            printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
        } else {
            env->apic_bus_freq = freq;
        }
    }

    /*
     * If the vmware cpuid frequency leaf option is set, and we have a valid
     * tsc value, trap the corresponding cpuid's.
     */
    if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
        UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};

        hr = whp_dispatch.WHvSetPartitionProperty(
                whpx->partition,
                WHvPartitionPropertyCodeCpuidExitList,
                cpuidExitList,
                RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
                        hr);
            ret = -EINVAL;
            /*
             * Previously this path leaked both the virtual processor and
             * the emulator; unwind them explicitly.
             */
            goto error_delete_vcpu;
        }
    }

    vcpu->interruptable = true;
    cpu->vcpu_dirty = true;
    cpu->accel = vcpu;
    max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
    qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);

    return 0;

    /* Unwind in reverse order of acquisition. */
error_delete_vcpu:
    whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
error_destroy_emulator:
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
error:
    g_free(vcpu);

    return ret;
}
2240 
/*
 * Top-level execution loop for one vCPU: keep running it until an
 * exception/interrupt index is posted, then return that index.
 * A fatal failure from whpx_vcpu_run() aborts the process.
 */
int whpx_vcpu_exec(CPUState *cpu)
{
    int ret;

    do {
        if (cpu->exception_index >= EXCP_INTERRUPT) {
            /* Consume the pending event and hand it to the caller. */
            ret = cpu->exception_index;
            cpu->exception_index = -1;
            break;
        }

        if (whpx_vcpu_run(cpu)) {
            error_report("WHPX: Failed to exec a virtual processor");
            abort();
        }
    } while (true);

    return ret;
}
2263 
/* Tear down the hypervisor vCPU, its emulator, and the accel state. */
void whpx_destroy_vcpu(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;

    whp_dispatch.WHvDeleteVirtualProcessor(whpx_global.partition,
                                           cpu->cpu_index);
    whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
    g_free(vcpu);
}
2273 
/* Force the vCPU out of WHvRunVirtualProcessor() so it re-checks state. */
void whpx_vcpu_kick(CPUState *cpu)
{
    whp_dispatch.WHvCancelRunVirtualProcessor(whpx_global.partition,
                                              cpu->cpu_index, 0);
}
2280 
2281 /*
2282  * Memory support.
2283  */
2284 
/*
 * Map (@add != 0) or unmap a guest-physical range in the WHPX partition.
 * ROM ranges are mapped without write permission.  Failures are reported
 * but not propagated.
 */
static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
                                void *host_va, int add, int rom,
                                const char *name)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    if (add) {
        WHV_MAP_GPA_RANGE_FLAGS flags = WHvMapGpaRangeFlagRead |
                                        WHvMapGpaRangeFlagExecute;

        if (!rom) {
            flags |= WHvMapGpaRangeFlagWrite;
        }

        hr = whp_dispatch.WHvMapGpaRange(whpx->partition, host_va,
                                         start_pa, size, flags);
    } else {
        hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition, start_pa, size);
    }

    if (FAILED(hr)) {
        error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
                     " Host:%p, hr=%08lx",
                     (add ? "MAP" : "UNMAP"), name,
                     (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
    }
}
2324 
/*
 * Map or unmap (@add) the RAM backing a memory section into the WHPX
 * partition, after clamping the range to host-page alignment.  Sections
 * that are not RAM, or that shrink to nothing after alignment, are
 * silently skipped.
 */
static void whpx_process_section(MemoryRegionSection *section, int add)
{
    MemoryRegion *mr = section->mr;
    hwaddr start_pa = section->offset_within_address_space;
    ram_addr_t size = int128_get64(section->size);
    unsigned int delta;
    uint64_t host_va;

    /* Only actual RAM regions can be mapped into the partition. */
    if (!memory_region_is_ram(mr)) {
        return;
    }

    /*
     * delta = number of bytes from start_pa up to the next host-page
     * boundary (0 if already aligned); used to round the start up.
     */
    delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
    delta &= ~qemu_real_host_page_mask();
    if (delta > size) {
        return;
    }
    start_pa += delta;
    size -= delta;
    /* Truncate the size down to a whole number of host pages. */
    size &= qemu_real_host_page_mask();
    if (!size || (start_pa & ~qemu_real_host_page_mask())) {
        return;
    }

    /* Host virtual address of the (aligned) start of the section. */
    host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
            + section->offset_within_region + delta;

    whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
                        memory_region_is_rom(mr), mr->name);
}
2355 
/* MemoryListener hook: a region became visible; reference it and map it. */
static void whpx_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    /* Hold a reference for as long as the mapping exists. */
    memory_region_ref(section->mr);
    whpx_process_section(section, 1);
}
2362 
/* MemoryListener hook: a region went away; unmap it, then drop the ref. */
static void whpx_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    whpx_process_section(section, 0);
    memory_region_unref(section->mr);
}
2369 
/*
 * Nothing to do at transaction start: mappings are applied eagerly in
 * whpx_region_add()/whpx_region_del(), not batched.
 */
static void whpx_transaction_begin(MemoryListener *listener)
{
}
2373 
/* Nothing to commit: mapping updates were already applied per-region. */
static void whpx_transaction_commit(MemoryListener *listener)
{
}
2377 
/*
 * Dirty-log sync hook.  Fine-grained dirty tracking is not wired up for
 * WHPX (it is listed among the migration blockers), so conservatively
 * mark the whole RAM section dirty.
 */
static void whpx_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    MemoryRegion *mr = section->mr;

    if (memory_region_is_ram(mr)) {
        memory_region_set_dirty(mr, 0, int128_get64(section->size));
    }
}
2389 
/*
 * Memory listener: mirrors QEMU's RAM layout into the WHPX partition's
 * guest-physical address space and services dirty-log syncs.
 */
static MemoryListener whpx_memory_listener = {
    .name = "whpx",
    .begin = whpx_transaction_begin,
    .commit = whpx_transaction_commit,
    .region_add = whpx_region_add,
    .region_del = whpx_region_del,
    .log_sync = whpx_log_sync,
    .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
};
2399 
whpx_memory_init(void)2400 static void whpx_memory_init(void)
2401 {
2402     memory_listener_register(&whpx_memory_listener, &address_space_memory);
2403 }
2404 
2405 /*
2406  * Load the functions from the given library, using the given handle. If a
2407  * handle is provided, it is used, otherwise the library is opened. The
2408  * handle will be updated on return with the opened one.
2409  */
/*
 * Populate whp_dispatch with the entry points selected by @function_list,
 * loading the owning DLL on demand.  On success, *handle receives the
 * (possibly newly opened) module handle and true is returned; on failure
 * the handle is freed and false is returned.
 */
static bool load_whp_dispatch_fns(HMODULE *handle,
    WHPFunctionList function_list)
{
    HMODULE hLib = *handle;

    #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
    #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
    /* Optional symbol: leave the dispatch slot NULL if not exported. */
    #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \

    /* Mandatory symbol: fail the whole load if it is missing. */
    #define WHP_LOAD_FIELD(return_type, function_name, signature) \
        whp_dispatch.function_name = \
            (function_name ## _t)GetProcAddress(hLib, #function_name); \
        if (!whp_dispatch.function_name) { \
            error_report("Could not load function %s", #function_name); \
            goto error; \
        } \

    /* Open the library only if the caller didn't pass an open handle. */
    #define WHP_LOAD_LIB(lib_name, handle_lib) \
    if (!handle_lib) { \
        handle_lib = LoadLibrary(lib_name); \
        if (!handle_lib) { \
            error_report("Could not load library %s.", lib_name); \
            goto error; \
        } \
    } \

    switch (function_list) {
    case WINHV_PLATFORM_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_EMULATION_FNS_DEFAULT:
        WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
        LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
        break;

    case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
        WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
        LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
        break;
    }

    *handle = hLib;
    return true;

error:
    /*
     * NOTE(review): if the caller passed in an already-open handle, this
     * frees it without clearing *handle, leaving the caller's copy
     * dangling -- confirm callers never reuse the handle after failure.
     */
    if (hLib) {
        FreeLibrary(hLib);
    }

    return false;
}
2465 
/*
 * QOM property setter for "kernel-irqchip".  Accepts on/off; "split"
 * is rejected because WHPX has no split-irqchip support.
 */
static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
                                   const char *name, void *opaque,
                                   Error **errp)
{
    struct whpx_state *whpx = &whpx_global;
    OnOffSplit mode;

    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
        return;
    }

    if (mode == ON_OFF_SPLIT_ON) {
        whpx->kernel_irqchip_allowed = true;
        whpx->kernel_irqchip_required = true;
    } else if (mode == ON_OFF_SPLIT_OFF) {
        whpx->kernel_irqchip_allowed = false;
        whpx->kernel_irqchip_required = false;
    } else if (mode == ON_OFF_SPLIT_SPLIT) {
        error_setg(errp, "WHPX: split irqchip currently not supported");
        error_append_hint(errp,
            "Try without kernel-irqchip or with kernel-irqchip=on|off");
    } else {
        /*
         * The value was checked in visit_type_OnOffSplit() above. If
         * we get here, then something is wrong in QEMU.
         */
        abort();
    }
}
2502 
2503 /*
2504  * Partition support
2505  */
2506 
whpx_accel_init(AccelState * as,MachineState * ms)2507 static int whpx_accel_init(AccelState *as, MachineState *ms)
2508 {
2509     struct whpx_state *whpx;
2510     int ret;
2511     HRESULT hr;
2512     WHV_CAPABILITY whpx_cap;
2513     UINT32 whpx_cap_size;
2514     WHV_PARTITION_PROPERTY prop;
2515     UINT32 cpuidExitList[] = {1, 0x80000001};
2516     WHV_CAPABILITY_FEATURES features = {0};
2517 
2518     whpx = &whpx_global;
2519 
2520     if (!init_whp_dispatch()) {
2521         ret = -ENOSYS;
2522         goto error;
2523     }
2524 
2525     whpx->mem_quota = ms->ram_size;
2526 
2527     hr = whp_dispatch.WHvGetCapability(
2528         WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2529         sizeof(whpx_cap), &whpx_cap_size);
2530     if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2531         error_report("WHPX: No accelerator found, hr=%08lx", hr);
2532         ret = -ENOSPC;
2533         goto error;
2534     }
2535 
2536     hr = whp_dispatch.WHvGetCapability(
2537         WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2538     if (FAILED(hr)) {
2539         error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2540         ret = -EINVAL;
2541         goto error;
2542     }
2543 
2544     hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2545     if (FAILED(hr)) {
2546         error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2547         ret = -EINVAL;
2548         goto error;
2549     }
2550 
2551     /*
2552      * Query the XSAVE capability of the partition. Any error here is not
2553      * considered fatal.
2554      */
2555     hr = whp_dispatch.WHvGetPartitionProperty(
2556         whpx->partition,
2557         WHvPartitionPropertyCodeProcessorXsaveFeatures,
2558         &whpx_xsave_cap,
2559         sizeof(whpx_xsave_cap),
2560         &whpx_cap_size);
2561 
2562     /*
2563      * Windows version which don't support this property will return with the
2564      * specific error code.
2565      */
2566     if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
2567         error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
2568     }
2569 
2570     if (!whpx_has_xsave()) {
2571         printf("WHPX: Partition is not XSAVE capable\n");
2572     }
2573 
2574     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2575     prop.ProcessorCount = ms->smp.cpus;
2576     hr = whp_dispatch.WHvSetPartitionProperty(
2577         whpx->partition,
2578         WHvPartitionPropertyCodeProcessorCount,
2579         &prop,
2580         sizeof(WHV_PARTITION_PROPERTY));
2581 
2582     if (FAILED(hr)) {
2583         error_report("WHPX: Failed to set partition processor count to %u,"
2584                      " hr=%08lx", prop.ProcessorCount, hr);
2585         ret = -EINVAL;
2586         goto error;
2587     }
2588 
2589     /*
2590      * Error out if WHP doesn't support apic emulation and user is requiring
2591      * it.
2592      */
2593     if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2594             !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2595         error_report("WHPX: kernel irqchip requested, but unavailable. "
2596             "Try without kernel-irqchip or with kernel-irqchip=off");
2597         ret = -EINVAL;
2598         goto error;
2599     }
2600 
2601     if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2602         whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2603         WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2604             WHvX64LocalApicEmulationModeXApic;
2605         printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2606         hr = whp_dispatch.WHvSetPartitionProperty(
2607             whpx->partition,
2608             WHvPartitionPropertyCodeLocalApicEmulationMode,
2609             &mode,
2610             sizeof(mode));
2611         if (FAILED(hr)) {
2612             error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2613             if (whpx->kernel_irqchip_required) {
2614                 error_report("WHPX: kernel irqchip requested, but unavailable");
2615                 ret = -EINVAL;
2616                 goto error;
2617             }
2618         } else {
2619             whpx->apic_in_platform = true;
2620         }
2621     }
2622 
2623     /* Register for MSR and CPUID exits */
2624     memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2625     prop.ExtendedVmExits.X64MsrExit = 1;
2626     prop.ExtendedVmExits.X64CpuidExit = 1;
2627     prop.ExtendedVmExits.ExceptionExit = 1;
2628     if (whpx_apic_in_platform()) {
2629         prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2630     }
2631 
2632     hr = whp_dispatch.WHvSetPartitionProperty(
2633             whpx->partition,
2634             WHvPartitionPropertyCodeExtendedVmExits,
2635             &prop,
2636             sizeof(WHV_PARTITION_PROPERTY));
2637     if (FAILED(hr)) {
2638         error_report("WHPX: Failed to enable MSR & CPUIDexit, hr=%08lx", hr);
2639         ret = -EINVAL;
2640         goto error;
2641     }
2642 
2643     hr = whp_dispatch.WHvSetPartitionProperty(
2644         whpx->partition,
2645         WHvPartitionPropertyCodeCpuidExitList,
2646         cpuidExitList,
2647         RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2648 
2649     if (FAILED(hr)) {
2650         error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2651                      hr);
2652         ret = -EINVAL;
2653         goto error;
2654     }
2655 
2656     /*
2657      * We do not want to intercept any exceptions from the guest,
2658      * until we actually start debugging with gdb.
2659      */
2660     whpx->exception_exit_bitmap = -1;
2661     hr = whpx_set_exception_exit_bitmap(0);
2662 
2663     if (FAILED(hr)) {
2664         error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2665         ret = -EINVAL;
2666         goto error;
2667     }
2668 
2669     hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2670     if (FAILED(hr)) {
2671         error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
2672         ret = -EINVAL;
2673         goto error;
2674     }
2675 
2676     whpx_memory_init();
2677 
2678     printf("Windows Hypervisor Platform accelerator is operational\n");
2679     return 0;
2680 
2681 error:
2682 
2683     if (NULL != whpx->partition) {
2684         whp_dispatch.WHvDeletePartition(whpx->partition);
2685         whpx->partition = NULL;
2686     }
2687 
2688     return ret;
2689 }
2690 
whpx_apic_in_platform(void)2691 bool whpx_apic_in_platform(void) {
2692     return whpx_global.apic_in_platform;
2693 }
2694 
/*
 * QOM class initializer for the WHPX accelerator: installs the machine
 * init hook and registers the "kernel-irqchip" accelerator property.
 */
static void whpx_accel_class_init(ObjectClass *oc, const void *data)
{
    AccelClass *ac = ACCEL_CLASS(oc);

    ac->name = "WHPX";
    ac->init_machine = whpx_accel_init;
    ac->allowed = &whpx_allowed;

    /* Writable only; the setting is consumed once during accel init. */
    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
                              NULL, whpx_set_kernel_irqchip,
                              NULL, NULL);
    object_class_property_set_description(oc, "kernel-irqchip",
                                          "Configure WHPX in-kernel irqchip");
}
2708 
/*
 * Per-instance initializer: reset the (single) global WHPX state and
 * apply defaults. There is only ever one accelerator instance, so the
 * global is safe to clear here.
 */
static void whpx_accel_instance_init(Object *obj)
{
    struct whpx_state *whpx = &whpx_global;

    *whpx = (struct whpx_state){ 0 };
    /* kernel-irqchip is allowed (but not required) by default. */
    whpx->kernel_irqchip_allowed = true;
}
2717 
/* QOM type record for the WHPX accelerator (selected with -accel whpx). */
static const TypeInfo whpx_accel_type = {
    .name = ACCEL_CLASS_NAME("whpx"),
    .parent = TYPE_ACCEL,
    .instance_init = whpx_accel_instance_init,
    .class_init = whpx_accel_class_init,
};
2724 
/* Register the WHPX accelerator type with the QOM type system. */
static void whpx_type_init(void)
{
    type_register_static(&whpx_accel_type);
}
2729 
init_whp_dispatch(void)2730 bool init_whp_dispatch(void)
2731 {
2732     if (whp_dispatch_initialized) {
2733         return true;
2734     }
2735 
2736     if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2737         goto error;
2738     }
2739 
2740     if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2741         goto error;
2742     }
2743 
2744     assert(load_whp_dispatch_fns(&hWinHvPlatform,
2745         WINHV_PLATFORM_FNS_SUPPLEMENTAL));
2746     whp_dispatch_initialized = true;
2747 
2748     return true;
2749 error:
2750     if (hWinHvPlatform) {
2751         FreeLibrary(hWinHvPlatform);
2752     }
2753 
2754     if (hWinHvEmulation) {
2755         FreeLibrary(hWinHvEmulation);
2756     }
2757 
2758     return false;
2759 }
2760 
/* Hook the WHPX type registration into QEMU's module init sequence. */
type_init(whpx_type_init);
2762