/*
 * QEMU Windows Hypervisor Platform accelerator (WHPX)
 *
 * Copyright Microsoft Corp. 2017
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "system/address-spaces.h"
#include "system/ioport.h"
#include "gdbstub/helpers.h"
#include "qemu/accel.h"
#include "system/whpx.h"
#include "system/cpus.h"
#include "system/runstate.h"
#include "qemu/main-loop.h"
#include "hw/boards.h"
#include "hw/intc/ioapic.h"
#include "hw/i386/apic_internal.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "migration/blocker.h"
#include <winerror.h>

#include "whpx-internal.h"
#include "whpx-accel-ops.h"

#include <winhvplatform.h>
#include <winhvemulation.h>

#define HYPERV_APIC_BUS_FREQUENCY (200000000ULL)

static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};

struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};
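
/*
 * The values[] array above is index-parallel to whpx_register_names[]:
 * whpx_set_registers() and whpx_get_registers() below walk both in lockstep,
 * asserting at each checkpoint that the expected register name is found at
 * the current index.
 */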

/*
 * The current implementation of instruction stepping sets the TF flag
 * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
 *
 * This approach has a few limitations:
 *     1. Stepping over a PUSHF instruction will save the TF flag
 *        along with the other flags, possibly restoring it later. That would
 *        result in another INT1 when the flags are restored, triggering
 *        a stop in gdb that could be cleared by doing another step.
 *
 *        Stepping over a POPF instruction will let it overwrite the
 *        TF flag, ending the stepping mode.
 *
 *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
 *        or anything that could result in a page fault) will save the flags
 *        to the stack, clear the TF flag, and let the guest execute the
 *        handler. Normally, the guest handler will restore the original
 *        flags, which resumes single-stepping.
 *
 *     3. Debuggers running in the guest may wish to set TF to do instruction
 *        stepping. INT1 events generated by them would be intercepted by us,
 *        as long as gdb is connected to QEMU.
 *
 * In practice this means that:
 *     1. Stepping through flags-modifying instructions may cause gdb to
 *        continue or stop in unexpected places. This is fully recoverable
 *        and will not crash the target.
 *
 *     2. Stepping over an instruction that triggers an exception will step
 *        over the exception handler, not into it.
 *
 *     3. Debugging the guest via gdb while running a debugger in the guest
 *        at the same time may lead to unexpected effects. Removing all
 *        breakpoints set via QEMU will prevent any further interference
 *        with the guest-level debuggers.
 *
 * The limitations can be addressed as shown below:
 *     1. PUSHF/POPF/LAHF/SAHF/IRET instructions can be emulated instead of
 *        stepping through them. The exact semantics of these instructions
 *        are defined in the "Combined Volume Set of Intel 64 and IA-32
 *        Architectures Software Developer's Manuals", but they involve a
 *        fair number of corner cases due to compatibility with real mode,
 *        virtual 8086 mode, and the differences between 64-bit and 32-bit
 *        modes.
 *
 *     2. We could step into the guest's exception handlers using the
 *        following sequence:
 *          a. Temporarily enable catching of all exception types via
 *             whpx_set_exception_exit_bitmap().
 *          b. Once an exception is intercepted, read the IDT/GDT and locate
 *             the original handler.
 *          c. Patch the original handler, injecting an INT3 at the beginning.
 *          d. Update the exception exit bitmap to only catch the
 *             WHvX64ExceptionTypeBreakpointTrap exception.
 *          e. Let the affected CPU run in exclusive mode.
 *          f. Restore the original handler and the exception exit bitmap.
 *        Note that handling all corner cases related to IDT/GDT is harder
 *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
 *        rough idea.
 *
 *     3. In order to properly support guest-level debugging in parallel with
 *        the QEMU-level debugging, we would need to be able to pass some INT1
 *        events to the guest. This could be done via the following methods:
 *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
 *             it seems to only work for interrupts and not software
 *             exceptions.
 *          b. Locating and patching the original handler by parsing IDT/GDT.
 *             This involves relatively complex logic outlined in the previous
 *             paragraph.
 *          c. Emulating the exception invocation (i.e. manually updating RIP,
 *             RFLAGS, and pushing the old values to stack). This is even more
 *             complicated than the previous option, since it involves
 *             checking CPL, gate attributes, and doing various adjustments
 *             depending on the current CPU mode, whether the CPL is changing,
 *             etc.
 */
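
/*
 * In code, arming and disarming the stepping described above amounts to
 * toggling the TF bit in RFLAGS (a sketch; the full logic, including the
 * interrupt shadow and swallowing the resulting INT1, lives in
 * whpx_vcpu_configure_single_stepping() below):
 *
 *     rflags |= TF_MASK;      arm: raise INT1 after the next instruction
 *     rflags &= ~TF_MASK;     disarm once the instruction has retired
 */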
typedef enum WhpxStepMode {
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;

struct AccelCPUState {
    WHV_EMULATOR_HANDLE emulator;
    bool window_registered;
    bool interruptable;
    bool ready_for_pic_interrupt;
    uint64_t tpr;
    uint64_t apic_base;
    bool interruption_pending;

    /* Must be the last field as it may have a tail */
    WHV_RUN_VP_EXIT_CONTEXT exit_ctx;
};

bool whpx_allowed;
static bool whp_dispatch_initialized;
static HMODULE hWinHvPlatform, hWinHvEmulation;
static uint32_t max_vcpu_index;
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

struct whpx_state whpx_global;
struct WHPDispatch whp_dispatch;

static bool whpx_has_xsave(void)
{
    return whpx_xsave_cap.XsaveSupport;
}

static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER hs;
    unsigned flags = qs->flags;

    hs.Base = qs->base;
    hs.Limit = qs->limit;
    hs.Selector = qs->selector;

    if (v86) {
        hs.Attributes = 0;
        hs.SegmentType = 3;
        hs.Present = 1;
        hs.DescriptorPrivilegeLevel = 3;
        hs.NonSystemSegment = 1;

    } else {
        hs.Attributes = (flags >> DESC_TYPE_SHIFT);

        if (r86) {
            /* hs.Base &= 0xfffff; */
        }
    }

    return hs;
}

static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
{
    SegmentCache qs;

    qs.base = hs->Base;
    qs.limit = hs->Limit;
    qs.selector = hs->Selector;

    qs.flags = ((uint32_t)hs->Attributes) << DESC_TYPE_SHIFT;

    return qs;
}
/* X64 Extended Control Registers */
static void whpx_set_xcrs(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    xcr0.Reg64 = cpu_env(cpu)->xcr0;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
    }
}

static int whpx_set_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {

        /*
         * Failing to suspend the partition while setting the TSC is not a
         * fatal error. It just increases the likelihood of TSC variance
         * between vCPUs, which some guest OSes can handle just fine.
         */
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    tsc_val.Reg64 = cpu_env(cpu)->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
        return -1;
    }

    return 0;
}

/*
 * The CR8 register in the CPU is mapped to the TPR register of the APIC;
 * however, they use a slightly different encoding. Specifically:
 *
 *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
 *
 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
 * and IA-32 Architectures Software Developer's Manual.
 *
 * The functions below translate the value of CR8 to TPR and vice versa.
 */
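
/*
 * For example, an APIC TPR of 0xB0 (priority class 11, sub-class 0) maps to
 * a CR8 value of 0xB (0xB0 >> 4), and converting 0xB back yields 0xB0
 * (0xB << 4). The sub-class in TPR[bits 3:0] has no CR8 equivalent and is
 * lost in the TPR-to-CR8 direction.
 */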

static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    return tpr >> 4;
}

static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    return cr8 << 4;
}

static void whpx_set_registers(CPUState *cpu, int level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Following MSRs have side effects on the guest or are too heavy for
     * runtime. Limit them to full state update.
     */
    if (level >= WHPX_SET_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    vcxt.values[idx++].Reg64 = env->eflags;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
    }

    assert(idx == WHvX64RegisterLdtr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

    assert(idx == WHvX64RegisterTr);
    vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

    assert(idx == WHvX64RegisterIdtr);
    vcxt.values[idx].Table.Base = env->idt.base;
    vcxt.values[idx].Table.Limit = env->idt.limit;
    idx += 1;

    assert(idx == WHvX64RegisterGdtr);
    vcxt.values[idx].Table.Base = env->gdt.base;
    vcxt.values[idx].Table.Limit = env->gdt.limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    vcxt.values[idx++].Reg64 = env->cr[0];
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    vcxt.values[idx++].Reg64 = env->cr[2];
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    vcxt.values[idx++].Reg64 = env->cr[3];
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    vcxt.values[idx++].Reg64 = env->cr[4];
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    vcxt.values[idx++].Reg64 = vcpu->tpr;

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers need to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_set_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
        vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
        /* vcxt.values[idx].Fp.AsUINT128.High64 =
               env->fpregs[i].mmx.MMX_Q(1);
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
    vcxt.values[idx].FpControlStatus.FpStatus =
        (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
    vcxt.values[idx].FpControlStatus.FpTag = 0;
    for (i = 0; i < 8; ++i) {
        vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
    }
    vcxt.values[idx].FpControlStatus.Reserved = 0;
    vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
    vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
    vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
    vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    vcxt.values[idx++].Reg64 = vcpu->apic_base;

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    vcxt.values[idx++].Reg64 = env->sysenter_cs;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    vcxt.values[idx++].Reg64 = env->sysenter_eip;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    vcxt.values[idx++].Reg64 = env->sysenter_esp;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    vcxt.values[idx++].Reg64 = env->lstar;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    vcxt.values[idx++].Reg64 = env->cstar;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    vcxt.values[idx++].Reg64 = env->fmask;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }
}

static int whpx_get_tsc(CPUState *cpu)
{
    WHV_REGISTER_NAME tsc_reg = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE tsc_val;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &tsc_reg, 1, &tsc_val);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
        return -1;
    }

    cpu_env(cpu)->tsc = tsc_val.Reg64;
    return 0;
}

/* X64 Extended Control Registers */
static void whpx_get_xcrs(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_VALUE xcr0;
    WHV_REGISTER_NAME xcr0_name = WHvX64RegisterXCr0;

    if (!whpx_has_xsave()) {
        return;
    }

    /* Only xcr0 is supported by the hypervisor currently */
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &xcr0_name, 1, &xcr0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
        return;
    }

    cpu_env(cpu)->xcr0 = xcr0.Reg64;
}

static void whpx_get_registers(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    }

    if (whpx_apic_in_platform()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    idx = 0;

    /* Indexes for first 16 registers match between HV and QEMU definitions */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;

    /* Translate 6+4 segment registers. HV and QEMU order matches */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    tpr = vcxt.values[idx++].Reg64;
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers need to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /* 16 XMM registers */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0; i < sizeof(env->xmm_regs) / sizeof(ZMMReg); i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_apic_in_platform()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);
}

static HRESULT CALLBACK whpx_emu_ioport_callback(
    void *ctx,
    WHV_EMULATOR_IO_ACCESS_INFO *IoAccess)
{
    MemTxAttrs attrs = { 0 };
    address_space_rw(&address_space_io, IoAccess->Port, attrs,
                     &IoAccess->Data, IoAccess->AccessSize,
                     IoAccess->Direction);
    return S_OK;
}

static HRESULT CALLBACK whpx_emu_mmio_callback(
    void *ctx,
    WHV_EMULATOR_MEMORY_ACCESS_INFO *ma)
{
    cpu_physical_memory_rw(ma->GpaAddress, ma->Data, ma->AccessSize,
                           ma->Direction);
    return S_OK;
}

static HRESULT CALLBACK whpx_emu_getreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor registers,"
                     " hr=%08lx", hr);
    }

    return hr;
}

static HRESULT CALLBACK whpx_emu_setreg_callback(
    void *ctx,
    const WHV_REGISTER_NAME *RegisterNames,
    UINT32 RegisterCount,
    const WHV_REGISTER_VALUE *RegisterValues)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        RegisterNames, RegisterCount,
        RegisterValues);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor registers,"
                     " hr=%08lx", hr);
    }

    /*
     * The emulator has just successfully written the register state. Clear
     * the dirty state so that we avoid a double write on resume of the VP.
     */
    cpu->vcpu_dirty = false;

    return hr;
}

static HRESULT CALLBACK whpx_emu_translate_callback(
    void *ctx,
    WHV_GUEST_VIRTUAL_ADDRESS Gva,
    WHV_TRANSLATE_GVA_FLAGS TranslateFlags,
    WHV_TRANSLATE_GVA_RESULT_CODE *TranslationResult,
    WHV_GUEST_PHYSICAL_ADDRESS *Gpa)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    CPUState *cpu = (CPUState *)ctx;
    WHV_TRANSLATE_GVA_RESULT res;

    hr = whp_dispatch.WHvTranslateGva(whpx->partition, cpu->cpu_index,
                                      Gva, TranslateFlags, &res, Gpa);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to translate GVA, hr=%08lx", hr);
    } else {
        *TranslationResult = res.ResultCode;
    }

    return hr;
}

static const WHV_EMULATOR_CALLBACKS whpx_emu_callbacks = {
    .Size = sizeof(WHV_EMULATOR_CALLBACKS),
    .WHvEmulatorIoPortCallback = whpx_emu_ioport_callback,
    .WHvEmulatorMemoryCallback = whpx_emu_mmio_callback,
    .WHvEmulatorGetVirtualProcessorRegisters = whpx_emu_getreg_callback,
    .WHvEmulatorSetVirtualProcessorRegisters = whpx_emu_setreg_callback,
    .WHvEmulatorTranslateGvaPage = whpx_emu_translate_callback,
};

static int whpx_handle_mmio(CPUState *cpu, WHV_MEMORY_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;

    hr = whp_dispatch.WHvEmulatorTryMmioEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse MMIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate MMIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }

    return 0;
}

static int whpx_handle_portio(CPUState *cpu,
                              WHV_X64_IO_PORT_ACCESS_CONTEXT *ctx)
{
    HRESULT hr;
    AccelCPUState *vcpu = cpu->accel;
    WHV_EMULATOR_STATUS emu_status;

    hr = whp_dispatch.WHvEmulatorTryIoEmulation(
        vcpu->emulator, cpu,
        &vcpu->exit_ctx.VpContext, ctx,
        &emu_status);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to parse PortIO access, hr=%08lx", hr);
        return -1;
    }

    if (!emu_status.EmulationSuccessful) {
        error_report("WHPX: Failed to emulate PortIO access with"
                     " EmulatorReturnStatus: %u", emu_status.AsUINT32);
        return -1;
    }

    return 0;
}

/*
 * Controls whether we should intercept various exceptions on the guest,
 * namely breakpoint/single-step events.
 *
 * The 'exceptions' argument accepts a bitmask, e.g:
 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
 */
static HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_PARTITION_PROPERTY prop = { 0, };
    HRESULT hr;

    if (exceptions == whpx->exception_exit_bitmap) {
        return S_OK;
    }

    prop.ExceptionExitBitmap = exceptions;

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExceptionExitBitmap,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (SUCCEEDED(hr)) {
        whpx->exception_exit_bitmap = exceptions;
    }

    return hr;
}


/*
 * This function is called before/after stepping over a single instruction.
 * It will update the CPU registers to arm/disarm the instruction stepping
 * accordingly.
 */
static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
    bool set,
    uint64_t *exit_context_rflags)
{
    WHV_REGISTER_NAME reg_name;
    WHV_REGISTER_VALUE reg_value;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * If we are trying to step over a single instruction, we need to set the
     * TF bit in rflags. Otherwise, clear it.
     */
    reg_name = WHvX64RegisterRflags;
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
        return hr;
    }

    if (exit_context_rflags) {
        assert(*exit_context_rflags == reg_value.Reg64);
    }

    if (set) {
        /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
        reg_value.Reg64 |= TF_MASK;
    } else {
        reg_value.Reg64 &= ~TF_MASK;
    }

    if (exit_context_rflags) {
        *exit_context_rflags = reg_value.Reg64;
    }

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set rflags, hr=%08lx", hr);
        return hr;
    }

    reg_name = WHvRegisterInterruptState;
    reg_value.Reg64 = 0;

    /* Suspend delivery of hardware interrupts during single-stepping. */
    reg_value.InterruptState.InterruptShadow = set != 0;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set InterruptState, hr=%08lx", hr);
        return hr;
    }

    if (!set) {
        /*
         * We have just finished stepping over a single instruction,
         * and intercepted the INT1 generated by it.
         * We need to now hide the INT1 from the guest,
         * as it would not be expecting it.
         */

        reg_name = WHvX64RegisterPendingDebugException;
        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            error_report("WHPX: Failed to get pending debug exceptions,"
                         " hr=%08lx", hr);
            return hr;
        }

        if (reg_value.PendingDebugException.SingleStep) {
            reg_value.PendingDebugException.SingleStep = 0;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                &reg_name,
                1,
                &reg_value);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to clear pending debug exceptions,"
                             " hr=%08lx", hr);
                return hr;
            }
        }

    }

    return S_OK;
}

/* Tries to find a breakpoint at the specified address. */
static struct whpx_breakpoint *whpx_lookup_breakpoint_by_addr(uint64_t address)
{
    struct whpx_state *whpx = &whpx_global;
    int i;

    if (whpx->breakpoints.breakpoints) {
        for (i = 0; i < whpx->breakpoints.breakpoints->used; i++) {
            if (address == whpx->breakpoints.breakpoints->data[i].address) {
                return &whpx->breakpoints.breakpoints->data[i];
            }
        }
    }

    return NULL;
}

/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 */
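/* 0xF1 is the one-byte ICEBP ("INT1") opcode, which raises a debug trap. */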
static const uint8_t whpx_breakpoint_instruction = 0xF1;

/*
 * The WHPX QEMU backend implements breakpoints by writing the INT1
 * instruction into memory (ignoring the DRx registers). This raises a few
 * issues that need to be carefully handled:
 *
 *     1. Although unlikely, other parts of QEMU may set multiple breakpoints
 *        at the same location, and later remove them in arbitrary order.
 *        This should not cause memory corruption, and should only remove the
 *        physical breakpoint instruction when the last QEMU breakpoint is
 *        gone.
 *
 *     2. Writing arbitrary virtual memory may fail if it's not mapped to a
 *        valid physical location. Hence, physically adding/removing a
 *        breakpoint can theoretically fail at any time. We need to keep
 *        track of it.
 *
 * The function below rebuilds a list of low-level breakpoints (one per
 * address, tracking the original instruction and any errors) from the list
 * of high-level breakpoints (set via cpu_breakpoint_insert()).
 *
 * In order to optimize performance, this function stores the list of
 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
 * low-level ones, so that it won't be re-invoked until these breakpoints
 * change.
 *
 * Note that this function only decides which breakpoints should be inserted
 * into memory; it doesn't actually insert them. The memory accesses are done
 * in whpx_apply_breakpoints().
 */
static void whpx_translate_cpu_breakpoints(
    struct whpx_breakpoints *breakpoints,
    CPUState *cpu,
    int cpu_breakpoint_count)
{
    CPUBreakpoint *bp;
    int cpu_bp_index = 0;

    breakpoints->original_addresses =
        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);

    breakpoints->original_address_count = cpu_breakpoint_count;

    int max_breakpoints = cpu_breakpoint_count +
        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);

    struct whpx_breakpoint_collection *new_breakpoints =
        g_malloc0(sizeof(struct whpx_breakpoint_collection)
                  + max_breakpoints * sizeof(struct whpx_breakpoint));

    new_breakpoints->allocated = max_breakpoints;
    new_breakpoints->used = 0;

    /*
     * 1. Preserve all old breakpoints that could not be automatically
     * cleared when the CPU got stopped.
     */
    if (breakpoints->breakpoints) {
        int i;
        for (i = 0; i < breakpoints->breakpoints->used; i++) {
            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
                new_breakpoints->data[new_breakpoints->used++] =
                    breakpoints->breakpoints->data[i];
            }
        }
    }

    /* 2. Map all CPU breakpoints to WHPX breakpoints */
    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        int i;
        bool found = false;

        /* This will be used to detect changed CPU breakpoints later. */
        breakpoints->original_addresses[cpu_bp_index++] = bp->pc;

        for (i = 0; i < new_breakpoints->used; i++) {
            /*
             * WARNING: This loop has O(N^2) complexity, where N is the
             * number of breakpoints. It should not be a bottleneck in
             * real-world scenarios, since it only needs to run once after
             * the breakpoints have been modified.
             * If this ever becomes a concern, it can be optimized by storing
             * high-level breakpoint objects in a tree or hash map.
             */

            if (new_breakpoints->data[i].address == bp->pc) {
                /* There was already a breakpoint at this address. */
                if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
                    new_breakpoints->data[i].state = WHPX_BP_SET;
                } else if (new_breakpoints->data[i].state == WHPX_BP_SET) {
                    new_breakpoints->data[i].state = WHPX_BP_SET_PENDING;
                }

                found = true;
                break;
            }
        }

        if (!found && new_breakpoints->used < new_breakpoints->allocated) {
            /* No WHPX breakpoint at this address. Create one. */
            new_breakpoints->data[new_breakpoints->used].address = bp->pc;
            new_breakpoints->data[new_breakpoints->used].state =
                WHPX_BP_SET_PENDING;
            new_breakpoints->used++;
        }
    }

    /*
     * Free the previous breakpoint list. This can be optimized by keeping
     * it as shadow buffer for the next computation instead of freeing
     * it immediately.
     */
    g_free(breakpoints->breakpoints);

    breakpoints->breakpoints = new_breakpoints;
}

/*
 * Physically inserts/removes the breakpoints by reading and writing the
 * physical memory, keeping track of the failed attempts.
 *
 * Passing resuming=true will try to set all previously unset breakpoints.
 * Passing resuming=false will remove all inserted ones.
 */
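/*
 * Summary of the state transitions performed below, keyed on the 'resuming'
 * flag and on whether the guest memory access succeeds:
 *
 *     resuming == true:
 *         WHPX_BP_CLEARED       -> WHPX_BP_SET_PENDING
 *         WHPX_BP_CLEAR_PENDING -> WHPX_BP_SET
 *         WHPX_BP_SET_PENDING   -> WHPX_BP_SET, once the INT1 instruction
 *                                  has been successfully written
 *
 *     resuming == false:
 *         WHPX_BP_SET_PENDING   -> WHPX_BP_CLEARED
 *         WHPX_BP_SET           -> WHPX_BP_CLEAR_PENDING, and further to
 *                                  WHPX_BP_CLEARED once the original
 *                                  instruction has been successfully restored
 */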
static void whpx_apply_breakpoints(
    struct whpx_breakpoint_collection *breakpoints,
    CPUState *cpu,
    bool resuming)
{
    int i, rc;
    if (!breakpoints) {
        return;
    }

    for (i = 0; i < breakpoints->used; i++) {
        /* Decide what to do right now based on the last known state. */
        WhpxBreakpointState state = breakpoints->data[i].state;
        switch (state) {
        case WHPX_BP_CLEARED:
            if (resuming) {
                state = WHPX_BP_SET_PENDING;
            }
            break;
        case WHPX_BP_SET_PENDING:
            if (!resuming) {
                state = WHPX_BP_CLEARED;
            }
            break;
        case WHPX_BP_SET:
            if (!resuming) {
                state = WHPX_BP_CLEAR_PENDING;
            }
            break;
        case WHPX_BP_CLEAR_PENDING:
            if (resuming) {
                state = WHPX_BP_SET;
            }
            break;
        }

        if (state == WHPX_BP_SET_PENDING) {
            /* Remember the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                false);

            if (!rc) {
                /* Write the breakpoint instruction. */
                rc = cpu_memory_rw_debug(cpu,
                    breakpoints->data[i].address,
                    (void *)&whpx_breakpoint_instruction,
                    1,
                    true);
            }

            if (!rc) {
                state = WHPX_BP_SET;
            }

        }

        if (state == WHPX_BP_CLEAR_PENDING) {
            /* Restore the original instruction. */
            rc = cpu_memory_rw_debug(cpu,
                breakpoints->data[i].address,
                &breakpoints->data[i].original_instruction,
                1,
                true);

            if (!rc) {
                state = WHPX_BP_CLEARED;
            }
        }

        breakpoints->data[i].state = state;
    }
}

/*
 * This function is called when a VCPU is about to start and no other
 * VCPUs have been started so far. Since the VCPU start order could be
 * arbitrary, it doesn't have to be VCPU#0.
 *
 * It is used to commit the breakpoints into memory, and configure WHPX
 * to intercept debug exceptions.
 *
 * Note that whpx_set_exception_exit_bitmap() cannot be called if one or
 * more VCPUs are already running, so this is the best place to do it.
 */
static int whpx_first_vcpu_starting(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    HRESULT hr;

    g_assert(bql_locked());

    if (!QTAILQ_EMPTY(&cpu->breakpoints) ||
            (whpx->breakpoints.breakpoints &&
             whpx->breakpoints.breakpoints->used)) {
        CPUBreakpoint *bp;
        int i = 0;
        bool update_pending = false;

        QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
            if (i >= whpx->breakpoints.original_address_count ||
                bp->pc != whpx->breakpoints.original_addresses[i]) {
                update_pending = true;
            }

            i++;
        }

        if (i != whpx->breakpoints.original_address_count) {
            update_pending = true;
        }

        if (update_pending) {
            /*
             * The CPU breakpoints have changed since the last call to
             * whpx_translate_cpu_breakpoints(). WHPX breakpoints must
             * now be recomputed.
             */
            whpx_translate_cpu_breakpoints(&whpx->breakpoints, cpu, i);
        }

        /* Actually insert the breakpoints into the memory. */
        whpx_apply_breakpoints(whpx->breakpoints.breakpoints, cpu, true);
    }

    uint64_t exception_mask;
    if (whpx->step_pending ||
        (whpx->breakpoints.breakpoints &&
         whpx->breakpoints.breakpoints->used)) {
        /*
         * We are either attempting to single-step one or more CPUs, or
         * have one or more breakpoints enabled. Both require intercepting
         * the WHvX64ExceptionTypeDebugTrapOrFault exception.
         */
        exception_mask = 1UL << WHvX64ExceptionTypeDebugTrapOrFault;
    } else {
        /* Let the guest handle all exceptions. */
        exception_mask = 0;
    }

    hr = whpx_set_exception_exit_bitmap(exception_mask);
    if (!SUCCEEDED(hr)) {
        error_report("WHPX: Failed to update exception exit mask,"
                     " hr=%08lx.", hr);
        return 1;
    }

    return 0;
}

/*
 * This function is called when the last VCPU has finished running.
 * It is used to remove any previously set breakpoints from memory.
 */
static int whpx_last_vcpu_stopping(CPUState *cpu)
{
    whpx_apply_breakpoints(whpx_global.breakpoints.breakpoints, cpu, false);
    return 0;
}

/* Returns the address of the next instruction that is about to be executed. */
static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
{
    if (cpu->vcpu_dirty) {
        /* The CPU registers have been modified by other parts of QEMU. */
        return cpu_env(cpu)->eip;
    } else if (exit_context_valid) {
        /*
         * The CPU registers have not been modified by other parts of QEMU,
         * nor by this port calling WHvSetVirtualProcessorRegisters().
         * This is the most common case.
         */
        AccelCPUState *vcpu = cpu->accel;
        return vcpu->exit_ctx.VpContext.Rip;
    } else {
        /*
         * The CPU registers have been modified by a call to
         * WHvSetVirtualProcessorRegisters() and must be re-queried from
         * the target.
         */
        WHV_REGISTER_VALUE reg_value;
        WHV_REGISTER_NAME reg_name = WHvX64RegisterRip;
        HRESULT hr;
        struct whpx_state *whpx = &whpx_global;

        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            error_report("WHPX: Failed to get PC, hr=%08lx", hr);
            return 0;
        }

        return reg_value.Reg64;
    }
}

static int whpx_handle_halt(CPUState *cpu)
{
    int ret = 0;

    bql_lock();
    if (!((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
          (cpu_env(cpu)->eflags & IF_MASK)) &&
        !(cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
        ret = 1;
    }
    bql_unlock();

    return ret;
}

static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    bql_lock();

    /* Inject NMI */
    if (!vcpu->interruption_pending &&
        cpu->interrupt_request & (CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu->interrupt_request & CPU_INTERRUPT_NMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_NMI;
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_SMI) {
            cpu->interrupt_request &= ~CPU_INTERRUPT_SMI;
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu->interrupt_request & (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            cpu->exit_request = 1;
        }
        if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
            cpu->exit_request = 1;
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_apic_in_platform()) {
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu->interrupt_request & CPU_INTERRUPT_HARD) {
                cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               (cpu->interrupt_request & CPU_INTERRUPT_HARD)) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_HARD;
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
        }
    }

    /* Sync the TPR to the CR8 if it was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        cpu->exit_request = 1;
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /* Update the state of the interrupt delivery notification */
    if (!vcpu->window_registered &&
        cpu->interrupt_request & CPU_INTERRUPT_HARD) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    bql_unlock();
    vcpu->ready_for_pic_interrupt = false;

    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }
}

static void whpx_vcpu_post_run(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;

    env->eflags = vcpu->exit_ctx.VpContext.Rflags;

    uint64_t tpr = vcpu->exit_ctx.VpContext.Cr8;
    if (vcpu->tpr != tpr) {
        vcpu->tpr = tpr;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        bql_unlock();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;

    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
}

static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;

    if ((cpu->interrupt_request & CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_POLL;
        apic_poll_irq(x86_cpu->apic_state);
    }

    if (((cpu->interrupt_request & CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        (cpu->interrupt_request & CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_SIPI) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu->interrupt_request & CPU_INTERRUPT_TPR) {
        cpu->interrupt_request &= ~CPU_INTERRUPT_TPR;
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }
}

static int whpx_vcpu_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    struct whpx_breakpoint *stepped_over_bp = NULL;
    WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
    int ret;

    g_assert(bql_locked());

    if (whpx->running_cpus++ == 0) {
        /* Insert breakpoints into memory, update exception exit bitmap. */
        ret = whpx_first_vcpu_starting(cpu);
        if (ret != 0) {
            return ret;
        }
    }

    if (whpx->breakpoints.breakpoints &&
        whpx->breakpoints.breakpoints->used > 0) {
        uint64_t pc = whpx_vcpu_get_pc(cpu, true);
        stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
        if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
            stepped_over_bp = NULL;
        }

        if (stepped_over_bp) {
            /*
             * We are trying to run the instruction overwritten by an active
             * breakpoint. We will temporarily disable the breakpoint, suspend
             * other CPUs, and step over the instruction.
             */
            exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
        }
    }

    if (exclusive_step_mode == WHPX_STEP_NONE) {
        whpx_vcpu_process_async_events(cpu);
        if (cpu->halted && !whpx_apic_in_platform()) {
            cpu->exception_index = EXCP_HLT;
            qatomic_set(&cpu->exit_request, false);
            return 0;
        }
    }

    bql_unlock();

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        start_exclusive();
        g_assert(cpu == current_cpu);
        g_assert(!cpu->running);
        cpu->running = true;

        hr = whpx_set_exception_exit_bitmap(
            1UL << WHvX64ExceptionTypeDebugTrapOrFault);
        if (!SUCCEEDED(hr)) {
            error_report("WHPX: Failed to update exception exit mask, "
                         "hr=%08lx.", hr);
            return 1;
        }

        if (stepped_over_bp) {
            /* Temporarily disable the triggered breakpoint. */
            cpu_memory_rw_debug(cpu,
                                stepped_over_bp->address,
                                &stepped_over_bp->original_instruction,
                                1,
                                true);
        }
    } else {
        cpu_exec_start(cpu);
    }

    do {
        if (cpu->vcpu_dirty) {
            whpx_set_registers(cpu, WHPX_SET_RUNTIME_STATE);
            cpu->vcpu_dirty = false;
        }

        if (exclusive_step_mode == WHPX_STEP_NONE) {
            whpx_vcpu_pre_run(cpu);

            if (qatomic_read(&cpu->exit_request)) {
                whpx_vcpu_kick(cpu);
            }
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu, true, NULL);
        }

        hr = whp_dispatch.WHvRunVirtualProcessor(
            whpx->partition, cpu->cpu_index,
            &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to exec a virtual processor,"
                         " hr=%08lx", hr);
            ret = -1;
            break;
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu,
                false,
                &vcpu->exit_ctx.VpContext.Rflags);
        }

        whpx_vcpu_post_run(cpu);

        switch (vcpu->exit_ctx.ExitReason) {
        case WHvRunVpExitReasonMemoryAccess:
            ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx.MemoryAccess);
            break;

        case WHvRunVpExitReasonX64IoPortAccess:
            ret = whpx_handle_portio(cpu, &vcpu->exit_ctx.IoPortAccess);
            break;

        case WHvRunVpExitReasonX64InterruptWindow:
            vcpu->ready_for_pic_interrupt = 1;
            vcpu->window_registered = 0;
            ret = 0;
            break;

        case WHvRunVpExitReasonX64ApicEoi:
            assert(whpx_apic_in_platform());
            ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
            break;

        case WHvRunVpExitReasonX64Halt:
            /*
             * WARNING: as of build 19043.1526 (21H1), this exit reason is no
             * longer used.
             */
            ret = whpx_handle_halt(cpu);
            break;

        case WHvRunVpExitReasonX64ApicInitSipiTrap: {
            WHV_INTERRUPT_CONTROL ipi = {0};
            uint64_t icr = vcpu->exit_ctx.ApicInitSipi.ApicIcr;
            uint32_t delivery_mode =
                (icr & APIC_ICR_DELIV_MOD) >> APIC_ICR_DELIV_MOD_SHIFT;
            int dest_shorthand =
                (icr & APIC_ICR_DEST_SHORT) >> APIC_ICR_DEST_SHORT_SHIFT;
            bool broadcast = false;
            bool include_self = false;
            uint32_t i;

            /* We only registered for INIT and SIPI exits. */
            if ((delivery_mode != APIC_DM_INIT) &&
                (delivery_mode != APIC_DM_SIPI)) {
                error_report(
                    "WHPX: Unexpected APIC exit that is not an INIT or SIPI");
                break;
            }
1788
1789 if (delivery_mode == APIC_DM_INIT) {
1790 ipi.Type = WHvX64InterruptTypeInit;
1791 } else {
1792 ipi.Type = WHvX64InterruptTypeSipi;
1793 }
1794
1795 ipi.DestinationMode =
1796 ((icr & APIC_ICR_DEST_MOD) >> APIC_ICR_DEST_MOD_SHIFT) ?
1797 WHvX64InterruptDestinationModeLogical :
1798 WHvX64InterruptDestinationModePhysical;
1799
1800 ipi.TriggerMode =
1801 ((icr & APIC_ICR_TRIGGER_MOD) >> APIC_ICR_TRIGGER_MOD_SHIFT) ?
1802 WHvX64InterruptTriggerModeLevel :
1803 WHvX64InterruptTriggerModeEdge;
1804
1805 ipi.Vector = icr & APIC_VECTOR_MASK;
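            /*
             * The destination shorthand field (ICR bits 18-19) selects
             * between an explicit destination, self, broadcast including
             * self, and broadcast excluding self.
             */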
1806 switch (dest_shorthand) {
1807             /* No shorthand: bits 56-63 of the ICR contain the destination. */
1808 case 0:
1809 ipi.Destination = (icr >> 56) & APIC_VECTOR_MASK;
1810 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1811 &ipi, sizeof(ipi));
1812 if (FAILED(hr)) {
1813 error_report("WHPX: Failed to request interrupt hr=%08lx",
1814 hr);
1815 }
1816
1817 break;
1818
1819 /* self */
1820 case 1:
1821 include_self = true;
1822 break;
1823
1824 /* broadcast, including self */
1825 case 2:
1826 broadcast = true;
1827 include_self = true;
1828 break;
1829
1830 /* broadcast, excluding self */
1831 case 3:
1832 broadcast = true;
1833 break;
1834 }
1835
1836 if (!broadcast && !include_self) {
1837 break;
1838 }
1839
1840 for (i = 0; i <= max_vcpu_index; i++) {
1841 if (i == cpu->cpu_index && !include_self) {
1842 continue;
1843 }
1844
1845 /*
1846 * Assuming that APIC Ids are identity mapped since
1847 * WHvX64RegisterApicId & WHvX64RegisterInitialApicId registers
1848 * are not handled yet and the hypervisor doesn't allow the
1849 * guest to modify the APIC ID.
1850 */
1851 ipi.Destination = i;
1852 hr = whp_dispatch.WHvRequestInterrupt(whpx->partition,
1853 &ipi, sizeof(ipi));
1854 if (FAILED(hr)) {
1855 error_report(
1856 "WHPX: Failed to request SIPI for %d, hr=%08lx",
1857 i, hr);
1858 }
1859 }
1860
1861 break;
1862 }
1863
1864 case WHvRunVpExitReasonCanceled:
1865 if (exclusive_step_mode != WHPX_STEP_NONE) {
1866 /*
1867 * We are trying to step over a single instruction, and
1868 * likely got a request to stop from another thread.
1869              * Delay handling the request until we are done
1870              * stepping over.
1871 */
1872 ret = 0;
1873 } else {
1874 cpu->exception_index = EXCP_INTERRUPT;
1875 ret = 1;
1876 }
1877 break;
1878 case WHvRunVpExitReasonX64MsrAccess: {
1879 WHV_REGISTER_VALUE reg_values[3] = {0};
1880 WHV_REGISTER_NAME reg_names[3];
1881 UINT32 reg_count;
1882
1883 reg_names[0] = WHvX64RegisterRip;
1884 reg_names[1] = WHvX64RegisterRax;
1885 reg_names[2] = WHvX64RegisterRdx;
1886
1887 reg_values[0].Reg64 =
1888 vcpu->exit_ctx.VpContext.Rip +
1889 vcpu->exit_ctx.VpContext.InstructionLength;
1890
1891 /*
1892              * For any unsupported MSR access we:
1893              *     ignore writes (only RIP is advanced);
1894              *     return 0 on reads (RAX and RDX are zeroed as well).
1895 */
1896 reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
1897 1 : 3;
1898
1899 hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1900 whpx->partition,
1901 cpu->cpu_index,
1902 reg_names, reg_count,
1903 reg_values);
1904
1905 if (FAILED(hr)) {
1906 error_report("WHPX: Failed to set MsrAccess state "
1907 " registers, hr=%08lx", hr);
1908 }
1909 ret = 0;
1910 break;
1911 }
1912 case WHvRunVpExitReasonX64Cpuid: {
1913 WHV_REGISTER_VALUE reg_values[5];
1914 WHV_REGISTER_NAME reg_names[5];
1915 UINT32 reg_count = 5;
1916 UINT64 cpuid_fn, rip = 0, rax = 0, rcx = 0, rdx = 0, rbx = 0;
1917 X86CPU *x86_cpu = X86_CPU(cpu);
1918 CPUX86State *env = &x86_cpu->env;
1919
1920 memset(reg_values, 0, sizeof(reg_values));
1921
1922 rip = vcpu->exit_ctx.VpContext.Rip +
1923 vcpu->exit_ctx.VpContext.InstructionLength;
1924 cpuid_fn = vcpu->exit_ctx.CpuidAccess.Rax;
1925
1926 /*
1927 * Ideally, these should be supplied to the hypervisor during VCPU
1928 * initialization and it should be able to satisfy this request.
1929 * But, currently, WHPX doesn't support setting CPUID values in the
1930              * hypervisor once the partition has been set up, which is too late
1931 * since VCPUs are realized later. For now, use the values from
1932 * QEMU to satisfy these requests, until WHPX adds support for
1933 * being able to set these values in the hypervisor at runtime.
1934 */
1935 cpu_x86_cpuid(env, cpuid_fn, 0, (UINT32 *)&rax, (UINT32 *)&rbx,
1936 (UINT32 *)&rcx, (UINT32 *)&rdx);
1937 switch (cpuid_fn) {
1938 case 0x40000000:
1939 /* Expose the vmware cpu frequency cpuid leaf */
1940 rax = 0x40000010;
1941 rbx = rcx = rdx = 0;
1942 break;
1943
1944 case 0x40000010:
1945 rax = env->tsc_khz;
1946 rbx = env->apic_bus_freq / 1000; /* Hz to KHz */
1947 rcx = rdx = 0;
1948 break;
1949
1950 case 0x80000001:
1951 /* Remove any support of OSVW */
1952 rcx &= ~CPUID_EXT3_OSVW;
1953 break;
1954 }
1955
1956 reg_names[0] = WHvX64RegisterRip;
1957 reg_names[1] = WHvX64RegisterRax;
1958 reg_names[2] = WHvX64RegisterRcx;
1959 reg_names[3] = WHvX64RegisterRdx;
1960 reg_names[4] = WHvX64RegisterRbx;
1961
1962 reg_values[0].Reg64 = rip;
1963 reg_values[1].Reg64 = rax;
1964 reg_values[2].Reg64 = rcx;
1965 reg_values[3].Reg64 = rdx;
1966 reg_values[4].Reg64 = rbx;
1967
1968 hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
1969 whpx->partition, cpu->cpu_index,
1970 reg_names,
1971 reg_count,
1972 reg_values);
1973
1974 if (FAILED(hr)) {
1975 error_report("WHPX: Failed to set CpuidAccess state registers,"
1976 " hr=%08lx", hr);
1977 }
1978 ret = 0;
1979 break;
1980 }
1981 case WHvRunVpExitReasonException:
1982 whpx_get_registers(cpu);
1983
1984 if ((vcpu->exit_ctx.VpException.ExceptionType ==
1985 WHvX64ExceptionTypeDebugTrapOrFault) &&
1986 (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
1987 (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
1988 whpx_breakpoint_instruction)) {
1989 /* Stopped at a software breakpoint. */
1990 cpu->exception_index = EXCP_DEBUG;
1991 } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
1992 WHvX64ExceptionTypeDebugTrapOrFault) &&
1993 !cpu->singlestep_enabled) {
1994 /*
1995                  * Just finished stepping over a breakpoint, but gdb
1996                  * does not expect us to do single-stepping.
1997 * Don't do anything special.
1998 */
1999 cpu->exception_index = EXCP_INTERRUPT;
2000 } else {
2001 /* Another exception or debug event. Report it to GDB. */
2002 cpu->exception_index = EXCP_DEBUG;
2003 }
2004
2005 ret = 1;
2006 break;
2007 case WHvRunVpExitReasonNone:
2008 case WHvRunVpExitReasonUnrecoverableException:
2009 case WHvRunVpExitReasonInvalidVpRegisterValue:
2010 case WHvRunVpExitReasonUnsupportedFeature:
2011 default:
2012 error_report("WHPX: Unexpected VP exit code %d",
2013 vcpu->exit_ctx.ExitReason);
2014 whpx_get_registers(cpu);
2015 bql_lock();
2016 qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2017 bql_unlock();
2018 break;
2019 }
2020
2021 } while (!ret);
2022
2023 if (stepped_over_bp) {
2024 /* Restore the breakpoint we stepped over */
2025 cpu_memory_rw_debug(cpu,
2026 stepped_over_bp->address,
2027 (void *)&whpx_breakpoint_instruction,
2028 1,
2029 true);
2030 }
2031
2032 if (exclusive_step_mode != WHPX_STEP_NONE) {
2033 g_assert(cpu_in_exclusive_context(cpu));
2034 cpu->running = false;
2035 end_exclusive();
2036
2037 exclusive_step_mode = WHPX_STEP_NONE;
2038 } else {
2039 cpu_exec_end(cpu);
2040 }
2041
2042 bql_lock();
2043 current_cpu = cpu;
2044
2045 if (--whpx->running_cpus == 0) {
2046 whpx_last_vcpu_stopping(cpu);
2047 }
2048
2049 qatomic_set(&cpu->exit_request, false);
2050
2051 return ret < 0;
2052 }
2053
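/*
 * The vcpu_dirty flag tracks whether QEMU's copy of the register state is
 * newer than the hypervisor's: registers are fetched lazily on the first
 * synchronize request and pushed back to the hypervisor before the vCPU
 * runs again.
 */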
2054 static void do_whpx_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2055 {
2056 if (!cpu->vcpu_dirty) {
2057 whpx_get_registers(cpu);
2058 cpu->vcpu_dirty = true;
2059 }
2060 }
2061
2062 static void do_whpx_cpu_synchronize_post_reset(CPUState *cpu,
2063 run_on_cpu_data arg)
2064 {
2065 whpx_set_registers(cpu, WHPX_SET_RESET_STATE);
2066 cpu->vcpu_dirty = false;
2067 }
2068
2069 static void do_whpx_cpu_synchronize_post_init(CPUState *cpu,
2070 run_on_cpu_data arg)
2071 {
2072 whpx_set_registers(cpu, WHPX_SET_FULL_STATE);
2073 cpu->vcpu_dirty = false;
2074 }
2075
2076 static void do_whpx_cpu_synchronize_pre_loadvm(CPUState *cpu,
2077 run_on_cpu_data arg)
2078 {
2079 cpu->vcpu_dirty = true;
2080 }
2081
2082 /*
2083 * CPU support.
2084 */
2085
2086 void whpx_cpu_synchronize_state(CPUState *cpu)
2087 {
2088 if (!cpu->vcpu_dirty) {
2089 run_on_cpu(cpu, do_whpx_cpu_synchronize_state, RUN_ON_CPU_NULL);
2090 }
2091 }
2092
2093 void whpx_cpu_synchronize_post_reset(CPUState *cpu)
2094 {
2095 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2096 }
2097
2098 void whpx_cpu_synchronize_post_init(CPUState *cpu)
2099 {
2100 run_on_cpu(cpu, do_whpx_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2101 }
2102
2103 void whpx_cpu_synchronize_pre_loadvm(CPUState *cpu)
2104 {
2105 run_on_cpu(cpu, do_whpx_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2106 }
2107
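/*
 * Record whether gdb expects a single-step on resume; the vCPU run loop
 * consults this flag when deciding how breakpoints may be stepped over.
 */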
2108 void whpx_cpu_synchronize_pre_resume(bool step_pending)
2109 {
2110 whpx_global.step_pending = step_pending;
2111 }
2112
2113 /*
2114 * Vcpu support.
2115 */
2116
2117 static Error *whpx_migration_blocker;
2118
2119 static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
2120 {
2121 CPUX86State *env = opaque;
2122
2123 if (running) {
2124 env->tsc_valid = false;
2125 }
2126 }
2127
2128 int whpx_init_vcpu(CPUState *cpu)
2129 {
2130 HRESULT hr;
2131 struct whpx_state *whpx = &whpx_global;
2132 AccelCPUState *vcpu = NULL;
2133 Error *local_error = NULL;
2134 X86CPU *x86_cpu = X86_CPU(cpu);
2135 CPUX86State *env = &x86_cpu->env;
2136 UINT64 freq = 0;
2137 int ret;
2138
2139 /* Add migration blockers for all unsupported features of the
2140      * Windows Hypervisor Platform.
2141 */
2142 if (whpx_migration_blocker == NULL) {
2143 error_setg(&whpx_migration_blocker,
2144                    "State blocked due to non-migratable CPUID feature support, "
2145 "dirty memory tracking support, and XSAVE/XRSTOR support");
2146
2147 if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
2148 error_report_err(local_error);
2149 ret = -EINVAL;
2150 goto error;
2151 }
2152 }
2153
2154 vcpu = g_new0(AccelCPUState, 1);
2155
2156 hr = whp_dispatch.WHvEmulatorCreateEmulator(
2157 &whpx_emu_callbacks,
2158 &vcpu->emulator);
2159 if (FAILED(hr)) {
2160         error_report("WHPX: Failed to set up instruction completion support,"
2161 " hr=%08lx", hr);
2162 ret = -EINVAL;
2163 goto error;
2164 }
2165
2166 hr = whp_dispatch.WHvCreateVirtualProcessor(
2167 whpx->partition, cpu->cpu_index, 0);
2168 if (FAILED(hr)) {
2169 error_report("WHPX: Failed to create a virtual processor,"
2170 " hr=%08lx", hr);
2171 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2172 ret = -EINVAL;
2173 goto error;
2174 }
2175
2176 /*
2177      * The vCPU's TSC frequency is either specified by the user, or taken
2178      * from Hyper-V if the former is not present. In the latter case, we
2179      * query it from Hyper-V and record it in env->tsc_khz, so that the
2180      * vCPU's TSC frequency can be migrated later via this field.
2181 */
2182 if (!env->tsc_khz) {
2183 hr = whp_dispatch.WHvGetCapability(
2184 WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
2185 NULL);
2186 if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2187 if (FAILED(hr)) {
2188 printf("WHPX: Failed to query tsc frequency, hr=0x%08lx\n", hr);
2189 } else {
2190 env->tsc_khz = freq / 1000; /* Hz to KHz */
2191 }
2192 }
2193 }
2194
2195 env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
2196 hr = whp_dispatch.WHvGetCapability(
2197 WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
2198 if (hr != WHV_E_UNKNOWN_CAPABILITY) {
2199 if (FAILED(hr)) {
2200 printf("WHPX: Failed to query apic bus frequency hr=0x%08lx\n", hr);
2201 } else {
2202 env->apic_bus_freq = freq;
2203 }
2204 }
2205
2206 /*
2207 * If the vmware cpuid frequency leaf option is set, and we have a valid
2208      * TSC value, trap the corresponding CPUID leaves.
2209 */
2210 if (x86_cpu->vmware_cpuid_freq && env->tsc_khz) {
2211 UINT32 cpuidExitList[] = {1, 0x80000001, 0x40000000, 0x40000010};
2212
2213 hr = whp_dispatch.WHvSetPartitionProperty(
2214 whpx->partition,
2215 WHvPartitionPropertyCodeCpuidExitList,
2216 cpuidExitList,
2217 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2218
2219 if (FAILED(hr)) {
2220 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2221 hr);
2222 ret = -EINVAL;
2223 goto error;
2224 }
2225 }
2226
2227 vcpu->interruptable = true;
2228 cpu->vcpu_dirty = true;
2229 cpu->accel = vcpu;
2230 max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
2231 qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);
2232
2233 return 0;
2234
2235 error:
2236 g_free(vcpu);
2237
2238 return ret;
2239 }
2240
2241 int whpx_vcpu_exec(CPUState *cpu)
2242 {
2243 int ret;
2244 int fatal;
2245
2246 for (;;) {
2247 if (cpu->exception_index >= EXCP_INTERRUPT) {
2248 ret = cpu->exception_index;
2249 cpu->exception_index = -1;
2250 break;
2251 }
2252
2253 fatal = whpx_vcpu_run(cpu);
2254
2255 if (fatal) {
2256 error_report("WHPX: Failed to exec a virtual processor");
2257 abort();
2258 }
2259 }
2260
2261 return ret;
2262 }
2263
2264 void whpx_destroy_vcpu(CPUState *cpu)
2265 {
2266 struct whpx_state *whpx = &whpx_global;
2267 AccelCPUState *vcpu = cpu->accel;
2268
2269 whp_dispatch.WHvDeleteVirtualProcessor(whpx->partition, cpu->cpu_index);
2270 whp_dispatch.WHvEmulatorDestroyEmulator(vcpu->emulator);
2271 g_free(cpu->accel);
2272 }
2273
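/*
 * WHvCancelRunVirtualProcessor() forces a concurrent
 * WHvRunVirtualProcessor() call on the same vCPU to return early with
 * WHvRunVpExitReasonCanceled, which is how a kick is delivered.
 */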
2274 void whpx_vcpu_kick(CPUState *cpu)
2275 {
2276 struct whpx_state *whpx = &whpx_global;
2277 whp_dispatch.WHvCancelRunVirtualProcessor(
2278 whpx->partition, cpu->cpu_index, 0);
2279 }
2280
2281 /*
2282 * Memory support.
2283 */
2284
2285 static void whpx_update_mapping(hwaddr start_pa, ram_addr_t size,
2286 void *host_va, int add, int rom,
2287 const char *name)
2288 {
2289 struct whpx_state *whpx = &whpx_global;
2290 HRESULT hr;
2291
2292 /*
2293 if (add) {
2294 printf("WHPX: ADD PA:%p Size:%p, Host:%p, %s, '%s'\n",
2295 (void*)start_pa, (void*)size, host_va,
2296 (rom ? "ROM" : "RAM"), name);
2297 } else {
2298 printf("WHPX: DEL PA:%p Size:%p, Host:%p, '%s'\n",
2299 (void*)start_pa, (void*)size, host_va, name);
2300 }
2301 */
2302
2303 if (add) {
2304 hr = whp_dispatch.WHvMapGpaRange(whpx->partition,
2305 host_va,
2306 start_pa,
2307 size,
2308 (WHvMapGpaRangeFlagRead |
2309 WHvMapGpaRangeFlagExecute |
2310 (rom ? 0 : WHvMapGpaRangeFlagWrite)));
2311 } else {
2312 hr = whp_dispatch.WHvUnmapGpaRange(whpx->partition,
2313 start_pa,
2314 size);
2315 }
2316
2317 if (FAILED(hr)) {
2318 error_report("WHPX: Failed to %s GPA range '%s' PA:%p, Size:%p bytes,"
2319 " Host:%p, hr=%08lx",
2320 (add ? "MAP" : "UNMAP"), name,
2321 (void *)(uintptr_t)start_pa, (void *)size, host_va, hr);
2322 }
2323 }
2324
2325 static void whpx_process_section(MemoryRegionSection *section, int add)
2326 {
2327 MemoryRegion *mr = section->mr;
2328 hwaddr start_pa = section->offset_within_address_space;
2329 ram_addr_t size = int128_get64(section->size);
2330 unsigned int delta;
2331 uint64_t host_va;
2332
2333 if (!memory_region_is_ram(mr)) {
2334 return;
2335 }
2336
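    /*
     * GPA mappings are established at host-page granularity, so trim the
     * section to the largest host-page-aligned sub-range, and skip it
     * entirely if nothing page-aligned remains.
     */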
2337 delta = qemu_real_host_page_size() - (start_pa & ~qemu_real_host_page_mask());
2338 delta &= ~qemu_real_host_page_mask();
2339 if (delta > size) {
2340 return;
2341 }
2342 start_pa += delta;
2343 size -= delta;
2344 size &= qemu_real_host_page_mask();
2345 if (!size || (start_pa & ~qemu_real_host_page_mask())) {
2346 return;
2347 }
2348
2349 host_va = (uintptr_t)memory_region_get_ram_ptr(mr)
2350 + section->offset_within_region + delta;
2351
2352 whpx_update_mapping(start_pa, size, (void *)(uintptr_t)host_va, add,
2353 memory_region_is_rom(mr), mr->name);
2354 }
2355
2356 static void whpx_region_add(MemoryListener *listener,
2357 MemoryRegionSection *section)
2358 {
2359 memory_region_ref(section->mr);
2360 whpx_process_section(section, 1);
2361 }
2362
2363 static void whpx_region_del(MemoryListener *listener,
2364 MemoryRegionSection *section)
2365 {
2366 whpx_process_section(section, 0);
2367 memory_region_unref(section->mr);
2368 }
2369
2370 static void whpx_transaction_begin(MemoryListener *listener)
2371 {
2372 }
2373
2374 static void whpx_transaction_commit(MemoryListener *listener)
2375 {
2376 }
2377
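/*
 * WHPX does not expose dirty-page tracking (see the migration blocker in
 * whpx_init_vcpu()), so conservatively mark the whole RAM section dirty
 * on every log sync.
 */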
2378 static void whpx_log_sync(MemoryListener *listener,
2379 MemoryRegionSection *section)
2380 {
2381 MemoryRegion *mr = section->mr;
2382
2383 if (!memory_region_is_ram(mr)) {
2384 return;
2385 }
2386
2387 memory_region_set_dirty(mr, 0, int128_get64(section->size));
2388 }
2389
2390 static MemoryListener whpx_memory_listener = {
2391 .name = "whpx",
2392 .begin = whpx_transaction_begin,
2393 .commit = whpx_transaction_commit,
2394 .region_add = whpx_region_add,
2395 .region_del = whpx_region_del,
2396 .log_sync = whpx_log_sync,
2397 .priority = MEMORY_LISTENER_PRIORITY_ACCEL,
2398 };
2399
2400 static void whpx_memory_init(void)
2401 {
2402 memory_listener_register(&whpx_memory_listener, &address_space_memory);
2403 }
2404
2405 /*
2406  * Load the functions from the given library. If a handle is provided, it
2407  * is used; otherwise the library is opened and the handle is updated on
2408  * return with the newly opened one.
2409 */
2410 static bool load_whp_dispatch_fns(HMODULE *handle,
2411 WHPFunctionList function_list)
2412 {
2413 HMODULE hLib = *handle;
2414
2415 #define WINHV_PLATFORM_DLL "WinHvPlatform.dll"
2416 #define WINHV_EMULATION_DLL "WinHvEmulation.dll"
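/*
 * WHP_LOAD_FIELD_OPTIONAL leaves the dispatch pointer NULL when a symbol
 * is missing (e.g. on older Windows builds), whereas WHP_LOAD_FIELD treats
 * a missing symbol as a fatal load error.
 */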
2417 #define WHP_LOAD_FIELD_OPTIONAL(return_type, function_name, signature) \
2418 whp_dispatch.function_name = \
2419 (function_name ## _t)GetProcAddress(hLib, #function_name); \
2420
2421 #define WHP_LOAD_FIELD(return_type, function_name, signature) \
2422 whp_dispatch.function_name = \
2423 (function_name ## _t)GetProcAddress(hLib, #function_name); \
2424 if (!whp_dispatch.function_name) { \
2425 error_report("Could not load function %s", #function_name); \
2426 goto error; \
2427 } \
2428
2429 #define WHP_LOAD_LIB(lib_name, handle_lib) \
2430 if (!handle_lib) { \
2431 handle_lib = LoadLibrary(lib_name); \
2432 if (!handle_lib) { \
2433 error_report("Could not load library %s.", lib_name); \
2434 goto error; \
2435 } \
2436 } \
2437
2438 switch (function_list) {
2439 case WINHV_PLATFORM_FNS_DEFAULT:
2440 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2441 LIST_WINHVPLATFORM_FUNCTIONS(WHP_LOAD_FIELD)
2442 break;
2443
2444 case WINHV_EMULATION_FNS_DEFAULT:
2445 WHP_LOAD_LIB(WINHV_EMULATION_DLL, hLib)
2446 LIST_WINHVEMULATION_FUNCTIONS(WHP_LOAD_FIELD)
2447 break;
2448
2449 case WINHV_PLATFORM_FNS_SUPPLEMENTAL:
2450 WHP_LOAD_LIB(WINHV_PLATFORM_DLL, hLib)
2451 LIST_WINHVPLATFORM_FUNCTIONS_SUPPLEMENTAL(WHP_LOAD_FIELD_OPTIONAL)
2452 break;
2453 }
2454
2455 *handle = hLib;
2456 return true;
2457
2458 error:
2459 if (hLib) {
2460 FreeLibrary(hLib);
2461 }
2462
2463 return false;
2464 }
2465
2466 static void whpx_set_kernel_irqchip(Object *obj, Visitor *v,
2467 const char *name, void *opaque,
2468 Error **errp)
2469 {
2470 struct whpx_state *whpx = &whpx_global;
2471 OnOffSplit mode;
2472
2473 if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
2474 return;
2475 }
2476
2477 switch (mode) {
2478 case ON_OFF_SPLIT_ON:
2479 whpx->kernel_irqchip_allowed = true;
2480 whpx->kernel_irqchip_required = true;
2481 break;
2482
2483 case ON_OFF_SPLIT_OFF:
2484 whpx->kernel_irqchip_allowed = false;
2485 whpx->kernel_irqchip_required = false;
2486 break;
2487
2488 case ON_OFF_SPLIT_SPLIT:
2489 error_setg(errp, "WHPX: split irqchip currently not supported");
2490 error_append_hint(errp,
2491 "Try without kernel-irqchip or with kernel-irqchip=on|off");
2492 break;
2493
2494 default:
2495 /*
2496 * The value was checked in visit_type_OnOffSplit() above. If
2497 * we get here, then something is wrong in QEMU.
2498 */
2499 abort();
2500 }
2501 }
2502
2503 /*
2504 * Partition support
2505 */
2506
2507 static int whpx_accel_init(AccelState *as, MachineState *ms)
2508 {
2509 struct whpx_state *whpx;
2510 int ret;
2511 HRESULT hr;
2512 WHV_CAPABILITY whpx_cap;
2513 UINT32 whpx_cap_size;
2514 WHV_PARTITION_PROPERTY prop;
2515 UINT32 cpuidExitList[] = {1, 0x80000001};
2516 WHV_CAPABILITY_FEATURES features = {0};
2517
2518 whpx = &whpx_global;
2519
2520 if (!init_whp_dispatch()) {
2521 ret = -ENOSYS;
2522 goto error;
2523 }
2524
2525 whpx->mem_quota = ms->ram_size;
2526
2527 hr = whp_dispatch.WHvGetCapability(
2528 WHvCapabilityCodeHypervisorPresent, &whpx_cap,
2529 sizeof(whpx_cap), &whpx_cap_size);
2530 if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
2531 error_report("WHPX: No accelerator found, hr=%08lx", hr);
2532 ret = -ENOSPC;
2533 goto error;
2534 }
2535
2536 hr = whp_dispatch.WHvGetCapability(
2537 WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
2538 if (FAILED(hr)) {
2539 error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
2540 ret = -EINVAL;
2541 goto error;
2542 }
2543
2544 hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
2545 if (FAILED(hr)) {
2546 error_report("WHPX: Failed to create partition, hr=%08lx", hr);
2547 ret = -EINVAL;
2548 goto error;
2549 }
2550
2551 /*
2552 * Query the XSAVE capability of the partition. Any error here is not
2553 * considered fatal.
2554 */
2555 hr = whp_dispatch.WHvGetPartitionProperty(
2556 whpx->partition,
2557 WHvPartitionPropertyCodeProcessorXsaveFeatures,
2558 &whpx_xsave_cap,
2559 sizeof(whpx_xsave_cap),
2560 &whpx_cap_size);
2561
2562 /*
2563      * Windows versions that don't support this property fail with the
2564      * specific error code WHV_E_UNKNOWN_PROPERTY, tolerated below.
2565 */
2566 if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
2567 error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
2568 }
2569
2570 if (!whpx_has_xsave()) {
2571 printf("WHPX: Partition is not XSAVE capable\n");
2572 }
2573
2574 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2575 prop.ProcessorCount = ms->smp.cpus;
2576 hr = whp_dispatch.WHvSetPartitionProperty(
2577 whpx->partition,
2578 WHvPartitionPropertyCodeProcessorCount,
2579 &prop,
2580 sizeof(WHV_PARTITION_PROPERTY));
2581
2582 if (FAILED(hr)) {
2583 error_report("WHPX: Failed to set partition processor count to %u,"
2584 " hr=%08lx", prop.ProcessorCount, hr);
2585 ret = -EINVAL;
2586 goto error;
2587 }
2588
2589 /*
2590      * Error out if WHP doesn't support APIC emulation and the user
2591      * requires it.
2592 */
2593 if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
2594 !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
2595 error_report("WHPX: kernel irqchip requested, but unavailable. "
2596 "Try without kernel-irqchip or with kernel-irqchip=off");
2597 ret = -EINVAL;
2598 goto error;
2599 }
2600
2601 if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
2602 whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
2603 WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
2604 WHvX64LocalApicEmulationModeXApic;
2605 printf("WHPX: setting APIC emulation mode in the hypervisor\n");
2606 hr = whp_dispatch.WHvSetPartitionProperty(
2607 whpx->partition,
2608 WHvPartitionPropertyCodeLocalApicEmulationMode,
2609 &mode,
2610 sizeof(mode));
2611 if (FAILED(hr)) {
2612 error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
2613 if (whpx->kernel_irqchip_required) {
2614 error_report("WHPX: kernel irqchip requested, but unavailable");
2615 ret = -EINVAL;
2616 goto error;
2617 }
2618 } else {
2619 whpx->apic_in_platform = true;
2620 }
2621 }
2622
2623 /* Register for MSR and CPUID exits */
2624 memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
2625 prop.ExtendedVmExits.X64MsrExit = 1;
2626 prop.ExtendedVmExits.X64CpuidExit = 1;
2627 prop.ExtendedVmExits.ExceptionExit = 1;
2628 if (whpx_apic_in_platform()) {
2629 prop.ExtendedVmExits.X64ApicInitSipiExitTrap = 1;
2630 }
2631
2632 hr = whp_dispatch.WHvSetPartitionProperty(
2633 whpx->partition,
2634 WHvPartitionPropertyCodeExtendedVmExits,
2635 &prop,
2636 sizeof(WHV_PARTITION_PROPERTY));
2637 if (FAILED(hr)) {
2638         error_report("WHPX: Failed to enable MSR & CPUID exit, hr=%08lx", hr);
2639 ret = -EINVAL;
2640 goto error;
2641 }
2642
2643 hr = whp_dispatch.WHvSetPartitionProperty(
2644 whpx->partition,
2645 WHvPartitionPropertyCodeCpuidExitList,
2646 cpuidExitList,
2647 RTL_NUMBER_OF(cpuidExitList) * sizeof(UINT32));
2648
2649 if (FAILED(hr)) {
2650 error_report("WHPX: Failed to set partition CpuidExitList hr=%08lx",
2651 hr);
2652 ret = -EINVAL;
2653 goto error;
2654 }
2655
2656 /*
2657 * We do not want to intercept any exceptions from the guest,
2658 * until we actually start debugging with gdb.
2659 */
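    /*
     * Seed the cached bitmap with an impossible value so the call below is
     * not short-circuited as a no-op (whpx_set_exception_exit_bitmap()
     * appears to skip the update when the value is unchanged).
     */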
2660 whpx->exception_exit_bitmap = -1;
2661 hr = whpx_set_exception_exit_bitmap(0);
2662
2663 if (FAILED(hr)) {
2664 error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
2665 ret = -EINVAL;
2666 goto error;
2667 }
2668
2669 hr = whp_dispatch.WHvSetupPartition(whpx->partition);
2670 if (FAILED(hr)) {
2671         error_report("WHPX: Failed to set up partition, hr=%08lx", hr);
2672 ret = -EINVAL;
2673 goto error;
2674 }
2675
2676 whpx_memory_init();
2677
2678 printf("Windows Hypervisor Platform accelerator is operational\n");
2679 return 0;
2680
2681 error:
2682
2683 if (NULL != whpx->partition) {
2684 whp_dispatch.WHvDeletePartition(whpx->partition);
2685 whpx->partition = NULL;
2686 }
2687
2688 return ret;
2689 }
2690
2691 bool whpx_apic_in_platform(void) {
2692 return whpx_global.apic_in_platform;
2693 }
2694
2695 static void whpx_accel_class_init(ObjectClass *oc, const void *data)
2696 {
2697 AccelClass *ac = ACCEL_CLASS(oc);
2698 ac->name = "WHPX";
2699 ac->init_machine = whpx_accel_init;
2700 ac->allowed = &whpx_allowed;
2701
2702 object_class_property_add(oc, "kernel-irqchip", "on|off|split",
2703 NULL, whpx_set_kernel_irqchip,
2704 NULL, NULL);
2705 object_class_property_set_description(oc, "kernel-irqchip",
2706 "Configure WHPX in-kernel irqchip");
2707 }
2708
2709 static void whpx_accel_instance_init(Object *obj)
2710 {
2711 struct whpx_state *whpx = &whpx_global;
2712
2713 memset(whpx, 0, sizeof(struct whpx_state));
2714     /* Turn on kernel-irqchip by default. */
2715 whpx->kernel_irqchip_allowed = true;
2716 }
2717
2718 static const TypeInfo whpx_accel_type = {
2719 .name = ACCEL_CLASS_NAME("whpx"),
2720 .parent = TYPE_ACCEL,
2721 .instance_init = whpx_accel_instance_init,
2722 .class_init = whpx_accel_class_init,
2723 };
2724
2725 static void whpx_type_init(void)
2726 {
2727 type_register_static(&whpx_accel_type);
2728 }
2729
2730 bool init_whp_dispatch(void)
2731 {
2732 if (whp_dispatch_initialized) {
2733 return true;
2734 }
2735
2736 if (!load_whp_dispatch_fns(&hWinHvPlatform, WINHV_PLATFORM_FNS_DEFAULT)) {
2737 goto error;
2738 }
2739
2740 if (!load_whp_dispatch_fns(&hWinHvEmulation, WINHV_EMULATION_FNS_DEFAULT)) {
2741 goto error;
2742 }
2743
2744 assert(load_whp_dispatch_fns(&hWinHvPlatform,
2745 WINHV_PLATFORM_FNS_SUPPLEMENTAL));
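    /*
     * The supplemental platform functions are loaded with the optional
     * macro and so cannot fail symbol lookup; the load can only fail if
     * the DLL itself cannot be reopened, hence the assertion.
     */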
2746 whp_dispatch_initialized = true;
2747
2748 return true;
2749 error:
2750 if (hWinHvPlatform) {
2751 FreeLibrary(hWinHvPlatform);
2752 }
2753
2754 if (hWinHvEmulation) {
2755 FreeLibrary(hWinHvEmulation);
2756 }
2757
2758 return false;
2759 }
2760
2761 type_init(whpx_type_init);
2762