xref: /cloud-hypervisor/vmm/src/cpu.rs (revision 226ecf47bb608d52367de61236fb8ad37b871ca2)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use std::collections::BTreeMap;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use std::io::Write;
17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
18 use std::mem::size_of;
19 use std::os::unix::thread::JoinHandleExt;
20 use std::sync::atomic::{AtomicBool, Ordering};
21 use std::sync::{Arc, Barrier, Mutex};
22 use std::{cmp, io, result, thread};
23 
24 #[cfg(not(target_arch = "riscv64"))]
25 use acpi_tables::sdt::Sdt;
26 use acpi_tables::{aml, Aml};
27 use anyhow::anyhow;
28 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
29 use arch::aarch64::regs;
30 #[cfg(target_arch = "x86_64")]
31 use arch::x86_64::get_x2apic_id;
32 use arch::{EntryPoint, NumaNodes};
33 #[cfg(target_arch = "aarch64")]
34 use devices::gic::Gic;
35 use devices::interrupt_controller::InterruptController;
36 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
37 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
38 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
39 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
40 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
41 use hypervisor::arch::x86::msr_index;
42 #[cfg(target_arch = "x86_64")]
43 use hypervisor::arch::x86::CpuIdEntry;
44 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
45 use hypervisor::arch::x86::MsrEntry;
46 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
47 use hypervisor::arch::x86::SpecialRegisters;
48 #[cfg(feature = "tdx")]
49 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
50 #[cfg(target_arch = "x86_64")]
51 use hypervisor::CpuVendor;
52 #[cfg(feature = "kvm")]
53 use hypervisor::HypervisorType;
54 #[cfg(feature = "guest_debug")]
55 use hypervisor::StandardRegisters;
56 use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
57 use libc::{c_void, siginfo_t};
58 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
59 use linux_loader::elf::Elf64_Nhdr;
60 use seccompiler::{apply_filter, SeccompAction};
61 use thiserror::Error;
62 use tracer::trace_scoped;
63 use vm_device::BusDevice;
64 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
65 use vm_memory::ByteValued;
66 #[cfg(feature = "guest_debug")]
67 use vm_memory::{Bytes, GuestAddressSpace};
68 use vm_memory::{GuestAddress, GuestMemoryAtomic};
69 use vm_migration::{
70     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
71     Transportable,
72 };
73 use vmm_sys_util::eventfd::EventFd;
74 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
75 use zerocopy::AsBytes;
76 
77 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
78 use crate::coredump::{
79     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
80     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
81     NT_PRSTATUS,
82 };
83 #[cfg(feature = "guest_debug")]
84 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
85 #[cfg(target_arch = "x86_64")]
86 use crate::memory_manager::MemoryManager;
87 use crate::seccomp_filters::{get_seccomp_filter, Thread};
88 #[cfg(target_arch = "x86_64")]
89 use crate::vm::physical_bits;
90 use crate::vm_config::CpusConfig;
91 use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID};
92 
93 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
94 /// Extract the specified bits of a 64-bit integer.
95 /// For example, to extract 2 bits at offset 1 (zero-based) from `6u64`,
96 /// the following expression should return 3 (`0b11`):
97 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
98 ///
99 macro_rules! extract_bits_64 {
100     ($value: tt, $offset: tt, $length: tt) => {
101         ($value >> $offset) & (!0u64 >> (64 - $length))
102     };
103 }
104 
105 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
106 macro_rules! extract_bits_64_without_offset {
107     ($value: tt, $length: tt) => {
108         $value & (!0u64 >> (64 - $length))
109     };
110 }
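
// A quick sanity check of the two macros above; a minimal illustrative
// sketch (the test module name is arbitrary, not part of the original API).
#[cfg(all(test, target_arch = "aarch64", feature = "guest_debug"))]
mod bit_extraction_checks {
    #[test]
    fn extract_bits() {
        // 2 bits at offset 1 of 0b0000_0110 are 0b11 (i.e. 3).
        assert_eq!(extract_bits_64!(0b0000_0110u64, 1, 2), 0b11);
        // The low 3 bits of 0b1101 are 0b101.
        assert_eq!(extract_bits_64_without_offset!(0b1101u64, 3), 0b101);
    }
}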
111 
112 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
113 
114 #[derive(Debug, Error)]
115 pub enum Error {
116     #[error("Error creating vCPU: {0}")]
117     VcpuCreate(#[source] anyhow::Error),
118 
119     #[error("Error running bCPU: {0}")]
120     VcpuRun(#[source] anyhow::Error),
121 
122     #[error("Error spawning vCPU thread: {0}")]
123     VcpuSpawn(#[source] io::Error),
124 
125     #[error("Error generating common CPUID: {0}")]
126     CommonCpuId(#[source] arch::Error),
127 
128     #[error("Error configuring vCPU: {0}")]
129     VcpuConfiguration(#[source] arch::Error),
130 
131     #[error("Still pending removed vcpu")]
132     VcpuPendingRemovedVcpu,
133 
134     #[cfg(target_arch = "aarch64")]
135     #[error("Error fetching preferred target: {0}")]
136     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
137 
138     #[cfg(target_arch = "aarch64")]
139     #[error("Error setting vCPU processor features: {0}")]
140     VcpuSetProcessorFeatures(#[source] hypervisor::HypervisorCpuError),
141 
142     #[cfg(target_arch = "aarch64")]
143     #[error("Error initialising vCPU: {0}")]
144     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
145 
146     #[cfg(target_arch = "aarch64")]
147     #[error("Error finalising vCPU: {0}")]
148     VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError),
149 
150     #[error("Failed to join on vCPU threads: {0:?}")]
151     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
152 
153     #[error("Error adding CpuManager to MMIO bus: {0}")]
154     BusError(#[source] vm_device::BusError),
155 
156     #[error("Requested vCPUs exceed maximum")]
157     DesiredVCpuCountExceedsMax,
158 
159     #[error("Cannot create seccomp filter: {0}")]
160     CreateSeccompFilter(#[source] seccompiler::Error),
161 
162     #[error("Cannot apply seccomp filter: {0}")]
163     ApplySeccompFilter(#[source] seccompiler::Error),
164 
165     #[error("Error starting vCPU after restore: {0}")]
166     StartRestoreVcpu(#[source] anyhow::Error),
167 
168     #[error("Unexpected VmExit")]
169     UnexpectedVmExit,
170 
171     #[error("Failed to allocate MMIO address for CpuManager")]
172     AllocateMmmioAddress,
173 
174     #[cfg(feature = "tdx")]
175     #[error("Error initializing TDX: {0}")]
176     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
177 
178     #[cfg(target_arch = "aarch64")]
179     #[error("Error initializing PMU: {0}")]
180     InitPmu(#[source] hypervisor::HypervisorCpuError),
181 
182     #[cfg(feature = "guest_debug")]
183     #[error("Error during CPU debug: {0}")]
184     CpuDebug(#[source] hypervisor::HypervisorCpuError),
185 
186     #[cfg(feature = "guest_debug")]
187     #[error("Error translating virtual address: {0}")]
188     TranslateVirtualAddress(#[source] anyhow::Error),
189 
190     #[cfg(target_arch = "x86_64")]
191     #[error("Error setting up AMX: {0}")]
192     AmxEnable(#[source] anyhow::Error),
193 
194     #[error("Maximum number of vCPUs exceeds host limit")]
195     MaximumVcpusExceeded,
196 
197     #[cfg(feature = "sev_snp")]
198     #[error("Failed to set sev control register: {0}")]
199     SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),
200 
201     #[cfg(target_arch = "x86_64")]
202     #[error("Failed to inject NMI")]
203     NmiError(hypervisor::HypervisorCpuError),
204 }
205 pub type Result<T> = result::Result<T, Error>;
206 
207 #[cfg(target_arch = "x86_64")]
208 #[allow(dead_code)]
209 #[repr(C, packed)]
210 #[derive(AsBytes)]
211 struct LocalX2Apic {
212     pub r#type: u8,
213     pub length: u8,
214     pub _reserved: u16,
215     pub apic_id: u32,
216     pub flags: u32,
217     pub processor_id: u32,
218 }
219 
220 #[allow(dead_code)]
221 #[repr(C, packed)]
222 #[derive(Default, AsBytes)]
223 struct Ioapic {
224     pub r#type: u8,
225     pub length: u8,
226     pub ioapic_id: u8,
227     _reserved: u8,
228     pub apic_address: u32,
229     pub gsi_base: u32,
230 }
231 
232 #[cfg(target_arch = "aarch64")]
233 #[allow(dead_code)]
234 #[repr(C, packed)]
235 #[derive(AsBytes)]
236 struct GicC {
237     pub r#type: u8,
238     pub length: u8,
239     pub reserved0: u16,
240     pub cpu_interface_number: u32,
241     pub uid: u32,
242     pub flags: u32,
243     pub parking_version: u32,
244     pub performance_interrupt: u32,
245     pub parked_address: u64,
246     pub base_address: u64,
247     pub gicv_base_address: u64,
248     pub gich_base_address: u64,
249     pub vgic_interrupt: u32,
250     pub gicr_base_address: u64,
251     pub mpidr: u64,
252     pub proc_power_effi_class: u8,
253     pub reserved1: u8,
254     pub spe_overflow_interrupt: u16,
255 }
256 
257 #[cfg(target_arch = "aarch64")]
258 #[allow(dead_code)]
259 #[repr(C, packed)]
260 #[derive(AsBytes)]
261 struct GicD {
262     pub r#type: u8,
263     pub length: u8,
264     pub reserved0: u16,
265     pub gic_id: u32,
266     pub base_address: u64,
267     pub global_irq_base: u32,
268     pub version: u8,
269     pub reserved1: [u8; 3],
270 }
271 
272 #[cfg(target_arch = "aarch64")]
273 #[allow(dead_code)]
274 #[repr(C, packed)]
275 #[derive(AsBytes)]
276 struct GicR {
277     pub r#type: u8,
278     pub length: u8,
279     pub reserved: u16,
280     pub base_address: u64,
281     pub range_length: u32,
282 }
283 
284 #[cfg(target_arch = "aarch64")]
285 #[allow(dead_code)]
286 #[repr(C, packed)]
287 #[derive(AsBytes)]
288 struct GicIts {
289     pub r#type: u8,
290     pub length: u8,
291     pub reserved0: u16,
292     pub translation_id: u32,
293     pub base_address: u64,
294     pub reserved1: u32,
295 }
296 
297 #[cfg(target_arch = "aarch64")]
298 #[allow(dead_code)]
299 #[repr(C, packed)]
300 #[derive(AsBytes)]
301 struct ProcessorHierarchyNode {
302     pub r#type: u8,
303     pub length: u8,
304     pub reserved: u16,
305     pub flags: u32,
306     pub parent: u32,
307     pub acpi_processor_id: u32,
308     pub num_private_resources: u32,
309 }
310 
311 #[allow(dead_code)]
312 #[repr(C, packed)]
313 #[derive(Default, AsBytes)]
314 struct InterruptSourceOverride {
315     pub r#type: u8,
316     pub length: u8,
317     pub bus: u8,
318     pub source: u8,
319     pub gsi: u32,
320     pub flags: u16,
321 }
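
// These packed MADT entries are serialized into the ACPI table as raw bytes
// via `zerocopy::AsBytes`. A minimal sketch of that round trip (the test
// module name is arbitrary and illustrative):
#[cfg(test)]
mod madt_entry_layout {
    use zerocopy::AsBytes;

    use super::Ioapic;

    #[test]
    fn ioapic_entry_is_12_bytes() {
        // The I/O APIC entry is 12 bytes, matching the `length: 12` used
        // when the entry is appended to the MADT.
        assert_eq!(std::mem::size_of::<Ioapic>(), 12);
        let entry = Ioapic {
            r#type: 1, // MADT type 1 = I/O APIC per the ACPI spec
            length: 12,
            ..Default::default()
        };
        assert_eq!(entry.as_bytes().len(), 12);
    }
}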
322 
323 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
324 macro_rules! round_up {
325     ($n:expr,$d:expr) => {
326         (($n + $d - 1) / $d) * $d // round $n up to a multiple of $d
327     };
328 }
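
// The expected rounding behaviour, as a small illustrative check (assumes
// the round-to-next-multiple formula above; module name is arbitrary):
#[cfg(all(test, target_arch = "x86_64", feature = "guest_debug"))]
mod round_up_checks {
    #[test]
    fn rounds_to_next_multiple() {
        assert_eq!(round_up!(9u64, 8u64), 16);
        assert_eq!(round_up!(16u64, 8u64), 16);
        assert_eq!(round_up!(17u64, 8u64), 24);
    }
}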
329 
330 /// A wrapper around creating and using a kvm-based VCPU.
331 pub struct Vcpu {
332     // The hypervisor abstracted CPU.
333     vcpu: Arc<dyn hypervisor::Vcpu>,
334     id: u8,
335     #[cfg(target_arch = "aarch64")]
336     mpidr: u64,
337     saved_state: Option<CpuState>,
338     #[cfg(target_arch = "x86_64")]
339     vendor: CpuVendor,
340 }
341 
342 impl Vcpu {
343     /// Constructs a new VCPU for `vm`.
344     ///
345     /// # Arguments
346     ///
347     /// * `id` - Represents the CPU number between [0, max vcpus).
348     /// * `vm` - The virtual machine this vcpu will get attached to.
349     /// * `vm_ops` - Optional object for exit handling.
350     /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
351     pub fn new(
352         id: u8,
353         apic_id: u8,
354         vm: &Arc<dyn hypervisor::Vm>,
355         vm_ops: Option<Arc<dyn VmOps>>,
356         #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
357     ) -> Result<Self> {
358         let vcpu = vm
359             .create_vcpu(apic_id, vm_ops)
360             .map_err(|e| Error::VcpuCreate(e.into()))?;
361         // Initially the cpuid per vCPU is the one supported by this VM.
362         Ok(Vcpu {
363             vcpu,
364             id,
365             #[cfg(target_arch = "aarch64")]
366             mpidr: 0,
367             saved_state: None,
368             #[cfg(target_arch = "x86_64")]
369             vendor: cpu_vendor,
370         })
371     }
372 
373     /// Configures a vCPU; should be called once per vCPU after creation.
374     ///
375     /// # Arguments
376     ///
377     /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
378     /// * `guest_memory` - Guest memory.
379     /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
380     pub fn configure(
381         &mut self,
382         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
383         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
384         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
385         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
386         #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
387     ) -> Result<()> {
388         #[cfg(target_arch = "aarch64")]
389         {
390             self.init(vm)?;
391             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
392                 .map_err(Error::VcpuConfiguration)?;
393         }
394         #[cfg(target_arch = "riscv64")]
395         arch::configure_vcpu(&self.vcpu, self.id, boot_setup).map_err(Error::VcpuConfiguration)?;
396         info!("Configuring vCPU: cpu_id = {}", self.id);
397         #[cfg(target_arch = "x86_64")]
398         arch::configure_vcpu(
399             &self.vcpu,
400             self.id,
401             boot_setup,
402             cpuid,
403             kvm_hyperv,
404             self.vendor,
405             topology,
406         )
407         .map_err(Error::VcpuConfiguration)?;
408 
409         Ok(())
410     }
411 
412     /// Gets the MPIDR register value.
413     #[cfg(target_arch = "aarch64")]
414     pub fn get_mpidr(&self) -> u64 {
415         self.mpidr
416     }
417 
418     /// Gets the saved vCPU state.
419     #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
420     pub fn get_saved_state(&self) -> Option<CpuState> {
421         self.saved_state.clone()
422     }
423 
424     /// Initializes an aarch64 specific vcpu for booting Linux.
425     #[cfg(target_arch = "aarch64")]
426     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
427         use std::arch::is_aarch64_feature_detected;
428         #[allow(clippy::nonminimal_bool)]
429         let sve_supported =
430             is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2");
431         let mut kvi = self.vcpu.create_vcpu_init();
432 
433         // This reads back the kernel's preferred target type.
434         vm.get_preferred_target(&mut kvi)
435             .map_err(Error::VcpuArmPreferredTarget)?;
436 
437         self.vcpu
438             .vcpu_set_processor_features(vm, &mut kvi, self.id)
439             .map_err(Error::VcpuSetProcessorFeatures)?;
440 
441         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?;
442 
443         if sve_supported {
444             let finalized_features = self.vcpu.vcpu_get_finalized_features();
445             self.vcpu
446                 .vcpu_finalize(finalized_features)
447                 .map_err(Error::VcpuArmFinalize)?;
448         }
449         Ok(())
450     }
451 
452     /// Runs the VCPU until it exits, returning the reason.
453     ///
454     /// Note that the state of the VCPU and associated VM must be setup first for this to do
455     /// anything useful.
456     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
457         self.vcpu.run()
458     }
459 
460     #[cfg(feature = "sev_snp")]
461     pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
462         self.vcpu
463             .set_sev_control_register(vmsa_pfn)
464             .map_err(Error::SetSevControlRegister)
465     }
466 }
467 
468 impl Pausable for Vcpu {}
469 impl Snapshottable for Vcpu {
470     fn id(&self) -> String {
471         self.id.to_string()
472     }
473 
474     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
475         let saved_state = self
476             .vcpu
477             .state()
478             .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;
479 
480         self.saved_state = Some(saved_state.clone());
481 
482         Ok(Snapshot::from_data(SnapshotData::new_from_state(
483             &saved_state,
484         )?))
485     }
486 }
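
// The snapshot above round-trips the hypervisor `CpuState` through
// `SnapshotData`. A minimal sketch of that round trip, with a toy state type
// standing in for `CpuState` (the `ToyState` struct is illustrative only):
#[cfg(test)]
mod vcpu_snapshot_roundtrip {
    use vm_migration::{Snapshot, SnapshotData};

    #[derive(serde::Serialize, serde::Deserialize)]
    struct ToyState {
        counter: u64,
    }

    #[test]
    fn snapshot_roundtrip() {
        let state = ToyState { counter: 7 };
        let snapshot =
            Snapshot::from_data(SnapshotData::new_from_state(&state).unwrap());
        let restored: ToyState = snapshot.to_state().unwrap();
        assert_eq!(restored.counter, 7);
    }
}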
487 
488 pub struct CpuManager {
489     config: CpusConfig,
490     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
491     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
492     #[cfg(target_arch = "x86_64")]
493     cpuid: Vec<CpuIdEntry>,
494     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
495     vm: Arc<dyn hypervisor::Vm>,
496     vcpus_kill_signalled: Arc<AtomicBool>,
497     vcpus_pause_signalled: Arc<AtomicBool>,
498     vcpus_kick_signalled: Arc<AtomicBool>,
499     exit_evt: EventFd,
500     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
501     reset_evt: EventFd,
502     #[cfg(feature = "guest_debug")]
503     vm_debug_evt: EventFd,
504     vcpu_states: Vec<VcpuState>,
505     selected_cpu: u8,
506     vcpus: Vec<Arc<Mutex<Vcpu>>>,
507     seccomp_action: SeccompAction,
508     vm_ops: Arc<dyn VmOps>,
509     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
510     acpi_address: Option<GuestAddress>,
511     proximity_domain_per_cpu: BTreeMap<u8, u32>,
512     affinity: BTreeMap<u8, Vec<usize>>,
513     dynamic: bool,
514     hypervisor: Arc<dyn hypervisor::Hypervisor>,
515     #[cfg(feature = "sev_snp")]
516     sev_snp_enabled: bool,
517 }
518 
519 const CPU_ENABLE_FLAG: usize = 0;
520 const CPU_INSERTING_FLAG: usize = 1;
521 const CPU_REMOVING_FLAG: usize = 2;
522 const CPU_EJECT_FLAG: usize = 3;
523 
524 const CPU_STATUS_OFFSET: u64 = 4;
525 const CPU_SELECTION_OFFSET: u64 = 0;
526 
527 impl BusDevice for CpuManager {
528     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
529         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
530         data.fill(0);
531 
532         match offset {
533             CPU_SELECTION_OFFSET => {
534                 data[0] = self.selected_cpu;
535             }
536             CPU_STATUS_OFFSET => {
537                 if self.selected_cpu < self.max_vcpus() {
538                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
539                     if state.active() {
540                         data[0] |= 1 << CPU_ENABLE_FLAG;
541                     }
542                     if state.inserting {
543                         data[0] |= 1 << CPU_INSERTING_FLAG;
544                     }
545                     if state.removing {
546                         data[0] |= 1 << CPU_REMOVING_FLAG;
547                     }
548                 } else {
549                     warn!("Out of range vCPU id: {}", self.selected_cpu);
550                 }
551             }
552             _ => {
553                 warn!(
554                     "Unexpected offset for accessing CPU manager device: {:#}",
555                     offset
556                 );
557             }
558         }
559     }
560 
561     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
562         match offset {
563             CPU_SELECTION_OFFSET => {
564                 self.selected_cpu = data[0];
565             }
566             CPU_STATUS_OFFSET => {
567                 if self.selected_cpu < self.max_vcpus() {
568                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
569                     // The ACPI code writes back a 1 to acknowledge the insertion
570                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
571                         && state.inserting
572                     {
573                         state.inserting = false;
574                     }
575                     // Ditto for removal
576                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
577                         && state.removing
578                     {
579                         state.removing = false;
580                     }
581                     // Trigger removal of vCPU
582                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
583                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
584                             error!("Error removing vCPU: {:?}", e);
585                         }
586                     }
587                 } else {
588                     warn!("Out of range vCPU id: {}", self.selected_cpu);
589                 }
590             }
591             _ => {
592                 warn!(
593                     "Unexpected offset for accessing CPU manager device: {:#}",
594                     offset
595                 );
596             }
597         }
598         None
599     }
600 }
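
// A minimal sketch of the guest-visible protocol implemented above, using a
// simplified stand-in for the device (the `StandIn` type and its helpers are
// illustrative; the real device is the `CpuManager` BusDevice impl):
#[cfg(test)]
mod cpu_manager_register_protocol {
    struct StandIn {
        selected_cpu: u8,
        inserting: bool,
    }

    impl StandIn {
        fn read(&self, offset: u64) -> u8 {
            match offset {
                // CPU_SELECTION_OFFSET
                0 => self.selected_cpu,
                // CPU_STATUS_OFFSET: report the inserting bit
                4 if self.inserting => 1 << 1,
                _ => 0,
            }
        }

        fn write(&mut self, offset: u64, data: u8) {
            match offset {
                0 => self.selected_cpu = data,
                // Writing the inserting bit back acknowledges the event.
                4 if data & (1 << 1) != 0 => self.inserting = false,
                _ => {}
            }
        }
    }

    #[test]
    fn select_then_acknowledge() {
        let mut dev = StandIn {
            selected_cpu: 0,
            inserting: true,
        };
        dev.write(0, 3); // select vCPU 3
        assert_eq!(dev.read(0), 3);
        assert_eq!(dev.read(4), 1 << 1); // insertion pending
        dev.write(4, 1 << 1); // acknowledge it
        assert_eq!(dev.read(4), 0);
    }
}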
601 
602 #[derive(Default)]
603 struct VcpuState {
604     inserting: bool,
605     removing: bool,
606     pending_removal: Arc<AtomicBool>,
607     handle: Option<thread::JoinHandle<()>>,
608     kill: Arc<AtomicBool>,
609     vcpu_run_interrupted: Arc<AtomicBool>,
610     paused: Arc<AtomicBool>,
611 }
612 
613 impl VcpuState {
614     fn active(&self) -> bool {
615         self.handle.is_some()
616     }
617 
618     fn signal_thread(&self) {
619         if let Some(handle) = self.handle.as_ref() {
620             loop {
621                 // SAFETY: FFI call with correct arguments
622                 unsafe {
623                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
624                 }
625                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
626                     break;
627                 } else {
628                     // This is more effective than thread::yield_now() at
629                     // avoiding a priority inversion with the vCPU thread
630                     thread::sleep(std::time::Duration::from_millis(1));
631                 }
632             }
633         }
634     }
635 
636     fn join_thread(&mut self) -> Result<()> {
637         if let Some(handle) = self.handle.take() {
638             handle.join().map_err(Error::ThreadCleanup)?
639         }
640 
641         Ok(())
642     }
643 
644     fn unpark_thread(&self) {
645         if let Some(handle) = self.handle.as_ref() {
646             handle.thread().unpark()
647         }
648     }
649 }
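
// A minimal sketch of the pause/resume park-unpark pattern the vCPU threads
// use together with `unpark_thread()` above (names here are illustrative):
#[cfg(test)]
mod pause_park_sketch {
    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::Arc;
    use std::thread;

    #[test]
    fn pause_and_resume() {
        let pause = Arc::new(AtomicBool::new(true));
        let worker = {
            let pause = pause.clone();
            thread::spawn(move || {
                // park() may return spuriously, hence the loop on the flag.
                while pause.load(Ordering::SeqCst) {
                    thread::park();
                }
            })
        };
        // Resume: toggle the flag first, then unpark the thread.
        pause.store(false, Ordering::SeqCst);
        worker.thread().unpark();
        worker.join().unwrap();
    }
}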
650 
651 impl CpuManager {
652     #[allow(unused_variables)]
653     #[allow(clippy::too_many_arguments)]
654     pub fn new(
655         config: &CpusConfig,
656         vm: Arc<dyn hypervisor::Vm>,
657         exit_evt: EventFd,
658         reset_evt: EventFd,
659         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
660         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
661         seccomp_action: SeccompAction,
662         vm_ops: Arc<dyn VmOps>,
663         #[cfg(feature = "tdx")] tdx_enabled: bool,
664         numa_nodes: &NumaNodes,
665         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
666     ) -> Result<Arc<Mutex<CpuManager>>> {
667         if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
668             return Err(Error::MaximumVcpusExceeded);
669         }
670 
671         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
672         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
673         let hypervisor_type = hypervisor.hypervisor_type();
674         #[cfg(target_arch = "x86_64")]
675         let cpu_vendor = hypervisor.get_cpu_vendor();
676 
677         #[cfg(target_arch = "x86_64")]
678         if config.features.amx {
679             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
680             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
681             const XFEATURE_XTILEDATA: usize = 18;
682             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
683 
684             // SAFETY: the syscall is only modifying kernel internal
685             // data structures that the kernel is itself expected to safeguard.
686             let amx_tile = unsafe {
687                 libc::syscall(
688                     libc::SYS_arch_prctl,
689                     ARCH_REQ_XCOMP_GUEST_PERM,
690                     XFEATURE_XTILEDATA,
691                 )
692             };
693 
694             if amx_tile != 0 {
695                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
696             } else {
697                 let mask: usize = 0;
698                 // SAFETY: the mask being modified (not marked mutable as it is
699                 // only modified within the unsafe block, which is permitted) isn't in use elsewhere.
700                 let result = unsafe {
701                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
702                 };
703                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
704                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
705                 }
706             }
707         }
708 
709         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
710             let mut cpu_list = Vec::new();
711             for (proximity_domain, numa_node) in numa_nodes.iter() {
712                 for cpu in numa_node.cpus.iter() {
713                     cpu_list.push((*cpu, *proximity_domain))
714                 }
715             }
716             cpu_list
717         }
718         .into_iter()
719         .collect();
720 
721         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
722             cpu_affinity
723                 .iter()
724                 .map(|a| (a.vcpu, a.host_cpus.clone()))
725                 .collect()
726         } else {
727             BTreeMap::new()
728         };
729 
730         #[cfg(feature = "tdx")]
731         let dynamic = !tdx_enabled;
732         #[cfg(not(feature = "tdx"))]
733         let dynamic = true;
734 
735         Ok(Arc::new(Mutex::new(CpuManager {
736             config: config.clone(),
737             interrupt_controller: None,
738             #[cfg(target_arch = "x86_64")]
739             cpuid: Vec::new(),
740             vm,
741             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
742             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
743             vcpus_kick_signalled: Arc::new(AtomicBool::new(false)),
744             vcpu_states,
745             exit_evt,
746             reset_evt,
747             #[cfg(feature = "guest_debug")]
748             vm_debug_evt,
749             selected_cpu: 0,
750             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
751             seccomp_action,
752             vm_ops,
753             acpi_address: None,
754             proximity_domain_per_cpu,
755             affinity,
756             dynamic,
757             hypervisor: hypervisor.clone(),
758             #[cfg(feature = "sev_snp")]
759             sev_snp_enabled,
760         })))
761     }
762 
763     #[cfg(target_arch = "x86_64")]
764     pub fn populate_cpuid(
765         &mut self,
766         memory_manager: &Arc<Mutex<MemoryManager>>,
767         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
768         #[cfg(feature = "tdx")] tdx: bool,
769     ) -> Result<()> {
770         let sgx_epc_sections = memory_manager
771             .lock()
772             .unwrap()
773             .sgx_epc_region()
774             .as_ref()
775             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
776 
777         self.cpuid = {
778             let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
779             arch::generate_common_cpuid(
780                 hypervisor,
781                 &arch::CpuidConfig {
782                     sgx_epc_sections,
783                     phys_bits,
784                     kvm_hyperv: self.config.kvm_hyperv,
785                     #[cfg(feature = "tdx")]
786                     tdx,
787                     amx: self.config.features.amx,
788                 },
789             )
790             .map_err(Error::CommonCpuId)?
791         };
792 
793         Ok(())
794     }
795 
796     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
797         info!("Creating vCPU: cpu_id = {}", cpu_id);
798 
799         #[cfg(target_arch = "x86_64")]
800         let topology = self.get_vcpu_topology();
801         #[cfg(target_arch = "x86_64")]
802         let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
803         #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
804         let x2apic_id = cpu_id as u32;
805 
806         let mut vcpu = Vcpu::new(
807             cpu_id,
808             x2apic_id as u8,
809             &self.vm,
810             Some(self.vm_ops.clone()),
811             #[cfg(target_arch = "x86_64")]
812             self.hypervisor.get_cpu_vendor(),
813         )?;
814 
815         if let Some(snapshot) = snapshot {
816             // AArch64 vCPUs should be initialized after being created.
817             #[cfg(target_arch = "aarch64")]
818             vcpu.init(&self.vm)?;
819 
820             let state: CpuState = snapshot.to_state().map_err(|e| {
821                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
822             })?;
823             vcpu.vcpu
824                 .set_state(&state)
825                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
826 
827             vcpu.saved_state = Some(state);
828         }
829 
830         let vcpu = Arc::new(Mutex::new(vcpu));
831 
832         // Adding vCPU to the CpuManager's vCPU list.
833         self.vcpus.push(vcpu.clone());
834 
835         Ok(vcpu)
836     }
837 
838     pub fn configure_vcpu(
839         &self,
840         vcpu: Arc<Mutex<Vcpu>>,
841         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
842     ) -> Result<()> {
843         let mut vcpu = vcpu.lock().unwrap();
844 
845         #[cfg(feature = "sev_snp")]
846         if self.sev_snp_enabled {
847             if let Some((kernel_entry_point, _)) = boot_setup {
848                 vcpu.set_sev_control_register(
849                     kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
850                 )?;
851             }
852 
853             // Traditional way to configure vcpu doesn't work for SEV-SNP guests.
854             // All the vCPU configuration for SEV-SNP guest is provided via VMSA.
855             return Ok(());
856         }
857 
858         #[cfg(target_arch = "x86_64")]
859         assert!(!self.cpuid.is_empty());
860 
861         #[cfg(target_arch = "x86_64")]
862         let topology = self.config.topology.clone().map_or_else(
863             || Some((1, self.boot_vcpus(), 1)),
864             |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
865         );
866         #[cfg(target_arch = "x86_64")]
867         vcpu.configure(
868             boot_setup,
869             self.cpuid.clone(),
870             self.config.kvm_hyperv,
871             topology,
872         )?;
873 
874         #[cfg(target_arch = "aarch64")]
875         vcpu.configure(&self.vm, boot_setup)?;
876 
877         #[cfg(target_arch = "riscv64")]
878         vcpu.configure(boot_setup)?;
879 
880         Ok(())
881     }
882 
883     /// Only create new vCPUs if there aren't any inactive ones to reuse
884     fn create_vcpus(
885         &mut self,
886         desired_vcpus: u8,
887         snapshot: Option<Snapshot>,
888     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
889         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
890         info!(
891             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
892             desired_vcpus,
893             self.config.max_vcpus,
894             self.vcpus.len(),
895             self.present_vcpus()
896         );
897 
898         if desired_vcpus > self.config.max_vcpus {
899             return Err(Error::DesiredVCpuCountExceedsMax);
900         }
901 
902         // Only create vCPUs in excess of all the allocated vCPUs.
903         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
904             vcpus.push(self.create_vcpu(
905                 cpu_id,
906                 // TODO: The special format of the CPU id can be removed once
907                 // ready to break live upgrade.
908                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
909             )?);
910         }
911 
912         Ok(vcpus)
913     }
914 
915     #[cfg(target_arch = "aarch64")]
916     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
917         for cpu in self.vcpus.iter() {
918             let cpu = cpu.lock().unwrap();
919             // Check if the PMU attribute is available; if not, log it and skip PMU init.
920             if cpu.vcpu.has_pmu_support() {
921                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
922             } else {
923                 debug!(
924                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
925                     cpu.id
926                 );
927                 return Ok(false);
928             }
929         }
930 
931         Ok(true)
932     }
933 
934     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
935         self.vcpus.clone()
936     }
937 
938     fn start_vcpu(
939         &mut self,
940         vcpu: Arc<Mutex<Vcpu>>,
941         vcpu_id: u8,
942         vcpu_thread_barrier: Arc<Barrier>,
943         inserting: bool,
944     ) -> Result<()> {
945         let reset_evt = self.reset_evt.try_clone().unwrap();
946         let exit_evt = self.exit_evt.try_clone().unwrap();
947         #[cfg(feature = "kvm")]
948         let hypervisor_type = self.hypervisor.hypervisor_type();
949         #[cfg(feature = "guest_debug")]
950         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
951         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
952         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
953         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
954         let vcpu_kick_signalled = self.vcpus_kick_signalled.clone();
955 
956         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
957         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
958             .vcpu_run_interrupted
959             .clone();
960         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
961         let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();
962 
963         // Prepare the CPU set the current vCPU is expected to run onto.
964         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
965             // SAFETY: all zeros is a valid pattern
966             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
967             // SAFETY: FFI call, trivially safe
968             unsafe { libc::CPU_ZERO(&mut cpuset) };
969             for host_cpu in host_cpus {
970                 // SAFETY: FFI call, trivially safe
971                 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
972             }
973             cpuset
974         });
975 
976         // Retrieve seccomp filter for vcpu thread
977         let vcpu_seccomp_filter = get_seccomp_filter(
978             &self.seccomp_action,
979             Thread::Vcpu,
980             self.hypervisor.hypervisor_type(),
981         )
982         .map_err(Error::CreateSeccompFilter)?;
983 
984         #[cfg(target_arch = "x86_64")]
985         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
986 
987         info!("Starting vCPU: cpu_id = {}", vcpu_id);
988 
989         let handle = Some(
990             thread::Builder::new()
991                 .name(format!("vcpu{vcpu_id}"))
992                 .spawn(move || {
993                     // Schedule the thread to run on the expected CPU set
994                     if let Some(cpuset) = cpuset.as_ref() {
995                         // SAFETY: FFI call with correct arguments
996                         let ret = unsafe {
997                             libc::sched_setaffinity(
998                                 0,
999                                 std::mem::size_of::<libc::cpu_set_t>(),
1000                                 cpuset as *const libc::cpu_set_t,
1001                             )
1002                         };
1003 
1004                         if ret != 0 {
1005                             error!(
1006                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
1007                                 vcpu_id,
1008                                 io::Error::last_os_error()
1009                             );
1010                             return;
1011                         }
1012                     }
1013 
1014                     // Apply seccomp filter for vcpu thread.
1015                     if !vcpu_seccomp_filter.is_empty() {
1016                         if let Err(e) =
1017                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
1018                         {
1019                             error!("Error applying seccomp filter: {:?}", e);
1020                             return;
1021                         }
1022                     }
1023                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
1024                     // This registers an async-signal-safe no-op handler used to interrupt the vCPU threads.
1025                     register_signal_handler(SIGRTMIN(), handle_signal)
1026                         .expect("Failed to register vcpu signal handler");
1027                     // Block until all CPUs are ready.
1028                     vcpu_thread_barrier.wait();
1029 
1030                     std::panic::catch_unwind(move || {
1031                         loop {
1032                             // If we are being told to pause, we park the thread
1033                             // until the pause boolean is toggled.
1034                             // The resume operation is responsible for toggling
1035                             // the boolean and unparking the thread.
1036                             // We enter a loop because park() could spuriously
1037                             // return. We will then park() again unless the
1038                             // pause boolean has been toggled.
1039 
1040                             // Need to use Ordering::SeqCst as we have multiple
1041                             // loads and stores to different atomics and we need
1042                             // to see them in a consistent order in all threads
1043 
1044                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
1045                                 // As a pause can be caused by PIO & MMIO exits, we need to ensure they are
1046                                 // completed by returning to KVM_RUN. From the kernel docs:
1047                                 //
1048                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
1049                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
1050                                 // operations are complete (and guest state is consistent) only after userspace
1051                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
1052                                 // incomplete operations and then check for pending signals.
1053                                 // The pending state of the operation is not preserved in state which is
1054                                 // visible to userspace, thus userspace should ensure that the operation is
1055                                 // completed before performing a live migration.  Userspace can re-enter the
1056                                 // guest with an unmasked signal pending or with the immediate_exit field set
1057                                 // to complete pending operations without allowing any further instructions
1058                                 // to be executed.
1059 
1060                                 #[cfg(feature = "kvm")]
1061                                 if matches!(hypervisor_type, HypervisorType::Kvm) {
1062                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
1063                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
1064                                         error!("Unexpected VM exit on \"immediate_exit\" run");
1065                                         break;
1066                                     }
1067                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
1068                                 }
1069 
1070                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1071 
1072                                 vcpu_paused.store(true, Ordering::SeqCst);
1073                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
1074                                     thread::park();
1075                                 }
1076                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
1077                             }
1078 
1079                             if vcpu_kick_signalled.load(Ordering::SeqCst) {
1080                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1081                                 #[cfg(target_arch = "x86_64")]
1082                                 match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
1083                                     Ok(()) => {},
1084                                     Err(e) => {
1085                                         error!("Error when inject nmi {}", e);
1086                                         break;
1087                                     }
1088                                 }
1089                             }
1090 
1091                             // We've been told to terminate
1092                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1093                                 || vcpu_kill.load(Ordering::SeqCst)
1094                             {
1095                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1096                                 break;
1097                             }
1098 
1099                             #[cfg(feature = "tdx")]
1100                             let mut vcpu = vcpu.lock().unwrap();
1101                             #[cfg(not(feature = "tdx"))]
1102                             let vcpu = vcpu.lock().unwrap();
1103                             // A triple fault is reported as VmExit::Reset, which triggers a VM reset below
1104                             match vcpu.run() {
1105                                 Ok(run) => match run {
1106                                     #[cfg(feature = "kvm")]
1107                                     VmExit::Debug => {
1108                                         info!("VmExit::Debug");
1109                                         #[cfg(feature = "guest_debug")]
1110                                         {
1111                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
1112                                             let raw_tid = get_raw_tid(vcpu_id as usize);
1113                                             vm_debug_evt.write(raw_tid as u64).unwrap();
1114                                         }
1115                                     }
1116                                     #[cfg(target_arch = "x86_64")]
1117                                     VmExit::IoapicEoi(vector) => {
1118                                         if let Some(interrupt_controller) =
1119                                             &interrupt_controller_clone
1120                                         {
1121                                             interrupt_controller
1122                                                 .lock()
1123                                                 .unwrap()
1124                                                 .end_of_interrupt(vector);
1125                                         }
1126                                     }
1127                                     VmExit::Ignore => {}
1128                                     VmExit::Hyperv => {}
1129                                     VmExit::Reset => {
1130                                         info!("VmExit::Reset");
1131                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1132                                         reset_evt.write(1).unwrap();
1133                                         break;
1134                                     }
1135                                     VmExit::Shutdown => {
1136                                         info!("VmExit::Shutdown");
1137                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1138                                         exit_evt.write(1).unwrap();
1139                                         break;
1140                                     }
1141                                     #[cfg(feature = "tdx")]
1142                                     VmExit::Tdx => {
1143                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1144                                             match vcpu.get_tdx_exit_details() {
1145                                                 Ok(details) => match details {
1146                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1147                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1148                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1149                                                     }
1150                                                 },
1151                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1152                                             }
1153                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1154                                         } else {
1155                                             // We should never reach this code;
1156                                             // getting here means the design of
1157                                             // this code is wrong.
1158                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1159                                         }
1160                                     }
1161                                 },
1162 
1163                                 Err(e) => {
1164                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1165                                     vcpu_run_interrupted.store(true, Ordering::SeqCst);
1166                                     exit_evt.write(1).unwrap();
1167                                     break;
1168                                 }
1169                             }
1170 
1171                             // We've been told to terminate
1172                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1173                                 || vcpu_kill.load(Ordering::SeqCst)
1174                             {
1175                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1176                                 break;
1177                             }
1178                         }
1179                     })
1180                     .or_else(|_| {
1181                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1182                         error!("vCPU thread panicked");
1183                         panic_exit_evt.write(1)
1184                     })
1185                     .ok();
1186                 })
1187                 .map_err(Error::VcpuSpawn)?,
1188         );
1189 
1190         // On hotplug, calls into this function carry no entry point. It is for
1191         // those hotplugged CPU additions that we need to set the inserting flag.
1192         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1193         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1194 
1195         Ok(())
1196     }
1197 
1198     /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
1199     fn activate_vcpus(
1200         &mut self,
1201         desired_vcpus: u8,
1202         inserting: bool,
1203         paused: Option<bool>,
1204     ) -> Result<()> {
1205         if desired_vcpus > self.config.max_vcpus {
1206             return Err(Error::DesiredVCpuCountExceedsMax);
1207         }
1208 
1209         let vcpu_thread_barrier = Arc::new(Barrier::new(
1210             (desired_vcpus - self.present_vcpus() + 1) as usize,
1211         ));
1212 
1213         if let Some(paused) = paused {
1214             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1215         }
1216 
1217         info!(
1218             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1219             desired_vcpus,
1220             self.vcpus.len(),
1221             self.present_vcpus(),
1222             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1223         );
1224 
1225         // This reuses any inactive vCPUs as well as any that were newly created
1226         for vcpu_id in self.present_vcpus()..desired_vcpus {
1227             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1228             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1229         }
1230 
1231         // Unblock all CPU threads.
1232         vcpu_thread_barrier.wait();
1233         Ok(())
1234     }
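
    // The synchronization above in miniature (an illustrative sketch, not
    // part of the real code path): the barrier is sized for the new vCPU
    // threads plus this activating thread, so no vCPU enters its run loop
    // before the caller has also reached the barrier.
    //
    //     let new_vcpus = 3;
    //     let barrier = Arc::new(Barrier::new(new_vcpus + 1));
    //     let handles: Vec<_> = (0..new_vcpus)
    //         .map(|_| {
    //             let barrier = barrier.clone();
    //             thread::spawn(move || barrier.wait()) // blocks until released
    //         })
    //         .collect();
    //     barrier.wait(); // releases all "vCPU" threads at once
    //     handles.into_iter().for_each(|h| drop(h.join()));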
1235 
1236     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1237         // Mark vCPUs for removal, actual removal happens on ejection
1238         for cpu_id in desired_vcpus..self.present_vcpus() {
1239             self.vcpu_states[usize::from(cpu_id)].removing = true;
1240             self.vcpu_states[usize::from(cpu_id)]
1241                 .pending_removal
1242                 .store(true, Ordering::SeqCst);
1243         }
1244     }
1245 
1246     pub fn check_pending_removed_vcpu(&mut self) -> bool {
1247         for state in self.vcpu_states.iter() {
1248             if state.active() && state.pending_removal.load(Ordering::SeqCst) {
1249                 return true;
1250             }
1251         }
1252         false
1253     }
1254 
1255     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1256         info!("Removing vCPU: cpu_id = {}", cpu_id);
1257         let state = &mut self.vcpu_states[usize::from(cpu_id)];
1258         state.kill.store(true, Ordering::SeqCst);
1259         state.signal_thread();
1260         state.join_thread()?;
1261         state.handle = None;
1262 
1263         // Once the thread has exited, clear the "kill" so that it can be reused
1264         state.kill.store(false, Ordering::SeqCst);
1265         state.pending_removal.store(false, Ordering::SeqCst);
1266 
1267         Ok(())
1268     }
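
    // The removal sequence above in miniature (illustrative sketch; a sleep
    // loop stands in for the KVM_RUN ioctl that the signal interrupts):
    //
    //     let kill = Arc::new(AtomicBool::new(false));
    //     let handle = {
    //         let kill = kill.clone();
    //         thread::spawn(move || {
    //             while !kill.load(Ordering::SeqCst) {
    //                 thread::sleep(std::time::Duration::from_millis(1));
    //             }
    //         })
    //     };
    //     kill.store(true, Ordering::SeqCst);  // state.kill.store(true, ..)
    //     handle.join().unwrap();              // state.join_thread()
    //     kill.store(false, Ordering::SeqCst); // clear so the slot can be reused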
1269 
1270     pub fn create_boot_vcpus(
1271         &mut self,
1272         snapshot: Option<Snapshot>,
1273     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1274         trace_scoped!("create_boot_vcpus");
1275 
1276         self.create_vcpus(self.boot_vcpus(), snapshot)
1277     }
1278 
1279     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1280     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1281         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1282     }
1283 
1284     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1285         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1286             .map_err(|e| {
1287                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1288             })?;
1289 
1290         Ok(())
1291     }
1292 
1293     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1294         if desired_vcpus == self.present_vcpus() {
1295             return Ok(false);
1296         }
1297 
1298         if !self.dynamic {
1299             return Ok(false);
1300         }
1301 
1302         if self.check_pending_removed_vcpu() {
1303             return Err(Error::VcpuPendingRemovedVcpu);
1304         }
1305 
1306         match desired_vcpus.cmp(&self.present_vcpus()) {
1307             cmp::Ordering::Greater => {
1308                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1309                 for vcpu in vcpus {
1310                     self.configure_vcpu(vcpu, None)?
1311                 }
1312                 self.activate_vcpus(desired_vcpus, true, None)?;
1313                 Ok(true)
1314             }
1315             cmp::Ordering::Less => {
1316                 self.mark_vcpus_for_removal(desired_vcpus);
1317                 Ok(true)
1318             }
1319             _ => Ok(false),
1320         }
1321     }
1322 
1323     pub fn shutdown(&mut self) -> Result<()> {
1324         // Tell the vCPUs to stop themselves next time they go through the loop
1325         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1326 
1327         // Toggle the vCPUs pause boolean
1328         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1329 
1330         // Unpark all the VCPU threads.
1331         for state in self.vcpu_states.iter() {
1332             state.unpark_thread();
1333         }
1334 
1335         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1336         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1337         // above.
1338         for state in self.vcpu_states.iter() {
1339             state.signal_thread();
1340         }
1341 
1342         // Wait for all the threads to finish. This removes the state from the vector.
1343         for mut state in self.vcpu_states.drain(..) {
1344             state.join_thread()?;
1345         }
1346 
1347         Ok(())
1348     }
1349 
1350     #[cfg(feature = "tdx")]
1351     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1352         for vcpu in &self.vcpus {
1353             vcpu.lock()
1354                 .unwrap()
1355                 .vcpu
1356                 .tdx_init(hob_address)
1357                 .map_err(Error::InitializeTdx)?;
1358         }
1359         Ok(())
1360     }
1361 
1362     pub fn boot_vcpus(&self) -> u8 {
1363         self.config.boot_vcpus
1364     }
1365 
1366     pub fn max_vcpus(&self) -> u8 {
1367         self.config.max_vcpus
1368     }
1369 
1370     #[cfg(target_arch = "x86_64")]
1371     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1372         assert!(!self.cpuid.is_empty());
1373         self.cpuid.clone()
1374     }
1375 
1376     fn present_vcpus(&self) -> u8 {
1377         self.vcpu_states
1378             .iter()
1379             .fold(0, |acc, state| acc + state.active() as u8)
1380     }
1381 
1382     #[cfg(target_arch = "aarch64")]
1383     pub fn get_mpidrs(&self) -> Vec<u64> {
1384         self.vcpus
1385             .iter()
1386             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1387             .collect()
1388     }
1389 
1390     #[cfg(target_arch = "aarch64")]
1391     pub fn get_saved_states(&self) -> Vec<CpuState> {
1392         self.vcpus
1393             .iter()
1394             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1395             .collect()
1396     }
1397 
1398     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1399         self.config
1400             .topology
1401             .clone()
1402             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1403     }
1404 
1405     #[cfg(not(target_arch = "riscv64"))]
1406     pub fn create_madt(&self) -> Sdt {
1407         use crate::acpi;
1408         // This is also checked during command-line parsing.
1409         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1410 
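             // Per the ACPI spec, the MADT header is 44 bytes: the 36-byte common
             // SDT header plus the 4-byte Local Interrupt Controller Address and
             // 4-byte Flags fields. Revision 5 is used here.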
1411         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1412         #[cfg(target_arch = "x86_64")]
1413         {
1414             madt.write(36, arch::layout::APIC_START.0);
1415 
1416             for cpu in 0..self.config.max_vcpus {
1417                 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());
1418 
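                     // Local x2APIC flags (ACPI): bit 0 = Enabled, set only for
                     // boot vCPUs; bit 1 = Online Capable, set so the remaining
                     // vCPUs can be hot-added later.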
1419                 let lapic = LocalX2Apic {
1420                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1421                     length: 16,
1422                     processor_id: cpu.into(),
1423                     apic_id: x2apic_id,
1424                     flags: if cpu < self.config.boot_vcpus {
1425                         1 << MADT_CPU_ENABLE_FLAG
1426                     } else {
1427                         0
1428                     } | (1 << MADT_CPU_ONLINE_CAPABLE_FLAG),
1429                     _reserved: 0,
1430                 };
1431                 madt.append(lapic);
1432             }
1433 
1434             madt.append(Ioapic {
1435                 r#type: acpi::ACPI_APIC_IO,
1436                 length: 12,
1437                 ioapic_id: 0,
1438                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1439                 gsi_base: 0,
1440                 ..Default::default()
1441             });
1442 
1443             madt.append(InterruptSourceOverride {
1444                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1445                 length: 10,
1446                 bus: 0,
1447                 source: 4,
1448                 gsi: 4,
1449                 flags: 0,
1450             });
1451         }
1452 
1453         #[cfg(target_arch = "aarch64")]
1454         {
1455             /* Note:
1456              * The Local Interrupt Controller Address at byte offset 36 of the MADT is ignored.
1457              */
1458 
1459             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1460             for cpu in 0..self.config.boot_vcpus {
1461                 let vcpu = &self.vcpus[cpu as usize];
1462                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1463                 /* ARMv8 MPIDR format:
1464                      Bits [63:40] Must be zero
1465                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1466                      Bits [31:24] Must be zero
1467                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1468                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1469                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1470                 */
1471                 let mpidr_mask = 0xff_00ff_ffff;
1472                 let gicc = GicC {
1473                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1474                     length: 80,
1475                     reserved0: 0,
1476                     cpu_interface_number: cpu as u32,
1477                     uid: cpu as u32,
1478                     flags: 1,
1479                     parking_version: 0,
1480                     performance_interrupt: 0,
1481                     parked_address: 0,
1482                     base_address: 0,
1483                     gicv_base_address: 0,
1484                     gich_base_address: 0,
1485                     vgic_interrupt: 0,
1486                     gicr_base_address: 0,
1487                     mpidr: mpidr & mpidr_mask,
1488                     proc_power_effi_class: 0,
1489                     reserved1: 0,
1490                     spe_overflow_interrupt: 0,
1491                 };
1492 
1493                 madt.append(gicc);
1494             }
1495             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1496 
1497             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1498             let gicd = GicD {
1499                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1500                 length: 24,
1501                 reserved0: 0,
1502                 gic_id: 0,
1503                 base_address: vgic_config.dist_addr,
1504                 global_irq_base: 0,
1505                 version: 3,
1506                 reserved1: [0; 3],
1507             };
1508             madt.append(gicd);
1509 
1510             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1511             let gicr = GicR {
1512                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1513                 length: 16,
1514                 reserved: 0,
1515                 base_address: vgic_config.redists_addr,
1516                 range_length: vgic_config.redists_size as u32,
1517             };
1518             madt.append(gicr);
1519 
1520             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1521             let gicits = GicIts {
1522                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1523                 length: 20,
1524                 reserved0: 0,
1525                 translation_id: 0,
1526                 base_address: vgic_config.msi_addr,
1527                 reserved1: 0,
1528             };
1529             madt.append(gicits);
1530 
1531             madt.update_checksum();
1532         }
1533 
1534         madt
1535     }
1536 
1537     #[cfg(target_arch = "aarch64")]
1538     pub fn create_pptt(&self) -> Sdt {
1539         let pptt_start = 0;
1540         let mut cpus = 0;
1541         let mut uid = 0;
1542         // If topology is not specified, the default setting is:
1543         // 1 package, multiple cores, 1 thread per core
1544         // This is also the behavior when PPTT is missing.
1545         let (threads_per_core, cores_per_package, packages) =
1546             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1547 
1548         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1549 
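             // Processor Hierarchy Node flags used below (per the ACPI spec):
             // bit 1 = ACPI Processor ID valid, bit 2 = Processor is a thread,
             // bit 3 = Node is a leaf. Hence 0x2 for packages and non-leaf cores,
             // 0xA for leaf cores, and 0xE for leaf threads.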
1550         for cluster_idx in 0..packages {
1551             if cpus < self.config.boot_vcpus as usize {
1552                 let cluster_offset = pptt.len() - pptt_start;
1553                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1554                     r#type: 0,
1555                     length: 20,
1556                     reserved: 0,
1557                     flags: 0x2,
1558                     parent: 0,
1559                     acpi_processor_id: cluster_idx as u32,
1560                     num_private_resources: 0,
1561                 };
1562                 pptt.append(cluster_hierarchy_node);
1563 
1564                 for core_idx in 0..cores_per_package {
1565                     let core_offset = pptt.len() - pptt_start;
1566 
1567                     if threads_per_core > 1 {
1568                         let core_hierarchy_node = ProcessorHierarchyNode {
1569                             r#type: 0,
1570                             length: 20,
1571                             reserved: 0,
1572                             flags: 0x2,
1573                             parent: cluster_offset as u32,
1574                             acpi_processor_id: core_idx as u32,
1575                             num_private_resources: 0,
1576                         };
1577                         pptt.append(core_hierarchy_node);
1578 
1579                         for _thread_idx in 0..threads_per_core {
1580                             let thread_hierarchy_node = ProcessorHierarchyNode {
1581                                 r#type: 0,
1582                                 length: 20,
1583                                 reserved: 0,
1584                                 flags: 0xE,
1585                                 parent: core_offset as u32,
1586                                 acpi_processor_id: uid as u32,
1587                                 num_private_resources: 0,
1588                             };
1589                             pptt.append(thread_hierarchy_node);
1590                             uid += 1;
1591                         }
1592                     } else {
1593                         let thread_hierarchy_node = ProcessorHierarchyNode {
1594                             r#type: 0,
1595                             length: 20,
1596                             reserved: 0,
1597                             flags: 0xA,
1598                             parent: cluster_offset as u32,
1599                             acpi_processor_id: uid as u32,
1600                             num_private_resources: 0,
1601                         };
1602                         pptt.append(thread_hierarchy_node);
1603                         uid += 1;
1604                     }
1605                 }
1606                 cpus += cores_per_package as usize * threads_per_core as usize;
1607             }
1608         }
1609 
1610         pptt.update_checksum();
1611         pptt
1612     }
1613 
1614     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1615     fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters {
1616         self.vcpus[usize::from(cpu_id)]
1617             .lock()
1618             .unwrap()
1619             .vcpu
1620             .create_standard_regs()
1621     }
1622 
1623     #[cfg(feature = "guest_debug")]
1624     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1625         self.vcpus[usize::from(cpu_id)]
1626             .lock()
1627             .unwrap()
1628             .vcpu
1629             .get_regs()
1630             .map_err(Error::CpuDebug)
1631     }
1632 
1633     #[cfg(feature = "guest_debug")]
1634     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1635         self.vcpus[usize::from(cpu_id)]
1636             .lock()
1637             .unwrap()
1638             .vcpu
1639             .set_regs(regs)
1640             .map_err(Error::CpuDebug)
1641     }
1642 
1643     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1644     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1645         self.vcpus[usize::from(cpu_id)]
1646             .lock()
1647             .unwrap()
1648             .vcpu
1649             .get_sregs()
1650             .map_err(Error::CpuDebug)
1651     }
1652 
1653     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1654     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1655         self.vcpus[usize::from(cpu_id)]
1656             .lock()
1657             .unwrap()
1658             .vcpu
1659             .set_sregs(sregs)
1660             .map_err(Error::CpuDebug)
1661     }
1662 
1663     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1664     fn translate_gva(
1665         &self,
1666         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1667         cpu_id: u8,
1668         gva: u64,
1669     ) -> Result<u64> {
1670         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1671             .lock()
1672             .unwrap()
1673             .vcpu
1674             .translate_gva(gva, /* flags: unused */ 0)
1675             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1676         Ok(gpa)
1677     }
1678 
1679     ///
1680     /// On AArch64, the `translate_gva` API is not provided by KVM. We
1681     /// implement it in the VMM by walking the translation tables.
1682     ///
1683     /// Address translation is a big topic; here we only cover the scenario
1684     /// the VMM hits while debugging a kernel. This `translate_gva`
1685     /// implementation is restricted to:
1686     /// - Exception Level 1
1687     /// - the high address range only (kernel space)
1688     ///
1689     /// This implementation supports the following Armv8-A features related
1690     /// to address translation:
1691     /// - FEAT_LPA
1692     /// - FEAT_LVA
1693     /// - FEAT_LPA2
1694     ///
1695     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1696     fn translate_gva(
1697         &self,
1698         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1699         cpu_id: u8,
1700         gva: u64,
1701     ) -> Result<u64> {
1702         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1703             .lock()
1704             .unwrap()
1705             .vcpu
1706             .get_sys_reg(regs::TCR_EL1)
1707             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1708         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1709             .lock()
1710             .unwrap()
1711             .vcpu
1712             .get_sys_reg(regs::TTBR1_EL1)
1713             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1714         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1715             .lock()
1716             .unwrap()
1717             .vcpu
1718             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1719             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1720 
1721         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1722         // or low (0x000xxx...).
1723         let high_range = extract_bits_64!(gva, 55, 1);
1724         if high_range == 0 {
1725             info!("VA (0x{:x}) range is not supported!", gva);
1726             return Ok(gva);
1727         }
1728 
1729         // High range size offset
1730         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1731         // Granule size
1732         let tg = extract_bits_64!(tcr_el1, 30, 2);
1733         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1734         let ds = extract_bits_64!(tcr_el1, 59, 1);
1735 
1736         if tsz == 0 {
1737             info!("VA translation is not ready!");
1738             return Ok(gva);
1739         }
1740 
1741         // VA size is determined by TCR_EL1.T1SZ
1742         let va_size = 64 - tsz;
1743         // Number of bits in VA consumed in each level of translation
1744         let stride = match tg {
1745             3 => 13, // 64KB granule size
1746             1 => 11, // 16KB granule size
1747             _ => 9,  // 4KB, default
1748         };
1749         // Starting level of walking
1750         let mut level = 4 - (va_size - 4) / stride;
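             // For example, with 4KB granules (stride = 9), a 39-bit VA space
             // (T1SZ = 25) starts the walk at level 4 - (39 - 4) / 9 = 1, while
             // a full 48-bit VA space starts at level 0.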
1751 
1752         // Determine the PA/IPA size
1753         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1754         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1755         // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should match.
1756         // To be safe, we use the minimum value if they are different.
1757         let pa_range = std::cmp::min(tcr_ips, pa_range);
1758         // PA size in bits
1759         let pa_size = match pa_range {
1760             0 => 32,
1761             1 => 36,
1762             2 => 40,
1763             3 => 42,
1764             4 => 44,
1765             5 => 48,
1766             6 => 52,
1767             _ => {
1768                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1769                     "PA range not supported {pa_range}"
1770                 ))))
1771             }
1772         };
1773 
1774         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1775         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1776         // If FEAT_LPA2 is present, the translation table descriptor holds
1777         // 50 bits of the table address of next level.
1778         // Otherwise, it is 48 bits.
1779         let descaddrmask = if ds == 1 {
1780             !0u64 >> (64 - 50) // mask with 50 least significant bits
1781         } else {
1782             !0u64 >> (64 - 48) // mask with 48 least significant bits
1783         };
1784         let descaddrmask = descaddrmask & !indexmask_grainsize;
1785 
1786         // Translation table base address
1787         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1788         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1789         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1790         if pa_size == 52 {
1791             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1792         }
1793 
1794         // Loop through tables of each level
1795         loop {
1796             // Table offset for current level
1797             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1798             descaddr |= table_offset;
1799             descaddr &= !7u64;
1800 
1801             let mut buf = [0; 8];
1802             guest_memory
1803                 .memory()
1804                 .read(&mut buf, GuestAddress(descaddr))
1805                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1806             let descriptor = u64::from_le_bytes(buf);
1807 
1808             descaddr = descriptor & descaddrmask;
1809             // In the case of FEAT_LPA, the next-level translation table address
1810             // bits [48:51] come from bits [12:15] of the current descriptor.
1811             // For FEAT_LPA2, the next-level translation table address
1812             // bits [50:51] come from bits [8:9] of the current descriptor, while
1813             // bits [48:49] come from bits [48:49] of the descriptor, already
1814             // handled above.
1815             if pa_size == 52 {
1816                 if ds == 1 {
1817                     // FEAT_LPA2
1818                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1819                 } else {
1820                     // FEAT_LPA
1821                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1822                 }
1823             }
1824 
1825             if (descriptor & 2) != 0 && (level < 3) {
1826                 // This is a table entry. Go down to next level.
1827                 level += 1;
1828                 indexmask = indexmask_grainsize;
1829                 continue;
1830             }
1831 
1832             break;
1833         }
1834 
1835         // We have reached either:
1836         // - a page entry at level 3 or
1837         // - a block entry at level 1 or 2
1838         let page_size = 1u64 << ((stride * (4 - level)) + 3);
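             // E.g. with 4KB granules (stride = 9): a level-3 page entry yields
             // 1 << 12 = 4KiB, a level-2 block entry 1 << 21 = 2MiB.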
1839         descaddr &= !(page_size - 1);
1840         descaddr |= gva & (page_size - 1);
1841 
1842         Ok(descaddr)
1843     }
1844 
1845     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1846         self.acpi_address = Some(acpi_address);
1847     }
1848 
1849     pub(crate) fn set_interrupt_controller(
1850         &mut self,
1851         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1852     ) {
1853         self.interrupt_controller = Some(interrupt_controller);
1854     }
1855 
1856     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1857         &self.vcpus_kill_signalled
1858     }
1859 
1860     #[cfg(feature = "igvm")]
1861     pub(crate) fn get_cpuid_leaf(
1862         &self,
1863         cpu_id: u8,
1864         eax: u32,
1865         ecx: u32,
1866         xfem: u64,
1867         xss: u64,
1868     ) -> Result<[u32; 4]> {
1869         let leaf_info = self.vcpus[usize::from(cpu_id)]
1870             .lock()
1871             .unwrap()
1872             .vcpu
1873             .get_cpuid_values(eax, ecx, xfem, xss)
1874             .unwrap();
1875         Ok(leaf_info)
1876     }
1877 
1878     #[cfg(feature = "sev_snp")]
1879     pub(crate) fn sev_snp_enabled(&self) -> bool {
1880         self.sev_snp_enabled
1881     }
1882 
1883     pub(crate) fn nmi(&self) -> Result<()> {
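             // Raise the kick flag and signal every vCPU thread; a thread
             // interrupted out of KVM_RUN observes the flag in its run loop and
             // injects an NMI before the flag is cleared again below.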
1884         self.vcpus_kick_signalled.store(true, Ordering::SeqCst);
1885 
1886         for state in self.vcpu_states.iter() {
1887             state.signal_thread();
1888         }
1889 
1890         self.vcpus_kick_signalled.store(false, Ordering::SeqCst);
1891 
1892         Ok(())
1893     }
1894 }
1895 
1896 struct Cpu {
1897     cpu_id: u8,
1898     proximity_domain: u32,
1899     dynamic: bool,
1900     #[cfg(target_arch = "x86_64")]
1901     topology: Option<(u8, u8, u8)>,
1902 }
1903 
1904 #[cfg(target_arch = "x86_64")]
1905 const MADT_CPU_ENABLE_FLAG: usize = 0;
1906 
1907 #[cfg(target_arch = "x86_64")]
1908 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1909 
1910 impl Cpu {
1911     #[cfg(target_arch = "x86_64")]
1912     fn generate_mat(&self) -> Vec<u8> {
1913         let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);
1914 
1915         let lapic = LocalX2Apic {
1916             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1917             length: 16,
1918             processor_id: self.cpu_id.into(),
1919             apic_id: x2apic_id,
1920             flags: 1 << MADT_CPU_ENABLE_FLAG,
1921             _reserved: 0,
1922         };
1923 
1924         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1925         // SAFETY: mat_data is large enough to hold lapic
1926         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
1927 
1928         mat_data
1929     }
1930 }
1931 
1932 impl Aml for Cpu {
1933     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1934         #[cfg(target_arch = "x86_64")]
1935         let mat_data: Vec<u8> = self.generate_mat();
1936         #[allow(clippy::if_same_then_else)]
1937         if self.dynamic {
1938             aml::Device::new(
1939                 format!("C{:03X}", self.cpu_id).as_str().into(),
1940                 vec![
1941                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1942                     &aml::Name::new("_UID".into(), &self.cpu_id),
1943                     // Currently, AArch64 does not support the following fields.
1944                     /*
1945                     _STA return value:
1946                     Bit [0] – Set if the device is present.
1947                     Bit [1] – Set if the device is enabled and decoding its resources.
1948                     Bit [2] – Set if the device should be shown in the UI.
1949                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1950                     Bit [4] – Set if the battery is present.
1951                     Bits [31:5] – Reserved (must be cleared).
1952                     */
1953                     #[cfg(target_arch = "x86_64")]
1954                     &aml::Method::new(
1955                         "_STA".into(),
1956                         0,
1957                         false,
1958                         // Call into CSTA method which will interrogate device
1959                         vec![&aml::Return::new(&aml::MethodCall::new(
1960                             "CSTA".into(),
1961                             vec![&self.cpu_id],
1962                         ))],
1963                     ),
1964                     &aml::Method::new(
1965                         "_PXM".into(),
1966                         0,
1967                         false,
1968                         vec![&aml::Return::new(&self.proximity_domain)],
1969                     ),
1970                     // The Linux kernel expects every CPU device to have a _MAT entry
1971                     // containing the LAPIC for this processor with the enabled bit set
1972                     // even if it is disabled in the MADT (non-boot CPU)
1973                     #[cfg(target_arch = "x86_64")]
1974                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1975                     // Trigger CPU ejection
1976                     #[cfg(target_arch = "x86_64")]
1977                     &aml::Method::new(
1978                         "_EJ0".into(),
1979                         1,
1980                         false,
1981                         // Call into CEJ0 method which will actually eject device
1982                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1983                     ),
1984                 ],
1985             )
1986             .to_aml_bytes(sink);
1987         } else {
1988             aml::Device::new(
1989                 format!("C{:03X}", self.cpu_id).as_str().into(),
1990                 vec![
1991                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1992                     &aml::Name::new("_UID".into(), &self.cpu_id),
1993                     #[cfg(target_arch = "x86_64")]
1994                     &aml::Method::new(
1995                         "_STA".into(),
1996                         0,
1997                         false,
1998                         // Mark the CPU as present; see the CSTA implementation
1999                         vec![&aml::Return::new(&0xfu8)],
2000                     ),
2001                     &aml::Method::new(
2002                         "_PXM".into(),
2003                         0,
2004                         false,
2005                         vec![&aml::Return::new(&self.proximity_domain)],
2006                     ),
2007                     // The Linux kernel expects every CPU device to have a _MAT entry
2008                     // containing the LAPIC for this processor with the enabled bit set
2009                     // even if it is disabled in the MADT (non-boot CPU)
2010                     #[cfg(target_arch = "x86_64")]
2011                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
2012                 ],
2013             )
2014             .to_aml_bytes(sink);
2015         }
2016     }
2017 }
2018 
2019 struct CpuNotify {
2020     cpu_id: u8,
2021 }
2022 
2023 impl Aml for CpuNotify {
2024     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2025         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
2026         aml::If::new(
2027             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
2028             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2029         )
2030         .to_aml_bytes(sink)
2031     }
2032 }
2033 
2034 struct CpuMethods {
2035     max_vcpus: u8,
2036     dynamic: bool,
2037 }
2038 
2039 impl Aml for CpuMethods {
2040     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2041         if self.dynamic {
2042             // CPU status method
2043             aml::Method::new(
2044                 "CSTA".into(),
2045                 1,
2046                 true,
2047                 vec![
2048                     // Take lock defined above
2049                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2050                     // Write the CPU number (in the first argument) to the MMIO region via the CSEL field
2051                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2052                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2053                     // If the CPEN bit is set, set the local variable to 0xf (see _STA for the meaning)
2054                     &aml::If::new(
2055                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
2056                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2057                     ),
2058                     // Release lock
2059                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2060                     // Return 0 or 0xf
2061                     &aml::Return::new(&aml::Local(0)),
2062                 ],
2063             )
2064             .to_aml_bytes(sink);
2065 
2066             let mut cpu_notifies = Vec::new();
2067             for cpu_id in 0..self.max_vcpus {
2068                 cpu_notifies.push(CpuNotify { cpu_id });
2069             }
2070 
2071             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
2072             for cpu_id in 0..self.max_vcpus {
2073                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
2074             }
2075 
2076             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
2077 
2078             aml::Method::new(
2079                 "CEJ0".into(),
2080                 1,
2081                 true,
2082                 vec![
2083                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2084                     // Write the CPU number (in the first argument) to the MMIO region via the CSEL field
2085                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2086                     // Set CEJ0 bit
2087                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
2088                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2089                 ],
2090             )
2091             .to_aml_bytes(sink);
2092 
2093             aml::Method::new(
2094                 "CSCN".into(),
2095                 0,
2096                 true,
2097                 vec![
2098                     // Take lock defined above
2099                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2100                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2101                     &aml::While::new(
2102                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
2103                         vec![
2104                             // Write the CPU number (in Local0) to the MMIO region via the CSEL field
2105                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
2106                             // Check if CINS bit is set
2107                             &aml::If::new(
2108                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
2109                                 // Notify the device if it is (with the Device Check value 0x1)
2110                                 vec![
2111                                     &aml::MethodCall::new(
2112                                         "CTFY".into(),
2113                                         vec![&aml::Local(0), &aml::ONE],
2114                                     ),
2115                                     // Reset CINS bit (writing one clears it)
2116                                     &aml::Store::new(
2117                                         &aml::Path::new("\\_SB_.PRES.CINS"),
2118                                         &aml::ONE,
2119                                     ),
2120                                 ],
2121                             ),
2122                             // Check if CRMV bit is set
2123                             &aml::If::new(
2124                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
2125                                 // Notify device if it is (with the eject constant 0x3)
2126                                 vec![
2127                                     &aml::MethodCall::new(
2128                                         "CTFY".into(),
2129                                         vec![&aml::Local(0), &3u8],
2130                                     ),
2131                                     // Reset CRMV bit (writing one clears it)
2132                                     &aml::Store::new(
2133                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
2134                                         &aml::ONE,
2135                                     ),
2136                                 ],
2137                             ),
2138                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2139                         ],
2140                     ),
2141                     // Release lock
2142                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2143                 ],
2144             )
2145             .to_aml_bytes(sink)
2146         } else {
2147             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2148         }
2149     }
2150 }
2151 
2152 impl Aml for CpuManager {
2153     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2154         #[cfg(target_arch = "x86_64")]
2155         if let Some(acpi_address) = self.acpi_address {
2156             // CPU hotplug controller
2157             aml::Device::new(
2158                 "_SB_.PRES".into(),
2159                 vec![
2160                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2161                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2162                     // Mutex to protect concurrent access, as we write to select a CPU and then read back its status
2163                     &aml::Mutex::new("CPLK".into(), 0),
2164                     &aml::Name::new(
2165                         "_CRS".into(),
2166                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2167                             aml::AddressSpaceCacheable::NotCacheable,
2168                             true,
2169                             acpi_address.0,
2170                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2171                             None,
2172                         )]),
2173                     ),
2174                     // OpRegion and Fields map MMIO range into individual field values
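                         // Rough register map (offsets from acpi_address): CSEL in
                         // bytes 0-3 selects a vCPU; byte 4 carries the CPEN, CINS,
                         // CRMV and CEJ0 bits for the selected vCPU; CCMD is byte 5
                         // and CDAT bytes 8-11. The two Field definitions below are
                         // overlapping views of this same OpRegion.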
2175                     &aml::OpRegion::new(
2176                         "PRST".into(),
2177                         aml::OpRegionSpace::SystemMemory,
2178                         &(acpi_address.0 as usize),
2179                         &CPU_MANAGER_ACPI_SIZE,
2180                     ),
2181                     &aml::Field::new(
2182                         "PRST".into(),
2183                         aml::FieldAccessType::Byte,
2184                         aml::FieldLockRule::NoLock,
2185                         aml::FieldUpdateRule::WriteAsZeroes,
2186                         vec![
2187                             aml::FieldEntry::Reserved(32),
2188                             aml::FieldEntry::Named(*b"CPEN", 1),
2189                             aml::FieldEntry::Named(*b"CINS", 1),
2190                             aml::FieldEntry::Named(*b"CRMV", 1),
2191                             aml::FieldEntry::Named(*b"CEJ0", 1),
2192                             aml::FieldEntry::Reserved(4),
2193                             aml::FieldEntry::Named(*b"CCMD", 8),
2194                         ],
2195                     ),
2196                     &aml::Field::new(
2197                         "PRST".into(),
2198                         aml::FieldAccessType::DWord,
2199                         aml::FieldLockRule::NoLock,
2200                         aml::FieldUpdateRule::Preserve,
2201                         vec![
2202                             aml::FieldEntry::Named(*b"CSEL", 32),
2203                             aml::FieldEntry::Reserved(32),
2204                             aml::FieldEntry::Named(*b"CDAT", 32),
2205                         ],
2206                     ),
2207                 ],
2208             )
2209             .to_aml_bytes(sink);
2210         }
2211 
2212         // CPU devices
2213         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2214         let cid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2215         // Bundle methods together under a common object
2216         let methods = CpuMethods {
2217             max_vcpus: self.config.max_vcpus,
2218             dynamic: self.dynamic,
2219         };
2220         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &cid, &methods];
2221 
2222         #[cfg(target_arch = "x86_64")]
2223         let topology = self.get_vcpu_topology();
2224         let mut cpu_devices = Vec::new();
2225         for cpu_id in 0..self.config.max_vcpus {
2226             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2227             let cpu_device = Cpu {
2228                 cpu_id,
2229                 proximity_domain,
2230                 dynamic: self.dynamic,
2231                 #[cfg(target_arch = "x86_64")]
2232                 topology,
2233             };
2234 
2235             cpu_devices.push(cpu_device);
2236         }
2237 
2238         for cpu_device in cpu_devices.iter() {
2239             cpu_data_inner.push(cpu_device);
2240         }
2241 
2242         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2243     }
2244 }
2245 
2246 impl Pausable for CpuManager {
2247     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2248         // Tell the vCPUs to pause themselves next time they exit
2249         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2250 
2251         // Signal the spawned vCPU threads. This will interrupt the KVM_RUN
2252         // ioctl(), allowing each run loop to check the pause boolean set
2253         // above.
2254         for state in self.vcpu_states.iter() {
2255             state.signal_thread();
2256         }
2257 
2258         for vcpu in self.vcpus.iter() {
2259             let mut vcpu = vcpu.lock().unwrap();
2260             vcpu.pause()?;
2261             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2262             if !self.config.kvm_hyperv {
2263                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2264                     MigratableError::Pause(anyhow!(
2265                         "Could not notify guest it has been paused {:?}",
2266                         e
2267                     ))
2268                 })?;
2269             }
2270         }
2271 
2272         // The vCPU thread will change its paused state before parking, so wait
2273         // here for each activated vCPU to change its state, ensuring it has parked.
2274         for state in self.vcpu_states.iter() {
2275             if state.active() {
2276                 while !state.paused.load(Ordering::SeqCst) {
2277                     // To avoid a priority inversion with the vCPU thread
2278                     thread::sleep(std::time::Duration::from_millis(1));
2279                 }
2280             }
2281         }
2282 
2283         Ok(())
2284     }
2285 
2286     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2287         for vcpu in self.vcpus.iter() {
2288             vcpu.lock().unwrap().resume()?;
2289         }
2290 
2291         // Toggle the vCPUs pause boolean
2292         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2293 
2294         // Unpark all the VCPU threads.
2295         // Once unparked, the first thing they will do is check the pause
2296         // boolean. Since it is now false, they will exit their pause loop
2297         // and resume running the guest.
2298         for state in self.vcpu_states.iter() {
2299             state.paused.store(false, Ordering::SeqCst);
2300             state.unpark_thread();
2301         }
2302         Ok(())
2303     }
2304 }
2305 
2306 impl Snapshottable for CpuManager {
2307     fn id(&self) -> String {
2308         CPU_MANAGER_SNAPSHOT_ID.to_string()
2309     }
2310 
2311     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2312         let mut cpu_manager_snapshot = Snapshot::default();
2313 
2314         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2315         for vcpu in &self.vcpus {
2316             let mut vcpu = vcpu.lock().unwrap();
2317             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2318         }
2319 
2320         Ok(cpu_manager_snapshot)
2321     }
2322 }
2323 
2324 impl Transportable for CpuManager {}
2325 impl Migratable for CpuManager {}
2326 
2327 #[cfg(feature = "guest_debug")]
2328 impl Debuggable for CpuManager {
2329     #[cfg(feature = "kvm")]
2330     fn set_guest_debug(
2331         &self,
2332         cpu_id: usize,
2333         addrs: &[GuestAddress],
2334         singlestep: bool,
2335     ) -> std::result::Result<(), DebuggableError> {
2336         self.vcpus[cpu_id]
2337             .lock()
2338             .unwrap()
2339             .vcpu
2340             .set_guest_debug(addrs, singlestep)
2341             .map_err(DebuggableError::SetDebug)
2342     }
2343 
2344     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2345         Ok(())
2346     }
2347 
2348     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2349         Ok(())
2350     }
2351 
2352     #[cfg(target_arch = "x86_64")]
2353     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2354         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, R8-R15
2355         let gregs = self
2356             .get_regs(cpu_id as u8)
2357             .map_err(DebuggableError::ReadRegs)?;
2358         let regs = [
2359             gregs.get_rax(),
2360             gregs.get_rbx(),
2361             gregs.get_rcx(),
2362             gregs.get_rdx(),
2363             gregs.get_rsi(),
2364             gregs.get_rdi(),
2365             gregs.get_rbp(),
2366             gregs.get_rsp(),
2367             gregs.get_r8(),
2368             gregs.get_r9(),
2369             gregs.get_r10(),
2370             gregs.get_r11(),
2371             gregs.get_r12(),
2372             gregs.get_r13(),
2373             gregs.get_r14(),
2374             gregs.get_r15(),
2375         ];
2376 
2377         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2378         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2379         let eflags = gregs.get_rflags() as u32;
2380         let rip = gregs.get_rip();
2381 
2382         // Segment registers: CS, SS, DS, ES, FS, GS
2383         let sregs = self
2384             .get_sregs(cpu_id as u8)
2385             .map_err(DebuggableError::ReadRegs)?;
2386         let segments = X86SegmentRegs {
2387             cs: sregs.cs.selector as u32,
2388             ss: sregs.ss.selector as u32,
2389             ds: sregs.ds.selector as u32,
2390             es: sregs.es.selector as u32,
2391             fs: sregs.fs.selector as u32,
2392             gs: sregs.gs.selector as u32,
2393         };
2394 
2395         // TODO: Add other registers
2396 
2397         Ok(CoreRegs {
2398             regs,
2399             eflags,
2400             rip,
2401             segments,
2402             ..Default::default()
2403         })
2404     }
2405 
2406     #[cfg(target_arch = "aarch64")]
2407     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2408         let gregs = self
2409             .get_regs(cpu_id as u8)
2410             .map_err(DebuggableError::ReadRegs)?;
2411         Ok(CoreRegs {
2412             x: gregs.get_regs(),
2413             sp: gregs.get_sp(),
2414             pc: gregs.get_pc(),
2415             ..Default::default()
2416         })
2417     }
2418 
2419     #[cfg(target_arch = "x86_64")]
2420     fn write_regs(
2421         &self,
2422         cpu_id: usize,
2423         regs: &CoreRegs,
2424     ) -> std::result::Result<(), DebuggableError> {
2425         let orig_gregs = self
2426             .get_regs(cpu_id as u8)
2427             .map_err(DebuggableError::ReadRegs)?;
2428         let mut gregs = self.create_standard_regs(cpu_id as u8);
2429         gregs.set_rax(regs.regs[0]);
2430         gregs.set_rbx(regs.regs[1]);
2431         gregs.set_rcx(regs.regs[2]);
2432         gregs.set_rdx(regs.regs[3]);
2433         gregs.set_rsi(regs.regs[4]);
2434         gregs.set_rdi(regs.regs[5]);
2435         gregs.set_rbp(regs.regs[6]);
2436         gregs.set_rsp(regs.regs[7]);
2437         gregs.set_r8(regs.regs[8]);
2438         gregs.set_r9(regs.regs[9]);
2439         gregs.set_r10(regs.regs[10]);
2440         gregs.set_r11(regs.regs[11]);
2441         gregs.set_r12(regs.regs[12]);
2442         gregs.set_r13(regs.regs[13]);
2443         gregs.set_r14(regs.regs[14]);
2444         gregs.set_r15(regs.regs[15]);
2445         gregs.set_rip(regs.rip);
2446         // Update only the lower 32 bits of rflags.
2447         gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));
2448 
2449         self.set_regs(cpu_id as u8, &gregs)
2450             .map_err(DebuggableError::WriteRegs)?;
2451 
2452         // Segment registers: CS, SS, DS, ES, FS, GS
2453         // Since GDB cares only about the selectors, we call get_sregs() first.
2454         let mut sregs = self
2455             .get_sregs(cpu_id as u8)
2456             .map_err(DebuggableError::ReadRegs)?;
2457         sregs.cs.selector = regs.segments.cs as u16;
2458         sregs.ss.selector = regs.segments.ss as u16;
2459         sregs.ds.selector = regs.segments.ds as u16;
2460         sregs.es.selector = regs.segments.es as u16;
2461         sregs.fs.selector = regs.segments.fs as u16;
2462         sregs.gs.selector = regs.segments.gs as u16;
2463 
2464         self.set_sregs(cpu_id as u8, &sregs)
2465             .map_err(DebuggableError::WriteRegs)?;
2466 
2467         // TODO: Add other registers
2468 
2469         Ok(())
2470     }
2471 
2472     #[cfg(target_arch = "aarch64")]
2473     fn write_regs(
2474         &self,
2475         cpu_id: usize,
2476         regs: &CoreRegs,
2477     ) -> std::result::Result<(), DebuggableError> {
2478         let mut gregs = self
2479             .get_regs(cpu_id as u8)
2480             .map_err(DebuggableError::ReadRegs)?;
2481 
2482         gregs.set_regs(regs.x);
2483         gregs.set_sp(regs.sp);
2484         gregs.set_pc(regs.pc);
2485 
2486         self.set_regs(cpu_id as u8, &gregs)
2487             .map_err(DebuggableError::WriteRegs)?;
2488 
2489         Ok(())
2490     }
2491 
2492     fn read_mem(
2493         &self,
2494         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2495         cpu_id: usize,
2496         vaddr: GuestAddress,
2497         len: usize,
2498     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2499         let mut buf = vec![0; len];
2500         let mut total_read = 0_u64;
2501 
2502         while total_read < len as u64 {
2503             let gaddr = vaddr.0 + total_read;
2504             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2505                 Ok(paddr) => paddr,
2506                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2507                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2508             };
2509             let psize = arch::PAGE_SIZE as u64;
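                 // Clamp each chunk so it never crosses a guest page boundary;
                 // the next iteration re-translates the following page, since
                 // contiguous GVAs need not map to contiguous GPAs.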
2510             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2511             guest_memory
2512                 .memory()
2513                 .read(
2514                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2515                     GuestAddress(paddr),
2516                 )
2517                 .map_err(DebuggableError::ReadMem)?;
2518             total_read += read_len;
2519         }
2520         Ok(buf)
2521     }
2522 
2523     fn write_mem(
2524         &self,
2525         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2526         cpu_id: usize,
2527         vaddr: &GuestAddress,
2528         data: &[u8],
2529     ) -> std::result::Result<(), DebuggableError> {
2530         let mut total_written = 0_u64;
2531 
2532         while total_written < data.len() as u64 {
2533             let gaddr = vaddr.0 + total_written;
2534             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2535                 Ok(paddr) => paddr,
2536                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2537                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2538             };
2539             let psize = arch::PAGE_SIZE as u64;
2540             let write_len = std::cmp::min(
2541                 data.len() as u64 - total_written,
2542                 psize - (paddr & (psize - 1)),
2543             );
2544             guest_memory
2545                 .memory()
2546                 .write(
2547                     &data[total_written as usize..total_written as usize + write_len as usize],
2548                     GuestAddress(paddr),
2549                 )
2550                 .map_err(DebuggableError::WriteMem)?;
2551             total_written += write_len;
2552         }
2553         Ok(())
2554     }
2555 
2556     fn active_vcpus(&self) -> usize {
2557         self.present_vcpus() as usize
2558     }
2559 }
2560 
2561 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2562 impl Elf64Writable for CpuManager {}
2563 
2564 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2565 impl CpuElf64Writable for CpuManager {
2566     fn cpu_write_elf64_note(
2567         &mut self,
2568         dump_state: &DumpState,
2569     ) -> std::result::Result<(), GuestDebuggableError> {
2570         let mut coredump_file = dump_state.file.as_ref().unwrap();
2571         for vcpu in &self.vcpus {
2572             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2573             let mut pos: usize = 0;
2574             let mut buf = vec![0; note_size as usize];
2575             let descsz = size_of::<X86_64ElfPrStatus>();
2576             let vcpu_id = vcpu.lock().unwrap().id;
2577 
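                 // ELF note layout: an Elf64_Nhdr, then the owner name ("CORE")
                 // padded to a 4-byte boundary, then the descriptor (an
                 // X86_64ElfPrStatus); the vCPU id is written 32 bytes into the
                 // descriptor, at the pr_pid field.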
2578             let note = Elf64_Nhdr {
2579                 n_namesz: COREDUMP_NAME_SIZE,
2580                 n_descsz: descsz as u32,
2581                 n_type: NT_PRSTATUS,
2582             };
2583 
2584             let bytes: &[u8] = note.as_slice();
2585             buf.splice(0.., bytes.to_vec());
2586             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2587             buf.resize(pos + 4, 0);
2588             buf.splice(pos.., "CORE".to_string().into_bytes());
2589 
2590             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2591             buf.resize(pos + 32 + 4, 0);
2592             let pid = vcpu_id as u64;
2593             let bytes: &[u8] = pid.as_slice();
2594             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2595 
2596             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2597 
2598             let orig_rax: u64 = 0;
2599             let gregs = self.vcpus[usize::from(vcpu_id)]
2600                 .lock()
2601                 .unwrap()
2602                 .vcpu
2603                 .get_regs()
2604                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2605 
2606             let regs1 = [
2607                 gregs.get_r15(),
2608                 gregs.get_r14(),
2609                 gregs.get_r13(),
2610                 gregs.get_r12(),
2611                 gregs.get_rbp(),
2612                 gregs.get_rbx(),
2613                 gregs.get_r11(),
2614                 gregs.get_r10(),
2615             ];
2616             let regs2 = [
2617                 gregs.get_r9(),
2618                 gregs.get_r8(),
2619                 gregs.get_rax(),
2620                 gregs.get_rcx(),
2621                 gregs.get_rdx(),
2622                 gregs.get_rsi(),
2623                 gregs.get_rdi(),
2624                 orig_rax,
2625             ];
2626 
2627             let sregs = self.vcpus[usize::from(vcpu_id)]
2628                 .lock()
2629                 .unwrap()
2630                 .vcpu
2631                 .get_sregs()
2632                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2633 
2634             debug!(
2635                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2636                 gregs.get_rip(),
2637                 gregs.get_rsp(),
2638                 sregs.gs.base,
2639                 sregs.cs.selector,
2640                 sregs.ss.selector,
2641                 sregs.ds.selector,
2642             );
2643 
2644             let regs = X86_64UserRegs {
2645                 regs1,
2646                 regs2,
2647                 rip: gregs.get_rip(),
2648                 cs: sregs.cs.selector as u64,
2649                 eflags: gregs.get_rflags(),
2650                 rsp: gregs.get_rsp(),
2651                 ss: sregs.ss.selector as u64,
2652                 fs_base: sregs.fs.base,
2653                 gs_base: sregs.gs.base,
2654                 ds: sregs.ds.selector as u64,
2655                 es: sregs.es.selector as u64,
2656                 fs: sregs.fs.selector as u64,
2657                 gs: sregs.gs.selector as u64,
2658             };
2659 
2661             let bytes: &[u8] = regs.as_slice();
2662             buf.resize(note_size as usize, 0);
2663             buf.splice(pos.., bytes.to_vec());
2664             buf.resize(note_size as usize, 0);
2665 
2666             coredump_file
2667                 .write_all(&buf)
2668                 .map_err(GuestDebuggableError::CoredumpFile)?;
2669         }
2670 
2671         Ok(())
2672     }
2673 
2674     fn cpu_write_vmm_note(
2675         &mut self,
2676         dump_state: &DumpState,
2677     ) -> std::result::Result<(), GuestDebuggableError> {
2678         let mut coredump_file = dump_state.file.as_ref().unwrap();
2679         for vcpu in &self.vcpus {
2680             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2681             let mut pos: usize = 0;
2682             let mut buf = vec![0; note_size as usize];
2683             let descsz = size_of::<DumpCpusState>();
2684             let vcpu_id = vcpu.lock().unwrap().id;
2685 
2686             let note = Elf64_Nhdr {
2687                 n_namesz: COREDUMP_NAME_SIZE,
2688                 n_descsz: descsz as u32,
2689                 n_type: 0,
2690             };
2691 
2692             let bytes: &[u8] = note.as_slice();
2693             buf.splice(0.., bytes.to_vec());
2694             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2695 
2696             buf.resize(pos + 4, 0);
2697             buf.splice(pos.., "QEMU".to_string().into_bytes());
2698 
2699             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2700 
2701             let gregs = self.vcpus[usize::from(vcpu_id)]
2702                 .lock()
2703                 .unwrap()
2704                 .vcpu
2705                 .get_regs()
2706                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2707 
2708             let regs1 = [
2709                 gregs.get_rax(),
2710                 gregs.get_rbx(),
2711                 gregs.get_rcx(),
2712                 gregs.get_rdx(),
2713                 gregs.get_rsi(),
2714                 gregs.get_rdi(),
2715                 gregs.get_rsp(),
2716                 gregs.get_rbp(),
2717             ];
2718 
2719             let regs2 = [
2720                 gregs.get_r8(),
2721                 gregs.get_r9(),
2722                 gregs.get_r10(),
2723                 gregs.get_r11(),
2724                 gregs.get_r12(),
2725                 gregs.get_r13(),
2726                 gregs.get_r14(),
2727                 gregs.get_r15(),
2728             ];
2729 
2730             let sregs = self.vcpus[usize::from(vcpu_id)]
2731                 .lock()
2732                 .unwrap()
2733                 .vcpu
2734                 .get_sregs()
2735                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2736 
2737             let mut msrs = vec![MsrEntry {
2738                 index: msr_index::MSR_KERNEL_GS_BASE,
2739                 ..Default::default()
2740             }];
2741 
2742             self.vcpus[vcpu_id as usize]
2743                 .lock()
2744                 .unwrap()
2745                 .vcpu
2746                 .get_msrs(&mut msrs)
2747                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2748             let kernel_gs_base = msrs[0].data;
2749 
2750             let cs = CpuSegment::new(sregs.cs);
2751             let ds = CpuSegment::new(sregs.ds);
2752             let es = CpuSegment::new(sregs.es);
2753             let fs = CpuSegment::new(sregs.fs);
2754             let gs = CpuSegment::new(sregs.gs);
2755             let ss = CpuSegment::new(sregs.ss);
2756             let ldt = CpuSegment::new(sregs.ldt);
2757             let tr = CpuSegment::new(sregs.tr);
2758             let gdt = CpuSegment::new_from_table(sregs.gdt);
2759             let idt = CpuSegment::new_from_table(sregs.idt);
2760             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2761             let regs = DumpCpusState {
2762                 version: 1,
2763                 size: size_of::<DumpCpusState>() as u32,
2764                 regs1,
2765                 regs2,
2766                 rip: gregs.get_rip(),
2767                 rflags: gregs.get_rflags(),
2768                 cs,
2769                 ds,
2770                 es,
2771                 fs,
2772                 gs,
2773                 ss,
2774                 ldt,
2775                 tr,
2776                 gdt,
2777                 idt,
2778                 cr,
2779                 kernel_gs_base,
2780             };
2781 
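            // Grow the buffer first so that `pos` (the aligned start of the
            // descriptor) is a valid splice index, write the descriptor, then
            // pad the note out to its full size.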
2782             let bytes: &[u8] = regs.as_slice();
2783             buf.resize(note_size as usize, 0);
2784             buf.splice(pos.., bytes.to_vec());
2785             buf.resize(note_size as usize, 0);
2786 
2787             coredump_file
2788                 .write_all(&buf)
2789                 .map_err(GuestDebuggableError::CoredumpFile)?;
2790         }
2791 
2792         Ok(())
2793     }
2794 }
2795 
2796 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2797 #[cfg(test)]
2798 mod tests {
2799     use arch::layout::{BOOT_STACK_POINTER, ZERO_PAGE_START};
2800     use arch::x86_64::interrupts::*;
2801     use arch::x86_64::regs::*;
2802     use hypervisor::arch::x86::{FpuState, LapicState};
2803     use hypervisor::StandardRegisters;
2804     use linux_loader::loader::bootparam::setup_header;
2805 
2806     #[test]
2807     fn test_setlint() {
2808         let hv = hypervisor::new().unwrap();
2809         let vm = hv.create_vm().expect("new VM fd creation failed");
2810         hv.check_required_extensions().unwrap();
2811         // Calling get_lapic will fail if there is no irqchip created beforehand.
2812         vm.create_irq_chip().unwrap();
2813         let vcpu = vm.create_vcpu(0, None).unwrap();
2814         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2815 
2816         // Compute the value that is expected to represent LVT0 and LVT1.
2817         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2818         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2819         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2820         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2821 
2822         set_lint(&vcpu).unwrap();
2823 
2824         // Compute the value that represents LVT0 and LVT1 after set_lint.
2825         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2826         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2827         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2828         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2829         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2830     }
2831 
2832     #[test]
2833     fn test_setup_fpu() {
2834         let hv = hypervisor::new().unwrap();
2835         let vm = hv.create_vm().expect("new VM fd creation failed");
2836         let vcpu = vm.create_vcpu(0, None).unwrap();
2837         setup_fpu(&vcpu).unwrap();
2838 
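        // 0x37f and 0x1f80 are the architectural reset values of the x87 FPU
        // control word and of MXCSR, respectively.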
2839         let expected_fpu: FpuState = FpuState {
2840             fcw: 0x37f,
2841             mxcsr: 0x1f80,
2842             ..Default::default()
2843         };
2844         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2845         // TODO: auto-generate kvm-related structures with PartialEq derived.
2846         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2847         // Setting the mxcsr register through FpuState in setup_fpu has no effect;
2848         // see 'kvm_arch_vcpu_ioctl_set_fpu' in arch/x86/kvm/x86.c.
2849         // The mxcsr stays 0, so the assert below would fail. TODO: decide whether
2850         // this check should be removed entirely.
2851         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2852     }
2853 
2854     #[test]
2855     fn test_setup_msrs() {
2856         use hypervisor::arch::x86::{msr_index, MsrEntry};
2857 
2858         let hv = hypervisor::new().unwrap();
2859         let vm = hv.create_vm().expect("new VM fd creation failed");
2860         let vcpu = vm.create_vcpu(0, None).unwrap();
2861         setup_msrs(&vcpu).unwrap();
2862 
2863         // This test will check against the last MSR entry configured (the tenth one).
2864         // See create_msr_entries for details.
2865         let mut msrs = vec![MsrEntry {
2866             index: msr_index::MSR_IA32_MISC_ENABLE,
2867             ..Default::default()
2868         }];
2869 
2870         // get_msrs returns the number of MSRs it succeeded in reading. We only want
2871         // to read one in this test case.
2872         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2873         assert_eq!(read_msrs, 1);
2874 
2875         // These are the entries that were set up by setup_msrs. Assert that the
2876         // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has
2877         // the data we expect.
2878         let entry_vec = vcpu.boot_msr_entries();
2879         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2880     }
2881 
2882     #[test]
2883     fn test_setup_regs_for_pvh() {
2884         let hv = hypervisor::new().unwrap();
2885         let vm = hv.create_vm().expect("new VM fd creation failed");
2886         let vcpu = vm.create_vcpu(0, None).unwrap();
2887 
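        // The PVH boot ABI starts the vcpu with only the reserved bit set in
        // RFLAGS and %rbx pointing at the hvm_start_info structure; rip is an
        // arbitrary test value here.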
2888         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2889         expected_regs.set_rflags(0x0000000000000002u64);
2890         expected_regs.set_rbx(arch::layout::PVH_INFO_START.0);
2891         expected_regs.set_rip(1);
2892 
2893         setup_regs(
2894             &vcpu,
2895             arch::EntryPoint {
2896                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2897                 setup_header: None,
2898             },
2899         )
2900         .unwrap();
2901 
2902         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2903         assert_eq!(actual_regs, expected_regs);
2904     }
2905 
2906     #[test]
2907     fn test_setup_regs_for_bzimage() {
2908         let hv = hypervisor::new().unwrap();
2909         let vm = hv.create_vm().expect("new VM fd creation failed");
2910         let vcpu = vm.create_vcpu(0, None).unwrap();
2911 
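        // The Linux 64-bit boot protocol expects %rsi to point at the zero
        // page (boot_params) and %rsp at a valid boot stack.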
2912         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2913         expected_regs.set_rflags(0x0000000000000002u64);
2914         expected_regs.set_rip(1);
2915         expected_regs.set_rsp(BOOT_STACK_POINTER.0);
2916         expected_regs.set_rsi(ZERO_PAGE_START.0);
2917 
2918         setup_regs(
2919             &vcpu,
2920             arch::EntryPoint {
2921                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2922                 setup_header: Some(setup_header {
2923                     ..Default::default()
2924                 }),
2925             },
2926         )
2927         .unwrap();
2928 
2929         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2930         assert_eq!(actual_regs, expected_regs);
2931     }
2932 }
2933 
2934 #[cfg(target_arch = "aarch64")]
2935 #[cfg(test)]
2936 mod tests {
2937     use std::mem;
2938 
2939     use arch::aarch64::regs;
2940     use arch::layout;
2941     use hypervisor::kvm::aarch64::is_system_register;
2942     use hypervisor::kvm::kvm_bindings::{
2943         user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2944     };
2945     use hypervisor::{arm64_core_reg_id, offset_of};
2946 
2947     #[test]
2948     fn test_setup_regs() {
2949         let hv = hypervisor::new().unwrap();
2950         let vm = hv.create_vm().unwrap();
2951         let vcpu = vm.create_vcpu(0, None).unwrap();
2952 
2953         // Must fail when the vcpu is not initialized yet.
2954         vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap_err();
2955 
2956         let mut kvi = vcpu.create_vcpu_init();
2957         vm.get_preferred_target(&mut kvi).unwrap();
2958         vcpu.vcpu_init(&kvi).unwrap();
2959 
2960         vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap();
2961     }
2962 
2963     #[test]
2964     fn test_read_mpidr() {
2965         let hv = hypervisor::new().unwrap();
2966         let vm = hv.create_vm().unwrap();
2967         let vcpu = vm.create_vcpu(0, None).unwrap();
2968         let mut kvi = vcpu.create_vcpu_init();
2969         vm.get_preferred_target(&mut kvi).unwrap();
2970 
2971         // Must fail when the vcpu is not initialized yet.
2972         vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap_err();
2973 
2974         vcpu.vcpu_init(&kvi).unwrap();
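        // Bit 31 of MPIDR_EL1 is RES1, so vCPU 0 (affinity 0) reads back
        // 0x8000_0000.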
2975         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2976     }
2977 
2978     #[test]
2979     fn test_is_system_register() {
2980         let offset = offset_of!(user_pt_regs, pc);
2981         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2982         assert!(!is_system_register(regid));
2983         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2984         assert!(is_system_register(regid));
2985     }
2986 
2987     #[test]
2988     fn test_save_restore_core_regs() {
2989         let hv = hypervisor::new().unwrap();
2990         let vm = hv.create_vm().unwrap();
2991         let vcpu = vm.create_vcpu(0, None).unwrap();
2992         let mut kvi = vcpu.create_vcpu_init();
2993         vm.get_preferred_target(&mut kvi).unwrap();
2994 
2995         // Must fail when the vcpu is not initialized yet.
2996         assert_eq!(
2997             format!("{}", vcpu.get_regs().unwrap_err()),
2998             "Failed to get aarch64 core register: Exec format error (os error 8)"
2999         );
3000 
3001         let mut state = vcpu.create_standard_regs();
3002         assert_eq!(
3003             format!("{}", vcpu.set_regs(&state).unwrap_err()),
3004             "Failed to set aarch64 core register: Exec format error (os error 8)"
3005         );
3006 
3007         vcpu.vcpu_init(&kvi).unwrap();
3008         state = vcpu.get_regs().unwrap();
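        // 0x3C5 is EL1h with the D, A, I and F exception bits masked, i.e. the
        // PSTATE value KVM programs at vcpu reset.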
3009         assert_eq!(state.get_pstate(), 0x3C5);
3010 
3011         vcpu.set_regs(&state).unwrap();
3012     }
3013 
3014     #[test]
3015     fn test_get_set_mpstate() {
3016         let hv = hypervisor::new().unwrap();
3017         let vm = hv.create_vm().unwrap();
3018         let vcpu = vm.create_vcpu(0, None).unwrap();
3019         let mut kvi = vcpu.create_vcpu_init();
3020         vm.get_preferred_target(&mut kvi).unwrap();
3021 
3022         let state = vcpu.get_mp_state().unwrap();
3023         vcpu.set_mp_state(state).unwrap();
3024     }
3025 }
3026