xref: /cloud-hypervisor/vmm/src/cpu.rs (revision 5f814308d6b19037f2afb3d36fe49b0aa14c0b22)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::CpusConfig;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use crate::coredump::{
17     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
18     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
19     NT_PRSTATUS,
20 };
21 #[cfg(feature = "guest_debug")]
22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
23 #[cfg(target_arch = "x86_64")]
24 use crate::memory_manager::MemoryManager;
25 use crate::seccomp_filters::{get_seccomp_filter, Thread};
26 #[cfg(target_arch = "x86_64")]
27 use crate::vm::physical_bits;
28 use crate::GuestMemoryMmap;
29 use crate::CPU_MANAGER_SNAPSHOT_ID;
30 use acpi_tables::{aml, sdt::Sdt, Aml};
31 use anyhow::anyhow;
32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
33 use arch::aarch64::regs;
34 #[cfg(target_arch = "x86_64")]
35 use arch::x86_64::get_x2apic_id;
36 use arch::EntryPoint;
37 use arch::NumaNodes;
38 #[cfg(target_arch = "aarch64")]
39 use devices::gic::Gic;
40 use devices::interrupt_controller::InterruptController;
41 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
42 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
44 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
45 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
46 use hypervisor::aarch64::StandardRegisters;
47 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
48 use hypervisor::arch::x86::msr_index;
49 #[cfg(target_arch = "x86_64")]
50 use hypervisor::arch::x86::CpuIdEntry;
51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
52 use hypervisor::arch::x86::MsrEntry;
53 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
54 use hypervisor::arch::x86::SpecialRegisters;
55 #[cfg(target_arch = "aarch64")]
56 use hypervisor::kvm::kvm_bindings;
57 #[cfg(all(target_arch = "aarch64", feature = "kvm"))]
58 use hypervisor::kvm::kvm_ioctls::Cap;
59 #[cfg(feature = "tdx")]
60 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
61 #[cfg(target_arch = "x86_64")]
62 use hypervisor::CpuVendor;
63 #[cfg(feature = "kvm")]
64 use hypervisor::HypervisorType;
65 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
66 use hypervisor::StandardRegisters;
67 use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
68 use libc::{c_void, siginfo_t};
69 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
70 use linux_loader::elf::Elf64_Nhdr;
71 use seccompiler::{apply_filter, SeccompAction};
72 use std::collections::BTreeMap;
73 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
74 use std::io::Write;
75 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
76 use std::mem::size_of;
77 use std::os::unix::thread::JoinHandleExt;
78 use std::sync::atomic::{AtomicBool, Ordering};
79 use std::sync::{Arc, Barrier, Mutex};
80 use std::{cmp, io, result, thread};
81 use thiserror::Error;
82 use tracer::trace_scoped;
83 use vm_device::BusDevice;
84 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
85 use vm_memory::ByteValued;
86 #[cfg(feature = "guest_debug")]
87 use vm_memory::{Bytes, GuestAddressSpace};
88 use vm_memory::{GuestAddress, GuestMemoryAtomic};
89 use vm_migration::{
90     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
91     Transportable,
92 };
93 use vmm_sys_util::eventfd::EventFd;
94 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
95 use zerocopy::AsBytes;
96 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
97 /// Extract the specified bits of a 64-bit integer.
98 /// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`,
99 /// following expression should return 3 (`0b11`):
100 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
101 ///
102 macro_rules! extract_bits_64 {
103     ($value: tt, $offset: tt, $length: tt) => {
104         ($value >> $offset) & (!0u64 >> (64 - $length))
105     };
106 }
107 
108 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
109 macro_rules! extract_bits_64_without_offset {
110     ($value: tt, $length: tt) => {
111         $value & (!0u64 >> (64 - $length))
112     };
113 }
114 
115 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
116 
117 #[derive(Debug, Error)]
118 pub enum Error {
119     #[error("Error creating vCPU: {0}")]
120     VcpuCreate(#[source] anyhow::Error),
121 
122     #[error("Error running bCPU: {0}")]
123     VcpuRun(#[source] anyhow::Error),
124 
125     #[error("Error spawning vCPU thread: {0}")]
126     VcpuSpawn(#[source] io::Error),
127 
128     #[error("Error generating common CPUID: {0}")]
129     CommonCpuId(#[source] arch::Error),
130 
131     #[error("Error configuring vCPU: {0}")]
132     VcpuConfiguration(#[source] arch::Error),
133 
134     #[error("Still pending removed vcpu")]
135     VcpuPendingRemovedVcpu,
136 
137     #[cfg(target_arch = "aarch64")]
138     #[error("Error fetching preferred target: {0}")]
139     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
140 
141     #[cfg(target_arch = "aarch64")]
142     #[error("Error initialising vCPU: {0}")]
143     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
144 
145     #[error("Failed to join on vCPU threads: {0:?}")]
146     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
147 
148     #[error("Error adding CpuManager to MMIO bus: {0}")]
149     BusError(#[source] vm_device::BusError),
150 
151     #[error("Requested vCPUs exceed maximum")]
152     DesiredVCpuCountExceedsMax,
153 
154     #[error("Cannot create seccomp filter: {0}")]
155     CreateSeccompFilter(#[source] seccompiler::Error),
156 
157     #[error("Cannot apply seccomp filter: {0}")]
158     ApplySeccompFilter(#[source] seccompiler::Error),
159 
160     #[error("Error starting vCPU after restore: {0}")]
161     StartRestoreVcpu(#[source] anyhow::Error),
162 
163     #[error("Unexpected VmExit")]
164     UnexpectedVmExit,
165 
166     #[error("Failed to allocate MMIO address for CpuManager")]
167     AllocateMmmioAddress,
168 
169     #[cfg(feature = "tdx")]
170     #[error("Error initializing TDX: {0}")]
171     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
172 
173     #[cfg(target_arch = "aarch64")]
174     #[error("Error initializing PMU: {0}")]
175     InitPmu(#[source] hypervisor::HypervisorCpuError),
176 
177     #[cfg(feature = "guest_debug")]
178     #[error("Error during CPU debug: {0}")]
179     CpuDebug(#[source] hypervisor::HypervisorCpuError),
180 
181     #[cfg(feature = "guest_debug")]
182     #[error("Error translating virtual address: {0}")]
183     TranslateVirtualAddress(#[source] anyhow::Error),
184 
185     #[cfg(target_arch = "x86_64")]
186     #[error("Error setting up AMX: {0}")]
187     AmxEnable(#[source] anyhow::Error),
188 
189     #[error("Maximum number of vCPUs exceeds host limit")]
190     MaximumVcpusExceeded,
191 
192     #[cfg(feature = "sev_snp")]
193     #[error("Failed to set sev control register: {0}")]
194     SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),
195 
196     #[cfg(target_arch = "x86_64")]
197     #[error("Failed to inject NMI")]
198     NmiError(hypervisor::HypervisorCpuError),
199 }
200 pub type Result<T> = result::Result<T, Error>;
201 
202 #[cfg(target_arch = "x86_64")]
203 #[allow(dead_code)]
204 #[repr(packed)]
205 #[derive(AsBytes)]
206 struct LocalX2Apic {
207     pub r#type: u8,
208     pub length: u8,
209     pub _reserved: u16,
210     pub apic_id: u32,
211     pub flags: u32,
212     pub processor_id: u32,
213 }
214 
215 #[allow(dead_code)]
216 #[repr(packed)]
217 #[derive(Default, AsBytes)]
218 struct Ioapic {
219     pub r#type: u8,
220     pub length: u8,
221     pub ioapic_id: u8,
222     _reserved: u8,
223     pub apic_address: u32,
224     pub gsi_base: u32,
225 }
226 
227 #[cfg(target_arch = "aarch64")]
228 #[allow(dead_code)]
229 #[repr(packed)]
230 #[derive(AsBytes)]
231 struct GicC {
232     pub r#type: u8,
233     pub length: u8,
234     pub reserved0: u16,
235     pub cpu_interface_number: u32,
236     pub uid: u32,
237     pub flags: u32,
238     pub parking_version: u32,
239     pub performance_interrupt: u32,
240     pub parked_address: u64,
241     pub base_address: u64,
242     pub gicv_base_address: u64,
243     pub gich_base_address: u64,
244     pub vgic_interrupt: u32,
245     pub gicr_base_address: u64,
246     pub mpidr: u64,
247     pub proc_power_effi_class: u8,
248     pub reserved1: u8,
249     pub spe_overflow_interrupt: u16,
250 }
251 
252 #[cfg(target_arch = "aarch64")]
253 #[allow(dead_code)]
254 #[repr(packed)]
255 #[derive(AsBytes)]
256 struct GicD {
257     pub r#type: u8,
258     pub length: u8,
259     pub reserved0: u16,
260     pub gic_id: u32,
261     pub base_address: u64,
262     pub global_irq_base: u32,
263     pub version: u8,
264     pub reserved1: [u8; 3],
265 }
266 
267 #[cfg(target_arch = "aarch64")]
268 #[allow(dead_code)]
269 #[repr(packed)]
270 #[derive(AsBytes)]
271 struct GicR {
272     pub r#type: u8,
273     pub length: u8,
274     pub reserved: u16,
275     pub base_address: u64,
276     pub range_length: u32,
277 }
278 
279 #[cfg(target_arch = "aarch64")]
280 #[allow(dead_code)]
281 #[repr(packed)]
282 #[derive(AsBytes)]
283 struct GicIts {
284     pub r#type: u8,
285     pub length: u8,
286     pub reserved0: u16,
287     pub translation_id: u32,
288     pub base_address: u64,
289     pub reserved1: u32,
290 }
291 
292 #[cfg(target_arch = "aarch64")]
293 #[allow(dead_code)]
294 #[repr(packed)]
295 #[derive(AsBytes)]
296 struct ProcessorHierarchyNode {
297     pub r#type: u8,
298     pub length: u8,
299     pub reserved: u16,
300     pub flags: u32,
301     pub parent: u32,
302     pub acpi_processor_id: u32,
303     pub num_private_resources: u32,
304 }
305 
306 #[allow(dead_code)]
307 #[repr(packed)]
308 #[derive(Default, AsBytes)]
309 struct InterruptSourceOverride {
310     pub r#type: u8,
311     pub length: u8,
312     pub bus: u8,
313     pub source: u8,
314     pub gsi: u32,
315     pub flags: u16,
316 }
317 
318 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
319 macro_rules! round_up {
320     ($n:expr,$d:expr) => {
321         (($n / ($d + 1)) + 1) * $d
322     };
323 }
324 
325 /// A wrapper around creating and using a kvm-based VCPU.
326 pub struct Vcpu {
327     // The hypervisor abstracted CPU.
328     vcpu: Arc<dyn hypervisor::Vcpu>,
329     id: u8,
330     #[cfg(target_arch = "aarch64")]
331     mpidr: u64,
332     saved_state: Option<CpuState>,
333     #[cfg(target_arch = "x86_64")]
334     vendor: CpuVendor,
335 }
336 
337 impl Vcpu {
338     /// Constructs a new VCPU for `vm`.
339     ///
340     /// # Arguments
341     ///
342     /// * `id` - Represents the CPU number between [0, max vcpus).
343     /// * `vm` - The virtual machine this vcpu will get attached to.
344     /// * `vm_ops` - Optional object for exit handling.
345     /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
346     pub fn new(
347         id: u8,
348         apic_id: u8,
349         vm: &Arc<dyn hypervisor::Vm>,
350         vm_ops: Option<Arc<dyn VmOps>>,
351         #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
352     ) -> Result<Self> {
353         let vcpu = vm
354             .create_vcpu(apic_id, vm_ops)
355             .map_err(|e| Error::VcpuCreate(e.into()))?;
356         // Initially the cpuid per vCPU is the one supported by this VM.
357         Ok(Vcpu {
358             vcpu,
359             id,
360             #[cfg(target_arch = "aarch64")]
361             mpidr: 0,
362             saved_state: None,
363             #[cfg(target_arch = "x86_64")]
364             vendor: cpu_vendor,
365         })
366     }
367 
368     /// Configures a vcpu and should be called once per vcpu when created.
369     ///
370     /// # Arguments
371     ///
372     /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
373     /// * `guest_memory` - Guest memory.
374     /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
375     pub fn configure(
376         &mut self,
377         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
378         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
379         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
380         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
381         #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
382     ) -> Result<()> {
383         #[cfg(target_arch = "aarch64")]
384         {
385             self.init(vm)?;
386             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
387                 .map_err(Error::VcpuConfiguration)?;
388         }
389         info!("Configuring vCPU: cpu_id = {}", self.id);
390         #[cfg(target_arch = "x86_64")]
391         arch::configure_vcpu(
392             &self.vcpu,
393             self.id,
394             boot_setup,
395             cpuid,
396             kvm_hyperv,
397             self.vendor,
398             topology,
399         )
400         .map_err(Error::VcpuConfiguration)?;
401 
402         Ok(())
403     }
404 
405     /// Gets the MPIDR register value.
406     #[cfg(target_arch = "aarch64")]
407     pub fn get_mpidr(&self) -> u64 {
408         self.mpidr
409     }
410 
411     /// Gets the saved vCPU state.
412     #[cfg(target_arch = "aarch64")]
413     pub fn get_saved_state(&self) -> Option<CpuState> {
414         self.saved_state.clone()
415     }
416 
417     /// Initializes an aarch64 specific vcpu for booting Linux.
418     #[cfg(target_arch = "aarch64")]
419     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
420         let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
421 
422         // This reads back the kernel's preferred target type.
423         vm.get_preferred_target(&mut kvi)
424             .map_err(Error::VcpuArmPreferredTarget)?;
425         // We already checked that the capability is supported.
426         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
427         if vm
428             .as_any()
429             .downcast_ref::<hypervisor::kvm::KvmVm>()
430             .unwrap()
431             .check_extension(Cap::ArmPmuV3)
432         {
433             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
434         }
435         // Non-boot cpus are powered off initially.
436         if self.id > 0 {
437             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
438         }
439         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
440     }
441 
442     /// Runs the VCPU until it exits, returning the reason.
443     ///
444     /// Note that the state of the VCPU and associated VM must be setup first for this to do
445     /// anything useful.
446     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
447         self.vcpu.run()
448     }
449 
450     #[cfg(feature = "sev_snp")]
451     pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
452         self.vcpu
453             .set_sev_control_register(vmsa_pfn)
454             .map_err(Error::SetSevControlRegister)
455     }
456 }
457 
458 impl Pausable for Vcpu {}
459 impl Snapshottable for Vcpu {
460     fn id(&self) -> String {
461         self.id.to_string()
462     }
463 
464     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
465         let saved_state = self
466             .vcpu
467             .state()
468             .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;
469 
470         self.saved_state = Some(saved_state.clone());
471 
472         Ok(Snapshot::from_data(SnapshotData::new_from_state(
473             &saved_state,
474         )?))
475     }
476 }
477 
478 pub struct CpuManager {
479     config: CpusConfig,
480     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
481     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
482     #[cfg(target_arch = "x86_64")]
483     cpuid: Vec<CpuIdEntry>,
484     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
485     vm: Arc<dyn hypervisor::Vm>,
486     vcpus_kill_signalled: Arc<AtomicBool>,
487     vcpus_pause_signalled: Arc<AtomicBool>,
488     vcpus_kick_signalled: Arc<AtomicBool>,
489     exit_evt: EventFd,
490     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
491     reset_evt: EventFd,
492     #[cfg(feature = "guest_debug")]
493     vm_debug_evt: EventFd,
494     vcpu_states: Vec<VcpuState>,
495     selected_cpu: u8,
496     vcpus: Vec<Arc<Mutex<Vcpu>>>,
497     seccomp_action: SeccompAction,
498     vm_ops: Arc<dyn VmOps>,
499     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
500     acpi_address: Option<GuestAddress>,
501     proximity_domain_per_cpu: BTreeMap<u8, u32>,
502     affinity: BTreeMap<u8, Vec<usize>>,
503     dynamic: bool,
504     hypervisor: Arc<dyn hypervisor::Hypervisor>,
505     #[cfg(feature = "sev_snp")]
506     sev_snp_enabled: bool,
507 }
508 
509 const CPU_ENABLE_FLAG: usize = 0;
510 const CPU_INSERTING_FLAG: usize = 1;
511 const CPU_REMOVING_FLAG: usize = 2;
512 const CPU_EJECT_FLAG: usize = 3;
513 
514 const CPU_STATUS_OFFSET: u64 = 4;
515 const CPU_SELECTION_OFFSET: u64 = 0;
516 
517 impl BusDevice for CpuManager {
518     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
519         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
520         data.fill(0);
521 
522         match offset {
523             CPU_SELECTION_OFFSET => {
524                 data[0] = self.selected_cpu;
525             }
526             CPU_STATUS_OFFSET => {
527                 if self.selected_cpu < self.max_vcpus() {
528                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
529                     if state.active() {
530                         data[0] |= 1 << CPU_ENABLE_FLAG;
531                     }
532                     if state.inserting {
533                         data[0] |= 1 << CPU_INSERTING_FLAG;
534                     }
535                     if state.removing {
536                         data[0] |= 1 << CPU_REMOVING_FLAG;
537                     }
538                 } else {
539                     warn!("Out of range vCPU id: {}", self.selected_cpu);
540                 }
541             }
542             _ => {
543                 warn!(
544                     "Unexpected offset for accessing CPU manager device: {:#}",
545                     offset
546                 );
547             }
548         }
549     }
550 
551     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
552         match offset {
553             CPU_SELECTION_OFFSET => {
554                 self.selected_cpu = data[0];
555             }
556             CPU_STATUS_OFFSET => {
557                 if self.selected_cpu < self.max_vcpus() {
558                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
559                     // The ACPI code writes back a 1 to acknowledge the insertion
560                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
561                         && state.inserting
562                     {
563                         state.inserting = false;
564                     }
565                     // Ditto for removal
566                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
567                         && state.removing
568                     {
569                         state.removing = false;
570                     }
571                     // Trigger removal of vCPU
572                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
573                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
574                             error!("Error removing vCPU: {:?}", e);
575                         }
576                     }
577                 } else {
578                     warn!("Out of range vCPU id: {}", self.selected_cpu);
579                 }
580             }
581             _ => {
582                 warn!(
583                     "Unexpected offset for accessing CPU manager device: {:#}",
584                     offset
585                 );
586             }
587         }
588         None
589     }
590 }
591 
592 #[derive(Default)]
593 struct VcpuState {
594     inserting: bool,
595     removing: bool,
596     pending_removal: Arc<AtomicBool>,
597     handle: Option<thread::JoinHandle<()>>,
598     kill: Arc<AtomicBool>,
599     vcpu_run_interrupted: Arc<AtomicBool>,
600     paused: Arc<AtomicBool>,
601 }
602 
603 impl VcpuState {
604     fn active(&self) -> bool {
605         self.handle.is_some()
606     }
607 
608     fn signal_thread(&self) {
609         if let Some(handle) = self.handle.as_ref() {
610             loop {
611                 // SAFETY: FFI call with correct arguments
612                 unsafe {
613                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
614                 }
615                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
616                     break;
617                 } else {
618                     // This is more effective than thread::yield_now() at
619                     // avoiding a priority inversion with the vCPU thread
620                     thread::sleep(std::time::Duration::from_millis(1));
621                 }
622             }
623         }
624     }
625 
626     fn join_thread(&mut self) -> Result<()> {
627         if let Some(handle) = self.handle.take() {
628             handle.join().map_err(Error::ThreadCleanup)?
629         }
630 
631         Ok(())
632     }
633 
634     fn unpark_thread(&self) {
635         if let Some(handle) = self.handle.as_ref() {
636             handle.thread().unpark()
637         }
638     }
639 }
640 
641 impl CpuManager {
642     #[allow(unused_variables)]
643     #[allow(clippy::too_many_arguments)]
644     pub fn new(
645         config: &CpusConfig,
646         vm: Arc<dyn hypervisor::Vm>,
647         exit_evt: EventFd,
648         reset_evt: EventFd,
649         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
650         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
651         seccomp_action: SeccompAction,
652         vm_ops: Arc<dyn VmOps>,
653         #[cfg(feature = "tdx")] tdx_enabled: bool,
654         numa_nodes: &NumaNodes,
655         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
656     ) -> Result<Arc<Mutex<CpuManager>>> {
657         if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
658             return Err(Error::MaximumVcpusExceeded);
659         }
660 
661         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
662         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
663         let hypervisor_type = hypervisor.hypervisor_type();
664         #[cfg(target_arch = "x86_64")]
665         let cpu_vendor = hypervisor.get_cpu_vendor();
666 
667         #[cfg(target_arch = "x86_64")]
668         if config.features.amx {
669             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
670             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
671             const XFEATURE_XTILEDATA: usize = 18;
672             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
673 
674             // SAFETY: the syscall is only modifying kernel internal
675             // data structures that the kernel is itself expected to safeguard.
676             let amx_tile = unsafe {
677                 libc::syscall(
678                     libc::SYS_arch_prctl,
679                     ARCH_REQ_XCOMP_GUEST_PERM,
680                     XFEATURE_XTILEDATA,
681                 )
682             };
683 
684             if amx_tile != 0 {
685                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
686             } else {
687                 let mask: usize = 0;
688                 // SAFETY: the mask being modified (not marked mutable as it is
689                 // modified in unsafe only which is permitted) isn't in use elsewhere.
690                 let result = unsafe {
691                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
692                 };
693                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
694                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
695                 }
696             }
697         }
698 
699         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
700             let mut cpu_list = Vec::new();
701             for (proximity_domain, numa_node) in numa_nodes.iter() {
702                 for cpu in numa_node.cpus.iter() {
703                     cpu_list.push((*cpu, *proximity_domain))
704                 }
705             }
706             cpu_list
707         }
708         .into_iter()
709         .collect();
710 
711         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
712             cpu_affinity
713                 .iter()
714                 .map(|a| (a.vcpu, a.host_cpus.clone()))
715                 .collect()
716         } else {
717             BTreeMap::new()
718         };
719 
720         #[cfg(feature = "tdx")]
721         let dynamic = !tdx_enabled;
722         #[cfg(not(feature = "tdx"))]
723         let dynamic = true;
724 
725         Ok(Arc::new(Mutex::new(CpuManager {
726             config: config.clone(),
727             interrupt_controller: None,
728             #[cfg(target_arch = "x86_64")]
729             cpuid: Vec::new(),
730             vm,
731             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
732             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
733             vcpus_kick_signalled: Arc::new(AtomicBool::new(false)),
734             vcpu_states,
735             exit_evt,
736             reset_evt,
737             #[cfg(feature = "guest_debug")]
738             vm_debug_evt,
739             selected_cpu: 0,
740             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
741             seccomp_action,
742             vm_ops,
743             acpi_address: None,
744             proximity_domain_per_cpu,
745             affinity,
746             dynamic,
747             hypervisor: hypervisor.clone(),
748             #[cfg(feature = "sev_snp")]
749             sev_snp_enabled,
750         })))
751     }
752 
753     #[cfg(target_arch = "x86_64")]
754     pub fn populate_cpuid(
755         &mut self,
756         memory_manager: &Arc<Mutex<MemoryManager>>,
757         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
758         #[cfg(feature = "tdx")] tdx: bool,
759     ) -> Result<()> {
760         let sgx_epc_sections = memory_manager
761             .lock()
762             .unwrap()
763             .sgx_epc_region()
764             .as_ref()
765             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
766 
767         self.cpuid = {
768             let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
769             arch::generate_common_cpuid(
770                 hypervisor,
771                 &arch::CpuidConfig {
772                     sgx_epc_sections,
773                     phys_bits,
774                     kvm_hyperv: self.config.kvm_hyperv,
775                     #[cfg(feature = "tdx")]
776                     tdx,
777                     amx: self.config.features.amx,
778                 },
779             )
780             .map_err(Error::CommonCpuId)?
781         };
782 
783         Ok(())
784     }
785 
786     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
787         info!("Creating vCPU: cpu_id = {}", cpu_id);
788 
789         #[cfg(target_arch = "x86_64")]
790         let topology = self.get_vcpu_topology();
791         #[cfg(target_arch = "x86_64")]
792         let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
793         #[cfg(target_arch = "aarch64")]
794         let x2apic_id = cpu_id as u32;
795 
796         let mut vcpu = Vcpu::new(
797             cpu_id,
798             x2apic_id as u8,
799             &self.vm,
800             Some(self.vm_ops.clone()),
801             #[cfg(target_arch = "x86_64")]
802             self.hypervisor.get_cpu_vendor(),
803         )?;
804 
805         if let Some(snapshot) = snapshot {
806             // AArch64 vCPUs should be initialized after created.
807             #[cfg(target_arch = "aarch64")]
808             vcpu.init(&self.vm)?;
809 
810             let state: CpuState = snapshot.to_state().map_err(|e| {
811                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
812             })?;
813             vcpu.vcpu
814                 .set_state(&state)
815                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
816 
817             vcpu.saved_state = Some(state);
818         }
819 
820         let vcpu = Arc::new(Mutex::new(vcpu));
821 
822         // Adding vCPU to the CpuManager's vCPU list.
823         self.vcpus.push(vcpu.clone());
824 
825         Ok(vcpu)
826     }
827 
828     pub fn configure_vcpu(
829         &self,
830         vcpu: Arc<Mutex<Vcpu>>,
831         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
832     ) -> Result<()> {
833         let mut vcpu = vcpu.lock().unwrap();
834 
835         #[cfg(feature = "sev_snp")]
836         if self.sev_snp_enabled {
837             if let Some((kernel_entry_point, _)) = boot_setup {
838                 vcpu.set_sev_control_register(
839                     kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
840                 )?;
841             }
842 
843             // Traditional way to configure vcpu doesn't work for SEV-SNP guests.
844             // All the vCPU configuration for SEV-SNP guest is provided via VMSA.
845             return Ok(());
846         }
847 
848         #[cfg(target_arch = "x86_64")]
849         assert!(!self.cpuid.is_empty());
850 
851         #[cfg(target_arch = "x86_64")]
852         let topology = self.config.topology.clone().map_or_else(
853             || Some((1, self.boot_vcpus(), 1)),
854             |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
855         );
856         #[cfg(target_arch = "x86_64")]
857         vcpu.configure(
858             boot_setup,
859             self.cpuid.clone(),
860             self.config.kvm_hyperv,
861             topology,
862         )?;
863 
864         #[cfg(target_arch = "aarch64")]
865         vcpu.configure(&self.vm, boot_setup)?;
866 
867         Ok(())
868     }
869 
870     /// Only create new vCPUs if there aren't any inactive ones to reuse
871     fn create_vcpus(
872         &mut self,
873         desired_vcpus: u8,
874         snapshot: Option<Snapshot>,
875     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
876         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
877         info!(
878             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
879             desired_vcpus,
880             self.config.max_vcpus,
881             self.vcpus.len(),
882             self.present_vcpus()
883         );
884 
885         if desired_vcpus > self.config.max_vcpus {
886             return Err(Error::DesiredVCpuCountExceedsMax);
887         }
888 
889         // Only create vCPUs in excess of all the allocated vCPUs.
890         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
891             vcpus.push(self.create_vcpu(
892                 cpu_id,
893                 // TODO: The special format of the CPU id can be removed once
894                 // ready to break live upgrade.
895                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
896             )?);
897         }
898 
899         Ok(vcpus)
900     }
901 
902     #[cfg(target_arch = "aarch64")]
903     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
904         for cpu in self.vcpus.iter() {
905             let cpu = cpu.lock().unwrap();
906             // Check if PMU attr is available, if not, log the information.
907             if cpu.vcpu.has_pmu_support() {
908                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
909             } else {
910                 debug!(
911                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
912                     cpu.id
913                 );
914                 return Ok(false);
915             }
916         }
917 
918         Ok(true)
919     }
920 
921     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
922         self.vcpus.clone()
923     }
924 
925     fn start_vcpu(
926         &mut self,
927         vcpu: Arc<Mutex<Vcpu>>,
928         vcpu_id: u8,
929         vcpu_thread_barrier: Arc<Barrier>,
930         inserting: bool,
931     ) -> Result<()> {
932         let reset_evt = self.reset_evt.try_clone().unwrap();
933         let exit_evt = self.exit_evt.try_clone().unwrap();
934         #[cfg(feature = "kvm")]
935         let hypervisor_type = self.hypervisor.hypervisor_type();
936         #[cfg(feature = "guest_debug")]
937         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
938         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
939         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
940         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
941         let vcpu_kick_signalled = self.vcpus_kick_signalled.clone();
942 
943         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
944         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
945             .vcpu_run_interrupted
946             .clone();
947         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
948         let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();
949 
950         // Prepare the CPU set the current vCPU is expected to run onto.
951         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
952             // SAFETY: all zeros is a valid pattern
953             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
954             // SAFETY: FFI call, trivially safe
955             unsafe { libc::CPU_ZERO(&mut cpuset) };
956             for host_cpu in host_cpus {
957                 // SAFETY: FFI call, trivially safe
958                 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
959             }
960             cpuset
961         });
962 
963         // Retrieve seccomp filter for vcpu thread
964         let vcpu_seccomp_filter = get_seccomp_filter(
965             &self.seccomp_action,
966             Thread::Vcpu,
967             self.hypervisor.hypervisor_type(),
968         )
969         .map_err(Error::CreateSeccompFilter)?;
970 
971         #[cfg(target_arch = "x86_64")]
972         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
973 
974         info!("Starting vCPU: cpu_id = {}", vcpu_id);
975 
976         let handle = Some(
977             thread::Builder::new()
978                 .name(format!("vcpu{vcpu_id}"))
979                 .spawn(move || {
980                     // Schedule the thread to run on the expected CPU set
981                     if let Some(cpuset) = cpuset.as_ref() {
982                         // SAFETY: FFI call with correct arguments
983                         let ret = unsafe {
984                             libc::sched_setaffinity(
985                                 0,
986                                 std::mem::size_of::<libc::cpu_set_t>(),
987                                 cpuset as *const libc::cpu_set_t,
988                             )
989                         };
990 
991                         if ret != 0 {
992                             error!(
993                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
994                                 vcpu_id,
995                                 io::Error::last_os_error()
996                             );
997                             return;
998                         }
999                     }
1000 
1001                     // Apply seccomp filter for vcpu thread.
1002                     if !vcpu_seccomp_filter.is_empty() {
1003                         if let Err(e) =
1004                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
1005                         {
1006                             error!("Error applying seccomp filter: {:?}", e);
1007                             return;
1008                         }
1009                     }
1010                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
1011                     // This uses an async signal safe handler to kill the vcpu handles.
1012                     register_signal_handler(SIGRTMIN(), handle_signal)
1013                         .expect("Failed to register vcpu signal handler");
1014                     // Block until all CPUs are ready.
1015                     vcpu_thread_barrier.wait();
1016 
1017                     std::panic::catch_unwind(move || {
1018                         loop {
1019                             // If we are being told to pause, we park the thread
1020                             // until the pause boolean is toggled.
1021                             // The resume operation is responsible for toggling
1022                             // the boolean and unpark the thread.
1023                             // We enter a loop because park() could spuriously
1024                             // return. We will then park() again unless the
1025                             // pause boolean has been toggled.
1026 
1027                             // Need to use Ordering::SeqCst as we have multiple
1028                             // loads and stores to different atomics and we need
1029                             // to see them in a consistent order in all threads
1030 
1031                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
1032                                 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
1033                                 // completed by returning to KVM_RUN. From the kernel docs:
1034                                 //
1035                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
1036                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
1037                                 // operations are complete (and guest state is consistent) only after userspace
1038                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
1039                                 // incomplete operations and then check for pending signals.
1040                                 // The pending state of the operation is not preserved in state which is
1041                                 // visible to userspace, thus userspace should ensure that the operation is
1042                                 // completed before performing a live migration.  Userspace can re-enter the
1043                                 // guest with an unmasked signal pending or with the immediate_exit field set
1044                                 // to complete pending operations without allowing any further instructions
1045                                 // to be executed.
1046 
1047                                 #[cfg(feature = "kvm")]
1048                                 if matches!(hypervisor_type, HypervisorType::Kvm) {
1049                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
1050                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
1051                                         error!("Unexpected VM exit on \"immediate_exit\" run");
1052                                         break;
1053                                     }
1054                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
1055                                 }
1056 
1057                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1058 
1059                                 vcpu_paused.store(true, Ordering::SeqCst);
1060                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
1061                                     thread::park();
1062                                 }
1063                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
1064                             }
1065 
1066                             if vcpu_kick_signalled.load(Ordering::SeqCst) {
1067                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1068                                 #[cfg(target_arch = "x86_64")]
1069                                 match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
1070                                     Ok(()) => {},
1071                                     Err(e) => {
1072                                         error!("Error when inject nmi {}", e);
1073                                         break;
1074                                     }
1075                                 }
1076                             }
1077 
1078                             // We've been told to terminate
1079                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1080                                 || vcpu_kill.load(Ordering::SeqCst)
1081                             {
1082                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1083                                 break;
1084                             }
1085 
1086                             #[cfg(feature = "tdx")]
1087                             let mut vcpu = vcpu.lock().unwrap();
1088                             #[cfg(not(feature = "tdx"))]
1089                             let vcpu = vcpu.lock().unwrap();
1090                             // vcpu.run() returns false on a triple-fault so trigger a reset
1091                             match vcpu.run() {
1092                                 Ok(run) => match run {
1093                                     #[cfg(feature = "kvm")]
1094                                     VmExit::Debug => {
1095                                         info!("VmExit::Debug");
1096                                         #[cfg(feature = "guest_debug")]
1097                                         {
1098                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
1099                                             let raw_tid = get_raw_tid(vcpu_id as usize);
1100                                             vm_debug_evt.write(raw_tid as u64).unwrap();
1101                                         }
1102                                     }
1103                                     #[cfg(target_arch = "x86_64")]
1104                                     VmExit::IoapicEoi(vector) => {
1105                                         if let Some(interrupt_controller) =
1106                                             &interrupt_controller_clone
1107                                         {
1108                                             interrupt_controller
1109                                                 .lock()
1110                                                 .unwrap()
1111                                                 .end_of_interrupt(vector);
1112                                         }
1113                                     }
1114                                     VmExit::Ignore => {}
1115                                     VmExit::Hyperv => {}
1116                                     VmExit::Reset => {
1117                                         info!("VmExit::Reset");
1118                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1119                                         reset_evt.write(1).unwrap();
1120                                         break;
1121                                     }
1122                                     VmExit::Shutdown => {
1123                                         info!("VmExit::Shutdown");
1124                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1125                                         exit_evt.write(1).unwrap();
1126                                         break;
1127                                     }
1128                                     #[cfg(feature = "tdx")]
1129                                     VmExit::Tdx => {
1130                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1131                                             match vcpu.get_tdx_exit_details() {
1132                                                 Ok(details) => match details {
1133                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1134                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1135                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1136                                                     }
1137                                                 },
1138                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1139                                             }
1140                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1141                                         } else {
1142                                             // We should never reach this code as
1143                                             // this means the design from the code
1144                                             // is wrong.
1145                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1146                                         }
1147                                     }
1148                                 },
1149 
1150                                 Err(e) => {
1151                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1152                                     vcpu_run_interrupted.store(true, Ordering::SeqCst);
1153                                     exit_evt.write(1).unwrap();
1154                                     break;
1155                                 }
1156                             }
1157 
1158                             // We've been told to terminate
1159                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1160                                 || vcpu_kill.load(Ordering::SeqCst)
1161                             {
1162                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1163                                 break;
1164                             }
1165                         }
1166                     })
1167                     .or_else(|_| {
1168                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1169                         error!("vCPU thread panicked");
1170                         panic_exit_evt.write(1)
1171                     })
1172                     .ok();
1173                 })
1174                 .map_err(Error::VcpuSpawn)?,
1175         );
1176 
1177         // On hot plug calls into this function entry_point is None. It is for
1178         // those hotplug CPU additions that we need to set the inserting flag.
1179         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1180         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1181 
1182         Ok(())
1183     }
1184 
1185     /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
1186     fn activate_vcpus(
1187         &mut self,
1188         desired_vcpus: u8,
1189         inserting: bool,
1190         paused: Option<bool>,
1191     ) -> Result<()> {
1192         if desired_vcpus > self.config.max_vcpus {
1193             return Err(Error::DesiredVCpuCountExceedsMax);
1194         }
1195 
1196         let vcpu_thread_barrier = Arc::new(Barrier::new(
1197             (desired_vcpus - self.present_vcpus() + 1) as usize,
1198         ));
1199 
1200         if let Some(paused) = paused {
1201             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1202         }
1203 
1204         info!(
1205             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1206             desired_vcpus,
1207             self.vcpus.len(),
1208             self.present_vcpus(),
1209             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1210         );
1211 
1212         // This reuses any inactive vCPUs as well as any that were newly created
1213         for vcpu_id in self.present_vcpus()..desired_vcpus {
1214             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1215             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1216         }
1217 
1218         // Unblock all CPU threads.
1219         vcpu_thread_barrier.wait();
1220         Ok(())
1221     }
1222 
1223     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1224         // Mark vCPUs for removal, actual removal happens on ejection
1225         for cpu_id in desired_vcpus..self.present_vcpus() {
1226             self.vcpu_states[usize::from(cpu_id)].removing = true;
1227             self.vcpu_states[usize::from(cpu_id)]
1228                 .pending_removal
1229                 .store(true, Ordering::SeqCst);
1230         }
1231     }
1232 
1233     pub fn check_pending_removed_vcpu(&mut self) -> bool {
1234         for state in self.vcpu_states.iter() {
1235             if state.active() && state.pending_removal.load(Ordering::SeqCst) {
1236                 return true;
1237             }
1238         }
1239         false
1240     }
1241 
1242     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1243         info!("Removing vCPU: cpu_id = {}", cpu_id);
1244         let state = &mut self.vcpu_states[usize::from(cpu_id)];
1245         state.kill.store(true, Ordering::SeqCst);
1246         state.signal_thread();
1247         state.join_thread()?;
1248         state.handle = None;
1249 
1250         // Once the thread has exited, clear the "kill" so that it can reused
1251         state.kill.store(false, Ordering::SeqCst);
1252         state.pending_removal.store(false, Ordering::SeqCst);
1253 
1254         Ok(())
1255     }
1256 
1257     pub fn create_boot_vcpus(
1258         &mut self,
1259         snapshot: Option<Snapshot>,
1260     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1261         trace_scoped!("create_boot_vcpus");
1262 
1263         self.create_vcpus(self.boot_vcpus(), snapshot)
1264     }
1265 
1266     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1267     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1268         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1269     }
1270 
1271     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1272         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1273             .map_err(|e| {
1274                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1275             })?;
1276 
1277         Ok(())
1278     }
1279 
1280     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1281         if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
1282             return Ok(false);
1283         }
1284 
1285         if !self.dynamic {
1286             return Ok(false);
1287         }
1288 
1289         if self.check_pending_removed_vcpu() {
1290             return Err(Error::VcpuPendingRemovedVcpu);
1291         }
1292 
1293         match desired_vcpus.cmp(&self.present_vcpus()) {
1294             cmp::Ordering::Greater => {
1295                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1296                 for vcpu in vcpus {
1297                     self.configure_vcpu(vcpu, None)?
1298                 }
1299                 self.activate_vcpus(desired_vcpus, true, None)?;
1300                 Ok(true)
1301             }
1302             cmp::Ordering::Less => {
1303                 self.mark_vcpus_for_removal(desired_vcpus);
1304                 Ok(true)
1305             }
1306             _ => Ok(false),
1307         }
1308     }
1309 
1310     pub fn shutdown(&mut self) -> Result<()> {
1311         // Tell the vCPUs to stop themselves next time they go through the loop
1312         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1313 
1314         // Toggle the vCPUs pause boolean
1315         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1316 
1317         // Unpark all the VCPU threads.
1318         for state in self.vcpu_states.iter() {
1319             state.unpark_thread();
1320         }
1321 
1322         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1323         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1324         // above.
1325         for state in self.vcpu_states.iter() {
1326             state.signal_thread();
1327         }
1328 
1329         // Wait for all the threads to finish. This removes the state from the vector.
1330         for mut state in self.vcpu_states.drain(..) {
1331             state.join_thread()?;
1332         }
1333 
1334         Ok(())
1335     }
1336 
1337     #[cfg(feature = "tdx")]
1338     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1339         for vcpu in &self.vcpus {
1340             vcpu.lock()
1341                 .unwrap()
1342                 .vcpu
1343                 .tdx_init(hob_address)
1344                 .map_err(Error::InitializeTdx)?;
1345         }
1346         Ok(())
1347     }
1348 
1349     pub fn boot_vcpus(&self) -> u8 {
1350         self.config.boot_vcpus
1351     }
1352 
1353     pub fn max_vcpus(&self) -> u8 {
1354         self.config.max_vcpus
1355     }
1356 
1357     #[cfg(target_arch = "x86_64")]
1358     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1359         assert!(!self.cpuid.is_empty());
1360         self.cpuid.clone()
1361     }
1362 
1363     fn present_vcpus(&self) -> u8 {
1364         self.vcpu_states
1365             .iter()
1366             .fold(0, |acc, state| acc + state.active() as u8)
1367     }
1368 
1369     #[cfg(target_arch = "aarch64")]
1370     pub fn get_mpidrs(&self) -> Vec<u64> {
1371         self.vcpus
1372             .iter()
1373             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1374             .collect()
1375     }
1376 
1377     #[cfg(target_arch = "aarch64")]
1378     pub fn get_saved_states(&self) -> Vec<CpuState> {
1379         self.vcpus
1380             .iter()
1381             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1382             .collect()
1383     }
1384 
1385     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1386         self.config
1387             .topology
1388             .clone()
1389             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1390     }
1391 
1392     pub fn create_madt(&self) -> Sdt {
1393         use crate::acpi;
1394         // This is also checked in the commandline parsing.
1395         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1396 
1397         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1398         #[cfg(target_arch = "x86_64")]
1399         {
1400             madt.write(36, arch::layout::APIC_START.0);
1401 
1402             for cpu in 0..self.config.max_vcpus {
1403                 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());
1404 
1405                 let lapic = LocalX2Apic {
1406                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1407                     length: 16,
1408                     processor_id: cpu.into(),
1409                     apic_id: x2apic_id,
1410                     flags: if cpu < self.config.boot_vcpus {
1411                         1 << MADT_CPU_ENABLE_FLAG
1412                     } else {
1413                         0
1414                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1415                     _reserved: 0,
1416                 };
1417                 madt.append(lapic);
1418             }
1419 
1420             madt.append(Ioapic {
1421                 r#type: acpi::ACPI_APIC_IO,
1422                 length: 12,
1423                 ioapic_id: 0,
1424                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1425                 gsi_base: 0,
1426                 ..Default::default()
1427             });
1428 
1429             madt.append(InterruptSourceOverride {
1430                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1431                 length: 10,
1432                 bus: 0,
1433                 source: 4,
1434                 gsi: 4,
1435                 flags: 0,
1436             });
1437         }
1438 
1439         #[cfg(target_arch = "aarch64")]
1440         {
1441             /* Notes:
1442              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1443              */
1444 
1445             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1446             for cpu in 0..self.config.boot_vcpus {
1447                 let vcpu = &self.vcpus[cpu as usize];
1448                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1449                 /* ARMv8 MPIDR format:
1450                      Bits [63:40] Must be zero
1451                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1452                      Bits [31:24] Must be zero
1453                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1454                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1455                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1456                 */
1457                 let mpidr_mask = 0xff_00ff_ffff;
1458                 let gicc = GicC {
1459                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1460                     length: 80,
1461                     reserved0: 0,
1462                     cpu_interface_number: cpu as u32,
1463                     uid: cpu as u32,
1464                     flags: 1,
1465                     parking_version: 0,
1466                     performance_interrupt: 0,
1467                     parked_address: 0,
1468                     base_address: 0,
1469                     gicv_base_address: 0,
1470                     gich_base_address: 0,
1471                     vgic_interrupt: 0,
1472                     gicr_base_address: 0,
1473                     mpidr: mpidr & mpidr_mask,
1474                     proc_power_effi_class: 0,
1475                     reserved1: 0,
1476                     spe_overflow_interrupt: 0,
1477                 };
1478 
1479                 madt.append(gicc);
1480             }
1481             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1482 
1483             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1484             let gicd = GicD {
1485                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1486                 length: 24,
1487                 reserved0: 0,
1488                 gic_id: 0,
1489                 base_address: vgic_config.dist_addr,
1490                 global_irq_base: 0,
1491                 version: 3,
1492                 reserved1: [0; 3],
1493             };
1494             madt.append(gicd);
1495 
1496             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1497             let gicr = GicR {
1498                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1499                 length: 16,
1500                 reserved: 0,
1501                 base_address: vgic_config.redists_addr,
1502                 range_length: vgic_config.redists_size as u32,
1503             };
1504             madt.append(gicr);
1505 
1506             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1507             let gicits = GicIts {
1508                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1509                 length: 20,
1510                 reserved0: 0,
1511                 translation_id: 0,
1512                 base_address: vgic_config.msi_addr,
1513                 reserved1: 0,
1514             };
1515             madt.append(gicits);
1516 
1517             madt.update_checksum();
1518         }
1519 
1520         madt
1521     }
1522 
1523     #[cfg(target_arch = "aarch64")]
1524     pub fn create_pptt(&self) -> Sdt {
1525         let pptt_start = 0;
1526         let mut cpus = 0;
1527         let mut uid = 0;
1528         // If topology is not specified, the default setting is:
1529         // 1 package, multiple cores, 1 thread per core
1530         // This is also the behavior when PPTT is missing.
1531         let (threads_per_core, cores_per_package, packages) =
1532             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1533 
1534         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1535 
1536         for cluster_idx in 0..packages {
1537             if cpus < self.config.boot_vcpus as usize {
1538                 let cluster_offset = pptt.len() - pptt_start;
1539                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1540                     r#type: 0,
1541                     length: 20,
1542                     reserved: 0,
1543                     flags: 0x2,
1544                     parent: 0,
1545                     acpi_processor_id: cluster_idx as u32,
1546                     num_private_resources: 0,
1547                 };
1548                 pptt.append(cluster_hierarchy_node);
1549 
1550                 for core_idx in 0..cores_per_package {
1551                     let core_offset = pptt.len() - pptt_start;
1552 
1553                     if threads_per_core > 1 {
1554                         let core_hierarchy_node = ProcessorHierarchyNode {
1555                             r#type: 0,
1556                             length: 20,
1557                             reserved: 0,
1558                             flags: 0x2,
1559                             parent: cluster_offset as u32,
1560                             acpi_processor_id: core_idx as u32,
1561                             num_private_resources: 0,
1562                         };
1563                         pptt.append(core_hierarchy_node);
1564 
1565                         for _thread_idx in 0..threads_per_core {
1566                             let thread_hierarchy_node = ProcessorHierarchyNode {
1567                                 r#type: 0,
1568                                 length: 20,
1569                                 reserved: 0,
1570                                 flags: 0xE,
1571                                 parent: core_offset as u32,
1572                                 acpi_processor_id: uid as u32,
1573                                 num_private_resources: 0,
1574                             };
1575                             pptt.append(thread_hierarchy_node);
1576                             uid += 1;
1577                         }
1578                     } else {
1579                         let thread_hierarchy_node = ProcessorHierarchyNode {
1580                             r#type: 0,
1581                             length: 20,
1582                             reserved: 0,
1583                             flags: 0xA,
1584                             parent: cluster_offset as u32,
1585                             acpi_processor_id: uid as u32,
1586                             num_private_resources: 0,
1587                         };
1588                         pptt.append(thread_hierarchy_node);
1589                         uid += 1;
1590                     }
1591                 }
1592                 cpus += (cores_per_package * threads_per_core) as usize;
1593             }
1594         }
1595 
1596         pptt.update_checksum();
1597         pptt
1598     }
1599 
1600     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1601     fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters {
1602         self.vcpus[usize::from(cpu_id)]
1603             .lock()
1604             .unwrap()
1605             .vcpu
1606             .create_standard_regs()
1607     }
1608 
1609     #[cfg(feature = "guest_debug")]
1610     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1611         self.vcpus[usize::from(cpu_id)]
1612             .lock()
1613             .unwrap()
1614             .vcpu
1615             .get_regs()
1616             .map_err(Error::CpuDebug)
1617     }
1618 
1619     #[cfg(feature = "guest_debug")]
1620     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1621         self.vcpus[usize::from(cpu_id)]
1622             .lock()
1623             .unwrap()
1624             .vcpu
1625             .set_regs(regs)
1626             .map_err(Error::CpuDebug)
1627     }
1628 
1629     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1630     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1631         self.vcpus[usize::from(cpu_id)]
1632             .lock()
1633             .unwrap()
1634             .vcpu
1635             .get_sregs()
1636             .map_err(Error::CpuDebug)
1637     }
1638 
1639     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1640     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1641         self.vcpus[usize::from(cpu_id)]
1642             .lock()
1643             .unwrap()
1644             .vcpu
1645             .set_sregs(sregs)
1646             .map_err(Error::CpuDebug)
1647     }
1648 
1649     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1650     fn translate_gva(
1651         &self,
1652         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1653         cpu_id: u8,
1654         gva: u64,
1655     ) -> Result<u64> {
1656         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1657             .lock()
1658             .unwrap()
1659             .vcpu
1660             .translate_gva(gva, /* flags: unused */ 0)
1661             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1662         Ok(gpa)
1663     }
1664 
1665     ///
1666     /// On AArch64, `translate_gva` API is not provided by KVM. We implemented
1667     /// it in VMM by walking through translation tables.
1668     ///
1669     /// Address translation is big topic, here we only focus the scenario that
1670     /// happens in VMM while debugging kernel. This `translate_gva`
1671     /// implementation is restricted to:
1672     /// - Exception Level 1
1673     /// - Translate high address range only (kernel space)
1674     ///
1675     /// This implementation supports following Arm-v8a features related to
1676     /// address translation:
1677     /// - FEAT_LPA
1678     /// - FEAT_LVA
1679     /// - FEAT_LPA2
1680     ///
1681     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1682     fn translate_gva(
1683         &self,
1684         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1685         cpu_id: u8,
1686         gva: u64,
1687     ) -> Result<u64> {
1688         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1689             .lock()
1690             .unwrap()
1691             .vcpu
1692             .get_sys_reg(regs::TCR_EL1)
1693             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1694         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1695             .lock()
1696             .unwrap()
1697             .vcpu
1698             .get_sys_reg(regs::TTBR1_EL1)
1699             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1700         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1701             .lock()
1702             .unwrap()
1703             .vcpu
1704             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1705             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1706 
1707         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1708         // or low (0x000xxx...).
1709         let high_range = extract_bits_64!(gva, 55, 1);
1710         if high_range == 0 {
1711             info!("VA (0x{:x}) range is not supported!", gva);
1712             return Ok(gva);
1713         }
1714 
1715         // High range size offset
1716         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1717         // Granule size
1718         let tg = extract_bits_64!(tcr_el1, 30, 2);
1719         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1720         let ds = extract_bits_64!(tcr_el1, 59, 1);
1721 
1722         if tsz == 0 {
1723             info!("VA translation is not ready!");
1724             return Ok(gva);
1725         }
1726 
1727         // VA size is determined by TCR_BL1.T1SZ
1728         let va_size = 64 - tsz;
1729         // Number of bits in VA consumed in each level of translation
1730         let stride = match tg {
1731             3 => 13, // 64KB granule size
1732             1 => 11, // 16KB granule size
1733             _ => 9,  // 4KB, default
1734         };
1735         // Starting level of walking
1736         let mut level = 4 - (va_size - 4) / stride;
1737 
1738         // PA or IPA size is determined
1739         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1740         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1741         // The IPA size in TCR_BL1 and PA Range in ID_AA64MMFR0_EL1 should match.
1742         // To be safe, we use the minimum value if they are different.
1743         let pa_range = std::cmp::min(tcr_ips, pa_range);
1744         // PA size in bits
1745         let pa_size = match pa_range {
1746             0 => 32,
1747             1 => 36,
1748             2 => 40,
1749             3 => 42,
1750             4 => 44,
1751             5 => 48,
1752             6 => 52,
1753             _ => {
1754                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1755                     "PA range not supported {pa_range}"
1756                 ))))
1757             }
1758         };
1759 
1760         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1761         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1762         // If FEAT_LPA2 is present, the translation table descriptor holds
1763         // 50 bits of the table address of next level.
1764         // Otherwise, it is 48 bits.
1765         let descaddrmask = if ds == 1 {
1766             !0u64 >> (64 - 50) // mask with 50 least significant bits
1767         } else {
1768             !0u64 >> (64 - 48) // mask with 48 least significant bits
1769         };
1770         let descaddrmask = descaddrmask & !indexmask_grainsize;
1771 
1772         // Translation table base address
1773         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1774         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1775         // address bits [48:51] comes from TTBR1_EL1 bits [2:5].
1776         if pa_size == 52 {
1777             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1778         }
1779 
1780         // Loop through tables of each level
1781         loop {
1782             // Table offset for current level
1783             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1784             descaddr |= table_offset;
1785             descaddr &= !7u64;
1786 
1787             let mut buf = [0; 8];
1788             guest_memory
1789                 .memory()
1790                 .read(&mut buf, GuestAddress(descaddr))
1791                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1792             let descriptor = u64::from_le_bytes(buf);
1793 
1794             descaddr = descriptor & descaddrmask;
1795             // In the case of FEAT_LPA, the next-level translation table address
1796             // bits [48:51] comes from bits [12:15] of the current descriptor.
1797             // For FEAT_LPA2, the next-level translation table address
1798             // bits [50:51] comes from bits [8:9] of the current descriptor,
1799             // bits [48:49] comes from bits [48:49] of the descriptor which was
1800             // handled previously.
1801             if pa_size == 52 {
1802                 if ds == 1 {
1803                     // FEAT_LPA2
1804                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1805                 } else {
1806                     // FEAT_LPA
1807                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1808                 }
1809             }
1810 
1811             if (descriptor & 2) != 0 && (level < 3) {
1812                 // This is a table entry. Go down to next level.
1813                 level += 1;
1814                 indexmask = indexmask_grainsize;
1815                 continue;
1816             }
1817 
1818             break;
1819         }
1820 
1821         // We have reached either:
1822         // - a page entry at level 3 or
1823         // - a block entry at level 1 or 2
1824         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1825         descaddr &= !(page_size - 1);
1826         descaddr |= gva & (page_size - 1);
1827 
1828         Ok(descaddr)
1829     }
1830 
1831     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1832         self.acpi_address = Some(acpi_address);
1833     }
1834 
1835     pub(crate) fn set_interrupt_controller(
1836         &mut self,
1837         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1838     ) {
1839         self.interrupt_controller = Some(interrupt_controller);
1840     }
1841 
1842     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1843         &self.vcpus_kill_signalled
1844     }
1845 
1846     #[cfg(feature = "igvm")]
1847     pub(crate) fn get_cpuid_leaf(
1848         &self,
1849         cpu_id: u8,
1850         eax: u32,
1851         ecx: u32,
1852         xfem: u64,
1853         xss: u64,
1854     ) -> Result<[u32; 4]> {
1855         let leaf_info = self.vcpus[usize::from(cpu_id)]
1856             .lock()
1857             .unwrap()
1858             .vcpu
1859             .get_cpuid_values(eax, ecx, xfem, xss)
1860             .unwrap();
1861         Ok(leaf_info)
1862     }
1863 
1864     #[cfg(feature = "sev_snp")]
1865     pub(crate) fn sev_snp_enabled(&self) -> bool {
1866         self.sev_snp_enabled
1867     }
1868 
1869     pub(crate) fn nmi(&self) -> Result<()> {
1870         self.vcpus_kick_signalled.store(true, Ordering::SeqCst);
1871 
1872         for state in self.vcpu_states.iter() {
1873             state.signal_thread();
1874         }
1875 
1876         self.vcpus_kick_signalled.store(false, Ordering::SeqCst);
1877 
1878         Ok(())
1879     }
1880 }
1881 
1882 struct Cpu {
1883     cpu_id: u8,
1884     proximity_domain: u32,
1885     dynamic: bool,
1886     #[cfg(target_arch = "x86_64")]
1887     topology: Option<(u8, u8, u8)>,
1888 }
1889 
1890 #[cfg(target_arch = "x86_64")]
1891 const MADT_CPU_ENABLE_FLAG: usize = 0;
1892 
1893 #[cfg(target_arch = "x86_64")]
1894 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1895 
1896 impl Cpu {
1897     #[cfg(target_arch = "x86_64")]
1898     fn generate_mat(&self) -> Vec<u8> {
1899         let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);
1900 
1901         let lapic = LocalX2Apic {
1902             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1903             length: 16,
1904             processor_id: self.cpu_id.into(),
1905             apic_id: x2apic_id,
1906             flags: 1 << MADT_CPU_ENABLE_FLAG,
1907             _reserved: 0,
1908         };
1909 
1910         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1911         // SAFETY: mat_data is large enough to hold lapic
1912         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
1913 
1914         mat_data
1915     }
1916 }
1917 
1918 impl Aml for Cpu {
1919     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1920         #[cfg(target_arch = "x86_64")]
1921         let mat_data: Vec<u8> = self.generate_mat();
1922         #[allow(clippy::if_same_then_else)]
1923         if self.dynamic {
1924             aml::Device::new(
1925                 format!("C{:03X}", self.cpu_id).as_str().into(),
1926                 vec![
1927                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1928                     &aml::Name::new("_UID".into(), &self.cpu_id),
1929                     // Currently, AArch64 cannot support following fields.
1930                     /*
1931                     _STA return value:
1932                     Bit [0] – Set if the device is present.
1933                     Bit [1] – Set if the device is enabled and decoding its resources.
1934                     Bit [2] – Set if the device should be shown in the UI.
1935                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1936                     Bit [4] – Set if the battery is present.
1937                     Bits [31:5] – Reserved (must be cleared).
1938                     */
1939                     #[cfg(target_arch = "x86_64")]
1940                     &aml::Method::new(
1941                         "_STA".into(),
1942                         0,
1943                         false,
1944                         // Call into CSTA method which will interrogate device
1945                         vec![&aml::Return::new(&aml::MethodCall::new(
1946                             "CSTA".into(),
1947                             vec![&self.cpu_id],
1948                         ))],
1949                     ),
1950                     &aml::Method::new(
1951                         "_PXM".into(),
1952                         0,
1953                         false,
1954                         vec![&aml::Return::new(&self.proximity_domain)],
1955                     ),
1956                     // The Linux kernel expects every CPU device to have a _MAT entry
1957                     // containing the LAPIC for this processor with the enabled bit set
1958                     // even it if is disabled in the MADT (non-boot CPU)
1959                     #[cfg(target_arch = "x86_64")]
1960                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1961                     // Trigger CPU ejection
1962                     #[cfg(target_arch = "x86_64")]
1963                     &aml::Method::new(
1964                         "_EJ0".into(),
1965                         1,
1966                         false,
1967                         // Call into CEJ0 method which will actually eject device
1968                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1969                     ),
1970                 ],
1971             )
1972             .to_aml_bytes(sink);
1973         } else {
1974             aml::Device::new(
1975                 format!("C{:03X}", self.cpu_id).as_str().into(),
1976                 vec![
1977                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1978                     &aml::Name::new("_UID".into(), &self.cpu_id),
1979                     #[cfg(target_arch = "x86_64")]
1980                     &aml::Method::new(
1981                         "_STA".into(),
1982                         0,
1983                         false,
1984                         // Mark CPU present see CSTA implementation
1985                         vec![&aml::Return::new(&0xfu8)],
1986                     ),
1987                     &aml::Method::new(
1988                         "_PXM".into(),
1989                         0,
1990                         false,
1991                         vec![&aml::Return::new(&self.proximity_domain)],
1992                     ),
1993                     // The Linux kernel expects every CPU device to have a _MAT entry
1994                     // containing the LAPIC for this processor with the enabled bit set
1995                     // even it if is disabled in the MADT (non-boot CPU)
1996                     #[cfg(target_arch = "x86_64")]
1997                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1998                 ],
1999             )
2000             .to_aml_bytes(sink);
2001         }
2002     }
2003 }
2004 
2005 struct CpuNotify {
2006     cpu_id: u8,
2007 }
2008 
2009 impl Aml for CpuNotify {
2010     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2011         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
2012         aml::If::new(
2013             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
2014             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2015         )
2016         .to_aml_bytes(sink)
2017     }
2018 }
2019 
2020 struct CpuMethods {
2021     max_vcpus: u8,
2022     dynamic: bool,
2023 }
2024 
2025 impl Aml for CpuMethods {
2026     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2027         if self.dynamic {
2028             // CPU status method
2029             aml::Method::new(
2030                 "CSTA".into(),
2031                 1,
2032                 true,
2033                 vec![
2034                     // Take lock defined above
2035                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2036                     // Write CPU number (in first argument) to I/O port via field
2037                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2038                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2039                     // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2040                     &aml::If::new(
2041                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
2042                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2043                     ),
2044                     // Release lock
2045                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2046                     // Return 0 or 0xf
2047                     &aml::Return::new(&aml::Local(0)),
2048                 ],
2049             )
2050             .to_aml_bytes(sink);
2051 
2052             let mut cpu_notifies = Vec::new();
2053             for cpu_id in 0..self.max_vcpus {
2054                 cpu_notifies.push(CpuNotify { cpu_id });
2055             }
2056 
2057             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
2058             for cpu_id in 0..self.max_vcpus {
2059                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
2060             }
2061 
2062             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
2063 
2064             aml::Method::new(
2065                 "CEJ0".into(),
2066                 1,
2067                 true,
2068                 vec![
2069                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2070                     // Write CPU number (in first argument) to I/O port via field
2071                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2072                     // Set CEJ0 bit
2073                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
2074                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2075                 ],
2076             )
2077             .to_aml_bytes(sink);
2078 
2079             aml::Method::new(
2080                 "CSCN".into(),
2081                 0,
2082                 true,
2083                 vec![
2084                     // Take lock defined above
2085                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2086                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2087                     &aml::While::new(
2088                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
2089                         vec![
2090                             // Write CPU number (in first argument) to I/O port via field
2091                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
2092                             // Check if CINS bit is set
2093                             &aml::If::new(
2094                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
2095                                 // Notify device if it is
2096                                 vec![
2097                                     &aml::MethodCall::new(
2098                                         "CTFY".into(),
2099                                         vec![&aml::Local(0), &aml::ONE],
2100                                     ),
2101                                     // Reset CINS bit
2102                                     &aml::Store::new(
2103                                         &aml::Path::new("\\_SB_.PRES.CINS"),
2104                                         &aml::ONE,
2105                                     ),
2106                                 ],
2107                             ),
2108                             // Check if CRMV bit is set
2109                             &aml::If::new(
2110                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
2111                                 // Notify device if it is (with the eject constant 0x3)
2112                                 vec![
2113                                     &aml::MethodCall::new(
2114                                         "CTFY".into(),
2115                                         vec![&aml::Local(0), &3u8],
2116                                     ),
2117                                     // Reset CRMV bit
2118                                     &aml::Store::new(
2119                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
2120                                         &aml::ONE,
2121                                     ),
2122                                 ],
2123                             ),
2124                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2125                         ],
2126                     ),
2127                     // Release lock
2128                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2129                 ],
2130             )
2131             .to_aml_bytes(sink)
2132         } else {
2133             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2134         }
2135     }
2136 }
2137 
2138 impl Aml for CpuManager {
2139     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2140         #[cfg(target_arch = "x86_64")]
2141         if let Some(acpi_address) = self.acpi_address {
2142             // CPU hotplug controller
2143             aml::Device::new(
2144                 "_SB_.PRES".into(),
2145                 vec![
2146                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2147                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2148                     // Mutex to protect concurrent access as we write to choose CPU and then read back status
2149                     &aml::Mutex::new("CPLK".into(), 0),
2150                     &aml::Name::new(
2151                         "_CRS".into(),
2152                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2153                             aml::AddressSpaceCacheable::NotCacheable,
2154                             true,
2155                             acpi_address.0,
2156                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2157                             None,
2158                         )]),
2159                     ),
2160                     // OpRegion and Fields map MMIO range into individual field values
2161                     &aml::OpRegion::new(
2162                         "PRST".into(),
2163                         aml::OpRegionSpace::SystemMemory,
2164                         &(acpi_address.0 as usize),
2165                         &CPU_MANAGER_ACPI_SIZE,
2166                     ),
2167                     &aml::Field::new(
2168                         "PRST".into(),
2169                         aml::FieldAccessType::Byte,
2170                         aml::FieldLockRule::NoLock,
2171                         aml::FieldUpdateRule::WriteAsZeroes,
2172                         vec![
2173                             aml::FieldEntry::Reserved(32),
2174                             aml::FieldEntry::Named(*b"CPEN", 1),
2175                             aml::FieldEntry::Named(*b"CINS", 1),
2176                             aml::FieldEntry::Named(*b"CRMV", 1),
2177                             aml::FieldEntry::Named(*b"CEJ0", 1),
2178                             aml::FieldEntry::Reserved(4),
2179                             aml::FieldEntry::Named(*b"CCMD", 8),
2180                         ],
2181                     ),
2182                     &aml::Field::new(
2183                         "PRST".into(),
2184                         aml::FieldAccessType::DWord,
2185                         aml::FieldLockRule::NoLock,
2186                         aml::FieldUpdateRule::Preserve,
2187                         vec![
2188                             aml::FieldEntry::Named(*b"CSEL", 32),
2189                             aml::FieldEntry::Reserved(32),
2190                             aml::FieldEntry::Named(*b"CDAT", 32),
2191                         ],
2192                     ),
2193                 ],
2194             )
2195             .to_aml_bytes(sink);
2196         }
2197 
2198         // CPU devices
2199         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2200         let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2201         // Bundle methods together under a common object
2202         let methods = CpuMethods {
2203             max_vcpus: self.config.max_vcpus,
2204             dynamic: self.dynamic,
2205         };
2206         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];
2207 
2208         #[cfg(target_arch = "x86_64")]
2209         let topology = self.get_vcpu_topology();
2210         let mut cpu_devices = Vec::new();
2211         for cpu_id in 0..self.config.max_vcpus {
2212             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2213             let cpu_device = Cpu {
2214                 cpu_id,
2215                 proximity_domain,
2216                 dynamic: self.dynamic,
2217                 #[cfg(target_arch = "x86_64")]
2218                 topology,
2219             };
2220 
2221             cpu_devices.push(cpu_device);
2222         }
2223 
2224         for cpu_device in cpu_devices.iter() {
2225             cpu_data_inner.push(cpu_device);
2226         }
2227 
2228         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2229     }
2230 }
2231 
2232 impl Pausable for CpuManager {
2233     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2234         // Tell the vCPUs to pause themselves next time they exit
2235         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2236 
2237         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2238         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2239         // above.
2240         for state in self.vcpu_states.iter() {
2241             state.signal_thread();
2242         }
2243 
2244         for vcpu in self.vcpus.iter() {
2245             let mut vcpu = vcpu.lock().unwrap();
2246             vcpu.pause()?;
2247             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2248             if !self.config.kvm_hyperv {
2249                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2250                     MigratableError::Pause(anyhow!(
2251                         "Could not notify guest it has been paused {:?}",
2252                         e
2253                     ))
2254                 })?;
2255             }
2256         }
2257 
2258         // The vCPU thread will change its paused state before parking, wait here for each
2259         // activated vCPU change their state to ensure they have parked.
2260         for state in self.vcpu_states.iter() {
2261             if state.active() {
2262                 while !state.paused.load(Ordering::SeqCst) {
2263                     // To avoid a priority inversion with the vCPU thread
2264                     thread::sleep(std::time::Duration::from_millis(1));
2265                 }
2266             }
2267         }
2268 
2269         Ok(())
2270     }
2271 
2272     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2273         for vcpu in self.vcpus.iter() {
2274             vcpu.lock().unwrap().resume()?;
2275         }
2276 
2277         // Toggle the vCPUs pause boolean
2278         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2279 
2280         // Unpark all the VCPU threads.
2281         // Once unparked, the next thing they will do is checking for the pause
2282         // boolean. Since it'll be set to false, they will exit their pause loop
2283         // and go back to vmx root.
2284         for state in self.vcpu_states.iter() {
2285             state.paused.store(false, Ordering::SeqCst);
2286             state.unpark_thread();
2287         }
2288         Ok(())
2289     }
2290 }
2291 
2292 impl Snapshottable for CpuManager {
2293     fn id(&self) -> String {
2294         CPU_MANAGER_SNAPSHOT_ID.to_string()
2295     }
2296 
2297     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2298         let mut cpu_manager_snapshot = Snapshot::default();
2299 
2300         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2301         for vcpu in &self.vcpus {
2302             let mut vcpu = vcpu.lock().unwrap();
2303             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2304         }
2305 
2306         Ok(cpu_manager_snapshot)
2307     }
2308 }
2309 
2310 impl Transportable for CpuManager {}
2311 impl Migratable for CpuManager {}
2312 
2313 #[cfg(feature = "guest_debug")]
2314 impl Debuggable for CpuManager {
2315     #[cfg(feature = "kvm")]
2316     fn set_guest_debug(
2317         &self,
2318         cpu_id: usize,
2319         addrs: &[GuestAddress],
2320         singlestep: bool,
2321     ) -> std::result::Result<(), DebuggableError> {
2322         self.vcpus[cpu_id]
2323             .lock()
2324             .unwrap()
2325             .vcpu
2326             .set_guest_debug(addrs, singlestep)
2327             .map_err(DebuggableError::SetDebug)
2328     }
2329 
2330     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2331         Ok(())
2332     }
2333 
2334     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2335         Ok(())
2336     }
2337 
2338     #[cfg(target_arch = "x86_64")]
2339     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2340         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2341         let gregs = self
2342             .get_regs(cpu_id as u8)
2343             .map_err(DebuggableError::ReadRegs)?;
2344         let regs = [
2345             gregs.get_rax(),
2346             gregs.get_rbx(),
2347             gregs.get_rcx(),
2348             gregs.get_rdx(),
2349             gregs.get_rsi(),
2350             gregs.get_rdi(),
2351             gregs.get_rbp(),
2352             gregs.get_rsp(),
2353             gregs.get_r8(),
2354             gregs.get_r9(),
2355             gregs.get_r10(),
2356             gregs.get_r11(),
2357             gregs.get_r12(),
2358             gregs.get_r13(),
2359             gregs.get_r14(),
2360             gregs.get_r15(),
2361         ];
2362 
2363         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2364         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2365         let eflags = gregs.get_rflags() as u32;
2366         let rip = gregs.get_rip();
2367 
2368         // Segment registers: CS, SS, DS, ES, FS, GS
2369         let sregs = self
2370             .get_sregs(cpu_id as u8)
2371             .map_err(DebuggableError::ReadRegs)?;
2372         let segments = X86SegmentRegs {
2373             cs: sregs.cs.selector as u32,
2374             ss: sregs.ss.selector as u32,
2375             ds: sregs.ds.selector as u32,
2376             es: sregs.es.selector as u32,
2377             fs: sregs.fs.selector as u32,
2378             gs: sregs.gs.selector as u32,
2379         };
2380 
2381         // TODO: Add other registers
2382 
2383         Ok(CoreRegs {
2384             regs,
2385             eflags,
2386             rip,
2387             segments,
2388             ..Default::default()
2389         })
2390     }
2391 
2392     #[cfg(target_arch = "aarch64")]
2393     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2394         let gregs = self
2395             .get_regs(cpu_id as u8)
2396             .map_err(DebuggableError::ReadRegs)?;
2397         Ok(CoreRegs {
2398             x: gregs.regs.regs,
2399             sp: gregs.regs.sp,
2400             pc: gregs.regs.pc,
2401             ..Default::default()
2402         })
2403     }
2404 
2405     #[cfg(target_arch = "x86_64")]
2406     fn write_regs(
2407         &self,
2408         cpu_id: usize,
2409         regs: &CoreRegs,
2410     ) -> std::result::Result<(), DebuggableError> {
2411         let orig_gregs = self
2412             .get_regs(cpu_id as u8)
2413             .map_err(DebuggableError::ReadRegs)?;
2414         let mut gregs = self.create_standard_regs(cpu_id as u8);
2415         gregs.set_rax(regs.regs[0]);
2416         gregs.set_rbx(regs.regs[1]);
2417         gregs.set_rcx(regs.regs[2]);
2418         gregs.set_rdx(regs.regs[3]);
2419         gregs.set_rsi(regs.regs[4]);
2420         gregs.set_rdi(regs.regs[5]);
2421         gregs.set_rbp(regs.regs[6]);
2422         gregs.set_rsp(regs.regs[7]);
2423         gregs.set_r8(regs.regs[8]);
2424         gregs.set_r9(regs.regs[9]);
2425         gregs.set_r10(regs.regs[10]);
2426         gregs.set_r11(regs.regs[11]);
2427         gregs.set_r12(regs.regs[12]);
2428         gregs.set_r13(regs.regs[13]);
2429         gregs.set_r14(regs.regs[14]);
2430         gregs.set_r15(regs.regs[15]);
2431         gregs.set_rip(regs.rip);
2432         // Update the lower 32-bit of rflags.
2433         gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));
2434 
2435         self.set_regs(cpu_id as u8, &gregs)
2436             .map_err(DebuggableError::WriteRegs)?;
2437 
2438         // Segment registers: CS, SS, DS, ES, FS, GS
2439         // Since GDB care only selectors, we call get_sregs() first.
2440         let mut sregs = self
2441             .get_sregs(cpu_id as u8)
2442             .map_err(DebuggableError::ReadRegs)?;
2443         sregs.cs.selector = regs.segments.cs as u16;
2444         sregs.ss.selector = regs.segments.ss as u16;
2445         sregs.ds.selector = regs.segments.ds as u16;
2446         sregs.es.selector = regs.segments.es as u16;
2447         sregs.fs.selector = regs.segments.fs as u16;
2448         sregs.gs.selector = regs.segments.gs as u16;
2449 
2450         self.set_sregs(cpu_id as u8, &sregs)
2451             .map_err(DebuggableError::WriteRegs)?;
2452 
2453         // TODO: Add other registers
2454 
2455         Ok(())
2456     }
2457 
2458     #[cfg(target_arch = "aarch64")]
2459     fn write_regs(
2460         &self,
2461         cpu_id: usize,
2462         regs: &CoreRegs,
2463     ) -> std::result::Result<(), DebuggableError> {
2464         let mut gregs = self
2465             .get_regs(cpu_id as u8)
2466             .map_err(DebuggableError::ReadRegs)?;
2467 
2468         gregs.regs.regs = regs.x;
2469         gregs.regs.sp = regs.sp;
2470         gregs.regs.pc = regs.pc;
2471 
2472         self.set_regs(cpu_id as u8, &gregs)
2473             .map_err(DebuggableError::WriteRegs)?;
2474 
2475         Ok(())
2476     }
2477 
2478     fn read_mem(
2479         &self,
2480         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2481         cpu_id: usize,
2482         vaddr: GuestAddress,
2483         len: usize,
2484     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2485         let mut buf = vec![0; len];
2486         let mut total_read = 0_u64;
2487 
2488         while total_read < len as u64 {
2489             let gaddr = vaddr.0 + total_read;
2490             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2491                 Ok(paddr) => paddr,
2492                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2493                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2494             };
2495             let psize = arch::PAGE_SIZE as u64;
2496             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2497             guest_memory
2498                 .memory()
2499                 .read(
2500                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2501                     GuestAddress(paddr),
2502                 )
2503                 .map_err(DebuggableError::ReadMem)?;
2504             total_read += read_len;
2505         }
2506         Ok(buf)
2507     }
2508 
2509     fn write_mem(
2510         &self,
2511         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2512         cpu_id: usize,
2513         vaddr: &GuestAddress,
2514         data: &[u8],
2515     ) -> std::result::Result<(), DebuggableError> {
2516         let mut total_written = 0_u64;
2517 
2518         while total_written < data.len() as u64 {
2519             let gaddr = vaddr.0 + total_written;
2520             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2521                 Ok(paddr) => paddr,
2522                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2523                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2524             };
2525             let psize = arch::PAGE_SIZE as u64;
2526             let write_len = std::cmp::min(
2527                 data.len() as u64 - total_written,
2528                 psize - (paddr & (psize - 1)),
2529             );
2530             guest_memory
2531                 .memory()
2532                 .write(
2533                     &data[total_written as usize..total_written as usize + write_len as usize],
2534                     GuestAddress(paddr),
2535                 )
2536                 .map_err(DebuggableError::WriteMem)?;
2537             total_written += write_len;
2538         }
2539         Ok(())
2540     }
2541 
2542     fn active_vcpus(&self) -> usize {
2543         self.present_vcpus() as usize
2544     }
2545 }
2546 
2547 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2548 impl Elf64Writable for CpuManager {}
2549 
2550 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2551 impl CpuElf64Writable for CpuManager {
2552     fn cpu_write_elf64_note(
2553         &mut self,
2554         dump_state: &DumpState,
2555     ) -> std::result::Result<(), GuestDebuggableError> {
2556         let mut coredump_file = dump_state.file.as_ref().unwrap();
2557         for vcpu in &self.vcpus {
2558             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2559             let mut pos: usize = 0;
2560             let mut buf = vec![0; note_size as usize];
2561             let descsz = size_of::<X86_64ElfPrStatus>();
2562             let vcpu_id = vcpu.lock().unwrap().id;
2563 
2564             let note = Elf64_Nhdr {
2565                 n_namesz: COREDUMP_NAME_SIZE,
2566                 n_descsz: descsz as u32,
2567                 n_type: NT_PRSTATUS,
2568             };
2569 
2570             let bytes: &[u8] = note.as_slice();
2571             buf.splice(0.., bytes.to_vec());
2572             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2573             buf.resize(pos + 4, 0);
2574             buf.splice(pos.., "CORE".to_string().into_bytes());
2575 
2576             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2577             buf.resize(pos + 32 + 4, 0);
2578             let pid = vcpu_id as u64;
2579             let bytes: &[u8] = pid.as_slice();
2580             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2581 
2582             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2583 
2584             let orig_rax: u64 = 0;
2585             let gregs = self.vcpus[usize::from(vcpu_id)]
2586                 .lock()
2587                 .unwrap()
2588                 .vcpu
2589                 .get_regs()
2590                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2591 
2592             let regs1 = [
2593                 gregs.get_r15(),
2594                 gregs.get_r14(),
2595                 gregs.get_r13(),
2596                 gregs.get_r12(),
2597                 gregs.get_rbp(),
2598                 gregs.get_rbx(),
2599                 gregs.get_r11(),
2600                 gregs.get_r10(),
2601             ];
2602             let regs2 = [
2603                 gregs.get_r9(),
2604                 gregs.get_r8(),
2605                 gregs.get_rax(),
2606                 gregs.get_rcx(),
2607                 gregs.get_rdx(),
2608                 gregs.get_rsi(),
2609                 gregs.get_rdi(),
2610                 orig_rax,
2611             ];
2612 
2613             let sregs = self.vcpus[usize::from(vcpu_id)]
2614                 .lock()
2615                 .unwrap()
2616                 .vcpu
2617                 .get_sregs()
2618                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2619 
2620             debug!(
2621                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2622                 gregs.get_rip(),
2623                 gregs.get_rsp(),
2624                 sregs.gs.base,
2625                 sregs.cs.selector,
2626                 sregs.ss.selector,
2627                 sregs.ds.selector,
2628             );
2629 
2630             let regs = X86_64UserRegs {
2631                 regs1,
2632                 regs2,
2633                 rip: gregs.get_rip(),
2634                 cs: sregs.cs.selector as u64,
2635                 eflags: gregs.get_rflags(),
2636                 rsp: gregs.get_rsp(),
2637                 ss: sregs.ss.selector as u64,
2638                 fs_base: sregs.fs.base,
2639                 gs_base: sregs.gs.base,
2640                 ds: sregs.ds.selector as u64,
2641                 es: sregs.es.selector as u64,
2642                 fs: sregs.fs.selector as u64,
2643                 gs: sregs.gs.selector as u64,
2644             };
2645 
2646             // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
2647             let bytes: &[u8] = regs.as_slice();
2648             buf.resize(note_size as usize, 0);
2649             buf.splice(pos.., bytes.to_vec());
2650             buf.resize(note_size as usize, 0);
2651 
2652             coredump_file
2653                 .write(&buf)
2654                 .map_err(GuestDebuggableError::CoredumpFile)?;
2655         }
2656 
2657         Ok(())
2658     }
2659 
2660     fn cpu_write_vmm_note(
2661         &mut self,
2662         dump_state: &DumpState,
2663     ) -> std::result::Result<(), GuestDebuggableError> {
2664         let mut coredump_file = dump_state.file.as_ref().unwrap();
2665         for vcpu in &self.vcpus {
2666             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2667             let mut pos: usize = 0;
2668             let mut buf = vec![0; note_size as usize];
2669             let descsz = size_of::<DumpCpusState>();
2670             let vcpu_id = vcpu.lock().unwrap().id;
2671 
2672             let note = Elf64_Nhdr {
2673                 n_namesz: COREDUMP_NAME_SIZE,
2674                 n_descsz: descsz as u32,
2675                 n_type: 0,
2676             };
2677 
2678             let bytes: &[u8] = note.as_slice();
2679             buf.splice(0.., bytes.to_vec());
2680             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2681 
2682             buf.resize(pos + 4, 0);
2683             buf.splice(pos.., "QEMU".to_string().into_bytes());
2684 
2685             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2686 
2687             let gregs = self.vcpus[usize::from(vcpu_id)]
2688                 .lock()
2689                 .unwrap()
2690                 .vcpu
2691                 .get_regs()
2692                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2693 
2694             let regs1 = [
2695                 gregs.get_rax(),
2696                 gregs.get_rbx(),
2697                 gregs.get_rcx(),
2698                 gregs.get_rdx(),
2699                 gregs.get_rsi(),
2700                 gregs.get_rdi(),
2701                 gregs.get_rsp(),
2702                 gregs.get_rbp(),
2703             ];
2704 
2705             let regs2 = [
2706                 gregs.get_r8(),
2707                 gregs.get_r9(),
2708                 gregs.get_r10(),
2709                 gregs.get_r11(),
2710                 gregs.get_r12(),
2711                 gregs.get_r13(),
2712                 gregs.get_r14(),
2713                 gregs.get_r15(),
2714             ];
2715 
2716             let sregs = self.vcpus[usize::from(vcpu_id)]
2717                 .lock()
2718                 .unwrap()
2719                 .vcpu
2720                 .get_sregs()
2721                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2722 
2723             let mut msrs = vec![MsrEntry {
2724                 index: msr_index::MSR_KERNEL_GS_BASE,
2725                 ..Default::default()
2726             }];
2727 
2728             self.vcpus[vcpu_id as usize]
2729                 .lock()
2730                 .unwrap()
2731                 .vcpu
2732                 .get_msrs(&mut msrs)
2733                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2734             let kernel_gs_base = msrs[0].data;
2735 
2736             let cs = CpuSegment::new(sregs.cs);
2737             let ds = CpuSegment::new(sregs.ds);
2738             let es = CpuSegment::new(sregs.es);
2739             let fs = CpuSegment::new(sregs.fs);
2740             let gs = CpuSegment::new(sregs.gs);
2741             let ss = CpuSegment::new(sregs.ss);
2742             let ldt = CpuSegment::new(sregs.ldt);
2743             let tr = CpuSegment::new(sregs.tr);
2744             let gdt = CpuSegment::new_from_table(sregs.gdt);
2745             let idt = CpuSegment::new_from_table(sregs.idt);
2746             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2747             let regs = DumpCpusState {
2748                 version: 1,
2749                 size: size_of::<DumpCpusState>() as u32,
2750                 regs1,
2751                 regs2,
2752                 rip: gregs.get_rip(),
2753                 rflags: gregs.get_rflags(),
2754                 cs,
2755                 ds,
2756                 es,
2757                 fs,
2758                 gs,
2759                 ss,
2760                 ldt,
2761                 tr,
2762                 gdt,
2763                 idt,
2764                 cr,
2765                 kernel_gs_base,
2766             };
2767 
2768             let bytes: &[u8] = regs.as_slice();
2769             buf.resize(note_size as usize, 0);
2770             buf.splice(pos.., bytes.to_vec());
2771             buf.resize(note_size as usize, 0);
2772 
2773             coredump_file
2774                 .write(&buf)
2775                 .map_err(GuestDebuggableError::CoredumpFile)?;
2776         }
2777 
2778         Ok(())
2779     }
2780 }
2781 
2782 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2783 #[cfg(test)]
2784 mod tests {
2785     use arch::layout::BOOT_STACK_POINTER;
2786     use arch::layout::ZERO_PAGE_START;
2787     use arch::x86_64::interrupts::*;
2788     use arch::x86_64::regs::*;
2789     use hypervisor::arch::x86::{FpuState, LapicState};
2790     use hypervisor::StandardRegisters;
2791     use linux_loader::loader::bootparam::setup_header;
2792 
2793     #[test]
2794     fn test_setlint() {
2795         let hv = hypervisor::new().unwrap();
2796         let vm = hv.create_vm().expect("new VM fd creation failed");
2797         assert!(hv.check_required_extensions().is_ok());
2798         // Calling get_lapic will fail if there is no irqchip before hand.
2799         assert!(vm.create_irq_chip().is_ok());
2800         let vcpu = vm.create_vcpu(0, None).unwrap();
2801         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2802 
2803         // Compute the value that is expected to represent LVT0 and LVT1.
2804         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2805         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2806         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2807         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2808 
2809         set_lint(&vcpu).unwrap();
2810 
2811         // Compute the value that represents LVT0 and LVT1 after set_lint.
2812         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2813         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2814         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2815         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2816         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2817     }
2818 
2819     #[test]
2820     fn test_setup_fpu() {
2821         let hv = hypervisor::new().unwrap();
2822         let vm = hv.create_vm().expect("new VM fd creation failed");
2823         let vcpu = vm.create_vcpu(0, None).unwrap();
2824         setup_fpu(&vcpu).unwrap();
2825 
2826         let expected_fpu: FpuState = FpuState {
2827             fcw: 0x37f,
2828             mxcsr: 0x1f80,
2829             ..Default::default()
2830         };
2831         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2832         // TODO: auto-generate kvm related structures with PartialEq on.
2833         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2834         // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
2835         // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
2836         // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
2837         // remove it at all.
2838         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2839     }
2840 
2841     #[test]
2842     fn test_setup_msrs() {
2843         use hypervisor::arch::x86::{msr_index, MsrEntry};
2844 
2845         let hv = hypervisor::new().unwrap();
2846         let vm = hv.create_vm().expect("new VM fd creation failed");
2847         let vcpu = vm.create_vcpu(0, None).unwrap();
2848         setup_msrs(&vcpu).unwrap();
2849 
2850         // This test will check against the last MSR entry configured (the tenth one).
2851         // See create_msr_entries for details.
2852         let mut msrs = vec![MsrEntry {
2853             index: msr_index::MSR_IA32_MISC_ENABLE,
2854             ..Default::default()
2855         }];
2856 
2857         // get_msrs returns the number of msrs that it succeed in reading. We only want to read 1
2858         // in this test case scenario.
2859         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2860         assert_eq!(read_msrs, 1);
2861 
2862         // Official entries that were setup when we did setup_msrs. We need to assert that the
2863         // tenth one (i.e the one with index msr_index::MSR_IA32_MISC_ENABLE has the data we
2864         // expect.
2865         let entry_vec = vcpu.boot_msr_entries();
2866         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2867     }
2868 
2869     #[test]
2870     fn test_setup_regs_for_pvh() {
2871         let hv = hypervisor::new().unwrap();
2872         let vm = hv.create_vm().expect("new VM fd creation failed");
2873         let vcpu = vm.create_vcpu(0, None).unwrap();
2874 
2875         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2876         expected_regs.set_rflags(0x0000000000000002u64);
2877         expected_regs.set_rbx(arch::layout::PVH_INFO_START.0);
2878         expected_regs.set_rip(1);
2879 
2880         setup_regs(
2881             &vcpu,
2882             arch::EntryPoint {
2883                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2884                 setup_header: None,
2885             },
2886         )
2887         .unwrap();
2888 
2889         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2890         assert_eq!(actual_regs, expected_regs);
2891     }
2892 
2893     #[test]
2894     fn test_setup_regs_for_bzimage() {
2895         let hv = hypervisor::new().unwrap();
2896         let vm = hv.create_vm().expect("new VM fd creation failed");
2897         let vcpu = vm.create_vcpu(0, None).unwrap();
2898 
2899         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2900         expected_regs.set_rflags(0x0000000000000002u64);
2901         expected_regs.set_rip(1);
2902         expected_regs.set_rsp(BOOT_STACK_POINTER.0);
2903         expected_regs.set_rsi(ZERO_PAGE_START.0);
2904 
2905         setup_regs(
2906             &vcpu,
2907             arch::EntryPoint {
2908                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2909                 setup_header: Some(setup_header {
2910                     ..Default::default()
2911                 }),
2912             },
2913         )
2914         .unwrap();
2915 
2916         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2917         assert_eq!(actual_regs, expected_regs);
2918     }
2919 }
2920 
2921 #[cfg(target_arch = "aarch64")]
2922 #[cfg(test)]
2923 mod tests {
2924     use arch::{aarch64::regs, layout};
2925     use hypervisor::kvm::aarch64::is_system_register;
2926     use hypervisor::kvm::kvm_bindings::{
2927         kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2928         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2929     };
2930     use hypervisor::{arm64_core_reg_id, offset_of};
2931     use std::mem;
2932 
2933     #[test]
2934     fn test_setup_regs() {
2935         let hv = hypervisor::new().unwrap();
2936         let vm = hv.create_vm().unwrap();
2937         let vcpu = vm.create_vcpu(0, None).unwrap();
2938 
2939         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2940         // Must fail when vcpu is not initialized yet.
2941         assert!(res.is_err());
2942 
2943         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2944         vm.get_preferred_target(&mut kvi).unwrap();
2945         vcpu.vcpu_init(&kvi).unwrap();
2946 
2947         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2948     }
2949 
2950     #[test]
2951     fn test_read_mpidr() {
2952         let hv = hypervisor::new().unwrap();
2953         let vm = hv.create_vm().unwrap();
2954         let vcpu = vm.create_vcpu(0, None).unwrap();
2955         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2956         vm.get_preferred_target(&mut kvi).unwrap();
2957 
2958         // Must fail when vcpu is not initialized yet.
2959         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2960 
2961         vcpu.vcpu_init(&kvi).unwrap();
2962         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2963     }
2964 
2965     #[test]
2966     fn test_is_system_register() {
2967         let offset = offset_of!(user_pt_regs, pc);
2968         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2969         assert!(!is_system_register(regid));
2970         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2971         assert!(is_system_register(regid));
2972     }
2973 
2974     #[test]
2975     fn test_save_restore_core_regs() {
2976         let hv = hypervisor::new().unwrap();
2977         let vm = hv.create_vm().unwrap();
2978         let vcpu = vm.create_vcpu(0, None).unwrap();
2979         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2980         vm.get_preferred_target(&mut kvi).unwrap();
2981 
2982         // Must fail when vcpu is not initialized yet.
2983         let res = vcpu.get_regs();
2984         assert!(res.is_err());
2985         assert_eq!(
2986             format!("{}", res.unwrap_err()),
2987             "Failed to get core register: Exec format error (os error 8)"
2988         );
2989 
2990         let mut state = kvm_regs::default();
2991         let res = vcpu.set_regs(&state);
2992         assert!(res.is_err());
2993         assert_eq!(
2994             format!("{}", res.unwrap_err()),
2995             "Failed to set core register: Exec format error (os error 8)"
2996         );
2997 
2998         vcpu.vcpu_init(&kvi).unwrap();
2999         let res = vcpu.get_regs();
3000         assert!(res.is_ok());
3001         state = res.unwrap();
3002         assert_eq!(state.regs.pstate, 0x3C5);
3003 
3004         assert!(vcpu.set_regs(&state).is_ok());
3005     }
3006 
3007     #[test]
3008     fn test_get_set_mpstate() {
3009         let hv = hypervisor::new().unwrap();
3010         let vm = hv.create_vm().unwrap();
3011         let vcpu = vm.create_vcpu(0, None).unwrap();
3012         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
3013         vm.get_preferred_target(&mut kvi).unwrap();
3014 
3015         let res = vcpu.get_mp_state();
3016         assert!(res.is_ok());
3017         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
3018     }
3019 }
3020