xref: /cloud-hypervisor/vmm/src/cpu.rs (revision adb318f4cd0079246b3cb07e01c4e978330445d2)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::CpusConfig;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use crate::coredump::{
17     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
18     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
19     NT_PRSTATUS,
20 };
21 #[cfg(feature = "guest_debug")]
22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
23 #[cfg(target_arch = "x86_64")]
24 use crate::memory_manager::MemoryManager;
25 use crate::seccomp_filters::{get_seccomp_filter, Thread};
26 #[cfg(target_arch = "x86_64")]
27 use crate::vm::physical_bits;
28 use crate::GuestMemoryMmap;
29 use crate::CPU_MANAGER_SNAPSHOT_ID;
30 use acpi_tables::{aml, sdt::Sdt, Aml};
31 use anyhow::anyhow;
32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
33 use arch::aarch64::regs;
34 #[cfg(target_arch = "x86_64")]
35 use arch::x86_64::get_x2apic_id;
36 use arch::EntryPoint;
37 use arch::NumaNodes;
38 #[cfg(target_arch = "aarch64")]
39 use devices::gic::Gic;
40 use devices::interrupt_controller::InterruptController;
41 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
42 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
44 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
45 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
46 use hypervisor::aarch64::StandardRegisters;
47 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
48 use hypervisor::arch::x86::msr_index;
49 #[cfg(target_arch = "x86_64")]
50 use hypervisor::arch::x86::CpuIdEntry;
51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
52 use hypervisor::arch::x86::MsrEntry;
53 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
54 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
55 #[cfg(target_arch = "aarch64")]
56 use hypervisor::kvm::kvm_bindings;
57 #[cfg(all(target_arch = "aarch64", feature = "kvm"))]
58 use hypervisor::kvm::kvm_ioctls::Cap;
59 #[cfg(feature = "tdx")]
60 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
61 #[cfg(target_arch = "x86_64")]
62 use hypervisor::CpuVendor;
63 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
64 use libc::{c_void, siginfo_t};
65 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
66 use linux_loader::elf::Elf64_Nhdr;
67 use seccompiler::{apply_filter, SeccompAction};
68 use std::collections::BTreeMap;
69 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
70 use std::io::Write;
71 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
72 use std::mem::size_of;
73 use std::os::unix::thread::JoinHandleExt;
74 use std::sync::atomic::{AtomicBool, Ordering};
75 use std::sync::{Arc, Barrier, Mutex};
76 use std::{cmp, io, result, thread};
77 use thiserror::Error;
78 use tracer::trace_scoped;
79 use vm_device::BusDevice;
80 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
81 use vm_memory::ByteValued;
82 #[cfg(feature = "guest_debug")]
83 use vm_memory::{Bytes, GuestAddressSpace};
84 use vm_memory::{GuestAddress, GuestMemoryAtomic};
85 use vm_migration::{
86     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
87     Transportable,
88 };
89 use vmm_sys_util::eventfd::EventFd;
90 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
91 use zerocopy::AsBytes;
92 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
93 /// Extract the specified bits of a 64-bit integer.
94 /// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
95 /// the following expression should return 3 (`0b11`):
96 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
97 ///
98 macro_rules! extract_bits_64 {
99     ($value: tt, $offset: tt, $length: tt) => {
100         ($value >> $offset) & (!0u64 >> (64 - $length))
101     };
102 }
103 
104 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
105 macro_rules! extract_bits_64_without_offset {
106     ($value: tt, $length: tt) => {
107         $value & (!0u64 >> (64 - $length))
108     };
109 }
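
// Illustrative usage of the two macros above (an assumed test module, not
// part of the upstream file):
#[cfg(all(test, target_arch = "aarch64", feature = "guest_debug"))]
mod extract_bits_examples {
    #[test]
    fn extract_examples() {
        // The 2-bit field at offset 1 of 0b0110 is 0b11.
        assert_eq!(extract_bits_64!(0b0000_0110u64, 1, 2), 0b11);
        // Keeping only the low 3 bits of 0b1110 yields 0b110.
        assert_eq!(extract_bits_64_without_offset!(0b0000_1110u64, 3), 0b110);
    }
}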
110 
111 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
112 
113 #[derive(Debug, Error)]
114 pub enum Error {
115     #[error("Error creating vCPU: {0}")]
116     VcpuCreate(#[source] anyhow::Error),
117 
118     #[error("Error running vCPU: {0}")]
119     VcpuRun(#[source] anyhow::Error),
120 
121     #[error("Error spawning vCPU thread: {0}")]
122     VcpuSpawn(#[source] io::Error),
123 
124     #[error("Error generating common CPUID: {0}")]
125     CommonCpuId(#[source] arch::Error),
126 
127     #[error("Error configuring vCPU: {0}")]
128     VcpuConfiguration(#[source] arch::Error),
129 
130     #[error("vCPU removal still pending")]
131     VcpuPendingRemovedVcpu,
132 
133     #[cfg(target_arch = "aarch64")]
134     #[error("Error fetching preferred target: {0}")]
135     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
136 
137     #[cfg(target_arch = "aarch64")]
138     #[error("Error initialising vCPU: {0}")]
139     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
140 
141     #[error("Failed to join on vCPU threads: {0:?}")]
142     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
143 
144     #[error("Error adding CpuManager to MMIO bus: {0}")]
145     BusError(#[source] vm_device::BusError),
146 
147     #[error("Requested vCPUs exceed maximum")]
148     DesiredVCpuCountExceedsMax,
149 
150     #[error("Cannot create seccomp filter: {0}")]
151     CreateSeccompFilter(#[source] seccompiler::Error),
152 
153     #[error("Cannot apply seccomp filter: {0}")]
154     ApplySeccompFilter(#[source] seccompiler::Error),
155 
156     #[error("Error starting vCPU after restore: {0}")]
157     StartRestoreVcpu(#[source] anyhow::Error),
158 
159     #[error("Unexpected VmExit")]
160     UnexpectedVmExit,
161 
162     #[error("Failed to allocate MMIO address for CpuManager")]
163     AllocateMmmioAddress,
164 
165     #[cfg(feature = "tdx")]
166     #[error("Error initializing TDX: {0}")]
167     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
168 
169     #[cfg(target_arch = "aarch64")]
170     #[error("Error initializing PMU: {0}")]
171     InitPmu(#[source] hypervisor::HypervisorCpuError),
172 
173     #[cfg(feature = "guest_debug")]
174     #[error("Error during CPU debug: {0}")]
175     CpuDebug(#[source] hypervisor::HypervisorCpuError),
176 
177     #[cfg(feature = "guest_debug")]
178     #[error("Error translating virtual address: {0}")]
179     TranslateVirtualAddress(#[source] anyhow::Error),
180 
181     #[cfg(target_arch = "x86_64")]
182     #[error("Error setting up AMX: {0}")]
183     AmxEnable(#[source] anyhow::Error),
184 
185     #[error("Maximum number of vCPUs exceeds host limit")]
186     MaximumVcpusExceeded,
187 
188     #[cfg(feature = "sev_snp")]
189     #[error("Failed to set SEV control register: {0}")]
190     SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),
191 }
192 pub type Result<T> = result::Result<T, Error>;
193 
194 #[cfg(target_arch = "x86_64")]
195 #[allow(dead_code)]
196 #[repr(packed)]
197 #[derive(AsBytes)]
198 struct LocalX2Apic {
199     pub r#type: u8,
200     pub length: u8,
201     pub _reserved: u16,
202     pub apic_id: u32,
203     pub flags: u32,
204     pub processor_id: u32,
205 }
206 
207 #[allow(dead_code)]
208 #[repr(packed)]
209 #[derive(Default, AsBytes)]
210 struct Ioapic {
211     pub r#type: u8,
212     pub length: u8,
213     pub ioapic_id: u8,
214     _reserved: u8,
215     pub apic_address: u32,
216     pub gsi_base: u32,
217 }
218 
219 #[cfg(target_arch = "aarch64")]
220 #[allow(dead_code)]
221 #[repr(packed)]
222 #[derive(AsBytes)]
223 struct GicC {
224     pub r#type: u8,
225     pub length: u8,
226     pub reserved0: u16,
227     pub cpu_interface_number: u32,
228     pub uid: u32,
229     pub flags: u32,
230     pub parking_version: u32,
231     pub performance_interrupt: u32,
232     pub parked_address: u64,
233     pub base_address: u64,
234     pub gicv_base_address: u64,
235     pub gich_base_address: u64,
236     pub vgic_interrupt: u32,
237     pub gicr_base_address: u64,
238     pub mpidr: u64,
239     pub proc_power_effi_class: u8,
240     pub reserved1: u8,
241     pub spe_overflow_interrupt: u16,
242 }
243 
244 #[cfg(target_arch = "aarch64")]
245 #[allow(dead_code)]
246 #[repr(packed)]
247 #[derive(AsBytes)]
248 struct GicD {
249     pub r#type: u8,
250     pub length: u8,
251     pub reserved0: u16,
252     pub gic_id: u32,
253     pub base_address: u64,
254     pub global_irq_base: u32,
255     pub version: u8,
256     pub reserved1: [u8; 3],
257 }
258 
259 #[cfg(target_arch = "aarch64")]
260 #[allow(dead_code)]
261 #[repr(packed)]
262 #[derive(AsBytes)]
263 struct GicR {
264     pub r#type: u8,
265     pub length: u8,
266     pub reserved: u16,
267     pub base_address: u64,
268     pub range_length: u32,
269 }
270 
271 #[cfg(target_arch = "aarch64")]
272 #[allow(dead_code)]
273 #[repr(packed)]
274 #[derive(AsBytes)]
275 struct GicIts {
276     pub r#type: u8,
277     pub length: u8,
278     pub reserved0: u16,
279     pub translation_id: u32,
280     pub base_address: u64,
281     pub reserved1: u32,
282 }
283 
284 #[cfg(target_arch = "aarch64")]
285 #[allow(dead_code)]
286 #[repr(packed)]
287 #[derive(AsBytes)]
288 struct ProcessorHierarchyNode {
289     pub r#type: u8,
290     pub length: u8,
291     pub reserved: u16,
292     pub flags: u32,
293     pub parent: u32,
294     pub acpi_processor_id: u32,
295     pub num_private_resources: u32,
296 }
297 
298 #[allow(dead_code)]
299 #[repr(packed)]
300 #[derive(Default, AsBytes)]
301 struct InterruptSourceOverride {
302     pub r#type: u8,
303     pub length: u8,
304     pub bus: u8,
305     pub source: u8,
306     pub gsi: u32,
307     pub flags: u16,
308 }
309 
310 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
311 macro_rules! round_up {
312     ($n:expr, $d:expr) => {
313         (($n + $d - 1) / $d) * $d
314     };
315 }
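
// Illustrative usage of the rounding helper above (an assumed test module,
// not part of the upstream file): values are rounded up to the next
// multiple of the given alignment.
#[cfg(all(test, target_arch = "x86_64", feature = "guest_debug"))]
mod round_up_examples {
    #[test]
    fn rounds_up_to_next_multiple() {
        assert_eq!(round_up!(9u64, 4u64), 12);
        assert_eq!(round_up!(40u64, 4u64), 40);
    }
}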
316 
317 /// A wrapper around creating and using a hypervisor-abstracted vCPU.
318 pub struct Vcpu {
319     // The hypervisor abstracted CPU.
320     vcpu: Arc<dyn hypervisor::Vcpu>,
321     id: u8,
322     #[cfg(target_arch = "aarch64")]
323     mpidr: u64,
324     saved_state: Option<CpuState>,
325     #[cfg(target_arch = "x86_64")]
326     vendor: CpuVendor,
327 }
328 
329 impl Vcpu {
330     /// Constructs a new VCPU for `vm`.
331     ///
332     /// # Arguments
333     ///
334     /// * `id` - Represents the CPU number between [0, max vcpus).
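    /// * `apic_id` - The APIC ID the vCPU is created with (the x2APIC ID on x86_64).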
335     /// * `vm` - The virtual machine this vcpu will get attached to.
336     /// * `vm_ops` - Optional object for exit handling.
337     /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
338     pub fn new(
339         id: u8,
340         apic_id: u8,
341         vm: &Arc<dyn hypervisor::Vm>,
342         vm_ops: Option<Arc<dyn VmOps>>,
343         #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
344     ) -> Result<Self> {
345         let vcpu = vm
346             .create_vcpu(apic_id, vm_ops)
347             .map_err(|e| Error::VcpuCreate(e.into()))?;
348         // Initially the cpuid per vCPU is the one supported by this VM.
349         Ok(Vcpu {
350             vcpu,
351             id,
352             #[cfg(target_arch = "aarch64")]
353             mpidr: 0,
354             saved_state: None,
355             #[cfg(target_arch = "x86_64")]
356             vendor: cpu_vendor,
357         })
358     }
359 
360     /// Configures a vCPU. This should be called once per vCPU, right after creation.
361     ///
362     /// # Arguments
363     ///
364     /// * `boot_setup` - Optional tuple of the kernel entry point (address and
365     ///   boot protocol used) and the guest memory.
366     /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
367     pub fn configure(
368         &mut self,
369         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
370         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
371         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
372         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
373         #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
374     ) -> Result<()> {
375         #[cfg(target_arch = "aarch64")]
376         {
377             self.init(vm)?;
378             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
379                 .map_err(Error::VcpuConfiguration)?;
380         }
381         info!("Configuring vCPU: cpu_id = {}", self.id);
382         #[cfg(target_arch = "x86_64")]
383         arch::configure_vcpu(
384             &self.vcpu,
385             self.id,
386             boot_setup,
387             cpuid,
388             kvm_hyperv,
389             self.vendor,
390             topology,
391         )
392         .map_err(Error::VcpuConfiguration)?;
393 
394         Ok(())
395     }
396 
397     /// Gets the MPIDR register value.
398     #[cfg(target_arch = "aarch64")]
399     pub fn get_mpidr(&self) -> u64 {
400         self.mpidr
401     }
402 
403     /// Gets the saved vCPU state.
404     #[cfg(target_arch = "aarch64")]
405     pub fn get_saved_state(&self) -> Option<CpuState> {
406         self.saved_state.clone()
407     }
408 
409     /// Initializes an aarch64-specific vCPU for booting Linux.
410     #[cfg(target_arch = "aarch64")]
411     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
412         let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
413 
414         // This reads back the kernel's preferred target type.
415         vm.get_preferred_target(&mut kvi)
416             .map_err(Error::VcpuArmPreferredTarget)?;
417         // We already checked that the capability is supported.
418         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
419         if vm
420             .as_any()
421             .downcast_ref::<hypervisor::kvm::KvmVm>()
422             .unwrap()
423             .check_extension(Cap::ArmPmuV3)
424         {
425             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
426         }
427         // Non-boot cpus are powered off initially.
428         if self.id > 0 {
429             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
430         }
431         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
432     }
433 
434     /// Runs the VCPU until it exits, returning the reason.
435     ///
436     /// Note that the state of the VCPU and associated VM must be set up first for this to do
437     /// anything useful.
438     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
439         self.vcpu.run()
440     }
441 
442     #[cfg(feature = "sev_snp")]
443     pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
444         self.vcpu
445             .set_sev_control_register(vmsa_pfn)
446             .map_err(Error::SetSevControlRegister)
447     }
448 }
449 
450 impl Pausable for Vcpu {}
451 impl Snapshottable for Vcpu {
452     fn id(&self) -> String {
453         self.id.to_string()
454     }
455 
456     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
457         let saved_state = self
458             .vcpu
459             .state()
460             .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;
461 
462         self.saved_state = Some(saved_state.clone());
463 
464         Ok(Snapshot::from_data(SnapshotData::new_from_state(
465             &saved_state,
466         )?))
467     }
468 }
469 
470 pub struct CpuManager {
471     config: CpusConfig,
472     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
473     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
474     #[cfg(target_arch = "x86_64")]
475     cpuid: Vec<CpuIdEntry>,
476     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
477     vm: Arc<dyn hypervisor::Vm>,
478     vcpus_kill_signalled: Arc<AtomicBool>,
479     vcpus_pause_signalled: Arc<AtomicBool>,
480     exit_evt: EventFd,
481     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
482     reset_evt: EventFd,
483     #[cfg(feature = "guest_debug")]
484     vm_debug_evt: EventFd,
485     vcpu_states: Vec<VcpuState>,
486     selected_cpu: u8,
487     vcpus: Vec<Arc<Mutex<Vcpu>>>,
488     seccomp_action: SeccompAction,
489     vm_ops: Arc<dyn VmOps>,
490     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
491     acpi_address: Option<GuestAddress>,
492     proximity_domain_per_cpu: BTreeMap<u8, u32>,
493     affinity: BTreeMap<u8, Vec<usize>>,
494     dynamic: bool,
495     hypervisor: Arc<dyn hypervisor::Hypervisor>,
496     #[cfg(feature = "sev_snp")]
497     sev_snp_enabled: bool,
498 }
499 
500 const CPU_ENABLE_FLAG: usize = 0;
501 const CPU_INSERTING_FLAG: usize = 1;
502 const CPU_REMOVING_FLAG: usize = 2;
503 const CPU_EJECT_FLAG: usize = 3;
504 
505 const CPU_STATUS_OFFSET: u64 = 4;
506 const CPU_SELECTION_OFFSET: u64 = 0;
507 
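// Illustrative only (an assumed test module, not part of the upstream file):
// the status byte exposed at CPU_STATUS_OFFSET is a bitmap of the flag bit
// positions defined above.
#[cfg(test)]
mod cpu_status_flag_examples {
    use super::*;

    #[test]
    fn status_byte_layout() {
        // A vCPU that is active and currently being inserted.
        let status: u8 = (1 << CPU_ENABLE_FLAG) | (1 << CPU_INSERTING_FLAG);
        assert_eq!(status, 0b0000_0011);
        assert_eq!(status & (1 << CPU_REMOVING_FLAG), 0);
        assert_eq!(status & (1 << CPU_EJECT_FLAG), 0);
    }
}
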
508 impl BusDevice for CpuManager {
509     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
510         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
511         data.fill(0);
512 
513         match offset {
514             CPU_SELECTION_OFFSET => {
515                 data[0] = self.selected_cpu;
516             }
517             CPU_STATUS_OFFSET => {
518                 if self.selected_cpu < self.max_vcpus() {
519                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
520                     if state.active() {
521                         data[0] |= 1 << CPU_ENABLE_FLAG;
522                     }
523                     if state.inserting {
524                         data[0] |= 1 << CPU_INSERTING_FLAG;
525                     }
526                     if state.removing {
527                         data[0] |= 1 << CPU_REMOVING_FLAG;
528                     }
529                 } else {
530                     warn!("Out of range vCPU id: {}", self.selected_cpu);
531                 }
532             }
533             _ => {
534                 warn!(
535                     "Unexpected offset for accessing CPU manager device: {:#x}",
536                     offset
537                 );
538             }
539         }
540     }
541 
542     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
543         match offset {
544             CPU_SELECTION_OFFSET => {
545                 self.selected_cpu = data[0];
546             }
547             CPU_STATUS_OFFSET => {
548                 if self.selected_cpu < self.max_vcpus() {
549                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
550                     // The ACPI code writes back a 1 to acknowledge the insertion
551                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
552                         && state.inserting
553                     {
554                         state.inserting = false;
555                     }
556                     // Ditto for removal
557                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
558                         && state.removing
559                     {
560                         state.removing = false;
561                     }
562                     // Trigger removal of vCPU
563                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
564                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
565                             error!("Error removing vCPU: {:?}", e);
566                         }
567                     }
568                 } else {
569                     warn!("Out of range vCPU id: {}", self.selected_cpu);
570                 }
571             }
572             _ => {
573                 warn!(
574                     "Unexpected offset for accessing CPU manager device: {:#x}",
575                     offset
576                 );
577             }
578         }
579         None
580     }
581 }
582 
583 #[derive(Default)]
584 struct VcpuState {
585     inserting: bool,
586     removing: bool,
587     pending_removal: Arc<AtomicBool>,
588     handle: Option<thread::JoinHandle<()>>,
589     kill: Arc<AtomicBool>,
590     vcpu_run_interrupted: Arc<AtomicBool>,
591     paused: Arc<AtomicBool>,
592 }
593 
594 impl VcpuState {
595     fn active(&self) -> bool {
596         self.handle.is_some()
597     }
598 
599     fn signal_thread(&self) {
600         if let Some(handle) = self.handle.as_ref() {
601             loop {
602                 // SAFETY: FFI call with correct arguments
603                 unsafe {
604                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
605                 }
606                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
607                     break;
608                 } else {
609                     // This is more effective than thread::yield_now() at
610                     // avoiding a priority inversion with the vCPU thread
611                     thread::sleep(std::time::Duration::from_millis(1));
612                 }
613             }
614         }
615     }
616 
617     fn join_thread(&mut self) -> Result<()> {
618         if let Some(handle) = self.handle.take() {
619             handle.join().map_err(Error::ThreadCleanup)?
620         }
621 
622         Ok(())
623     }
624 
625     fn unpark_thread(&self) {
626         if let Some(handle) = self.handle.as_ref() {
627             handle.thread().unpark()
628         }
629     }
630 }
631 
632 impl CpuManager {
633     #[allow(unused_variables)]
634     #[allow(clippy::too_many_arguments)]
635     pub fn new(
636         config: &CpusConfig,
637         vm: Arc<dyn hypervisor::Vm>,
638         exit_evt: EventFd,
639         reset_evt: EventFd,
640         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
641         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
642         seccomp_action: SeccompAction,
643         vm_ops: Arc<dyn VmOps>,
644         #[cfg(feature = "tdx")] tdx_enabled: bool,
645         numa_nodes: &NumaNodes,
646         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
647     ) -> Result<Arc<Mutex<CpuManager>>> {
648         if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
649             return Err(Error::MaximumVcpusExceeded);
650         }
651 
652         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
653         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
654         let hypervisor_type = hypervisor.hypervisor_type();
655         #[cfg(target_arch = "x86_64")]
656         let cpu_vendor = hypervisor.get_cpu_vendor();
657 
658         #[cfg(target_arch = "x86_64")]
659         if config.features.amx {
660             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
661             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
662             const XFEATURE_XTILEDATA: usize = 18;
663             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
664 
665             // SAFETY: the syscall is only modifying kernel internal
666             // data structures that the kernel is itself expected to safeguard.
667             let amx_tile = unsafe {
668                 libc::syscall(
669                     libc::SYS_arch_prctl,
670                     ARCH_REQ_XCOMP_GUEST_PERM,
671                     XFEATURE_XTILEDATA,
672                 )
673             };
674 
675             if amx_tile != 0 {
676                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
677             } else {
678                 let mask: usize = 0;
679                 // SAFETY: the mask is only written by the kernel through the pointer
680                 // passed to the syscall below (hence not `mut`) and isn't in use elsewhere.
681                 let result = unsafe {
682                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
683                 };
684                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
685                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
686                 }
687             }
688         }
689 
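        // Invert the NUMA topology into a vCPU -> proximity domain map. For
        // example (illustrative values), nodes {0: cpus [0, 1], 1: cpus [2]}
        // become {0 => 0, 1 => 0, 2 => 1}.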
690         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
691             let mut cpu_list = Vec::new();
692             for (proximity_domain, numa_node) in numa_nodes.iter() {
693                 for cpu in numa_node.cpus.iter() {
694                     cpu_list.push((*cpu, *proximity_domain))
695                 }
696             }
697             cpu_list
698         }
699         .into_iter()
700         .collect();
701 
702         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
703             cpu_affinity
704                 .iter()
705                 .map(|a| (a.vcpu, a.host_cpus.clone()))
706                 .collect()
707         } else {
708             BTreeMap::new()
709         };
710 
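        // Resizing the vCPU set at runtime ("dynamic" CPUs) is not supported
        // for TDX guests, so it is disabled when TDX is enabled.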
711         #[cfg(feature = "tdx")]
712         let dynamic = !tdx_enabled;
713         #[cfg(not(feature = "tdx"))]
714         let dynamic = true;
715 
716         Ok(Arc::new(Mutex::new(CpuManager {
717             config: config.clone(),
718             interrupt_controller: None,
719             #[cfg(target_arch = "x86_64")]
720             cpuid: Vec::new(),
721             vm,
722             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
723             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
724             vcpu_states,
725             exit_evt,
726             reset_evt,
727             #[cfg(feature = "guest_debug")]
728             vm_debug_evt,
729             selected_cpu: 0,
730             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
731             seccomp_action,
732             vm_ops,
733             acpi_address: None,
734             proximity_domain_per_cpu,
735             affinity,
736             dynamic,
737             hypervisor: hypervisor.clone(),
738             #[cfg(feature = "sev_snp")]
739             sev_snp_enabled,
740         })))
741     }
742 
743     #[cfg(target_arch = "x86_64")]
744     pub fn populate_cpuid(
745         &mut self,
746         memory_manager: &Arc<Mutex<MemoryManager>>,
747         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
748         #[cfg(feature = "tdx")] tdx: bool,
749     ) -> Result<()> {
750         let sgx_epc_sections = memory_manager
751             .lock()
752             .unwrap()
753             .sgx_epc_region()
754             .as_ref()
755             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
756 
757         self.cpuid = {
758             let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
759             arch::generate_common_cpuid(
760                 hypervisor,
761                 &arch::CpuidConfig {
762                     sgx_epc_sections,
763                     phys_bits,
764                     kvm_hyperv: self.config.kvm_hyperv,
765                     #[cfg(feature = "tdx")]
766                     tdx,
767                     amx: self.config.features.amx,
768                 },
769             )
770             .map_err(Error::CommonCpuId)?
771         };
772 
773         Ok(())
774     }
775 
776     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
777         info!("Creating vCPU: cpu_id = {}", cpu_id);
778 
779         #[cfg(target_arch = "x86_64")]
780         let topology = self.get_vcpu_topology();
781         #[cfg(target_arch = "x86_64")]
782         let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
783         #[cfg(target_arch = "aarch64")]
784         let x2apic_id = cpu_id as u32;
785 
786         let mut vcpu = Vcpu::new(
787             cpu_id,
788             x2apic_id as u8,
789             &self.vm,
790             Some(self.vm_ops.clone()),
791             #[cfg(target_arch = "x86_64")]
792             self.hypervisor.get_cpu_vendor(),
793         )?;
794 
795         if let Some(snapshot) = snapshot {
796             // AArch64 vCPUs should be initialized after being created.
797             #[cfg(target_arch = "aarch64")]
798             vcpu.init(&self.vm)?;
799 
800             let state: CpuState = snapshot.to_state().map_err(|e| {
801                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
802             })?;
803             vcpu.vcpu
804                 .set_state(&state)
805                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
806 
807             vcpu.saved_state = Some(state);
808         }
809 
810         let vcpu = Arc::new(Mutex::new(vcpu));
811 
812         // Adding vCPU to the CpuManager's vCPU list.
813         self.vcpus.push(vcpu.clone());
814 
815         Ok(vcpu)
816     }
817 
818     pub fn configure_vcpu(
819         &self,
820         vcpu: Arc<Mutex<Vcpu>>,
821         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
822     ) -> Result<()> {
823         let mut vcpu = vcpu.lock().unwrap();
824 
825         #[cfg(feature = "sev_snp")]
826         if self.sev_snp_enabled {
827             if let Some((kernel_entry_point, _)) = boot_setup {
828                 vcpu.set_sev_control_register(
829                     kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
830                 )?;
831             }
832 
833             // The traditional way of configuring a vCPU doesn't work for SEV-SNP
834             // guests: all of their vCPU configuration is provided via the VMSA.
835             return Ok(());
836         }
837 
838         #[cfg(target_arch = "x86_64")]
839         assert!(!self.cpuid.is_empty());
840 
841         #[cfg(target_arch = "x86_64")]
842         let topology = self.config.topology.clone().map_or_else(
843             || {
844                 #[cfg(feature = "mshv")]
845                 if matches!(self.hypervisor.hypervisor_type(), HypervisorType::Mshv) {
846                     return Some((1, self.boot_vcpus(), 1));
847                 }
848                 None
849             },
850             |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
851         );
852         #[cfg(target_arch = "x86_64")]
853         vcpu.configure(
854             boot_setup,
855             self.cpuid.clone(),
856             self.config.kvm_hyperv,
857             topology,
858         )?;
859 
860         #[cfg(target_arch = "aarch64")]
861         vcpu.configure(&self.vm, boot_setup)?;
862 
863         Ok(())
864     }
865 
866     /// Only create new vCPUs if there aren't any inactive ones to reuse
867     fn create_vcpus(
868         &mut self,
869         desired_vcpus: u8,
870         snapshot: Option<Snapshot>,
871     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
872         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
873         info!(
874             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
875             desired_vcpus,
876             self.config.max_vcpus,
877             self.vcpus.len(),
878             self.present_vcpus()
879         );
880 
881         if desired_vcpus > self.config.max_vcpus {
882             return Err(Error::DesiredVCpuCountExceedsMax);
883         }
884 
885         // Only create vCPUs in excess of all the allocated vCPUs.
886         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
887             vcpus.push(self.create_vcpu(
888                 cpu_id,
889                 // TODO: The special format of the CPU id can be removed once
890                 // ready to break live upgrade.
891                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
892             )?);
893         }
894 
895         Ok(vcpus)
896     }
897 
898     #[cfg(target_arch = "aarch64")]
899     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
900         for cpu in self.vcpus.iter() {
901             let cpu = cpu.lock().unwrap();
902             // Check if the PMU attribute is available; if not, log it and bail out.
903             if cpu.vcpu.has_pmu_support() {
904                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
905             } else {
906                 debug!(
907                     "PMU attribute is not supported in vCPU{}, skipping PMU init!",
908                     cpu.id
909                 );
910                 return Ok(false);
911             }
912         }
913 
914         Ok(true)
915     }
916 
917     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
918         self.vcpus.clone()
919     }
920 
921     fn start_vcpu(
922         &mut self,
923         vcpu: Arc<Mutex<Vcpu>>,
924         vcpu_id: u8,
925         vcpu_thread_barrier: Arc<Barrier>,
926         inserting: bool,
927     ) -> Result<()> {
928         let reset_evt = self.reset_evt.try_clone().unwrap();
929         let exit_evt = self.exit_evt.try_clone().unwrap();
930         #[cfg(feature = "kvm")]
931         let hypervisor_type = self.hypervisor.hypervisor_type();
932         #[cfg(feature = "guest_debug")]
933         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
934         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
935         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
936         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
937 
938         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
939         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
940             .vcpu_run_interrupted
941             .clone();
942         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
943         let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();
944 
945         // Prepare the CPU set the current vCPU is expected to run on.
946         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
947             // SAFETY: all zeros is a valid pattern
948             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
949             // SAFETY: FFI call, trivially safe
950             unsafe { libc::CPU_ZERO(&mut cpuset) };
951             for host_cpu in host_cpus {
952                 // SAFETY: FFI call, trivially safe
953                 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
954             }
955             cpuset
956         });
957 
958         // Retrieve seccomp filter for vcpu thread
959         let vcpu_seccomp_filter = get_seccomp_filter(
960             &self.seccomp_action,
961             Thread::Vcpu,
962             self.hypervisor.hypervisor_type(),
963         )
964         .map_err(Error::CreateSeccompFilter)?;
965 
966         #[cfg(target_arch = "x86_64")]
967         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
968 
969         info!("Starting vCPU: cpu_id = {}", vcpu_id);
970 
971         let handle = Some(
972             thread::Builder::new()
973                 .name(format!("vcpu{vcpu_id}"))
974                 .spawn(move || {
975                     // Schedule the thread to run on the expected CPU set
976                     if let Some(cpuset) = cpuset.as_ref() {
977                         // SAFETY: FFI call with correct arguments
978                         let ret = unsafe {
979                             libc::sched_setaffinity(
980                                 0,
981                                 std::mem::size_of::<libc::cpu_set_t>(),
982                                 cpuset as *const libc::cpu_set_t,
983                             )
984                         };
985 
986                         if ret != 0 {
987                             error!(
988                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
989                                 vcpu_id,
990                                 io::Error::last_os_error()
991                             );
992                             return;
993                         }
994                     }
995 
996                     // Apply seccomp filter for vcpu thread.
997                     if !vcpu_seccomp_filter.is_empty() {
998                         if let Err(e) =
999                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
1000                         {
1001                             error!("Error applying seccomp filter: {:?}", e);
1002                             return;
1003                         }
1004                     }
1005                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
1006                     // Register an async-signal-safe no-op handler so vCPU threads can be kicked out of KVM_RUN.
1007                     register_signal_handler(SIGRTMIN(), handle_signal)
1008                         .expect("Failed to register vcpu signal handler");
1009                     // Block until all CPUs are ready.
1010                     vcpu_thread_barrier.wait();
1011 
1012                     std::panic::catch_unwind(move || {
1013                         loop {
1014                             // If we are being told to pause, we park the thread
1015                             // until the pause boolean is toggled.
1016                             // The resume operation is responsible for toggling
1017                             // the boolean and unpark the thread.
1018                             // We enter a loop because park() could spuriously
1019                             // return. We will then park() again unless the
1020                             // pause boolean has been toggled.
1021 
1022                             // Need to use Ordering::SeqCst as we have multiple
1023                             // loads and stores to different atomics and we need
1024                             // to see them in a consistent order in all threads
1025 
1026                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
1027                                 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
1028                                 // completed by returning to KVM_RUN. From the kernel docs:
1029                                 //
1030                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
1031                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
1032                                 // operations are complete (and guest state is consistent) only after userspace
1033                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
1034                                 // incomplete operations and then check for pending signals.
1035                                 // The pending state of the operation is not preserved in state which is
1036                                 // visible to userspace, thus userspace should ensure that the operation is
1037                                 // completed before performing a live migration.  Userspace can re-enter the
1038                                 // guest with an unmasked signal pending or with the immediate_exit field set
1039                                 // to complete pending operations without allowing any further instructions
1040                                 // to be executed.
1041 
1042                                 #[cfg(feature = "kvm")]
1043                                 if matches!(hypervisor_type, HypervisorType::Kvm) {
1044                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
1045                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
1046                                         error!("Unexpected VM exit on \"immediate_exit\" run");
1047                                         break;
1048                                     }
1049                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
1050                                 }
1051 
1052                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1053 
1054                                 vcpu_paused.store(true, Ordering::SeqCst);
1055                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
1056                                     thread::park();
1057                                 }
1058                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
1059                             }
1060 
1061                             // We've been told to terminate
1062                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1063                                 || vcpu_kill.load(Ordering::SeqCst)
1064                             {
1065                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1066                                 break;
1067                             }
1068 
1069                             #[cfg(feature = "tdx")]
1070                             let mut vcpu = vcpu.lock().unwrap();
1071                             #[cfg(not(feature = "tdx"))]
1072                             let vcpu = vcpu.lock().unwrap();
1073                             // vcpu.run() returns VmExit::Reset on a triple fault, which triggers a guest reset
1074                             match vcpu.run() {
1075                                 Ok(run) => match run {
1076                                     #[cfg(feature = "kvm")]
1077                                     VmExit::Debug => {
1078                                         info!("VmExit::Debug");
1079                                         #[cfg(feature = "guest_debug")]
1080                                         {
1081                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
1082                                             let raw_tid = get_raw_tid(vcpu_id as usize);
1083                                             vm_debug_evt.write(raw_tid as u64).unwrap();
1084                                         }
1085                                     }
1086                                     #[cfg(target_arch = "x86_64")]
1087                                     VmExit::IoapicEoi(vector) => {
1088                                         if let Some(interrupt_controller) =
1089                                             &interrupt_controller_clone
1090                                         {
1091                                             interrupt_controller
1092                                                 .lock()
1093                                                 .unwrap()
1094                                                 .end_of_interrupt(vector);
1095                                         }
1096                                     }
1097                                     VmExit::Ignore => {}
1098                                     VmExit::Hyperv => {}
1099                                     VmExit::Reset => {
1100                                         info!("VmExit::Reset");
1101                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1102                                         reset_evt.write(1).unwrap();
1103                                         break;
1104                                     }
1105                                     VmExit::Shutdown => {
1106                                         info!("VmExit::Shutdown");
1107                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1108                                         exit_evt.write(1).unwrap();
1109                                         break;
1110                                     }
1111                                     #[cfg(feature = "tdx")]
1112                                     VmExit::Tdx => {
1113                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1114                                             match vcpu.get_tdx_exit_details() {
1115                                                 Ok(details) => match details {
1116                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1117                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1118                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1119                                                     }
1120                                                 },
1121                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1122                                             }
1123                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1124                                         } else {
1125                                             // We should never reach this point;
1126                                             // getting here would mean the design
1127                                             // of this code is wrong.
1128                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1129                                         }
1130                                     }
1131                                     _ => {
1132                                         error!(
1133                                             "VCPU generated error: {:?}",
1134                                             Error::UnexpectedVmExit
1135                                         );
1136                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1137                                         exit_evt.write(1).unwrap();
1138                                         break;
1139                                     }
1140                                 },
1141 
1142                                 Err(e) => {
1143                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1144                                     vcpu_run_interrupted.store(true, Ordering::SeqCst);
1145                                     exit_evt.write(1).unwrap();
1146                                     break;
1147                                 }
1148                             }
1149 
1150                             // We've been told to terminate
1151                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1152                                 || vcpu_kill.load(Ordering::SeqCst)
1153                             {
1154                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1155                                 break;
1156                             }
1157                         }
1158                     })
1159                     .or_else(|_| {
1160                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1161                         error!("vCPU thread panicked");
1162                         panic_exit_evt.write(1)
1163                     })
1164                     .ok();
1165                 })
1166                 .map_err(Error::VcpuSpawn)?,
1167         );
1168 
1169         // On hotplug, calls into this function carry no entry point. It is for
1170         // those hotplug CPU additions that we need to set the inserting flag.
1171         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1172         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1173 
1174         Ok(())
1175     }
1176 
1177     /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
1178     fn activate_vcpus(
1179         &mut self,
1180         desired_vcpus: u8,
1181         inserting: bool,
1182         paused: Option<bool>,
1183     ) -> Result<()> {
1184         if desired_vcpus > self.config.max_vcpus {
1185             return Err(Error::DesiredVCpuCountExceedsMax);
1186         }
1187 
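
        // The barrier is sized for every vCPU thread started below plus the
        // current thread, which releases them all at the end of this function.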
1188         let vcpu_thread_barrier = Arc::new(Barrier::new(
1189             (desired_vcpus - self.present_vcpus() + 1) as usize,
1190         ));
1191 
1192         if let Some(paused) = paused {
1193             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1194         }
1195 
1196         info!(
1197             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1198             desired_vcpus,
1199             self.vcpus.len(),
1200             self.present_vcpus(),
1201             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1202         );
1203 
1204         // This reuses any inactive vCPUs as well as any that were newly created
1205         for vcpu_id in self.present_vcpus()..desired_vcpus {
1206             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1207             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1208         }
1209 
1210         // Unblock all CPU threads.
1211         vcpu_thread_barrier.wait();
1212         Ok(())
1213     }
1214 
1215     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1216         // Mark vCPUs for removal, actual removal happens on ejection
1217         for cpu_id in desired_vcpus..self.present_vcpus() {
1218             self.vcpu_states[usize::from(cpu_id)].removing = true;
1219             self.vcpu_states[usize::from(cpu_id)]
1220                 .pending_removal
1221                 .store(true, Ordering::SeqCst);
1222         }
1223     }
1224 
1225     pub fn check_pending_removed_vcpu(&mut self) -> bool {
1226         for state in self.vcpu_states.iter() {
1227             if state.active() && state.pending_removal.load(Ordering::SeqCst) {
1228                 return true;
1229             }
1230         }
1231         false
1232     }
1233 
1234     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1235         info!("Removing vCPU: cpu_id = {}", cpu_id);
1236         let state = &mut self.vcpu_states[usize::from(cpu_id)];
1237         state.kill.store(true, Ordering::SeqCst);
1238         state.signal_thread();
1239         state.join_thread()?;
1240         state.handle = None;
1241 
1242         // Once the thread has exited, clear the "kill" flag so that the state can be reused
1243         state.kill.store(false, Ordering::SeqCst);
1244         state.pending_removal.store(false, Ordering::SeqCst);
1245 
1246         Ok(())
1247     }
1248 
1249     pub fn create_boot_vcpus(
1250         &mut self,
1251         snapshot: Option<Snapshot>,
1252     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1253         trace_scoped!("create_boot_vcpus");
1254 
1255         self.create_vcpus(self.boot_vcpus(), snapshot)
1256     }
1257 
1258     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1259     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1260         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1261     }
1262 
1263     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1264         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1265             .map_err(|e| {
1266                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1267             })?;
1268 
1269         Ok(())
1270     }
1271 
1272     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1273         if desired_vcpus == self.present_vcpus() {
1274             return Ok(false);
1275         }
1276 
1277         if !self.dynamic {
1278             return Ok(false);
1279         }
1280 
1281         if self.check_pending_removed_vcpu() {
1282             return Err(Error::VcpuPendingRemovedVcpu);
1283         }
1284 
1285         match desired_vcpus.cmp(&self.present_vcpus()) {
1286             cmp::Ordering::Greater => {
1287                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1288                 for vcpu in vcpus {
1289                     self.configure_vcpu(vcpu, None)?
1290                 }
1291                 self.activate_vcpus(desired_vcpus, true, None)?;
1292                 Ok(true)
1293             }
1294             cmp::Ordering::Less => {
1295                 self.mark_vcpus_for_removal(desired_vcpus);
1296                 Ok(true)
1297             }
1298             _ => Ok(false),
1299         }
1300     }
1301 
1302     pub fn shutdown(&mut self) -> Result<()> {
1303         // Tell the vCPUs to stop themselves next time they go through the loop
1304         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1305 
1306         // Toggle the vCPUs pause boolean
1307         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1308 
1309         // Unpark all the VCPU threads.
1310         for state in self.vcpu_states.iter() {
1311             state.unpark_thread();
1312         }
1313 
1314         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1315         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1316         // above.
1317         for state in self.vcpu_states.iter() {
1318             state.signal_thread();
1319         }
1320 
1321         // Wait for all the threads to finish. This removes the state from the vector.
1322         for mut state in self.vcpu_states.drain(..) {
1323             state.join_thread()?;
1324         }
1325 
1326         Ok(())
1327     }
1328 
1329     #[cfg(feature = "tdx")]
1330     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1331         for vcpu in &self.vcpus {
1332             vcpu.lock()
1333                 .unwrap()
1334                 .vcpu
1335                 .tdx_init(hob_address)
1336                 .map_err(Error::InitializeTdx)?;
1337         }
1338         Ok(())
1339     }
1340 
1341     pub fn boot_vcpus(&self) -> u8 {
1342         self.config.boot_vcpus
1343     }
1344 
1345     pub fn max_vcpus(&self) -> u8 {
1346         self.config.max_vcpus
1347     }
1348 
1349     #[cfg(target_arch = "x86_64")]
1350     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1351         assert!(!self.cpuid.is_empty());
1352         self.cpuid.clone()
1353     }
1354 
1355     fn present_vcpus(&self) -> u8 {
1356         self.vcpu_states
1357             .iter()
1358             .fold(0, |acc, state| acc + state.active() as u8)
1359     }
1360 
1361     #[cfg(target_arch = "aarch64")]
1362     pub fn get_mpidrs(&self) -> Vec<u64> {
1363         self.vcpus
1364             .iter()
1365             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1366             .collect()
1367     }
1368 
1369     #[cfg(target_arch = "aarch64")]
1370     pub fn get_saved_states(&self) -> Vec<CpuState> {
1371         self.vcpus
1372             .iter()
1373             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1374             .collect()
1375     }
1376 
1377     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1378         self.config
1379             .topology
1380             .clone()
1381             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1382     }
1383 
1384     pub fn create_madt(&self) -> Sdt {
1385         use crate::acpi;
1386         // This is also checked in the command-line parsing.
1387         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1388 
1389         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1390         #[cfg(target_arch = "x86_64")]
1391         {
1392             madt.write(36, arch::layout::APIC_START.0);
1393 
1394             for cpu in 0..self.config.max_vcpus {
1395                 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());
1396 
1397                 let lapic = LocalX2Apic {
1398                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1399                     length: 16,
1400                     processor_id: cpu.into(),
1401                     apic_id: x2apic_id,
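                         // All vCPUs are marked Online Capable; boot vCPUs
                         // additionally have the MADT Enabled flag set.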
1402                     flags: if cpu < self.config.boot_vcpus {
1403                         1 << MADT_CPU_ENABLE_FLAG
1404                     } else {
1405                         0
1406                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1407                     _reserved: 0,
1408                 };
1409                 madt.append(lapic);
1410             }
1411 
1412             madt.append(Ioapic {
1413                 r#type: acpi::ACPI_APIC_IO,
1414                 length: 12,
1415                 ioapic_id: 0,
1416                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1417                 gsi_base: 0,
1418                 ..Default::default()
1419             });
1420 
1421             madt.append(InterruptSourceOverride {
1422                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1423                 length: 10,
1424                 bus: 0,
1425                 source: 4,
1426                 gsi: 4,
1427                 flags: 0,
1428             });
1429         }
1430 
1431         #[cfg(target_arch = "aarch64")]
1432         {
1433             /* Notes:
1434              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1435              */
1436 
1437             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1438             for cpu in 0..self.config.boot_vcpus {
1439                 let vcpu = &self.vcpus[cpu as usize];
1440                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1441                 /* ARMv8 MPIDR format:
1442                      Bits [63:40] Must be zero
1443                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1444                      Bits [31:24] Must be zero
1445                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1446                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1447                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1448                 */
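                 // Keep only the affinity fields: Aff3 in bits [39:32] and
                 // Aff2/Aff1/Aff0 in bits [23:0]; all other bits must be zero.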
1449                 let mpidr_mask = 0xff_00ff_ffff;
1450                 let gicc = GicC {
1451                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1452                     length: 80,
1453                     reserved0: 0,
1454                     cpu_interface_number: cpu as u32,
1455                     uid: cpu as u32,
1456                     flags: 1,
1457                     parking_version: 0,
1458                     performance_interrupt: 0,
1459                     parked_address: 0,
1460                     base_address: 0,
1461                     gicv_base_address: 0,
1462                     gich_base_address: 0,
1463                     vgic_interrupt: 0,
1464                     gicr_base_address: 0,
1465                     mpidr: mpidr & mpidr_mask,
1466                     proc_power_effi_class: 0,
1467                     reserved1: 0,
1468                     spe_overflow_interrupt: 0,
1469                 };
1470 
1471                 madt.append(gicc);
1472             }
1473             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1474 
1475             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1476             let gicd = GicD {
1477                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1478                 length: 24,
1479                 reserved0: 0,
1480                 gic_id: 0,
1481                 base_address: vgic_config.dist_addr,
1482                 global_irq_base: 0,
1483                 version: 3,
1484                 reserved1: [0; 3],
1485             };
1486             madt.append(gicd);
1487 
1488             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1489             let gicr = GicR {
1490                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1491                 length: 16,
1492                 reserved: 0,
1493                 base_address: vgic_config.redists_addr,
1494                 range_length: vgic_config.redists_size as u32,
1495             };
1496             madt.append(gicr);
1497 
1498             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1499             let gicits = GicIts {
1500                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1501                 length: 20,
1502                 reserved0: 0,
1503                 translation_id: 0,
1504                 base_address: vgic_config.msi_addr,
1505                 reserved1: 0,
1506             };
1507             madt.append(gicits);
1508 
1509             madt.update_checksum();
1510         }
1511 
1512         madt
1513     }
1514 
1515     #[cfg(target_arch = "aarch64")]
1516     pub fn create_pptt(&self) -> Sdt {
1517         let pptt_start = 0;
1518         let mut cpus = 0;
1519         let mut uid = 0;
1520         // If topology is not specified, the default setting is:
1521         // 1 package, multiple cores, 1 thread per core
1522         // This is also the guest's behavior when the PPTT is missing.
1523         let (threads_per_core, cores_per_package, packages) =
1524             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1525 
1526         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1527 
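         // Processor Hierarchy Node flags used below, per our reading of the
         // ACPI 6.3 PPTT structure flags:
         //   bit 1 (0x2): ACPI Processor ID valid
         //   bit 2 (0x4): processor is a thread
         //   bit 3 (0x8): node is a leaf
         // Hence 0x2 for clusters and cores, 0xE for thread leaves, and 0xA
         // for core leaves.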
1528         for cluster_idx in 0..packages {
1529             if cpus < self.config.boot_vcpus as usize {
1530                 let cluster_offset = pptt.len() - pptt_start;
1531                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1532                     r#type: 0,
1533                     length: 20,
1534                     reserved: 0,
1535                     flags: 0x2,
1536                     parent: 0,
1537                     acpi_processor_id: cluster_idx as u32,
1538                     num_private_resources: 0,
1539                 };
1540                 pptt.append(cluster_hierarchy_node);
1541 
1542                 for core_idx in 0..cores_per_package {
1543                     let core_offset = pptt.len() - pptt_start;
1544 
1545                     if threads_per_core > 1 {
1546                         let core_hierarchy_node = ProcessorHierarchyNode {
1547                             r#type: 0,
1548                             length: 20,
1549                             reserved: 0,
1550                             flags: 0x2,
1551                             parent: cluster_offset as u32,
1552                             acpi_processor_id: core_idx as u32,
1553                             num_private_resources: 0,
1554                         };
1555                         pptt.append(core_hierarchy_node);
1556 
1557                         for _thread_idx in 0..threads_per_core {
1558                             let thread_hierarchy_node = ProcessorHierarchyNode {
1559                                 r#type: 0,
1560                                 length: 20,
1561                                 reserved: 0,
1562                                 flags: 0xE,
1563                                 parent: core_offset as u32,
1564                                 acpi_processor_id: uid as u32,
1565                                 num_private_resources: 0,
1566                             };
1567                             pptt.append(thread_hierarchy_node);
1568                             uid += 1;
1569                         }
1570                     } else {
1571                         let thread_hierarchy_node = ProcessorHierarchyNode {
1572                             r#type: 0,
1573                             length: 20,
1574                             reserved: 0,
1575                             flags: 0xA,
1576                             parent: cluster_offset as u32,
1577                             acpi_processor_id: uid as u32,
1578                             num_private_resources: 0,
1579                         };
1580                         pptt.append(thread_hierarchy_node);
1581                         uid += 1;
1582                     }
1583                 }
1584                 cpus += (cores_per_package * threads_per_core) as usize;
1585             }
1586         }
1587 
1588         pptt.update_checksum();
1589         pptt
1590     }
1591 
1592     #[cfg(feature = "guest_debug")]
1593     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1594         self.vcpus[usize::from(cpu_id)]
1595             .lock()
1596             .unwrap()
1597             .vcpu
1598             .get_regs()
1599             .map_err(Error::CpuDebug)
1600     }
1601 
1602     #[cfg(feature = "guest_debug")]
1603     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1604         self.vcpus[usize::from(cpu_id)]
1605             .lock()
1606             .unwrap()
1607             .vcpu
1608             .set_regs(regs)
1609             .map_err(Error::CpuDebug)
1610     }
1611 
1612     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1613     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1614         self.vcpus[usize::from(cpu_id)]
1615             .lock()
1616             .unwrap()
1617             .vcpu
1618             .get_sregs()
1619             .map_err(Error::CpuDebug)
1620     }
1621 
1622     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1623     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1624         self.vcpus[usize::from(cpu_id)]
1625             .lock()
1626             .unwrap()
1627             .vcpu
1628             .set_sregs(sregs)
1629             .map_err(Error::CpuDebug)
1630     }
1631 
1632     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1633     fn translate_gva(
1634         &self,
1635         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1636         cpu_id: u8,
1637         gva: u64,
1638     ) -> Result<u64> {
1639         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1640             .lock()
1641             .unwrap()
1642             .vcpu
1643             .translate_gva(gva, /* flags: unused */ 0)
1644             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1645         Ok(gpa)
1646     }
1647 
1648     ///
1649     /// On AArch64, KVM does not provide a `translate_gva` API, so we implement
1650     /// it in the VMM by walking the guest's translation tables.
1651     ///
1652     /// Address translation is a big topic; here we only cover the scenario that
1653     /// arises in the VMM while debugging the guest kernel. This `translate_gva`
1654     /// implementation is restricted to:
1655     /// - Exception Level 1
1656     /// - The high address range only (kernel space)
1657     ///
1658     /// This implementation supports the following Armv8-A features related to
1659     /// address translation:
1660     /// - FEAT_LPA
1661     /// - FEAT_LVA
1662     /// - FEAT_LPA2
1663     ///
1664     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1665     fn translate_gva(
1666         &self,
1667         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1668         cpu_id: u8,
1669         gva: u64,
1670     ) -> Result<u64> {
1671         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1672             .lock()
1673             .unwrap()
1674             .vcpu
1675             .get_sys_reg(regs::TCR_EL1)
1676             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1677         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1678             .lock()
1679             .unwrap()
1680             .vcpu
1681             .get_sys_reg(regs::TTBR1_EL1)
1682             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1683         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1684             .lock()
1685             .unwrap()
1686             .vcpu
1687             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1688             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1689 
1690         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1691         // or low (0x000xxx...).
1692         let high_range = extract_bits_64!(gva, 55, 1);
1693         if high_range == 0 {
1694             info!("VA (0x{:x}) range is not supported!", gva);
1695             return Ok(gva);
1696         }
1697 
1698         // High range size offset
1699         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1700         // Granule size
1701         let tg = extract_bits_64!(tcr_el1, 30, 2);
1702         // TCR_EL1.DS: indicates 48-bit (0) or 52-bit (1) output addresses (FEAT_LPA2)
1703         let ds = extract_bits_64!(tcr_el1, 59, 1);
1704 
1705         if tsz == 0 {
1706             info!("VA translation is not ready!");
1707             return Ok(gva);
1708         }
1709 
1710         // VA size is determined by TCR_EL1.T1SZ
1711         let va_size = 64 - tsz;
1712         // Number of bits in VA consumed in each level of translation
1713         let stride = match tg {
1714             3 => 13, // 64KB granule size
1715             1 => 11, // 16KB granule size
1716             _ => 9,  // 4KB, default
1717         };
1718         // Starting level of walking
1719         let mut level = 4 - (va_size - 4) / stride;
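         // For example, a 48-bit VA space (T1SZ = 16) with a 4KB granule
         // (stride = 9) gives level = 4 - 44 / 9 = 0, i.e. a 4-level walk.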
1720 
1721         // PA or IPA size is determined by TCR_EL1.IPS and ID_AA64MMFR0_EL1.PARange
1722         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1723         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1724         // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should match.
1725         // To be safe, we use the minimum value if they are different.
1726         let pa_range = std::cmp::min(tcr_ips, pa_range);
1727         // PA size in bits
1728         let pa_size = match pa_range {
1729             0 => 32,
1730             1 => 36,
1731             2 => 40,
1732             3 => 42,
1733             4 => 44,
1734             5 => 48,
1735             6 => 52,
1736             _ => {
1737                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1738                     "PA range not supported {pa_range}"
1739                 ))))
1740             }
1741         };
1742 
1743         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1744         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1745         // If FEAT_LPA2 is present, the translation table descriptor holds
1746         // 50 bits of the next-level table address.
1747         // Otherwise, it is 48 bits.
1748         let descaddrmask = if ds == 1 {
1749             !0u64 >> (64 - 50) // mask with 50 least significant bits
1750         } else {
1751             !0u64 >> (64 - 48) // mask with 48 least significant bits
1752         };
1753         let descaddrmask = descaddrmask & !indexmask_grainsize;
1754 
1755         // Translation table base address
1756         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1757         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1758         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1759         if pa_size == 52 {
1760             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1761         }
1762 
1763         // Loop through tables of each level
1764         loop {
1765             // Table offset for current level
1766             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1767             descaddr |= table_offset;
1768             descaddr &= !7u64;
1769 
1770             let mut buf = [0; 8];
1771             guest_memory
1772                 .memory()
1773                 .read(&mut buf, GuestAddress(descaddr))
1774                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1775             let descriptor = u64::from_le_bytes(buf);
1776 
1777             descaddr = descriptor & descaddrmask;
1778             // In the case of FEAT_LPA, the next-level translation table address
1779             // bits [48:51] come from bits [12:15] of the current descriptor.
1780             // For FEAT_LPA2, the next-level translation table address
1781             // bits [50:51] come from bits [8:9] of the current descriptor,
1782             // while bits [48:49] come from bits [48:49] of the descriptor
1783             // handled previously.
1784             if pa_size == 52 {
1785                 if ds == 1 {
1786                     // FEAT_LPA2
1787                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1788                 } else {
1789                     // FEAT_LPA
1790                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1791                 }
1792             }
1793 
1794             if (descriptor & 2) != 0 && (level < 3) {
1795                 // This is a table entry. Go down to next level.
1796                 level += 1;
1797                 indexmask = indexmask_grainsize;
1798                 continue;
1799             }
1800 
1801             break;
1802         }
1803 
1804         // We have reached either:
1805         // - a page entry at level 3 or
1806         // - a block entry at level 1 or 2
1807         let page_size = 1u64 << ((stride * (4 - level)) + 3);
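         // E.g. with a 4KB granule: a level-3 page is 1 << (9 + 3) = 4KiB,
         // and a level-2 block is 1 << (18 + 3) = 2MiB.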
1808         descaddr &= !(page_size - 1);
1809         descaddr |= gva & (page_size - 1);
1810 
1811         Ok(descaddr)
1812     }
1813 
1814     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1815         self.acpi_address = Some(acpi_address);
1816     }
1817 
1818     pub(crate) fn set_interrupt_controller(
1819         &mut self,
1820         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1821     ) {
1822         self.interrupt_controller = Some(interrupt_controller);
1823     }
1824 
1825     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1826         &self.vcpus_kill_signalled
1827     }
1828 
1829     #[cfg(feature = "igvm")]
1830     pub(crate) fn get_cpuid_leaf(
1831         &self,
1832         cpu_id: u8,
1833         eax: u32,
1834         ecx: u32,
1835         xfem: u64,
1836         xss: u64,
1837     ) -> Result<[u32; 4]> {
1838         let leaf_info = self.vcpus[usize::from(cpu_id)]
1839             .lock()
1840             .unwrap()
1841             .vcpu
1842             .get_cpuid_values(eax, ecx, xfem, xss)
1843             .unwrap();
1844         Ok(leaf_info)
1845     }
1846 
1847     #[cfg(feature = "sev_snp")]
1848     pub(crate) fn sev_snp_enabled(&self) -> bool {
1849         self.sev_snp_enabled
1850     }
1851 }
1852 
1853 struct Cpu {
1854     cpu_id: u8,
1855     proximity_domain: u32,
1856     dynamic: bool,
1857     #[cfg(target_arch = "x86_64")]
1858     topology: Option<(u8, u8, u8)>,
1859 }
1860 
1861 #[cfg(target_arch = "x86_64")]
1862 const MADT_CPU_ENABLE_FLAG: usize = 0;
1863 
1864 #[cfg(target_arch = "x86_64")]
1865 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1866 
1867 impl Cpu {
1868     #[cfg(target_arch = "x86_64")]
1869     fn generate_mat(&self) -> Vec<u8> {
1870         let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);
1871 
1872         let lapic = LocalX2Apic {
1873             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1874             length: 16,
1875             processor_id: self.cpu_id.into(),
1876             apic_id: x2apic_id,
1877             flags: 1 << MADT_CPU_ENABLE_FLAG,
1878             _reserved: 0,
1879         };
1880 
1881         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1882         // SAFETY: mat_data is large enough to hold lapic
1883         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
1884 
1885         mat_data
1886     }
1887 }
1888 
1889 impl Aml for Cpu {
1890     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1891         #[cfg(target_arch = "x86_64")]
1892         let mat_data: Vec<u8> = self.generate_mat();
1893         #[allow(clippy::if_same_then_else)]
1894         if self.dynamic {
1895             aml::Device::new(
1896                 format!("C{:03X}", self.cpu_id).as_str().into(),
1897                 vec![
1898                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1899                     &aml::Name::new("_UID".into(), &self.cpu_id),
1900                     // Currently, AArch64 does not support the following fields.
1901                     /*
1902                     _STA return value:
1903                     Bit [0] – Set if the device is present.
1904                     Bit [1] – Set if the device is enabled and decoding its resources.
1905                     Bit [2] – Set if the device should be shown in the UI.
1906                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1907                     Bit [4] – Set if the battery is present.
1908                     Bits [31:5] – Reserved (must be cleared).
1909                     */
1910                     #[cfg(target_arch = "x86_64")]
1911                     &aml::Method::new(
1912                         "_STA".into(),
1913                         0,
1914                         false,
1915                         // Call into the CSTA method, which will interrogate the device
1916                         vec![&aml::Return::new(&aml::MethodCall::new(
1917                             "CSTA".into(),
1918                             vec![&self.cpu_id],
1919                         ))],
1920                     ),
1921                     &aml::Method::new(
1922                         "_PXM".into(),
1923                         0,
1924                         false,
1925                         vec![&aml::Return::new(&self.proximity_domain)],
1926                     ),
1927                     // The Linux kernel expects every CPU device to have a _MAT entry
1928                     // containing the LAPIC for this processor with the enabled bit set
1929                     // even if it is disabled in the MADT (non-boot CPU)
1930                     #[cfg(target_arch = "x86_64")]
1931                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1932                     // Trigger CPU ejection
1933                     #[cfg(target_arch = "x86_64")]
1934                     &aml::Method::new(
1935                         "_EJ0".into(),
1936                         1,
1937                         false,
1938                         // Call into the CEJ0 method, which will actually eject the device
1939                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1940                     ),
1941                 ],
1942             )
1943             .to_aml_bytes(sink);
1944         } else {
1945             aml::Device::new(
1946                 format!("C{:03X}", self.cpu_id).as_str().into(),
1947                 vec![
1948                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1949                     &aml::Name::new("_UID".into(), &self.cpu_id),
1950                     #[cfg(target_arch = "x86_64")]
1951                     &aml::Method::new(
1952                         "_STA".into(),
1953                         0,
1954                         false,
1955                         // Mark the CPU present; see the CSTA implementation
1956                         vec![&aml::Return::new(&0xfu8)],
1957                     ),
1958                     &aml::Method::new(
1959                         "_PXM".into(),
1960                         0,
1961                         false,
1962                         vec![&aml::Return::new(&self.proximity_domain)],
1963                     ),
1964                     // The Linux kernel expects every CPU device to have a _MAT entry
1965                     // containing the LAPIC for this processor with the enabled bit set
1966                     // even if it is disabled in the MADT (non-boot CPU)
1967                     #[cfg(target_arch = "x86_64")]
1968                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1969                 ],
1970             )
1971             .to_aml_bytes(sink);
1972         }
1973     }
1974 }
1975 
1976 struct CpuNotify {
1977     cpu_id: u8,
1978 }
1979 
1980 impl Aml for CpuNotify {
1981     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1982         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
1983         aml::If::new(
1984             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
1985             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1986         )
1987         .to_aml_bytes(sink)
1988     }
1989 }
1990 
1991 struct CpuMethods {
1992     max_vcpus: u8,
1993     dynamic: bool,
1994 }
1995 
1996 impl Aml for CpuMethods {
1997     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1998         if self.dynamic {
1999             // CPU status method
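             // Roughly the ASL emitted below (a sketch; the aml builder's
             // output may differ in encoding details):
             //   Method (CSTA, 1, Serialized) {
             //       Acquire (\_SB.PRES.CPLK, 0xFFFF)
             //       Store (Arg0, \_SB.PRES.CSEL)
             //       Store (Zero, Local0)
             //       If (LEqual (\_SB.PRES.CPEN, One)) { Store (0x0F, Local0) }
             //       Release (\_SB.PRES.CPLK)
             //       Return (Local0)
             //   }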
2000             aml::Method::new(
2001                 "CSTA".into(),
2002                 1,
2003                 true,
2004                 vec![
2005                     // Take lock defined above
2006                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2007                     // Write CPU number (in first argument) to I/O port via field
2008                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2009                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2010                     // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for the meaning)
2011                     &aml::If::new(
2012                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
2013                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2014                     ),
2015                     // Release lock
2016                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2017                     // Return 0 or 0xf
2018                     &aml::Return::new(&aml::Local(0)),
2019                 ],
2020             )
2021             .to_aml_bytes(sink);
2022 
2023             let mut cpu_notifies = Vec::new();
2024             for cpu_id in 0..self.max_vcpus {
2025                 cpu_notifies.push(CpuNotify { cpu_id });
2026             }
2027 
2028             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
2029             for cpu_id in 0..self.max_vcpus {
2030                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
2031             }
2032 
2033             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
2034 
2035             aml::Method::new(
2036                 "CEJ0".into(),
2037                 1,
2038                 true,
2039                 vec![
2040                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2041                     // Write CPU number (in first argument) to I/O port via field
2042                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2043                     // Set CEJ0 bit
2044                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
2045                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2046                 ],
2047             )
2048             .to_aml_bytes(sink);
2049 
2050             aml::Method::new(
2051                 "CSCN".into(),
2052                 0,
2053                 true,
2054                 vec![
2055                     // Take lock defined above
2056                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2057                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2058                     &aml::While::new(
2059                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
2060                         vec![
2061                             // Write CPU number (in first argument) to I/O port via field
2062                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
2063                             // Check if CINS bit is set
2064                             &aml::If::new(
2065                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
2066                                 // Notify device if it is
2067                                 vec![
2068                                     &aml::MethodCall::new(
2069                                         "CTFY".into(),
2070                                         vec![&aml::Local(0), &aml::ONE],
2071                                     ),
2072                                     // Reset CINS bit
2073                                     &aml::Store::new(
2074                                         &aml::Path::new("\\_SB_.PRES.CINS"),
2075                                         &aml::ONE,
2076                                     ),
2077                                 ],
2078                             ),
2079                             // Check if CRMV bit is set
2080                             &aml::If::new(
2081                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
2082                                 // Notify device if it is (with the eject constant 0x3)
2083                                 vec![
2084                                     &aml::MethodCall::new(
2085                                         "CTFY".into(),
2086                                         vec![&aml::Local(0), &3u8],
2087                                     ),
2088                                     // Reset CRMV bit
2089                                     &aml::Store::new(
2090                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
2091                                         &aml::ONE,
2092                                     ),
2093                                 ],
2094                             ),
2095                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2096                         ],
2097                     ),
2098                     // Release lock
2099                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2100                 ],
2101             )
2102             .to_aml_bytes(sink)
2103         } else {
2104             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2105         }
2106     }
2107 }
2108 
2109 impl Aml for CpuManager {
2110     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2111         #[cfg(target_arch = "x86_64")]
2112         if let Some(acpi_address) = self.acpi_address {
2113             // CPU hotplug controller
2114             aml::Device::new(
2115                 "_SB_.PRES".into(),
2116                 vec![
2117                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2118                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2119                     // Mutex to protect concurrent access, as we write to select a CPU and then read back its status
2120                     &aml::Mutex::new("CPLK".into(), 0),
2121                     &aml::Name::new(
2122                         "_CRS".into(),
2123                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2124                             aml::AddressSpaceCacheable::NotCacheable,
2125                             true,
2126                             acpi_address.0,
2127                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2128                             None,
2129                         )]),
2130                     ),
2131                     // OpRegion and Fields map MMIO range into individual field values
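                     // Resulting register layout within PRST (both Fields
                     // overlay the same region): bytes 0-3 CSEL (DWord),
                     // byte 4 holds the CPEN/CINS/CRMV/CEJ0 bit flags,
                     // byte 5 CCMD, bytes 8-11 CDAT.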
2132                     &aml::OpRegion::new(
2133                         "PRST".into(),
2134                         aml::OpRegionSpace::SystemMemory,
2135                         &(acpi_address.0 as usize),
2136                         &CPU_MANAGER_ACPI_SIZE,
2137                     ),
2138                     &aml::Field::new(
2139                         "PRST".into(),
2140                         aml::FieldAccessType::Byte,
2141                         aml::FieldLockRule::NoLock,
2142                         aml::FieldUpdateRule::WriteAsZeroes,
2143                         vec![
2144                             aml::FieldEntry::Reserved(32),
2145                             aml::FieldEntry::Named(*b"CPEN", 1),
2146                             aml::FieldEntry::Named(*b"CINS", 1),
2147                             aml::FieldEntry::Named(*b"CRMV", 1),
2148                             aml::FieldEntry::Named(*b"CEJ0", 1),
2149                             aml::FieldEntry::Reserved(4),
2150                             aml::FieldEntry::Named(*b"CCMD", 8),
2151                         ],
2152                     ),
2153                     &aml::Field::new(
2154                         "PRST".into(),
2155                         aml::FieldAccessType::DWord,
2156                         aml::FieldLockRule::NoLock,
2157                         aml::FieldUpdateRule::Preserve,
2158                         vec![
2159                             aml::FieldEntry::Named(*b"CSEL", 32),
2160                             aml::FieldEntry::Reserved(32),
2161                             aml::FieldEntry::Named(*b"CDAT", 32),
2162                         ],
2163                     ),
2164                 ],
2165             )
2166             .to_aml_bytes(sink);
2167         }
2168 
2169         // CPU devices
2170         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2171         let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2172         // Bundle methods together under a common object
2173         let methods = CpuMethods {
2174             max_vcpus: self.config.max_vcpus,
2175             dynamic: self.dynamic,
2176         };
2177         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];
2178 
2179         #[cfg(target_arch = "x86_64")]
2180         let topology = self.get_vcpu_topology();
2181         let mut cpu_devices = Vec::new();
2182         for cpu_id in 0..self.config.max_vcpus {
2183             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2184             let cpu_device = Cpu {
2185                 cpu_id,
2186                 proximity_domain,
2187                 dynamic: self.dynamic,
2188                 #[cfg(target_arch = "x86_64")]
2189                 topology,
2190             };
2191 
2192             cpu_devices.push(cpu_device);
2193         }
2194 
2195         for cpu_device in cpu_devices.iter() {
2196             cpu_data_inner.push(cpu_device);
2197         }
2198 
2199         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2200     }
2201 }
2202 
2203 impl Pausable for CpuManager {
2204     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2205         // Tell the vCPUs to pause themselves next time they exit
2206         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2207 
2208         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2209         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2210         // above.
2211         for state in self.vcpu_states.iter() {
2212             state.signal_thread();
2213         }
2214 
2215         for vcpu in self.vcpus.iter() {
2216             let mut vcpu = vcpu.lock().unwrap();
2217             vcpu.pause()?;
2218             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2219             if !self.config.kvm_hyperv {
2220                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2221                     MigratableError::Pause(anyhow!(
2222                         "Could not notify guest it has been paused {:?}",
2223                         e
2224                     ))
2225                 })?;
2226             }
2227         }
2228 
2229         // Each vCPU thread sets its paused state before parking. Wait here for
2230         // every active vCPU to change its state, to ensure they have all parked.
2231         for state in self.vcpu_states.iter() {
2232             if state.active() {
2233                 while !state.paused.load(Ordering::SeqCst) {
2234                     // To avoid a priority inversion with the vCPU thread
2235                     thread::sleep(std::time::Duration::from_millis(1));
2236                 }
2237             }
2238         }
2239 
2240         Ok(())
2241     }
2242 
2243     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2244         for vcpu in self.vcpus.iter() {
2245             vcpu.lock().unwrap().resume()?;
2246         }
2247 
2248         // Toggle the vCPUs pause boolean
2249         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2250 
2251         // Unpark all the vCPU threads.
2252         // Once unparked, the first thing they do is check the pause boolean.
2253         // Since it is now set to false, they will exit their pause loop and
2254         // resume guest execution.
2255         for state in self.vcpu_states.iter() {
2256             state.paused.store(false, Ordering::SeqCst);
2257             state.unpark_thread();
2258         }
2259         Ok(())
2260     }
2261 }
2262 
2263 impl Snapshottable for CpuManager {
2264     fn id(&self) -> String {
2265         CPU_MANAGER_SNAPSHOT_ID.to_string()
2266     }
2267 
2268     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2269         let mut cpu_manager_snapshot = Snapshot::default();
2270 
2271         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2272         for vcpu in &self.vcpus {
2273             let mut vcpu = vcpu.lock().unwrap();
2274             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2275         }
2276 
2277         Ok(cpu_manager_snapshot)
2278     }
2279 }
2280 
2281 impl Transportable for CpuManager {}
2282 impl Migratable for CpuManager {}
2283 
2284 #[cfg(feature = "guest_debug")]
2285 impl Debuggable for CpuManager {
2286     #[cfg(feature = "kvm")]
2287     fn set_guest_debug(
2288         &self,
2289         cpu_id: usize,
2290         addrs: &[GuestAddress],
2291         singlestep: bool,
2292     ) -> std::result::Result<(), DebuggableError> {
2293         self.vcpus[cpu_id]
2294             .lock()
2295             .unwrap()
2296             .vcpu
2297             .set_guest_debug(addrs, singlestep)
2298             .map_err(DebuggableError::SetDebug)
2299     }
2300 
2301     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2302         Ok(())
2303     }
2304 
2305     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2306         Ok(())
2307     }
2308 
2309     #[cfg(target_arch = "x86_64")]
2310     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2311         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2312         let gregs = self
2313             .get_regs(cpu_id as u8)
2314             .map_err(DebuggableError::ReadRegs)?;
2315         let regs = [
2316             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
2317             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
2318         ];
2319 
2320         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2321         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2322         let eflags = gregs.rflags as u32;
2323         let rip = gregs.rip;
2324 
2325         // Segment registers: CS, SS, DS, ES, FS, GS
2326         let sregs = self
2327             .get_sregs(cpu_id as u8)
2328             .map_err(DebuggableError::ReadRegs)?;
2329         let segments = X86SegmentRegs {
2330             cs: sregs.cs.selector as u32,
2331             ss: sregs.ss.selector as u32,
2332             ds: sregs.ds.selector as u32,
2333             es: sregs.es.selector as u32,
2334             fs: sregs.fs.selector as u32,
2335             gs: sregs.gs.selector as u32,
2336         };
2337 
2338         // TODO: Add other registers
2339 
2340         Ok(CoreRegs {
2341             regs,
2342             eflags,
2343             rip,
2344             segments,
2345             ..Default::default()
2346         })
2347     }
2348 
2349     #[cfg(target_arch = "aarch64")]
2350     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2351         let gregs = self
2352             .get_regs(cpu_id as u8)
2353             .map_err(DebuggableError::ReadRegs)?;
2354         Ok(CoreRegs {
2355             x: gregs.regs.regs,
2356             sp: gregs.regs.sp,
2357             pc: gregs.regs.pc,
2358             ..Default::default()
2359         })
2360     }
2361 
2362     #[cfg(target_arch = "x86_64")]
2363     fn write_regs(
2364         &self,
2365         cpu_id: usize,
2366         regs: &CoreRegs,
2367     ) -> std::result::Result<(), DebuggableError> {
2368         let orig_gregs = self
2369             .get_regs(cpu_id as u8)
2370             .map_err(DebuggableError::ReadRegs)?;
2371         let gregs = StandardRegisters {
2372             rax: regs.regs[0],
2373             rbx: regs.regs[1],
2374             rcx: regs.regs[2],
2375             rdx: regs.regs[3],
2376             rsi: regs.regs[4],
2377             rdi: regs.regs[5],
2378             rbp: regs.regs[6],
2379             rsp: regs.regs[7],
2380             r8: regs.regs[8],
2381             r9: regs.regs[9],
2382             r10: regs.regs[10],
2383             r11: regs.regs[11],
2384             r12: regs.regs[12],
2385             r13: regs.regs[13],
2386             r14: regs.regs[14],
2387             r15: regs.regs[15],
2388             rip: regs.rip,
2389             // Update only the lower 32 bits of rflags.
2390             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2391         };
2392 
2393         self.set_regs(cpu_id as u8, &gregs)
2394             .map_err(DebuggableError::WriteRegs)?;
2395 
2396         // Segment registers: CS, SS, DS, ES, FS, GS
2397         // Since GDB cares only about the selectors, we call get_sregs() first.
2398         let mut sregs = self
2399             .get_sregs(cpu_id as u8)
2400             .map_err(DebuggableError::ReadRegs)?;
2401         sregs.cs.selector = regs.segments.cs as u16;
2402         sregs.ss.selector = regs.segments.ss as u16;
2403         sregs.ds.selector = regs.segments.ds as u16;
2404         sregs.es.selector = regs.segments.es as u16;
2405         sregs.fs.selector = regs.segments.fs as u16;
2406         sregs.gs.selector = regs.segments.gs as u16;
2407 
2408         self.set_sregs(cpu_id as u8, &sregs)
2409             .map_err(DebuggableError::WriteRegs)?;
2410 
2411         // TODO: Add other registers
2412 
2413         Ok(())
2414     }
2415 
2416     #[cfg(target_arch = "aarch64")]
2417     fn write_regs(
2418         &self,
2419         cpu_id: usize,
2420         regs: &CoreRegs,
2421     ) -> std::result::Result<(), DebuggableError> {
2422         let mut gregs = self
2423             .get_regs(cpu_id as u8)
2424             .map_err(DebuggableError::ReadRegs)?;
2425 
2426         gregs.regs.regs = regs.x;
2427         gregs.regs.sp = regs.sp;
2428         gregs.regs.pc = regs.pc;
2429 
2430         self.set_regs(cpu_id as u8, &gregs)
2431             .map_err(DebuggableError::WriteRegs)?;
2432 
2433         Ok(())
2434     }
2435 
2436     fn read_mem(
2437         &self,
2438         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2439         cpu_id: usize,
2440         vaddr: GuestAddress,
2441         len: usize,
2442     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2443         let mut buf = vec![0; len];
2444         let mut total_read = 0_u64;
2445 
2446         while total_read < len as u64 {
2447             let gaddr = vaddr.0 + total_read;
2448             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2449                 Ok(paddr) => paddr,
2450                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2451                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2452             };
2453             let psize = arch::PAGE_SIZE as u64;
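             // Clamp each read to the end of the current guest page, since
             // contiguous GVAs may map to discontiguous GPAs.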
2454             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2455             guest_memory
2456                 .memory()
2457                 .read(
2458                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2459                     GuestAddress(paddr),
2460                 )
2461                 .map_err(DebuggableError::ReadMem)?;
2462             total_read += read_len;
2463         }
2464         Ok(buf)
2465     }
2466 
2467     fn write_mem(
2468         &self,
2469         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2470         cpu_id: usize,
2471         vaddr: &GuestAddress,
2472         data: &[u8],
2473     ) -> std::result::Result<(), DebuggableError> {
2474         let mut total_written = 0_u64;
2475 
2476         while total_written < data.len() as u64 {
2477             let gaddr = vaddr.0 + total_written;
2478             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2479                 Ok(paddr) => paddr,
2480                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2481                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2482             };
2483             let psize = arch::PAGE_SIZE as u64;
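             // As in read_mem, split the write at guest-page boundaries.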
2484             let write_len = std::cmp::min(
2485                 data.len() as u64 - total_written,
2486                 psize - (paddr & (psize - 1)),
2487             );
2488             guest_memory
2489                 .memory()
2490                 .write(
2491                     &data[total_written as usize..total_written as usize + write_len as usize],
2492                     GuestAddress(paddr),
2493                 )
2494                 .map_err(DebuggableError::WriteMem)?;
2495             total_written += write_len;
2496         }
2497         Ok(())
2498     }
2499 
2500     fn active_vcpus(&self) -> usize {
2501         self.present_vcpus() as usize
2502     }
2503 }
2504 
2505 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2506 impl Elf64Writable for CpuManager {}
2507 
2508 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2509 impl CpuElf64Writable for CpuManager {
2510     fn cpu_write_elf64_note(
2511         &mut self,
2512         dump_state: &DumpState,
2513     ) -> std::result::Result<(), GuestDebuggableError> {
2514         let mut coredump_file = dump_state.file.as_ref().unwrap();
2515         for vcpu in &self.vcpus {
2516             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2517             let mut pos: usize = 0;
2518             let mut buf = vec![0; note_size as usize];
2519             let descsz = size_of::<X86_64ElfPrStatus>();
2520             let vcpu_id = vcpu.lock().unwrap().id;
2521 
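             // Layout of the note assembled below (a sketch): an Elf64_Nhdr,
             // the 4-byte-aligned owner name "CORE", then the NT_PRSTATUS
             // descriptor, in which pr_pid sits 32 bytes in and the saved
             // user registers come near the end.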
2522             let note = Elf64_Nhdr {
2523                 n_namesz: COREDUMP_NAME_SIZE,
2524                 n_descsz: descsz as u32,
2525                 n_type: NT_PRSTATUS,
2526             };
2527 
2528             let bytes: &[u8] = note.as_slice();
2529             buf.splice(0.., bytes.to_vec());
2530             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2531             buf.resize(pos + 4, 0);
2532             buf.splice(pos.., "CORE".to_string().into_bytes());
2533 
2534             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2535             buf.resize(pos + 32 + 4, 0);
2536             let pid = vcpu_id as u64;
2537             let bytes: &[u8] = pid.as_slice();
2538             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2539 
2540             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2541 
2542             let orig_rax: u64 = 0;
2543             let gregs = self.vcpus[usize::from(vcpu_id)]
2544                 .lock()
2545                 .unwrap()
2546                 .vcpu
2547                 .get_regs()
2548                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2549 
2550             let regs1 = [
2551                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2552                 gregs.r10,
2553             ];
2554             let regs2 = [
2555                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2556             ];
2557 
2558             let sregs = self.vcpus[usize::from(vcpu_id)]
2559                 .lock()
2560                 .unwrap()
2561                 .vcpu
2562                 .get_sregs()
2563                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2564 
2565             debug!(
2566                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2567                 gregs.rip,
2568                 gregs.rsp,
2569                 sregs.gs.base,
2570                 sregs.cs.selector,
2571                 sregs.ss.selector,
2572                 sregs.ds.selector,
2573             );
2574 
2575             let regs = X86_64UserRegs {
2576                 regs1,
2577                 regs2,
2578                 rip: gregs.rip,
2579                 cs: sregs.cs.selector as u64,
2580                 eflags: gregs.rflags,
2581                 rsp: gregs.rsp,
2582                 ss: sregs.ss.selector as u64,
2583                 fs_base: sregs.fs.base,
2584                 gs_base: sregs.gs.base,
2585                 ds: sregs.ds.selector as u64,
2586                 es: sregs.es.selector as u64,
2587                 fs: sregs.fs.selector as u64,
2588                 gs: sregs.gs.selector as u64,
2589             };
2590 
2592             let bytes: &[u8] = regs.as_slice();
2593             buf.resize(note_size as usize, 0);
2594             buf.splice(pos.., bytes.to_vec());
2595             buf.resize(note_size as usize, 0);
2596 
2597             coredump_file
2598                 .write(&buf)
2599                 .map_err(GuestDebuggableError::CoredumpFile)?;
2600         }
2601 
2602         Ok(())
2603     }
2604 
2605     fn cpu_write_vmm_note(
2606         &mut self,
2607         dump_state: &DumpState,
2608     ) -> std::result::Result<(), GuestDebuggableError> {
2609         let mut coredump_file = dump_state.file.as_ref().unwrap();
2610         for vcpu in &self.vcpus {
2611             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2612             let mut pos: usize = 0;
2613             let mut buf = vec![0; note_size as usize];
2614             let descsz = size_of::<DumpCpusState>();
2615             let vcpu_id = vcpu.lock().unwrap().id;
2616 
2617             let note = Elf64_Nhdr {
2618                 n_namesz: COREDUMP_NAME_SIZE,
2619                 n_descsz: descsz as u32,
2620                 n_type: 0,
2621             };
2622 
2623             let bytes: &[u8] = note.as_slice();
2624             buf.splice(0.., bytes.to_vec());
2625             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2626 
2627             buf.resize(pos + 4, 0);
2628             buf.splice(pos.., "QEMU".to_string().into_bytes());
2629 
2630             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2631 
2632             let gregs = self.vcpus[usize::from(vcpu_id)]
2633                 .lock()
2634                 .unwrap()
2635                 .vcpu
2636                 .get_regs()
2637                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2638 
2639             let regs1 = [
2640                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2641                 gregs.rbp,
2642             ];
2643 
2644             let regs2 = [
2645                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2646                 gregs.r15,
2647             ];
2648 
2649             let sregs = self.vcpus[usize::from(vcpu_id)]
2650                 .lock()
2651                 .unwrap()
2652                 .vcpu
2653                 .get_sregs()
2654                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2655 
2656             let mut msrs = vec![MsrEntry {
2657                 index: msr_index::MSR_KERNEL_GS_BASE,
2658                 ..Default::default()
2659             }];
2660 
2661             self.vcpus[vcpu_id as usize]
2662                 .lock()
2663                 .unwrap()
2664                 .vcpu
2665                 .get_msrs(&mut msrs)
2666                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2667             let kernel_gs_base = msrs[0].data;
2668 
2669             let cs = CpuSegment::new(sregs.cs);
2670             let ds = CpuSegment::new(sregs.ds);
2671             let es = CpuSegment::new(sregs.es);
2672             let fs = CpuSegment::new(sregs.fs);
2673             let gs = CpuSegment::new(sregs.gs);
2674             let ss = CpuSegment::new(sregs.ss);
2675             let ldt = CpuSegment::new(sregs.ldt);
2676             let tr = CpuSegment::new(sregs.tr);
2677             let gdt = CpuSegment::new_from_table(sregs.gdt);
2678             let idt = CpuSegment::new_from_table(sregs.idt);
2679             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2680             let regs = DumpCpusState {
2681                 version: 1,
2682                 size: size_of::<DumpCpusState>() as u32,
2683                 regs1,
2684                 regs2,
2685                 rip: gregs.rip,
2686                 rflags: gregs.rflags,
2687                 cs,
2688                 ds,
2689                 es,
2690                 fs,
2691                 gs,
2692                 ss,
2693                 ldt,
2694                 tr,
2695                 gdt,
2696                 idt,
2697                 cr,
2698                 kernel_gs_base,
2699             };
2700 
2701             let bytes: &[u8] = regs.as_slice();
2702             buf.resize(note_size as usize, 0);
2703             buf.splice(pos.., bytes.to_vec());
2704             buf.resize(note_size as usize, 0);
2705 
2706             coredump_file
2707                 .write(&buf)
2708                 .map_err(GuestDebuggableError::CoredumpFile)?;
2709         }
2710 
2711         Ok(())
2712     }
2713 }
2714 
2715 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2716 #[cfg(test)]
2717 mod tests {
2718     use arch::layout::BOOT_STACK_POINTER;
2719     use arch::layout::ZERO_PAGE_START;
2720     use arch::x86_64::interrupts::*;
2721     use arch::x86_64::regs::*;
2722     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2723     use linux_loader::loader::bootparam::setup_header;
2724 
2725     #[test]
2726     fn test_setlint() {
2727         let hv = hypervisor::new().unwrap();
2728         let vm = hv.create_vm().expect("new VM fd creation failed");
2729         assert!(hv.check_required_extensions().is_ok());
2730         // Calling get_lapic will fail if there is no irqchip beforehand.
2731         assert!(vm.create_irq_chip().is_ok());
2732         let vcpu = vm.create_vcpu(0, None).unwrap();
2733         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2734 
2735         // Compute the value that is expected to represent LVT0 and LVT1.
2736         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2737         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2738         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2739         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2740 
2741         set_lint(&vcpu).unwrap();
2742 
2743         // Compute the value that represents LVT0 and LVT1 after set_lint.
2744         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2745         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2746         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2747         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2748         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2749     }
2750 
2751     #[test]
2752     fn test_setup_fpu() {
2753         let hv = hypervisor::new().unwrap();
2754         let vm = hv.create_vm().expect("new VM fd creation failed");
2755         let vcpu = vm.create_vcpu(0, None).unwrap();
2756         setup_fpu(&vcpu).unwrap();
2757 
2758         let expected_fpu: FpuState = FpuState {
2759             fcw: 0x37f,
2760             mxcsr: 0x1f80,
2761             ..Default::default()
2762         };
2763         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2764         // TODO: auto-generate KVM-related structures with PartialEq derived.
2765         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2766         // Setting the mxcsr register through FpuState in setup_fpu has no effect
2767         // (see 'kvm_arch_vcpu_ioctl_set_fpu' in arch/x86/kvm/x86.c): mxcsr stays 0,
2768         // so the assertion below would fail. TODO: decide whether the mxcsr check
2769         // should be removed altogether.
2770         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2771     }
2772 
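    // Check that setup_msrs wrote the boot MSRs by reading back MSR_IA32_MISC_ENABLE.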
2773     #[test]
2774     fn test_setup_msrs() {
2775         use hypervisor::arch::x86::{msr_index, MsrEntry};
2776 
2777         let hv = hypervisor::new().unwrap();
2778         let vm = hv.create_vm().expect("new VM fd creation failed");
2779         let vcpu = vm.create_vcpu(0, None).unwrap();
2780         setup_msrs(&vcpu).unwrap();
2781 
2782         // This test checks against the last MSR entry configured (the tenth one).
2783         // See create_msr_entries for details.
2784         let mut msrs = vec![MsrEntry {
2785             index: msr_index::MSR_IA32_MISC_ENABLE,
2786             ..Default::default()
2787         }];
2788 
2789         // get_msrs returns the number of MSRs it succeeded in reading. We only want to
2790         // read one in this test.
2791         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2792         assert_eq!(read_msrs, 1);
2793 
2794         // These are the entries that were set up by setup_msrs. Assert that the tenth
2795         // one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) holds the data
2796         // we expect.
2797         let entry_vec = vcpu.boot_msr_entries();
2798         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2799     }
2800 
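    // PVH boot: rbx must point at the PVH start info structure and rflags must have
    // only its reserved bit (bit 1) set.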
2801     #[test]
2802     fn test_setup_regs_for_pvh() {
2803         let hv = hypervisor::new().unwrap();
2804         let vm = hv.create_vm().expect("new VM fd creation failed");
2805         let vcpu = vm.create_vcpu(0, None).unwrap();
2806 
2807         let expected_regs: StandardRegisters = StandardRegisters {
2808             rflags: 0x0000000000000002u64,
2809             rbx: arch::layout::PVH_INFO_START.0,
2810             rip: 1,
2811             ..Default::default()
2812         };
2813 
2814         setup_regs(
2815             &vcpu,
2816             arch::EntryPoint {
2817                 entry_addr: vm_memory::GuestAddress(expected_regs.rip),
2818                 setup_header: None,
2819             },
2820         )
2821         .unwrap();
2822 
2823         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2824         assert_eq!(actual_regs, expected_regs);
2825     }
2826 
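    // bzImage boot: rsp must sit at BOOT_STACK_POINTER and rsi must point at the
    // zero page (boot_params).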
2827     #[test]
2828     fn test_setup_regs_for_bzimage() {
2829         let hv = hypervisor::new().unwrap();
2830         let vm = hv.create_vm().expect("new VM fd creation failed");
2831         let vcpu = vm.create_vcpu(0, None).unwrap();
2832 
2833         let expected_regs: StandardRegisters = StandardRegisters {
2834             rflags: 0x0000000000000002u64,
2835             rip: 1,
2836             rsp: BOOT_STACK_POINTER.0,
2837             rsi: ZERO_PAGE_START.0,
2838             ..Default::default()
2839         };
2840 
2841         setup_regs(
2842             &vcpu,
2843             arch::EntryPoint {
2844                 entry_addr: vm_memory::GuestAddress(expected_regs.rip),
2845                 setup_header: Some(setup_header {
2846                     ..Default::default()
2847                 }),
2848             },
2849         )
2850         .unwrap();
2851 
2852         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2853         assert_eq!(actual_regs, expected_regs);
2854     }
2855 }
2856 
2857 #[cfg(target_arch = "aarch64")]
2858 #[cfg(test)]
2859 mod tests {
2860     use arch::{aarch64::regs, layout};
2861     use hypervisor::kvm::aarch64::is_system_register;
2862     use hypervisor::kvm::kvm_bindings::{
2863         kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2864         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2865     };
2866     use hypervisor::{arm64_core_reg_id, offset_of};
2867     use std::mem;
2868 
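    // setup_regs requires an initialized vCPU: it must fail before vcpu_init and
    // succeed afterwards.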
2869     #[test]
2870     fn test_setup_regs() {
2871         let hv = hypervisor::new().unwrap();
2872         let vm = hv.create_vm().unwrap();
2873         let vcpu = vm.create_vcpu(0, None).unwrap();
2874 
2875         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2876         // Must fail because the vCPU is not yet initialized.
2877         assert!(res.is_err());
2878 
2879         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2880         vm.get_preferred_target(&mut kvi).unwrap();
2881         vcpu.vcpu_init(&kvi).unwrap();
2882 
2883         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2884     }
2885 
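    // MPIDR_EL1 of vCPU 0 reads 0x8000_0000: bit 31 is RES1 and all affinity
    // fields are zero.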
2886     #[test]
2887     fn test_read_mpidr() {
2888         let hv = hypervisor::new().unwrap();
2889         let vm = hv.create_vm().unwrap();
2890         let vcpu = vm.create_vcpu(0, None).unwrap();
2891         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2892         vm.get_preferred_target(&mut kvi).unwrap();
2893 
2894         // Must fail because the vCPU is not yet initialized.
2895         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2896 
2897         vcpu.vcpu_init(&kvi).unwrap();
2898         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2899     }
2900 
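    // Core register ids built with arm64_core_reg_id! are not system registers;
    // ids tagged with KVM_REG_ARM64_SYSREG are.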
2901     #[test]
2902     fn test_is_system_register() {
2903         let offset = offset_of!(user_pt_regs, pc);
2904         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2905         assert!(!is_system_register(regid));
2906         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2907         assert!(is_system_register(regid));
2908     }
2909 
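    // Before vcpu_init, getting or setting core registers fails with ENOEXEC;
    // afterwards pstate must read back as 0x3C5 (EL1h with DAIF masked).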
2910     #[test]
2911     fn test_save_restore_core_regs() {
2912         let hv = hypervisor::new().unwrap();
2913         let vm = hv.create_vm().unwrap();
2914         let vcpu = vm.create_vcpu(0, None).unwrap();
2915         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2916         vm.get_preferred_target(&mut kvi).unwrap();
2917 
2918         // Must fail because the vCPU is not yet initialized.
2919         let res = vcpu.get_regs();
2920         assert!(res.is_err());
2921         assert_eq!(
2922             format!("{}", res.unwrap_err()),
2923             "Failed to get core register: Exec format error (os error 8)"
2924         );
2925 
2926         let mut state = kvm_regs::default();
2927         let res = vcpu.set_regs(&state);
2928         assert!(res.is_err());
2929         assert_eq!(
2930             format!("{}", res.unwrap_err()),
2931             "Failed to set core register: Exec format error (os error 8)"
2932         );
2933 
2934         vcpu.vcpu_init(&kvi).unwrap();
2935         let res = vcpu.get_regs();
2936         assert!(res.is_ok());
2937         state = res.unwrap();
2938         assert_eq!(state.regs.pstate, 0x3C5);
2939 
2940         assert!(vcpu.set_regs(&state).is_ok());
2941     }
2942 
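    // The MP state must round-trip: read it and write the same value back.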
2943     #[test]
2944     fn test_get_set_mpstate() {
2945         let hv = hypervisor::new().unwrap();
2946         let vm = hv.create_vm().unwrap();
2947         let vcpu = vm.create_vcpu(0, None).unwrap();
2948         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2949         vm.get_preferred_target(&mut kvi).unwrap();
2950 
2951         let res = vcpu.get_mp_state();
2952         assert!(res.is_ok());
2953         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2954     }
2955 }
2956