// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
#[cfg(feature = "guest_debug")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
#[cfg(target_arch = "x86_64")]
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
use acpi_tables::{aml, sdt::Sdt, Aml};
use anyhow::anyhow;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use arch::aarch64::regs;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::get_x2apic_id;
use arch::EntryPoint;
use arch::NumaNodes;
#[cfg(target_arch = "aarch64")]
use devices::gic::Gic;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use hypervisor::aarch64::StandardRegisters;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(all(target_arch = "aarch64", feature = "kvm"))]
use hypervisor::kvm::kvm_ioctls::Cap;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuVendor;
#[cfg(feature = "kvm")]
use hypervisor::HypervisorType;
use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use std::collections::BTreeMap;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::io::Write;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::BusDevice;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use vm_memory::ByteValued;
#[cfg(feature = "guest_debug")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::AsBytes;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}
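
// A minimal sanity-check sketch for the two bit-extraction helpers above. The
// cfg gate mirrors the macros' own (they only exist on aarch64 with
// "guest_debug"); the expected values follow directly from the macro bodies.
#[cfg(all(test, target_arch = "aarch64", feature = "guest_debug"))]
mod extract_bits_tests {
    #[test]
    fn extracts_expected_bits() {
        // Taking 2 bits starting at offset 1 of 0b0110 yields 0b11 (3).
        assert_eq!(extract_bits_64!(0b0000_0110u64, 1, 2), 0b11);
        // Without an offset, only the low `length` bits are kept.
        assert_eq!(extract_bits_64_without_offset!(0xffu64, 4), 0xf);
    }
}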

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("vCPU still pending removal")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),

    #[error("Maximum number of vCPUs exceeds host limit")]
    MaximumVcpusExceeded,

    #[cfg(feature = "sev_snp")]
    #[error("Failed to set sev control register: {0}")]
    SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "x86_64")]
    #[error("Failed to inject NMI")]
    NmiError(hypervisor::HypervisorCpuError),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct LocalX2Apic {
    pub r#type: u8,
    pub length: u8,
    pub _reserved: u16,
    pub apic_id: u32,
    pub flags: u32,
    pub processor_id: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default, AsBytes)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct ProcessorHierarchyNode {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub flags: u32,
    pub parent: u32,
    pub acpi_processor_id: u32,
    pub num_private_resources: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default, AsBytes)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
// Round `n` up to the next multiple of `d`.
macro_rules! round_up {
    ($n:expr, $d:expr) => {
        ((($n + $d - 1) / $d) * $d)
    };
}
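
// A small check of the alignment helper above, gated like the macro itself.
// The expected values assume the conventional "round `n` up to the next
// multiple of `d`" semantics.
#[cfg(all(test, target_arch = "x86_64", feature = "guest_debug"))]
mod round_up_tests {
    #[test]
    fn rounds_to_next_multiple() {
        assert_eq!(round_up!(12usize, 4usize), 12);
        assert_eq!(round_up!(13usize, 4usize), 16);
    }
}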
322 
323 /// A wrapper around creating and using a kvm-based VCPU.
324 pub struct Vcpu {
325     // The hypervisor abstracted CPU.
326     vcpu: Arc<dyn hypervisor::Vcpu>,
327     id: u8,
328     #[cfg(target_arch = "aarch64")]
329     mpidr: u64,
330     saved_state: Option<CpuState>,
331     #[cfg(target_arch = "x86_64")]
332     vendor: CpuVendor,
333 }
334 
335 impl Vcpu {
336     /// Constructs a new VCPU for `vm`.
337     ///
338     /// # Arguments
339     ///
340     /// * `id` - Represents the CPU number between [0, max vcpus).
341     /// * `vm` - The virtual machine this vcpu will get attached to.
342     /// * `vm_ops` - Optional object for exit handling.
343     /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
344     pub fn new(
345         id: u8,
346         apic_id: u8,
347         vm: &Arc<dyn hypervisor::Vm>,
348         vm_ops: Option<Arc<dyn VmOps>>,
349         #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
350     ) -> Result<Self> {
351         let vcpu = vm
352             .create_vcpu(apic_id, vm_ops)
353             .map_err(|e| Error::VcpuCreate(e.into()))?;
354         // Initially the cpuid per vCPU is the one supported by this VM.
355         Ok(Vcpu {
356             vcpu,
357             id,
358             #[cfg(target_arch = "aarch64")]
359             mpidr: 0,
360             saved_state: None,
361             #[cfg(target_arch = "x86_64")]
362             vendor: cpu_vendor,
363         })
364     }
365 
    /// Configures a vCPU; should be called once per vCPU when it is created.
    ///
    /// # Arguments
    ///
    /// * `boot_setup` - Optional kernel entry point (with boot protocol) and guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
        #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
        #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
                .map_err(Error::VcpuConfiguration)?;
        }
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(
            &self.vcpu,
            self.id,
            boot_setup,
            cpuid,
            kvm_hyperv,
            self.vendor,
            topology,
        )
        .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64 specific vcpu for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();

        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;
        // We already checked that the capability is supported.
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
        if vm
            .as_any()
            .downcast_ref::<hypervisor::kvm::KvmVm>()
            .unwrap()
            .check_extension(Cap::ArmPmuV3)
        {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
        }
        // Non-boot cpus are powered off initially.
        if self.id > 0 {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
        }
        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
    }

    /// Runs the VCPU until it exits, returning the reason.
    ///
    /// Note that the state of the VCPU and associated VM must be set up first for this to do
    /// anything useful.
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }

    #[cfg(feature = "sev_snp")]
    pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
        self.vcpu
            .set_sev_control_register(vmsa_pfn)
            .map_err(Error::SetSevControlRegister)
    }
}
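
// A rough usage sketch for `Vcpu` (hypothetical; error handling elided, and
// `vm`, `vm_ops`, `cpu_vendor`, `boot_setup`, `cpuid` and `topology` are
// assumed to come from the surrounding VMM setup):
//
//     let mut vcpu = Vcpu::new(0, 0, &vm, Some(vm_ops), cpu_vendor)?;
//     vcpu.configure(boot_setup, cpuid, kvm_hyperv, topology)?; // x86_64 form
//     match vcpu.run()? {
//         VmExit::Shutdown => { /* tear the VM down */ }
//         exit => { /* handle the other exit reasons */ }
//     }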

impl Pausable for Vcpu {}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        self.id.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let saved_state = self
            .vcpu
            .state()
            .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;

        self.saved_state = Some(saved_state.clone());

        Ok(Snapshot::from_data(SnapshotData::new_from_state(
            &saved_state,
        )?))
    }
}

pub struct CpuManager {
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg(target_arch = "x86_64")]
    cpuid: Vec<CpuIdEntry>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    vcpus_kick_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    #[cfg(feature = "guest_debug")]
    vm_debug_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vm_ops: Arc<dyn VmOps>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: Option<GuestAddress>,
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
    affinity: BTreeMap<u8, Vec<usize>>,
    dynamic: bool,
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    #[cfg(feature = "sev_snp")]
    sev_snp_enabled: bool,
}

const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;
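
// Guest-visible register layout of the CPU manager device, as implied by the
// constants above and the `BusDevice` implementation below (a summary, not a
// normative description):
//
//   offset 0 (CPU_SELECTION_OFFSET): read/write, id of the selected vCPU
//   offset 4 (CPU_STATUS_OFFSET): status byte for the selected vCPU
//     bit 0 (CPU_ENABLE_FLAG): set when the vCPU is active
//     bit 1 (CPU_INSERTING_FLAG): insertion pending; the guest writes 1 to ack
//     bit 2 (CPU_REMOVING_FLAG): removal pending; the guest writes 1 to ack
//     bit 3 (CPU_EJECT_FLAG): the guest writes 1 to eject the selected vCPU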

impl BusDevice for CpuManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
        data.fill(0);

        match offset {
            CPU_SELECTION_OFFSET => {
                data[0] = self.selected_cpu;
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &self.vcpu_states[usize::from(self.selected_cpu)];
                    if state.active() {
                        data[0] |= 1 << CPU_ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << CPU_INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << CPU_REMOVING_FLAG;
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#x}",
                    offset
                );
            }
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            CPU_SELECTION_OFFSET => {
                self.selected_cpu = data[0];
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
                        && state.inserting
                    {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
                        && state.removing
                    {
                        state.removing = false;
                    }
                    // Trigger removal of vCPU
                    if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
                        if let Err(e) = self.remove_vcpu(self.selected_cpu) {
                            error!("Error removing vCPU: {:?}", e);
                        }
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#x}",
                    offset
                );
            }
        }
        None
    }
}

#[derive(Default)]
struct VcpuState {
    inserting: bool,
    removing: bool,
    pending_removal: Arc<AtomicBool>,
    handle: Option<thread::JoinHandle<()>>,
    kill: Arc<AtomicBool>,
    vcpu_run_interrupted: Arc<AtomicBool>,
    paused: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                // SAFETY: FFI call with correct arguments
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}
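
// Thread-teardown protocol sketch, as used by `remove_vcpu` and `shutdown`
// below: set the `kill` flag, call `signal_thread()` so SIGRTMIN interrupts
// any ioctl the vCPU thread may be blocked in, then `join_thread()` once the
// vCPU loop observes the flag and exits.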

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
            return Err(Error::MaximumVcpusExceeded);
        }

        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
        let hypervisor_type = hypervisor.hypervisor_type();
        #[cfg(target_arch = "x86_64")]
        let cpu_vendor = hypervisor.get_cpu_vendor();

        #[cfg(target_arch = "x86_64")]
        if config.features.amx {
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // SAFETY: the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                let mask: usize = 0;
                // SAFETY: `mask` (not marked mutable, as it is only modified
                // from within the unsafe block, which is permitted) isn't in
                // use elsewhere.
                let result = unsafe {
                    libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
                };
                if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
                    return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
                }
            }
        }

        let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
            let mut cpu_list = Vec::new();
            for (proximity_domain, numa_node) in numa_nodes.iter() {
                for cpu in numa_node.cpus.iter() {
                    cpu_list.push((*cpu, *proximity_domain))
                }
            }
            cpu_list
        }
        .into_iter()
        .collect();

        let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
            cpu_affinity
                .iter()
                .map(|a| (a.vcpu, a.host_cpus.clone()))
                .collect()
        } else {
            BTreeMap::new()
        };

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        Ok(Arc::new(Mutex::new(CpuManager {
            config: config.clone(),
            interrupt_controller: None,
            #[cfg(target_arch = "x86_64")]
            cpuid: Vec::new(),
            vm,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_kick_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
            seccomp_action,
            vm_ops,
            acpi_address: None,
            proximity_domain_per_cpu,
            affinity,
            dynamic,
            hypervisor: hypervisor.clone(),
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        })))
    }

    #[cfg(target_arch = "x86_64")]
    pub fn populate_cpuid(
        &mut self,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx: bool,
    ) -> Result<()> {
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());

        self.cpuid = {
            let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
            arch::generate_common_cpuid(
                hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections,
                    phys_bits,
                    kvm_hyperv: self.config.kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx,
                    amx: self.config.features.amx,
                },
            )
            .map_err(Error::CommonCpuId)?
        };

        Ok(())
    }

    fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        #[cfg(target_arch = "x86_64")]
        let topology = self.get_vcpu_topology();
        #[cfg(target_arch = "x86_64")]
        let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
        #[cfg(target_arch = "aarch64")]
        let x2apic_id = cpu_id as u32;

        let mut vcpu = Vcpu::new(
            cpu_id,
            x2apic_id as u8,
            &self.vm,
            Some(self.vm_ops.clone()),
            #[cfg(target_arch = "x86_64")]
            self.hypervisor.get_cpu_vendor(),
        )?;

        if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after being created.
            #[cfg(target_arch = "aarch64")]
            vcpu.init(&self.vm)?;

            let state: CpuState = snapshot.to_state().map_err(|e| {
                Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
            })?;
            vcpu.vcpu
                .set_state(&state)
                .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;

            vcpu.saved_state = Some(state);
        }

        let vcpu = Arc::new(Mutex::new(vcpu));

        // Adding vCPU to the CpuManager's vCPU list.
        self.vcpus.push(vcpu.clone());

        Ok(vcpu)
    }

    pub fn configure_vcpu(
        &self,
        vcpu: Arc<Mutex<Vcpu>>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
    ) -> Result<()> {
        let mut vcpu = vcpu.lock().unwrap();

        #[cfg(feature = "sev_snp")]
        if self.sev_snp_enabled {
            if let Some((kernel_entry_point, _)) = boot_setup {
                vcpu.set_sev_control_register(
                    kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
                )?;
            }

            // The traditional way of configuring a vCPU doesn't work for
            // SEV-SNP guests: all vCPU configuration is provided via the VMSA.
            return Ok(());
        }

        #[cfg(target_arch = "x86_64")]
        assert!(!self.cpuid.is_empty());

        #[cfg(target_arch = "x86_64")]
        let topology = self.config.topology.clone().map_or_else(
            || Some((1, self.boot_vcpus(), 1)),
            |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
        );
        #[cfg(target_arch = "x86_64")]
        vcpu.configure(
            boot_setup,
            self.cpuid.clone(),
            self.config.kvm_hyperv,
            topology,
        )?;

        #[cfg(target_arch = "aarch64")]
        vcpu.configure(&self.vm, boot_setup)?;

        Ok(())
    }

    /// Only create new vCPUs if there aren't any inactive ones to reuse
    fn create_vcpus(
        &mut self,
        desired_vcpus: u8,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.config.max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        for cpu_id in self.vcpus.len() as u8..desired_vcpus {
            vcpus.push(self.create_vcpu(
                cpu_id,
                // TODO: The special format of the CPU id can be removed once
                // ready to break live upgrade.
                snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
            )?);
        }

        Ok(vcpus)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn init_pmu(&self, irq: u32) -> Result<bool> {
        for cpu in self.vcpus.iter() {
            let cpu = cpu.lock().unwrap();
            // Check if the PMU attribute is available; if not, log it.
            if cpu.vcpu.has_pmu_support() {
                cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
            } else {
                debug!(
                    "PMU attribute is not supported in vCPU{}, skip PMU init!",
                    cpu.id
                );
                return Ok(false);
            }
        }

        Ok(true)
    }

    pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
        self.vcpus.clone()
    }

    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_id: u8,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        #[cfg(feature = "kvm")]
        let hypervisor_type = self.hypervisor.hypervisor_type();
        #[cfg(feature = "guest_debug")]
        let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
        let panic_exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
        let vcpu_kick_signalled = self.vcpus_kick_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
            .vcpu_run_interrupted
            .clone();
        let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
        let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();

        // Prepare the CPU set the current vCPU is expected to run on.
        let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
            // SAFETY: all zeros is a valid pattern
            let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
            // SAFETY: FFI call, trivially safe
            unsafe { libc::CPU_ZERO(&mut cpuset) };
            for host_cpu in host_cpus {
                // SAFETY: FFI call, trivially safe
                unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
            }
            cpuset
        });

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter = get_seccomp_filter(
            &self.seccomp_action,
            Thread::Vcpu,
            self.hypervisor.hypervisor_type(),
        )
        .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        info!("Starting vCPU: cpu_id = {}", vcpu_id);

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{vcpu_id}"))
                .spawn(move || {
                    // Schedule the thread to run on the expected CPU set
                    if let Some(cpuset) = cpuset.as_ref() {
                        // SAFETY: FFI call with correct arguments
                        let ret = unsafe {
                            libc::sched_setaffinity(
                                0,
                                std::mem::size_of::<libc::cpu_set_t>(),
                                cpuset as *const libc::cpu_set_t,
                            )
                        };

                        if ret != 0 {
                            error!(
                                "Failed scheduling the vCPU {} on the expected CPU set: {}",
                                vcpu_id,
                                io::Error::last_os_error()
                            );
                            return;
                        }
                    }

                    // Apply seccomp filter for vcpu thread.
                    if !vcpu_seccomp_filter.is_empty() {
                        if let Err(e) =
                            apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                        {
                            error!("Error applying seccomp filter: {:?}", e);
                            return;
                        }
                    }
                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // This registers an async-signal-safe, no-op handler so
                    // that pthread_kill can interrupt the vCPU run loop.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");
                    // Block until all CPUs are ready.
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unparking the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                // As a pause can be caused by PIO & MMIO exits, we need to ensure they are
                                // completed by returning to KVM_RUN. From the kernel docs:
                                //
                                // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                                // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                                // operations are complete (and guest state is consistent) only after userspace
                                // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
                                // incomplete operations and then check for pending signals.
                                // The pending state of the operation is not preserved in state which is
                                // visible to userspace, thus userspace should ensure that the operation is
                                // completed before performing a live migration.  Userspace can re-enter the
                                // guest with an unmasked signal pending or with the immediate_exit field set
                                // to complete pending operations without allowing any further instructions
                                // to be executed.

                                #[cfg(feature = "kvm")]
                                if matches!(hypervisor_type, HypervisorType::Kvm) {
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                    if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                        error!("Unexpected VM exit on \"immediate_exit\" run");
                                        break;
                                    }
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                                }

                                vcpu_run_interrupted.store(true, Ordering::SeqCst);

                                vcpu_paused.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            if vcpu_kick_signalled.load(Ordering::SeqCst) {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                #[cfg(target_arch = "x86_64")]
                                match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
                                    Ok(()) => {},
                                    Err(e) => {
                                        error!("Error injecting NMI: {}", e);
                                        break;
                                    }
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            #[cfg(feature = "tdx")]
                            let mut vcpu = vcpu.lock().unwrap();
                            #[cfg(not(feature = "tdx"))]
                            let vcpu = vcpu.lock().unwrap();
                            // vcpu.run() exits with VmExit::Reset on a triple fault, so trigger a reset
                            match vcpu.run() {
                                Ok(run) => match run {
                                    #[cfg(feature = "kvm")]
                                    VmExit::Debug => {
                                        info!("VmExit::Debug");
                                        #[cfg(feature = "guest_debug")]
                                        {
                                            vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                            let raw_tid = get_raw_tid(vcpu_id as usize);
                                            vm_debug_evt.write(raw_tid as u64).unwrap();
                                        }
                                    }
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
                                        {
                                            interrupt_controller
                                                .lock()
                                                .unwrap()
                                                .end_of_interrupt(vector);
                                        }
                                    }
                                    VmExit::Ignore => {}
                                    VmExit::Hyperv => {}
                                    VmExit::Reset => {
                                        info!("VmExit::Reset");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        reset_evt.write(1).unwrap();
                                        break;
                                    }
                                    VmExit::Shutdown => {
                                        info!("VmExit::Shutdown");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                    #[cfg(feature = "tdx")]
                                    VmExit::Tdx => {
                                        if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
                                            match vcpu.get_tdx_exit_details() {
                                                Ok(details) => match details {
                                                    TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
                                                    TdxExitDetails::SetupEventNotifyInterrupt => {
                                                        warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
                                                    }
                                                },
                                                Err(e) => error!("Unexpected TDX VMCALL: {}", e),
                                            }
                                            vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
                                        } else {
                                            // We should never reach this point:
                                            // getting here would mean the design
                                            // of this code is wrong.
                                            unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
                                        }
                                    }
                                },

                                Err(e) => {
                                    error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                    vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                    exit_evt.write(1).unwrap();
                                    break;
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }
                        }
                    })
                    .or_else(|_| {
                        panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
                        error!("vCPU thread panicked");
                        panic_exit_evt.write(1)
                    })
                    .ok();
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // Calls into this function on hotplug have entry_point set to None. It
        // is for those hotplug CPU additions that we need to set the inserting
        // flag.
        self.vcpu_states[usize::from(vcpu_id)].handle = handle;
        self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
    fn activate_vcpus(
        &mut self,
        desired_vcpus: u8,
        inserting: bool,
        paused: Option<bool>,
    ) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        if let Some(paused) = paused {
            self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
        }

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus(),
            self.vcpus_pause_signalled.load(Ordering::SeqCst)
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for vcpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
            self.vcpu_states[usize::from(cpu_id)]
                .pending_removal
                .store(true, Ordering::SeqCst);
        }
    }

    pub fn check_pending_removed_vcpu(&mut self) -> bool {
        for state in self.vcpu_states.iter() {
            if state.active() && state.pending_removal.load(Ordering::SeqCst) {
                return true;
            }
        }
        false
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);
        state.pending_removal.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(
        &mut self,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), snapshot)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
            .map_err(|e| {
                Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
            })?;

        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        if self.check_pending_removed_vcpu() {
            return Err(Error::VcpuPendingRemovedVcpu);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                let vcpus = self.create_vcpus(desired_vcpus, None)?;
                for vcpu in vcpus {
                    self.configure_vcpu(vcpu, None)?
                }
                self.activate_vcpus(desired_vcpus, true, None)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }
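
    // Hotplug flow sketch, as implied by the methods above: a `resize` request
    // either creates, configures and activates additional vCPUs, or marks the
    // surplus ones for removal; the guest then acknowledges via ACPI, and the
    // eject lands in `BusDevice::write` (CPU_EJECT_FLAG), which calls
    // `remove_vcpu`.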
1307 
1308     pub fn shutdown(&mut self) -> Result<()> {
1309         // Tell the vCPUs to stop themselves next time they go through the loop
1310         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1311 
1312         // Toggle the vCPUs pause boolean
1313         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1314 
1315         // Unpark all the VCPU threads.
1316         for state in self.vcpu_states.iter() {
1317             state.unpark_thread();
1318         }
1319 
1320         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1321         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1322         // above.
1323         for state in self.vcpu_states.iter() {
1324             state.signal_thread();
1325         }
1326 
1327         // Wait for all the threads to finish. This removes the state from the vector.
1328         for mut state in self.vcpu_states.drain(..) {
1329             state.join_thread()?;
1330         }
1331 
1332         Ok(())
1333     }
1334 
1335     #[cfg(feature = "tdx")]
1336     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1337         for vcpu in &self.vcpus {
1338             vcpu.lock()
1339                 .unwrap()
1340                 .vcpu
1341                 .tdx_init(hob_address)
1342                 .map_err(Error::InitializeTdx)?;
1343         }
1344         Ok(())
1345     }
1346 
1347     pub fn boot_vcpus(&self) -> u8 {
1348         self.config.boot_vcpus
1349     }
1350 
1351     pub fn max_vcpus(&self) -> u8 {
1352         self.config.max_vcpus
1353     }
1354 
1355     #[cfg(target_arch = "x86_64")]
1356     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1357         assert!(!self.cpuid.is_empty());
1358         self.cpuid.clone()
1359     }
1360 
1361     fn present_vcpus(&self) -> u8 {
1362         self.vcpu_states
1363             .iter()
1364             .fold(0, |acc, state| acc + state.active() as u8)
1365     }
1366 
1367     #[cfg(target_arch = "aarch64")]
1368     pub fn get_mpidrs(&self) -> Vec<u64> {
1369         self.vcpus
1370             .iter()
1371             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1372             .collect()
1373     }
1374 
1375     #[cfg(target_arch = "aarch64")]
1376     pub fn get_saved_states(&self) -> Vec<CpuState> {
1377         self.vcpus
1378             .iter()
1379             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1380             .collect()
1381     }
1382 
1383     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1384         self.config
1385             .topology
1386             .clone()
1387             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1388     }
1389 
1390     pub fn create_madt(&self) -> Sdt {
1391         use crate::acpi;
1392         // This is also checked in the commandline parsing.
1393         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1394 
1395         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1396         #[cfg(target_arch = "x86_64")]
1397         {
1398             madt.write(36, arch::layout::APIC_START.0);
1399 
1400             for cpu in 0..self.config.max_vcpus {
1401                 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());
1402 
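                     // MADT flags, as emitted below: bit 0 (Enabled) is set
                     // only for boot vCPUs, while bit 1 (Online Capable) is
                     // set for every entry so that non-boot vCPUs can be
                     // hot-plugged later.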
1403                 let lapic = LocalX2Apic {
1404                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1405                     length: 16,
1406                     processor_id: cpu.into(),
1407                     apic_id: x2apic_id,
1408                     flags: if cpu < self.config.boot_vcpus {
1409                         1 << MADT_CPU_ENABLE_FLAG
1410                     } else {
1411                         0
1412                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1413                     _reserved: 0,
1414                 };
1415                 madt.append(lapic);
1416             }
1417 
1418             madt.append(Ioapic {
1419                 r#type: acpi::ACPI_APIC_IO,
1420                 length: 12,
1421                 ioapic_id: 0,
1422                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1423                 gsi_base: 0,
1424                 ..Default::default()
1425             });
1426 
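             // Editor's note: this override identity-maps legacy ISA IRQ 4
             // (typically the COM1 serial port) to GSI 4, with flags = 0
             // meaning bus-conforming polarity and trigger mode.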
1427             madt.append(InterruptSourceOverride {
1428                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1429                 length: 10,
1430                 bus: 0,
1431                 source: 4,
1432                 gsi: 4,
1433                 flags: 0,
1434             });
1435         }
1436 
1437         #[cfg(target_arch = "aarch64")]
1438         {
1439             /* Notes:
1440              * Ignore the Local Interrupt Controller Address at byte offset 36 of the MADT.
1441              */
1442 
1443             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1444             for cpu in 0..self.config.boot_vcpus {
1445                 let vcpu = &self.vcpus[cpu as usize];
1446                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1447                 /* ARMv8 MPIDR format:
1448                      Bits [63:40] Must be zero
1449                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1450                      Bits [31:24] Must be zero
1451                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1452                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1453                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1454                 */
1455                 let mpidr_mask = 0xff_00ff_ffff;
1456                 let gicc = GicC {
1457                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1458                     length: 80,
1459                     reserved0: 0,
1460                     cpu_interface_number: cpu as u32,
1461                     uid: cpu as u32,
1462                     flags: 1,
1463                     parking_version: 0,
1464                     performance_interrupt: 0,
1465                     parked_address: 0,
1466                     base_address: 0,
1467                     gicv_base_address: 0,
1468                     gich_base_address: 0,
1469                     vgic_interrupt: 0,
1470                     gicr_base_address: 0,
1471                     mpidr: mpidr & mpidr_mask,
1472                     proc_power_effi_class: 0,
1473                     reserved1: 0,
1474                     spe_overflow_interrupt: 0,
1475                 };
1476 
1477                 madt.append(gicc);
1478             }
1479             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1480 
1481             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1482             let gicd = GicD {
1483                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1484                 length: 24,
1485                 reserved0: 0,
1486                 gic_id: 0,
1487                 base_address: vgic_config.dist_addr,
1488                 global_irq_base: 0,
1489                 version: 3,
1490                 reserved1: [0; 3],
1491             };
1492             madt.append(gicd);
1493 
1494             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1495             let gicr = GicR {
1496                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1497                 length: 16,
1498                 reserved: 0,
1499                 base_address: vgic_config.redists_addr,
1500                 range_length: vgic_config.redists_size as u32,
1501             };
1502             madt.append(gicr);
1503 
1504             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1505             let gicits = GicIts {
1506                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1507                 length: 20,
1508                 reserved0: 0,
1509                 translation_id: 0,
1510                 base_address: vgic_config.msi_addr,
1511                 reserved1: 0,
1512             };
1513             madt.append(gicits);
1514 
1515             madt.update_checksum();
1516         }
1517 
1518         madt
1519     }
1520 
1521     #[cfg(target_arch = "aarch64")]
1522     pub fn create_pptt(&self) -> Sdt {
1523         let pptt_start = 0;
1524         let mut cpus = 0;
1525         let mut uid = 0;
1526         // If the topology is not specified, the default is
1527         // 1 package, multiple cores, and 1 thread per core.
1528         // This is also the behavior when the PPTT is missing.
1529         let (threads_per_core, cores_per_package, packages) =
1530             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1531 
1532         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1533 
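         // Editor's note on the hierarchy-node flags used below (per the ACPI
         // PPTT definition): bit 0 = physical package, bit 1 = ACPI processor
         // ID valid, bit 2 = processor is a thread, bit 3 = node is a leaf.
         // Hence 0x2 marks package/core container nodes, 0xE marks thread
         // leaves and 0xA marks single-threaded core leaves.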
1534         for cluster_idx in 0..packages {
1535             if cpus < self.config.boot_vcpus as usize {
1536                 let cluster_offset = pptt.len() - pptt_start;
1537                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1538                     r#type: 0,
1539                     length: 20,
1540                     reserved: 0,
1541                     flags: 0x2,
1542                     parent: 0,
1543                     acpi_processor_id: cluster_idx as u32,
1544                     num_private_resources: 0,
1545                 };
1546                 pptt.append(cluster_hierarchy_node);
1547 
1548                 for core_idx in 0..cores_per_package {
1549                     let core_offset = pptt.len() - pptt_start;
1550 
1551                     if threads_per_core > 1 {
1552                         let core_hierarchy_node = ProcessorHierarchyNode {
1553                             r#type: 0,
1554                             length: 20,
1555                             reserved: 0,
1556                             flags: 0x2,
1557                             parent: cluster_offset as u32,
1558                             acpi_processor_id: core_idx as u32,
1559                             num_private_resources: 0,
1560                         };
1561                         pptt.append(core_hierarchy_node);
1562 
1563                         for _thread_idx in 0..threads_per_core {
1564                             let thread_hierarchy_node = ProcessorHierarchyNode {
1565                                 r#type: 0,
1566                                 length: 20,
1567                                 reserved: 0,
1568                                 flags: 0xE,
1569                                 parent: core_offset as u32,
1570                                 acpi_processor_id: uid as u32,
1571                                 num_private_resources: 0,
1572                             };
1573                             pptt.append(thread_hierarchy_node);
1574                             uid += 1;
1575                         }
1576                     } else {
1577                         let thread_hierarchy_node = ProcessorHierarchyNode {
1578                             r#type: 0,
1579                             length: 20,
1580                             reserved: 0,
1581                             flags: 0xA,
1582                             parent: cluster_offset as u32,
1583                             acpi_processor_id: uid as u32,
1584                             num_private_resources: 0,
1585                         };
1586                         pptt.append(thread_hierarchy_node);
1587                         uid += 1;
1588                     }
1589                 }
1590                 cpus += (cores_per_package * threads_per_core) as usize;
1591             }
1592         }
1593 
1594         pptt.update_checksum();
1595         pptt
1596     }
1597 
1598     #[cfg(feature = "guest_debug")]
1599     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1600         self.vcpus[usize::from(cpu_id)]
1601             .lock()
1602             .unwrap()
1603             .vcpu
1604             .get_regs()
1605             .map_err(Error::CpuDebug)
1606     }
1607 
1608     #[cfg(feature = "guest_debug")]
1609     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1610         self.vcpus[usize::from(cpu_id)]
1611             .lock()
1612             .unwrap()
1613             .vcpu
1614             .set_regs(regs)
1615             .map_err(Error::CpuDebug)
1616     }
1617 
1618     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1619     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1620         self.vcpus[usize::from(cpu_id)]
1621             .lock()
1622             .unwrap()
1623             .vcpu
1624             .get_sregs()
1625             .map_err(Error::CpuDebug)
1626     }
1627 
1628     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1629     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1630         self.vcpus[usize::from(cpu_id)]
1631             .lock()
1632             .unwrap()
1633             .vcpu
1634             .set_sregs(sregs)
1635             .map_err(Error::CpuDebug)
1636     }
1637 
1638     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1639     fn translate_gva(
1640         &self,
1641         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1642         cpu_id: u8,
1643         gva: u64,
1644     ) -> Result<u64> {
1645         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1646             .lock()
1647             .unwrap()
1648             .vcpu
1649             .translate_gva(gva, /* flags: unused */ 0)
1650             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1651         Ok(gpa)
1652     }
1653 
1654     ///
1655     /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
1656     /// it in the VMM by walking the guest's translation tables.
1657     ///
1658     /// Address translation is a big topic; here we only cover the scenario that
1659     /// arises in the VMM while debugging the kernel. This `translate_gva`
1660     /// implementation is restricted to:
1661     /// - Exception Level 1
1662     /// - The high address range only (kernel space)
1663     ///
1664     /// This implementation supports the following Armv8-A features related to
1665     /// address translation:
1666     /// - FEAT_LPA
1667     /// - FEAT_LVA
1668     /// - FEAT_LPA2
1669     ///
1670     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1671     fn translate_gva(
1672         &self,
1673         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1674         cpu_id: u8,
1675         gva: u64,
1676     ) -> Result<u64> {
1677         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1678             .lock()
1679             .unwrap()
1680             .vcpu
1681             .get_sys_reg(regs::TCR_EL1)
1682             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1683         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1684             .lock()
1685             .unwrap()
1686             .vcpu
1687             .get_sys_reg(regs::TTBR1_EL1)
1688             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1689         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1690             .lock()
1691             .unwrap()
1692             .vcpu
1693             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1694             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1695 
1696         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1697         // or low (0x000xxx...).
1698         let high_range = extract_bits_64!(gva, 55, 1);
1699         if high_range == 0 {
1700             info!("VA (0x{:x}) range is not supported!", gva);
1701             return Ok(gva);
1702         }
1703 
1704         // High range size offset (TCR_EL1.T1SZ)
1705         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1706         // Granule size (TCR_EL1.TG1)
1707         let tg = extract_bits_64!(tcr_el1, 30, 2);
1708         // 48-bit (0) or 52-bit (1) translation for FEAT_LPA2 (TCR_EL1.DS)
1709         let ds = extract_bits_64!(tcr_el1, 59, 1);
1710 
1711         if tsz == 0 {
1712             info!("VA translation is not ready!");
1713             return Ok(gva);
1714         }
1715 
1716         // VA size is determined by TCR_EL1.T1SZ
1717         let va_size = 64 - tsz;
1718         // Number of bits in VA consumed in each level of translation
1719         let stride = match tg {
1720             3 => 13, // 64KB granule size
1721             1 => 11, // 16KB granule size
1722             _ => 9,  // 4KB, default
1723         };
1724         // Starting level of walking
1725         let mut level = 4 - (va_size - 4) / stride;
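         // Illustrative example (hypothetical values): with a 4KB granule
         // (stride = 9) and T1SZ = 16, va_size = 48 and the walk starts at
         // level 4 - (48 - 4) / 9 = 0, i.e. a full four-level walk.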
1726 
1727         // Determine the PA or IPA size
1728         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1729         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1730         // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should
1731         // match. To be safe, we use the minimum value if they differ.
1732         let pa_range = std::cmp::min(tcr_ips, pa_range);
1733         // PA size in bits
1734         let pa_size = match pa_range {
1735             0 => 32,
1736             1 => 36,
1737             2 => 40,
1738             3 => 42,
1739             4 => 44,
1740             5 => 48,
1741             6 => 52,
1742             _ => {
1743                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1744                     "PA range not supported {pa_range}"
1745                 ))))
1746             }
1747         };
1748 
1749         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1750         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1751         // If FEAT_LPA2 is present, the translation table descriptor holds
1752         // 50 bits of the next-level table address.
1753         // Otherwise, it holds 48 bits.
1754         let descaddrmask = if ds == 1 {
1755             !0u64 >> (64 - 50) // mask with 50 least significant bits
1756         } else {
1757             !0u64 >> (64 - 48) // mask with 48 least significant bits
1758         };
1759         let descaddrmask = descaddrmask & !indexmask_grainsize;
1760 
1761         // Translation table base address
1762         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1763         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1764         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1765         if pa_size == 52 {
1766             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1767         }
1768 
1769         // Loop through tables of each level
1770         loop {
1771             // Table offset for current level
1772             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1773             descaddr |= table_offset;
1774             descaddr &= !7u64;
1775 
1776             let mut buf = [0; 8];
1777             guest_memory
1778                 .memory()
1779                 .read(&mut buf, GuestAddress(descaddr))
1780                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1781             let descriptor = u64::from_le_bytes(buf);
1782 
1783             descaddr = descriptor & descaddrmask;
1784             // In the case of FEAT_LPA, the next-level translation table address
1785             // bits [48:51] come from bits [12:15] of the current descriptor.
1786             // For FEAT_LPA2, the next-level translation table address
1787             // bits [50:51] come from bits [8:9] of the current descriptor, while
1788             // bits [48:49] come from bits [48:49] of the descriptor, which were
1789             // handled previously.
1790             if pa_size == 52 {
1791                 if ds == 1 {
1792                     // FEAT_LPA2
1793                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1794                 } else {
1795                     // FEAT_LPA
1796                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1797                 }
1798             }
1799 
1800             if (descriptor & 2) != 0 && (level < 3) {
1801                 // This is a table entry. Go down to next level.
1802                 level += 1;
1803                 indexmask = indexmask_grainsize;
1804                 continue;
1805             }
1806 
1807             break;
1808         }
1809 
1810         // We have reached either:
1811         // - a page entry at level 3 or
1812         // - a block entry at level 1 or 2
1813         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1814         descaddr &= !(page_size - 1);
1815         descaddr |= gva & (page_size - 1);
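         // Illustrative example: a level-3 entry with a 4KB granule gives
         // page_size = 1 << (9 * (4 - 3) + 3) = 4096, so the low 12 bits of
         // the GVA are carried over as the page offset.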
1816 
1817         Ok(descaddr)
1818     }
1819 
1820     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1821         self.acpi_address = Some(acpi_address);
1822     }
1823 
1824     pub(crate) fn set_interrupt_controller(
1825         &mut self,
1826         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1827     ) {
1828         self.interrupt_controller = Some(interrupt_controller);
1829     }
1830 
1831     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1832         &self.vcpus_kill_signalled
1833     }
1834 
1835     #[cfg(feature = "igvm")]
1836     pub(crate) fn get_cpuid_leaf(
1837         &self,
1838         cpu_id: u8,
1839         eax: u32,
1840         ecx: u32,
1841         xfem: u64,
1842         xss: u64,
1843     ) -> Result<[u32; 4]> {
1844         let leaf_info = self.vcpus[usize::from(cpu_id)]
1845             .lock()
1846             .unwrap()
1847             .vcpu
1848             .get_cpuid_values(eax, ecx, xfem, xss)
1849             .unwrap();
1850         Ok(leaf_info)
1851     }
1852 
1853     #[cfg(feature = "sev_snp")]
1854     pub(crate) fn sev_snp_enabled(&self) -> bool {
1855         self.sev_snp_enabled
1856     }
1857 
1858     pub(crate) fn nmi(&self) -> Result<()> {
1859         self.vcpus_kick_signalled.store(true, Ordering::SeqCst);
1860 
1861         for state in self.vcpu_states.iter() {
1862             state.signal_thread();
1863         }
1864 
1865         self.vcpus_kick_signalled.store(false, Ordering::SeqCst);
1866 
1867         Ok(())
1868     }
1869 }
1870 
1871 struct Cpu {
1872     cpu_id: u8,
1873     proximity_domain: u32,
1874     dynamic: bool,
1875     #[cfg(target_arch = "x86_64")]
1876     topology: Option<(u8, u8, u8)>,
1877 }
1878 
1879 #[cfg(target_arch = "x86_64")]
1880 const MADT_CPU_ENABLE_FLAG: usize = 0;
1881 
1882 #[cfg(target_arch = "x86_64")]
1883 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1884 
1885 impl Cpu {
1886     #[cfg(target_arch = "x86_64")]
1887     fn generate_mat(&self) -> Vec<u8> {
1888         let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);
1889 
1890         let lapic = LocalX2Apic {
1891             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1892             length: 16,
1893             processor_id: self.cpu_id.into(),
1894             apic_id: x2apic_id,
1895             flags: 1 << MADT_CPU_ENABLE_FLAG,
1896             _reserved: 0,
1897         };
1898 
1899         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1900         // SAFETY: mat_data is large enough to hold lapic
1901         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
1902 
1903         mat_data
1904     }
1905 }
1906 
1907 impl Aml for Cpu {
1908     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1909         #[cfg(target_arch = "x86_64")]
1910         let mat_data: Vec<u8> = self.generate_mat();
1911         #[allow(clippy::if_same_then_else)]
1912         if self.dynamic {
1913             aml::Device::new(
1914                 format!("C{:03X}", self.cpu_id).as_str().into(),
1915                 vec![
1916                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1917                     &aml::Name::new("_UID".into(), &self.cpu_id),
1918                     // Currently, AArch64 cannot support the following fields.
1919                     /*
1920                     _STA return value:
1921                     Bit [0] – Set if the device is present.
1922                     Bit [1] – Set if the device is enabled and decoding its resources.
1923                     Bit [2] – Set if the device should be shown in the UI.
1924                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1925                     Bit [4] – Set if the battery is present.
1926                     Bits [31:5] – Reserved (must be cleared).
1927                     */
1928                     #[cfg(target_arch = "x86_64")]
1929                     &aml::Method::new(
1930                         "_STA".into(),
1931                         0,
1932                         false,
1933                         // Call into the CSTA method, which interrogates the device
1934                         vec![&aml::Return::new(&aml::MethodCall::new(
1935                             "CSTA".into(),
1936                             vec![&self.cpu_id],
1937                         ))],
1938                     ),
1939                     &aml::Method::new(
1940                         "_PXM".into(),
1941                         0,
1942                         false,
1943                         vec![&aml::Return::new(&self.proximity_domain)],
1944                     ),
1945                     // The Linux kernel expects every CPU device to have a _MAT entry
1946                     // containing the LAPIC for this processor, with the enabled bit set,
1947                     // even if it is disabled in the MADT (non-boot CPU).
1948                     #[cfg(target_arch = "x86_64")]
1949                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1950                     // Trigger CPU ejection
1951                     #[cfg(target_arch = "x86_64")]
1952                     &aml::Method::new(
1953                         "_EJ0".into(),
1954                         1,
1955                         false,
1956                         // Call into the CEJ0 method, which actually ejects the device
1957                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1958                     ),
1959                 ],
1960             )
1961             .to_aml_bytes(sink);
1962         } else {
1963             aml::Device::new(
1964                 format!("C{:03X}", self.cpu_id).as_str().into(),
1965                 vec![
1966                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1967                     &aml::Name::new("_UID".into(), &self.cpu_id),
1968                     #[cfg(target_arch = "x86_64")]
1969                     &aml::Method::new(
1970                         "_STA".into(),
1971                         0,
1972                         false,
1973                         // Mark the CPU as present; see the CSTA implementation
1974                         vec![&aml::Return::new(&0xfu8)],
1975                     ),
1976                     &aml::Method::new(
1977                         "_PXM".into(),
1978                         0,
1979                         false,
1980                         vec![&aml::Return::new(&self.proximity_domain)],
1981                     ),
1982                     // The Linux kernel expects every CPU device to have a _MAT entry
1983                     // containing the LAPIC for this processor with the enabled bit set
1984                     // even it if is disabled in the MADT (non-boot CPU)
1985                     #[cfg(target_arch = "x86_64")]
1986                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1987                 ],
1988             )
1989             .to_aml_bytes(sink);
1990         }
1991     }
1992 }
1993 
1994 struct CpuNotify {
1995     cpu_id: u8,
1996 }
1997 
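     // Editor's sketch: each CpuNotify entry roughly corresponds to the
     // following ASL (a hand-written approximation for a hypothetical
     // cpu_id of 3, not emitted verbatim by the code below):
     //
     //     If (Arg0 == 3) {
     //         Notify (C003, Arg1)
     //     }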
1998 impl Aml for CpuNotify {
1999     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2000         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
2001         aml::If::new(
2002             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
2003             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2004         )
2005         .to_aml_bytes(sink)
2006     }
2007 }
2008 
2009 struct CpuMethods {
2010     max_vcpus: u8,
2011     dynamic: bool,
2012 }
2013 
2014 impl Aml for CpuMethods {
2015     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2016         if self.dynamic {
2017             // CPU status method
2018             aml::Method::new(
2019                 "CSTA".into(),
2020                 1,
2021                 true,
2022                 vec![
2023                     // Take lock defined above
2024                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2025                     // Write CPU number (in first argument) to I/O port via field
2026                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2027                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2028                     // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for the meaning of that value)
2029                     &aml::If::new(
2030                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
2031                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2032                     ),
2033                     // Release lock
2034                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2035                     // Return 0 or 0xf
2036                     &aml::Return::new(&aml::Local(0)),
2037                 ],
2038             )
2039             .to_aml_bytes(sink);
2040 
2041             let mut cpu_notifies = Vec::new();
2042             for cpu_id in 0..self.max_vcpus {
2043                 cpu_notifies.push(CpuNotify { cpu_id });
2044             }
2045 
2046             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
2047             for cpu_id in 0..self.max_vcpus {
2048                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
2049             }
2050 
2051             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
2052 
2053             aml::Method::new(
2054                 "CEJ0".into(),
2055                 1,
2056                 true,
2057                 vec![
2058                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2059                     // Write CPU number (in first argument) to I/O port via field
2060                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2061                     // Set CEJ0 bit
2062                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
2063                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2064                 ],
2065             )
2066             .to_aml_bytes(sink);
2067 
2068             aml::Method::new(
2069                 "CSCN".into(),
2070                 0,
2071                 true,
2072                 vec![
2073                     // Take lock defined above
2074                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2075                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2076                     &aml::While::new(
2077                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
2078                         vec![
2079                             // Write the CPU number (from Local0) to the CSEL selection field
2080                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
2081                             // Check if CINS bit is set
2082                             &aml::If::new(
2083                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
2084                                 // Notify device if it is
2085                                 vec![
2086                                     &aml::MethodCall::new(
2087                                         "CTFY".into(),
2088                                         vec![&aml::Local(0), &aml::ONE],
2089                                     ),
2090                                     // Reset CINS bit
2091                                     &aml::Store::new(
2092                                         &aml::Path::new("\\_SB_.PRES.CINS"),
2093                                         &aml::ONE,
2094                                     ),
2095                                 ],
2096                             ),
2097                             // Check if CRMV bit is set
2098                             &aml::If::new(
2099                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
2100                                 // Notify device if it is (with the eject constant 0x3)
2101                                 vec![
2102                                     &aml::MethodCall::new(
2103                                         "CTFY".into(),
2104                                         vec![&aml::Local(0), &3u8],
2105                                     ),
2106                                     // Reset CRMV bit
2107                                     &aml::Store::new(
2108                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
2109                                         &aml::ONE,
2110                                     ),
2111                                 ],
2112                             ),
2113                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2114                         ],
2115                     ),
2116                     // Release lock
2117                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2118                 ],
2119             )
2120             .to_aml_bytes(sink)
2121         } else {
2122             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2123         }
2124     }
2125 }
2126 
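     // Editor's sketch: the CSCN method generated above amounts roughly to
     // the following ASL (a hand-written approximation, not emitted
     // verbatim):
     //
     //     Method (CSCN, 0, Serialized) {
     //         Acquire (\_SB.PRES.CPLK, 0xFFFF)
     //         Local0 = Zero
     //         While (Local0 < max_vcpus) {
     //             \_SB.PRES.CSEL = Local0
     //             If (\_SB.PRES.CINS == One) {
     //                 CTFY (Local0, One)   // device-check notification
     //                 \_SB.PRES.CINS = One // reset (per the comments above)
     //             }
     //             If (\_SB.PRES.CRMV == One) {
     //                 CTFY (Local0, 3)     // eject-request notification
     //                 \_SB.PRES.CRMV = One
     //             }
     //             Local0 += One
     //         }
     //         Release (\_SB.PRES.CPLK)
     //     }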
2127 impl Aml for CpuManager {
2128     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2129         #[cfg(target_arch = "x86_64")]
2130         if let Some(acpi_address) = self.acpi_address {
2131             // CPU hotplug controller
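                 // Editor's note: the two Field declarations below define the
                 // controller's MMIO layout: bytes 0-3 of the region hold the
                 // CSEL selector (dword access; the byte-access field reserves
                 // them), byte 4 carries the CPEN/CINS/CRMV/CEJ0 bits, byte 5
                 // is CCMD, and bytes 8-11 are CDAT.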
2132             aml::Device::new(
2133                 "_SB_.PRES".into(),
2134                 vec![
2135                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2136                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2137                     // Mutex to protect concurrent access, as we write to select a CPU and then read back its status
2138                     &aml::Mutex::new("CPLK".into(), 0),
2139                     &aml::Name::new(
2140                         "_CRS".into(),
2141                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2142                             aml::AddressSpaceCacheable::NotCacheable,
2143                             true,
2144                             acpi_address.0,
2145                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2146                             None,
2147                         )]),
2148                     ),
2149                     // OpRegion and Fields map the MMIO range into individual field values
2150                     &aml::OpRegion::new(
2151                         "PRST".into(),
2152                         aml::OpRegionSpace::SystemMemory,
2153                         &(acpi_address.0 as usize),
2154                         &CPU_MANAGER_ACPI_SIZE,
2155                     ),
2156                     &aml::Field::new(
2157                         "PRST".into(),
2158                         aml::FieldAccessType::Byte,
2159                         aml::FieldLockRule::NoLock,
2160                         aml::FieldUpdateRule::WriteAsZeroes,
2161                         vec![
2162                             aml::FieldEntry::Reserved(32),
2163                             aml::FieldEntry::Named(*b"CPEN", 1),
2164                             aml::FieldEntry::Named(*b"CINS", 1),
2165                             aml::FieldEntry::Named(*b"CRMV", 1),
2166                             aml::FieldEntry::Named(*b"CEJ0", 1),
2167                             aml::FieldEntry::Reserved(4),
2168                             aml::FieldEntry::Named(*b"CCMD", 8),
2169                         ],
2170                     ),
2171                     &aml::Field::new(
2172                         "PRST".into(),
2173                         aml::FieldAccessType::DWord,
2174                         aml::FieldLockRule::NoLock,
2175                         aml::FieldUpdateRule::Preserve,
2176                         vec![
2177                             aml::FieldEntry::Named(*b"CSEL", 32),
2178                             aml::FieldEntry::Reserved(32),
2179                             aml::FieldEntry::Named(*b"CDAT", 32),
2180                         ],
2181                     ),
2182                 ],
2183             )
2184             .to_aml_bytes(sink);
2185         }
2186 
2187         // CPU devices
2188         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2189         let cid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2190         // Bundle methods together under a common object
2191         let methods = CpuMethods {
2192             max_vcpus: self.config.max_vcpus,
2193             dynamic: self.dynamic,
2194         };
2195         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &cid, &methods];
2196 
2197         #[cfg(target_arch = "x86_64")]
2198         let topology = self.get_vcpu_topology();
2199         let mut cpu_devices = Vec::new();
2200         for cpu_id in 0..self.config.max_vcpus {
2201             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2202             let cpu_device = Cpu {
2203                 cpu_id,
2204                 proximity_domain,
2205                 dynamic: self.dynamic,
2206                 #[cfg(target_arch = "x86_64")]
2207                 topology,
2208             };
2209 
2210             cpu_devices.push(cpu_device);
2211         }
2212 
2213         for cpu_device in cpu_devices.iter() {
2214             cpu_data_inner.push(cpu_device);
2215         }
2216 
2217         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2218     }
2219 }
2220 
2221 impl Pausable for CpuManager {
2222     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2223         // Tell the vCPUs to pause themselves next time they exit
2224         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2225 
2226         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2227         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2228         // above.
2229         for state in self.vcpu_states.iter() {
2230             state.signal_thread();
2231         }
2232 
2233         for vcpu in self.vcpus.iter() {
2234             let mut vcpu = vcpu.lock().unwrap();
2235             vcpu.pause()?;
2236             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2237             if !self.config.kvm_hyperv {
2238                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2239                     MigratableError::Pause(anyhow!(
2240                         "Could not notify guest it has been paused {:?}",
2241                         e
2242                     ))
2243                 })?;
2244             }
2245         }
2246 
2247         // The vCPU thread will change its paused state before parking; wait here for
2248         // each activated vCPU to change its state, ensuring it has parked.
2249         for state in self.vcpu_states.iter() {
2250             if state.active() {
2251                 while !state.paused.load(Ordering::SeqCst) {
2252                     // To avoid a priority inversion with the vCPU thread
2253                     thread::sleep(std::time::Duration::from_millis(1));
2254                 }
2255             }
2256         }
2257 
2258         Ok(())
2259     }
2260 
2261     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2262         for vcpu in self.vcpus.iter() {
2263             vcpu.lock().unwrap().resume()?;
2264         }
2265 
2266         // Clear the vCPUs pause flag
2267         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2268 
2269         // Unpark all the vCPU threads.
2270         // Once unparked, the next thing they do is check the pause
2271         // boolean. Since it is now false, they will exit their pause loop
2272         // and re-enter the guest.
2273         for state in self.vcpu_states.iter() {
2274             state.paused.store(false, Ordering::SeqCst);
2275             state.unpark_thread();
2276         }
2277         Ok(())
2278     }
2279 }
2280 
2281 impl Snapshottable for CpuManager {
2282     fn id(&self) -> String {
2283         CPU_MANAGER_SNAPSHOT_ID.to_string()
2284     }
2285 
2286     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2287         let mut cpu_manager_snapshot = Snapshot::default();
2288 
2289         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2290         for vcpu in &self.vcpus {
2291             let mut vcpu = vcpu.lock().unwrap();
2292             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2293         }
2294 
2295         Ok(cpu_manager_snapshot)
2296     }
2297 }
2298 
2299 impl Transportable for CpuManager {}
2300 impl Migratable for CpuManager {}
2301 
2302 #[cfg(feature = "guest_debug")]
2303 impl Debuggable for CpuManager {
2304     #[cfg(feature = "kvm")]
2305     fn set_guest_debug(
2306         &self,
2307         cpu_id: usize,
2308         addrs: &[GuestAddress],
2309         singlestep: bool,
2310     ) -> std::result::Result<(), DebuggableError> {
2311         self.vcpus[cpu_id]
2312             .lock()
2313             .unwrap()
2314             .vcpu
2315             .set_guest_debug(addrs, singlestep)
2316             .map_err(DebuggableError::SetDebug)
2317     }
2318 
2319     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2320         Ok(())
2321     }
2322 
2323     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2324         Ok(())
2325     }
2326 
2327     #[cfg(target_arch = "x86_64")]
2328     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2329         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2330         let gregs = self
2331             .get_regs(cpu_id as u8)
2332             .map_err(DebuggableError::ReadRegs)?;
2333         let regs = [
2334             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
2335             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
2336         ];
2337 
2338         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2339         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2340         let eflags = gregs.rflags as u32;
2341         let rip = gregs.rip;
2342 
2343         // Segment registers: CS, SS, DS, ES, FS, GS
2344         let sregs = self
2345             .get_sregs(cpu_id as u8)
2346             .map_err(DebuggableError::ReadRegs)?;
2347         let segments = X86SegmentRegs {
2348             cs: sregs.cs.selector as u32,
2349             ss: sregs.ss.selector as u32,
2350             ds: sregs.ds.selector as u32,
2351             es: sregs.es.selector as u32,
2352             fs: sregs.fs.selector as u32,
2353             gs: sregs.gs.selector as u32,
2354         };
2355 
2356         // TODO: Add other registers
2357 
2358         Ok(CoreRegs {
2359             regs,
2360             eflags,
2361             rip,
2362             segments,
2363             ..Default::default()
2364         })
2365     }
2366 
2367     #[cfg(target_arch = "aarch64")]
2368     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2369         let gregs = self
2370             .get_regs(cpu_id as u8)
2371             .map_err(DebuggableError::ReadRegs)?;
2372         Ok(CoreRegs {
2373             x: gregs.regs.regs,
2374             sp: gregs.regs.sp,
2375             pc: gregs.regs.pc,
2376             ..Default::default()
2377         })
2378     }
2379 
2380     #[cfg(target_arch = "x86_64")]
2381     fn write_regs(
2382         &self,
2383         cpu_id: usize,
2384         regs: &CoreRegs,
2385     ) -> std::result::Result<(), DebuggableError> {
2386         let orig_gregs = self
2387             .get_regs(cpu_id as u8)
2388             .map_err(DebuggableError::ReadRegs)?;
2389         let gregs = StandardRegisters {
2390             rax: regs.regs[0],
2391             rbx: regs.regs[1],
2392             rcx: regs.regs[2],
2393             rdx: regs.regs[3],
2394             rsi: regs.regs[4],
2395             rdi: regs.regs[5],
2396             rbp: regs.regs[6],
2397             rsp: regs.regs[7],
2398             r8: regs.regs[8],
2399             r9: regs.regs[9],
2400             r10: regs.regs[10],
2401             r11: regs.regs[11],
2402             r12: regs.regs[12],
2403             r13: regs.regs[13],
2404             r14: regs.regs[14],
2405             r15: regs.regs[15],
2406             rip: regs.rip,
2407             // Update the lower 32 bits of rflags.
2408             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2409         };
2410 
2411         self.set_regs(cpu_id as u8, &gregs)
2412             .map_err(DebuggableError::WriteRegs)?;
2413 
2414         // Segment registers: CS, SS, DS, ES, FS, GS
2415         // Since GDB cares only about the selectors, we call get_sregs() first.
2416         let mut sregs = self
2417             .get_sregs(cpu_id as u8)
2418             .map_err(DebuggableError::ReadRegs)?;
2419         sregs.cs.selector = regs.segments.cs as u16;
2420         sregs.ss.selector = regs.segments.ss as u16;
2421         sregs.ds.selector = regs.segments.ds as u16;
2422         sregs.es.selector = regs.segments.es as u16;
2423         sregs.fs.selector = regs.segments.fs as u16;
2424         sregs.gs.selector = regs.segments.gs as u16;
2425 
2426         self.set_sregs(cpu_id as u8, &sregs)
2427             .map_err(DebuggableError::WriteRegs)?;
2428 
2429         // TODO: Add other registers
2430 
2431         Ok(())
2432     }
2433 
2434     #[cfg(target_arch = "aarch64")]
2435     fn write_regs(
2436         &self,
2437         cpu_id: usize,
2438         regs: &CoreRegs,
2439     ) -> std::result::Result<(), DebuggableError> {
2440         let mut gregs = self
2441             .get_regs(cpu_id as u8)
2442             .map_err(DebuggableError::ReadRegs)?;
2443 
2444         gregs.regs.regs = regs.x;
2445         gregs.regs.sp = regs.sp;
2446         gregs.regs.pc = regs.pc;
2447 
2448         self.set_regs(cpu_id as u8, &gregs)
2449             .map_err(DebuggableError::WriteRegs)?;
2450 
2451         Ok(())
2452     }
2453 
2454     fn read_mem(
2455         &self,
2456         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2457         cpu_id: usize,
2458         vaddr: GuestAddress,
2459         len: usize,
2460     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2461         let mut buf = vec![0; len];
2462         let mut total_read = 0_u64;
2463 
2464         while total_read < len as u64 {
2465             let gaddr = vaddr.0 + total_read;
2466             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2467                 Ok(paddr) => paddr,
2468                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2469                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2470             };
2471             let psize = arch::PAGE_SIZE as u64;
2472             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
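                 // Illustrative example: reading 0x20 bytes from a GVA whose
                 // GPA ends in 0xff0 first reads 0x10 bytes up to the page
                 // boundary, then translates the following page before
                 // reading the rest.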
2473             guest_memory
2474                 .memory()
2475                 .read(
2476                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2477                     GuestAddress(paddr),
2478                 )
2479                 .map_err(DebuggableError::ReadMem)?;
2480             total_read += read_len;
2481         }
2482         Ok(buf)
2483     }
2484 
2485     fn write_mem(
2486         &self,
2487         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2488         cpu_id: usize,
2489         vaddr: &GuestAddress,
2490         data: &[u8],
2491     ) -> std::result::Result<(), DebuggableError> {
2492         let mut total_written = 0_u64;
2493 
2494         while total_written < data.len() as u64 {
2495             let gaddr = vaddr.0 + total_written;
2496             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2497                 Ok(paddr) => paddr,
2498                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2499                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2500             };
2501             let psize = arch::PAGE_SIZE as u64;
2502             let write_len = std::cmp::min(
2503                 data.len() as u64 - total_written,
2504                 psize - (paddr & (psize - 1)),
2505             );
2506             guest_memory
2507                 .memory()
2508                 .write(
2509                     &data[total_written as usize..total_written as usize + write_len as usize],
2510                     GuestAddress(paddr),
2511                 )
2512                 .map_err(DebuggableError::WriteMem)?;
2513             total_written += write_len;
2514         }
2515         Ok(())
2516     }
2517 
2518     fn active_vcpus(&self) -> usize {
2519         self.present_vcpus() as usize
2520     }
2521 }
2522 
2523 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2524 impl Elf64Writable for CpuManager {}
2525 
2526 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2527 impl CpuElf64Writable for CpuManager {
2528     fn cpu_write_elf64_note(
2529         &mut self,
2530         dump_state: &DumpState,
2531     ) -> std::result::Result<(), GuestDebuggableError> {
2532         let mut coredump_file = dump_state.file.as_ref().unwrap();
2533         for vcpu in &self.vcpus {
2534             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2535             let mut pos: usize = 0;
2536             let mut buf = vec![0; note_size as usize];
2537             let descsz = size_of::<X86_64ElfPrStatus>();
2538             let vcpu_id = vcpu.lock().unwrap().id;
2539 
2540             let note = Elf64_Nhdr {
2541                 n_namesz: COREDUMP_NAME_SIZE,
2542                 n_descsz: descsz as u32,
2543                 n_type: NT_PRSTATUS,
2544             };
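                 // Editor's note: the note is laid out as an Elf64_Nhdr, then
                 // the 4-byte-aligned name ("CORE"), then the descriptor (an
                 // X86_64ElfPrStatus), with each part padded to a 4-byte
                 // boundary.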
2545 
2546             let bytes: &[u8] = note.as_slice();
2547             buf.splice(0.., bytes.to_vec());
2548             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2549             buf.resize(pos + 4, 0);
2550             buf.splice(pos.., "CORE".to_string().into_bytes());
2551 
2552             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2553             buf.resize(pos + 32 + 4, 0);
2554             let pid = vcpu_id as u64;
2555             let bytes: &[u8] = pid.as_slice();
2556             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2557 
2558             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2559 
2560             let orig_rax: u64 = 0;
2561             let gregs = self.vcpus[usize::from(vcpu_id)]
2562                 .lock()
2563                 .unwrap()
2564                 .vcpu
2565                 .get_regs()
2566                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2567 
2568             let regs1 = [
2569                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2570                 gregs.r10,
2571             ];
2572             let regs2 = [
2573                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2574             ];
2575 
2576             let sregs = self.vcpus[usize::from(vcpu_id)]
2577                 .lock()
2578                 .unwrap()
2579                 .vcpu
2580                 .get_sregs()
2581                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2582 
2583             debug!(
2584                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2585                 gregs.rip,
2586                 gregs.rsp,
2587                 sregs.gs.base,
2588                 sregs.cs.selector,
2589                 sregs.ss.selector,
2590                 sregs.ds.selector,
2591             );
2592 
2593             let regs = X86_64UserRegs {
2594                 regs1,
2595                 regs2,
2596                 rip: gregs.rip,
2597                 cs: sregs.cs.selector as u64,
2598                 eflags: gregs.rflags,
2599                 rsp: gregs.rsp,
2600                 ss: sregs.ss.selector as u64,
2601                 fs_base: sregs.fs.base,
2602                 gs_base: sregs.gs.base,
2603                 ds: sregs.ds.selector as u64,
2604                 es: sregs.es.selector as u64,
2605                 fs: sregs.fs.selector as u64,
2606                 gs: sregs.gs.selector as u64,
2607             };
2608 
2610             let bytes: &[u8] = regs.as_slice();
2611             buf.resize(note_size as usize, 0);
2612             buf.splice(pos.., bytes.to_vec());
2613             buf.resize(note_size as usize, 0);
2614 
2615             coredump_file
2616                 .write(&buf)
2617                 .map_err(GuestDebuggableError::CoredumpFile)?;
2618         }
2619 
2620         Ok(())
2621     }
2622 
2623     fn cpu_write_vmm_note(
2624         &mut self,
2625         dump_state: &DumpState,
2626     ) -> std::result::Result<(), GuestDebuggableError> {
2627         let mut coredump_file = dump_state.file.as_ref().unwrap();
2628         for vcpu in &self.vcpus {
2629             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2630             let mut pos: usize = 0;
2631             let mut buf = vec![0; note_size as usize];
2632             let descsz = size_of::<DumpCpusState>();
2633             let vcpu_id = vcpu.lock().unwrap().id;
2634 
2635             let note = Elf64_Nhdr {
2636                 n_namesz: COREDUMP_NAME_SIZE,
2637                 n_descsz: descsz as u32,
2638                 n_type: 0,
2639             };
2640 
2641             let bytes: &[u8] = note.as_slice();
2642             buf.splice(0.., bytes.to_vec());
2643             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2644 
2645             buf.resize(pos + 4, 0);
2646             buf.splice(pos.., "QEMU".to_string().into_bytes());
2647 
2648             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2649 
2650             let gregs = self.vcpus[usize::from(vcpu_id)]
2651                 .lock()
2652                 .unwrap()
2653                 .vcpu
2654                 .get_regs()
2655                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2656 
2657             let regs1 = [
2658                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2659                 gregs.rbp,
2660             ];
2661 
2662             let regs2 = [
2663                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2664                 gregs.r15,
2665             ];
2666 
2667             let sregs = self.vcpus[usize::from(vcpu_id)]
2668                 .lock()
2669                 .unwrap()
2670                 .vcpu
2671                 .get_sregs()
2672                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2673 
2674             let mut msrs = vec![MsrEntry {
2675                 index: msr_index::MSR_KERNEL_GS_BASE,
2676                 ..Default::default()
2677             }];
2678 
2679             self.vcpus[vcpu_id as usize]
2680                 .lock()
2681                 .unwrap()
2682                 .vcpu
2683                 .get_msrs(&mut msrs)
2684                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2685             let kernel_gs_base = msrs[0].data;
2686 
2687             let cs = CpuSegment::new(sregs.cs);
2688             let ds = CpuSegment::new(sregs.ds);
2689             let es = CpuSegment::new(sregs.es);
2690             let fs = CpuSegment::new(sregs.fs);
2691             let gs = CpuSegment::new(sregs.gs);
2692             let ss = CpuSegment::new(sregs.ss);
2693             let ldt = CpuSegment::new(sregs.ldt);
2694             let tr = CpuSegment::new(sregs.tr);
2695             let gdt = CpuSegment::new_from_table(sregs.gdt);
2696             let idt = CpuSegment::new_from_table(sregs.idt);
2697             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2698             let regs = DumpCpusState {
2699                 version: 1,
2700                 size: size_of::<DumpCpusState>() as u32,
2701                 regs1,
2702                 regs2,
2703                 rip: gregs.rip,
2704                 rflags: gregs.rflags,
2705                 cs,
2706                 ds,
2707                 es,
2708                 fs,
2709                 gs,
2710                 ss,
2711                 ldt,
2712                 tr,
2713                 gdt,
2714                 idt,
2715                 cr,
2716                 kernel_gs_base,
2717             };
2718 
2719             let bytes: &[u8] = regs.as_slice();
2720             buf.resize(note_size as usize, 0);
2721             buf.splice(pos.., bytes.to_vec());
2722             buf.resize(note_size as usize, 0);
2723 
2724             coredump_file
2725                 .write(&buf)
2726                 .map_err(GuestDebuggableError::CoredumpFile)?;
2727         }
2728 
2729         Ok(())
2730     }
2731 }
2732 
2733 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2734 #[cfg(test)]
2735 mod tests {
2736     use arch::layout::BOOT_STACK_POINTER;
2737     use arch::layout::ZERO_PAGE_START;
2738     use arch::x86_64::interrupts::*;
2739     use arch::x86_64::regs::*;
2740     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2741     use linux_loader::loader::bootparam::setup_header;
2742 
2743     #[test]
2744     fn test_setlint() {
2745         let hv = hypervisor::new().unwrap();
2746         let vm = hv.create_vm().expect("new VM fd creation failed");
2747         assert!(hv.check_required_extensions().is_ok());
2748         // Calling get_lapic will fail if there is no irqchip created beforehand.
2749         assert!(vm.create_irq_chip().is_ok());
2750         let vcpu = vm.create_vcpu(0, None).unwrap();
2751         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2752 
2753         // Compute the value that is expected to represent LVT0 and LVT1.
2754         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2755         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2756         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2757         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2758 
2759         set_lint(&vcpu).unwrap();
2760 
2761         // Compute the value that represents LVT0 and LVT1 after set_lint.
2762         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2763         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2764         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2765         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2766         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2767     }
2768 
2769     #[test]
2770     fn test_setup_fpu() {
2771         let hv = hypervisor::new().unwrap();
2772         let vm = hv.create_vm().expect("new VM fd creation failed");
2773         let vcpu = vm.create_vcpu(0, None).unwrap();
2774         setup_fpu(&vcpu).unwrap();
2775 
2776         let expected_fpu: FpuState = FpuState {
2777             fcw: 0x37f,
2778             mxcsr: 0x1f80,
2779             ..Default::default()
2780         };
2781         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2782         // TODO: auto-generate kvm related structures with PartialEq on.
2783         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu has no effect:
        // see 'kvm_arch_vcpu_ioctl_set_fpu' in arch/x86/kvm/x86.c. The mxcsr stays 0,
        // so the assert below would fail. We should decide whether to remove it
        // entirely.
2788         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2789     }
2790 
2791     #[test]
2792     fn test_setup_msrs() {
2793         use hypervisor::arch::x86::{msr_index, MsrEntry};
2794 
2795         let hv = hypervisor::new().unwrap();
2796         let vm = hv.create_vm().expect("new VM fd creation failed");
2797         let vcpu = vm.create_vcpu(0, None).unwrap();
2798         setup_msrs(&vcpu).unwrap();
2799 
2800         // This test will check against the last MSR entry configured (the tenth one).
2801         // See create_msr_entries for details.
2802         let mut msrs = vec![MsrEntry {
2803             index: msr_index::MSR_IA32_MISC_ENABLE,
2804             ..Default::default()
2805         }];
2806 
        // get_msrs returns the number of MSRs it succeeded in reading. We only want
        // to read one in this test.
2809         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2810         assert_eq!(read_msrs, 1);
2811 
        // Official entries that were set up when we called setup_msrs. We need to
        // assert that the tenth one (i.e. the one with index
        // msr_index::MSR_IA32_MISC_ENABLE) has the data we expect.
2815         let entry_vec = vcpu.boot_msr_entries();
2816         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2817     }
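
    // A minimal sketch, not part of the original suite, illustrating the same
    // get_msrs pattern the coredump path relies on: populate only `index` and let
    // the hypervisor fill `data` in place. The expectation that a freshly created
    // vCPU reports a kernel GS base of 0 is an assumption about KVM's reset state.
    #[test]
    fn test_get_kernel_gs_base() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_KERNEL_GS_BASE,
            ..Default::default()
        }];
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);
        // KVM is expected to reset MSR_KERNEL_GS_BASE to 0 on vCPU creation.
        assert_eq!(msrs[0].data, 0);
    }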
2818 
2819     #[test]
2820     fn test_setup_regs_for_pvh() {
2821         let hv = hypervisor::new().unwrap();
2822         let vm = hv.create_vm().expect("new VM fd creation failed");
2823         let vcpu = vm.create_vcpu(0, None).unwrap();
2824 
2825         let expected_regs: StandardRegisters = StandardRegisters {
2826             rflags: 0x0000000000000002u64,
2827             rbx: arch::layout::PVH_INFO_START.0,
2828             rip: 1,
2829             ..Default::default()
2830         };
2831 
2832         setup_regs(
2833             &vcpu,
2834             arch::EntryPoint {
2835                 entry_addr: vm_memory::GuestAddress(expected_regs.rip),
2836                 setup_header: None,
2837             },
2838         )
2839         .unwrap();
2840 
2841         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2842         assert_eq!(actual_regs, expected_regs);
2843     }
2844 
2845     #[test]
2846     fn test_setup_regs_for_bzimage() {
2847         let hv = hypervisor::new().unwrap();
2848         let vm = hv.create_vm().expect("new VM fd creation failed");
2849         let vcpu = vm.create_vcpu(0, None).unwrap();
2850 
2851         let expected_regs: StandardRegisters = StandardRegisters {
2852             rflags: 0x0000000000000002u64,
2853             rip: 1,
2854             rsp: BOOT_STACK_POINTER.0,
2855             rsi: ZERO_PAGE_START.0,
2856             ..Default::default()
2857         };
2858 
2859         setup_regs(
2860             &vcpu,
2861             arch::EntryPoint {
2862                 entry_addr: vm_memory::GuestAddress(expected_regs.rip),
2863                 setup_header: Some(setup_header {
2864                     ..Default::default()
2865                 }),
2866             },
2867         )
2868         .unwrap();
2869 
2870         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2871         assert_eq!(actual_regs, expected_regs);
2872     }
2873 }
2874 
2875 #[cfg(target_arch = "aarch64")]
2876 #[cfg(test)]
2877 mod tests {
2878     use arch::{aarch64::regs, layout};
2879     use hypervisor::kvm::aarch64::is_system_register;
2880     use hypervisor::kvm::kvm_bindings::{
2881         kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2882         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2883     };
2884     use hypervisor::{arm64_core_reg_id, offset_of};
2885     use std::mem;
2886 
2887     #[test]
2888     fn test_setup_regs() {
2889         let hv = hypervisor::new().unwrap();
2890         let vm = hv.create_vm().unwrap();
2891         let vcpu = vm.create_vcpu(0, None).unwrap();
2892 
2893         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
        // Must fail while the vcpu is not yet initialized.
2895         assert!(res.is_err());
2896 
2897         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2898         vm.get_preferred_target(&mut kvi).unwrap();
2899         vcpu.vcpu_init(&kvi).unwrap();
2900 
2901         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2902     }
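
    // A minimal sketch, not part of the original suite: after vcpu_init, setup_regs
    // is expected to point the PC of the boot vcpu at the requested entry address.
    // The entry address (0x4000_0000) is arbitrary, and the `regs.pc` field access
    // assumes the same kvm_regs layout exercised in test_save_restore_core_regs.
    #[test]
    fn test_setup_regs_pc() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        vcpu.setup_regs(0, 0x4000_0000, layout::FDT_START.0).unwrap();
        let state = vcpu.get_regs().unwrap();
        assert_eq!(state.regs.pc, 0x4000_0000);
    }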
2903 
2904     #[test]
2905     fn test_read_mpidr() {
2906         let hv = hypervisor::new().unwrap();
2907         let vm = hv.create_vm().unwrap();
2908         let vcpu = vm.create_vcpu(0, None).unwrap();
2909         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2910         vm.get_preferred_target(&mut kvi).unwrap();
2911 
        // Must fail while the vcpu is not yet initialized.
2913         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2914 
2915         vcpu.vcpu_init(&kvi).unwrap();
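        // 0x80000000: bit 31 of MPIDR_EL1 is RES1, and vCPU 0 has affinity 0.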
2916         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2917     }
2918 
2919     #[test]
2920     fn test_is_system_register() {
2921         let offset = offset_of!(user_pt_regs, pc);
2922         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2923         assert!(!is_system_register(regid));
2924         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2925         assert!(is_system_register(regid));
2926     }
2927 
2928     #[test]
2929     fn test_save_restore_core_regs() {
2930         let hv = hypervisor::new().unwrap();
2931         let vm = hv.create_vm().unwrap();
2932         let vcpu = vm.create_vcpu(0, None).unwrap();
2933         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2934         vm.get_preferred_target(&mut kvi).unwrap();
2935 
        // Must fail while the vcpu is not yet initialized.
2937         let res = vcpu.get_regs();
2938         assert!(res.is_err());
2939         assert_eq!(
2940             format!("{}", res.unwrap_err()),
2941             "Failed to get core register: Exec format error (os error 8)"
2942         );
2943 
2944         let mut state = kvm_regs::default();
2945         let res = vcpu.set_regs(&state);
2946         assert!(res.is_err());
2947         assert_eq!(
2948             format!("{}", res.unwrap_err()),
2949             "Failed to set core register: Exec format error (os error 8)"
2950         );
2951 
2952         vcpu.vcpu_init(&kvi).unwrap();
2953         let res = vcpu.get_regs();
2954         assert!(res.is_ok());
2955         state = res.unwrap();
2956         assert_eq!(state.regs.pstate, 0x3C5);
2957 
2958         assert!(vcpu.set_regs(&state).is_ok());
2959     }
2960 
2961     #[test]
2962     fn test_get_set_mpstate() {
2963         let hv = hypervisor::new().unwrap();
2964         let vm = hv.create_vm().unwrap();
2965         let vcpu = vm.create_vcpu(0, None).unwrap();
2966         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2967         vm.get_preferred_target(&mut kvi).unwrap();
2968 
2969         let res = vcpu.get_mp_state();
2970         assert!(res.is_ok());
2971         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2972     }
2973 }
2974