xref: /cloud-hypervisor/vmm/src/cpu.rs (revision 07d1208dd53a207a65b649b8952780dfd0ca59d9)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::CpusConfig;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use crate::coredump::{
17     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
18     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
19     NT_PRSTATUS,
20 };
21 #[cfg(feature = "guest_debug")]
22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
23 #[cfg(target_arch = "x86_64")]
24 use crate::memory_manager::MemoryManager;
25 use crate::seccomp_filters::{get_seccomp_filter, Thread};
26 #[cfg(target_arch = "x86_64")]
27 use crate::vm::physical_bits;
28 use crate::GuestMemoryMmap;
29 use crate::CPU_MANAGER_SNAPSHOT_ID;
30 use acpi_tables::{aml, sdt::Sdt, Aml};
31 use anyhow::anyhow;
32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
33 use arch::aarch64::regs;
34 use arch::EntryPoint;
35 use arch::NumaNodes;
36 #[cfg(target_arch = "aarch64")]
37 use devices::gic::Gic;
38 use devices::interrupt_controller::InterruptController;
39 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
40 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
41 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
42 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
43 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
44 use hypervisor::aarch64::StandardRegisters;
45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
46 use hypervisor::arch::x86::msr_index;
47 #[cfg(target_arch = "x86_64")]
48 use hypervisor::arch::x86::CpuIdEntry;
49 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
50 use hypervisor::arch::x86::MsrEntry;
51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
52 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
53 #[cfg(target_arch = "aarch64")]
54 use hypervisor::kvm::kvm_bindings;
55 #[cfg(all(target_arch = "aarch64", feature = "kvm"))]
56 use hypervisor::kvm::kvm_ioctls::Cap;
57 #[cfg(feature = "tdx")]
58 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
59 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
60 use libc::{c_void, siginfo_t};
61 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
62 use linux_loader::elf::Elf64_Nhdr;
63 use seccompiler::{apply_filter, SeccompAction};
64 use std::collections::BTreeMap;
65 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
66 use std::io::Write;
67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
68 use std::mem::size_of;
69 use std::os::unix::thread::JoinHandleExt;
70 use std::sync::atomic::{AtomicBool, Ordering};
71 use std::sync::{Arc, Barrier, Mutex};
72 use std::{cmp, io, result, thread};
73 use thiserror::Error;
74 use tracer::trace_scoped;
75 use vm_device::BusDevice;
76 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
77 use vm_memory::ByteValued;
78 #[cfg(feature = "guest_debug")]
79 use vm_memory::{Bytes, GuestAddressSpace};
80 use vm_memory::{GuestAddress, GuestMemoryAtomic};
81 use vm_migration::{
82     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
83     Transportable,
84 };
85 use vmm_sys_util::eventfd::EventFd;
86 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
87 use zerocopy::AsBytes;
88 
89 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
90 /// Extract the specified bits of a 64-bit integer.
91 /// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`,
92 /// following expression should return 3 (`0b11`):
93 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
94 ///
95 macro_rules! extract_bits_64 {
96     ($value: tt, $offset: tt, $length: tt) => {
97         ($value >> $offset) & (!0u64 >> (64 - $length))
98     };
99 }
100 
101 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
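/// Extract the lowest `length` bits of a 64-bit integer, i.e. the same as
/// `extract_bits_64!` with an offset of 0.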
102 macro_rules! extract_bits_64_without_offset {
103     ($value: tt, $length: tt) => {
104         $value & (!0u64 >> (64 - $length))
105     };
106 }
107 
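// Size in bytes of the MMIO region exposed for the ACPI CPU hotplug handshake
// (see the `BusDevice` implementation for `CpuManager` below).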
108 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
109 
110 #[derive(Debug, Error)]
111 pub enum Error {
112     #[error("Error creating vCPU: {0}")]
113     VcpuCreate(#[source] anyhow::Error),
114 
115     #[error("Error running bCPU: {0}")]
116     VcpuRun(#[source] anyhow::Error),
117 
118     #[error("Error spawning vCPU thread: {0}")]
119     VcpuSpawn(#[source] io::Error),
120 
121     #[error("Error generating common CPUID: {0}")]
122     CommonCpuId(#[source] arch::Error),
123 
124     #[error("Error configuring vCPU: {0}")]
125     VcpuConfiguration(#[source] arch::Error),
126 
127     #[error("Still pending removed vcpu")]
128     VcpuPendingRemovedVcpu,
129 
130     #[cfg(target_arch = "aarch64")]
131     #[error("Error fetching preferred target: {0}")]
132     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
133 
134     #[cfg(target_arch = "aarch64")]
135     #[error("Error initialising vCPU: {0}")]
136     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
137 
138     #[error("Failed to join on vCPU threads: {0:?}")]
139     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
140 
141     #[error("Error adding CpuManager to MMIO bus: {0}")]
142     BusError(#[source] vm_device::BusError),
143 
144     #[error("Requested vCPUs exceed maximum")]
145     DesiredVCpuCountExceedsMax,
146 
147     #[error("Cannot create seccomp filter: {0}")]
148     CreateSeccompFilter(#[source] seccompiler::Error),
149 
150     #[error("Cannot apply seccomp filter: {0}")]
151     ApplySeccompFilter(#[source] seccompiler::Error),
152 
153     #[error("Error starting vCPU after restore: {0}")]
154     StartRestoreVcpu(#[source] anyhow::Error),
155 
156     #[error("Unexpected VmExit")]
157     UnexpectedVmExit,
158 
159     #[error("Failed to allocate MMIO address for CpuManager")]
160     AllocateMmmioAddress,
161 
162     #[cfg(feature = "tdx")]
163     #[error("Error initializing TDX: {0}")]
164     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
165 
166     #[cfg(target_arch = "aarch64")]
167     #[error("Error initializing PMU: {0}")]
168     InitPmu(#[source] hypervisor::HypervisorCpuError),
169 
170     #[cfg(feature = "guest_debug")]
171     #[error("Error during CPU debug: {0}")]
172     CpuDebug(#[source] hypervisor::HypervisorCpuError),
173 
174     #[cfg(feature = "guest_debug")]
175     #[error("Error translating virtual address: {0}")]
176     TranslateVirtualAddress(#[source] anyhow::Error),
177 
178     #[cfg(target_arch = "x86_64")]
179     #[error("Error setting up AMX: {0}")]
180     AmxEnable(#[source] anyhow::Error),
181 
182     #[error("Maximum number of vCPUs exceeds host limit")]
183     MaximumVcpusExceeded,
184 }
185 pub type Result<T> = result::Result<T, Error>;
186 
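// Packed ACPI table entry structures. Instances of these are appended to the
// MADT (and, on aarch64, the PPTT) built by `create_madt()`/`create_pptt()`.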
187 #[cfg(target_arch = "x86_64")]
188 #[allow(dead_code)]
189 #[repr(packed)]
190 #[derive(AsBytes)]
191 struct LocalX2Apic {
192     pub r#type: u8,
193     pub length: u8,
194     pub _reserved: u16,
195     pub apic_id: u32,
196     pub flags: u32,
197     pub processor_id: u32,
198 }
199 
200 #[allow(dead_code)]
201 #[repr(packed)]
202 #[derive(Default, AsBytes)]
203 struct Ioapic {
204     pub r#type: u8,
205     pub length: u8,
206     pub ioapic_id: u8,
207     _reserved: u8,
208     pub apic_address: u32,
209     pub gsi_base: u32,
210 }
211 
212 #[cfg(target_arch = "aarch64")]
213 #[allow(dead_code)]
214 #[repr(packed)]
215 #[derive(AsBytes)]
216 struct GicC {
217     pub r#type: u8,
218     pub length: u8,
219     pub reserved0: u16,
220     pub cpu_interface_number: u32,
221     pub uid: u32,
222     pub flags: u32,
223     pub parking_version: u32,
224     pub performance_interrupt: u32,
225     pub parked_address: u64,
226     pub base_address: u64,
227     pub gicv_base_address: u64,
228     pub gich_base_address: u64,
229     pub vgic_interrupt: u32,
230     pub gicr_base_address: u64,
231     pub mpidr: u64,
232     pub proc_power_effi_class: u8,
233     pub reserved1: u8,
234     pub spe_overflow_interrupt: u16,
235 }
236 
237 #[cfg(target_arch = "aarch64")]
238 #[allow(dead_code)]
239 #[repr(packed)]
240 #[derive(AsBytes)]
241 struct GicD {
242     pub r#type: u8,
243     pub length: u8,
244     pub reserved0: u16,
245     pub gic_id: u32,
246     pub base_address: u64,
247     pub global_irq_base: u32,
248     pub version: u8,
249     pub reserved1: [u8; 3],
250 }
251 
252 #[cfg(target_arch = "aarch64")]
253 #[allow(dead_code)]
254 #[repr(packed)]
255 #[derive(AsBytes)]
256 struct GicR {
257     pub r#type: u8,
258     pub length: u8,
259     pub reserved: u16,
260     pub base_address: u64,
261     pub range_length: u32,
262 }
263 
264 #[cfg(target_arch = "aarch64")]
265 #[allow(dead_code)]
266 #[repr(packed)]
267 #[derive(AsBytes)]
268 struct GicIts {
269     pub r#type: u8,
270     pub length: u8,
271     pub reserved0: u16,
272     pub translation_id: u32,
273     pub base_address: u64,
274     pub reserved1: u32,
275 }
276 
277 #[cfg(target_arch = "aarch64")]
278 #[allow(dead_code)]
279 #[repr(packed)]
280 #[derive(AsBytes)]
281 struct ProcessorHierarchyNode {
282     pub r#type: u8,
283     pub length: u8,
284     pub reserved: u16,
285     pub flags: u32,
286     pub parent: u32,
287     pub acpi_processor_id: u32,
288     pub num_private_resources: u32,
289 }
290 
291 #[allow(dead_code)]
292 #[repr(packed)]
293 #[derive(Default, AsBytes)]
294 struct InterruptSourceOverride {
295     pub r#type: u8,
296     pub length: u8,
297     pub bus: u8,
298     pub source: u8,
299     pub gsi: u32,
300     pub flags: u16,
301 }
302 
303 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
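// Round `$n` up to the next multiple of `$d`.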
304 macro_rules! round_up {
305     ($n:expr,$d:expr) => {
306         (($n + $d - 1) / $d) * $d
307     };
308 }
309 
310 /// A wrapper around creating and using a hypervisor-agnostic vCPU.
311 pub struct Vcpu {
312     // The hypervisor abstracted CPU.
313     vcpu: Arc<dyn hypervisor::Vcpu>,
314     id: u8,
315     #[cfg(target_arch = "aarch64")]
316     mpidr: u64,
317     saved_state: Option<CpuState>,
318 }
319 
320 impl Vcpu {
321     /// Constructs a new VCPU for `vm`.
322     ///
323     /// # Arguments
324     ///
325     /// * `id` - Represents the CPU number between [0, max vcpus).
326     /// * `vm` - The virtual machine this vcpu will get attached to.
327     /// * `vm_ops` - Optional object for exit handling.
328     pub fn new(
329         id: u8,
330         vm: &Arc<dyn hypervisor::Vm>,
331         vm_ops: Option<Arc<dyn VmOps>>,
332     ) -> Result<Self> {
333         let vcpu = vm
334             .create_vcpu(id, vm_ops)
335             .map_err(|e| Error::VcpuCreate(e.into()))?;
336         // Initially the cpuid per vCPU is the one supported by this VM.
337         Ok(Vcpu {
338             vcpu,
339             id,
340             #[cfg(target_arch = "aarch64")]
341             mpidr: 0,
342             saved_state: None,
343         })
344     }
345 
346     /// Configures a vCPU. This should be called once per vCPU, right after it is created.
347     ///
348     /// # Arguments
349     ///
350     /// * `boot_setup` - Optional kernel entry point (address and boot protocol) together
351     ///   with the guest memory it refers to.
352     /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
353     pub fn configure(
354         &mut self,
355         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
356         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
357         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
358         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
359     ) -> Result<()> {
360         #[cfg(target_arch = "aarch64")]
361         {
362             self.init(vm)?;
363             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
364                 .map_err(Error::VcpuConfiguration)?;
365         }
366         info!("Configuring vCPU: cpu_id = {}", self.id);
367         #[cfg(target_arch = "x86_64")]
368         arch::configure_vcpu(&self.vcpu, self.id, boot_setup, cpuid, kvm_hyperv)
369             .map_err(Error::VcpuConfiguration)?;
370 
371         Ok(())
372     }
373 
374     /// Gets the MPIDR register value.
375     #[cfg(target_arch = "aarch64")]
376     pub fn get_mpidr(&self) -> u64 {
377         self.mpidr
378     }
379 
380     /// Gets the saved vCPU state.
381     #[cfg(target_arch = "aarch64")]
382     pub fn get_saved_state(&self) -> Option<CpuState> {
383         self.saved_state.clone()
384     }
385 
386     /// Initializes an aarch64 specific vcpu for booting Linux.
387     #[cfg(target_arch = "aarch64")]
388     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
389         let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
390 
391         // This reads back the kernel's preferred target type.
392         vm.get_preferred_target(&mut kvi)
393             .map_err(Error::VcpuArmPreferredTarget)?;
394         // We already checked that the capability is supported.
395         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
396         if vm
397             .as_any()
398             .downcast_ref::<hypervisor::kvm::KvmVm>()
399             .unwrap()
400             .check_extension(Cap::ArmPmuV3)
401         {
402             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
403         }
404         // Non-boot cpus are powered off initially.
405         if self.id > 0 {
406             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
407         }
408         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
409     }
410 
411     /// Runs the VCPU until it exits, returning the reason.
412     ///
413     /// Note that the state of the VCPU and associated VM must be set up first for this to do
414     /// anything useful.
415     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
416         self.vcpu.run()
417     }
418 }
419 
420 impl Pausable for Vcpu {}
421 impl Snapshottable for Vcpu {
422     fn id(&self) -> String {
423         self.id.to_string()
424     }
425 
426     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
427         let saved_state = self
428             .vcpu
429             .state()
430             .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?;
431 
432         self.saved_state = Some(saved_state.clone());
433 
434         Ok(Snapshot::from_data(SnapshotData::new_from_state(
435             &saved_state,
436         )?))
437     }
438 }
439 
440 pub struct CpuManager {
441     hypervisor_type: HypervisorType,
442     config: CpusConfig,
443     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
444     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
445     #[cfg(target_arch = "x86_64")]
446     cpuid: Vec<CpuIdEntry>,
447     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
448     vm: Arc<dyn hypervisor::Vm>,
449     vcpus_kill_signalled: Arc<AtomicBool>,
450     vcpus_pause_signalled: Arc<AtomicBool>,
451     exit_evt: EventFd,
452     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
453     reset_evt: EventFd,
454     #[cfg(feature = "guest_debug")]
455     vm_debug_evt: EventFd,
456     vcpu_states: Vec<VcpuState>,
457     selected_cpu: u8,
458     vcpus: Vec<Arc<Mutex<Vcpu>>>,
459     seccomp_action: SeccompAction,
460     vm_ops: Arc<dyn VmOps>,
461     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
462     acpi_address: Option<GuestAddress>,
463     proximity_domain_per_cpu: BTreeMap<u8, u32>,
464     affinity: BTreeMap<u8, Vec<u8>>,
465     dynamic: bool,
466 }
467 
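// Register layout of the CPU manager MMIO device: byte 0 (CPU_SELECTION_OFFSET)
// selects a vCPU, while byte 4 (CPU_STATUS_OFFSET) is a status/control byte
// whose bits are defined by the flags below. See the `BusDevice` implementation.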
468 const CPU_ENABLE_FLAG: usize = 0;
469 const CPU_INSERTING_FLAG: usize = 1;
470 const CPU_REMOVING_FLAG: usize = 2;
471 const CPU_EJECT_FLAG: usize = 3;
472 
473 const CPU_STATUS_OFFSET: u64 = 4;
474 const CPU_SELECTION_OFFSET: u64 = 0;
475 
476 impl BusDevice for CpuManager {
477     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
478         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
479         data.fill(0);
480 
481         match offset {
482             CPU_SELECTION_OFFSET => {
483                 data[0] = self.selected_cpu;
484             }
485             CPU_STATUS_OFFSET => {
486                 if self.selected_cpu < self.max_vcpus() {
487                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
488                     if state.active() {
489                         data[0] |= 1 << CPU_ENABLE_FLAG;
490                     }
491                     if state.inserting {
492                         data[0] |= 1 << CPU_INSERTING_FLAG;
493                     }
494                     if state.removing {
495                         data[0] |= 1 << CPU_REMOVING_FLAG;
496                     }
497                 } else {
498                     warn!("Out of range vCPU id: {}", self.selected_cpu);
499                 }
500             }
501             _ => {
502                 warn!(
503                     "Unexpected offset for accessing CPU manager device: {:#}",
504                     offset
505                 );
506             }
507         }
508     }
509 
510     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
511         match offset {
512             CPU_SELECTION_OFFSET => {
513                 self.selected_cpu = data[0];
514             }
515             CPU_STATUS_OFFSET => {
516                 if self.selected_cpu < self.max_vcpus() {
517                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
518                     // The ACPI code writes back a 1 to acknowledge the insertion
519                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
520                         && state.inserting
521                     {
522                         state.inserting = false;
523                     }
524                     // Ditto for removal
525                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
526                         && state.removing
527                     {
528                         state.removing = false;
529                     }
530                     // Trigger removal of vCPU
531                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
532                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
533                             error!("Error removing vCPU: {:?}", e);
534                         }
535                     }
536                 } else {
537                     warn!("Out of range vCPU id: {}", self.selected_cpu);
538                 }
539             }
540             _ => {
541                 warn!(
542                     "Unexpected offset for accessing CPU manager device: {:#}",
543                     offset
544                 );
545             }
546         }
547         None
548     }
549 }
550 
551 #[derive(Default)]
552 struct VcpuState {
553     inserting: bool,
554     removing: bool,
555     pending_removal: Arc<AtomicBool>,
556     handle: Option<thread::JoinHandle<()>>,
557     kill: Arc<AtomicBool>,
558     vcpu_run_interrupted: Arc<AtomicBool>,
559     paused: Arc<AtomicBool>,
560 }
561 
562 impl VcpuState {
563     fn active(&self) -> bool {
564         self.handle.is_some()
565     }
566 
567     fn signal_thread(&self) {
568         if let Some(handle) = self.handle.as_ref() {
569             loop {
570                 // SAFETY: FFI call with correct arguments
571                 unsafe {
572                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
573                 }
574                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
575                     break;
576                 } else {
577                     // This is more effective than thread::yield_now() at
578                     // avoiding a priority inversion with the vCPU thread
579                     thread::sleep(std::time::Duration::from_millis(1));
580                 }
581             }
582         }
583     }
584 
585     fn join_thread(&mut self) -> Result<()> {
586         if let Some(handle) = self.handle.take() {
587             handle.join().map_err(Error::ThreadCleanup)?
588         }
589 
590         Ok(())
591     }
592 
593     fn unpark_thread(&self) {
594         if let Some(handle) = self.handle.as_ref() {
595             handle.thread().unpark()
596         }
597     }
598 }
599 
600 impl CpuManager {
601     #[allow(unused_variables)]
602     #[allow(clippy::too_many_arguments)]
603     pub fn new(
604         config: &CpusConfig,
605         vm: Arc<dyn hypervisor::Vm>,
606         exit_evt: EventFd,
607         reset_evt: EventFd,
608         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
609         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
610         seccomp_action: SeccompAction,
611         vm_ops: Arc<dyn VmOps>,
612         #[cfg(feature = "tdx")] tdx_enabled: bool,
613         numa_nodes: &NumaNodes,
614     ) -> Result<Arc<Mutex<CpuManager>>> {
615         if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
616             return Err(Error::MaximumVcpusExceeded);
617         }
618 
619         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
620         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
621         let hypervisor_type = hypervisor.hypervisor_type();
622 
623         #[cfg(target_arch = "x86_64")]
624         if config.features.amx {
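            // arch_prctl() sub-commands and the XSTATE feature bit used to request
            // permission for the guest to use AMX tile data (XTILEDATA).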
625             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
626             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
627             const XFEATURE_XTILEDATA: usize = 18;
628             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
629 
630             // SAFETY: the syscall is only modifying kernel internal
631             // data structures that the kernel is itself expected to safeguard.
632             let amx_tile = unsafe {
633                 libc::syscall(
634                     libc::SYS_arch_prctl,
635                     ARCH_REQ_XCOMP_GUEST_PERM,
636                     XFEATURE_XTILEDATA,
637                 )
638             };
639 
640             if amx_tile != 0 {
641                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
642             } else {
643                 let mask: usize = 0;
644                 // SAFETY: the mask being modified (not marked mutable, as it is only
645                 // modified inside the unsafe block, which is permitted) isn't in use elsewhere.
646                 let result = unsafe {
647                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
648                 };
649                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
650                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
651                 }
652             }
653         }
654 
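        // Build a map from vCPU id to NUMA proximity domain out of the configured
        // NUMA nodes.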
655         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
656             let mut cpu_list = Vec::new();
657             for (proximity_domain, numa_node) in numa_nodes.iter() {
658                 for cpu in numa_node.cpus.iter() {
659                     cpu_list.push((*cpu, *proximity_domain))
660                 }
661             }
662             cpu_list
663         }
664         .into_iter()
665         .collect();
666 
667         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
668             cpu_affinity
669                 .iter()
670                 .map(|a| (a.vcpu, a.host_cpus.clone()))
671                 .collect()
672         } else {
673             BTreeMap::new()
674         };
675 
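        // vCPU hotplug (resize) is not supported together with TDX, so the manager
        // is only marked dynamic when TDX is disabled.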
676         #[cfg(feature = "tdx")]
677         let dynamic = !tdx_enabled;
678         #[cfg(not(feature = "tdx"))]
679         let dynamic = true;
680 
681         Ok(Arc::new(Mutex::new(CpuManager {
682             hypervisor_type,
683             config: config.clone(),
684             interrupt_controller: None,
685             #[cfg(target_arch = "x86_64")]
686             cpuid: Vec::new(),
687             vm,
688             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
689             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
690             vcpu_states,
691             exit_evt,
692             reset_evt,
693             #[cfg(feature = "guest_debug")]
694             vm_debug_evt,
695             selected_cpu: 0,
696             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
697             seccomp_action,
698             vm_ops,
699             acpi_address: None,
700             proximity_domain_per_cpu,
701             affinity,
702             dynamic,
703         })))
704     }
705 
706     #[cfg(target_arch = "x86_64")]
707     pub fn populate_cpuid(
708         &mut self,
709         memory_manager: &Arc<Mutex<MemoryManager>>,
710         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
711         #[cfg(feature = "tdx")] tdx_enabled: bool,
712     ) -> Result<()> {
713         let sgx_epc_sections = memory_manager
714             .lock()
715             .unwrap()
716             .sgx_epc_region()
717             .as_ref()
718             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
719 
720         let topology = self.config.topology.clone().map_or_else(
721             || {
722                 #[cfg(feature = "mshv")]
723                 if matches!(hypervisor.hypervisor_type(), HypervisorType::Mshv) {
724                     return Some((1, self.boot_vcpus(), 1));
725                 }
726                 None
727             },
728             |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
729         );
730 
731         self.cpuid = {
732             let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
733             arch::generate_common_cpuid(
734                 hypervisor,
735                 topology,
736                 sgx_epc_sections,
737                 phys_bits,
738                 self.config.kvm_hyperv,
739                 #[cfg(feature = "tdx")]
740                 tdx_enabled,
741             )
742             .map_err(Error::CommonCpuId)?
743         };
744 
745         Ok(())
746     }
747 
748     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
749         info!("Creating vCPU: cpu_id = {}", cpu_id);
750 
751         let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?;
752 
753         if let Some(snapshot) = snapshot {
754             // AArch64 vCPUs should be initialized after being created.
755             #[cfg(target_arch = "aarch64")]
756             vcpu.init(&self.vm)?;
757 
758             let state: CpuState = snapshot.to_state().map_err(|e| {
759                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
760             })?;
761             vcpu.vcpu
762                 .set_state(&state)
763                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
764 
765             vcpu.saved_state = Some(state);
766         }
767 
768         let vcpu = Arc::new(Mutex::new(vcpu));
769 
770         // Adding vCPU to the CpuManager's vCPU list.
771         self.vcpus.push(vcpu.clone());
772 
773         Ok(vcpu)
774     }
775 
776     pub fn configure_vcpu(
777         &self,
778         vcpu: Arc<Mutex<Vcpu>>,
779         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
780     ) -> Result<()> {
781         let mut vcpu = vcpu.lock().unwrap();
782 
783         #[cfg(target_arch = "x86_64")]
784         assert!(!self.cpuid.is_empty());
785 
786         #[cfg(target_arch = "x86_64")]
787         vcpu.configure(boot_setup, self.cpuid.clone(), self.config.kvm_hyperv)?;
788 
789         #[cfg(target_arch = "aarch64")]
790         vcpu.configure(&self.vm, boot_setup)?;
791 
792         Ok(())
793     }
794 
795     /// Only create new vCPUs if there aren't any inactive ones to reuse
796     fn create_vcpus(
797         &mut self,
798         desired_vcpus: u8,
799         snapshot: Option<Snapshot>,
800     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
801         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
802         info!(
803             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
804             desired_vcpus,
805             self.config.max_vcpus,
806             self.vcpus.len(),
807             self.present_vcpus()
808         );
809 
810         if desired_vcpus > self.config.max_vcpus {
811             return Err(Error::DesiredVCpuCountExceedsMax);
812         }
813 
814         // Only create vCPUs in excess of all the allocated vCPUs.
815         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
816             vcpus.push(self.create_vcpu(
817                 cpu_id,
818                 // TODO: The special format of the CPU id can be removed once
819                 // we are ready to break live-upgrade compatibility.
820                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
821             )?);
822         }
823 
824         Ok(vcpus)
825     }
826 
827     #[cfg(target_arch = "aarch64")]
828     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
829         for cpu in self.vcpus.iter() {
830             let cpu = cpu.lock().unwrap();
831             // Check whether the PMU attribute is available; if not, log it and return early.
832             if cpu.vcpu.has_pmu_support() {
833                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
834             } else {
835                 debug!(
836                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
837                     cpu.id
838                 );
839                 return Ok(false);
840             }
841         }
842 
843         Ok(true)
844     }
845 
846     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
847         self.vcpus.clone()
848     }
849 
850     fn start_vcpu(
851         &mut self,
852         vcpu: Arc<Mutex<Vcpu>>,
853         vcpu_id: u8,
854         vcpu_thread_barrier: Arc<Barrier>,
855         inserting: bool,
856     ) -> Result<()> {
857         let reset_evt = self.reset_evt.try_clone().unwrap();
858         let exit_evt = self.exit_evt.try_clone().unwrap();
859         #[cfg(feature = "kvm")]
860         let hypervisor_type = self.hypervisor_type;
861         #[cfg(feature = "guest_debug")]
862         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
863         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
864         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
865         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
866 
867         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
868         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
869             .vcpu_run_interrupted
870             .clone();
871         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
872         let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();
873 
874         // Prepare the CPU set the current vCPU is expected to run on.
875         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
876             // SAFETY: all zeros is a valid pattern
877             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
878             // SAFETY: FFI call, trivially safe
879             unsafe { libc::CPU_ZERO(&mut cpuset) };
880             for host_cpu in host_cpus {
881                 // SAFETY: FFI call, trivially safe
882                 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) };
883             }
884             cpuset
885         });
886 
887         // Retrieve seccomp filter for vcpu thread
888         let vcpu_seccomp_filter =
889             get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type)
890                 .map_err(Error::CreateSeccompFilter)?;
891 
892         #[cfg(target_arch = "x86_64")]
893         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
894 
895         info!("Starting vCPU: cpu_id = {}", vcpu_id);
896 
897         let handle = Some(
898             thread::Builder::new()
899                 .name(format!("vcpu{vcpu_id}"))
900                 .spawn(move || {
901                     // Schedule the thread to run on the expected CPU set
902                     if let Some(cpuset) = cpuset.as_ref() {
903                         // SAFETY: FFI call with correct arguments
904                         let ret = unsafe {
905                             libc::sched_setaffinity(
906                                 0,
907                                 std::mem::size_of::<libc::cpu_set_t>(),
908                                 cpuset as *const libc::cpu_set_t,
909                             )
910                         };
911 
912                         if ret != 0 {
913                             error!(
914                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
915                                 vcpu_id,
916                                 io::Error::last_os_error()
917                             );
918                             return;
919                         }
920                     }
921 
922                     // Apply seccomp filter for vcpu thread.
923                     if !vcpu_seccomp_filter.is_empty() {
924                         if let Err(e) =
925                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
926                         {
927                             error!("Error applying seccomp filter: {:?}", e);
928                             return;
929                         }
930                     }
931                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
932                     // This uses an async-signal-safe no-op handler; the signal is only used
933                     // to interrupt the vCPU thread out of the hypervisor run call.
933                     register_signal_handler(SIGRTMIN(), handle_signal)
934                         .expect("Failed to register vcpu signal handler");
935                     // Block until all CPUs are ready.
936                     vcpu_thread_barrier.wait();
937 
938                     std::panic::catch_unwind(move || {
939                         loop {
940                             // If we are being told to pause, we park the thread
941                             // until the pause boolean is toggled.
942                             // The resume operation is responsible for toggling
943                             // the boolean and unpark the thread.
944                             // We enter a loop because park() could spuriously
945                             // return. We will then park() again unless the
946                             // pause boolean has been toggled.
947 
948                             // Need to use Ordering::SeqCst as we have multiple
949                             // loads and stores to different atomics and we need
950                             // to see them in a consistent order in all threads
951 
952                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
953                                 // As a pause can be caused by PIO & MMIO exits, we need to ensure they are
954                                 // completed by returning to KVM_RUN. From the kernel docs:
955                                 //
956                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
957                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
958                                 // operations are complete (and guest state is consistent) only after userspace
959                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
960                                 // incomplete operations and then check for pending signals.
961                                 // The pending state of the operation is not preserved in state which is
962                                 // visible to userspace, thus userspace should ensure that the operation is
963                                 // completed before performing a live migration.  Userspace can re-enter the
964                                 // guest with an unmasked signal pending or with the immediate_exit field set
965                                 // to complete pending operations without allowing any further instructions
966                                 // to be executed.
967 
968                                 #[cfg(feature = "kvm")]
969                                 if matches!(hypervisor_type, HypervisorType::Kvm) {
970                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
971                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
972                                         error!("Unexpected VM exit on \"immediate_exit\" run");
973                                         break;
974                                     }
975                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
976                                 }
977 
978                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
979 
980                                 vcpu_paused.store(true, Ordering::SeqCst);
981                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
982                                     thread::park();
983                                 }
984                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
985                             }
986 
987                             // We've been told to terminate
988                             if vcpu_kill_signalled.load(Ordering::SeqCst)
989                                 || vcpu_kill.load(Ordering::SeqCst)
990                             {
991                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
992                                 break;
993                             }
994 
995                             #[cfg(feature = "tdx")]
996                             let mut vcpu = vcpu.lock().unwrap();
997                             #[cfg(not(feature = "tdx"))]
998                             let vcpu = vcpu.lock().unwrap();
999                             // vcpu.run() returns VmExit::Reset on a triple-fault, so trigger a reset
1000                             match vcpu.run() {
1001                                 Ok(run) => match run {
1002                                     #[cfg(feature = "kvm")]
1003                                     VmExit::Debug => {
1004                                         info!("VmExit::Debug");
1005                                         #[cfg(feature = "guest_debug")]
1006                                         {
1007                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
1008                                             let raw_tid = get_raw_tid(vcpu_id as usize);
1009                                             vm_debug_evt.write(raw_tid as u64).unwrap();
1010                                         }
1011                                     }
1012                                     #[cfg(target_arch = "x86_64")]
1013                                     VmExit::IoapicEoi(vector) => {
1014                                         if let Some(interrupt_controller) =
1015                                             &interrupt_controller_clone
1016                                         {
1017                                             interrupt_controller
1018                                                 .lock()
1019                                                 .unwrap()
1020                                                 .end_of_interrupt(vector);
1021                                         }
1022                                     }
1023                                     VmExit::Ignore => {}
1024                                     VmExit::Hyperv => {}
1025                                     VmExit::Reset => {
1026                                         info!("VmExit::Reset");
1027                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1028                                         reset_evt.write(1).unwrap();
1029                                         break;
1030                                     }
1031                                     VmExit::Shutdown => {
1032                                         info!("VmExit::Shutdown");
1033                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1034                                         exit_evt.write(1).unwrap();
1035                                         break;
1036                                     }
1037                                     #[cfg(feature = "tdx")]
1038                                     VmExit::Tdx => {
1039                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1040                                             match vcpu.get_tdx_exit_details() {
1041                                                 Ok(details) => match details {
1042                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1043                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1044                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1045                                                     }
1046                                                 },
1047                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1048                                             }
1049                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1050                                         } else {
1051                                             // We should never reach this code, as
1052                                             // that would mean the design of the
1053                                             // code is wrong.
1054                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1055                                         }
1056                                     }
1057                                     _ => {
1058                                         error!(
1059                                             "VCPU generated error: {:?}",
1060                                             Error::UnexpectedVmExit
1061                                         );
1062                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1063                                         exit_evt.write(1).unwrap();
1064                                         break;
1065                                     }
1066                                 },
1067 
1068                                 Err(e) => {
1069                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1070                                     vcpu_run_interrupted.store(true, Ordering::SeqCst);
1071                                     exit_evt.write(1).unwrap();
1072                                     break;
1073                                 }
1074                             }
1075 
1076                             // We've been told to terminate
1077                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1078                                 || vcpu_kill.load(Ordering::SeqCst)
1079                             {
1080                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1081                                 break;
1082                             }
1083                         }
1084                     })
1085                     .or_else(|_| {
1086                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1087                         error!("vCPU thread panicked");
1088                         panic_exit_evt.write(1)
1089                     })
1090                     .ok();
1091                 })
1092                 .map_err(Error::VcpuSpawn)?,
1093         );
1094 
1095         // On hot plug, calls into this function have no boot entry point. It is for
1096         // those hotplug CPU additions that we need to set the inserting flag.
1097         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1098         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1099 
1100         Ok(())
1101     }
1102 
1103     /// Start up as many vCPU threads as needed to reach `desired_vcpus`
1104     fn activate_vcpus(
1105         &mut self,
1106         desired_vcpus: u8,
1107         inserting: bool,
1108         paused: Option<bool>,
1109     ) -> Result<()> {
1110         if desired_vcpus > self.config.max_vcpus {
1111             return Err(Error::DesiredVCpuCountExceedsMax);
1112         }
1113 
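        // The barrier is sized for every vCPU thread started below plus the current
        // thread, which also waits on it once all threads have been spawned.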
1114         let vcpu_thread_barrier = Arc::new(Barrier::new(
1115             (desired_vcpus - self.present_vcpus() + 1) as usize,
1116         ));
1117 
1118         if let Some(paused) = paused {
1119             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1120         }
1121 
1122         info!(
1123             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1124             desired_vcpus,
1125             self.vcpus.len(),
1126             self.present_vcpus(),
1127             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1128         );
1129 
1130         // This reuses any inactive vCPUs as well as any that were newly created
1131         for vcpu_id in self.present_vcpus()..desired_vcpus {
1132             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1133             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1134         }
1135 
1136         // Unblock all CPU threads.
1137         vcpu_thread_barrier.wait();
1138         Ok(())
1139     }
1140 
1141     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1142         // Mark vCPUs for removal, actual removal happens on ejection
1143         for cpu_id in desired_vcpus..self.present_vcpus() {
1144             self.vcpu_states[usize::from(cpu_id)].removing = true;
1145             self.vcpu_states[usize::from(cpu_id)]
1146                 .pending_removal
1147                 .store(true, Ordering::SeqCst);
1148         }
1149     }
1150 
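    /// Returns true if any active vCPU is still marked for removal but has not
    /// yet been ejected by the guest.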
1151     pub fn check_pending_removed_vcpu(&mut self) -> bool {
1152         for state in self.vcpu_states.iter() {
1153             if state.active() && state.pending_removal.load(Ordering::SeqCst) {
1154                 return true;
1155             }
1156         }
1157         false
1158     }
1159 
1160     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1161         info!("Removing vCPU: cpu_id = {}", cpu_id);
1162         let state = &mut self.vcpu_states[usize::from(cpu_id)];
1163         state.kill.store(true, Ordering::SeqCst);
1164         state.signal_thread();
1165         state.join_thread()?;
1166         state.handle = None;
1167 
1168         // Once the thread has exited, clear the "kill" so that it can be reused
1169         state.kill.store(false, Ordering::SeqCst);
1170         state.pending_removal.store(false, Ordering::SeqCst);
1171 
1172         Ok(())
1173     }
1174 
1175     pub fn create_boot_vcpus(
1176         &mut self,
1177         snapshot: Option<Snapshot>,
1178     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1179         trace_scoped!("create_boot_vcpus");
1180 
1181         self.create_vcpus(self.boot_vcpus(), snapshot)
1182     }
1183 
1184     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1185     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1186         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1187     }
1188 
1189     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1190         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1191             .map_err(|e| {
1192                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1193             })?;
1194 
1195         Ok(())
1196     }
1197 
1198     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1199         if desired_vcpus == self.present_vcpus() {
1200             return Ok(false);
1201         }
1202 
1203         if !self.dynamic {
1204             return Ok(false);
1205         }
1206 
1207         if self.check_pending_removed_vcpu() {
1208             return Err(Error::VcpuPendingRemovedVcpu);
1209         }
1210 
1211         match desired_vcpus.cmp(&self.present_vcpus()) {
1212             cmp::Ordering::Greater => {
1213                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1214                 for vcpu in vcpus {
1215                     self.configure_vcpu(vcpu, None)?
1216                 }
1217                 self.activate_vcpus(desired_vcpus, true, None)?;
1218                 Ok(true)
1219             }
1220             cmp::Ordering::Less => {
1221                 self.mark_vcpus_for_removal(desired_vcpus);
1222                 Ok(true)
1223             }
1224             _ => Ok(false),
1225         }
1226     }
1227 
1228     pub fn shutdown(&mut self) -> Result<()> {
1229         // Tell the vCPUs to stop themselves next time they go through the loop
1230         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1231 
1232         // Toggle the vCPUs pause boolean
1233         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1234 
1235         // Unpark all the VCPU threads.
1236         for state in self.vcpu_states.iter() {
1237             state.unpark_thread();
1238         }
1239 
1240         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1241         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1242         // above.
1243         for state in self.vcpu_states.iter() {
1244             state.signal_thread();
1245         }
1246 
1247         // Wait for all the threads to finish. This removes the state from the vector.
1248         for mut state in self.vcpu_states.drain(..) {
1249             state.join_thread()?;
1250         }
1251 
1252         Ok(())
1253     }
1254 
1255     #[cfg(feature = "tdx")]
1256     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1257         for vcpu in &self.vcpus {
1258             vcpu.lock()
1259                 .unwrap()
1260                 .vcpu
1261                 .tdx_init(hob_address)
1262                 .map_err(Error::InitializeTdx)?;
1263         }
1264         Ok(())
1265     }
1266 
1267     pub fn boot_vcpus(&self) -> u8 {
1268         self.config.boot_vcpus
1269     }
1270 
1271     pub fn max_vcpus(&self) -> u8 {
1272         self.config.max_vcpus
1273     }
1274 
1275     #[cfg(target_arch = "x86_64")]
1276     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1277         assert!(!self.cpuid.is_empty());
1278         self.cpuid.clone()
1279     }
1280 
1281     fn present_vcpus(&self) -> u8 {
1282         self.vcpu_states
1283             .iter()
1284             .fold(0, |acc, state| acc + state.active() as u8)
1285     }
1286 
1287     #[cfg(target_arch = "aarch64")]
1288     pub fn get_mpidrs(&self) -> Vec<u64> {
1289         self.vcpus
1290             .iter()
1291             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1292             .collect()
1293     }
1294 
1295     #[cfg(target_arch = "aarch64")]
1296     pub fn get_saved_states(&self) -> Vec<CpuState> {
1297         self.vcpus
1298             .iter()
1299             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1300             .collect()
1301     }
1302 
1303     #[cfg(target_arch = "aarch64")]
1304     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1305         self.config
1306             .topology
1307             .clone()
1308             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1309     }
1310 
1311     pub fn create_madt(&self) -> Sdt {
1312         use crate::acpi;
1313         // This is also checked in the commandline parsing.
1314         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1315 
1316         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1317         #[cfg(target_arch = "x86_64")]
1318         {
1319             madt.write(36, arch::layout::APIC_START.0);
1320 
1321             for cpu in 0..self.config.max_vcpus {
1322                 let lapic = LocalX2Apic {
1323                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1324                     length: 16,
1325                     processor_id: cpu.into(),
1326                     apic_id: cpu.into(),
1327                     flags: if cpu < self.config.boot_vcpus {
1328                         1 << MADT_CPU_ENABLE_FLAG
1329                     } else {
1330                         0
1331                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1332                     _reserved: 0,
1333                 };
1334                 madt.append(lapic);
1335             }
1336 
1337             madt.append(Ioapic {
1338                 r#type: acpi::ACPI_APIC_IO,
1339                 length: 12,
1340                 ioapic_id: 0,
1341                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1342                 gsi_base: 0,
1343                 ..Default::default()
1344             });
1345 
1346             madt.append(InterruptSourceOverride {
1347                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1348                 length: 10,
1349                 bus: 0,
1350                 source: 4,
1351                 gsi: 4,
1352                 flags: 0,
1353             });
1354         }
1355 
1356         #[cfg(target_arch = "aarch64")]
1357         {
1358             /* Notes:
1359              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1360              */
1361 
1362             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1363             for cpu in 0..self.config.boot_vcpus {
1364                 let vcpu = &self.vcpus[cpu as usize];
1365                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1366                 /* ARMv8 MPIDR format:
1367                      Bits [63:40] Must be zero
1368                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1369                      Bits [31:24] Must be zero
1370                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1371                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1372                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1373                 */
1374                 let mpidr_mask = 0xff_00ff_ffff;
1375                 let gicc = GicC {
1376                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1377                     length: 80,
1378                     reserved0: 0,
1379                     cpu_interface_number: cpu as u32,
1380                     uid: cpu as u32,
1381                     flags: 1,
1382                     parking_version: 0,
1383                     performance_interrupt: 0,
1384                     parked_address: 0,
1385                     base_address: 0,
1386                     gicv_base_address: 0,
1387                     gich_base_address: 0,
1388                     vgic_interrupt: 0,
1389                     gicr_base_address: 0,
1390                     mpidr: mpidr & mpidr_mask,
1391                     proc_power_effi_class: 0,
1392                     reserved1: 0,
1393                     spe_overflow_interrupt: 0,
1394                 };
1395 
1396                 madt.append(gicc);
1397             }
1398             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1399 
1400             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1401             let gicd = GicD {
1402                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1403                 length: 24,
1404                 reserved0: 0,
1405                 gic_id: 0,
1406                 base_address: vgic_config.dist_addr,
1407                 global_irq_base: 0,
1408                 version: 3,
1409                 reserved1: [0; 3],
1410             };
1411             madt.append(gicd);
1412 
1413             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1414             let gicr = GicR {
1415                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1416                 length: 16,
1417                 reserved: 0,
1418                 base_address: vgic_config.redists_addr,
1419                 range_length: vgic_config.redists_size as u32,
1420             };
1421             madt.append(gicr);
1422 
1423             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1424             let gicits = GicIts {
1425                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1426                 length: 20,
1427                 reserved0: 0,
1428                 translation_id: 0,
1429                 base_address: vgic_config.msi_addr,
1430                 reserved1: 0,
1431             };
1432             madt.append(gicits);
1433 
1434             madt.update_checksum();
1435         }
1436 
1437         madt
1438     }
1439 
1440     #[cfg(target_arch = "aarch64")]
1441     pub fn create_pptt(&self) -> Sdt {
1442         let pptt_start = 0;
1443         let mut cpus = 0;
1444         let mut uid = 0;
1445         // If topology is not specified, the default setting is:
1446         // 1 package, multiple cores, 1 thread per core
1447         // This is also the behavior when PPTT is missing.
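             // For example, with 8 maximum vCPUs and no explicit topology this yields
             // (threads_per_core, cores_per_package, packages) = (1, 8, 1).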
1448         let (threads_per_core, cores_per_package, packages) =
1449             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1450 
1451         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1452 
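             // Processor Hierarchy Node flags used below (per the PPTT definition in the
             // ACPI spec): bit 1 = ACPI Processor ID valid, bit 2 = processor is a thread,
             // bit 3 = node is a leaf. Hence 0x2 for package and core nodes, 0xE for thread
             // leaves, and 0xA for core leaves when SMT is not exposed.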
1453         for cluster_idx in 0..packages {
1454             if cpus < self.config.boot_vcpus as usize {
1455                 let cluster_offset = pptt.len() - pptt_start;
1456                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1457                     r#type: 0,
1458                     length: 20,
1459                     reserved: 0,
1460                     flags: 0x2,
1461                     parent: 0,
1462                     acpi_processor_id: cluster_idx as u32,
1463                     num_private_resources: 0,
1464                 };
1465                 pptt.append(cluster_hierarchy_node);
1466 
1467                 for core_idx in 0..cores_per_package {
1468                     let core_offset = pptt.len() - pptt_start;
1469 
1470                     if threads_per_core > 1 {
1471                         let core_hierarchy_node = ProcessorHierarchyNode {
1472                             r#type: 0,
1473                             length: 20,
1474                             reserved: 0,
1475                             flags: 0x2,
1476                             parent: cluster_offset as u32,
1477                             acpi_processor_id: core_idx as u32,
1478                             num_private_resources: 0,
1479                         };
1480                         pptt.append(core_hierarchy_node);
1481 
1482                         for _thread_idx in 0..threads_per_core {
1483                             let thread_hierarchy_node = ProcessorHierarchyNode {
1484                                 r#type: 0,
1485                                 length: 20,
1486                                 reserved: 0,
1487                                 flags: 0xE,
1488                                 parent: core_offset as u32,
1489                                 acpi_processor_id: uid as u32,
1490                                 num_private_resources: 0,
1491                             };
1492                             pptt.append(thread_hierarchy_node);
1493                             uid += 1;
1494                         }
1495                     } else {
1496                         let thread_hierarchy_node = ProcessorHierarchyNode {
1497                             r#type: 0,
1498                             length: 20,
1499                             reserved: 0,
1500                             flags: 0xA,
1501                             parent: cluster_offset as u32,
1502                             acpi_processor_id: uid as u32,
1503                             num_private_resources: 0,
1504                         };
1505                         pptt.append(thread_hierarchy_node);
1506                         uid += 1;
1507                     }
1508                 }
1509                 cpus += (cores_per_package * threads_per_core) as usize;
1510             }
1511         }
1512 
1513         pptt.update_checksum();
1514         pptt
1515     }
1516 
1517     #[cfg(feature = "guest_debug")]
1518     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1519         self.vcpus[usize::from(cpu_id)]
1520             .lock()
1521             .unwrap()
1522             .vcpu
1523             .get_regs()
1524             .map_err(Error::CpuDebug)
1525     }
1526 
1527     #[cfg(feature = "guest_debug")]
1528     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1529         self.vcpus[usize::from(cpu_id)]
1530             .lock()
1531             .unwrap()
1532             .vcpu
1533             .set_regs(regs)
1534             .map_err(Error::CpuDebug)
1535     }
1536 
1537     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1538     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1539         self.vcpus[usize::from(cpu_id)]
1540             .lock()
1541             .unwrap()
1542             .vcpu
1543             .get_sregs()
1544             .map_err(Error::CpuDebug)
1545     }
1546 
1547     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1548     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1549         self.vcpus[usize::from(cpu_id)]
1550             .lock()
1551             .unwrap()
1552             .vcpu
1553             .set_sregs(sregs)
1554             .map_err(Error::CpuDebug)
1555     }
1556 
1557     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1558     fn translate_gva(
1559         &self,
1560         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1561         cpu_id: u8,
1562         gva: u64,
1563     ) -> Result<u64> {
1564         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1565             .lock()
1566             .unwrap()
1567             .vcpu
1568             .translate_gva(gva, /* flags: unused */ 0)
1569             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1570         Ok(gpa)
1571     }
1572 
1573     ///
1574     /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
1575     /// it in the VMM by walking through the translation tables.
1576     ///
1577     /// Address translation is a big topic; here we only focus on the scenario
1578     /// that arises in the VMM while debugging the kernel. This `translate_gva`
1579     /// implementation is restricted to:
1580     /// - Exception Level 1
1581     /// - Translate high address range only (kernel space)
1582     ///
1583     /// This implementation supports the following Armv8-A features related to
1584     /// address translation:
1585     /// - FEAT_LPA
1586     /// - FEAT_LVA
1587     /// - FEAT_LPA2
1588     ///
1589     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1590     fn translate_gva(
1591         &self,
1592         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1593         cpu_id: u8,
1594         gva: u64,
1595     ) -> Result<u64> {
1596         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1597             .lock()
1598             .unwrap()
1599             .vcpu
1600             .get_sys_reg(regs::TCR_EL1)
1601             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1602         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1603             .lock()
1604             .unwrap()
1605             .vcpu
1606             .get_sys_reg(regs::TTBR1_EL1)
1607             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1608         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1609             .lock()
1610             .unwrap()
1611             .vcpu
1612             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1613             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1614 
1615         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1616         // or low (0x000xxx...).
1617         let high_range = extract_bits_64!(gva, 55, 1);
1618         if high_range == 0 {
1619             info!("VA (0x{:x}) range is not supported!", gva);
1620             return Ok(gva);
1621         }
1622 
1623         // High range size offset
1624         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1625         // Granule size
1626         let tg = extract_bits_64!(tcr_el1, 30, 2);
1627         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1628         let ds = extract_bits_64!(tcr_el1, 59, 1);
1629 
1630         if tsz == 0 {
1631             info!("VA translation is not ready!");
1632             return Ok(gva);
1633         }
1634 
1635         // VA size is determined by TCR_EL1.T1SZ
1636         let va_size = 64 - tsz;
1637         // Number of bits in VA consumed in each level of translation
1638         let stride = match tg {
1639             3 => 13, // 64KB granule size
1640             1 => 11, // 16KB granule size
1641             _ => 9,  // 4KB, default
1642         };
1643         // Starting level of walking
1644         let mut level = 4 - (va_size - 4) / stride;
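             // e.g. a 4KB granule (stride = 9) with T1SZ = 16 gives va_size = 48 and a
             // starting level of 4 - (48 - 4) / 9 = 0 (a four-level walk).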
1645 
1646         // Determine the PA or IPA size
1647         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1648         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1649         // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should match.
1650         // To be safe, we use the minimum value if they are different.
1651         let pa_range = std::cmp::min(tcr_ips, pa_range);
1652         // PA size in bits
1653         let pa_size = match pa_range {
1654             0 => 32,
1655             1 => 36,
1656             2 => 40,
1657             3 => 42,
1658             4 => 44,
1659             5 => 48,
1660             6 => 52,
1661             _ => {
1662                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1663                     "PA range not supported {pa_range}"
1664                 ))))
1665             }
1666         };
1667 
1668         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1669         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1670         // If FEAT_LPA2 is present, the translation table descriptor holds
1671         // 50 bits of the table address of the next level.
1672         // Otherwise, it is 48 bits.
1673         let descaddrmask = if ds == 1 {
1674             !0u64 >> (64 - 50) // mask with 50 least significant bits
1675         } else {
1676             !0u64 >> (64 - 48) // mask with 48 least significant bits
1677         };
1678         let descaddrmask = descaddrmask & !indexmask_grainsize;
1679 
1680         // Translation table base address
1681         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1682         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1683         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1684         if pa_size == 52 {
1685             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1686         }
1687 
1688         // Loop through tables of each level
1689         loop {
1690             // Table offset for current level
1691             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1692             descaddr |= table_offset;
1693             descaddr &= !7u64;
1694 
1695             let mut buf = [0; 8];
1696             guest_memory
1697                 .memory()
1698                 .read(&mut buf, GuestAddress(descaddr))
1699                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1700             let descriptor = u64::from_le_bytes(buf);
1701 
1702             descaddr = descriptor & descaddrmask;
1703             // In the case of FEAT_LPA, the next-level translation table address
1704             // bits [48:51] come from bits [12:15] of the current descriptor.
1705             // For FEAT_LPA2, the next-level translation table address
1706             // bits [50:51] come from bits [8:9] of the current descriptor, while
1707             // bits [48:49] come from bits [48:49] of the descriptor handled
1708             // previously.
1709             if pa_size == 52 {
1710                 if ds == 1 {
1711                     // FEAT_LPA2
1712                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1713                 } else {
1714                     // FEAT_LPA
1715                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1716                 }
1717             }
1718 
1719             if (descriptor & 2) != 0 && (level < 3) {
1720                 // This is a table entry. Go down to the next level.
1721                 level += 1;
1722                 indexmask = indexmask_grainsize;
1723                 continue;
1724             }
1725 
1726             break;
1727         }
1728 
1729         // We have reached either:
1730         // - a page entry at level 3 or
1731         // - a block entry at level 1 or 2
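             // With a 4KB granule (stride = 9) this gives 1 << 12 = 4KiB pages at level 3,
             // 1 << 21 = 2MiB blocks at level 2 and 1 << 30 = 1GiB blocks at level 1.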
1732         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1733         descaddr &= !(page_size - 1);
1734         descaddr |= gva & (page_size - 1);
1735 
1736         Ok(descaddr)
1737     }
1738 
1739     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1740         self.acpi_address = Some(acpi_address);
1741     }
1742 
1743     pub(crate) fn set_interrupt_controller(
1744         &mut self,
1745         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1746     ) {
1747         self.interrupt_controller = Some(interrupt_controller);
1748     }
1749 
1750     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1751         &self.vcpus_kill_signalled
1752     }
1753 }
1754 
1755 struct Cpu {
1756     cpu_id: u8,
1757     proximity_domain: u32,
1758     dynamic: bool,
1759 }
1760 
1761 #[cfg(target_arch = "x86_64")]
1762 const MADT_CPU_ENABLE_FLAG: usize = 0;
1763 
1764 #[cfg(target_arch = "x86_64")]
1765 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1766 
1767 impl Cpu {
1768     #[cfg(target_arch = "x86_64")]
1769     fn generate_mat(&self) -> Vec<u8> {
1770         let lapic = LocalX2Apic {
1771             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1772             length: 16,
1773             processor_id: self.cpu_id.into(),
1774             apic_id: self.cpu_id.into(),
1775             flags: 1 << MADT_CPU_ENABLE_FLAG,
1776             _reserved: 0,
1777         };
1778 
1779         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1780         // SAFETY: mat_data is large enough to hold lapic
1781         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
1782 
1783         mat_data
1784     }
1785 }
1786 
1787 impl Aml for Cpu {
1788     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1789         #[cfg(target_arch = "x86_64")]
1790         let mat_data: Vec<u8> = self.generate_mat();
1791         #[allow(clippy::if_same_then_else)]
1792         if self.dynamic {
1793             aml::Device::new(
1794                 format!("C{:03X}", self.cpu_id).as_str().into(),
1795                 vec![
1796                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1797                     &aml::Name::new("_UID".into(), &self.cpu_id),
1798                     // Currently, AArch64 cannot support the following fields.
1799                     /*
1800                     _STA return value:
1801                     Bit [0] – Set if the device is present.
1802                     Bit [1] – Set if the device is enabled and decoding its resources.
1803                     Bit [2] – Set if the device should be shown in the UI.
1804                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1805                     Bit [4] – Set if the battery is present.
1806                     Bits [31:5] – Reserved (must be cleared).
1807                     */
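                         // A value of 0xf (bits [3:0] set) therefore reports the CPU as
                         // present, enabled, visible in the UI and functioning.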
1808                     #[cfg(target_arch = "x86_64")]
1809                     &aml::Method::new(
1810                         "_STA".into(),
1811                         0,
1812                         false,
1813                         // Call into CSTA method which will interrogate device
1814                         vec![&aml::Return::new(&aml::MethodCall::new(
1815                             "CSTA".into(),
1816                             vec![&self.cpu_id],
1817                         ))],
1818                     ),
1819                     &aml::Method::new(
1820                         "_PXM".into(),
1821                         0,
1822                         false,
1823                         vec![&aml::Return::new(&self.proximity_domain)],
1824                     ),
1825                     // The Linux kernel expects every CPU device to have a _MAT entry
1826                     // containing the LAPIC for this processor with the enabled bit set
1827                     // even if it is disabled in the MADT (non-boot CPU)
1828                     #[cfg(target_arch = "x86_64")]
1829                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1830                     // Trigger CPU ejection
1831                     #[cfg(target_arch = "x86_64")]
1832                     &aml::Method::new(
1833                         "_EJ0".into(),
1834                         1,
1835                         false,
1836                         // Call into CEJ0 method which will actually eject device
1837                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1838                     ),
1839                 ],
1840             )
1841             .to_aml_bytes(sink);
1842         } else {
1843             aml::Device::new(
1844                 format!("C{:03X}", self.cpu_id).as_str().into(),
1845                 vec![
1846                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1847                     &aml::Name::new("_UID".into(), &self.cpu_id),
1848                     #[cfg(target_arch = "x86_64")]
1849                     &aml::Method::new(
1850                         "_STA".into(),
1851                         0,
1852                         false,
1853                         // Mark CPU present; see the CSTA implementation
1854                         vec![&aml::Return::new(&0xfu8)],
1855                     ),
1856                     &aml::Method::new(
1857                         "_PXM".into(),
1858                         0,
1859                         false,
1860                         vec![&aml::Return::new(&self.proximity_domain)],
1861                     ),
1862                     // The Linux kernel expects every CPU device to have a _MAT entry
1863                     // containing the LAPIC for this processor with the enabled bit set
1864                     // even if it is disabled in the MADT (non-boot CPU)
1865                     #[cfg(target_arch = "x86_64")]
1866                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1867                 ],
1868             )
1869             .to_aml_bytes(sink);
1870         }
1871     }
1872 }
1873 
1874 struct CpuNotify {
1875     cpu_id: u8,
1876 }
1877 
1878 impl Aml for CpuNotify {
1879     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1880         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
1881         aml::If::new(
1882             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
1883             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1884         )
1885         .to_aml_bytes(sink)
1886     }
1887 }
1888 
1889 struct CpuMethods {
1890     max_vcpus: u8,
1891     dynamic: bool,
1892 }
1893 
1894 impl Aml for CpuMethods {
1895     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1896         if self.dynamic {
1897             // CPU status method
1898             aml::Method::new(
1899                 "CSTA".into(),
1900                 1,
1901                 true,
1902                 vec![
1903                     // Take lock defined above
1904                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1905                     // Write CPU number (in first argument) to the MMIO selection field (CSEL)
1906                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1907                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1908                     // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
1909                     &aml::If::new(
1910                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
1911                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
1912                     ),
1913                     // Release lock
1914                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1915                     // Return 0 or 0xf
1916                     &aml::Return::new(&aml::Local(0)),
1917                 ],
1918             )
1919             .to_aml_bytes(sink);
1920 
1921             let mut cpu_notifies = Vec::new();
1922             for cpu_id in 0..self.max_vcpus {
1923                 cpu_notifies.push(CpuNotify { cpu_id });
1924             }
1925 
1926             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
1927             for cpu_id in 0..self.max_vcpus {
1928                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
1929             }
1930 
1931             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
1932 
1933             aml::Method::new(
1934                 "CEJ0".into(),
1935                 1,
1936                 true,
1937                 vec![
1938                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1939                     // Write CPU number (in first argument) to the MMIO selection field (CSEL)
1940                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1941                     // Set CEJ0 bit
1942                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
1943                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1944                 ],
1945             )
1946             .to_aml_bytes(sink);
1947 
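                 // CSCN scans every possible vCPU slot: for each one it selects the slot via
                 // CSEL, then issues Notify(1) (device check) if CINS is set or Notify(3)
                 // (eject request) if CRMV is set, writing the bit back to acknowledge it.
                 // It is presumably invoked by the VMM's CPU hotplug notification path.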
1948             aml::Method::new(
1949                 "CSCN".into(),
1950                 0,
1951                 true,
1952                 vec![
1953                     // Take lock defined above
1954                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1955                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1956                     &aml::While::new(
1957                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
1958                         vec![
1959                             // Write CPU number (from Local0) to the MMIO selection field (CSEL)
1960                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
1961                             // Check if CINS bit is set
1962                             &aml::If::new(
1963                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
1964                                 // Notify device if it is
1965                                 vec![
1966                                     &aml::MethodCall::new(
1967                                         "CTFY".into(),
1968                                         vec![&aml::Local(0), &aml::ONE],
1969                                     ),
1970                                     // Reset CINS bit
1971                                     &aml::Store::new(
1972                                         &aml::Path::new("\\_SB_.PRES.CINS"),
1973                                         &aml::ONE,
1974                                     ),
1975                                 ],
1976                             ),
1977                             // Check if CRMV bit is set
1978                             &aml::If::new(
1979                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
1980                                 // Notify device if it is (with the eject constant 0x3)
1981                                 vec![
1982                                     &aml::MethodCall::new(
1983                                         "CTFY".into(),
1984                                         vec![&aml::Local(0), &3u8],
1985                                     ),
1986                                     // Reset CRMV bit
1987                                     &aml::Store::new(
1988                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
1989                                         &aml::ONE,
1990                                     ),
1991                                 ],
1992                             ),
1993                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
1994                         ],
1995                     ),
1996                     // Release lock
1997                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1998                 ],
1999             )
2000             .to_aml_bytes(sink)
2001         } else {
2002             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2003         }
2004     }
2005 }
2006 
2007 impl Aml for CpuManager {
2008     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2009         #[cfg(target_arch = "x86_64")]
2010         if let Some(acpi_address) = self.acpi_address {
2011             // CPU hotplug controller
2012             aml::Device::new(
2013                 "_SB_.PRES".into(),
2014                 vec![
2015                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2016                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2017                     // Mutex to protect concurrent access as we write to choose CPU and then read back status
2018                     &aml::Mutex::new("CPLK".into(), 0),
2019                     &aml::Name::new(
2020                         "_CRS".into(),
2021                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2022                             aml::AddressSpaceCacheable::NotCacheable,
2023                             true,
2024                             acpi_address.0,
2025                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2026                             None,
2027                         )]),
2028                     ),
2029                     // OpRegion and Fields map MMIO range into individual field values
2030                     &aml::OpRegion::new(
2031                         "PRST".into(),
2032                         aml::OpRegionSpace::SystemMemory,
2033                         &(acpi_address.0 as usize),
2034                         &CPU_MANAGER_ACPI_SIZE,
2035                     ),
2036                     &aml::Field::new(
2037                         "PRST".into(),
2038                         aml::FieldAccessType::Byte,
2039                         aml::FieldLockRule::NoLock,
2040                         aml::FieldUpdateRule::WriteAsZeroes,
2041                         vec![
2042                             aml::FieldEntry::Reserved(32),
2043                             aml::FieldEntry::Named(*b"CPEN", 1),
2044                             aml::FieldEntry::Named(*b"CINS", 1),
2045                             aml::FieldEntry::Named(*b"CRMV", 1),
2046                             aml::FieldEntry::Named(*b"CEJ0", 1),
2047                             aml::FieldEntry::Reserved(4),
2048                             aml::FieldEntry::Named(*b"CCMD", 8),
2049                         ],
2050                     ),
2051                     &aml::Field::new(
2052                         "PRST".into(),
2053                         aml::FieldAccessType::DWord,
2054                         aml::FieldLockRule::NoLock,
2055                         aml::FieldUpdateRule::Preserve,
2056                         vec![
2057                             aml::FieldEntry::Named(*b"CSEL", 32),
2058                             aml::FieldEntry::Reserved(32),
2059                             aml::FieldEntry::Named(*b"CDAT", 32),
2060                         ],
2061                     ),
2062                 ],
2063             )
2064             .to_aml_bytes(sink);
2065         }
2066 
2067         // CPU devices
2068         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2069         let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2070         // Bundle methods together under a common object
2071         let methods = CpuMethods {
2072             max_vcpus: self.config.max_vcpus,
2073             dynamic: self.dynamic,
2074         };
2075         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];
2076 
2077         let mut cpu_devices = Vec::new();
2078         for cpu_id in 0..self.config.max_vcpus {
2079             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2080             let cpu_device = Cpu {
2081                 cpu_id,
2082                 proximity_domain,
2083                 dynamic: self.dynamic,
2084             };
2085 
2086             cpu_devices.push(cpu_device);
2087         }
2088 
2089         for cpu_device in cpu_devices.iter() {
2090             cpu_data_inner.push(cpu_device);
2091         }
2092 
2093         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2094     }
2095 }
2096 
2097 impl Pausable for CpuManager {
2098     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2099         // Tell the vCPUs to pause themselves next time they exit
2100         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2101 
2102         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2103         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2104         // above.
2105         for state in self.vcpu_states.iter() {
2106             state.signal_thread();
2107         }
2108 
2109         for vcpu in self.vcpus.iter() {
2110             let mut vcpu = vcpu.lock().unwrap();
2111             vcpu.pause()?;
2112             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2113             if !self.config.kvm_hyperv {
2114                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2115                     MigratableError::Pause(anyhow!(
2116                         "Could not notify guest it has been paused {:?}",
2117                         e
2118                     ))
2119                 })?;
2120             }
2121         }
2122 
2123         // The vCPU thread will change its paused state before parking; wait here for each
2124         // active vCPU to change its state to ensure it has parked.
2125         for state in self.vcpu_states.iter() {
2126             if state.active() {
2127                 while !state.paused.load(Ordering::SeqCst) {
2128                     // To avoid a priority inversion with the vCPU thread
2129                     thread::sleep(std::time::Duration::from_millis(1));
2130                 }
2131             }
2132         }
2133 
2134         Ok(())
2135     }
2136 
2137     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2138         for vcpu in self.vcpus.iter() {
2139             vcpu.lock().unwrap().resume()?;
2140         }
2141 
2142         // Toggle the vCPUs pause boolean
2143         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2144 
2145         // Unpark all the vCPU threads.
2146         // Once unparked, the next thing they will do is check the pause
2147         // boolean. Since it will be set to false, they will exit their pause loop
2148         // and resume running the guest.
2149         for state in self.vcpu_states.iter() {
2150             state.paused.store(false, Ordering::SeqCst);
2151             state.unpark_thread();
2152         }
2153         Ok(())
2154     }
2155 }
2156 
2157 impl Snapshottable for CpuManager {
2158     fn id(&self) -> String {
2159         CPU_MANAGER_SNAPSHOT_ID.to_string()
2160     }
2161 
2162     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2163         let mut cpu_manager_snapshot = Snapshot::default();
2164 
2165         // The CpuManager snapshot is a collection of all the vCPU snapshots.
2166         for vcpu in &self.vcpus {
2167             let mut vcpu = vcpu.lock().unwrap();
2168             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2169         }
2170 
2171         Ok(cpu_manager_snapshot)
2172     }
2173 }
2174 
2175 impl Transportable for CpuManager {}
2176 impl Migratable for CpuManager {}
2177 
2178 #[cfg(feature = "guest_debug")]
2179 impl Debuggable for CpuManager {
2180     #[cfg(feature = "kvm")]
2181     fn set_guest_debug(
2182         &self,
2183         cpu_id: usize,
2184         addrs: &[GuestAddress],
2185         singlestep: bool,
2186     ) -> std::result::Result<(), DebuggableError> {
2187         self.vcpus[cpu_id]
2188             .lock()
2189             .unwrap()
2190             .vcpu
2191             .set_guest_debug(addrs, singlestep)
2192             .map_err(DebuggableError::SetDebug)
2193     }
2194 
2195     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2196         Ok(())
2197     }
2198 
2199     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2200         Ok(())
2201     }
2202 
2203     #[cfg(target_arch = "x86_64")]
2204     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2205         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2206         let gregs = self
2207             .get_regs(cpu_id as u8)
2208             .map_err(DebuggableError::ReadRegs)?;
2209         let regs = [
2210             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
2211             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
2212         ];
2213 
2214         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2215         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2216         let eflags = gregs.rflags as u32;
2217         let rip = gregs.rip;
2218 
2219         // Segment registers: CS, SS, DS, ES, FS, GS
2220         let sregs = self
2221             .get_sregs(cpu_id as u8)
2222             .map_err(DebuggableError::ReadRegs)?;
2223         let segments = X86SegmentRegs {
2224             cs: sregs.cs.selector as u32,
2225             ss: sregs.ss.selector as u32,
2226             ds: sregs.ds.selector as u32,
2227             es: sregs.es.selector as u32,
2228             fs: sregs.fs.selector as u32,
2229             gs: sregs.gs.selector as u32,
2230         };
2231 
2232         // TODO: Add other registers
2233 
2234         Ok(CoreRegs {
2235             regs,
2236             eflags,
2237             rip,
2238             segments,
2239             ..Default::default()
2240         })
2241     }
2242 
2243     #[cfg(target_arch = "aarch64")]
2244     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2245         let gregs = self
2246             .get_regs(cpu_id as u8)
2247             .map_err(DebuggableError::ReadRegs)?;
2248         Ok(CoreRegs {
2249             x: gregs.regs.regs,
2250             sp: gregs.regs.sp,
2251             pc: gregs.regs.pc,
2252             ..Default::default()
2253         })
2254     }
2255 
2256     #[cfg(target_arch = "x86_64")]
2257     fn write_regs(
2258         &self,
2259         cpu_id: usize,
2260         regs: &CoreRegs,
2261     ) -> std::result::Result<(), DebuggableError> {
2262         let orig_gregs = self
2263             .get_regs(cpu_id as u8)
2264             .map_err(DebuggableError::ReadRegs)?;
2265         let gregs = StandardRegisters {
2266             rax: regs.regs[0],
2267             rbx: regs.regs[1],
2268             rcx: regs.regs[2],
2269             rdx: regs.regs[3],
2270             rsi: regs.regs[4],
2271             rdi: regs.regs[5],
2272             rbp: regs.regs[6],
2273             rsp: regs.regs[7],
2274             r8: regs.regs[8],
2275             r9: regs.regs[9],
2276             r10: regs.regs[10],
2277             r11: regs.regs[11],
2278             r12: regs.regs[12],
2279             r13: regs.regs[13],
2280             r14: regs.regs[14],
2281             r15: regs.regs[15],
2282             rip: regs.rip,
2283             // Update the lower 32-bit of rflags.
2284             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2285         };
2286 
2287         self.set_regs(cpu_id as u8, &gregs)
2288             .map_err(DebuggableError::WriteRegs)?;
2289 
2290         // Segment registers: CS, SS, DS, ES, FS, GS
2291         // Since GDB only cares about the selectors, we call get_sregs() first.
2292         let mut sregs = self
2293             .get_sregs(cpu_id as u8)
2294             .map_err(DebuggableError::ReadRegs)?;
2295         sregs.cs.selector = regs.segments.cs as u16;
2296         sregs.ss.selector = regs.segments.ss as u16;
2297         sregs.ds.selector = regs.segments.ds as u16;
2298         sregs.es.selector = regs.segments.es as u16;
2299         sregs.fs.selector = regs.segments.fs as u16;
2300         sregs.gs.selector = regs.segments.gs as u16;
2301 
2302         self.set_sregs(cpu_id as u8, &sregs)
2303             .map_err(DebuggableError::WriteRegs)?;
2304 
2305         // TODO: Add other registers
2306 
2307         Ok(())
2308     }
2309 
2310     #[cfg(target_arch = "aarch64")]
2311     fn write_regs(
2312         &self,
2313         cpu_id: usize,
2314         regs: &CoreRegs,
2315     ) -> std::result::Result<(), DebuggableError> {
2316         let mut gregs = self
2317             .get_regs(cpu_id as u8)
2318             .map_err(DebuggableError::ReadRegs)?;
2319 
2320         gregs.regs.regs = regs.x;
2321         gregs.regs.sp = regs.sp;
2322         gregs.regs.pc = regs.pc;
2323 
2324         self.set_regs(cpu_id as u8, &gregs)
2325             .map_err(DebuggableError::WriteRegs)?;
2326 
2327         Ok(())
2328     }
2329 
2330     fn read_mem(
2331         &self,
2332         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2333         cpu_id: usize,
2334         vaddr: GuestAddress,
2335         len: usize,
2336     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2337         let mut buf = vec![0; len];
2338         let mut total_read = 0_u64;
2339 
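             // Translate and read at most one guest page per iteration: virtually
             // contiguous addresses are not necessarily physically contiguous, so each
             // page boundary needs its own GVA -> GPA translation.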
2340         while total_read < len as u64 {
2341             let gaddr = vaddr.0 + total_read;
2342             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2343                 Ok(paddr) => paddr,
2344                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2345                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2346             };
2347             let psize = arch::PAGE_SIZE as u64;
2348             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2349             guest_memory
2350                 .memory()
2351                 .read(
2352                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2353                     GuestAddress(paddr),
2354                 )
2355                 .map_err(DebuggableError::ReadMem)?;
2356             total_read += read_len;
2357         }
2358         Ok(buf)
2359     }
2360 
2361     fn write_mem(
2362         &self,
2363         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2364         cpu_id: usize,
2365         vaddr: &GuestAddress,
2366         data: &[u8],
2367     ) -> std::result::Result<(), DebuggableError> {
2368         let mut total_written = 0_u64;
2369 
2370         while total_written < data.len() as u64 {
2371             let gaddr = vaddr.0 + total_written;
2372             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2373                 Ok(paddr) => paddr,
2374                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2375                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2376             };
2377             let psize = arch::PAGE_SIZE as u64;
2378             let write_len = std::cmp::min(
2379                 data.len() as u64 - total_written,
2380                 psize - (paddr & (psize - 1)),
2381             );
2382             guest_memory
2383                 .memory()
2384                 .write(
2385                     &data[total_written as usize..total_written as usize + write_len as usize],
2386                     GuestAddress(paddr),
2387                 )
2388                 .map_err(DebuggableError::WriteMem)?;
2389             total_written += write_len;
2390         }
2391         Ok(())
2392     }
2393 
2394     fn active_vcpus(&self) -> usize {
2395         self.present_vcpus() as usize
2396     }
2397 }
2398 
2399 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2400 impl Elf64Writable for CpuManager {}
2401 
2402 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2403 impl CpuElf64Writable for CpuManager {
2404     fn cpu_write_elf64_note(
2405         &mut self,
2406         dump_state: &DumpState,
2407     ) -> std::result::Result<(), GuestDebuggableError> {
2408         let mut coredump_file = dump_state.file.as_ref().unwrap();
2409         for vcpu in &self.vcpus {
2410             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2411             let mut pos: usize = 0;
2412             let mut buf = vec![0; note_size as usize];
2413             let descsz = size_of::<X86_64ElfPrStatus>();
2414             let vcpu_id = vcpu.lock().unwrap().id;
2415 
2416             let note = Elf64_Nhdr {
2417                 n_namesz: COREDUMP_NAME_SIZE,
2418                 n_descsz: descsz as u32,
2419                 n_type: NT_PRSTATUS,
2420             };
2421 
2422             let bytes: &[u8] = note.as_slice();
2423             buf.splice(0.., bytes.to_vec());
2424             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2425             buf.resize(pos + 4, 0);
2426             buf.splice(pos.., "CORE".to_string().into_bytes());
2427 
2428             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2429             buf.resize(pos + 32 + 4, 0);
2430             let pid = vcpu_id as u64;
2431             let bytes: &[u8] = pid.as_slice();
2432             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2433 
2434             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2435 
2436             let orig_rax: u64 = 0;
2437             let gregs = self.vcpus[usize::from(vcpu_id)]
2438                 .lock()
2439                 .unwrap()
2440                 .vcpu
2441                 .get_regs()
2442                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2443 
2444             let regs1 = [
2445                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2446                 gregs.r10,
2447             ];
2448             let regs2 = [
2449                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2450             ];
2451 
2452             let sregs = self.vcpus[usize::from(vcpu_id)]
2453                 .lock()
2454                 .unwrap()
2455                 .vcpu
2456                 .get_sregs()
2457                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2458 
2459             debug!(
2460                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2461                 gregs.rip,
2462                 gregs.rsp,
2463                 sregs.gs.base,
2464                 sregs.cs.selector,
2465                 sregs.ss.selector,
2466                 sregs.ds.selector,
2467             );
2468 
2469             let regs = X86_64UserRegs {
2470                 regs1,
2471                 regs2,
2472                 rip: gregs.rip,
2473                 cs: sregs.cs.selector as u64,
2474                 eflags: gregs.rflags,
2475                 rsp: gregs.rsp,
2476                 ss: sregs.ss.selector as u64,
2477                 fs_base: sregs.fs.base,
2478                 gs_base: sregs.gs.base,
2479                 ds: sregs.ds.selector as u64,
2480                 es: sregs.es.selector as u64,
2481                 fs: sregs.fs.selector as u64,
2482                 gs: sregs.gs.selector as u64,
2483             };
2484 
2485             // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
2486             let bytes: &[u8] = regs.as_slice();
2487             buf.resize(note_size as usize, 0);
2488             buf.splice(pos.., bytes.to_vec());
2489             buf.resize(note_size as usize, 0);
2490 
2491             coredump_file
2492                 .write(&buf)
2493                 .map_err(GuestDebuggableError::CoredumpFile)?;
2494         }
2495 
2496         Ok(())
2497     }
2498 
2499     fn cpu_write_vmm_note(
2500         &mut self,
2501         dump_state: &DumpState,
2502     ) -> std::result::Result<(), GuestDebuggableError> {
2503         let mut coredump_file = dump_state.file.as_ref().unwrap();
2504         for vcpu in &self.vcpus {
2505             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2506             let mut pos: usize = 0;
2507             let mut buf = vec![0; note_size as usize];
2508             let descsz = size_of::<DumpCpusState>();
2509             let vcpu_id = vcpu.lock().unwrap().id;
2510 
2511             let note = Elf64_Nhdr {
2512                 n_namesz: COREDUMP_NAME_SIZE,
2513                 n_descsz: descsz as u32,
2514                 n_type: 0,
2515             };
2516 
2517             let bytes: &[u8] = note.as_slice();
2518             buf.splice(0.., bytes.to_vec());
2519             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2520 
2521             buf.resize(pos + 4, 0);
2522             buf.splice(pos.., "QEMU".to_string().into_bytes());
2523 
2524             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2525 
2526             let gregs = self.vcpus[usize::from(vcpu_id)]
2527                 .lock()
2528                 .unwrap()
2529                 .vcpu
2530                 .get_regs()
2531                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2532 
2533             let regs1 = [
2534                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2535                 gregs.rbp,
2536             ];
2537 
2538             let regs2 = [
2539                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2540                 gregs.r15,
2541             ];
2542 
2543             let sregs = self.vcpus[usize::from(vcpu_id)]
2544                 .lock()
2545                 .unwrap()
2546                 .vcpu
2547                 .get_sregs()
2548                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2549 
2550             let mut msrs = vec![MsrEntry {
2551                 index: msr_index::MSR_KERNEL_GS_BASE,
2552                 ..Default::default()
2553             }];
2554 
2555             self.vcpus[vcpu_id as usize]
2556                 .lock()
2557                 .unwrap()
2558                 .vcpu
2559                 .get_msrs(&mut msrs)
2560                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2561             let kernel_gs_base = msrs[0].data;
2562 
2563             let cs = CpuSegment::new(sregs.cs);
2564             let ds = CpuSegment::new(sregs.ds);
2565             let es = CpuSegment::new(sregs.es);
2566             let fs = CpuSegment::new(sregs.fs);
2567             let gs = CpuSegment::new(sregs.gs);
2568             let ss = CpuSegment::new(sregs.ss);
2569             let ldt = CpuSegment::new(sregs.ldt);
2570             let tr = CpuSegment::new(sregs.tr);
2571             let gdt = CpuSegment::new_from_table(sregs.gdt);
2572             let idt = CpuSegment::new_from_table(sregs.idt);
2573             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2574             let regs = DumpCpusState {
2575                 version: 1,
2576                 size: size_of::<DumpCpusState>() as u32,
2577                 regs1,
2578                 regs2,
2579                 rip: gregs.rip,
2580                 rflags: gregs.rflags,
2581                 cs,
2582                 ds,
2583                 es,
2584                 fs,
2585                 gs,
2586                 ss,
2587                 ldt,
2588                 tr,
2589                 gdt,
2590                 idt,
2591                 cr,
2592                 kernel_gs_base,
2593             };
2594 
2595             let bytes: &[u8] = regs.as_slice();
2596             buf.resize(note_size as usize, 0);
2597             buf.splice(pos.., bytes.to_vec());
2598             buf.resize(note_size as usize, 0);
2599 
2600             coredump_file
2601                 .write(&buf)
2602                 .map_err(GuestDebuggableError::CoredumpFile)?;
2603         }
2604 
2605         Ok(())
2606     }
2607 }
2608 
2609 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2610 #[cfg(test)]
2611 mod tests {
2612     use arch::x86_64::interrupts::*;
2613     use arch::x86_64::regs::*;
2614     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2615 
2616     #[test]
2617     fn test_setlint() {
2618         let hv = hypervisor::new().unwrap();
2619         let vm = hv.create_vm().expect("new VM fd creation failed");
2620         assert!(hv.check_required_extensions().is_ok());
2621         // Calling get_lapic will fail if there is no irqchip beforehand.
2622         assert!(vm.create_irq_chip().is_ok());
2623         let vcpu = vm.create_vcpu(0, None).unwrap();
2624         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2625 
2626         // Compute the value that is expected to represent LVT0 and LVT1.
2627         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2628         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2629         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2630         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2631 
2632         set_lint(&vcpu).unwrap();
2633 
2634         // Compute the value that represents LVT0 and LVT1 after set_lint.
2635         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2636         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2637         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2638         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2639         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2640     }
2641 
2642     #[test]
2643     fn test_setup_fpu() {
2644         let hv = hypervisor::new().unwrap();
2645         let vm = hv.create_vm().expect("new VM fd creation failed");
2646         let vcpu = vm.create_vcpu(0, None).unwrap();
2647         setup_fpu(&vcpu).unwrap();
2648 
2649         let expected_fpu: FpuState = FpuState {
2650             fcw: 0x37f,
2651             mxcsr: 0x1f80,
2652             ..Default::default()
2653         };
2654         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2655         // TODO: auto-generate kvm related structures with PartialEq on.
2656         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2657         // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
2658         // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
2659         // The mxcsr will stay 0 and the assert below would fail. Decide whether we
2660         // should remove it altogether.
2661         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2662     }
2663 
2664     #[test]
2665     fn test_setup_msrs() {
2666         use hypervisor::arch::x86::{msr_index, MsrEntry};
2667 
2668         let hv = hypervisor::new().unwrap();
2669         let vm = hv.create_vm().expect("new VM fd creation failed");
2670         let vcpu = vm.create_vcpu(0, None).unwrap();
2671         setup_msrs(&vcpu).unwrap();
2672 
2673         // This test will check against the last MSR entry configured (the tenth one).
2674         // See create_msr_entries for details.
2675         let mut msrs = vec![MsrEntry {
2676             index: msr_index::MSR_IA32_MISC_ENABLE,
2677             ..Default::default()
2678         }];
2679 
2680         // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read 1
2681         // in this test case scenario.
2682         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2683         assert_eq!(read_msrs, 1);
2684 
2685         // Official entries that were set up when we did setup_msrs. We need to assert that the
2686         // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
2687         // expect.
2688         let entry_vec = vcpu.boot_msr_entries();
2689         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2690     }
2691 
2692     #[test]
2693     fn test_setup_regs() {
2694         let hv = hypervisor::new().unwrap();
2695         let vm = hv.create_vm().expect("new VM fd creation failed");
2696         let vcpu = vm.create_vcpu(0, None).unwrap();
2697 
2698         let expected_regs: StandardRegisters = StandardRegisters {
2699             rflags: 0x0000000000000002u64,
2700             rbx: arch::layout::PVH_INFO_START.0,
2701             rip: 1,
2702             ..Default::default()
2703         };
2704 
2705         setup_regs(&vcpu, expected_regs.rip).unwrap();
2706 
2707         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2708         assert_eq!(actual_regs, expected_regs);
2709     }
2710 }
2711 
2712 #[cfg(target_arch = "aarch64")]
2713 #[cfg(test)]
2714 mod tests {
2715     use arch::{aarch64::regs, layout};
2716     use hypervisor::kvm::aarch64::is_system_register;
2717     use hypervisor::kvm::kvm_bindings::{
2718         kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2719         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2720     };
2721     use hypervisor::{arm64_core_reg_id, offset_of};
2722     use std::mem;
2723 
2724     #[test]
2725     fn test_setup_regs() {
2726         let hv = hypervisor::new().unwrap();
2727         let vm = hv.create_vm().unwrap();
2728         let vcpu = vm.create_vcpu(0, None).unwrap();
2729 
2730         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2731         // Must fail when vcpu is not initialized yet.
2732         assert!(res.is_err());
2733 
2734         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2735         vm.get_preferred_target(&mut kvi).unwrap();
2736         vcpu.vcpu_init(&kvi).unwrap();
2737 
2738         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2739     }
2740 
2741     #[test]
2742     fn test_read_mpidr() {
2743         let hv = hypervisor::new().unwrap();
2744         let vm = hv.create_vm().unwrap();
2745         let vcpu = vm.create_vcpu(0, None).unwrap();
2746         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2747         vm.get_preferred_target(&mut kvi).unwrap();
2748 
2749         // Must fail when vcpu is not initialized yet.
2750         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2751 
2752         vcpu.vcpu_init(&kvi).unwrap();
2753         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2754     }
2755 
2756     #[test]
2757     fn test_is_system_register() {
2758         let offset = offset_of!(user_pt_regs, pc);
2759         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2760         assert!(!is_system_register(regid));
2761         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2762         assert!(is_system_register(regid));
2763     }
2764 
2765     #[test]
2766     fn test_save_restore_core_regs() {
2767         let hv = hypervisor::new().unwrap();
2768         let vm = hv.create_vm().unwrap();
2769         let vcpu = vm.create_vcpu(0, None).unwrap();
2770         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2771         vm.get_preferred_target(&mut kvi).unwrap();
2772 
2773         // Must fail when vcpu is not initialized yet.
2774         let res = vcpu.get_regs();
2775         assert!(res.is_err());
2776         assert_eq!(
2777             format!("{}", res.unwrap_err()),
2778             "Failed to get core register: Exec format error (os error 8)"
2779         );
2780 
2781         let mut state = kvm_regs::default();
2782         let res = vcpu.set_regs(&state);
2783         assert!(res.is_err());
2784         assert_eq!(
2785             format!("{}", res.unwrap_err()),
2786             "Failed to set core register: Exec format error (os error 8)"
2787         );
2788 
2789         vcpu.vcpu_init(&kvi).unwrap();
2790         let res = vcpu.get_regs();
2791         assert!(res.is_ok());
2792         state = res.unwrap();
2793         assert_eq!(state.regs.pstate, 0x3C5);
2794 
2795         assert!(vcpu.set_regs(&state).is_ok());
2796     }
2797 
2798     #[test]
2799     fn test_get_set_mpstate() {
2800         let hv = hypervisor::new().unwrap();
2801         let vm = hv.create_vm().unwrap();
2802         let vcpu = vm.create_vcpu(0, None).unwrap();
2803         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2804         vm.get_preferred_target(&mut kvi).unwrap();
2805 
2806         let res = vcpu.get_mp_state();
2807         assert!(res.is_ok());
2808         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2809     }
2810 }
2811