xref: /cloud-hypervisor/vmm/src/cpu.rs (revision 2571e59438597f53aa4993cd70d6462fe1364ba7)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::CpusConfig;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use crate::coredump::{
17     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
18     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
19     NT_PRSTATUS,
20 };
21 #[cfg(feature = "guest_debug")]
22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
23 #[cfg(target_arch = "x86_64")]
24 use crate::memory_manager::MemoryManager;
25 use crate::seccomp_filters::{get_seccomp_filter, Thread};
26 #[cfg(target_arch = "x86_64")]
27 use crate::vm::physical_bits;
28 use crate::GuestMemoryMmap;
29 use crate::CPU_MANAGER_SNAPSHOT_ID;
30 use acpi_tables::{aml, sdt::Sdt, Aml};
31 use anyhow::anyhow;
32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
33 use arch::aarch64::regs;
34 use arch::EntryPoint;
35 use arch::NumaNodes;
36 #[cfg(target_arch = "aarch64")]
37 use devices::gic::Gic;
38 use devices::interrupt_controller::InterruptController;
39 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
40 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
41 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
42 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
43 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
44 use hypervisor::aarch64::StandardRegisters;
45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
46 use hypervisor::arch::x86::msr_index;
47 #[cfg(target_arch = "x86_64")]
48 use hypervisor::arch::x86::CpuIdEntry;
49 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
50 use hypervisor::arch::x86::MsrEntry;
51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
52 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
53 #[cfg(target_arch = "aarch64")]
54 use hypervisor::kvm::kvm_bindings;
55 #[cfg(all(target_arch = "aarch64", feature = "kvm"))]
56 use hypervisor::kvm::kvm_ioctls::Cap;
57 #[cfg(feature = "tdx")]
58 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
59 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
60 use libc::{c_void, siginfo_t};
61 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
62 use linux_loader::elf::Elf64_Nhdr;
63 use seccompiler::{apply_filter, SeccompAction};
64 use std::collections::BTreeMap;
65 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
66 use std::io::Write;
67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
68 use std::mem::size_of;
69 use std::os::unix::thread::JoinHandleExt;
70 use std::sync::atomic::{AtomicBool, Ordering};
71 use std::sync::{Arc, Barrier, Mutex};
72 use std::{cmp, io, result, thread};
73 use thiserror::Error;
74 use tracer::trace_scoped;
75 use vm_device::BusDevice;
76 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
77 use vm_memory::ByteValued;
78 #[cfg(feature = "guest_debug")]
79 use vm_memory::{Bytes, GuestAddressSpace};
80 use vm_memory::{GuestAddress, GuestMemoryAtomic};
81 use vm_migration::{
82     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
83     Transportable,
84 };
85 use vmm_sys_util::eventfd::EventFd;
86 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
87 use zerocopy::AsBytes;
88 
89 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
90 /// Extract the specified bits of a 64-bit integer.
91 /// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
92 /// the following expression should return 3 (`0b11`):
93 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
94 ///
95 macro_rules! extract_bits_64 {
96     ($value: tt, $offset: tt, $length: tt) => {
97         ($value >> $offset) & (!0u64 >> (64 - $length))
98     };
99 }
100 
101 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
102 macro_rules! extract_bits_64_without_offset {
103     ($value: tt, $length: tt) => {
104         $value & (!0u64 >> (64 - $length))
105     };
106 }
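// A quick sanity check of the two helpers above (illustrative values only,
// not taken from any runtime path):
//
//   extract_bits_64!(0b0000_0110u64, 1, 2)        // == 0b11 (3)
//   extract_bits_64_without_offset!(0b0110u64, 2) // == 0b10 (2)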
107 
108 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
109 
110 #[derive(Debug, Error)]
111 pub enum Error {
112     #[error("Error creating vCPU: {0}")]
113     VcpuCreate(#[source] anyhow::Error),
114 
115     #[error("Error running bCPU: {0}")]
116     VcpuRun(#[source] anyhow::Error),
117 
118     #[error("Error spawning vCPU thread: {0}")]
119     VcpuSpawn(#[source] io::Error),
120 
121     #[error("Error generating common CPUID: {0}")]
122     CommonCpuId(#[source] arch::Error),
123 
124     #[error("Error configuring vCPU: {0}")]
125     VcpuConfiguration(#[source] arch::Error),
126 
127     #[cfg(target_arch = "aarch64")]
128     #[error("Error fetching preferred target: {0}")]
129     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
130 
131     #[cfg(target_arch = "aarch64")]
132     #[error("Error initialising vCPU: {0}")]
133     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
134 
135     #[error("Failed to join on vCPU threads: {0:?}")]
136     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
137 
138     #[error("Error adding CpuManager to MMIO bus: {0}")]
139     BusError(#[source] vm_device::BusError),
140 
141     #[error("Requested vCPUs exceed maximum")]
142     DesiredVCpuCountExceedsMax,
143 
144     #[error("Cannot create seccomp filter: {0}")]
145     CreateSeccompFilter(#[source] seccompiler::Error),
146 
147     #[error("Cannot apply seccomp filter: {0}")]
148     ApplySeccompFilter(#[source] seccompiler::Error),
149 
150     #[error("Error starting vCPU after restore: {0}")]
151     StartRestoreVcpu(#[source] anyhow::Error),
152 
153     #[error("Unexpected VmExit")]
154     UnexpectedVmExit,
155 
156     #[error("Failed to allocate MMIO address for CpuManager")]
157     AllocateMmmioAddress,
158 
159     #[cfg(feature = "tdx")]
160     #[error("Error initializing TDX: {0}")]
161     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
162 
163     #[cfg(target_arch = "aarch64")]
164     #[error("Error initializing PMU: {0}")]
165     InitPmu(#[source] hypervisor::HypervisorCpuError),
166 
167     #[cfg(feature = "guest_debug")]
168     #[error("Error during CPU debug: {0}")]
169     CpuDebug(#[source] hypervisor::HypervisorCpuError),
170 
171     #[cfg(feature = "guest_debug")]
172     #[error("Error translating virtual address: {0}")]
173     TranslateVirtualAddress(#[source] anyhow::Error),
174 
175     #[cfg(target_arch = "x86_64")]
176     #[error("Error setting up AMX: {0}")]
177     AmxEnable(#[source] anyhow::Error),
178 
179     #[error("Maximum number of vCPUs exceeds host limit")]
180     MaximumVcpusExceeded,
181 }
182 pub type Result<T> = result::Result<T, Error>;
183 
184 #[cfg(target_arch = "x86_64")]
185 #[allow(dead_code)]
186 #[repr(packed)]
187 #[derive(AsBytes)]
188 struct LocalX2Apic {
189     pub r#type: u8,
190     pub length: u8,
191     pub _reserved: u16,
192     pub apic_id: u32,
193     pub flags: u32,
194     pub processor_id: u32,
195 }
196 
197 #[allow(dead_code)]
198 #[repr(packed)]
199 #[derive(Default, AsBytes)]
200 struct Ioapic {
201     pub r#type: u8,
202     pub length: u8,
203     pub ioapic_id: u8,
204     _reserved: u8,
205     pub apic_address: u32,
206     pub gsi_base: u32,
207 }
208 
209 #[cfg(target_arch = "aarch64")]
210 #[allow(dead_code)]
211 #[repr(packed)]
212 #[derive(AsBytes)]
213 struct GicC {
214     pub r#type: u8,
215     pub length: u8,
216     pub reserved0: u16,
217     pub cpu_interface_number: u32,
218     pub uid: u32,
219     pub flags: u32,
220     pub parking_version: u32,
221     pub performance_interrupt: u32,
222     pub parked_address: u64,
223     pub base_address: u64,
224     pub gicv_base_address: u64,
225     pub gich_base_address: u64,
226     pub vgic_interrupt: u32,
227     pub gicr_base_address: u64,
228     pub mpidr: u64,
229     pub proc_power_effi_class: u8,
230     pub reserved1: u8,
231     pub spe_overflow_interrupt: u16,
232 }
233 
234 #[cfg(target_arch = "aarch64")]
235 #[allow(dead_code)]
236 #[repr(packed)]
237 #[derive(AsBytes)]
238 struct GicD {
239     pub r#type: u8,
240     pub length: u8,
241     pub reserved0: u16,
242     pub gic_id: u32,
243     pub base_address: u64,
244     pub global_irq_base: u32,
245     pub version: u8,
246     pub reserved1: [u8; 3],
247 }
248 
249 #[cfg(target_arch = "aarch64")]
250 #[allow(dead_code)]
251 #[repr(packed)]
252 #[derive(AsBytes)]
253 struct GicR {
254     pub r#type: u8,
255     pub length: u8,
256     pub reserved: u16,
257     pub base_address: u64,
258     pub range_length: u32,
259 }
260 
261 #[cfg(target_arch = "aarch64")]
262 #[allow(dead_code)]
263 #[repr(packed)]
264 #[derive(AsBytes)]
265 struct GicIts {
266     pub r#type: u8,
267     pub length: u8,
268     pub reserved0: u16,
269     pub translation_id: u32,
270     pub base_address: u64,
271     pub reserved1: u32,
272 }
273 
274 #[cfg(target_arch = "aarch64")]
275 #[allow(dead_code)]
276 #[repr(packed)]
277 #[derive(AsBytes)]
278 struct ProcessorHierarchyNode {
279     pub r#type: u8,
280     pub length: u8,
281     pub reserved: u16,
282     pub flags: u32,
283     pub parent: u32,
284     pub acpi_processor_id: u32,
285     pub num_private_resources: u32,
286 }
287 
288 #[allow(dead_code)]
289 #[repr(packed)]
290 #[derive(Default, AsBytes)]
291 struct InterruptSourceOverride {
292     pub r#type: u8,
293     pub length: u8,
294     pub bus: u8,
295     pub source: u8,
296     pub gsi: u32,
297     pub flags: u16,
298 }
299 
300 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
301 macro_rules! round_up {
302     ($n:expr,$d:expr) => {
303         (($n + $d - 1) / $d) * $d
304     };
305 }
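// For example, round_up!(13, 4) evaluates to ((13 + 4 - 1) / 4) * 4 == 16,
// i.e. the smallest multiple of 4 that is >= 13.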
306 
307 /// A wrapper around creating and using a hypervisor-agnostic vCPU.
308 pub struct Vcpu {
309     // The hypervisor abstracted CPU.
310     vcpu: Arc<dyn hypervisor::Vcpu>,
311     id: u8,
312     #[cfg(target_arch = "aarch64")]
313     mpidr: u64,
314     saved_state: Option<CpuState>,
315 }
316 
317 impl Vcpu {
318     /// Constructs a new VCPU for `vm`.
319     ///
320     /// # Arguments
321     ///
322     /// * `id` - Represents the CPU number between [0, max vcpus).
323     /// * `vm` - The virtual machine this vcpu will get attached to.
324     /// * `vm_ops` - Optional object for exit handling.
325     pub fn new(
326         id: u8,
327         vm: &Arc<dyn hypervisor::Vm>,
328         vm_ops: Option<Arc<dyn VmOps>>,
329     ) -> Result<Self> {
330         let vcpu = vm
331             .create_vcpu(id, vm_ops)
332             .map_err(|e| Error::VcpuCreate(e.into()))?;
333         // Initially the cpuid per vCPU is the one supported by this VM.
334         Ok(Vcpu {
335             vcpu,
336             id,
337             #[cfg(target_arch = "aarch64")]
338             mpidr: 0,
339             saved_state: None,
340         })
341     }
342 
343     /// Configures a vCPU; this should be called once per vCPU upon creation.
344     ///
345     /// # Arguments
346     ///
347     /// * `boot_setup` - Optional kernel entry point (with boot protocol) and guest memory.
348     /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
349     /// * `kvm_hyperv` - (x86_64) Whether KVM's Hyper-V emulation is enabled.
350     pub fn configure(
351         &mut self,
352         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
353         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
354         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
355         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
356     ) -> Result<()> {
357         #[cfg(target_arch = "aarch64")]
358         {
359             self.init(vm)?;
360             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
361                 .map_err(Error::VcpuConfiguration)?;
362         }
363         info!("Configuring vCPU: cpu_id = {}", self.id);
364         #[cfg(target_arch = "x86_64")]
365         arch::configure_vcpu(&self.vcpu, self.id, boot_setup, cpuid, kvm_hyperv)
366             .map_err(Error::VcpuConfiguration)?;
367 
368         Ok(())
369     }
370 
371     /// Gets the MPIDR register value.
372     #[cfg(target_arch = "aarch64")]
373     pub fn get_mpidr(&self) -> u64 {
374         self.mpidr
375     }
376 
377     /// Gets the saved vCPU state.
378     #[cfg(target_arch = "aarch64")]
379     pub fn get_saved_state(&self) -> Option<CpuState> {
380         self.saved_state.clone()
381     }
382 
383     /// Initializes an aarch64 specific vcpu for booting Linux.
384     #[cfg(target_arch = "aarch64")]
385     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
386         let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
387 
388         // This reads back the kernel's preferred target type.
389         vm.get_preferred_target(&mut kvi)
390             .map_err(Error::VcpuArmPreferredTarget)?;
391         // We already checked that the capability is supported.
392         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
393         if vm
394             .as_any()
395             .downcast_ref::<hypervisor::kvm::KvmVm>()
396             .unwrap()
397             .check_extension(Cap::ArmPmuV3)
398         {
399             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
400         }
401         // Non-boot cpus are powered off initially.
402         if self.id > 0 {
403             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
404         }
405         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
406     }
407 
408     /// Runs the VCPU until it exits, returning the reason.
409     ///
410     /// Note that the state of the VCPU and associated VM must be setup first for this to do
411     /// anything useful.
412     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
413         self.vcpu.run()
414     }
415 }
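// A minimal sketch of the intended lifecycle of a `Vcpu` on x86_64
// (illustrative only; `vm`, `vm_ops`, `entry_point`, `guest_memory` and
// `cpuid` are assumed bindings, and the real call sites live in `CpuManager`
// below):
//
//   let mut vcpu = Vcpu::new(0, &vm, Some(vm_ops))?;
//   vcpu.configure(Some((entry_point, &guest_memory)), cpuid, false)?;
//   let exit = vcpu.run()?;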
416 
417 impl Pausable for Vcpu {}
418 impl Snapshottable for Vcpu {
419     fn id(&self) -> String {
420         self.id.to_string()
421     }
422 
423     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
424         let saved_state = self
425             .vcpu
426             .state()
427             .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?;
428 
429         self.saved_state = Some(saved_state.clone());
430 
431         Ok(Snapshot::from_data(SnapshotData::new_from_state(
432             &saved_state,
433         )?))
434     }
435 }
436 
437 pub struct CpuManager {
438     hypervisor_type: HypervisorType,
439     config: CpusConfig,
440     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
441     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
442     #[cfg(target_arch = "x86_64")]
443     cpuid: Vec<CpuIdEntry>,
444     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
445     vm: Arc<dyn hypervisor::Vm>,
446     vcpus_kill_signalled: Arc<AtomicBool>,
447     vcpus_pause_signalled: Arc<AtomicBool>,
448     exit_evt: EventFd,
449     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
450     reset_evt: EventFd,
451     #[cfg(feature = "guest_debug")]
452     vm_debug_evt: EventFd,
453     vcpu_states: Vec<VcpuState>,
454     selected_cpu: u8,
455     vcpus: Vec<Arc<Mutex<Vcpu>>>,
456     seccomp_action: SeccompAction,
457     vm_ops: Arc<dyn VmOps>,
458     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
459     acpi_address: Option<GuestAddress>,
460     proximity_domain_per_cpu: BTreeMap<u8, u32>,
461     affinity: BTreeMap<u8, Vec<u8>>,
462     dynamic: bool,
463 }
464 
465 const CPU_ENABLE_FLAG: usize = 0;
466 const CPU_INSERTING_FLAG: usize = 1;
467 const CPU_REMOVING_FLAG: usize = 2;
468 const CPU_EJECT_FLAG: usize = 3;
469 
470 const CPU_STATUS_OFFSET: u64 = 4;
471 const CPU_SELECTION_OFFSET: u64 = 0;
472 
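// Illustrative guest-side access pattern for this register block (a
// hypothetical sequence; the actual driver is the ACPI AML emitted elsewhere
// in this file):
//
//   1. write the vCPU index to CPU_SELECTION_OFFSET (offset 0);
//   2. read CPU_STATUS_OFFSET (offset 4) and test the enable/inserting/removing bits;
//   3. write (1 << CPU_INSERTING_FLAG) or (1 << CPU_REMOVING_FLAG) back to
//      acknowledge, or (1 << CPU_EJECT_FLAG) to trigger vCPU removal.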
473 impl BusDevice for CpuManager {
474     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
475         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
476         data.fill(0);
477 
478         match offset {
479             CPU_SELECTION_OFFSET => {
480                 data[0] = self.selected_cpu;
481             }
482             CPU_STATUS_OFFSET => {
483                 if self.selected_cpu < self.max_vcpus() {
484                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
485                     if state.active() {
486                         data[0] |= 1 << CPU_ENABLE_FLAG;
487                     }
488                     if state.inserting {
489                         data[0] |= 1 << CPU_INSERTING_FLAG;
490                     }
491                     if state.removing {
492                         data[0] |= 1 << CPU_REMOVING_FLAG;
493                     }
494                 } else {
495                     warn!("Out of range vCPU id: {}", self.selected_cpu);
496                 }
497             }
498             _ => {
499                 warn!(
500                     "Unexpected offset for accessing CPU manager device: {:#}",
501                     offset
502                 );
503             }
504         }
505     }
506 
507     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
508         match offset {
509             CPU_SELECTION_OFFSET => {
510                 self.selected_cpu = data[0];
511             }
512             CPU_STATUS_OFFSET => {
513                 if self.selected_cpu < self.max_vcpus() {
514                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
515                     // The ACPI code writes back a 1 to acknowledge the insertion
516                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
517                         && state.inserting
518                     {
519                         state.inserting = false;
520                     }
521                     // Ditto for removal
522                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
523                         && state.removing
524                     {
525                         state.removing = false;
526                     }
527                     // Trigger removal of vCPU
528                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
529                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
530                             error!("Error removing vCPU: {:?}", e);
531                         }
532                     }
533                 } else {
534                     warn!("Out of range vCPU id: {}", self.selected_cpu);
535                 }
536             }
537             _ => {
538                 warn!(
539                     "Unexpected offset for accessing CPU manager device: {:#}",
540                     offset
541                 );
542             }
543         }
544         None
545     }
546 }
547 
548 #[derive(Default)]
549 struct VcpuState {
550     inserting: bool,
551     removing: bool,
552     handle: Option<thread::JoinHandle<()>>,
553     kill: Arc<AtomicBool>,
554     vcpu_run_interrupted: Arc<AtomicBool>,
555     paused: Arc<AtomicBool>,
556 }
557 
558 impl VcpuState {
559     fn active(&self) -> bool {
560         self.handle.is_some()
561     }
562 
563     fn signal_thread(&self) {
564         if let Some(handle) = self.handle.as_ref() {
565             loop {
566                 // SAFETY: FFI call with correct arguments
567                 unsafe {
568                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
569                 }
570                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
571                     break;
572                 } else {
573                     // This is more effective than thread::yield_now() at
574                     // avoiding a priority inversion with the vCPU thread
575                     thread::sleep(std::time::Duration::from_millis(1));
576                 }
577             }
578         }
579     }
580 
581     fn join_thread(&mut self) -> Result<()> {
582         if let Some(handle) = self.handle.take() {
583             handle.join().map_err(Error::ThreadCleanup)?
584         }
585 
586         Ok(())
587     }
588 
589     fn unpark_thread(&self) {
590         if let Some(handle) = self.handle.as_ref() {
591             handle.thread().unpark()
592         }
593     }
594 }
595 
596 impl CpuManager {
597     #[allow(unused_variables)]
598     #[allow(clippy::too_many_arguments)]
599     pub fn new(
600         config: &CpusConfig,
601         vm: Arc<dyn hypervisor::Vm>,
602         exit_evt: EventFd,
603         reset_evt: EventFd,
604         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
605         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
606         seccomp_action: SeccompAction,
607         vm_ops: Arc<dyn VmOps>,
608         #[cfg(feature = "tdx")] tdx_enabled: bool,
609         numa_nodes: &NumaNodes,
610     ) -> Result<Arc<Mutex<CpuManager>>> {
611         if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
612             return Err(Error::MaximumVcpusExceeded);
613         }
614 
615         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
616         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
617         let hypervisor_type = hypervisor.hypervisor_type();
618 
619         #[cfg(target_arch = "x86_64")]
620         if config.features.amx {
621             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
622             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
623             const XFEATURE_XTILEDATA: usize = 18;
624             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
625 
626             // SAFETY: the syscall is only modifying kernel internal
627             // data structures that the kernel is itself expected to safeguard.
628             let amx_tile = unsafe {
629                 libc::syscall(
630                     libc::SYS_arch_prctl,
631                     ARCH_REQ_XCOMP_GUEST_PERM,
632                     XFEATURE_XTILEDATA,
633                 )
634             };
635 
636             if amx_tile != 0 {
637                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
638             } else {
639                 let mask: usize = 0;
640                 // SAFETY: the mask being modified isn't in use elsewhere (it is not
641                 // marked mutable because it is only written inside this unsafe block, which is permitted).
642                 let result = unsafe {
643                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
644                 };
645                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
646                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
647                 }
648             }
649         }
650 
651         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
652             let mut cpu_list = Vec::new();
653             for (proximity_domain, numa_node) in numa_nodes.iter() {
654                 for cpu in numa_node.cpus.iter() {
655                     cpu_list.push((*cpu, *proximity_domain))
656                 }
657             }
658             cpu_list
659         }
660         .into_iter()
661         .collect();
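// For example (hypothetical config): two NUMA nodes where node 0 holds
// vCPUs 0-1 and node 1 holds vCPUs 2-3 produce the map
// {0: 0, 1: 0, 2: 1, 3: 1}.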
662 
663         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
664             cpu_affinity
665                 .iter()
666                 .map(|a| (a.vcpu, a.host_cpus.clone()))
667                 .collect()
668         } else {
669             BTreeMap::new()
670         };
671 
672         #[cfg(feature = "tdx")]
673         let dynamic = !tdx_enabled;
674         #[cfg(not(feature = "tdx"))]
675         let dynamic = true;
676 
677         Ok(Arc::new(Mutex::new(CpuManager {
678             hypervisor_type,
679             config: config.clone(),
680             interrupt_controller: None,
681             #[cfg(target_arch = "x86_64")]
682             cpuid: Vec::new(),
683             vm,
684             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
685             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
686             vcpu_states,
687             exit_evt,
688             reset_evt,
689             #[cfg(feature = "guest_debug")]
690             vm_debug_evt,
691             selected_cpu: 0,
692             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
693             seccomp_action,
694             vm_ops,
695             acpi_address: None,
696             proximity_domain_per_cpu,
697             affinity,
698             dynamic,
699         })))
700     }
701 
702     #[cfg(target_arch = "x86_64")]
703     pub fn populate_cpuid(
704         &mut self,
705         memory_manager: &Arc<Mutex<MemoryManager>>,
706         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
707         #[cfg(feature = "tdx")] tdx_enabled: bool,
708     ) -> Result<()> {
709         let sgx_epc_sections = memory_manager
710             .lock()
711             .unwrap()
712             .sgx_epc_region()
713             .as_ref()
714             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
715 
716         let topology = self.config.topology.clone().map_or_else(
717             || {
718                 #[cfg(feature = "mshv")]
719                 if matches!(hypervisor.hypervisor_type(), HypervisorType::Mshv) {
720                     return Some((1, self.boot_vcpus(), 1));
721                 }
722                 None
723             },
724             |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
725         );
726 
727         self.cpuid = {
728             let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
729             arch::generate_common_cpuid(
730                 hypervisor,
731                 topology,
732                 sgx_epc_sections,
733                 phys_bits,
734                 self.config.kvm_hyperv,
735                 #[cfg(feature = "tdx")]
736                 tdx_enabled,
737             )
738             .map_err(Error::CommonCpuId)?
739         };
740 
741         Ok(())
742     }
743 
744     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
745         info!("Creating vCPU: cpu_id = {}", cpu_id);
746 
747         let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?;
748 
749         if let Some(snapshot) = snapshot {
750             // AArch64 vCPUs should be initialized after being created.
751             #[cfg(target_arch = "aarch64")]
752             vcpu.init(&self.vm)?;
753 
754             let state: CpuState = snapshot.to_state().map_err(|e| {
755                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
756             })?;
757             vcpu.vcpu
758                 .set_state(&state)
759                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
760 
761             vcpu.saved_state = Some(state);
762         }
763 
764         let vcpu = Arc::new(Mutex::new(vcpu));
765 
766         // Adding vCPU to the CpuManager's vCPU list.
767         self.vcpus.push(vcpu.clone());
768 
769         Ok(vcpu)
770     }
771 
772     pub fn configure_vcpu(
773         &self,
774         vcpu: Arc<Mutex<Vcpu>>,
775         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
776     ) -> Result<()> {
777         let mut vcpu = vcpu.lock().unwrap();
778 
779         #[cfg(target_arch = "x86_64")]
780         assert!(!self.cpuid.is_empty());
781 
782         #[cfg(target_arch = "x86_64")]
783         vcpu.configure(boot_setup, self.cpuid.clone(), self.config.kvm_hyperv)?;
784 
785         #[cfg(target_arch = "aarch64")]
786         vcpu.configure(&self.vm, boot_setup)?;
787 
788         Ok(())
789     }
790 
791     /// Only create new vCPUs if there aren't any inactive ones to reuse
792     fn create_vcpus(
793         &mut self,
794         desired_vcpus: u8,
795         snapshot: Option<Snapshot>,
796     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
797         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
798         info!(
799             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
800             desired_vcpus,
801             self.config.max_vcpus,
802             self.vcpus.len(),
803             self.present_vcpus()
804         );
805 
806         if desired_vcpus > self.config.max_vcpus {
807             return Err(Error::DesiredVCpuCountExceedsMax);
808         }
809 
810         // Only create vCPUs in excess of all the allocated vCPUs.
811         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
812             vcpus.push(self.create_vcpu(
813                 cpu_id,
814                 // TODO: The special format of the CPU id can be removed once
815                 // ready to break live upgrade.
816                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
817             )?);
818         }
819 
820         Ok(vcpus)
821     }
822 
823     #[cfg(target_arch = "aarch64")]
824     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
825         for cpu in self.vcpus.iter() {
826             let cpu = cpu.lock().unwrap();
827             // Check if the PMU attribute is available; if not, log it and return early.
828             if cpu.vcpu.has_pmu_support() {
829                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
830             } else {
831                 debug!(
832                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
833                     cpu.id
834                 );
835                 return Ok(false);
836             }
837         }
838 
839         Ok(true)
840     }
841 
842     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
843         self.vcpus.clone()
844     }
845 
846     fn start_vcpu(
847         &mut self,
848         vcpu: Arc<Mutex<Vcpu>>,
849         vcpu_id: u8,
850         vcpu_thread_barrier: Arc<Barrier>,
851         inserting: bool,
852     ) -> Result<()> {
853         let reset_evt = self.reset_evt.try_clone().unwrap();
854         let exit_evt = self.exit_evt.try_clone().unwrap();
855         #[cfg(feature = "kvm")]
856         let hypervisor_type = self.hypervisor_type;
857         #[cfg(feature = "guest_debug")]
858         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
859         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
860         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
861         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
862 
863         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
864         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
865             .vcpu_run_interrupted
866             .clone();
867         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
868         let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();
869 
870         // Prepare the CPU set the current vCPU is expected to run on.
871         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
872             // SAFETY: all zeros is a valid pattern
873             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
874             // SAFETY: FFI call, trivially safe
875             unsafe { libc::CPU_ZERO(&mut cpuset) };
876             for host_cpu in host_cpus {
877                 // SAFETY: FFI call, trivially safe
878                 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) };
879             }
880             cpuset
881         });
882 
883         // Retrieve seccomp filter for vcpu thread
884         let vcpu_seccomp_filter =
885             get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type)
886                 .map_err(Error::CreateSeccompFilter)?;
887 
888         #[cfg(target_arch = "x86_64")]
889         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
890 
891         info!("Starting vCPU: cpu_id = {}", vcpu_id);
892 
893         let handle = Some(
894             thread::Builder::new()
895                 .name(format!("vcpu{vcpu_id}"))
896                 .spawn(move || {
897                     // Schedule the thread to run on the expected CPU set
898                     if let Some(cpuset) = cpuset.as_ref() {
899                         // SAFETY: FFI call with correct arguments
900                         let ret = unsafe {
901                             libc::sched_setaffinity(
902                                 0,
903                                 std::mem::size_of::<libc::cpu_set_t>(),
904                                 cpuset as *const libc::cpu_set_t,
905                             )
906                         };
907 
908                         if ret != 0 {
909                             error!(
910                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
911                                 vcpu_id,
912                                 io::Error::last_os_error()
913                             );
914                             return;
915                         }
916                     }
917 
918                     // Apply seccomp filter for vcpu thread.
919                     if !vcpu_seccomp_filter.is_empty() {
920                         if let Err(e) =
921                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
922                         {
923                             error!("Error applying seccomp filter: {:?}", e);
924                             return;
925                         }
926                     }
927                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
928                     // This registers an async-signal-safe no-op handler so signals can interrupt the vCPU thread.
929                     register_signal_handler(SIGRTMIN(), handle_signal)
930                         .expect("Failed to register vcpu signal handler");
931                     // Block until all CPUs are ready.
932                     vcpu_thread_barrier.wait();
933 
934                     std::panic::catch_unwind(move || {
935                         loop {
936                             // If we are being told to pause, we park the thread
937                             // until the pause boolean is toggled.
938                             // The resume operation is responsible for toggling
939                             // the boolean and unparking the thread.
940                             // We enter a loop because park() could spuriously
941                             // return. We will then park() again unless the
942                             // pause boolean has been toggled.
943 
944                             // Need to use Ordering::SeqCst as we have multiple
945                             // loads and stores to different atomics and we need
946                             // to see them in a consistent order in all threads
947 
948                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
949                                 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
950                                 // completed by returning to KVM_RUN. From the kernel docs:
951                                 //
952                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
953                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
954                                 // operations are complete (and guest state is consistent) only after userspace
955                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
956                                 // incomplete operations and then check for pending signals.
957                                 // The pending state of the operation is not preserved in state which is
958                                 // visible to userspace, thus userspace should ensure that the operation is
959                                 // completed before performing a live migration.  Userspace can re-enter the
960                                 // guest with an unmasked signal pending or with the immediate_exit field set
961                                 // to complete pending operations without allowing any further instructions
962                                 // to be executed.
963 
964                                 #[cfg(feature = "kvm")]
965                                 if matches!(hypervisor_type, HypervisorType::Kvm) {
966                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
967                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
968                                         error!("Unexpected VM exit on \"immediate_exit\" run");
969                                         break;
970                                     }
971                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
972                                 }
973 
974                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
975 
976                                 vcpu_paused.store(true, Ordering::SeqCst);
977                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
978                                     thread::park();
979                                 }
980                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
981                             }
982 
983                             // We've been told to terminate
984                             if vcpu_kill_signalled.load(Ordering::SeqCst)
985                                 || vcpu_kill.load(Ordering::SeqCst)
986                             {
987                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
988                                 break;
989                             }
990 
991                             #[cfg(feature = "tdx")]
992                             let mut vcpu = vcpu.lock().unwrap();
993                             #[cfg(not(feature = "tdx"))]
994                             let vcpu = vcpu.lock().unwrap();
995                             // vcpu.run() returns VmExit::Reset on a triple fault, which triggers a VM reset below
996                             match vcpu.run() {
997                                 Ok(run) => match run {
998                                     #[cfg(feature = "kvm")]
999                                     VmExit::Debug => {
1000                                         info!("VmExit::Debug");
1001                                         #[cfg(feature = "guest_debug")]
1002                                         {
1003                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
1004                                             let raw_tid = get_raw_tid(vcpu_id as usize);
1005                                             vm_debug_evt.write(raw_tid as u64).unwrap();
1006                                         }
1007                                     }
1008                                     #[cfg(target_arch = "x86_64")]
1009                                     VmExit::IoapicEoi(vector) => {
1010                                         if let Some(interrupt_controller) =
1011                                             &interrupt_controller_clone
1012                                         {
1013                                             interrupt_controller
1014                                                 .lock()
1015                                                 .unwrap()
1016                                                 .end_of_interrupt(vector);
1017                                         }
1018                                     }
1019                                     VmExit::Ignore => {}
1020                                     VmExit::Hyperv => {}
1021                                     VmExit::Reset => {
1022                                         info!("VmExit::Reset");
1023                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1024                                         reset_evt.write(1).unwrap();
1025                                         break;
1026                                     }
1027                                     VmExit::Shutdown => {
1028                                         info!("VmExit::Shutdown");
1029                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1030                                         exit_evt.write(1).unwrap();
1031                                         break;
1032                                     }
1033                                     #[cfg(feature = "tdx")]
1034                                     VmExit::Tdx => {
1035                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1036                                             match vcpu.get_tdx_exit_details() {
1037                                                 Ok(details) => match details {
1038                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1039                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1040                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1041                                                     }
1042                                                 },
1043                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1044                                             }
1045                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1046                                         } else {
1047                                             // We should never reach this point;
1048                                             // getting here would indicate a design
1049                                             // error in this code.
1050                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1051                                         }
1052                                     }
1053                                     _ => {
1054                                         error!(
1055                                             "VCPU generated error: {:?}",
1056                                             Error::UnexpectedVmExit
1057                                         );
1058                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1059                                         exit_evt.write(1).unwrap();
1060                                         break;
1061                                     }
1062                                 },
1063 
1064                                 Err(e) => {
1065                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1066                                     vcpu_run_interrupted.store(true, Ordering::SeqCst);
1067                                     exit_evt.write(1).unwrap();
1068                                     break;
1069                                 }
1070                             }
1071 
1072                             // We've been told to terminate
1073                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1074                                 || vcpu_kill.load(Ordering::SeqCst)
1075                             {
1076                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1077                                 break;
1078                             }
1079                         }
1080                     })
1081                     .or_else(|_| {
1082                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1083                         error!("vCPU thread panicked");
1084                         panic_exit_evt.write(1)
1085                     })
1086                     .ok();
1087                 })
1088                 .map_err(Error::VcpuSpawn)?,
1089         );
1090 
1091         // On hot plug, calls into this function carry no entry point (it is None). It is
1092         // for those hot-plugged CPU additions that we need to set the inserting flag.
1093         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1094         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1095 
1096         Ok(())
1097     }
1098 
1099     /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
1100     fn activate_vcpus(
1101         &mut self,
1102         desired_vcpus: u8,
1103         inserting: bool,
1104         paused: Option<bool>,
1105     ) -> Result<()> {
1106         if desired_vcpus > self.config.max_vcpus {
1107             return Err(Error::DesiredVCpuCountExceedsMax);
1108         }
1109 
1110         let vcpu_thread_barrier = Arc::new(Barrier::new(
1111             (desired_vcpus - self.present_vcpus() + 1) as usize,
1112         ));
1113 
1114         if let Some(paused) = paused {
1115             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1116         }
1117 
1118         info!(
1119             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1120             desired_vcpus,
1121             self.vcpus.len(),
1122             self.present_vcpus(),
1123             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1124         );
1125 
1126         // This reuses any inactive vCPUs as well as any that were newly created
1127         for vcpu_id in self.present_vcpus()..desired_vcpus {
1128             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1129             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1130         }
1131 
1132         // Unblock all CPU threads.
1133         vcpu_thread_barrier.wait();
1134         Ok(())
1135     }
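// Worked example for the barrier count above (illustrative numbers): with
// present_vcpus() == 2 and desired_vcpus == 4, the barrier is created with
// 4 - 2 + 1 == 3 waiters: the two newly started vCPU threads plus this
// thread, which waits last so all vCPUs start together.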
1136 
1137     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1138         // Mark vCPUs for removal, actual removal happens on ejection
1139         for cpu_id in desired_vcpus..self.present_vcpus() {
1140             self.vcpu_states[usize::from(cpu_id)].removing = true;
1141         }
1142     }
1143 
1144     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1145         info!("Removing vCPU: cpu_id = {}", cpu_id);
1146         let state = &mut self.vcpu_states[usize::from(cpu_id)];
1147         state.kill.store(true, Ordering::SeqCst);
1148         state.signal_thread();
1149         state.join_thread()?;
1150         state.handle = None;
1151 
1152         // Once the thread has exited, clear the "kill" so that it can reused
1153         state.kill.store(false, Ordering::SeqCst);
1154 
1155         Ok(())
1156     }
1157 
1158     pub fn create_boot_vcpus(
1159         &mut self,
1160         snapshot: Option<Snapshot>,
1161     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1162         trace_scoped!("create_boot_vcpus");
1163 
1164         self.create_vcpus(self.boot_vcpus(), snapshot)
1165     }
1166 
1167     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1168     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1169         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1170     }
1171 
1172     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1173         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1174             .map_err(|e| {
1175                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1176             })?;
1177 
1178         Ok(())
1179     }
1180 
1181     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1182         if desired_vcpus == self.present_vcpus() {
1183             return Ok(false);
1184         }
1185 
1186         if !self.dynamic {
1187             return Ok(false);
1188         }
1189 
1190         match desired_vcpus.cmp(&self.present_vcpus()) {
1191             cmp::Ordering::Greater => {
1192                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1193                 for vcpu in vcpus {
1194                     self.configure_vcpu(vcpu, None)?
1195                 }
1196                 self.activate_vcpus(desired_vcpus, true, None)?;
1197                 Ok(true)
1198             }
1199             cmp::Ordering::Less => {
1200                 self.mark_vcpus_for_removal(desired_vcpus);
1201                 Ok(true)
1202             }
1203             _ => Ok(false),
1204         }
1205     }
1206 
1207     pub fn shutdown(&mut self) -> Result<()> {
1208         // Tell the vCPUs to stop themselves next time they go through the loop
1209         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1210 
1211         // Toggle the vCPUs pause boolean
1212         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1213 
1214         // Unpark all the VCPU threads.
1215         for state in self.vcpu_states.iter() {
1216             state.unpark_thread();
1217         }
1218 
1219         // Signal the spawned threads (vCPUs and console signal handler). For the vCPU threads
1220         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1221         // above.
1222         for state in self.vcpu_states.iter() {
1223             state.signal_thread();
1224         }
1225 
1226         // Wait for all the threads to finish. This removes the state from the vector.
1227         for mut state in self.vcpu_states.drain(..) {
1228             state.join_thread()?;
1229         }
1230 
1231         Ok(())
1232     }
1233 
1234     #[cfg(feature = "tdx")]
1235     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1236         for vcpu in &self.vcpus {
1237             vcpu.lock()
1238                 .unwrap()
1239                 .vcpu
1240                 .tdx_init(hob_address)
1241                 .map_err(Error::InitializeTdx)?;
1242         }
1243         Ok(())
1244     }
1245 
1246     pub fn boot_vcpus(&self) -> u8 {
1247         self.config.boot_vcpus
1248     }
1249 
1250     pub fn max_vcpus(&self) -> u8 {
1251         self.config.max_vcpus
1252     }
1253 
1254     #[cfg(target_arch = "x86_64")]
1255     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1256         assert!(!self.cpuid.is_empty());
1257         self.cpuid.clone()
1258     }
1259 
1260     fn present_vcpus(&self) -> u8 {
1261         self.vcpu_states
1262             .iter()
1263             .fold(0, |acc, state| acc + state.active() as u8)
1264     }
1265 
1266     #[cfg(target_arch = "aarch64")]
1267     pub fn get_mpidrs(&self) -> Vec<u64> {
1268         self.vcpus
1269             .iter()
1270             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1271             .collect()
1272     }
1273 
1274     #[cfg(target_arch = "aarch64")]
1275     pub fn get_saved_states(&self) -> Vec<CpuState> {
1276         self.vcpus
1277             .iter()
1278             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1279             .collect()
1280     }
1281 
1282     #[cfg(target_arch = "aarch64")]
1283     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1284         self.config
1285             .topology
1286             .clone()
1287             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1288     }
1289 
1290     pub fn create_madt(&self) -> Sdt {
1291         use crate::acpi;
1292         // This is also checked in the commandline parsing.
1293         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1294 
1295         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1296         #[cfg(target_arch = "x86_64")]
1297         {
1298             madt.write(36, arch::layout::APIC_START.0);
1299 
1300             for cpu in 0..self.config.max_vcpus {
1301                 let lapic = LocalX2Apic {
1302                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1303                     length: 16,
1304                     processor_id: cpu.into(),
1305                     apic_id: cpu.into(),
1306                     flags: if cpu < self.config.boot_vcpus {
1307                         1 << MADT_CPU_ENABLE_FLAG
1308                     } else {
1309                         0
1310                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1311                     _reserved: 0,
1312                 };
1313                 madt.append(lapic);
1314             }
1315 
1316             madt.append(Ioapic {
1317                 r#type: acpi::ACPI_APIC_IO,
1318                 length: 12,
1319                 ioapic_id: 0,
1320                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1321                 gsi_base: 0,
1322                 ..Default::default()
1323             });
1324 
1325             madt.append(InterruptSourceOverride {
1326                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1327                 length: 10,
1328                 bus: 0,
1329                 source: 4,
1330                 gsi: 4,
1331                 flags: 0,
1332             });
1333         }
1334 
1335         #[cfg(target_arch = "aarch64")]
1336         {
1337             /* Notes:
1338              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1339              */
1340 
1341             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1342             for cpu in 0..self.config.boot_vcpus {
1343                 let vcpu = &self.vcpus[cpu as usize];
1344                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1345                 /* ARMv8 MPIDR format:
1346                      Bits [63:40] Must be zero
1347                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1348                      Bits [31:24] Must be zero
1349                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1350                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1351                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1352                 */
1353                 let mpidr_mask = 0xff_00ff_ffff;
1354                 let gicc = GicC {
1355                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1356                     length: 80,
1357                     reserved0: 0,
1358                     cpu_interface_number: cpu as u32,
1359                     uid: cpu as u32,
1360                     flags: 1,
1361                     parking_version: 0,
1362                     performance_interrupt: 0,
1363                     parked_address: 0,
1364                     base_address: 0,
1365                     gicv_base_address: 0,
1366                     gich_base_address: 0,
1367                     vgic_interrupt: 0,
1368                     gicr_base_address: 0,
1369                     mpidr: mpidr & mpidr_mask,
1370                     proc_power_effi_class: 0,
1371                     reserved1: 0,
1372                     spe_overflow_interrupt: 0,
1373                 };
1374 
1375                 madt.append(gicc);
1376             }
1377             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1378 
1379             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1380             let gicd = GicD {
1381                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1382                 length: 24,
1383                 reserved0: 0,
1384                 gic_id: 0,
1385                 base_address: vgic_config.dist_addr,
1386                 global_irq_base: 0,
1387                 version: 3,
1388                 reserved1: [0; 3],
1389             };
1390             madt.append(gicd);
1391 
1392             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1393             let gicr = GicR {
1394                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1395                 length: 16,
1396                 reserved: 0,
1397                 base_address: vgic_config.redists_addr,
1398                 range_length: vgic_config.redists_size as u32,
1399             };
1400             madt.append(gicr);
1401 
1402             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1403             let gicits = GicIts {
1404                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1405                 length: 20,
1406                 reserved0: 0,
1407                 translation_id: 0,
1408                 base_address: vgic_config.msi_addr,
1409                 reserved1: 0,
1410             };
1411             madt.append(gicits);
1412 
1413             madt.update_checksum();
1414         }
1415 
1416         madt
1417     }
1418 
1419     #[cfg(target_arch = "aarch64")]
1420     pub fn create_pptt(&self) -> Sdt {
1421         let pptt_start = 0;
1422         let mut cpus = 0;
1423         let mut uid = 0;
1424         // If topology is not specified, the default setting is:
1425         // 1 package, multiple cores, 1 thread per core
1426         // This is also the behavior when PPTT is missing.
1427         let (threads_per_core, cores_per_package, packages) =
1428             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
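             // For example, with `--cpus boot=8,max=8` and no explicit topology,
             // this yields (threads_per_core, cores_per_package, packages) = (1, 8, 1).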
1429 
1430         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1431 
1432         for cluster_idx in 0..packages {
1433             if cpus < self.config.boot_vcpus as usize {
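                     // Flag encodings used in the nodes below (ACPI PPTT
                     // "Processor Hierarchy Node" flags): 0x2 = ACPI processor ID
                     // valid; 0xA additionally marks the node as a leaf; 0xE also
                     // marks it as a thread.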
1434                 let cluster_offset = pptt.len() - pptt_start;
1435                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1436                     r#type: 0,
1437                     length: 20,
1438                     reserved: 0,
1439                     flags: 0x2,
1440                     parent: 0,
1441                     acpi_processor_id: cluster_idx as u32,
1442                     num_private_resources: 0,
1443                 };
1444                 pptt.append(cluster_hierarchy_node);
1445 
1446                 for core_idx in 0..cores_per_package {
1447                     let core_offset = pptt.len() - pptt_start;
1448 
1449                     if threads_per_core > 1 {
1450                         let core_hierarchy_node = ProcessorHierarchyNode {
1451                             r#type: 0,
1452                             length: 20,
1453                             reserved: 0,
1454                             flags: 0x2,
1455                             parent: cluster_offset as u32,
1456                             acpi_processor_id: core_idx as u32,
1457                             num_private_resources: 0,
1458                         };
1459                         pptt.append(core_hierarchy_node);
1460 
1461                         for _thread_idx in 0..threads_per_core {
1462                             let thread_hierarchy_node = ProcessorHierarchyNode {
1463                                 r#type: 0,
1464                                 length: 20,
1465                                 reserved: 0,
1466                                 flags: 0xE,
1467                                 parent: core_offset as u32,
1468                                 acpi_processor_id: uid as u32,
1469                                 num_private_resources: 0,
1470                             };
1471                             pptt.append(thread_hierarchy_node);
1472                             uid += 1;
1473                         }
1474                     } else {
1475                         let thread_hierarchy_node = ProcessorHierarchyNode {
1476                             r#type: 0,
1477                             length: 20,
1478                             reserved: 0,
1479                             flags: 0xA,
1480                             parent: cluster_offset as u32,
1481                             acpi_processor_id: uid as u32,
1482                             num_private_resources: 0,
1483                         };
1484                         pptt.append(thread_hierarchy_node);
1485                         uid += 1;
1486                     }
1487                 }
1488                 cpus += (cores_per_package * threads_per_core) as usize;
1489             }
1490         }
1491 
1492         pptt.update_checksum();
1493         pptt
1494     }
1495 
1496     #[cfg(feature = "guest_debug")]
1497     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1498         self.vcpus[usize::from(cpu_id)]
1499             .lock()
1500             .unwrap()
1501             .vcpu
1502             .get_regs()
1503             .map_err(Error::CpuDebug)
1504     }
1505 
1506     #[cfg(feature = "guest_debug")]
1507     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1508         self.vcpus[usize::from(cpu_id)]
1509             .lock()
1510             .unwrap()
1511             .vcpu
1512             .set_regs(regs)
1513             .map_err(Error::CpuDebug)
1514     }
1515 
1516     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1517     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1518         self.vcpus[usize::from(cpu_id)]
1519             .lock()
1520             .unwrap()
1521             .vcpu
1522             .get_sregs()
1523             .map_err(Error::CpuDebug)
1524     }
1525 
1526     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1527     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1528         self.vcpus[usize::from(cpu_id)]
1529             .lock()
1530             .unwrap()
1531             .vcpu
1532             .set_sregs(sregs)
1533             .map_err(Error::CpuDebug)
1534     }
1535 
1536     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1537     fn translate_gva(
1538         &self,
1539         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1540         cpu_id: u8,
1541         gva: u64,
1542     ) -> Result<u64> {
1543         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1544             .lock()
1545             .unwrap()
1546             .vcpu
1547             .translate_gva(gva, /* flags: unused */ 0)
1548             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1549         Ok(gpa)
1550     }
1551 
1552     ///
1553     /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
1554     /// it in the VMM by walking through the translation tables.
1555     ///
1556     /// Address translation is a big topic; here we only focus on the scenario
1557     /// that happens in the VMM while debugging the kernel. This `translate_gva`
1558     /// implementation is restricted to:
1559     /// - Exception Level 1
1560     /// - Translating the high address range only (kernel space)
1561     ///
1562     /// This implementation supports the following Armv8-A features related to
1563     /// address translation:
1564     /// - FEAT_LPA
1565     /// - FEAT_LVA
1566     /// - FEAT_LPA2
1567     ///
1568     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1569     fn translate_gva(
1570         &self,
1571         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1572         cpu_id: u8,
1573         gva: u64,
1574     ) -> Result<u64> {
1575         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1576             .lock()
1577             .unwrap()
1578             .vcpu
1579             .get_sys_reg(regs::TCR_EL1)
1580             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1581         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1582             .lock()
1583             .unwrap()
1584             .vcpu
1585             .get_sys_reg(regs::TTBR1_EL1)
1586             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1587         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1588             .lock()
1589             .unwrap()
1590             .vcpu
1591             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1592             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1593 
1594         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1595         // or low (0x000xxx...).
1596         let high_range = extract_bits_64!(gva, 55, 1);
1597         if high_range == 0 {
1598             info!("VA (0x{:x}) range is not supported!", gva);
1599             return Ok(gva);
1600         }
1601 
1602         // High range size offset
1603         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1604         // Granule size
1605         let tg = extract_bits_64!(tcr_el1, 30, 2);
1606         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1607         let ds = extract_bits_64!(tcr_el1, 59, 1);
1608 
1609         if tsz == 0 {
1610             info!("VA translation is not ready!");
1611             return Ok(gva);
1612         }
1613 
1614         // VA size is determined by TCR_EL1.T1SZ
1615         let va_size = 64 - tsz;
1616         // Number of bits in VA consumed in each level of translation
1617         let stride = match tg {
1618             3 => 13, // 64KB granule size
1619             1 => 11, // 16KB granule size
1620             _ => 9,  // 4KB, default
1621         };
1622         // Starting level of walking
1623         let mut level = 4 - (va_size - 4) / stride;
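             // For example, a 4KB granule (stride = 9) with T1SZ = 16 gives
             // va_size = 48 and a starting level of 4 - (48 - 4) / 9 = 0.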
1624 
1625         // Determine the PA or IPA size
1626         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1627         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1628         // The IPA size in TCR_EL1 and PA Range in ID_AA64MMFR0_EL1 should match.
1629         // To be safe, we use the minimum value if they are different.
1630         let pa_range = std::cmp::min(tcr_ips, pa_range);
1631         // PA size in bits
1632         let pa_size = match pa_range {
1633             0 => 32,
1634             1 => 36,
1635             2 => 40,
1636             3 => 42,
1637             4 => 44,
1638             5 => 48,
1639             6 => 52,
1640             _ => {
1641                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1642                     "PA range not supported {pa_range}"
1643                 ))))
1644             }
1645         };
1646 
1647         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
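             // With a 4KB granule (stride = 9) this is 0xfff: 9 index bits plus
             // 3 bits because each descriptor is 8 bytes.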
1648         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1649         // If FEAT_LPA2 is present, the translation table descriptor holds
1650         // 50 bits of the table address of next level.
1651         // Otherwise, it is 48 bits.
1652         let descaddrmask = if ds == 1 {
1653             !0u64 >> (64 - 50) // mask with 50 least significant bits
1654         } else {
1655             !0u64 >> (64 - 48) // mask with 48 least significant bits
1656         };
1657         let descaddrmask = descaddrmask & !indexmask_grainsize;
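             // E.g. without FEAT_LPA2 and with a 4KB granule this evaluates to
             // 0x0000_ffff_ffff_f000: bits [47:12] hold the next-level table address.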
1658 
1659         // Translation table base address
1660         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1661         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1662         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1663         if pa_size == 52 {
1664             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1665         }
1666 
1667         // Loop through tables of each level
1668         loop {
1669             // Table offset for current level
1670             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1671             descaddr |= table_offset;
1672             descaddr &= !7u64;
1673 
1674             let mut buf = [0; 8];
1675             guest_memory
1676                 .memory()
1677                 .read(&mut buf, GuestAddress(descaddr))
1678                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1679             let descriptor = u64::from_le_bytes(buf);
1680 
1681             descaddr = descriptor & descaddrmask;
1682             // In the case of FEAT_LPA, the next-level translation table address
1683             // bits [48:51] come from bits [12:15] of the current descriptor.
1684             // For FEAT_LPA2, the next-level translation table address
1685             // bits [50:51] come from bits [8:9] of the current descriptor, and
1686             // bits [48:49] come from bits [48:49] of the descriptor that was
1687             // handled previously.
1688             if pa_size == 52 {
1689                 if ds == 1 {
1690                     // FEAT_LPA2
1691                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1692                 } else {
1693                     // FEAT_LPA
1694                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1695                 }
1696             }
1697 
1698             if (descriptor & 2) != 0 && (level < 3) {
1699                 // This is a table entry. Go down to next level.
1700                 level += 1;
1701                 indexmask = indexmask_grainsize;
1702                 continue;
1703             }
1704 
1705             break;
1706         }
1707 
1708         // We have reached either:
1709         // - a page entry at level 3 or
1710         // - a block entry at level 1 or 2
1711         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1712         descaddr &= !(page_size - 1);
1713         descaddr |= gva & (page_size - 1);
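             // For example, a page entry reached at level 3 with a 4KB granule
             // gives page_size = 1 << (9 * 1 + 3) = 4096, while a block entry at
             // level 2 gives a 2MB page.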
1714 
1715         Ok(descaddr)
1716     }
1717 
1718     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1719         self.acpi_address = Some(acpi_address);
1720     }
1721 
1722     pub(crate) fn set_interrupt_controller(
1723         &mut self,
1724         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1725     ) {
1726         self.interrupt_controller = Some(interrupt_controller);
1727     }
1728 
1729     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1730         &self.vcpus_kill_signalled
1731     }
1732 }
1733 
1734 struct Cpu {
1735     cpu_id: u8,
1736     proximity_domain: u32,
1737     dynamic: bool,
1738 }
1739 
1740 #[cfg(target_arch = "x86_64")]
1741 const MADT_CPU_ENABLE_FLAG: usize = 0;
1742 
1743 #[cfg(target_arch = "x86_64")]
1744 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1745 
1746 impl Cpu {
1747     #[cfg(target_arch = "x86_64")]
1748     fn generate_mat(&self) -> Vec<u8> {
1749         let lapic = LocalX2Apic {
1750             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1751             length: 16,
1752             processor_id: self.cpu_id.into(),
1753             apic_id: self.cpu_id.into(),
1754             flags: 1 << MADT_CPU_ENABLE_FLAG,
1755             _reserved: 0,
1756         };
1757 
1758         let mut mat_data: Vec<u8> = Vec::new();
1759         mat_data.resize(std::mem::size_of_val(&lapic), 0);
1760         // SAFETY: mat_data is large enough to hold lapic
1761         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
1762 
1763         mat_data
1764     }
1765 }
1766 
1767 impl Aml for Cpu {
1768     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1769         #[cfg(target_arch = "x86_64")]
1770         let mat_data: Vec<u8> = self.generate_mat();
1771         #[allow(clippy::if_same_then_else)]
1772         if self.dynamic {
1773             aml::Device::new(
1774                 format!("C{:03X}", self.cpu_id).as_str().into(),
1775                 vec![
1776                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1777                     &aml::Name::new("_UID".into(), &self.cpu_id),
1778                     // Currently, AArch64 cannot support the following fields.
1779                     /*
1780                     _STA return value:
1781                     Bit [0] – Set if the device is present.
1782                     Bit [1] – Set if the device is enabled and decoding its resources.
1783                     Bit [2] – Set if the device should be shown in the UI.
1784                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1785                     Bit [4] – Set if the battery is present.
1786                     Bits [31:5] – Reserved (must be cleared).
1787                     */
1788                     #[cfg(target_arch = "x86_64")]
1789                     &aml::Method::new(
1790                         "_STA".into(),
1791                         0,
1792                         false,
1793                         // Call into CSTA method which will interrogate device
1794                         vec![&aml::Return::new(&aml::MethodCall::new(
1795                             "CSTA".into(),
1796                             vec![&self.cpu_id],
1797                         ))],
1798                     ),
1799                     &aml::Method::new(
1800                         "_PXM".into(),
1801                         0,
1802                         false,
1803                         vec![&aml::Return::new(&self.proximity_domain)],
1804                     ),
1805                     // The Linux kernel expects every CPU device to have a _MAT entry
1806                     // containing the LAPIC for this processor with the enabled bit set,
1807                     // even if it is disabled in the MADT (non-boot CPU).
1808                     #[cfg(target_arch = "x86_64")]
1809                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1810                     // Trigger CPU ejection
1811                     #[cfg(target_arch = "x86_64")]
1812                     &aml::Method::new(
1813                         "_EJ0".into(),
1814                         1,
1815                         false,
1816                         // Call into CEJ0 method which will actually eject device
1817                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1818                     ),
1819                 ],
1820             )
1821             .to_aml_bytes(sink);
1822         } else {
1823             aml::Device::new(
1824                 format!("C{:03X}", self.cpu_id).as_str().into(),
1825                 vec![
1826                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1827                     &aml::Name::new("_UID".into(), &self.cpu_id),
1828                     #[cfg(target_arch = "x86_64")]
1829                     &aml::Method::new(
1830                         "_STA".into(),
1831                         0,
1832                         false,
1833                         // Mark the CPU present; see the CSTA implementation
1834                         vec![&aml::Return::new(&0xfu8)],
1835                     ),
1836                     &aml::Method::new(
1837                         "_PXM".into(),
1838                         0,
1839                         false,
1840                         vec![&aml::Return::new(&self.proximity_domain)],
1841                     ),
1842                     // The Linux kernel expects every CPU device to have a _MAT entry
1843                     // containing the LAPIC for this processor with the enabled bit set,
1844                     // even if it is disabled in the MADT (non-boot CPU).
1845                     #[cfg(target_arch = "x86_64")]
1846                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1847                 ],
1848             )
1849             .to_aml_bytes(sink);
1850         }
1851     }
1852 }
1853 
1854 struct CpuNotify {
1855     cpu_id: u8,
1856 }
1857 
1858 impl Aml for CpuNotify {
1859     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1860         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
1861         aml::If::new(
1862             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
1863             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1864         )
1865         .to_aml_bytes(sink)
1866     }
1867 }
1868 
1869 struct CpuMethods {
1870     max_vcpus: u8,
1871     dynamic: bool,
1872 }
1873 
1874 impl Aml for CpuMethods {
1875     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1876         if self.dynamic {
1877             // CPU status method
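                 // Roughly the ASL this emits (a sketch; the PRES fields are
                 // defined in the hotplug controller device below):
                 //   Method (CSTA, 1, Serialized) {
                 //       Acquire (\_SB.PRES.CPLK, 0xFFFF)
                 //       Store (Arg0, \_SB.PRES.CSEL)
                 //       Store (Zero, Local0)
                 //       If (LEqual (\_SB.PRES.CPEN, One)) { Store (0x0F, Local0) }
                 //       Release (\_SB.PRES.CPLK)
                 //       Return (Local0)
                 //   }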
1878             aml::Method::new(
1879                 "CSTA".into(),
1880                 1,
1881                 true,
1882                 vec![
1883                     // Take lock defined above
1884                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1885                     // Write the CPU number (in the first argument) to the CSEL field
1886                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1887                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1888                     // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
1889                     &aml::If::new(
1890                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
1891                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
1892                     ),
1893                     // Release lock
1894                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1895                     // Return 0 or 0xf
1896                     &aml::Return::new(&aml::Local(0)),
1897                 ],
1898             )
1899             .to_aml_bytes(sink);
1900 
1901             let mut cpu_notifies = Vec::new();
1902             for cpu_id in 0..self.max_vcpus {
1903                 cpu_notifies.push(CpuNotify { cpu_id });
1904             }
1905 
1906             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
1907             for cpu_id in 0..self.max_vcpus {
1908                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
1909             }
1910 
1911             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
1912 
1913             aml::Method::new(
1914                 "CEJ0".into(),
1915                 1,
1916                 true,
1917                 vec![
1918                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1919                     // Write the CPU number (in the first argument) to the CSEL field
1920                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1921                     // Set CEJ0 bit
1922                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
1923                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1924                 ],
1925             )
1926             .to_aml_bytes(sink);
1927 
1928             aml::Method::new(
1929                 "CSCN".into(),
1930                 0,
1931                 true,
1932                 vec![
1933                     // Take lock defined above
1934                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1935                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1936                     &aml::While::new(
1937                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
1938                         vec![
1939                             // Write the CPU number (the loop counter in Local0) to the CSEL field
1940                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
1941                             // Check if CINS bit is set
1942                             &aml::If::new(
1943                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
1944                                 // Notify device if it is
1945                                 vec![
1946                                     &aml::MethodCall::new(
1947                                         "CTFY".into(),
1948                                         vec![&aml::Local(0), &aml::ONE],
1949                                     ),
1950                                     // Reset CINS bit (acknowledged by writing one back)
1951                                     &aml::Store::new(
1952                                         &aml::Path::new("\\_SB_.PRES.CINS"),
1953                                         &aml::ONE,
1954                                     ),
1955                                 ],
1956                             ),
1957                             // Check if CRMV bit is set
1958                             &aml::If::new(
1959                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
1960                                 // Notify device if it is (with the eject constant 0x3)
1961                                 vec![
1962                                     &aml::MethodCall::new(
1963                                         "CTFY".into(),
1964                                         vec![&aml::Local(0), &3u8],
1965                                     ),
1966                                     // Reset CRMV bit (acknowledged by writing one back)
1967                                     &aml::Store::new(
1968                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
1969                                         &aml::ONE,
1970                                     ),
1971                                 ],
1972                             ),
1973                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
1974                         ],
1975                     ),
1976                     // Release lock
1977                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1978                 ],
1979             )
1980             .to_aml_bytes(sink)
1981         } else {
1982             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
1983         }
1984     }
1985 }
1986 
1987 impl Aml for CpuManager {
1988     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1989         #[cfg(target_arch = "x86_64")]
1990         if let Some(acpi_address) = self.acpi_address {
1991             // CPU hotplug controller
1992             aml::Device::new(
1993                 "_SB_.PRES".into(),
1994                 vec![
1995                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
1996                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
1997                     // Mutex to protect concurrent access as we write to choose CPU and then read back status
1998                     &aml::Mutex::new("CPLK".into(), 0),
1999                     &aml::Name::new(
2000                         "_CRS".into(),
2001                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2002                             aml::AddressSpaceCachable::NotCacheable,
2003                             true,
2004                             acpi_address.0,
2005                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2006                             None,
2007                         )]),
2008                     ),
2009                     // OpRegion and Fields map MMIO range into individual field values
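                         // Rough layout of the region (the two Fields alias the
                         // same memory):
                         //   dword 0: CSEL (CPU selector)
                         //   byte  4: CPEN/CINS/CRMV/CEJ0 flag bits
                         //   byte  5: CCMD
                         //   dword 2: CDAT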
2010                     &aml::OpRegion::new(
2011                         "PRST".into(),
2012                         aml::OpRegionSpace::SystemMemory,
2013                         &(acpi_address.0 as usize),
2014                         &CPU_MANAGER_ACPI_SIZE,
2015                     ),
2016                     &aml::Field::new(
2017                         "PRST".into(),
2018                         aml::FieldAccessType::Byte,
2019                         aml::FieldLockRule::NoLock,
2020                         aml::FieldUpdateRule::WriteAsZeroes,
2021                         vec![
2022                             aml::FieldEntry::Reserved(32),
2023                             aml::FieldEntry::Named(*b"CPEN", 1),
2024                             aml::FieldEntry::Named(*b"CINS", 1),
2025                             aml::FieldEntry::Named(*b"CRMV", 1),
2026                             aml::FieldEntry::Named(*b"CEJ0", 1),
2027                             aml::FieldEntry::Reserved(4),
2028                             aml::FieldEntry::Named(*b"CCMD", 8),
2029                         ],
2030                     ),
2031                     &aml::Field::new(
2032                         "PRST".into(),
2033                         aml::FieldAccessType::DWord,
2034                         aml::FieldLockRule::NoLock,
2035                         aml::FieldUpdateRule::Preserve,
2036                         vec![
2037                             aml::FieldEntry::Named(*b"CSEL", 32),
2038                             aml::FieldEntry::Reserved(32),
2039                             aml::FieldEntry::Named(*b"CDAT", 32),
2040                         ],
2041                     ),
2042                 ],
2043             )
2044             .to_aml_bytes(sink);
2045         }
2046 
2047         // CPU devices
2048         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2049         let cid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2050         // Bundle methods together under a common object
2051         let methods = CpuMethods {
2052             max_vcpus: self.config.max_vcpus,
2053             dynamic: self.dynamic,
2054         };
2055         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &cid, &methods];
2056 
2057         let mut cpu_devices = Vec::new();
2058         for cpu_id in 0..self.config.max_vcpus {
2059             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2060             let cpu_device = Cpu {
2061                 cpu_id,
2062                 proximity_domain,
2063                 dynamic: self.dynamic,
2064             };
2065 
2066             cpu_devices.push(cpu_device);
2067         }
2068 
2069         for cpu_device in cpu_devices.iter() {
2070             cpu_data_inner.push(cpu_device);
2071         }
2072 
2073         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2074     }
2075 }
2076 
2077 impl Pausable for CpuManager {
2078     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2079         // Tell the vCPUs to pause themselves next time they exit
2080         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2081 
2082         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2083         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2084         // above.
2085         for state in self.vcpu_states.iter() {
2086             state.signal_thread();
2087         }
2088 
2089         for vcpu in self.vcpus.iter() {
2090             let mut vcpu = vcpu.lock().unwrap();
2091             vcpu.pause()?;
2092             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2093             if !self.config.kvm_hyperv {
2094                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2095                     MigratableError::Pause(anyhow!(
2096                         "Could not notify guest it has been paused {:?}",
2097                         e
2098                     ))
2099                 })?;
2100             }
2101         }
2102 
2103         // The vCPU thread will change its paused state before parking; wait here for
2104         // each active vCPU to change its state, to ensure they have all parked.
2105         for state in self.vcpu_states.iter() {
2106             if state.active() {
2107                 while !state.paused.load(Ordering::SeqCst) {
2108                     // To avoid a priority inversion with the vCPU thread
2109                     thread::sleep(std::time::Duration::from_millis(1));
2110                 }
2111             }
2112         }
2113 
2114         Ok(())
2115     }
2116 
2117     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2118         for vcpu in self.vcpus.iter() {
2119             vcpu.lock().unwrap().resume()?;
2120         }
2121 
2122         // Toggle the vCPUs pause boolean
2123         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2124 
2125         // Unpark all the vCPU threads.
2126         // Once unparked, the next thing they will do is check the pause
2127         // boolean. Since it is now set to false, they will exit their pause loop
2128         // and resume running the guest.
2129         for state in self.vcpu_states.iter() {
2130             state.paused.store(false, Ordering::SeqCst);
2131             state.unpark_thread();
2132         }
2133         Ok(())
2134     }
2135 }
2136 
2137 impl Snapshottable for CpuManager {
2138     fn id(&self) -> String {
2139         CPU_MANAGER_SNAPSHOT_ID.to_string()
2140     }
2141 
2142     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2143         let mut cpu_manager_snapshot = Snapshot::default();
2144 
2145         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2146         for vcpu in &self.vcpus {
2147             let mut vcpu = vcpu.lock().unwrap();
2148             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2149         }
2150 
2151         Ok(cpu_manager_snapshot)
2152     }
2153 }
2154 
2155 impl Transportable for CpuManager {}
2156 impl Migratable for CpuManager {}
2157 
2158 #[cfg(feature = "guest_debug")]
2159 impl Debuggable for CpuManager {
2160     #[cfg(feature = "kvm")]
2161     fn set_guest_debug(
2162         &self,
2163         cpu_id: usize,
2164         addrs: &[GuestAddress],
2165         singlestep: bool,
2166     ) -> std::result::Result<(), DebuggableError> {
2167         self.vcpus[cpu_id]
2168             .lock()
2169             .unwrap()
2170             .vcpu
2171             .set_guest_debug(addrs, singlestep)
2172             .map_err(DebuggableError::SetDebug)
2173     }
2174 
2175     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2176         Ok(())
2177     }
2178 
2179     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2180         Ok(())
2181     }
2182 
2183     #[cfg(target_arch = "x86_64")]
2184     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2185         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2186         let gregs = self
2187             .get_regs(cpu_id as u8)
2188             .map_err(DebuggableError::ReadRegs)?;
2189         let regs = [
2190             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
2191             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
2192         ];
2193 
2194         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2195         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2196         let eflags = gregs.rflags as u32;
2197         let rip = gregs.rip;
2198 
2199         // Segment registers: CS, SS, DS, ES, FS, GS
2200         let sregs = self
2201             .get_sregs(cpu_id as u8)
2202             .map_err(DebuggableError::ReadRegs)?;
2203         let segments = X86SegmentRegs {
2204             cs: sregs.cs.selector as u32,
2205             ss: sregs.ss.selector as u32,
2206             ds: sregs.ds.selector as u32,
2207             es: sregs.es.selector as u32,
2208             fs: sregs.fs.selector as u32,
2209             gs: sregs.gs.selector as u32,
2210         };
2211 
2212         // TODO: Add other registers
2213 
2214         Ok(CoreRegs {
2215             regs,
2216             eflags,
2217             rip,
2218             segments,
2219             ..Default::default()
2220         })
2221     }
2222 
2223     #[cfg(target_arch = "aarch64")]
2224     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2225         let gregs = self
2226             .get_regs(cpu_id as u8)
2227             .map_err(DebuggableError::ReadRegs)?;
2228         Ok(CoreRegs {
2229             x: gregs.regs.regs,
2230             sp: gregs.regs.sp,
2231             pc: gregs.regs.pc,
2232             ..Default::default()
2233         })
2234     }
2235 
2236     #[cfg(target_arch = "x86_64")]
2237     fn write_regs(
2238         &self,
2239         cpu_id: usize,
2240         regs: &CoreRegs,
2241     ) -> std::result::Result<(), DebuggableError> {
2242         let orig_gregs = self
2243             .get_regs(cpu_id as u8)
2244             .map_err(DebuggableError::ReadRegs)?;
2245         let gregs = StandardRegisters {
2246             rax: regs.regs[0],
2247             rbx: regs.regs[1],
2248             rcx: regs.regs[2],
2249             rdx: regs.regs[3],
2250             rsi: regs.regs[4],
2251             rdi: regs.regs[5],
2252             rbp: regs.regs[6],
2253             rsp: regs.regs[7],
2254             r8: regs.regs[8],
2255             r9: regs.regs[9],
2256             r10: regs.regs[10],
2257             r11: regs.regs[11],
2258             r12: regs.regs[12],
2259             r13: regs.regs[13],
2260             r14: regs.regs[14],
2261             r15: regs.regs[15],
2262             rip: regs.rip,
2263             // Update the lower 32 bits of rflags.
2264             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2265         };
2266 
2267         self.set_regs(cpu_id as u8, &gregs)
2268             .map_err(DebuggableError::WriteRegs)?;
2269 
2270         // Segment registers: CS, SS, DS, ES, FS, GS
2271         // Since GDB only cares about the selectors, we call get_sregs() first.
2272         let mut sregs = self
2273             .get_sregs(cpu_id as u8)
2274             .map_err(DebuggableError::ReadRegs)?;
2275         sregs.cs.selector = regs.segments.cs as u16;
2276         sregs.ss.selector = regs.segments.ss as u16;
2277         sregs.ds.selector = regs.segments.ds as u16;
2278         sregs.es.selector = regs.segments.es as u16;
2279         sregs.fs.selector = regs.segments.fs as u16;
2280         sregs.gs.selector = regs.segments.gs as u16;
2281 
2282         self.set_sregs(cpu_id as u8, &sregs)
2283             .map_err(DebuggableError::WriteRegs)?;
2284 
2285         // TODO: Add other registers
2286 
2287         Ok(())
2288     }
2289 
2290     #[cfg(target_arch = "aarch64")]
2291     fn write_regs(
2292         &self,
2293         cpu_id: usize,
2294         regs: &CoreRegs,
2295     ) -> std::result::Result<(), DebuggableError> {
2296         let mut gregs = self
2297             .get_regs(cpu_id as u8)
2298             .map_err(DebuggableError::ReadRegs)?;
2299 
2300         gregs.regs.regs = regs.x;
2301         gregs.regs.sp = regs.sp;
2302         gregs.regs.pc = regs.pc;
2303 
2304         self.set_regs(cpu_id as u8, &gregs)
2305             .map_err(DebuggableError::WriteRegs)?;
2306 
2307         Ok(())
2308     }
2309 
2310     fn read_mem(
2311         &self,
2312         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2313         cpu_id: usize,
2314         vaddr: GuestAddress,
2315         len: usize,
2316     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2317         let mut buf = vec![0; len];
2318         let mut total_read = 0_u64;
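             // Read in page-sized chunks so that every chunk is translated
             // separately and never crosses a page boundary. E.g. with 4KB pages,
             // a 32-byte read at a vaddr ending in 0xff0 becomes one 16-byte read
             // at the end of that page and one at the start of the next.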
2319 
2320         while total_read < len as u64 {
2321             let gaddr = vaddr.0 + total_read;
2322             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2323                 Ok(paddr) => paddr,
2324                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2325                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2326             };
2327             let psize = arch::PAGE_SIZE as u64;
2328             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2329             guest_memory
2330                 .memory()
2331                 .read(
2332                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2333                     GuestAddress(paddr),
2334                 )
2335                 .map_err(DebuggableError::ReadMem)?;
2336             total_read += read_len;
2337         }
2338         Ok(buf)
2339     }
2340 
2341     fn write_mem(
2342         &self,
2343         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2344         cpu_id: usize,
2345         vaddr: &GuestAddress,
2346         data: &[u8],
2347     ) -> std::result::Result<(), DebuggableError> {
2348         let mut total_written = 0_u64;
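             // Mirror of read_mem above: chunk the write at page boundaries,
             // translating each page separately.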
2349 
2350         while total_written < data.len() as u64 {
2351             let gaddr = vaddr.0 + total_written;
2352             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2353                 Ok(paddr) => paddr,
2354                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2355                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2356             };
2357             let psize = arch::PAGE_SIZE as u64;
2358             let write_len = std::cmp::min(
2359                 data.len() as u64 - total_written,
2360                 psize - (paddr & (psize - 1)),
2361             );
2362             guest_memory
2363                 .memory()
2364                 .write(
2365                     &data[total_written as usize..total_written as usize + write_len as usize],
2366                     GuestAddress(paddr),
2367                 )
2368                 .map_err(DebuggableError::WriteMem)?;
2369             total_written += write_len;
2370         }
2371         Ok(())
2372     }
2373 
2374     fn active_vcpus(&self) -> usize {
2375         self.present_vcpus() as usize
2376     }
2377 }
2378 
2379 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2380 impl Elf64Writable for CpuManager {}
2381 
2382 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2383 impl CpuElf64Writable for CpuManager {
2384     fn cpu_write_elf64_note(
2385         &mut self,
2386         dump_state: &DumpState,
2387     ) -> std::result::Result<(), GuestDebuggableError> {
2388         let mut coredump_file = dump_state.file.as_ref().unwrap();
2389         for vcpu in &self.vcpus {
2390             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2391             let mut pos: usize = 0;
2392             let mut buf = vec![0; note_size as usize];
2393             let descsz = size_of::<X86_64ElfPrStatus>();
2394             let vcpu_id = vcpu.lock().unwrap().id;
2395 
2396             let note = Elf64_Nhdr {
2397                 n_namesz: COREDUMP_NAME_SIZE,
2398                 n_descsz: descsz as u32,
2399                 n_type: NT_PRSTATUS,
2400             };
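                 // Note layout: the Elf64_Nhdr above, then the name ("CORE")
                 // padded to a 4-byte boundary, then the descriptor (an
                 // X86_64ElfPrStatus whose user-regs area is filled in below).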
2401 
2402             let bytes: &[u8] = note.as_slice();
2403             buf.splice(0.., bytes.to_vec());
2404             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2405             buf.resize(pos + 4, 0);
2406             buf.splice(pos.., "CORE".to_string().into_bytes());
2407 
2408             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2409             buf.resize(pos + 32 + 4, 0);
2410             let pid = vcpu_id as u64;
2411             let bytes: &[u8] = pid.as_slice();
2412             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2413 
2414             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2415 
2416             let orig_rax: u64 = 0;
2417             let gregs = self.vcpus[usize::from(vcpu_id)]
2418                 .lock()
2419                 .unwrap()
2420                 .vcpu
2421                 .get_regs()
2422                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2423 
2424             let regs1 = [
2425                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2426                 gregs.r10,
2427             ];
2428             let regs2 = [
2429                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2430             ];
2431 
2432             let sregs = self.vcpus[usize::from(vcpu_id)]
2433                 .lock()
2434                 .unwrap()
2435                 .vcpu
2436                 .get_sregs()
2437                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2438 
2439             debug!(
2440                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2441                 gregs.rip,
2442                 gregs.rsp,
2443                 sregs.gs.base,
2444                 sregs.cs.selector,
2445                 sregs.ss.selector,
2446                 sregs.ds.selector,
2447             );
2448 
2449             let regs = X86_64UserRegs {
2450                 regs1,
2451                 regs2,
2452                 rip: gregs.rip,
2453                 cs: sregs.cs.selector as u64,
2454                 eflags: gregs.rflags,
2455                 rsp: gregs.rsp,
2456                 ss: sregs.ss.selector as u64,
2457                 fs_base: sregs.fs.base,
2458                 gs_base: sregs.gs.base,
2459                 ds: sregs.ds.selector as u64,
2460                 es: sregs.es.selector as u64,
2461                 fs: sregs.fs.selector as u64,
2462                 gs: sregs.gs.selector as u64,
2463             };
2464 
2466             let bytes: &[u8] = regs.as_slice();
2467             buf.resize(note_size as usize, 0);
2468             buf.splice(pos.., bytes.to_vec());
2469             buf.resize(note_size as usize, 0);
2470 
2471             coredump_file
2472                 .write_all(&buf)
2473                 .map_err(GuestDebuggableError::CoredumpFile)?;
2474         }
2475 
2476         Ok(())
2477     }
2478 
2479     fn cpu_write_vmm_note(
2480         &mut self,
2481         dump_state: &DumpState,
2482     ) -> std::result::Result<(), GuestDebuggableError> {
2483         let mut coredump_file = dump_state.file.as_ref().unwrap();
2484         for vcpu in &self.vcpus {
2485             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2486             let mut pos: usize = 0;
2487             let mut buf = vec![0; note_size as usize];
2488             let descsz = size_of::<DumpCpusState>();
2489             let vcpu_id = vcpu.lock().unwrap().id;
2490 
2491             let note = Elf64_Nhdr {
2492                 n_namesz: COREDUMP_NAME_SIZE,
2493                 n_descsz: descsz as u32,
2494                 n_type: 0,
2495             };
2496 
2497             let bytes: &[u8] = note.as_slice();
2498             buf.splice(0.., bytes.to_vec());
2499             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2500 
2501             buf.resize(pos + 4, 0);
2502             buf.splice(pos.., "QEMU".to_string().into_bytes());
2503 
2504             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2505 
2506             let gregs = self.vcpus[usize::from(vcpu_id)]
2507                 .lock()
2508                 .unwrap()
2509                 .vcpu
2510                 .get_regs()
2511                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2512 
2513             let regs1 = [
2514                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2515                 gregs.rbp,
2516             ];
2517 
2518             let regs2 = [
2519                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2520                 gregs.r15,
2521             ];
2522 
2523             let sregs = self.vcpus[usize::from(vcpu_id)]
2524                 .lock()
2525                 .unwrap()
2526                 .vcpu
2527                 .get_sregs()
2528                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2529 
2530             let mut msrs = vec![MsrEntry {
2531                 index: msr_index::MSR_KERNEL_GS_BASE,
2532                 ..Default::default()
2533             }];
2534 
2535             self.vcpus[vcpu_id as usize]
2536                 .lock()
2537                 .unwrap()
2538                 .vcpu
2539                 .get_msrs(&mut msrs)
2540                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2541             let kernel_gs_base = msrs[0].data;
2542 
2543             let cs = CpuSegment::new(sregs.cs);
2544             let ds = CpuSegment::new(sregs.ds);
2545             let es = CpuSegment::new(sregs.es);
2546             let fs = CpuSegment::new(sregs.fs);
2547             let gs = CpuSegment::new(sregs.gs);
2548             let ss = CpuSegment::new(sregs.ss);
2549             let ldt = CpuSegment::new(sregs.ldt);
2550             let tr = CpuSegment::new(sregs.tr);
2551             let gdt = CpuSegment::new_from_table(sregs.gdt);
2552             let idt = CpuSegment::new_from_table(sregs.idt);
2553             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2554             let regs = DumpCpusState {
2555                 version: 1,
2556                 size: size_of::<DumpCpusState>() as u32,
2557                 regs1,
2558                 regs2,
2559                 rip: gregs.rip,
2560                 rflags: gregs.rflags,
2561                 cs,
2562                 ds,
2563                 es,
2564                 fs,
2565                 gs,
2566                 ss,
2567                 ldt,
2568                 tr,
2569                 gdt,
2570                 idt,
2571                 cr,
2572                 kernel_gs_base,
2573             };
2574 
2575             let bytes: &[u8] = regs.as_slice();
2576             buf.resize(note_size as usize, 0);
2577             buf.splice(pos.., bytes.to_vec());
2578             buf.resize(note_size as usize, 0);
2579 
2580             coredump_file
2581                 .write_all(&buf)
2582                 .map_err(GuestDebuggableError::CoredumpFile)?;
2583         }
2584 
2585         Ok(())
2586     }
2587 }
2588 
2589 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2590 #[cfg(test)]
2591 mod tests {
2592     use arch::x86_64::interrupts::*;
2593     use arch::x86_64::regs::*;
2594     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2595 
2596     #[test]
2597     fn test_setlint() {
2598         let hv = hypervisor::new().unwrap();
2599         let vm = hv.create_vm().expect("new VM fd creation failed");
2600         assert!(hv.check_required_extensions().is_ok());
2601         // Calling get_lapic will fail if there is no irqchip created beforehand.
2602         assert!(vm.create_irq_chip().is_ok());
2603         let vcpu = vm.create_vcpu(0, None).unwrap();
2604         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2605 
2606         // Compute the value that is expected to represent LVT0 and LVT1.
2607         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2608         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2609         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2610         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2611 
2612         set_lint(&vcpu).unwrap();
2613 
2614         // Compute the value that represents LVT0 and LVT1 after set_lint.
2615         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2616         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2617         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2618         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2619         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2620     }
2621 
2622     #[test]
2623     fn test_setup_fpu() {
2624         let hv = hypervisor::new().unwrap();
2625         let vm = hv.create_vm().expect("new VM fd creation failed");
2626         let vcpu = vm.create_vcpu(0, None).unwrap();
2627         setup_fpu(&vcpu).unwrap();
2628 
2629         let expected_fpu: FpuState = FpuState {
2630             fcw: 0x37f,
2631             mxcsr: 0x1f80,
2632             ..Default::default()
2633         };
2634         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2635         // TODO: auto-generate kvm related structures with PartialEq on.
2636         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2637         // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
2638         // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
2639         // The mxcsr will stay 0 and the assert below fails. We need to decide
2640         // whether or not we should remove it altogether.
2641         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2642     }
2643 
2644     #[test]
2645     fn test_setup_msrs() {
2646         use hypervisor::arch::x86::{msr_index, MsrEntry};
2647 
2648         let hv = hypervisor::new().unwrap();
2649         let vm = hv.create_vm().expect("new VM fd creation failed");
2650         let vcpu = vm.create_vcpu(0, None).unwrap();
2651         setup_msrs(&vcpu).unwrap();
2652 
2653         // This test will check against the last MSR entry configured (the tenth one).
2654         // See create_msr_entries for details.
2655         let mut msrs = vec![MsrEntry {
2656             index: msr_index::MSR_IA32_MISC_ENABLE,
2657             ..Default::default()
2658         }];
2659 
2660         // get_msrs returns the number of MSRs that it succeeded in reading. We only want to
2661         // read one in this test case scenario.
2662         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2663         assert_eq!(read_msrs, 1);
2664 
2665         // Official entries that were set up when we did setup_msrs. We need to assert that the
2666         // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
2667         // expect.
2668         let entry_vec = vcpu.boot_msr_entries();
2669         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2670     }
2671 
2672     #[test]
2673     fn test_setup_regs() {
2674         let hv = hypervisor::new().unwrap();
2675         let vm = hv.create_vm().expect("new VM fd creation failed");
2676         let vcpu = vm.create_vcpu(0, None).unwrap();
2677 
2678         let expected_regs: StandardRegisters = StandardRegisters {
2679             rflags: 0x0000000000000002u64,
2680             rbx: arch::layout::PVH_INFO_START.0,
2681             rip: 1,
2682             ..Default::default()
2683         };
2684 
2685         setup_regs(&vcpu, expected_regs.rip).unwrap();
2686 
2687         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2688         assert_eq!(actual_regs, expected_regs);
2689     }
2690 }
2691 
2692 #[cfg(target_arch = "aarch64")]
2693 #[cfg(test)]
2694 mod tests {
2695     use arch::{aarch64::regs, layout};
2696     use hypervisor::kvm::aarch64::is_system_register;
2697     use hypervisor::kvm::kvm_bindings::{
2698         kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2699         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2700     };
2701     use hypervisor::{arm64_core_reg_id, offset_of};
2702     use std::mem;
2703 
2704     #[test]
2705     fn test_setup_regs() {
2706         let hv = hypervisor::new().unwrap();
2707         let vm = hv.create_vm().unwrap();
2708         let vcpu = vm.create_vcpu(0, None).unwrap();
2709 
2710         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2711         // Must fail when vcpu is not initialized yet.
2712         assert!(res.is_err());
2713 
2714         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2715         vm.get_preferred_target(&mut kvi).unwrap();
2716         vcpu.vcpu_init(&kvi).unwrap();
2717 
2718         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2719     }
2720 
2721     #[test]
2722     fn test_read_mpidr() {
2723         let hv = hypervisor::new().unwrap();
2724         let vm = hv.create_vm().unwrap();
2725         let vcpu = vm.create_vcpu(0, None).unwrap();
2726         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2727         vm.get_preferred_target(&mut kvi).unwrap();
2728 
2729         // Must fail when vcpu is not initialized yet.
2730         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2731 
2732         vcpu.vcpu_init(&kvi).unwrap();
2733         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2734     }
2735 
2736     #[test]
2737     fn test_is_system_register() {
2738         let offset = offset_of!(user_pt_regs, pc);
2739         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2740         assert!(!is_system_register(regid));
2741         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2742         assert!(is_system_register(regid));
2743     }
2744 
2745     #[test]
2746     fn test_save_restore_core_regs() {
2747         let hv = hypervisor::new().unwrap();
2748         let vm = hv.create_vm().unwrap();
2749         let vcpu = vm.create_vcpu(0, None).unwrap();
2750         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2751         vm.get_preferred_target(&mut kvi).unwrap();
2752 
2753         // Must fail when vcpu is not initialized yet.
2754         let res = vcpu.get_regs();
2755         assert!(res.is_err());
2756         assert_eq!(
2757             format!("{}", res.unwrap_err()),
2758             "Failed to get core register: Exec format error (os error 8)"
2759         );
2760 
2761         let mut state = kvm_regs::default();
2762         let res = vcpu.set_regs(&state);
2763         assert!(res.is_err());
2764         assert_eq!(
2765             format!("{}", res.unwrap_err()),
2766             "Failed to set core register: Exec format error (os error 8)"
2767         );
2768 
2769         vcpu.vcpu_init(&kvi).unwrap();
2770         let res = vcpu.get_regs();
2771         assert!(res.is_ok());
2772         state = res.unwrap();
2773         assert_eq!(state.regs.pstate, 0x3C5);
2774 
2775         assert!(vcpu.set_regs(&state).is_ok());
2776     }
2777 
2778     #[test]
2779     fn test_get_set_mpstate() {
2780         let hv = hypervisor::new().unwrap();
2781         let vm = hv.create_vm().unwrap();
2782         let vcpu = vm.create_vcpu(0, None).unwrap();
2783         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2784         vm.get_preferred_target(&mut kvi).unwrap();
2785 
2786         let res = vcpu.get_mp_state();
2787         assert!(res.is_ok());
2788         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2789     }
2790 }
2791