// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
#[cfg(feature = "guest_debug")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
#[cfg(target_arch = "x86_64")]
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
use acpi_tables::{aml, sdt::Sdt, Aml};
use anyhow::anyhow;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use arch::aarch64::regs;
use arch::EntryPoint;
use arch::NumaNodes;
#[cfg(target_arch = "aarch64")]
use devices::gic::Gic;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use hypervisor::aarch64::StandardRegisters;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(all(target_arch = "aarch64", feature = "kvm"))]
use hypervisor::kvm::kvm_ioctls::Cap;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuVendor;
use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use std::collections::BTreeMap;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::io::Write;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::BusDevice;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use vm_memory::ByteValued;
#[cfg(feature = "guest_debug")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::AsBytes;

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

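/// Extract the lowest `$length` bits of a 64-bit integer, i.e.
/// `extract_bits_64!` with an offset of 0. For example,
/// `extract_bits_64_without_offset!(0b0110u64, 2)` returns 2 (`0b10`).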
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

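// Size of the MMIO region reserved for the CPU manager device (12 bytes).
// Only two offsets within it are used by the `BusDevice` implementation
// below: CPU_SELECTION_OFFSET (0) and CPU_STATUS_OFFSET (4).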
pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("vCPU still pending removal")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),

    #[error("Maximum number of vCPUs exceeds host limit")]
    MaximumVcpusExceeded,
}
pub type Result<T> = result::Result<T, Error>;

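// Processor Local x2APIC structure of the MADT; one entry is appended per
// possible vCPU in `create_madt` below.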
#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct LocalX2Apic {
    pub r#type: u8,
    pub length: u8,
    pub _reserved: u16,
    pub apic_id: u32,
    pub flags: u32,
    pub processor_id: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default, AsBytes)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct ProcessorHierarchyNode {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub flags: u32,
    pub parent: u32,
    pub acpi_processor_id: u32,
    pub num_private_resources: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default, AsBytes)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

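// Rounds `$n` up to a multiple of `$d`, e.g. `round_up!(9, 8)` evaluates to
// 16. Note the unusual `$d + 1` divisor: for larger inputs the result can
// fall below `$n` (e.g. `round_up!(17, 8)` is also 16), so this only matches
// the conventional `(($n + $d - 1) / $d) * $d` formula for small values.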
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
macro_rules! round_up {
    ($n:expr,$d:expr) => {
        (($n / ($d + 1)) + 1) * $d
    };
}

/// A wrapper around creating and using a hypervisor-backed vCPU.
pub struct Vcpu {
    // The hypervisor abstracted CPU.
    vcpu: Arc<dyn hypervisor::Vcpu>,
    id: u8,
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    saved_state: Option<CpuState>,
    #[cfg(target_arch = "x86_64")]
    vendor: CpuVendor,
}

impl Vcpu {
    /// Constructs a new vCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `vm` - The virtual machine this vCPU will get attached to.
    /// * `vm_ops` - Optional object for exit handling.
    /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
    pub fn new(
        id: u8,
        vm: &Arc<dyn hypervisor::Vm>,
        vm_ops: Option<Arc<dyn VmOps>>,
        #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
    ) -> Result<Self> {
        let vcpu = vm
            .create_vcpu(id, vm_ops)
            .map_err(|e| Error::VcpuCreate(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
            #[cfg(target_arch = "x86_64")]
            vendor: cpu_vendor,
        })
    }

    /// Configures a vCPU; should be called once per vCPU after creation.
    ///
    /// # Arguments
    ///
    /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
    /// * `guest_memory` - Guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
        #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
        #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
                .map_err(Error::VcpuConfiguration)?;
        }
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(
            &self.vcpu,
            self.id,
            boot_setup,
            cpuid,
            kvm_hyperv,
            self.vendor,
            topology,
        )
        .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64-specific vCPU for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();

        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;
        // We already checked that the capability is supported.
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
        if vm
            .as_any()
            .downcast_ref::<hypervisor::kvm::KvmVm>()
            .unwrap()
            .check_extension(Cap::ArmPmuV3)
        {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
        }
        // Non-boot cpus are powered off initially.
        if self.id > 0 {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
        }
        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
    }

    /// Runs the vCPU until it exits, returning the reason.
    ///
    /// Note that the state of the vCPU and associated VM must be set up first for this to do
    /// anything useful.
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }
}

impl Pausable for Vcpu {}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        self.id.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let saved_state = self
            .vcpu
            .state()
            .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;

        self.saved_state = Some(saved_state.clone());

        Ok(Snapshot::from_data(SnapshotData::new_from_state(
            &saved_state,
        )?))
    }
}

pub struct CpuManager {
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg(target_arch = "x86_64")]
    cpuid: Vec<CpuIdEntry>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    #[cfg(feature = "guest_debug")]
    vm_debug_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vm_ops: Arc<dyn VmOps>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: Option<GuestAddress>,
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
    affinity: BTreeMap<u8, Vec<u8>>,
    dynamic: bool,
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
}

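// Bit positions within the CPU status register (offset CPU_STATUS_OFFSET) of
// the CPU manager device. Reads report the enable/inserting/removing state
// of the vCPU selected via CPU_SELECTION_OFFSET; writes acknowledge an
// insertion or removal, or request an ejection (see the `BusDevice`
// implementation below).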
const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;

impl BusDevice for CpuManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
        data.fill(0);

        match offset {
            CPU_SELECTION_OFFSET => {
                data[0] = self.selected_cpu;
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &self.vcpu_states[usize::from(self.selected_cpu)];
                    if state.active() {
                        data[0] |= 1 << CPU_ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << CPU_INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << CPU_REMOVING_FLAG;
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            CPU_SELECTION_OFFSET => {
                self.selected_cpu = data[0];
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
                        && state.inserting
                    {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
                        && state.removing
                    {
                        state.removing = false;
                    }
                    // Trigger removal of vCPU
                    if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
                        if let Err(e) = self.remove_vcpu(self.selected_cpu) {
                            error!("Error removing vCPU: {:?}", e);
                        }
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
        None
    }
}

#[derive(Default)]
struct VcpuState {
    inserting: bool,
    removing: bool,
    pending_removal: Arc<AtomicBool>,
    handle: Option<thread::JoinHandle<()>>,
    kill: Arc<AtomicBool>,
    vcpu_run_interrupted: Arc<AtomicBool>,
    paused: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                // SAFETY: FFI call with correct arguments
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
            return Err(Error::MaximumVcpusExceeded);
        }

        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
        let hypervisor_type = hypervisor.hypervisor_type();
        #[cfg(target_arch = "x86_64")]
        let cpu_vendor = hypervisor.get_cpu_vendor();

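        // Permission for the guest to use AMX tile data must be requested
        // explicitly via arch_prctl(2): first ARCH_REQ_XCOMP_GUEST_PERM to
        // ask for XTILEDATA, then ARCH_GET_XCOMP_GUEST_PERM to read the
        // mask back and confirm the permission was actually granted.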
        #[cfg(target_arch = "x86_64")]
        if config.features.amx {
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // SAFETY: the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                let mask: usize = 0;
                // SAFETY: the mask being modified (not marked mutable as it is
                // modified in unsafe only which is permitted) isn't in use elsewhere.
                let result = unsafe {
                    libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
                };
                if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
                    return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
                }
            }
        }

        let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
            let mut cpu_list = Vec::new();
            for (proximity_domain, numa_node) in numa_nodes.iter() {
                for cpu in numa_node.cpus.iter() {
                    cpu_list.push((*cpu, *proximity_domain))
                }
            }
            cpu_list
        }
        .into_iter()
        .collect();

        let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
            cpu_affinity
                .iter()
                .map(|a| (a.vcpu, a.host_cpus.clone()))
                .collect()
        } else {
            BTreeMap::new()
        };

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        Ok(Arc::new(Mutex::new(CpuManager {
            config: config.clone(),
            interrupt_controller: None,
            #[cfg(target_arch = "x86_64")]
            cpuid: Vec::new(),
            vm,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
            seccomp_action,
            vm_ops,
            acpi_address: None,
            proximity_domain_per_cpu,
            affinity,
            dynamic,
            hypervisor: hypervisor.clone(),
        })))
    }

    #[cfg(target_arch = "x86_64")]
    pub fn populate_cpuid(
        &mut self,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx: bool,
    ) -> Result<()> {
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());

        self.cpuid = {
            let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
            arch::generate_common_cpuid(
                hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections,
                    phys_bits,
                    kvm_hyperv: self.config.kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx,
                    amx: self.config.features.amx,
                },
            )
            .map_err(Error::CommonCpuId)?
        };

        Ok(())
    }

    fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        let mut vcpu = Vcpu::new(
            cpu_id,
            &self.vm,
            Some(self.vm_ops.clone()),
            #[cfg(target_arch = "x86_64")]
            self.hypervisor.get_cpu_vendor(),
        )?;

        if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after creation.
            #[cfg(target_arch = "aarch64")]
            vcpu.init(&self.vm)?;

            let state: CpuState = snapshot.to_state().map_err(|e| {
                Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
            })?;
            vcpu.vcpu
                .set_state(&state)
                .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;

            vcpu.saved_state = Some(state);
        }

        let vcpu = Arc::new(Mutex::new(vcpu));

        // Adding vCPU to the CpuManager's vCPU list.
        self.vcpus.push(vcpu.clone());

        Ok(vcpu)
    }

    pub fn configure_vcpu(
        &self,
        vcpu: Arc<Mutex<Vcpu>>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
    ) -> Result<()> {
        let mut vcpu = vcpu.lock().unwrap();

        #[cfg(target_arch = "x86_64")]
        assert!(!self.cpuid.is_empty());

        #[cfg(target_arch = "x86_64")]
        let topology = self.config.topology.clone().map_or_else(
            || {
                #[cfg(feature = "mshv")]
                if matches!(self.hypervisor.hypervisor_type(), HypervisorType::Mshv) {
                    return Some((1, self.boot_vcpus(), 1));
                }
                None
            },
            |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
        );
        #[cfg(target_arch = "x86_64")]
        vcpu.configure(
            boot_setup,
            self.cpuid.clone(),
            self.config.kvm_hyperv,
            topology,
        )?;

        #[cfg(target_arch = "aarch64")]
        vcpu.configure(&self.vm, boot_setup)?;

        Ok(())
    }

    /// Only create new vCPUs if there aren't any inactive ones to reuse
    fn create_vcpus(
        &mut self,
        desired_vcpus: u8,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.config.max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        for cpu_id in self.vcpus.len() as u8..desired_vcpus {
            vcpus.push(self.create_vcpu(
                cpu_id,
                // TODO: The special format of the CPU id can be removed once
                // ready to break live upgrade.
                snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
            )?);
        }

        Ok(vcpus)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn init_pmu(&self, irq: u32) -> Result<bool> {
        for cpu in self.vcpus.iter() {
            let cpu = cpu.lock().unwrap();
            // Check if the PMU attribute is available; if not, log it and
            // report that PMU init was skipped.
            if cpu.vcpu.has_pmu_support() {
                cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
            } else {
                debug!(
                    "PMU attribute is not supported in vCPU{}, skip PMU init!",
                    cpu.id
                );
                return Ok(false);
            }
        }

        Ok(true)
    }

    pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
        self.vcpus.clone()
    }

    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_id: u8,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        #[cfg(feature = "kvm")]
        let hypervisor_type = self.hypervisor.hypervisor_type();
        #[cfg(feature = "guest_debug")]
        let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
        let panic_exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
            .vcpu_run_interrupted
            .clone();
        let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
        let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();

        // Prepare the CPU set the current vCPU is expected to run on.
        let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
            // SAFETY: all zeros is a valid pattern
            let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
            // SAFETY: FFI call, trivially safe
            unsafe { libc::CPU_ZERO(&mut cpuset) };
            for host_cpu in host_cpus {
                // SAFETY: FFI call, trivially safe
                unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) };
            }
            cpuset
        });

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter = get_seccomp_filter(
            &self.seccomp_action,
            Thread::Vcpu,
            self.hypervisor.hypervisor_type(),
        )
        .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        info!("Starting vCPU: cpu_id = {}", vcpu_id);

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{vcpu_id}"))
                .spawn(move || {
                    // Schedule the thread to run on the expected CPU set
                    if let Some(cpuset) = cpuset.as_ref() {
                        // SAFETY: FFI call with correct arguments
                        let ret = unsafe {
                            libc::sched_setaffinity(
                                0,
                                std::mem::size_of::<libc::cpu_set_t>(),
                                cpuset as *const libc::cpu_set_t,
                            )
                        };

                        if ret != 0 {
                            error!(
                                "Failed scheduling the vCPU {} on the expected CPU set: {}",
                                vcpu_id,
                                io::Error::last_os_error()
                            );
                            return;
                        }
                    }

                    // Apply seccomp filter for vcpu thread.
                    if !vcpu_seccomp_filter.is_empty() {
                        if let Err(e) =
                            apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                        {
                            error!("Error applying seccomp filter: {:?}", e);
                            return;
                        }
                    }
                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // This uses an async-signal-safe handler to kill the vcpu handles.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");
                    // Block until all CPUs are ready.
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unparking the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                // As a pause can be caused by PIO & MMIO exits, we need to ensure they are
                                // completed by returning to KVM_RUN. From the kernel docs:
                                //
                                // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                                // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                                // operations are complete (and guest state is consistent) only after userspace
                                // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
                                // incomplete operations and then check for pending signals.
                                // The pending state of the operation is not preserved in state which is
                                // visible to userspace, thus userspace should ensure that the operation is
                                // completed before performing a live migration.  Userspace can re-enter the
                                // guest with an unmasked signal pending or with the immediate_exit field set
                                // to complete pending operations without allowing any further instructions
                                // to be executed.

                                #[cfg(feature = "kvm")]
                                if matches!(hypervisor_type, HypervisorType::Kvm) {
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                    if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                        error!("Unexpected VM exit on \"immediate_exit\" run");
                                        break;
                                    }
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                                }

                                vcpu_run_interrupted.store(true, Ordering::SeqCst);

                                vcpu_paused.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            #[cfg(feature = "tdx")]
                            let mut vcpu = vcpu.lock().unwrap();
                            #[cfg(not(feature = "tdx"))]
                            let vcpu = vcpu.lock().unwrap();
                            // vcpu.run() exits with VmExit::Reset on a triple-fault, which triggers a VM reset
                            match vcpu.run() {
                                Ok(run) => match run {
                                    #[cfg(feature = "kvm")]
                                    VmExit::Debug => {
                                        info!("VmExit::Debug");
                                        #[cfg(feature = "guest_debug")]
                                        {
                                            vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                            let raw_tid = get_raw_tid(vcpu_id as usize);
                                            vm_debug_evt.write(raw_tid as u64).unwrap();
                                        }
                                    }
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
                                        {
                                            interrupt_controller
                                                .lock()
                                                .unwrap()
                                                .end_of_interrupt(vector);
                                        }
                                    }
                                    VmExit::Ignore => {}
                                    VmExit::Hyperv => {}
                                    VmExit::Reset => {
                                        info!("VmExit::Reset");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        reset_evt.write(1).unwrap();
                                        break;
                                    }
                                    VmExit::Shutdown => {
                                        info!("VmExit::Shutdown");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                    #[cfg(feature = "tdx")]
                                    VmExit::Tdx => {
                                        if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
                                            match vcpu.get_tdx_exit_details() {
                                                Ok(details) => match details {
                                                    TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
                                                    TdxExitDetails::SetupEventNotifyInterrupt => {
                                                        warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
                                                    }
                                                },
                                                Err(e) => error!("Unexpected TDX VMCALL: {}", e),
                                            }
                                            vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
                                        } else {
                                            // We should never reach this code as
                                            // this means the design from the code
                                            // is wrong.
                                            unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
                                        }
                                    }
                                    _ => {
                                        error!(
                                            "VCPU generated error: {:?}",
                                            Error::UnexpectedVmExit
                                        );
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                },

                                Err(e) => {
                                    error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                    vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                    exit_evt.write(1).unwrap();
                                    break;
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }
                        }
                    })
                    .or_else(|_| {
                        panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
                        error!("vCPU thread panicked");
                        panic_exit_evt.write(1)
                    })
                    .ok();
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // On hot plug, calls into this function have entry_point set to None.
        // It is for those hotplug CPU additions that we need to set the
        // inserting flag.
        self.vcpu_states[usize::from(vcpu_id)].handle = handle;
        self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
    fn activate_vcpus(
        &mut self,
        desired_vcpus: u8,
        inserting: bool,
        paused: Option<bool>,
    ) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

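        // The barrier is sized for every vCPU thread about to be started
        // plus the current thread, which waits on it below so that all the
        // newly spawned threads are released at once.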
        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        if let Some(paused) = paused {
            self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
        }

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus(),
            self.vcpus_pause_signalled.load(Ordering::SeqCst)
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for vcpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal; actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
            self.vcpu_states[usize::from(cpu_id)]
                .pending_removal
                .store(true, Ordering::SeqCst);
        }
    }

    pub fn check_pending_removed_vcpu(&mut self) -> bool {
        for state in self.vcpu_states.iter() {
            if state.active() && state.pending_removal.load(Ordering::SeqCst) {
                return true;
            }
        }
        false
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" flag so that the state can be reused
        state.kill.store(false, Ordering::SeqCst);
        state.pending_removal.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(
        &mut self,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), snapshot)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
            .map_err(|e| {
                Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
            })?;

        Ok(())
    }

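    /// Resize the number of present vCPUs. Returns `Ok(true)` if vCPUs were
    /// started or marked for removal, and `Ok(false)` if the count already
    /// matches or the VM does not support dynamic vCPU changes (e.g. TDX).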
    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        if self.check_pending_removed_vcpu() {
            return Err(Error::VcpuPendingRemovedVcpu);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                let vcpus = self.create_vcpus(desired_vcpus, None)?;
                for vcpu in vcpus {
                    self.configure_vcpu(vcpu, None)?
                }
                self.activate_vcpus(desired_vcpus, true, None)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..) {
            state.join_thread()?;
        }

        Ok(())
    }

    #[cfg(feature = "tdx")]
    pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
        for vcpu in &self.vcpus {
            vcpu.lock()
                .unwrap()
                .vcpu
                .tdx_init(hob_address)
                .map_err(Error::InitializeTdx)?;
        }
        Ok(())
    }

    pub fn boot_vcpus(&self) -> u8 {
        self.config.boot_vcpus
    }

    pub fn max_vcpus(&self) -> u8 {
        self.config.max_vcpus
    }

    #[cfg(target_arch = "x86_64")]
    pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
        assert!(!self.cpuid.is_empty());
        self.cpuid.clone()
    }

    fn present_vcpus(&self) -> u8 {
        self.vcpu_states
            .iter()
            .fold(0, |acc, state| acc + state.active() as u8)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidrs(&self) -> Vec<u64> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_mpidr())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_states(&self) -> Vec<CpuState> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
        self.config
            .topology
            .clone()
            .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
    }

    pub fn create_madt(&self) -> Sdt {
        use crate::acpi;
        // This is also checked in the command-line parsing.
        assert!(self.config.boot_vcpus <= self.config.max_vcpus);

        let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
        #[cfg(target_arch = "x86_64")]
        {
            madt.write(36, arch::layout::APIC_START.0);

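            // One x2APIC entry per possible vCPU: boot vCPUs are flagged as
            // enabled, and every entry is marked online-capable so that the
            // remaining ones can be hot-plugged later.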
1355             for cpu in 0..self.config.max_vcpus {
1356                 let lapic = LocalX2Apic {
1357                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1358                     length: 16,
1359                     processor_id: cpu.into(),
1360                     apic_id: cpu.into(),
1361                     flags: if cpu < self.config.boot_vcpus {
1362                         1 << MADT_CPU_ENABLE_FLAG
1363                     } else {
1364                         0
1365                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1366                     _reserved: 0,
1367                 };
1368                 madt.append(lapic);
1369             }
1370 
1371             madt.append(Ioapic {
1372                 r#type: acpi::ACPI_APIC_IO,
1373                 length: 12,
1374                 ioapic_id: 0,
1375                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1376                 gsi_base: 0,
1377                 ..Default::default()
1378             });
1379 
1380             madt.append(InterruptSourceOverride {
1381                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1382                 length: 10,
1383                 bus: 0,
1384                 source: 4,
1385                 gsi: 4,
1386                 flags: 0,
1387             });
1388         }
1389 
1390         #[cfg(target_arch = "aarch64")]
1391         {
1392             /* Notes:
1393              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1394              */
1395 
1396             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1397             for cpu in 0..self.config.boot_vcpus {
1398                 let vcpu = &self.vcpus[cpu as usize];
1399                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1400                 /* ARMv8 MPIDR format:
1401                      Bits [63:40] Must be zero
1402                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1403                      Bits [31:24] Must be zero
1404                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1405                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1406                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1407                 */
1408                 let mpidr_mask = 0xff_00ff_ffff;
1409                 let gicc = GicC {
1410                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1411                     length: 80,
1412                     reserved0: 0,
1413                     cpu_interface_number: cpu as u32,
1414                     uid: cpu as u32,
1415                     flags: 1,
1416                     parking_version: 0,
1417                     performance_interrupt: 0,
1418                     parked_address: 0,
1419                     base_address: 0,
1420                     gicv_base_address: 0,
1421                     gich_base_address: 0,
1422                     vgic_interrupt: 0,
1423                     gicr_base_address: 0,
1424                     mpidr: mpidr & mpidr_mask,
1425                     proc_power_effi_class: 0,
1426                     reserved1: 0,
1427                     spe_overflow_interrupt: 0,
1428                 };
1429 
1430                 madt.append(gicc);
1431             }
1432             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1433 
1434             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1435             let gicd = GicD {
1436                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1437                 length: 24,
1438                 reserved0: 0,
1439                 gic_id: 0,
1440                 base_address: vgic_config.dist_addr,
1441                 global_irq_base: 0,
1442                 version: 3,
1443                 reserved1: [0; 3],
1444             };
1445             madt.append(gicd);
1446 
1447             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1448             let gicr = GicR {
1449                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1450                 length: 16,
1451                 reserved: 0,
1452                 base_address: vgic_config.redists_addr,
1453                 range_length: vgic_config.redists_size as u32,
1454             };
1455             madt.append(gicr);
1456 
1457             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1458             let gicits = GicIts {
1459                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1460                 length: 20,
1461                 reserved0: 0,
1462                 translation_id: 0,
1463                 base_address: vgic_config.msi_addr,
1464                 reserved1: 0,
1465             };
1466             madt.append(gicits);
1467 
1468             madt.update_checksum();
1469         }
1470 
1471         madt
1472     }
1473 
1474     #[cfg(target_arch = "aarch64")]
1475     pub fn create_pptt(&self) -> Sdt {
1476         let pptt_start = 0;
1477         let mut cpus = 0;
1478         let mut uid = 0;
1479         // If no topology is specified, the default is a single package with
1480         // max_vcpus cores and one thread per core, matching what a guest
1481         // assumes when no PPTT is provided.
1482         let (threads_per_core, cores_per_package, packages) =
1483             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1484 
1485         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1486 
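        // PPTT Processor Hierarchy Node flags: bit 1 = ACPI Processor ID valid,
        // bit 2 = processor is a thread, bit 3 = node is a leaf. Hence 0x2 for
        // the cluster and core containers, 0xE for thread leaves and 0xA for
        // core leaves below.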
1487         for cluster_idx in 0..packages {
1488             if cpus < self.config.boot_vcpus as usize {
1489                 let cluster_offset = pptt.len() - pptt_start;
1490                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1491                     r#type: 0,
1492                     length: 20,
1493                     reserved: 0,
1494                     flags: 0x2,
1495                     parent: 0,
1496                     acpi_processor_id: cluster_idx as u32,
1497                     num_private_resources: 0,
1498                 };
1499                 pptt.append(cluster_hierarchy_node);
1500 
1501                 for core_idx in 0..cores_per_package {
1502                     let core_offset = pptt.len() - pptt_start;
1503 
1504                     if threads_per_core > 1 {
1505                         let core_hierarchy_node = ProcessorHierarchyNode {
1506                             r#type: 0,
1507                             length: 20,
1508                             reserved: 0,
1509                             flags: 0x2,
1510                             parent: cluster_offset as u32,
1511                             acpi_processor_id: core_idx as u32,
1512                             num_private_resources: 0,
1513                         };
1514                         pptt.append(core_hierarchy_node);
1515 
1516                         for _thread_idx in 0..threads_per_core {
1517                             let thread_hierarchy_node = ProcessorHierarchyNode {
1518                                 r#type: 0,
1519                                 length: 20,
1520                                 reserved: 0,
1521                                 flags: 0xE,
1522                                 parent: core_offset as u32,
1523                                 acpi_processor_id: uid as u32,
1524                                 num_private_resources: 0,
1525                             };
1526                             pptt.append(thread_hierarchy_node);
1527                             uid += 1;
1528                         }
1529                     } else {
1530                         let thread_hierarchy_node = ProcessorHierarchyNode {
1531                             r#type: 0,
1532                             length: 20,
1533                             reserved: 0,
1534                             flags: 0xA,
1535                             parent: cluster_offset as u32,
1536                             acpi_processor_id: uid as u32,
1537                             num_private_resources: 0,
1538                         };
1539                         pptt.append(thread_hierarchy_node);
1540                         uid += 1;
1541                     }
1542                 }
1543                 cpus += (cores_per_package * threads_per_core) as usize;
1544             }
1545         }
1546 
1547         pptt.update_checksum();
1548         pptt
1549     }
1550 
1551     #[cfg(feature = "guest_debug")]
1552     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1553         self.vcpus[usize::from(cpu_id)]
1554             .lock()
1555             .unwrap()
1556             .vcpu
1557             .get_regs()
1558             .map_err(Error::CpuDebug)
1559     }
1560 
1561     #[cfg(feature = "guest_debug")]
1562     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1563         self.vcpus[usize::from(cpu_id)]
1564             .lock()
1565             .unwrap()
1566             .vcpu
1567             .set_regs(regs)
1568             .map_err(Error::CpuDebug)
1569     }
1570 
1571     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1572     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1573         self.vcpus[usize::from(cpu_id)]
1574             .lock()
1575             .unwrap()
1576             .vcpu
1577             .get_sregs()
1578             .map_err(Error::CpuDebug)
1579     }
1580 
1581     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1582     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1583         self.vcpus[usize::from(cpu_id)]
1584             .lock()
1585             .unwrap()
1586             .vcpu
1587             .set_sregs(sregs)
1588             .map_err(Error::CpuDebug)
1589     }
1590 
1591     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1592     fn translate_gva(
1593         &self,
1594         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1595         cpu_id: u8,
1596         gva: u64,
1597     ) -> Result<u64> {
1598         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1599             .lock()
1600             .unwrap()
1601             .vcpu
1602             .translate_gva(gva, /* flags: unused */ 0)
1603             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1604         Ok(gpa)
1605     }
1606 
1607     ///
1608     /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
1609     /// it in the VMM by walking through the translation tables.
1610     ///
1611     /// Address translation is a big topic; here we only focus on the scenario
1612     /// that arises in the VMM while debugging the kernel. This `translate_gva`
1613     /// implementation is restricted to:
1614     /// - Exception Level 1
1615     /// - Translating the high address range only (kernel space)
1616     ///
1617     /// This implementation supports the following Armv8-A features related to
1618     /// address translation:
1619     /// - FEAT_LPA
1620     /// - FEAT_LVA
1621     /// - FEAT_LPA2
1622     ///
1623     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1624     fn translate_gva(
1625         &self,
1626         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1627         cpu_id: u8,
1628         gva: u64,
1629     ) -> Result<u64> {
1630         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1631             .lock()
1632             .unwrap()
1633             .vcpu
1634             .get_sys_reg(regs::TCR_EL1)
1635             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1636         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1637             .lock()
1638             .unwrap()
1639             .vcpu
1640             .get_sys_reg(regs::TTBR1_EL1)
1641             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1642         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1643             .lock()
1644             .unwrap()
1645             .vcpu
1646             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1647             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1648 
1649         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1650         // or low (0x000xxx...).
1651         let high_range = extract_bits_64!(gva, 55, 1);
1652         if high_range == 0 {
1653             info!("VA (0x{:x}) range is not supported!", gva);
1654             return Ok(gva);
1655         }
1656 
1657         // High range size offset
1658         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1659         // Granule size
1660         let tg = extract_bits_64!(tcr_el1, 30, 2);
1661         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1662         let ds = extract_bits_64!(tcr_el1, 59, 1);
1663 
1664         if tsz == 0 {
1665             info!("VA translation is not ready!");
1666             return Ok(gva);
1667         }
1668 
1669         // VA size is determined by TCR_EL1.T1SZ
1670         let va_size = 64 - tsz;
1671         // Number of bits in VA consumed in each level of translation
1672         let stride = match tg {
1673             3 => 13, // 64KB granule size
1674             1 => 11, // 16KB granule size
1675             _ => 9,  // 4KB, default
1676         };
1677         // Starting level of walking
1678         let mut level = 4 - (va_size - 4) / stride;
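        // e.g. with a 4KB granule (stride = 9) and T1SZ = 16 (va_size = 48):
        // level = 4 - (48 - 4) / 9 = 0, i.e. a full four-level walk.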
1679 
1680         // PA or IPA size is determined
1681         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1682         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1683         // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should match.
1684         // To be safe, we use the minimum value if they are different.
1685         let pa_range = std::cmp::min(tcr_ips, pa_range);
1686         // PA size in bits
1687         let pa_size = match pa_range {
1688             0 => 32,
1689             1 => 36,
1690             2 => 40,
1691             3 => 42,
1692             4 => 44,
1693             5 => 48,
1694             6 => 52,
1695             _ => {
1696                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1697                     "PA range not supported {pa_range}"
1698                 ))))
1699             }
1700         };
1701 
1702         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
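        // indexmask_grainsize keeps the low (stride + 3) bits: one table of
        // 2^stride descriptors of 8 bytes each (4 KiB for a 4KB granule).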
1703         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1704         // If FEAT_LPA2 is present, the translation table descriptor holds
1705         // 50 bits of the table address of the next level.
1706         // Otherwise, it is 48 bits.
1707         let descaddrmask = if ds == 1 {
1708             !0u64 >> (64 - 50) // mask with 50 least significant bits
1709         } else {
1710             !0u64 >> (64 - 48) // mask with 48 least significant bits
1711         };
1712         let descaddrmask = descaddrmask & !indexmask_grainsize;
1713 
1714         // Translation table base address
1715         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1716         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1717         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1718         if pa_size == 52 {
1719             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1720         }
1721 
1722         // Loop through tables of each level
1723         loop {
1724             // Table offset for current level
1725             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1726             descaddr |= table_offset;
1727             descaddr &= !7u64;
1728 
1729             let mut buf = [0; 8];
1730             guest_memory
1731                 .memory()
1732                 .read(&mut buf, GuestAddress(descaddr))
1733                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1734             let descriptor = u64::from_le_bytes(buf);
1735 
1736             descaddr = descriptor & descaddrmask;
1737             // In the case of FEAT_LPA, the next-level translation table address
1738             // bits [48:51] come from bits [12:15] of the current descriptor.
1739             // For FEAT_LPA2, the next-level translation table address
1740             // bits [50:51] come from bits [8:9] of the current descriptor, and
1741             // bits [48:49] come from bits [48:49] of the descriptor handled
1742             // previously.
1743             if pa_size == 52 {
1744                 if ds == 1 {
1745                     // FEAT_LPA2
1746                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1747                 } else {
1748                     // FEAT_LPA
1749                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1750                 }
1751             }
1752 
1753             if (descriptor & 2) != 0 && (level < 3) {
1754                 // This is a table entry. Go down to next level.
1755                 level += 1;
1756                 indexmask = indexmask_grainsize;
1757                 continue;
1758             }
1759 
1760             break;
1761         }
1762 
1763         // We have reached either:
1764         // - a page entry at level 3 or
1765         // - a block entry at level 1 or 2
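        // e.g. with a 4KB granule (stride = 9): a level-3 entry maps
        // 1 << (9 + 3) = 4 KiB, while a level-2 block maps 1 << (18 + 3) = 2 MiB.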
1766         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1767         descaddr &= !(page_size - 1);
1768         descaddr |= gva & (page_size - 1);
1769 
1770         Ok(descaddr)
1771     }
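    // A hypothetical usage sketch (see read_mem()/write_mem() below for the
    // real callers): resolving a kernel VA on vCPU 0 before reading guest memory:
    //
    //     let gpa = cpu_manager.translate_gva(&guest_memory, 0, 0xffff_8000_0010_0000)?;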
1772 
1773     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1774         self.acpi_address = Some(acpi_address);
1775     }
1776 
1777     pub(crate) fn set_interrupt_controller(
1778         &mut self,
1779         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1780     ) {
1781         self.interrupt_controller = Some(interrupt_controller);
1782     }
1783 
1784     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1785         &self.vcpus_kill_signalled
1786     }
1787 }
1788 
1789 struct Cpu {
1790     cpu_id: u8,
1791     proximity_domain: u32,
1792     dynamic: bool,
1793 }
1794 
1795 #[cfg(target_arch = "x86_64")]
1796 const MADT_CPU_ENABLE_FLAG: usize = 0;
1797 
1798 #[cfg(target_arch = "x86_64")]
1799 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1800 
1801 impl Cpu {
1802     #[cfg(target_arch = "x86_64")]
1803     fn generate_mat(&self) -> Vec<u8> {
1804         let lapic = LocalX2Apic {
1805             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1806             length: 16,
1807             processor_id: self.cpu_id.into(),
1808             apic_id: self.cpu_id.into(),
1809             flags: 1 << MADT_CPU_ENABLE_FLAG,
1810             _reserved: 0,
1811         };
1812 
1813         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1814         // SAFETY: mat_data is large enough to hold lapic
1815         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
1816 
1817         mat_data
1818     }
1819 }
1820 
1821 impl Aml for Cpu {
1822     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1823         #[cfg(target_arch = "x86_64")]
1824         let mat_data: Vec<u8> = self.generate_mat();
1825         #[allow(clippy::if_same_then_else)]
1826         if self.dynamic {
1827             aml::Device::new(
1828                 format!("C{:03X}", self.cpu_id).as_str().into(),
1829                 vec![
1830                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1831                     &aml::Name::new("_UID".into(), &self.cpu_id),
1832                     // Currently, AArch64 cannot support the following fields.
1833                     /*
1834                     _STA return value:
1835                     Bit [0] – Set if the device is present.
1836                     Bit [1] – Set if the device is enabled and decoding its resources.
1837                     Bit [2] – Set if the device should be shown in the UI.
1838                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1839                     Bit [4] – Set if the battery is present.
1840                     Bits [31:5] – Reserved (must be cleared).
1841                     */
1842                     #[cfg(target_arch = "x86_64")]
1843                     &aml::Method::new(
1844                         "_STA".into(),
1845                         0,
1846                         false,
1847                         // Call into CSTA method which will interrogate device
1848                         vec![&aml::Return::new(&aml::MethodCall::new(
1849                             "CSTA".into(),
1850                             vec![&self.cpu_id],
1851                         ))],
1852                     ),
1853                     &aml::Method::new(
1854                         "_PXM".into(),
1855                         0,
1856                         false,
1857                         vec![&aml::Return::new(&self.proximity_domain)],
1858                     ),
1859                     // The Linux kernel expects every CPU device to have a _MAT entry
1860                     // containing the LAPIC for this processor with the enabled bit set
1861                     // even if it is disabled in the MADT (non-boot CPU)
1862                     #[cfg(target_arch = "x86_64")]
1863                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1864                     // Trigger CPU ejection
1865                     #[cfg(target_arch = "x86_64")]
1866                     &aml::Method::new(
1867                         "_EJ0".into(),
1868                         1,
1869                         false,
1870                         // Call into CEJ0 method which will actually eject device
1871                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1872                     ),
1873                 ],
1874             )
1875             .to_aml_bytes(sink);
1876         } else {
1877             aml::Device::new(
1878                 format!("C{:03X}", self.cpu_id).as_str().into(),
1879                 vec![
1880                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1881                     &aml::Name::new("_UID".into(), &self.cpu_id),
1882                     #[cfg(target_arch = "x86_64")]
1883                     &aml::Method::new(
1884                         "_STA".into(),
1885                         0,
1886                         false,
1887                         // Mark the CPU as present; see the CSTA implementation
1888                         vec![&aml::Return::new(&0xfu8)],
1889                     ),
1890                     &aml::Method::new(
1891                         "_PXM".into(),
1892                         0,
1893                         false,
1894                         vec![&aml::Return::new(&self.proximity_domain)],
1895                     ),
1896                     // The Linux kernel expects every CPU device to have a _MAT entry
1897                     // containing the LAPIC for this processor with the enabled bit set
1898                     // even if it is disabled in the MADT (non-boot CPU)
1899                     #[cfg(target_arch = "x86_64")]
1900                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1901                 ],
1902             )
1903             .to_aml_bytes(sink);
1904         }
1905     }
1906 }
1907 
1908 struct CpuNotify {
1909     cpu_id: u8,
1910 }
1911 
1912 impl Aml for CpuNotify {
1913     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1914         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
1915         aml::If::new(
1916             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
1917             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1918         )
1919         .to_aml_bytes(sink)
1920     }
1921 }
1922 
1923 struct CpuMethods {
1924     max_vcpus: u8,
1925     dynamic: bool,
1926 }
1927 
1928 impl Aml for CpuMethods {
1929     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1930         if self.dynamic {
1931             // CPU status method
1932             aml::Method::new(
1933                 "CSTA".into(),
1934                 1,
1935                 true,
1936                 vec![
1937                     // Take lock defined above
1938                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1939                     // Write CPU number (in first argument) to I/O port via field
1940                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1941                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1942                     // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for the meaning)
1943                     &aml::If::new(
1944                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
1945                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
1946                     ),
1947                     // Release lock
1948                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1949                     // Return 0 or 0xf
1950                     &aml::Return::new(&aml::Local(0)),
1951                 ],
1952             )
1953             .to_aml_bytes(sink);
1954 
1955             let mut cpu_notifies = Vec::new();
1956             for cpu_id in 0..self.max_vcpus {
1957                 cpu_notifies.push(CpuNotify { cpu_id });
1958             }
1959 
1960             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
1961             for cpu_id in 0..self.max_vcpus {
1962                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
1963             }
1964 
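            // CTFY(Arg0 = CPU index, Arg1 = notification value): each per-CPU
            // block generated above compares Arg0 against its cpu_id and, on a
            // match, issues Notify on the corresponding Cxxx device.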
1965             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
1966 
1967             aml::Method::new(
1968                 "CEJ0".into(),
1969                 1,
1970                 true,
1971                 vec![
1972                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1973                     // Write CPU number (in first argument) to I/O port via field
1974                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1975                     // Set CEJ0 bit
1976                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
1977                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1978                 ],
1979             )
1980             .to_aml_bytes(sink);
1981 
1982             aml::Method::new(
1983                 "CSCN".into(),
1984                 0,
1985                 true,
1986                 vec![
1987                     // Take lock defined above
1988                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1989                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1990                     &aml::While::new(
1991                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
1992                         vec![
1993                             // Write CPU number (from the Local0 loop counter) to I/O port via field
1994                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
1995                             // Check if CINS bit is set
1996                             &aml::If::new(
1997                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
1998                                 // Notify device if it is
1999                                 vec![
2000                                     &aml::MethodCall::new(
2001                                         "CTFY".into(),
2002                                         vec![&aml::Local(0), &aml::ONE],
2003                                     ),
2004                                     // Reset CINS bit (writing one clears it)
2005                                     &aml::Store::new(
2006                                         &aml::Path::new("\\_SB_.PRES.CINS"),
2007                                         &aml::ONE,
2008                                     ),
2009                                 ],
2010                             ),
2011                             // Check if CRMV bit is set
2012                             &aml::If::new(
2013                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
2014                                 // Notify device if it is (with the eject constant 0x3)
2015                                 vec![
2016                                     &aml::MethodCall::new(
2017                                         "CTFY".into(),
2018                                         vec![&aml::Local(0), &3u8],
2019                                     ),
2020                                     // Reset CRMV bit (writing one clears it)
2021                                     &aml::Store::new(
2022                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
2023                                         &aml::ONE,
2024                                     ),
2025                                 ],
2026                             ),
2027                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2028                         ],
2029                     ),
2030                     // Release lock
2031                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2032                 ],
2033             )
2034             .to_aml_bytes(sink)
2035         } else {
2036             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2037         }
2038     }
2039 }
2040 
2041 impl Aml for CpuManager {
2042     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2043         #[cfg(target_arch = "x86_64")]
2044         if let Some(acpi_address) = self.acpi_address {
2045             // CPU hotplug controller
2046             aml::Device::new(
2047                 "_SB_.PRES".into(),
2048                 vec![
2049                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2050                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2051                     // Mutex to protect concurrent access as we write to choose CPU and then read back status
2052                     &aml::Mutex::new("CPLK".into(), 0),
2053                     &aml::Name::new(
2054                         "_CRS".into(),
2055                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2056                             aml::AddressSpaceCacheable::NotCacheable,
2057                             true,
2058                             acpi_address.0,
2059                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2060                             None,
2061                         )]),
2062                     ),
2063                     // OpRegion and Fields map MMIO range into individual field values
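                    // Layout derived from the field definitions below: CSEL
                    // (dword) at offset 0x0, the CPEN/CINS/CRMV/CEJ0 bits in the
                    // byte at 0x4, CCMD at 0x5 and CDAT (dword) at 0x8.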
2064                     &aml::OpRegion::new(
2065                         "PRST".into(),
2066                         aml::OpRegionSpace::SystemMemory,
2067                         &(acpi_address.0 as usize),
2068                         &CPU_MANAGER_ACPI_SIZE,
2069                     ),
2070                     &aml::Field::new(
2071                         "PRST".into(),
2072                         aml::FieldAccessType::Byte,
2073                         aml::FieldLockRule::NoLock,
2074                         aml::FieldUpdateRule::WriteAsZeroes,
2075                         vec![
2076                             aml::FieldEntry::Reserved(32),
2077                             aml::FieldEntry::Named(*b"CPEN", 1),
2078                             aml::FieldEntry::Named(*b"CINS", 1),
2079                             aml::FieldEntry::Named(*b"CRMV", 1),
2080                             aml::FieldEntry::Named(*b"CEJ0", 1),
2081                             aml::FieldEntry::Reserved(4),
2082                             aml::FieldEntry::Named(*b"CCMD", 8),
2083                         ],
2084                     ),
2085                     &aml::Field::new(
2086                         "PRST".into(),
2087                         aml::FieldAccessType::DWord,
2088                         aml::FieldLockRule::NoLock,
2089                         aml::FieldUpdateRule::Preserve,
2090                         vec![
2091                             aml::FieldEntry::Named(*b"CSEL", 32),
2092                             aml::FieldEntry::Reserved(32),
2093                             aml::FieldEntry::Named(*b"CDAT", 32),
2094                         ],
2095                     ),
2096                 ],
2097             )
2098             .to_aml_bytes(sink);
2099         }
2100 
2101         // CPU devices
2102         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2103         let cid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2104         // Bundle methods together under a common object
2105         let methods = CpuMethods {
2106             max_vcpus: self.config.max_vcpus,
2107             dynamic: self.dynamic,
2108         };
2109         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &cid, &methods];
2110 
2111         let mut cpu_devices = Vec::new();
2112         for cpu_id in 0..self.config.max_vcpus {
2113             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2114             let cpu_device = Cpu {
2115                 cpu_id,
2116                 proximity_domain,
2117                 dynamic: self.dynamic,
2118             };
2119 
2120             cpu_devices.push(cpu_device);
2121         }
2122 
2123         for cpu_device in cpu_devices.iter() {
2124             cpu_data_inner.push(cpu_device);
2125         }
2126 
2127         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2128     }
2129 }
2130 
2131 impl Pausable for CpuManager {
2132     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2133         // Tell the vCPUs to pause themselves next time they exit
2134         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2135 
2136         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2137         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2138         // above.
2139         for state in self.vcpu_states.iter() {
2140             state.signal_thread();
2141         }
2142 
2143         for vcpu in self.vcpus.iter() {
2144             let mut vcpu = vcpu.lock().unwrap();
2145             vcpu.pause()?;
2146             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2147             if !self.config.kvm_hyperv {
2148                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2149                     MigratableError::Pause(anyhow!(
2150                         "Could not notify guest it has been paused {:?}",
2151                         e
2152                     ))
2153                 })?;
2154             }
2155         }
2156 
2157         // The vCPU thread will change its paused state before parking; wait here
2158         // for each active vCPU to change its state, ensuring it has parked.
2159         for state in self.vcpu_states.iter() {
2160             if state.active() {
2161                 while !state.paused.load(Ordering::SeqCst) {
2162                     // To avoid a priority inversion with the vCPU thread
2163                     thread::sleep(std::time::Duration::from_millis(1));
2164                 }
2165             }
2166         }
2167 
2168         Ok(())
2169     }
2170 
2171     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2172         for vcpu in self.vcpus.iter() {
2173             vcpu.lock().unwrap().resume()?;
2174         }
2175 
2176         // Toggle the vCPUs pause boolean
2177         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2178 
2179         // Unpark all the vCPU threads.
2180         // Once unparked, the next thing they will do is check the pause
2181         // boolean. Since it will be set to false, they will exit the pause
2182         // loop and resume running the guest.
2183         for state in self.vcpu_states.iter() {
2184             state.paused.store(false, Ordering::SeqCst);
2185             state.unpark_thread();
2186         }
2187         Ok(())
2188     }
2189 }
2190 
2191 impl Snapshottable for CpuManager {
2192     fn id(&self) -> String {
2193         CPU_MANAGER_SNAPSHOT_ID.to_string()
2194     }
2195 
2196     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2197         let mut cpu_manager_snapshot = Snapshot::default();
2198 
2199         // The CpuManager snapshot is a collection of the snapshots of all vCPUs.
2200         for vcpu in &self.vcpus {
2201             let mut vcpu = vcpu.lock().unwrap();
2202             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2203         }
2204 
2205         Ok(cpu_manager_snapshot)
2206     }
2207 }
2208 
2209 impl Transportable for CpuManager {}
2210 impl Migratable for CpuManager {}
2211 
2212 #[cfg(feature = "guest_debug")]
2213 impl Debuggable for CpuManager {
2214     #[cfg(feature = "kvm")]
2215     fn set_guest_debug(
2216         &self,
2217         cpu_id: usize,
2218         addrs: &[GuestAddress],
2219         singlestep: bool,
2220     ) -> std::result::Result<(), DebuggableError> {
2221         self.vcpus[cpu_id]
2222             .lock()
2223             .unwrap()
2224             .vcpu
2225             .set_guest_debug(addrs, singlestep)
2226             .map_err(DebuggableError::SetDebug)
2227     }
2228 
2229     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2230         Ok(())
2231     }
2232 
2233     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2234         Ok(())
2235     }
2236 
2237     #[cfg(target_arch = "x86_64")]
2238     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2239         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2240         let gregs = self
2241             .get_regs(cpu_id as u8)
2242             .map_err(DebuggableError::ReadRegs)?;
2243         let regs = [
2244             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
2245             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
2246         ];
2247 
2248         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2249         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2250         let eflags = gregs.rflags as u32;
2251         let rip = gregs.rip;
2252 
2253         // Segment registers: CS, SS, DS, ES, FS, GS
2254         let sregs = self
2255             .get_sregs(cpu_id as u8)
2256             .map_err(DebuggableError::ReadRegs)?;
2257         let segments = X86SegmentRegs {
2258             cs: sregs.cs.selector as u32,
2259             ss: sregs.ss.selector as u32,
2260             ds: sregs.ds.selector as u32,
2261             es: sregs.es.selector as u32,
2262             fs: sregs.fs.selector as u32,
2263             gs: sregs.gs.selector as u32,
2264         };
2265 
2266         // TODO: Add other registers
2267 
2268         Ok(CoreRegs {
2269             regs,
2270             eflags,
2271             rip,
2272             segments,
2273             ..Default::default()
2274         })
2275     }
2276 
2277     #[cfg(target_arch = "aarch64")]
2278     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2279         let gregs = self
2280             .get_regs(cpu_id as u8)
2281             .map_err(DebuggableError::ReadRegs)?;
2282         Ok(CoreRegs {
2283             x: gregs.regs.regs,
2284             sp: gregs.regs.sp,
2285             pc: gregs.regs.pc,
2286             ..Default::default()
2287         })
2288     }
2289 
2290     #[cfg(target_arch = "x86_64")]
2291     fn write_regs(
2292         &self,
2293         cpu_id: usize,
2294         regs: &CoreRegs,
2295     ) -> std::result::Result<(), DebuggableError> {
2296         let orig_gregs = self
2297             .get_regs(cpu_id as u8)
2298             .map_err(DebuggableError::ReadRegs)?;
2299         let gregs = StandardRegisters {
2300             rax: regs.regs[0],
2301             rbx: regs.regs[1],
2302             rcx: regs.regs[2],
2303             rdx: regs.regs[3],
2304             rsi: regs.regs[4],
2305             rdi: regs.regs[5],
2306             rbp: regs.regs[6],
2307             rsp: regs.regs[7],
2308             r8: regs.regs[8],
2309             r9: regs.regs[9],
2310             r10: regs.regs[10],
2311             r11: regs.regs[11],
2312             r12: regs.regs[12],
2313             r13: regs.regs[13],
2314             r14: regs.regs[14],
2315             r15: regs.regs[15],
2316             rip: regs.rip,
2317             // Update the lower 32 bits of rflags.
2318             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2319         };
2320 
2321         self.set_regs(cpu_id as u8, &gregs)
2322             .map_err(DebuggableError::WriteRegs)?;
2323 
2324         // Segment registers: CS, SS, DS, ES, FS, GS
2325         // Since GDB cares only about the selectors, we call get_sregs() first.
2326         let mut sregs = self
2327             .get_sregs(cpu_id as u8)
2328             .map_err(DebuggableError::ReadRegs)?;
2329         sregs.cs.selector = regs.segments.cs as u16;
2330         sregs.ss.selector = regs.segments.ss as u16;
2331         sregs.ds.selector = regs.segments.ds as u16;
2332         sregs.es.selector = regs.segments.es as u16;
2333         sregs.fs.selector = regs.segments.fs as u16;
2334         sregs.gs.selector = regs.segments.gs as u16;
2335 
2336         self.set_sregs(cpu_id as u8, &sregs)
2337             .map_err(DebuggableError::WriteRegs)?;
2338 
2339         // TODO: Add other registers
2340 
2341         Ok(())
2342     }
2343 
2344     #[cfg(target_arch = "aarch64")]
2345     fn write_regs(
2346         &self,
2347         cpu_id: usize,
2348         regs: &CoreRegs,
2349     ) -> std::result::Result<(), DebuggableError> {
2350         let mut gregs = self
2351             .get_regs(cpu_id as u8)
2352             .map_err(DebuggableError::ReadRegs)?;
2353 
2354         gregs.regs.regs = regs.x;
2355         gregs.regs.sp = regs.sp;
2356         gregs.regs.pc = regs.pc;
2357 
2358         self.set_regs(cpu_id as u8, &gregs)
2359             .map_err(DebuggableError::WriteRegs)?;
2360 
2361         Ok(())
2362     }
2363 
2364     fn read_mem(
2365         &self,
2366         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2367         cpu_id: usize,
2368         vaddr: GuestAddress,
2369         len: usize,
2370     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2371         let mut buf = vec![0; len];
2372         let mut total_read = 0_u64;
2373 
2374         while total_read < len as u64 {
2375             let gaddr = vaddr.0 + total_read;
2376             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2377                 Ok(paddr) => paddr,
2378                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2379                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2380             };
2381             let psize = arch::PAGE_SIZE as u64;
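            // Clamp each read to the end of the physical page containing paddr:
            // contiguous GVAs are not guaranteed to map to contiguous GPAs, so
            // the translation is redone at every page boundary.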
2382             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2383             guest_memory
2384                 .memory()
2385                 .read(
2386                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2387                     GuestAddress(paddr),
2388                 )
2389                 .map_err(DebuggableError::ReadMem)?;
2390             total_read += read_len;
2391         }
2392         Ok(buf)
2393     }
2394 
2395     fn write_mem(
2396         &self,
2397         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2398         cpu_id: usize,
2399         vaddr: &GuestAddress,
2400         data: &[u8],
2401     ) -> std::result::Result<(), DebuggableError> {
2402         let mut total_written = 0_u64;
2403 
2404         while total_written < data.len() as u64 {
2405             let gaddr = vaddr.0 + total_written;
2406             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2407                 Ok(paddr) => paddr,
2408                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2409                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2410             };
2411             let psize = arch::PAGE_SIZE as u64;
2412             let write_len = std::cmp::min(
2413                 data.len() as u64 - total_written,
2414                 psize - (paddr & (psize - 1)),
2415             );
2416             guest_memory
2417                 .memory()
2418                 .write(
2419                     &data[total_written as usize..total_written as usize + write_len as usize],
2420                     GuestAddress(paddr),
2421                 )
2422                 .map_err(DebuggableError::WriteMem)?;
2423             total_written += write_len;
2424         }
2425         Ok(())
2426     }
2427 
2428     fn active_vcpus(&self) -> usize {
2429         self.present_vcpus() as usize
2430     }
2431 }
2432 
2433 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2434 impl Elf64Writable for CpuManager {}
2435 
2436 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2437 impl CpuElf64Writable for CpuManager {
2438     fn cpu_write_elf64_note(
2439         &mut self,
2440         dump_state: &DumpState,
2441     ) -> std::result::Result<(), GuestDebuggableError> {
2442         let mut coredump_file = dump_state.file.as_ref().unwrap();
2443         for vcpu in &self.vcpus {
2444             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2445             let mut pos: usize = 0;
2446             let mut buf = vec![0; note_size as usize];
2447             let descsz = size_of::<X86_64ElfPrStatus>();
2448             let vcpu_id = vcpu.lock().unwrap().id;
2449 
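            // ELF note layout: an Elf64_Nhdr, the 4-byte-padded owner name
            // ("CORE"), then the descriptor, a prstatus image whose pr_pid
            // field sits at offset 32.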
2450             let note = Elf64_Nhdr {
2451                 n_namesz: COREDUMP_NAME_SIZE,
2452                 n_descsz: descsz as u32,
2453                 n_type: NT_PRSTATUS,
2454             };
2455 
2456             let bytes: &[u8] = note.as_slice();
2457             buf.splice(0.., bytes.to_vec());
2458             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2459             buf.resize(pos + 4, 0);
2460             buf.splice(pos.., "CORE".to_string().into_bytes());
2461 
2462             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2463             buf.resize(pos + 32 + 4, 0);
2464             let pid = vcpu_id as u64;
2465             let bytes: &[u8] = pid.as_slice();
2466             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2467 
2468             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2469 
2470             let orig_rax: u64 = 0;
2471             let gregs = self.vcpus[usize::from(vcpu_id)]
2472                 .lock()
2473                 .unwrap()
2474                 .vcpu
2475                 .get_regs()
2476                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2477 
2478             let regs1 = [
2479                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2480                 gregs.r10,
2481             ];
2482             let regs2 = [
2483                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2484             ];
2485 
2486             let sregs = self.vcpus[usize::from(vcpu_id)]
2487                 .lock()
2488                 .unwrap()
2489                 .vcpu
2490                 .get_sregs()
2491                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2492 
2493             debug!(
2494                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2495                 gregs.rip,
2496                 gregs.rsp,
2497                 sregs.gs.base,
2498                 sregs.cs.selector,
2499                 sregs.ss.selector,
2500                 sregs.ds.selector,
2501             );
2502 
2503             let regs = X86_64UserRegs {
2504                 regs1,
2505                 regs2,
2506                 rip: gregs.rip,
2507                 cs: sregs.cs.selector as u64,
2508                 eflags: gregs.rflags,
2509                 rsp: gregs.rsp,
2510                 ss: sregs.ss.selector as u64,
2511                 fs_base: sregs.fs.base,
2512                 gs_base: sregs.gs.base,
2513                 ds: sregs.ds.selector as u64,
2514                 es: sregs.es.selector as u64,
2515                 fs: sregs.fs.selector as u64,
2516                 gs: sregs.gs.selector as u64,
2517             };
2518 
2520             let bytes: &[u8] = regs.as_slice();
2521             buf.resize(note_size as usize, 0);
2522             buf.splice(pos.., bytes.to_vec());
2523             buf.resize(note_size as usize, 0);
2524 
2525             coredump_file
2526                 .write(&buf)
2527                 .map_err(GuestDebuggableError::CoredumpFile)?;
2528         }
2529 
2530         Ok(())
2531     }
2532 
2533     fn cpu_write_vmm_note(
2534         &mut self,
2535         dump_state: &DumpState,
2536     ) -> std::result::Result<(), GuestDebuggableError> {
2537         let mut coredump_file = dump_state.file.as_ref().unwrap();
2538         for vcpu in &self.vcpus {
2539             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2540             let mut pos: usize = 0;
2541             let mut buf = vec![0; note_size as usize];
2542             let descsz = size_of::<DumpCpusState>();
2543             let vcpu_id = vcpu.lock().unwrap().id;
2544 
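            // Same ELF note layout as in cpu_write_elf64_note(), but with owner
            // name "QEMU" and n_type 0: a VMM-specific note carrying the full
            // CPU state, presumably for compatibility with QEMU-style coredumps.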
2545             let note = Elf64_Nhdr {
2546                 n_namesz: COREDUMP_NAME_SIZE,
2547                 n_descsz: descsz as u32,
2548                 n_type: 0,
2549             };
2550 
2551             let bytes: &[u8] = note.as_slice();
2552             buf.splice(0.., bytes.to_vec());
2553             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2554 
2555             buf.resize(pos + 4, 0);
2556             buf.splice(pos.., "QEMU".to_string().into_bytes());
2557 
2558             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2559 
2560             let gregs = self.vcpus[usize::from(vcpu_id)]
2561                 .lock()
2562                 .unwrap()
2563                 .vcpu
2564                 .get_regs()
2565                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2566 
2567             let regs1 = [
2568                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2569                 gregs.rbp,
2570             ];
2571 
2572             let regs2 = [
2573                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2574                 gregs.r15,
2575             ];
2576 
2577             let sregs = self.vcpus[usize::from(vcpu_id)]
2578                 .lock()
2579                 .unwrap()
2580                 .vcpu
2581                 .get_sregs()
2582                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2583 
2584             let mut msrs = vec![MsrEntry {
2585                 index: msr_index::MSR_KERNEL_GS_BASE,
2586                 ..Default::default()
2587             }];
2588 
2589             self.vcpus[vcpu_id as usize]
2590                 .lock()
2591                 .unwrap()
2592                 .vcpu
2593                 .get_msrs(&mut msrs)
2594                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2595             let kernel_gs_base = msrs[0].data;
2596 
2597             let cs = CpuSegment::new(sregs.cs);
2598             let ds = CpuSegment::new(sregs.ds);
2599             let es = CpuSegment::new(sregs.es);
2600             let fs = CpuSegment::new(sregs.fs);
2601             let gs = CpuSegment::new(sregs.gs);
2602             let ss = CpuSegment::new(sregs.ss);
2603             let ldt = CpuSegment::new(sregs.ldt);
2604             let tr = CpuSegment::new(sregs.tr);
2605             let gdt = CpuSegment::new_from_table(sregs.gdt);
2606             let idt = CpuSegment::new_from_table(sregs.idt);
2607             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2608             let regs = DumpCpusState {
2609                 version: 1,
2610                 size: size_of::<DumpCpusState>() as u32,
2611                 regs1,
2612                 regs2,
2613                 rip: gregs.rip,
2614                 rflags: gregs.rflags,
2615                 cs,
2616                 ds,
2617                 es,
2618                 fs,
2619                 gs,
2620                 ss,
2621                 ldt,
2622                 tr,
2623                 gdt,
2624                 idt,
2625                 cr,
2626                 kernel_gs_base,
2627             };
2628 
2629             let bytes: &[u8] = regs.as_slice();
2630             buf.resize(note_size as usize, 0);
2631             buf.splice(pos.., bytes.to_vec());
2632             buf.resize(note_size as usize, 0);
2633 
2634             coredump_file
2635                 .write(&buf)
2636                 .map_err(GuestDebuggableError::CoredumpFile)?;
2637         }
2638 
2639         Ok(())
2640     }
2641 }
2642 
2643 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2644 #[cfg(test)]
2645 mod tests {
2646     use arch::x86_64::interrupts::*;
2647     use arch::x86_64::regs::*;
2648     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2649 
2650     #[test]
2651     fn test_setlint() {
2652         let hv = hypervisor::new().unwrap();
2653         let vm = hv.create_vm().expect("new VM fd creation failed");
2654         assert!(hv.check_required_extensions().is_ok());
2655         // Calling get_lapic will fail if there is no irqchip created beforehand.
2656         assert!(vm.create_irq_chip().is_ok());
2657         let vcpu = vm.create_vcpu(0, None).unwrap();
2658         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2659 
2660         // Compute the value that is expected to represent LVT0 and LVT1.
2661         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2662         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2663         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2664         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2665 
2666         set_lint(&vcpu).unwrap();
2667 
2668         // Compute the value that represents LVT0 and LVT1 after set_lint.
2669         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2670         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2671         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2672         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2673         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2674     }
2675 
2676     #[test]
2677     fn test_setup_fpu() {
2678         let hv = hypervisor::new().unwrap();
2679         let vm = hv.create_vm().expect("new VM fd creation failed");
2680         let vcpu = vm.create_vcpu(0, None).unwrap();
2681         setup_fpu(&vcpu).unwrap();
2682 
2683         let expected_fpu: FpuState = FpuState {
2684             fcw: 0x37f,
2685             mxcsr: 0x1f80,
2686             ..Default::default()
2687         };
2688         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2689         // TODO: auto-generate kvm related structures with PartialEq on.
2690         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2691         // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
2692         // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
2693         // The mxcsr will stay 0 and the assert below would fail. Decide whether we
2694         // should remove it altogether.
2695         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2696     }
2697 
2698     #[test]
2699     fn test_setup_msrs() {
2700         use hypervisor::arch::x86::{msr_index, MsrEntry};
2701 
2702         let hv = hypervisor::new().unwrap();
2703         let vm = hv.create_vm().expect("new VM fd creation failed");
2704         let vcpu = vm.create_vcpu(0, None).unwrap();
2705         setup_msrs(&vcpu).unwrap();
2706 
2707         // This test will check against the last MSR entry configured (the tenth one).
2708         // See create_msr_entries for details.
2709         let mut msrs = vec![MsrEntry {
2710             index: msr_index::MSR_IA32_MISC_ENABLE,
2711             ..Default::default()
2712         }];
2713 
2714         // get_msrs returns the number of MSRs that it succeeded in reading. We only want to
2715         // read one in this test scenario.
2716         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2717         assert_eq!(read_msrs, 1);
2718 
2719         // Official entries that were set up when we did setup_msrs. We need to assert that the
2720         // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
2721         // expect.
2722         let entry_vec = vcpu.boot_msr_entries();
2723         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2724     }
2725 
2726     #[test]
2727     fn test_setup_regs() {
2728         let hv = hypervisor::new().unwrap();
2729         let vm = hv.create_vm().expect("new VM fd creation failed");
2730         let vcpu = vm.create_vcpu(0, None).unwrap();
2731 
2732         let expected_regs: StandardRegisters = StandardRegisters {
2733             rflags: 0x0000000000000002u64,
2734             rbx: arch::layout::PVH_INFO_START.0,
2735             rip: 1,
2736             ..Default::default()
2737         };
2738 
2739         setup_regs(&vcpu, expected_regs.rip).unwrap();
2740 
2741         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2742         assert_eq!(actual_regs, expected_regs);
2743     }
2744 }
2745 
2746 #[cfg(target_arch = "aarch64")]
2747 #[cfg(test)]
2748 mod tests {
2749     use arch::{aarch64::regs, layout};
2750     use hypervisor::kvm::aarch64::is_system_register;
2751     use hypervisor::kvm::kvm_bindings::{
2752         kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2753         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2754     };
2755     use hypervisor::{arm64_core_reg_id, offset_of};
2756     use std::mem;
2757 
2758     #[test]
2759     fn test_setup_regs() {
2760         let hv = hypervisor::new().unwrap();
2761         let vm = hv.create_vm().unwrap();
2762         let vcpu = vm.create_vcpu(0, None).unwrap();
2763 
2764         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2765         // Must fail when vcpu is not initialized yet.
2766         assert!(res.is_err());
2767 
2768         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2769         vm.get_preferred_target(&mut kvi).unwrap();
2770         vcpu.vcpu_init(&kvi).unwrap();
2771 
2772         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2773     }
2774 
2775     #[test]
2776     fn test_read_mpidr() {
2777         let hv = hypervisor::new().unwrap();
2778         let vm = hv.create_vm().unwrap();
2779         let vcpu = vm.create_vcpu(0, None).unwrap();
2780         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2781         vm.get_preferred_target(&mut kvi).unwrap();
2782 
2783         // Must fail when vcpu is not initialized yet.
2784         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2785 
2786         vcpu.vcpu_init(&kvi).unwrap();
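        // For vCPU 0 all affinity fields are zero, so only the RES1 bit (bit 31)
        // of MPIDR_EL1 is set.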
2787         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2788     }
2789 
2790     #[test]
2791     fn test_is_system_register() {
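        // A core register id combines KVM_REG_ARM64, the register size and
        // KVM_REG_ARM_CORE with the field's offset inside user_pt_regs, so it
        // never carries the KVM_REG_ARM64_SYSREG marker that identifies system
        // registers.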
2792         let offset = offset_of!(user_pt_regs, pc);
2793         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2794         assert!(!is_system_register(regid));
2795         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2796         assert!(is_system_register(regid));
2797     }
2798 
2799     #[test]
2800     fn test_save_restore_core_regs() {
2801         let hv = hypervisor::new().unwrap();
2802         let vm = hv.create_vm().unwrap();
2803         let vcpu = vm.create_vcpu(0, None).unwrap();
2804         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2805         vm.get_preferred_target(&mut kvi).unwrap();
2806 
2807         // Must fail when vcpu is not initialized yet.
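        // KVM reports ENOEXEC (os error 8) for register accesses on an
        // uninitialized vCPU, which is what the message checked below spells out.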
2808         let res = vcpu.get_regs();
2809         assert!(res.is_err());
2810         assert_eq!(
2811             format!("{}", res.unwrap_err()),
2812             "Failed to get core register: Exec format error (os error 8)"
2813         );
2814 
2815         let mut state = kvm_regs::default();
2816         let res = vcpu.set_regs(&state);
2817         assert!(res.is_err());
2818         assert_eq!(
2819             format!("{}", res.unwrap_err()),
2820             "Failed to set core register: Exec format error (os error 8)"
2821         );
2822 
2823         vcpu.vcpu_init(&kvi).unwrap();
2824         let res = vcpu.get_regs();
2825         assert!(res.is_ok());
2826         state = res.unwrap();
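        // 0x3C5 is PSR_MODE_EL1h with the D, A, I and F exception masks set
        // (0x200 | 0x100 | 0x80 | 0x40 | 0x5), i.e. the PSTATE value expected
        // after vcpu_init.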
2827         assert_eq!(state.regs.pstate, 0x3C5);
2828 
2829         assert!(vcpu.set_regs(&state).is_ok());
2830     }
2831 
2832     #[test]
2833     fn test_get_set_mpstate() {
2834         let hv = hypervisor::new().unwrap();
2835         let vm = hv.create_vm().unwrap();
2836         let vcpu = vm.create_vcpu(0, None).unwrap();
2837         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2838         vm.get_preferred_target(&mut kvi).unwrap();
2839 
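        // Unlike register accesses, MP state can be read and written even though
        // vcpu_init has not been called on this vCPU.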
2840         let res = vcpu.get_mp_state();
2841         assert!(res.is_ok());
2842         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2843     }
2844 }
2845