xref: /cloud-hypervisor/vmm/src/cpu.rs (revision ed63b352d1ebf70f36c7d36a0d6b52fc96186581)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use std::collections::BTreeMap;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use std::io::Write;
17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
18 use std::mem::size_of;
19 use std::os::unix::thread::JoinHandleExt;
20 use std::sync::atomic::{AtomicBool, Ordering};
21 use std::sync::{Arc, Barrier, Mutex};
22 use std::{cmp, io, result, thread};
23 
24 #[cfg(not(target_arch = "riscv64"))]
25 use acpi_tables::sdt::Sdt;
26 use acpi_tables::{aml, Aml};
27 use anyhow::anyhow;
28 #[cfg(target_arch = "x86_64")]
29 use arch::x86_64::get_x2apic_id;
30 use arch::{EntryPoint, NumaNodes};
31 #[cfg(target_arch = "aarch64")]
32 use devices::gic::Gic;
33 use devices::interrupt_controller::InterruptController;
34 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
35 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
36 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
37 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
38 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
39 use hypervisor::arch::aarch64::regs::{ID_AA64MMFR0_EL1, TCR_EL1, TTBR1_EL1};
40 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
41 use hypervisor::arch::x86::msr_index;
42 #[cfg(target_arch = "x86_64")]
43 use hypervisor::arch::x86::CpuIdEntry;
44 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
45 use hypervisor::arch::x86::MsrEntry;
46 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
47 use hypervisor::arch::x86::SpecialRegisters;
48 #[cfg(feature = "tdx")]
49 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
50 #[cfg(target_arch = "x86_64")]
51 use hypervisor::CpuVendor;
52 #[cfg(feature = "kvm")]
53 use hypervisor::HypervisorType;
54 #[cfg(feature = "guest_debug")]
55 use hypervisor::StandardRegisters;
56 use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
57 use libc::{c_void, siginfo_t};
58 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
59 use linux_loader::elf::Elf64_Nhdr;
60 use seccompiler::{apply_filter, SeccompAction};
61 use thiserror::Error;
62 use tracer::trace_scoped;
63 use vm_device::BusDevice;
64 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
65 use vm_memory::ByteValued;
66 #[cfg(feature = "guest_debug")]
67 use vm_memory::{Bytes, GuestAddressSpace};
68 use vm_memory::{GuestAddress, GuestMemoryAtomic};
69 use vm_migration::{
70     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
71     Transportable,
72 };
73 use vmm_sys_util::eventfd::EventFd;
74 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
75 use zerocopy::{FromBytes, Immutable, IntoBytes};
76 
77 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
78 use crate::coredump::{
79     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
80     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
81     NT_PRSTATUS,
82 };
83 #[cfg(feature = "guest_debug")]
84 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
85 #[cfg(target_arch = "x86_64")]
86 use crate::memory_manager::MemoryManager;
87 use crate::seccomp_filters::{get_seccomp_filter, Thread};
88 #[cfg(target_arch = "x86_64")]
89 use crate::vm::physical_bits;
90 use crate::vm_config::CpusConfig;
91 use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID};
92 
93 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
94 /// Extract the specified bits of a 64-bit integer.
95 /// For example, to extract 2 bits from offset 1 (zero-based) of `6u64`,
96 /// the following expression should return 3 (`0b11`):
97 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
98 ///
99 macro_rules! extract_bits_64 {
100     ($value: tt, $offset: tt, $length: tt) => {
101         ($value >> $offset) & (!0u64 >> (64 - $length))
102     };
103 }
104 
105 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
106 macro_rules! extract_bits_64_without_offset {
107     ($value: tt, $length: tt) => {
108         $value & (!0u64 >> (64 - $length))
109     };
110 }
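
// A minimal sanity check of the two helper macros above (an illustrative
// addition, not part of the original file):
#[cfg(all(test, target_arch = "aarch64", feature = "guest_debug"))]
mod bit_extraction_tests {
    #[test]
    fn extracts_expected_bits() {
        // 2 bits starting at offset 1 of 0b0110 are 0b11.
        assert_eq!(extract_bits_64!(0b0000_0110u64, 1, 2), 0b11);
        // The lowest 2 bits of 0b0110 are 0b10.
        assert_eq!(extract_bits_64_without_offset!(0b0000_0110u64, 2), 0b10);
    }
}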
111 
112 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
113 
114 #[derive(Debug, Error)]
115 pub enum Error {
116     #[error("Error creating vCPU")]
117     VcpuCreate(#[source] anyhow::Error),
118 
119     #[error("Error running vCPU")]
120     VcpuRun(#[source] anyhow::Error),
121 
122     #[error("Error spawning vCPU thread")]
123     VcpuSpawn(#[source] io::Error),
124 
125     #[error("Error generating common CPUID")]
126     CommonCpuId(#[source] arch::Error),
127 
128     #[error("Error configuring vCPU")]
129     VcpuConfiguration(#[source] arch::Error),
130 
131     #[error("Still pending removed vCPU")]
132     VcpuPendingRemovedVcpu,
133 
134     #[cfg(target_arch = "aarch64")]
135     #[error("Error fetching preferred target")]
136     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
137 
138     #[cfg(target_arch = "aarch64")]
139     #[error("Error setting vCPU processor features")]
140     VcpuSetProcessorFeatures(#[source] hypervisor::HypervisorCpuError),
141 
142     #[cfg(target_arch = "aarch64")]
143     #[error("Error initialising vCPU")]
144     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
145 
146     #[cfg(target_arch = "aarch64")]
147     #[error("Error finalising vCPU")]
148     VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError),
149 
150     #[cfg(target_arch = "aarch64")]
151     #[error("Error initialising GICR base address")]
152     VcpuSetGicrBaseAddr(#[source] hypervisor::HypervisorCpuError),
153 
154     #[error("Failed to join on vCPU threads: {0:?}")]
155     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
156 
157     #[error("Error adding CpuManager to MMIO bus")]
158     BusError(#[source] vm_device::BusError),
159 
160     #[error("Requested zero vCPUs")]
161     DesiredVCpuCountIsZero,
162 
163     #[error("Requested vCPUs exceed maximum")]
164     DesiredVCpuCountExceedsMax,
165 
166     #[error("Cannot create seccomp filter")]
167     CreateSeccompFilter(#[source] seccompiler::Error),
168 
169     #[error("Cannot apply seccomp filter")]
170     ApplySeccompFilter(#[source] seccompiler::Error),
171 
172     #[error("Error starting vCPU after restore")]
173     StartRestoreVcpu(#[source] anyhow::Error),
174 
175     #[error("Unexpected VmExit")]
176     UnexpectedVmExit,
177 
178     #[error("Failed to allocate MMIO address for CpuManager")]
179     AllocateMmmioAddress,
180 
181     #[cfg(feature = "tdx")]
182     #[error("Error initializing TDX")]
183     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
184 
185     #[cfg(target_arch = "aarch64")]
186     #[error("Error initializing PMU")]
187     InitPmu(#[source] hypervisor::HypervisorCpuError),
188 
189     #[cfg(feature = "guest_debug")]
190     #[error("Error during CPU debug")]
191     CpuDebug(#[source] hypervisor::HypervisorCpuError),
192 
193     #[cfg(feature = "guest_debug")]
194     #[error("Error translating virtual address")]
195     TranslateVirtualAddress(#[source] anyhow::Error),
196 
197     #[cfg(target_arch = "x86_64")]
198     #[error("Error setting up AMX")]
199     AmxEnable(#[source] anyhow::Error),
200 
201     #[error("Maximum number of vCPUs exceeds host limit")]
202     MaximumVcpusExceeded,
203 
204     #[cfg(feature = "sev_snp")]
205     #[error("Failed to set sev control register")]
206     SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),
207 
208     #[cfg(target_arch = "x86_64")]
209     #[error("Failed to inject NMI")]
210     NmiError(#[source] hypervisor::HypervisorCpuError),
211 }
212 pub type Result<T> = result::Result<T, Error>;
213 
214 #[cfg(target_arch = "x86_64")]
215 #[allow(dead_code)]
216 #[repr(C, packed)]
217 #[derive(IntoBytes, Immutable, FromBytes)]
218 struct LocalX2Apic {
219     pub r#type: u8,
220     pub length: u8,
221     pub _reserved: u16,
222     pub apic_id: u32,
223     pub flags: u32,
224     pub processor_id: u32,
225 }
226 
227 #[allow(dead_code)]
228 #[repr(C, packed)]
229 #[derive(Default, IntoBytes, Immutable, FromBytes)]
230 struct Ioapic {
231     pub r#type: u8,
232     pub length: u8,
233     pub ioapic_id: u8,
234     _reserved: u8,
235     pub apic_address: u32,
236     pub gsi_base: u32,
237 }
238 
239 #[cfg(target_arch = "aarch64")]
240 #[allow(dead_code)]
241 #[repr(C, packed)]
242 #[derive(IntoBytes, Immutable, FromBytes)]
243 struct GicC {
244     pub r#type: u8,
245     pub length: u8,
246     pub reserved0: u16,
247     pub cpu_interface_number: u32,
248     pub uid: u32,
249     pub flags: u32,
250     pub parking_version: u32,
251     pub performance_interrupt: u32,
252     pub parked_address: u64,
253     pub base_address: u64,
254     pub gicv_base_address: u64,
255     pub gich_base_address: u64,
256     pub vgic_interrupt: u32,
257     pub gicr_base_address: u64,
258     pub mpidr: u64,
259     pub proc_power_effi_class: u8,
260     pub reserved1: u8,
261     pub spe_overflow_interrupt: u16,
262 }
263 
264 #[cfg(target_arch = "aarch64")]
265 #[allow(dead_code)]
266 #[repr(C, packed)]
267 #[derive(IntoBytes, Immutable, FromBytes)]
268 struct GicD {
269     pub r#type: u8,
270     pub length: u8,
271     pub reserved0: u16,
272     pub gic_id: u32,
273     pub base_address: u64,
274     pub global_irq_base: u32,
275     pub version: u8,
276     pub reserved1: [u8; 3],
277 }
278 
279 #[cfg(target_arch = "aarch64")]
280 #[allow(dead_code)]
281 #[repr(C, packed)]
282 #[derive(IntoBytes, Immutable, FromBytes)]
283 struct GicR {
284     pub r#type: u8,
285     pub length: u8,
286     pub reserved: u16,
287     pub base_address: u64,
288     pub range_length: u32,
289 }
290 
291 #[cfg(target_arch = "aarch64")]
292 #[allow(dead_code)]
293 #[repr(C, packed)]
294 #[derive(IntoBytes, Immutable, FromBytes)]
295 struct GicIts {
296     pub r#type: u8,
297     pub length: u8,
298     pub reserved0: u16,
299     pub translation_id: u32,
300     pub base_address: u64,
301     pub reserved1: u32,
302 }
303 
304 #[cfg(target_arch = "aarch64")]
305 #[allow(dead_code)]
306 #[repr(C, packed)]
307 #[derive(IntoBytes, Immutable, FromBytes)]
308 struct ProcessorHierarchyNode {
309     pub r#type: u8,
310     pub length: u8,
311     pub reserved: u16,
312     pub flags: u32,
313     pub parent: u32,
314     pub acpi_processor_id: u32,
315     pub num_private_resources: u32,
316 }
317 
318 #[allow(dead_code)]
319 #[repr(C, packed)]
320 #[derive(Default, IntoBytes, Immutable, FromBytes)]
321 struct InterruptSourceOverride {
322     pub r#type: u8,
323     pub length: u8,
324     pub bus: u8,
325     pub source: u8,
326     pub gsi: u32,
327     pub flags: u16,
328 }
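
// Illustrative compile-time layout checks (added as a sketch; not part of the
// original file). The hard-coded `length` values appended to the MADT in
// create_madt() below assume these exact packed sizes.
#[cfg(target_arch = "x86_64")]
const _: () = assert!(std::mem::size_of::<LocalX2Apic>() == 16);
const _: () = assert!(std::mem::size_of::<Ioapic>() == 12);
const _: () = assert!(std::mem::size_of::<InterruptSourceOverride>() == 10);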
329 
330 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
331 macro_rules! round_up {
332     ($n:expr,$d:expr) => {
333         (($n + $d - 1) / $d) * $d
334     };
335 }
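
// Quick sanity check of the rounding helper above (an illustrative addition):
// round_up!(n, d) rounds `n` up to the nearest multiple of `d`.
#[cfg(all(test, target_arch = "x86_64", feature = "guest_debug"))]
mod round_up_tests {
    #[test]
    fn rounds_to_next_multiple() {
        assert_eq!(round_up!(9usize, 4usize), 12);
        assert_eq!(round_up!(8usize, 4usize), 8);
    }
}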
336 
337 /// A wrapper around creating and using a hypervisor-agnostic vCPU.
338 pub struct Vcpu {
339     // The hypervisor abstracted CPU.
340     vcpu: Arc<dyn hypervisor::Vcpu>,
341     id: u8,
342     #[cfg(target_arch = "aarch64")]
343     mpidr: u64,
344     saved_state: Option<CpuState>,
345     #[cfg(target_arch = "x86_64")]
346     vendor: CpuVendor,
347 }
348 
349 impl Vcpu {
350     /// Constructs a new VCPU for `vm`.
351     ///
352     /// # Arguments
353     ///
354     /// * `id` - Represents the CPU number between [0, max vcpus).
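    /// * `apic_id` - APIC ID with which the hypervisor creates the vCPU; on
    ///   x86_64 this is derived from `id` and the configured CPU topology.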
355     /// * `vm` - The virtual machine this vcpu will get attached to.
356     /// * `vm_ops` - Optional object for exit handling.
357     /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
358     pub fn new(
359         id: u8,
360         apic_id: u8,
361         vm: &Arc<dyn hypervisor::Vm>,
362         vm_ops: Option<Arc<dyn VmOps>>,
363         #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
364     ) -> Result<Self> {
365         let vcpu = vm
366             .create_vcpu(apic_id, vm_ops)
367             .map_err(|e| Error::VcpuCreate(e.into()))?;
368         // Initially the cpuid per vCPU is the one supported by this VM.
369         Ok(Vcpu {
370             vcpu,
371             id,
372             #[cfg(target_arch = "aarch64")]
373             mpidr: 0,
374             saved_state: None,
375             #[cfg(target_arch = "x86_64")]
376             vendor: cpu_vendor,
377         })
378     }
379 
380     /// Configures a vCPU. Should be called once per vCPU, right after creation.
381     ///
382     /// # Arguments
383     ///
384     /// * `boot_setup` - Optional kernel entry point (address in guest memory and
385     ///   boot protocol used) together with the guest memory.
386     /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
387     pub fn configure(
388         &mut self,
389         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
390         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
391         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
392         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
393         #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
394     ) -> Result<()> {
395         #[cfg(target_arch = "aarch64")]
396         {
397             self.init(vm)?;
398             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
399                 .map_err(Error::VcpuConfiguration)?;
400         }
401         #[cfg(target_arch = "riscv64")]
402         arch::configure_vcpu(&self.vcpu, self.id, boot_setup).map_err(Error::VcpuConfiguration)?;
403         info!("Configuring vCPU: cpu_id = {}", self.id);
404         #[cfg(target_arch = "x86_64")]
405         arch::configure_vcpu(
406             &self.vcpu,
407             self.id,
408             boot_setup,
409             cpuid,
410             kvm_hyperv,
411             self.vendor,
412             topology,
413         )
414         .map_err(Error::VcpuConfiguration)?;
415 
416         Ok(())
417     }
418 
419     /// Gets the MPIDR register value.
420     #[cfg(target_arch = "aarch64")]
421     pub fn get_mpidr(&self) -> u64 {
422         self.mpidr
423     }
424 
425     /// Gets the saved vCPU state.
426     #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
427     pub fn get_saved_state(&self) -> Option<CpuState> {
428         self.saved_state.clone()
429     }
430 
431     /// Initializes an aarch64 specific vcpu for booting Linux.
432     #[cfg(target_arch = "aarch64")]
433     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
434         use std::arch::is_aarch64_feature_detected;
435         #[allow(clippy::nonminimal_bool)]
436         let sve_supported =
437             is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2");
438         let mut kvi = self.vcpu.create_vcpu_init();
439 
440         // This reads back the kernel's preferred target type.
441         vm.get_preferred_target(&mut kvi)
442             .map_err(Error::VcpuArmPreferredTarget)?;
443 
444         self.vcpu
445             .vcpu_set_processor_features(vm, &mut kvi, self.id)
446             .map_err(Error::VcpuSetProcessorFeatures)?;
447 
448         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?;
449 
450         if sve_supported {
451             let finalized_features = self.vcpu.vcpu_get_finalized_features();
452             self.vcpu
453                 .vcpu_finalize(finalized_features)
454                 .map_err(Error::VcpuArmFinalize)?;
455         }
456         Ok(())
457     }
458 
459     /// Runs the VCPU until it exits, returning the reason.
460     ///
461     /// Note that the state of the VCPU and associated VM must be set up first for this to do
462     /// anything useful.
463     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
464         self.vcpu.run()
465     }
466 
467     #[cfg(feature = "sev_snp")]
468     pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
469         self.vcpu
470             .set_sev_control_register(vmsa_pfn)
471             .map_err(Error::SetSevControlRegister)
472     }
473 
474     ///
475     /// Sets the vCPU's GIC redistributor base address.
476     ///
477     #[cfg(target_arch = "aarch64")]
478     pub fn set_gic_redistributor_addr(
479         &self,
480         base_redist_addr: u64,
481         redist_size: u64,
482     ) -> Result<()> {
483         let gicr_base = base_redist_addr + (arch::layout::GIC_V3_REDIST_SIZE * self.id as u64);
484         assert!(gicr_base + arch::layout::GIC_V3_REDIST_SIZE <= base_redist_addr + redist_size);
485         self.vcpu
486             .set_gic_redistributor_addr(gicr_base)
487             .map_err(Error::VcpuSetGicrBaseAddr)?;
488         Ok(())
489     }
490 }
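
// An illustrative note on the redistributor layout computed above: assuming the
// usual GICv3 arrangement of two 64 KiB frames per vCPU (GIC_V3_REDIST_SIZE =
// 0x20000), vCPU n owns [base + n * 0x20000, base + (n + 1) * 0x20000).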
491 
492 impl Pausable for Vcpu {}
493 impl Snapshottable for Vcpu {
494     fn id(&self) -> String {
495         self.id.to_string()
496     }
497 
498     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
499         let saved_state = self
500             .vcpu
501             .state()
502             .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;
503 
504         self.saved_state = Some(saved_state.clone());
505 
506         Ok(Snapshot::from_data(SnapshotData::new_from_state(
507             &saved_state,
508         )?))
509     }
510 }
511 
512 pub struct CpuManager {
513     config: CpusConfig,
514     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
515     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
516     #[cfg(target_arch = "x86_64")]
517     cpuid: Vec<CpuIdEntry>,
518     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
519     vm: Arc<dyn hypervisor::Vm>,
520     vcpus_kill_signalled: Arc<AtomicBool>,
521     vcpus_pause_signalled: Arc<AtomicBool>,
522     vcpus_kick_signalled: Arc<AtomicBool>,
523     exit_evt: EventFd,
524     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
525     reset_evt: EventFd,
526     #[cfg(feature = "guest_debug")]
527     vm_debug_evt: EventFd,
528     vcpu_states: Vec<VcpuState>,
529     selected_cpu: u8,
530     vcpus: Vec<Arc<Mutex<Vcpu>>>,
531     seccomp_action: SeccompAction,
532     vm_ops: Arc<dyn VmOps>,
533     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
534     acpi_address: Option<GuestAddress>,
535     proximity_domain_per_cpu: BTreeMap<u8, u32>,
536     affinity: BTreeMap<u8, Vec<usize>>,
537     dynamic: bool,
538     hypervisor: Arc<dyn hypervisor::Hypervisor>,
539     #[cfg(feature = "sev_snp")]
540     sev_snp_enabled: bool,
541 }
542 
543 const CPU_ENABLE_FLAG: usize = 0;
544 const CPU_INSERTING_FLAG: usize = 1;
545 const CPU_REMOVING_FLAG: usize = 2;
546 const CPU_EJECT_FLAG: usize = 3;
547 
548 const CPU_STATUS_OFFSET: u64 = 4;
549 const CPU_SELECTION_OFFSET: u64 = 0;
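
// Register map implied by the constants above (a descriptive summary, not part
// of the original file):
//
//   offset 0 (CPU_SELECTION_OFFSET): read/write the currently selected vCPU id
//   offset 4 (CPU_STATUS_OFFSET):    bit 0 = enabled, bit 1 = inserting (ack by
//                                    writing 1), bit 2 = removing (ack by
//                                    writing 1), bit 3 = eject (write 1 to
//                                    remove the selected vCPU)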
550 
551 impl BusDevice for CpuManager {
552     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
553         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
554         data.fill(0);
555 
556         match offset {
557             CPU_SELECTION_OFFSET => {
558                 data[0] = self.selected_cpu;
559             }
560             CPU_STATUS_OFFSET => {
561                 if self.selected_cpu < self.max_vcpus() {
562                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
563                     if state.active() {
564                         data[0] |= 1 << CPU_ENABLE_FLAG;
565                     }
566                     if state.inserting {
567                         data[0] |= 1 << CPU_INSERTING_FLAG;
568                     }
569                     if state.removing {
570                         data[0] |= 1 << CPU_REMOVING_FLAG;
571                     }
572                 } else {
573                     warn!("Out of range vCPU id: {}", self.selected_cpu);
574                 }
575             }
576             _ => {
577                 warn!(
578                     "Unexpected offset for accessing CPU manager device: {:#}",
579                     offset
580                 );
581             }
582         }
583     }
584 
585     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
586         match offset {
587             CPU_SELECTION_OFFSET => {
588                 self.selected_cpu = data[0];
589             }
590             CPU_STATUS_OFFSET => {
591                 if self.selected_cpu < self.max_vcpus() {
592                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
593                     // The ACPI code writes back a 1 to acknowledge the insertion
594                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
595                         && state.inserting
596                     {
597                         state.inserting = false;
598                     }
599                     // Ditto for removal
600                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
601                         && state.removing
602                     {
603                         state.removing = false;
604                     }
605                     // Trigger removal of vCPU
606                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
607                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
608                             error!("Error removing vCPU: {:?}", e);
609                         }
610                     }
611                 } else {
612                     warn!("Out of range vCPU id: {}", self.selected_cpu);
613                 }
614             }
615             _ => {
616                 warn!(
617                     "Unexpected offset for accessing CPU manager device: {:#}",
618                     offset
619                 );
620             }
621         }
622         None
623     }
624 }
625 
626 #[derive(Default)]
627 struct VcpuState {
628     inserting: bool,
629     removing: bool,
630     pending_removal: Arc<AtomicBool>,
631     handle: Option<thread::JoinHandle<()>>,
632     kill: Arc<AtomicBool>,
633     vcpu_run_interrupted: Arc<AtomicBool>,
634     paused: Arc<AtomicBool>,
635 }
636 
637 impl VcpuState {
638     fn active(&self) -> bool {
639         self.handle.is_some()
640     }
641 
642     fn signal_thread(&self) {
643         if let Some(handle) = self.handle.as_ref() {
644             loop {
645                 // SAFETY: FFI call with correct arguments
646                 unsafe {
647                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
648                 }
649                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
650                     break;
651                 } else {
652                     // This is more effective than thread::yield_now() at
653                     // avoiding a priority inversion with the vCPU thread
654                     thread::sleep(std::time::Duration::from_millis(1));
655                 }
656             }
657         }
658     }
659 
660     fn join_thread(&mut self) -> Result<()> {
661         if let Some(handle) = self.handle.take() {
662             handle.join().map_err(Error::ThreadCleanup)?
663         }
664 
665         Ok(())
666     }
667 
668     fn unpark_thread(&self) {
669         if let Some(handle) = self.handle.as_ref() {
670             handle.thread().unpark()
671         }
672     }
673 }
674 
675 impl CpuManager {
676     #[allow(unused_variables)]
677     #[allow(clippy::too_many_arguments)]
678     pub fn new(
679         config: &CpusConfig,
680         vm: Arc<dyn hypervisor::Vm>,
681         exit_evt: EventFd,
682         reset_evt: EventFd,
683         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
684         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
685         seccomp_action: SeccompAction,
686         vm_ops: Arc<dyn VmOps>,
687         #[cfg(feature = "tdx")] tdx_enabled: bool,
688         numa_nodes: &NumaNodes,
689         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
690     ) -> Result<Arc<Mutex<CpuManager>>> {
691         if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
692             return Err(Error::MaximumVcpusExceeded);
693         }
694 
695         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
696         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
697         let hypervisor_type = hypervisor.hypervisor_type();
698         #[cfg(target_arch = "x86_64")]
699         let cpu_vendor = hypervisor.get_cpu_vendor();
700 
701         #[cfg(target_arch = "x86_64")]
702         if config.features.amx {
703             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
704             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
705             const XFEATURE_XTILEDATA: usize = 18;
706             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
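
            // A descriptive note (added): these constants mirror the Linux
            // arch_prctl(2) AMX permission ABI. The code below first requests
            // XTILEDATA permission for the guest, then reads the permission
            // mask back to confirm it was actually granted.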
707 
708             // SAFETY: the syscall is only modifying kernel internal
709             // data structures that the kernel is itself expected to safeguard.
710             let amx_tile = unsafe {
711                 libc::syscall(
712                     libc::SYS_arch_prctl,
713                     ARCH_REQ_XCOMP_GUEST_PERM,
714                     XFEATURE_XTILEDATA,
715                 )
716             };
717 
718             if amx_tile != 0 {
719                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
720             } else {
721                 let mask: usize = 0;
722                 // SAFETY: the mask being modified (not marked mutable as it is
723                 // modified in unsafe only which is permitted) isn't in use elsewhere.
724                 let result = unsafe {
725                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
726                 };
727                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
728                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
729                 }
730             }
731         }
732 
733         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
734             let mut cpu_list = Vec::new();
735             for (proximity_domain, numa_node) in numa_nodes.iter() {
736                 for cpu in numa_node.cpus.iter() {
737                     cpu_list.push((*cpu, *proximity_domain))
738                 }
739             }
740             cpu_list
741         }
742         .into_iter()
743         .collect();
744 
745         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
746             cpu_affinity
747                 .iter()
748                 .map(|a| (a.vcpu, a.host_cpus.clone()))
749                 .collect()
750         } else {
751             BTreeMap::new()
752         };
753 
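        // An explanatory note (added): TDX guests do not support vCPU hotplug,
        // so dynamic vCPU resizing is enabled only when TDX is off.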
754         #[cfg(feature = "tdx")]
755         let dynamic = !tdx_enabled;
756         #[cfg(not(feature = "tdx"))]
757         let dynamic = true;
758 
759         Ok(Arc::new(Mutex::new(CpuManager {
760             config: config.clone(),
761             interrupt_controller: None,
762             #[cfg(target_arch = "x86_64")]
763             cpuid: Vec::new(),
764             vm,
765             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
766             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
767             vcpus_kick_signalled: Arc::new(AtomicBool::new(false)),
768             vcpu_states,
769             exit_evt,
770             reset_evt,
771             #[cfg(feature = "guest_debug")]
772             vm_debug_evt,
773             selected_cpu: 0,
774             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
775             seccomp_action,
776             vm_ops,
777             acpi_address: None,
778             proximity_domain_per_cpu,
779             affinity,
780             dynamic,
781             hypervisor: hypervisor.clone(),
782             #[cfg(feature = "sev_snp")]
783             sev_snp_enabled,
784         })))
785     }
786 
787     #[cfg(target_arch = "x86_64")]
788     pub fn populate_cpuid(
789         &mut self,
790         memory_manager: &Arc<Mutex<MemoryManager>>,
791         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
792         #[cfg(feature = "tdx")] tdx: bool,
793     ) -> Result<()> {
794         let sgx_epc_sections = memory_manager
795             .lock()
796             .unwrap()
797             .sgx_epc_region()
798             .as_ref()
799             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
800 
801         self.cpuid = {
802             let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
803             arch::generate_common_cpuid(
804                 hypervisor,
805                 &arch::CpuidConfig {
806                     sgx_epc_sections,
807                     phys_bits,
808                     kvm_hyperv: self.config.kvm_hyperv,
809                     #[cfg(feature = "tdx")]
810                     tdx,
811                     amx: self.config.features.amx,
812                 },
813             )
814             .map_err(Error::CommonCpuId)?
815         };
816 
817         Ok(())
818     }
819 
820     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
821         info!("Creating vCPU: cpu_id = {}", cpu_id);
822 
823         #[cfg(target_arch = "x86_64")]
824         let topology = self.get_vcpu_topology();
825         #[cfg(target_arch = "x86_64")]
826         let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
827         #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
828         let x2apic_id = cpu_id as u32;
829 
830         let mut vcpu = Vcpu::new(
831             cpu_id,
832             x2apic_id as u8,
833             &self.vm,
834             Some(self.vm_ops.clone()),
835             #[cfg(target_arch = "x86_64")]
836             self.hypervisor.get_cpu_vendor(),
837         )?;
838 
839         if let Some(snapshot) = snapshot {
840             // AArch64 vCPUs should be initialized after being created.
841             #[cfg(target_arch = "aarch64")]
842             vcpu.init(&self.vm)?;
843 
844             let state: CpuState = snapshot.to_state().map_err(|e| {
845                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
846             })?;
847             vcpu.vcpu
848                 .set_state(&state)
849                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
850 
851             vcpu.saved_state = Some(state);
852         }
853 
854         let vcpu = Arc::new(Mutex::new(vcpu));
855 
856         // Adding vCPU to the CpuManager's vCPU list.
857         self.vcpus.push(vcpu.clone());
858 
859         Ok(vcpu)
860     }
861 
862     pub fn configure_vcpu(
863         &self,
864         vcpu: Arc<Mutex<Vcpu>>,
865         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
866     ) -> Result<()> {
867         let mut vcpu = vcpu.lock().unwrap();
868 
869         #[cfg(feature = "sev_snp")]
870         if self.sev_snp_enabled {
871             if let Some((kernel_entry_point, _)) = boot_setup {
872                 vcpu.set_sev_control_register(
873                     kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
874                 )?;
875             }
876 
877             // Traditional way to configure vcpu doesn't work for SEV-SNP guests.
878             // All the vCPU configuration for SEV-SNP guest is provided via VMSA.
879             return Ok(());
880         }
881 
882         #[cfg(target_arch = "x86_64")]
883         assert!(!self.cpuid.is_empty());
884 
885         #[cfg(target_arch = "x86_64")]
886         let topology = self.config.topology.clone().map_or_else(
887             || Some((1, self.boot_vcpus(), 1)),
888             |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
889         );
890         #[cfg(target_arch = "x86_64")]
891         vcpu.configure(
892             boot_setup,
893             self.cpuid.clone(),
894             self.config.kvm_hyperv,
895             topology,
896         )?;
897 
898         #[cfg(target_arch = "aarch64")]
899         vcpu.configure(&self.vm, boot_setup)?;
900 
901         #[cfg(target_arch = "riscv64")]
902         vcpu.configure(boot_setup)?;
903 
904         Ok(())
905     }
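
    // Sketch of the intended create/configure/start sequence (illustrative,
    // assuming a CpuManager `cpu_manager` and an optional `boot_setup`):
    //
    //     let vcpus = cpu_manager.create_boot_vcpus(None)?;
    //     for vcpu in vcpus {
    //         cpu_manager.configure_vcpu(vcpu, boot_setup)?;
    //     }
    //     cpu_manager.start_boot_vcpus(false)?;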
906 
907     /// Only create new vCPUs if there aren't any inactive ones to reuse
908     fn create_vcpus(
909         &mut self,
910         desired_vcpus: u8,
911         snapshot: Option<Snapshot>,
912     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
913         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
914         info!(
915             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
916             desired_vcpus,
917             self.config.max_vcpus,
918             self.vcpus.len(),
919             self.present_vcpus()
920         );
921 
922         if desired_vcpus > self.config.max_vcpus {
923             return Err(Error::DesiredVCpuCountExceedsMax);
924         }
925 
926         // Only create vCPUs in excess of all the allocated vCPUs.
927         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
928             vcpus.push(self.create_vcpu(
929                 cpu_id,
930                 // TODO: The special format of the CPU id can be removed once
931                 // ready to break live upgrade.
932                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
933             )?);
934         }
935 
936         Ok(vcpus)
937     }
938 
939     #[cfg(target_arch = "aarch64")]
940     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
941         for cpu in self.vcpus.iter() {
942             let cpu = cpu.lock().unwrap();
943             // Check if the PMU attribute is available; if not, log it and skip PMU init.
944             if cpu.vcpu.has_pmu_support() {
945                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
946             } else {
947                 debug!(
948                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
949                     cpu.id
950                 );
951                 return Ok(false);
952             }
953         }
954 
955         Ok(true)
956     }
957 
958     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
959         self.vcpus.clone()
960     }
961 
962     fn start_vcpu(
963         &mut self,
964         vcpu: Arc<Mutex<Vcpu>>,
965         vcpu_id: u8,
966         vcpu_thread_barrier: Arc<Barrier>,
967         inserting: bool,
968     ) -> Result<()> {
969         let reset_evt = self.reset_evt.try_clone().unwrap();
970         let exit_evt = self.exit_evt.try_clone().unwrap();
971         #[cfg(feature = "kvm")]
972         let hypervisor_type = self.hypervisor.hypervisor_type();
973         #[cfg(feature = "guest_debug")]
974         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
975         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
976         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
977         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
978         let vcpu_kick_signalled = self.vcpus_kick_signalled.clone();
979 
980         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
981         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
982             .vcpu_run_interrupted
983             .clone();
984         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
985         let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();
986 
987         // Prepare the CPU set the current vCPU is expected to run on.
988         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
989             // SAFETY: all zeros is a valid pattern
990             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
991             // SAFETY: FFI call, trivially safe
992             unsafe { libc::CPU_ZERO(&mut cpuset) };
993             for host_cpu in host_cpus {
994                 // SAFETY: FFI call, trivially safe
995                 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
996             }
997             cpuset
998         });
999 
1000         // Retrieve seccomp filter for vcpu thread
1001         let vcpu_seccomp_filter = get_seccomp_filter(
1002             &self.seccomp_action,
1003             Thread::Vcpu,
1004             self.hypervisor.hypervisor_type(),
1005         )
1006         .map_err(Error::CreateSeccompFilter)?;
1007 
1008         #[cfg(target_arch = "x86_64")]
1009         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
1010 
1011         info!("Starting vCPU: cpu_id = {}", vcpu_id);
1012 
1013         let handle = Some(
1014             thread::Builder::new()
1015                 .name(format!("vcpu{vcpu_id}"))
1016                 .spawn(move || {
1017                     // Schedule the thread to run on the expected CPU set
1018                     if let Some(cpuset) = cpuset.as_ref() {
1019                         // SAFETY: FFI call with correct arguments
1020                         let ret = unsafe {
1021                             libc::sched_setaffinity(
1022                                 0,
1023                                 std::mem::size_of::<libc::cpu_set_t>(),
1024                                 cpuset as *const libc::cpu_set_t,
1025                             )
1026                         };
1027 
1028                         if ret != 0 {
1029                             error!(
1030                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
1031                                 vcpu_id,
1032                                 io::Error::last_os_error()
1033                             );
1034                             return;
1035                         }
1036                     }
1037 
1038                     // Apply seccomp filter for vcpu thread.
1039                     if !vcpu_seccomp_filter.is_empty() {
1040                         if let Err(e) =
1041                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
1042                         {
1043                             error!("Error applying seccomp filter: {:?}", e);
1044                             return;
1045                         }
1046                     }
1047                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
1048                     // An async-signal-safe no-op handler: the signal's only purpose is to interrupt the vCPU thread's blocking run loop.
1049                     register_signal_handler(SIGRTMIN(), handle_signal)
1050                         .expect("Failed to register vcpu signal handler");
1051                     // Block until all CPUs are ready.
1052                     vcpu_thread_barrier.wait();
1053 
1054                     std::panic::catch_unwind(move || {
1055                         loop {
1056                             // If we are being told to pause, we park the thread
1057                             // until the pause boolean is toggled.
1058                             // The resume operation is responsible for toggling
1059                             // the boolean and unpark the thread.
1060                             // We enter a loop because park() could spuriously
1061                             // return. We will then park() again unless the
1062                             // pause boolean has been toggled.
1063 
1064                             // Need to use Ordering::SeqCst as we have multiple
1065                             // loads and stores to different atomics and we need
1066                             // to see them in a consistent order in all threads
1067 
1068                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
1069                                 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
1070                                 // completed by returning to KVM_RUN. From the kernel docs:
1071                                 //
1072                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
1073                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
1074                                 // operations are complete (and guest state is consistent) only after userspace
1075                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
1076                                 // incomplete operations and then check for pending signals.
1077                                 // The pending state of the operation is not preserved in state which is
1078                                 // visible to userspace, thus userspace should ensure that the operation is
1079                                 // completed before performing a live migration.  Userspace can re-enter the
1080                                 // guest with an unmasked signal pending or with the immediate_exit field set
1081                                 // to complete pending operations without allowing any further instructions
1082                                 // to be executed.
1083 
1084                                 #[cfg(feature = "kvm")]
1085                                 if matches!(hypervisor_type, HypervisorType::Kvm) {
1086                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
1087                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
1088                                         error!("Unexpected VM exit on \"immediate_exit\" run");
1089                                         break;
1090                                     }
1091                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
1092                                 }
1093 
1094                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1095 
1096                                 vcpu_paused.store(true, Ordering::SeqCst);
1097                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
1098                                     thread::park();
1099                                 }
1100                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
1101                             }
1102 
1103                             if vcpu_kick_signalled.load(Ordering::SeqCst) {
1104                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1105                                 #[cfg(target_arch = "x86_64")]
1106                                 match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
1107                                     Ok(()) => {},
1108                                     Err(e) => {
1109                                         error!("Error injecting NMI: {}", e);
1110                                         break;
1111                                     }
1112                                 }
1113                             }
1114 
1115                             // We've been told to terminate
1116                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1117                                 || vcpu_kill.load(Ordering::SeqCst)
1118                             {
1119                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1120                                 break;
1121                             }
1122 
1123                             #[cfg(feature = "tdx")]
1124                             let mut vcpu = vcpu.lock().unwrap();
1125                             #[cfg(not(feature = "tdx"))]
1126                             let vcpu = vcpu.lock().unwrap();
1127                             // A triple fault surfaces as VmExit::Reset below, which triggers a guest reset
1128                             match vcpu.run() {
1129                                 Ok(run) => match run {
1130                                     #[cfg(feature = "kvm")]
1131                                     VmExit::Debug => {
1132                                         info!("VmExit::Debug");
1133                                         #[cfg(feature = "guest_debug")]
1134                                         {
1135                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
1136                                             let raw_tid = get_raw_tid(vcpu_id as usize);
1137                                             vm_debug_evt.write(raw_tid as u64).unwrap();
1138                                         }
1139                                     }
1140                                     #[cfg(target_arch = "x86_64")]
1141                                     VmExit::IoapicEoi(vector) => {
1142                                         if let Some(interrupt_controller) =
1143                                             &interrupt_controller_clone
1144                                         {
1145                                             interrupt_controller
1146                                                 .lock()
1147                                                 .unwrap()
1148                                                 .end_of_interrupt(vector);
1149                                         }
1150                                     }
1151                                     VmExit::Ignore => {}
1152                                     VmExit::Hyperv => {}
1153                                     VmExit::Reset => {
1154                                         info!("VmExit::Reset");
1155                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1156                                         reset_evt.write(1).unwrap();
1157                                         break;
1158                                     }
1159                                     VmExit::Shutdown => {
1160                                         info!("VmExit::Shutdown");
1161                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1162                                         exit_evt.write(1).unwrap();
1163                                         break;
1164                                     }
1165                                     #[cfg(feature = "tdx")]
1166                                     VmExit::Tdx => {
1167                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1168                                             match vcpu.get_tdx_exit_details() {
1169                                                 Ok(details) => match details {
1170                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1171                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1172                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1173                                                     }
1174                                                 },
1175                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1176                                             }
1177                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1178                                         } else {
1179                                             // We should never reach this code as
1180                                             // this means the design from the code
1181                                             // is wrong.
1182                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1183                                         }
1184                                     }
1185                                 },
1186 
1187                                 Err(e) => {
1188                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1189                                     vcpu_run_interrupted.store(true, Ordering::SeqCst);
1190                                     exit_evt.write(1).unwrap();
1191                                     break;
1192                                 }
1193                             }
1194 
1195                             // We've been told to terminate
1196                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1197                                 || vcpu_kill.load(Ordering::SeqCst)
1198                             {
1199                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1200                                 break;
1201                             }
1202                         }
1203                     })
1204                     .or_else(|_| {
1205                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1206                         error!("vCPU thread panicked");
1207                         panic_exit_evt.write(1)
1208                     })
1209                     .ok();
1210                 })
1211                 .map_err(Error::VcpuSpawn)?,
1212         );
1213 
1214         // On hotplug, calls into this function carry no entry point (entry_point is
1215         // None). It is for those hotplug CPU additions that we set the inserting flag.
1216         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1217         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1218 
1219         Ok(())
1220     }
1221 
1222     /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
1223     fn activate_vcpus(
1224         &mut self,
1225         desired_vcpus: u8,
1226         inserting: bool,
1227         paused: Option<bool>,
1228     ) -> Result<()> {
1229         if desired_vcpus > self.config.max_vcpus {
1230             return Err(Error::DesiredVCpuCountExceedsMax);
1231         }
1232 
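        // The barrier counts one party per newly started vCPU thread plus this
        // thread, which also waits on it below to release them all at once.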
1233         let vcpu_thread_barrier = Arc::new(Barrier::new(
1234             (desired_vcpus - self.present_vcpus() + 1) as usize,
1235         ));
1236 
1237         if let Some(paused) = paused {
1238             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1239         }
1240 
1241         info!(
1242             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1243             desired_vcpus,
1244             self.vcpus.len(),
1245             self.present_vcpus(),
1246             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1247         );
1248 
1249         // This reuses any inactive vCPUs as well as any that were newly created
1250         for vcpu_id in self.present_vcpus()..desired_vcpus {
1251             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1252             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1253         }
1254 
1255         // Unblock all CPU threads.
1256         vcpu_thread_barrier.wait();
1257         Ok(())
1258     }
1259 
1260     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1261         // Mark vCPUs for removal, actual removal happens on ejection
1262         for cpu_id in desired_vcpus..self.present_vcpus() {
1263             self.vcpu_states[usize::from(cpu_id)].removing = true;
1264             self.vcpu_states[usize::from(cpu_id)]
1265                 .pending_removal
1266                 .store(true, Ordering::SeqCst);
1267         }
1268     }
1269 
1270     pub fn check_pending_removed_vcpu(&mut self) -> bool {
1271         for state in self.vcpu_states.iter() {
1272             if state.active() && state.pending_removal.load(Ordering::SeqCst) {
1273                 return true;
1274             }
1275         }
1276         false
1277     }
1278 
1279     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1280         info!("Removing vCPU: cpu_id = {}", cpu_id);
1281         let state = &mut self.vcpu_states[usize::from(cpu_id)];
1282         state.kill.store(true, Ordering::SeqCst);
1283         state.signal_thread();
1284         state.join_thread()?;
1285         state.handle = None;
1286 
1287         // Once the thread has exited, clear the "kill" flag so that it can be reused
1288         state.kill.store(false, Ordering::SeqCst);
1289         state.pending_removal.store(false, Ordering::SeqCst);
1290 
1291         Ok(())
1292     }
1293 
1294     pub fn create_boot_vcpus(
1295         &mut self,
1296         snapshot: Option<Snapshot>,
1297     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1298         trace_scoped!("create_boot_vcpus");
1299 
1300         self.create_vcpus(self.boot_vcpus(), snapshot)
1301     }
1302 
1303     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1304     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1305         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1306     }
1307 
1308     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1309         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1310             .map_err(|e| {
1311                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1312             })?;
1313 
1314         Ok(())
1315     }
1316 
1317     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1318         if desired_vcpus == self.present_vcpus() {
1319             return Ok(false);
1320         }
1321 
1322         if !self.dynamic {
1323             return Ok(false);
1324         }
1325 
1326         if desired_vcpus < 1 {
1327             return Err(Error::DesiredVCpuCountIsZero);
1328         }
1329 
1330         if self.check_pending_removed_vcpu() {
1331             return Err(Error::VcpuPendingRemovedVcpu);
1332         }
1333 
1334         match desired_vcpus.cmp(&self.present_vcpus()) {
1335             cmp::Ordering::Greater => {
1336                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1337                 for vcpu in vcpus {
1338                     self.configure_vcpu(vcpu, None)?
1339                 }
1340                 self.activate_vcpus(desired_vcpus, true, None)?;
1341                 Ok(true)
1342             }
1343             cmp::Ordering::Less => {
1344                 self.mark_vcpus_for_removal(desired_vcpus);
1345                 Ok(true)
1346             }
1347             _ => Ok(false),
1348         }
1349     }
1350 
1351     pub fn shutdown(&mut self) -> Result<()> {
1352         // Tell the vCPUs to stop themselves next time they go through the loop
1353         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1354 
1355         // Toggle the vCPUs pause boolean
1356         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1357 
1358         // Unpark all the VCPU threads.
1359         for state in self.vcpu_states.iter() {
1360             state.unpark_thread();
1361         }
1362 
1363         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1364         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1365         // above.
1366         for state in self.vcpu_states.iter() {
1367             state.signal_thread();
1368         }
1369 
1370         // Wait for all the threads to finish. This removes the state from the vector.
1371         for mut state in self.vcpu_states.drain(..) {
1372             state.join_thread()?;
1373         }
1374 
1375         Ok(())
1376     }
1377 
1378     #[cfg(feature = "tdx")]
1379     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1380         for vcpu in &self.vcpus {
1381             vcpu.lock()
1382                 .unwrap()
1383                 .vcpu
1384                 .tdx_init(hob_address)
1385                 .map_err(Error::InitializeTdx)?;
1386         }
1387         Ok(())
1388     }
1389 
1390     pub fn boot_vcpus(&self) -> u8 {
1391         self.config.boot_vcpus
1392     }
1393 
1394     pub fn max_vcpus(&self) -> u8 {
1395         self.config.max_vcpus
1396     }
1397 
1398     #[cfg(target_arch = "x86_64")]
1399     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1400         assert!(!self.cpuid.is_empty());
1401         self.cpuid.clone()
1402     }
1403 
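    /// Number of vCPUs whose state is currently marked active, i.e. created
    /// and not yet removed.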
1404     fn present_vcpus(&self) -> u8 {
1405         self.vcpu_states
1406             .iter()
1407             .fold(0, |acc, state| acc + state.active() as u8)
1408     }
1409 
1410     #[cfg(target_arch = "aarch64")]
1411     pub fn get_mpidrs(&self) -> Vec<u64> {
1412         self.vcpus
1413             .iter()
1414             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1415             .collect()
1416     }
1417 
1418     #[cfg(target_arch = "aarch64")]
1419     pub fn get_saved_states(&self) -> Vec<CpuState> {
1420         self.vcpus
1421             .iter()
1422             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1423             .collect()
1424     }
1425 
1426     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1427         self.config
1428             .topology
1429             .clone()
1430             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1431     }
1432 
1433     #[cfg(not(target_arch = "riscv64"))]
1434     pub fn create_madt(&self) -> Sdt {
1435         use crate::acpi;
1436         // This is also checked in the command-line parsing.
1437         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1438 
1439         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1440         #[cfg(target_arch = "x86_64")]
1441         {
1442             madt.write(36, arch::layout::APIC_START.0);
1443 
1444             for cpu in 0..self.config.max_vcpus {
1445                 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());
1446 
1447                 let lapic = LocalX2Apic {
1448                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1449                     length: 16,
1450                     processor_id: cpu.into(),
1451                     apic_id: x2apic_id,
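                    // Boot vCPUs are marked enabled; every entry is additionally
                    // flagged online-capable so that the remaining vCPUs can be
                    // hot-plugged later.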
1452                     flags: if cpu < self.config.boot_vcpus {
1453                         1 << MADT_CPU_ENABLE_FLAG
1454                     } else {
1455                         0
1456                     } | (1 << MADT_CPU_ONLINE_CAPABLE_FLAG),
1457                     _reserved: 0,
1458                 };
1459                 madt.append(lapic);
1460             }
1461 
1462             madt.append(Ioapic {
1463                 r#type: acpi::ACPI_APIC_IO,
1464                 length: 12,
1465                 ioapic_id: 0,
1466                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1467                 gsi_base: 0,
1468                 ..Default::default()
1469             });
1470 
1471             madt.append(InterruptSourceOverride {
1472                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1473                 length: 10,
1474                 bus: 0,
1475                 source: 4,
1476                 gsi: 4,
1477                 flags: 0,
1478             });
1479         }
1480 
1481         #[cfg(target_arch = "aarch64")]
1482         {
1483             /* Notes:
1484              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1485              */
1486 
1487             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1488             for cpu in 0..self.config.boot_vcpus {
1489                 let vcpu = &self.vcpus[cpu as usize];
1490                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1491                 /* ARMv8 MPIDR format:
1492                      Bits [63:40] Must be zero
1493                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1494                      Bits [31:24] Must be zero
1495                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1496                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1497                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1498                 */
1499                 let mpidr_mask = 0xff_00ff_ffff;
1500                 let gicc = GicC {
1501                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1502                     length: 80,
1503                     reserved0: 0,
1504                     cpu_interface_number: cpu as u32,
1505                     uid: cpu as u32,
1506                     flags: 1,
1507                     parking_version: 0,
1508                     performance_interrupt: 0,
1509                     parked_address: 0,
1510                     base_address: 0,
1511                     gicv_base_address: 0,
1512                     gich_base_address: 0,
1513                     vgic_interrupt: 0,
1514                     gicr_base_address: 0,
1515                     mpidr: mpidr & mpidr_mask,
1516                     proc_power_effi_class: 0,
1517                     reserved1: 0,
1518                     spe_overflow_interrupt: 0,
1519                 };
1520 
1521                 madt.append(gicc);
1522             }
1523             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1524 
1525             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1526             let gicd = GicD {
1527                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1528                 length: 24,
1529                 reserved0: 0,
1530                 gic_id: 0,
1531                 base_address: vgic_config.dist_addr,
1532                 global_irq_base: 0,
1533                 version: 3,
1534                 reserved1: [0; 3],
1535             };
1536             madt.append(gicd);
1537 
1538             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1539             let gicr = GicR {
1540                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1541                 length: 16,
1542                 reserved: 0,
1543                 base_address: vgic_config.redists_addr,
1544                 range_length: vgic_config.redists_size as u32,
1545             };
1546             madt.append(gicr);
1547 
1548             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1549             let gicits = GicIts {
1550                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1551                 length: 20,
1552                 reserved0: 0,
1553                 translation_id: 0,
1554                 base_address: vgic_config.msi_addr,
1555                 reserved1: 0,
1556             };
1557             madt.append(gicits);
1558 
1559             madt.update_checksum();
1560         }
1561 
1562         madt
1563     }
1564 
1565     #[cfg(target_arch = "aarch64")]
1566     pub fn create_pptt(&self) -> Sdt {
1567         let pptt_start = 0;
1568         let mut cpus = 0;
1569         let mut uid = 0;
1570         // If topology is not specified, the default setting is:
1571         // 1 package, multiple cores, 1 thread per core
1572         // This is also the behavior when PPTT is missing.
1573         let (threads_per_core, cores_per_package, packages) =
1574             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
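        // For example, with max_vcpus = 8 and no explicit topology this yields
        // (threads_per_core, cores_per_package, packages) = (1, 8, 1).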
1575 
1576         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1577 
1578         for cluster_idx in 0..packages {
1579             if cpus < self.config.boot_vcpus as usize {
1580                 let cluster_offset = pptt.len() - pptt_start;
1581                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1582                     r#type: 0,
1583                     length: 20,
1584                     reserved: 0,
1585                     flags: 0x2,
1586                     parent: 0,
1587                     acpi_processor_id: cluster_idx as u32,
1588                     num_private_resources: 0,
1589                 };
1590                 pptt.append(cluster_hierarchy_node);
1591 
1592                 for core_idx in 0..cores_per_package {
1593                     let core_offset = pptt.len() - pptt_start;
1594 
1595                     if threads_per_core > 1 {
1596                         let core_hierarchy_node = ProcessorHierarchyNode {
1597                             r#type: 0,
1598                             length: 20,
1599                             reserved: 0,
1600                             flags: 0x2,
1601                             parent: cluster_offset as u32,
1602                             acpi_processor_id: core_idx as u32,
1603                             num_private_resources: 0,
1604                         };
1605                         pptt.append(core_hierarchy_node);
1606 
1607                         for _thread_idx in 0..threads_per_core {
1608                             let thread_hierarchy_node = ProcessorHierarchyNode {
1609                                 r#type: 0,
1610                                 length: 20,
1611                                 reserved: 0,
1612                                 flags: 0xE,
1613                                 parent: core_offset as u32,
1614                                 acpi_processor_id: uid as u32,
1615                                 num_private_resources: 0,
1616                             };
1617                             pptt.append(thread_hierarchy_node);
1618                             uid += 1;
1619                         }
1620                     } else {
1621                         let thread_hierarchy_node = ProcessorHierarchyNode {
1622                             r#type: 0,
1623                             length: 20,
1624                             reserved: 0,
1625                             flags: 0xA,
1626                             parent: cluster_offset as u32,
1627                             acpi_processor_id: uid as u32,
1628                             num_private_resources: 0,
1629                         };
1630                         pptt.append(thread_hierarchy_node);
1631                         uid += 1;
1632                     }
1633                 }
1634                 cpus += (cores_per_package * threads_per_core) as usize;
1635             }
1636         }
1637 
1638         pptt.update_checksum();
1639         pptt
1640     }
1641 
1642     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1643     fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters {
1644         self.vcpus[usize::from(cpu_id)]
1645             .lock()
1646             .unwrap()
1647             .vcpu
1648             .create_standard_regs()
1649     }
1650 
1651     #[cfg(feature = "guest_debug")]
1652     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1653         self.vcpus[usize::from(cpu_id)]
1654             .lock()
1655             .unwrap()
1656             .vcpu
1657             .get_regs()
1658             .map_err(Error::CpuDebug)
1659     }
1660 
1661     #[cfg(feature = "guest_debug")]
1662     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1663         self.vcpus[usize::from(cpu_id)]
1664             .lock()
1665             .unwrap()
1666             .vcpu
1667             .set_regs(regs)
1668             .map_err(Error::CpuDebug)
1669     }
1670 
1671     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1672     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1673         self.vcpus[usize::from(cpu_id)]
1674             .lock()
1675             .unwrap()
1676             .vcpu
1677             .get_sregs()
1678             .map_err(Error::CpuDebug)
1679     }
1680 
1681     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1682     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1683         self.vcpus[usize::from(cpu_id)]
1684             .lock()
1685             .unwrap()
1686             .vcpu
1687             .set_sregs(sregs)
1688             .map_err(Error::CpuDebug)
1689     }
1690 
1691     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1692     fn translate_gva(
1693         &self,
1694         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1695         cpu_id: u8,
1696         gva: u64,
1697     ) -> Result<u64> {
1698         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1699             .lock()
1700             .unwrap()
1701             .vcpu
1702             .translate_gva(gva, /* flags: unused */ 0)
1703             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1704         Ok(gpa)
1705     }
1706 
1707     ///
1708     /// On AArch64, the `translate_gva` API is not provided by KVM. We
1709     /// implement it in the VMM by walking through the translation tables.
1710     ///
1711     /// Address translation is a big topic; here we only focus on the scenario
1712     /// that arises in the VMM while debugging the guest kernel. This
1713     /// `translate_gva` implementation is restricted to:
1714     /// - Exception Level 1
1715     /// - Translating the high address range only (kernel space)
1716     ///
1717     /// This implementation supports the following Armv8-A features related to
1718     /// address translation:
1719     /// - FEAT_LPA
1720     /// - FEAT_LVA
1721     /// - FEAT_LPA2
1722     ///
1723     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1724     fn translate_gva(
1725         &self,
1726         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1727         cpu_id: u8,
1728         gva: u64,
1729     ) -> Result<u64> {
1730         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1731             .lock()
1732             .unwrap()
1733             .vcpu
1734             .get_sys_reg(TCR_EL1)
1735             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1736         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1737             .lock()
1738             .unwrap()
1739             .vcpu
1740             .get_sys_reg(TTBR1_EL1)
1741             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1742         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1743             .lock()
1744             .unwrap()
1745             .vcpu
1746             .get_sys_reg(ID_AA64MMFR0_EL1)
1747             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1748 
1749         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1750         // or low (0x000xxx...).
1751         let high_range = extract_bits_64!(gva, 55, 1);
1752         if high_range == 0 {
1753             info!("VA (0x{:x}) range is not supported!", gva);
1754             return Ok(gva);
1755         }
1756 
1757         // High range size offset
1758         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1759         // Granule size
1760         let tg = extract_bits_64!(tcr_el1, 30, 2);
1761         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1762         let ds = extract_bits_64!(tcr_el1, 59, 1);
1763 
1764         if tsz == 0 {
1765             info!("VA translation is not ready!");
1766             return Ok(gva);
1767         }
1768 
1769         // The VA size is determined by TCR_EL1.T1SZ
1770         let va_size = 64 - tsz;
1771         // Number of bits in VA consumed in each level of translation
1772         let stride = match tg {
1773             3 => 13, // 64KB granule size
1774             1 => 11, // 16KB granule size
1775             _ => 9,  // 4KB, default
1776         };
1777         // Starting level of walking
1778         let mut level = 4 - (va_size - 4) / stride;
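        // Worked example: with a 4KB granule (stride = 9) and a 48-bit VA
        // (T1SZ = 16), the walk starts at level 4 - (48 - 4) / 9 = 0, while
        // a 39-bit VA (T1SZ = 25) starts at level 4 - (39 - 4) / 9 = 1.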
1779 
1780         // Determine the PA or IPA size
1781         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1782         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1783         // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should match.
1784         // To be safe, we use the minimum value if they are different.
1785         let pa_range = std::cmp::min(tcr_ips, pa_range);
1786         // PA size in bits
1787         let pa_size = match pa_range {
1788             0 => 32,
1789             1 => 36,
1790             2 => 40,
1791             3 => 42,
1792             4 => 44,
1793             5 => 48,
1794             6 => 52,
1795             _ => {
1796                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1797                     "PA range not supported {pa_range}"
1798                 ))))
1799             }
1800         };
1801 
1802         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
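        // e.g. 0xFFF for a 4KB granule: the byte offset of a descriptor within
        // one 512-entry table of 8-byte descriptors.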
1803         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1804         // If FEAT_LPA2 is present, the translation table descriptor holds
1805         // 50 bits of the table address of next level.
1806         // Otherwise, it is 48 bits.
1807         let descaddrmask = if ds == 1 {
1808             !0u64 >> (64 - 50) // mask with 50 least significant bits
1809         } else {
1810             !0u64 >> (64 - 48) // mask with 48 least significant bits
1811         };
1812         let descaddrmask = descaddrmask & !indexmask_grainsize;
1813 
1814         // Translation table base address
1815         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1816         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1817         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1818         if pa_size == 52 {
1819             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1820         }
1821 
1822         // Loop through tables of each level
1823         loop {
1824             // Table offset for current level
1825             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1826             descaddr |= table_offset;
1827             descaddr &= !7u64;
1828 
1829             let mut buf = [0; 8];
1830             guest_memory
1831                 .memory()
1832                 .read(&mut buf, GuestAddress(descaddr))
1833                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1834             let descriptor = u64::from_le_bytes(buf);
1835 
1836             descaddr = descriptor & descaddrmask;
1837             // In the case of FEAT_LPA, the next-level translation table address
1838             // bits [48:51] come from bits [12:15] of the current descriptor.
1839             // For FEAT_LPA2, the next-level translation table address
1840             // bits [50:51] come from bits [8:9] of the current descriptor, and
1841             // bits [48:49] come from bits [48:49] of the descriptor, which were
1842             // handled previously.
1843             if pa_size == 52 {
1844                 if ds == 1 {
1845                     // FEAT_LPA2
1846                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1847                 } else {
1848                     // FEAT_LPA
1849                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1850                 }
1851             }
1852 
1853             if (descriptor & 2) != 0 && (level < 3) {
1854                 // This is a table entry. Go down to next level.
1855                 level += 1;
1856                 indexmask = indexmask_grainsize;
1857                 continue;
1858             }
1859 
1860             break;
1861         }
1862 
1863         // We have reached either:
1864         // - a page entry at level 3 or
1865         // - a block entry at level 1 or 2
1866         let page_size = 1u64 << ((stride * (4 - level)) + 3);
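        // With a 4KB granule this is 4KB for a level-3 page entry, 2MB for a
        // level-2 block and 1GB for a level-1 block.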
1867         descaddr &= !(page_size - 1);
1868         descaddr |= gva & (page_size - 1);
1869 
1870         Ok(descaddr)
1871     }
1872 
1873     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1874         self.acpi_address = Some(acpi_address);
1875     }
1876 
1877     pub(crate) fn set_interrupt_controller(
1878         &mut self,
1879         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1880     ) {
1881         self.interrupt_controller = Some(interrupt_controller);
1882     }
1883 
1884     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1885         &self.vcpus_kill_signalled
1886     }
1887 
1888     #[cfg(feature = "igvm")]
1889     pub(crate) fn get_cpuid_leaf(
1890         &self,
1891         cpu_id: u8,
1892         eax: u32,
1893         ecx: u32,
1894         xfem: u64,
1895         xss: u64,
1896     ) -> Result<[u32; 4]> {
1897         let leaf_info = self.vcpus[usize::from(cpu_id)]
1898             .lock()
1899             .unwrap()
1900             .vcpu
1901             .get_cpuid_values(eax, ecx, xfem, xss)
1902             .unwrap();
1903         Ok(leaf_info)
1904     }
1905 
1906     #[cfg(feature = "sev_snp")]
1907     pub(crate) fn sev_snp_enabled(&self) -> bool {
1908         self.sev_snp_enabled
1909     }
1910 
1911     pub(crate) fn nmi(&self) -> Result<()> {
1912         self.vcpus_kick_signalled.store(true, Ordering::SeqCst);
1913 
1914         for state in self.vcpu_states.iter() {
1915             state.signal_thread();
1916         }
1917 
1918         self.vcpus_kick_signalled.store(false, Ordering::SeqCst);
1919 
1920         Ok(())
1921     }
1922 }
1923 
1924 struct Cpu {
1925     cpu_id: u8,
1926     proximity_domain: u32,
1927     dynamic: bool,
1928     #[cfg(target_arch = "x86_64")]
1929     topology: Option<(u8, u8, u8)>,
1930 }
1931 
1932 #[cfg(target_arch = "x86_64")]
1933 const MADT_CPU_ENABLE_FLAG: usize = 0;
1934 
1935 #[cfg(target_arch = "x86_64")]
1936 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1937 
1938 impl Cpu {
1939     #[cfg(target_arch = "x86_64")]
1940     fn generate_mat(&self) -> Vec<u8> {
1941         let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);
1942 
1943         let lapic = LocalX2Apic {
1944             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1945             length: 16,
1946             processor_id: self.cpu_id.into(),
1947             apic_id: x2apic_id,
1948             flags: 1 << MADT_CPU_ENABLE_FLAG,
1949             _reserved: 0,
1950         };
1951 
1952         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1953         // SAFETY: mat_data is large enough to hold lapic
1954         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
1955 
1956         mat_data
1957     }
1958 }
1959 
1960 impl Aml for Cpu {
1961     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1962         #[cfg(target_arch = "x86_64")]
1963         let mat_data: Vec<u8> = self.generate_mat();
1964         #[allow(clippy::if_same_then_else)]
1965         if self.dynamic {
1966             aml::Device::new(
1967                 format!("C{:03X}", self.cpu_id).as_str().into(),
1968                 vec![
1969                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1970                     &aml::Name::new("_UID".into(), &self.cpu_id),
1971                     // Currently, AArch64 cannot support the following fields.
1972                     /*
1973                     _STA return value:
1974                     Bit [0] – Set if the device is present.
1975                     Bit [1] – Set if the device is enabled and decoding its resources.
1976                     Bit [2] – Set if the device should be shown in the UI.
1977                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1978                     Bit [4] – Set if the battery is present.
1979                     Bits [31:5] – Reserved (must be cleared).
1980                     */
1981                     #[cfg(target_arch = "x86_64")]
1982                     &aml::Method::new(
1983                         "_STA".into(),
1984                         0,
1985                         false,
1986                         // Call into CSTA method which will interrogate device
1987                         vec![&aml::Return::new(&aml::MethodCall::new(
1988                             "CSTA".into(),
1989                             vec![&self.cpu_id],
1990                         ))],
1991                     ),
1992                     &aml::Method::new(
1993                         "_PXM".into(),
1994                         0,
1995                         false,
1996                         vec![&aml::Return::new(&self.proximity_domain)],
1997                     ),
1998                     // The Linux kernel expects every CPU device to have a _MAT entry
1999                     // containing the LAPIC for this processor with the enabled bit set
2000                     // even if it is disabled in the MADT (non-boot CPU).
2001                     #[cfg(target_arch = "x86_64")]
2002                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
2003                     // Trigger CPU ejection
2004                     #[cfg(target_arch = "x86_64")]
2005                     &aml::Method::new(
2006                         "_EJ0".into(),
2007                         1,
2008                         false,
2009                         // Call into CEJ0 method which will actually eject device
2010                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
2011                     ),
2012                 ],
2013             )
2014             .to_aml_bytes(sink);
2015         } else {
2016             aml::Device::new(
2017                 format!("C{:03X}", self.cpu_id).as_str().into(),
2018                 vec![
2019                     &aml::Name::new("_HID".into(), &"ACPI0007"),
2020                     &aml::Name::new("_UID".into(), &self.cpu_id),
2021                     #[cfg(target_arch = "x86_64")]
2022                     &aml::Method::new(
2023                         "_STA".into(),
2024                         0,
2025                         false,
2026                     // Mark the CPU present; see the CSTA implementation for details.
2027                         vec![&aml::Return::new(&0xfu8)],
2028                     ),
2029                     &aml::Method::new(
2030                         "_PXM".into(),
2031                         0,
2032                         false,
2033                         vec![&aml::Return::new(&self.proximity_domain)],
2034                     ),
2035                     // The Linux kernel expects every CPU device to have a _MAT entry
2036                     // containing the LAPIC for this processor with the enabled bit set
2037                     // even if it is disabled in the MADT (non-boot CPU).
2038                     #[cfg(target_arch = "x86_64")]
2039                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
2040                 ],
2041             )
2042             .to_aml_bytes(sink);
2043         }
2044     }
2045 }
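// For reference, the dynamic variant above corresponds roughly to the
// following ASL (illustrative only; C001 stands for vCPU 1):
//
//     Device (C001) {
//         Name (_HID, "ACPI0007")
//         Name (_UID, One)
//         Method (_STA) { Return (CSTA (One)) }
//         Method (_PXM) { Return (Zero) }
//         Name (_MAT, Buffer () { /* Local x2APIC structure */ })
//         Method (_EJ0, 1) { CEJ0 (One) }
//     }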
2046 
2047 struct CpuNotify {
2048     cpu_id: u8,
2049 }
2050 
2051 impl Aml for CpuNotify {
2052     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2053         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
2054         aml::If::new(
2055             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
2056             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2057         )
2058         .to_aml_bytes(sink)
2059     }
2060 }
2061 
2062 struct CpuMethods {
2063     max_vcpus: u8,
2064     dynamic: bool,
2065 }
2066 
2067 impl Aml for CpuMethods {
2068     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2069         if self.dynamic {
2070             // CPU status method
2071             aml::Method::new(
2072                 "CSTA".into(),
2073                 1,
2074                 true,
2075                 vec![
2076                     // Take lock defined above
2077                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2078                     // Write CPU number (in first argument) to I/O port via field
2079                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2080                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2081                     // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for the meaning)
2082                     &aml::If::new(
2083                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
2084                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2085                     ),
2086                     // Release lock
2087                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2088                     // Return 0 or 0xf
2089                     &aml::Return::new(&aml::Local(0)),
2090                 ],
2091             )
2092             .to_aml_bytes(sink);
2093 
2094             let mut cpu_notifies = Vec::new();
2095             for cpu_id in 0..self.max_vcpus {
2096                 cpu_notifies.push(CpuNotify { cpu_id });
2097             }
2098 
2099             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
2100             for cpu_id in 0..self.max_vcpus {
2101                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
2102             }
2103 
2104             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
2105 
2106             aml::Method::new(
2107                 "CEJ0".into(),
2108                 1,
2109                 true,
2110                 vec![
2111                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2112                     // Write CPU number (in first argument) to I/O port via field
2113                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2114                     // Set CEJ0 bit
2115                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
2116                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2117                 ],
2118             )
2119             .to_aml_bytes(sink);
2120 
2121             aml::Method::new(
2122                 "CSCN".into(),
2123                 0,
2124                 true,
2125                 vec![
2126                     // Take lock defined above
2127                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2128                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2129                     &aml::While::new(
2130                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
2131                         vec![
2132                             // Write CPU number (in first argument) to I/O port via field
2133                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
2134                             // Check if CINS bit is set
2135                             &aml::If::new(
2136                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
2137                                 // Notify device if it is
2138                                 vec![
2139                                     &aml::MethodCall::new(
2140                                         "CTFY".into(),
2141                                         vec![&aml::Local(0), &aml::ONE],
2142                                     ),
2143                                     // Reset CINS bit
2144                                     &aml::Store::new(
2145                                         &aml::Path::new("\\_SB_.PRES.CINS"),
2146                                         &aml::ONE,
2147                                     ),
2148                                 ],
2149                             ),
2150                             // Check if CRMV bit is set
2151                             &aml::If::new(
2152                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
2153                                 // Notify device if it is (with the eject constant 0x3)
2154                                 vec![
2155                                     &aml::MethodCall::new(
2156                                         "CTFY".into(),
2157                                         vec![&aml::Local(0), &3u8],
2158                                     ),
2159                                     // Reset CRMV bit
2160                                     &aml::Store::new(
2161                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
2162                                         &aml::ONE,
2163                                     ),
2164                                 ],
2165                             ),
2166                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2167                         ],
2168                     ),
2169                     // Release lock
2170                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2171                 ],
2172             )
2173             .to_aml_bytes(sink)
2174         } else {
2175             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2176         }
2177     }
2178 }
2179 
2180 impl Aml for CpuManager {
2181     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2182         #[cfg(target_arch = "x86_64")]
2183         if let Some(acpi_address) = self.acpi_address {
2184             // CPU hotplug controller
2185             aml::Device::new(
2186                 "_SB_.PRES".into(),
2187                 vec![
2188                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2189                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2190                     // Mutex to protect concurrent access as we write to choose CPU and then read back status
2191                     &aml::Mutex::new("CPLK".into(), 0),
2192                     &aml::Name::new(
2193                         "_CRS".into(),
2194                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2195                             aml::AddressSpaceCacheable::NotCacheable,
2196                             true,
2197                             acpi_address.0,
2198                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2199                             None,
2200                         )]),
2201                     ),
2202                     // OpRegion and Fields map MMIO range into individual field values
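                    // As laid out by the two Field declarations below:
                    //   bytes 0-3:  CSEL (selected CPU index, DWord access)
                    //   byte  4:    CPEN/CINS/CRMV/CEJ0 bits (byte access)
                    //   byte  5:    CCMD command byte
                    //   bytes 8-11: CDAT command data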
2203                     &aml::OpRegion::new(
2204                         "PRST".into(),
2205                         aml::OpRegionSpace::SystemMemory,
2206                         &(acpi_address.0 as usize),
2207                         &CPU_MANAGER_ACPI_SIZE,
2208                     ),
2209                     &aml::Field::new(
2210                         "PRST".into(),
2211                         aml::FieldAccessType::Byte,
2212                         aml::FieldLockRule::NoLock,
2213                         aml::FieldUpdateRule::WriteAsZeroes,
2214                         vec![
2215                             aml::FieldEntry::Reserved(32),
2216                             aml::FieldEntry::Named(*b"CPEN", 1),
2217                             aml::FieldEntry::Named(*b"CINS", 1),
2218                             aml::FieldEntry::Named(*b"CRMV", 1),
2219                             aml::FieldEntry::Named(*b"CEJ0", 1),
2220                             aml::FieldEntry::Reserved(4),
2221                             aml::FieldEntry::Named(*b"CCMD", 8),
2222                         ],
2223                     ),
2224                     &aml::Field::new(
2225                         "PRST".into(),
2226                         aml::FieldAccessType::DWord,
2227                         aml::FieldLockRule::NoLock,
2228                         aml::FieldUpdateRule::Preserve,
2229                         vec![
2230                             aml::FieldEntry::Named(*b"CSEL", 32),
2231                             aml::FieldEntry::Reserved(32),
2232                             aml::FieldEntry::Named(*b"CDAT", 32),
2233                         ],
2234                     ),
2235                 ],
2236             )
2237             .to_aml_bytes(sink);
2238         }
2239 
2240         // CPU devices
2241         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2242         let cid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2243         // Bundle methods together under a common object
2244         let methods = CpuMethods {
2245             max_vcpus: self.config.max_vcpus,
2246             dynamic: self.dynamic,
2247         };
2248         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &cid, &methods];
2249 
2250         #[cfg(target_arch = "x86_64")]
2251         let topology = self.get_vcpu_topology();
2252         let mut cpu_devices = Vec::new();
2253         for cpu_id in 0..self.config.max_vcpus {
2254             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2255             let cpu_device = Cpu {
2256                 cpu_id,
2257                 proximity_domain,
2258                 dynamic: self.dynamic,
2259                 #[cfg(target_arch = "x86_64")]
2260                 topology,
2261             };
2262 
2263             cpu_devices.push(cpu_device);
2264         }
2265 
2266         for cpu_device in cpu_devices.iter() {
2267             cpu_data_inner.push(cpu_device);
2268         }
2269 
2270         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2271     }
2272 }
2273 
2274 impl Pausable for CpuManager {
2275     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2276         // Tell the vCPUs to pause themselves next time they exit
2277         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2278 
2279         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2280         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2281         // above.
2282         for state in self.vcpu_states.iter() {
2283             state.signal_thread();
2284         }
2285 
2286         for vcpu in self.vcpus.iter() {
2287             let mut vcpu = vcpu.lock().unwrap();
2288             vcpu.pause()?;
2289             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2290             if !self.config.kvm_hyperv {
2291                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2292                     MigratableError::Pause(anyhow!(
2293                         "Could not notify guest it has been paused {:?}",
2294                         e
2295                     ))
2296                 })?;
2297             }
2298         }
2299 
2300         // Each vCPU thread changes its paused state before parking; wait here for
2301         // every activated vCPU to change its state, ensuring they have all parked.
2302         for state in self.vcpu_states.iter() {
2303             if state.active() {
2304                 while !state.paused.load(Ordering::SeqCst) {
2305                     // To avoid a priority inversion with the vCPU thread
2306                     thread::sleep(std::time::Duration::from_millis(1));
2307                 }
2308             }
2309         }
2310 
2311         Ok(())
2312     }
2313 
2314     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2315         for vcpu in self.vcpus.iter() {
2316             vcpu.lock().unwrap().resume()?;
2317         }
2318 
2319         // Clear the vCPU pause boolean
2320         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2321 
2322         // Unpark all the vCPU threads.
2323         // Once unparked, the next thing they will do is check the pause
2324         // boolean. Since it is now false, they will exit their pause loop
2325         // and resume running the guest (VMX non-root operation on Intel).
2326         for state in self.vcpu_states.iter() {
2327             state.paused.store(false, Ordering::SeqCst);
2328             state.unpark_thread();
2329         }
2330         Ok(())
2331     }
2332 }
2333 
2334 impl Snapshottable for CpuManager {
2335     fn id(&self) -> String {
2336         CPU_MANAGER_SNAPSHOT_ID.to_string()
2337     }
2338 
2339     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2340         let mut cpu_manager_snapshot = Snapshot::default();
2341 
2342         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2343         for vcpu in &self.vcpus {
2344             let mut vcpu = vcpu.lock().unwrap();
2345             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2346         }
2347 
2348         Ok(cpu_manager_snapshot)
2349     }
2350 }
2351 
2352 impl Transportable for CpuManager {}
2353 impl Migratable for CpuManager {}
2354 
2355 #[cfg(feature = "guest_debug")]
2356 impl Debuggable for CpuManager {
2357     #[cfg(feature = "kvm")]
2358     fn set_guest_debug(
2359         &self,
2360         cpu_id: usize,
2361         addrs: &[GuestAddress],
2362         singlestep: bool,
2363     ) -> std::result::Result<(), DebuggableError> {
2364         self.vcpus[cpu_id]
2365             .lock()
2366             .unwrap()
2367             .vcpu
2368             .set_guest_debug(addrs, singlestep)
2369             .map_err(DebuggableError::SetDebug)
2370     }
2371 
2372     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2373         Ok(())
2374     }
2375 
2376     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2377         Ok(())
2378     }
2379 
2380     #[cfg(target_arch = "x86_64")]
2381     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2382         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2383         let gregs = self
2384             .get_regs(cpu_id as u8)
2385             .map_err(DebuggableError::ReadRegs)?;
2386         let regs = [
2387             gregs.get_rax(),
2388             gregs.get_rbx(),
2389             gregs.get_rcx(),
2390             gregs.get_rdx(),
2391             gregs.get_rsi(),
2392             gregs.get_rdi(),
2393             gregs.get_rbp(),
2394             gregs.get_rsp(),
2395             gregs.get_r8(),
2396             gregs.get_r9(),
2397             gregs.get_r10(),
2398             gregs.get_r11(),
2399             gregs.get_r12(),
2400             gregs.get_r13(),
2401             gregs.get_r14(),
2402             gregs.get_r15(),
2403         ];
2404 
2405         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2406         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2407         let eflags = gregs.get_rflags() as u32;
2408         let rip = gregs.get_rip();
2409 
2410         // Segment registers: CS, SS, DS, ES, FS, GS
2411         let sregs = self
2412             .get_sregs(cpu_id as u8)
2413             .map_err(DebuggableError::ReadRegs)?;
2414         let segments = X86SegmentRegs {
2415             cs: sregs.cs.selector as u32,
2416             ss: sregs.ss.selector as u32,
2417             ds: sregs.ds.selector as u32,
2418             es: sregs.es.selector as u32,
2419             fs: sregs.fs.selector as u32,
2420             gs: sregs.gs.selector as u32,
2421         };
2422 
2423         // TODO: Add other registers
2424 
2425         Ok(CoreRegs {
2426             regs,
2427             eflags,
2428             rip,
2429             segments,
2430             ..Default::default()
2431         })
2432     }
2433 
2434     #[cfg(target_arch = "aarch64")]
2435     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2436         let gregs = self
2437             .get_regs(cpu_id as u8)
2438             .map_err(DebuggableError::ReadRegs)?;
2439         Ok(CoreRegs {
2440             x: gregs.get_regs(),
2441             sp: gregs.get_sp(),
2442             pc: gregs.get_pc(),
2443             ..Default::default()
2444         })
2445     }
2446 
2447     #[cfg(target_arch = "x86_64")]
2448     fn write_regs(
2449         &self,
2450         cpu_id: usize,
2451         regs: &CoreRegs,
2452     ) -> std::result::Result<(), DebuggableError> {
2453         let orig_gregs = self
2454             .get_regs(cpu_id as u8)
2455             .map_err(DebuggableError::ReadRegs)?;
2456         let mut gregs = self.create_standard_regs(cpu_id as u8);
2457         gregs.set_rax(regs.regs[0]);
2458         gregs.set_rbx(regs.regs[1]);
2459         gregs.set_rcx(regs.regs[2]);
2460         gregs.set_rdx(regs.regs[3]);
2461         gregs.set_rsi(regs.regs[4]);
2462         gregs.set_rdi(regs.regs[5]);
2463         gregs.set_rbp(regs.regs[6]);
2464         gregs.set_rsp(regs.regs[7]);
2465         gregs.set_r8(regs.regs[8]);
2466         gregs.set_r9(regs.regs[9]);
2467         gregs.set_r10(regs.regs[10]);
2468         gregs.set_r11(regs.regs[11]);
2469         gregs.set_r12(regs.regs[12]);
2470         gregs.set_r13(regs.regs[13]);
2471         gregs.set_r14(regs.regs[14]);
2472         gregs.set_r15(regs.regs[15]);
2473         gregs.set_rip(regs.rip);
2474         // Update only the lower 32 bits of rflags (GDB exposes 32-bit eflags).
2475         gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));
2476 
2477         self.set_regs(cpu_id as u8, &gregs)
2478             .map_err(DebuggableError::WriteRegs)?;
2479 
2480         // Segment registers: CS, SS, DS, ES, FS, GS
2481         // Since GDB cares only about the selectors, we call get_sregs() first and update just those.
2482         let mut sregs = self
2483             .get_sregs(cpu_id as u8)
2484             .map_err(DebuggableError::ReadRegs)?;
2485         sregs.cs.selector = regs.segments.cs as u16;
2486         sregs.ss.selector = regs.segments.ss as u16;
2487         sregs.ds.selector = regs.segments.ds as u16;
2488         sregs.es.selector = regs.segments.es as u16;
2489         sregs.fs.selector = regs.segments.fs as u16;
2490         sregs.gs.selector = regs.segments.gs as u16;
2491 
2492         self.set_sregs(cpu_id as u8, &sregs)
2493             .map_err(DebuggableError::WriteRegs)?;
2494 
2495         // TODO: Add other registers
2496 
2497         Ok(())
2498     }
2499 
2500     #[cfg(target_arch = "aarch64")]
2501     fn write_regs(
2502         &self,
2503         cpu_id: usize,
2504         regs: &CoreRegs,
2505     ) -> std::result::Result<(), DebuggableError> {
2506         let mut gregs = self
2507             .get_regs(cpu_id as u8)
2508             .map_err(DebuggableError::ReadRegs)?;
2509 
2510         gregs.set_regs(regs.x);
2511         gregs.set_sp(regs.sp);
2512         gregs.set_pc(regs.pc);
2513 
2514         self.set_regs(cpu_id as u8, &gregs)
2515             .map_err(DebuggableError::WriteRegs)?;
2516 
2517         Ok(())
2518     }
2519 
2520     fn read_mem(
2521         &self,
2522         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2523         cpu_id: usize,
2524         vaddr: GuestAddress,
2525         len: usize,
2526     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2527         let mut buf = vec![0; len];
2528         let mut total_read = 0_u64;
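        // Reads are chunked so that no single access crosses a page boundary,
        // since contiguous GVAs may map to discontiguous GPAs. For example, a
        // 0x1000-byte read starting at page offset 0xF00 is split into a
        // 0x100-byte and a 0xF00-byte access.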
2529 
2530         while total_read < len as u64 {
2531             let gaddr = vaddr.0 + total_read;
2532             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2533                 Ok(paddr) => paddr,
2534                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2535                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2536             };
2537             let psize = arch::PAGE_SIZE as u64;
2538             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2539             guest_memory
2540                 .memory()
2541                 .read(
2542                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2543                     GuestAddress(paddr),
2544                 )
2545                 .map_err(DebuggableError::ReadMem)?;
2546             total_read += read_len;
2547         }
2548         Ok(buf)
2549     }
2550 
2551     fn write_mem(
2552         &self,
2553         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2554         cpu_id: usize,
2555         vaddr: &GuestAddress,
2556         data: &[u8],
2557     ) -> std::result::Result<(), DebuggableError> {
2558         let mut total_written = 0_u64;
2559 
2560         while total_written < data.len() as u64 {
2561             let gaddr = vaddr.0 + total_written;
2562             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2563                 Ok(paddr) => paddr,
2564                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2565                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2566             };
2567             let psize = arch::PAGE_SIZE as u64;
2568             let write_len = std::cmp::min(
2569                 data.len() as u64 - total_written,
2570                 psize - (paddr & (psize - 1)),
2571             );
2572             guest_memory
2573                 .memory()
2574                 .write(
2575                     &data[total_written as usize..total_written as usize + write_len as usize],
2576                     GuestAddress(paddr),
2577                 )
2578                 .map_err(DebuggableError::WriteMem)?;
2579             total_written += write_len;
2580         }
2581         Ok(())
2582     }
2583 
2584     fn active_vcpus(&self) -> usize {
2585         self.present_vcpus() as usize
2586     }
2587 }
2588 
2589 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2590 impl Elf64Writable for CpuManager {}
2591 
2592 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2593 impl CpuElf64Writable for CpuManager {
2594     fn cpu_write_elf64_note(
2595         &mut self,
2596         dump_state: &DumpState,
2597     ) -> std::result::Result<(), GuestDebuggableError> {
2598         let mut coredump_file = dump_state.file.as_ref().unwrap();
2599         for vcpu in &self.vcpus {
2600             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2601             let mut pos: usize = 0;
2602             let mut buf = vec![0; note_size as usize];
2603             let descsz = size_of::<X86_64ElfPrStatus>();
2604             let vcpu_id = vcpu.lock().unwrap().id;
2605 
2606             let note = Elf64_Nhdr {
2607                 n_namesz: COREDUMP_NAME_SIZE,
2608                 n_descsz: descsz as u32,
2609                 n_type: NT_PRSTATUS,
2610             };
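            // ELF note layout: the Elf64_Nhdr header, then the owner name
            // ("CORE") padded to a 4-byte boundary, then the descriptor
            // (here the X86_64ElfPrStatus payload).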
2611 
2612             let bytes: &[u8] = note.as_slice();
2613             buf.splice(0.., bytes.to_vec());
2614             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2615             buf.resize(pos + 4, 0);
2616             buf.splice(pos.., "CORE".to_string().into_bytes());
2617 
2618             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2619             buf.resize(pos + 32 + 4, 0);
2620             let pid = vcpu_id as u64;
2621             let bytes: &[u8] = pid.as_slice();
2622             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2623 
2624             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2625 
2626             let orig_rax: u64 = 0;
2627             let gregs = self.vcpus[usize::from(vcpu_id)]
2628                 .lock()
2629                 .unwrap()
2630                 .vcpu
2631                 .get_regs()
2632                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2633 
2634             let regs1 = [
2635                 gregs.get_r15(),
2636                 gregs.get_r14(),
2637                 gregs.get_r13(),
2638                 gregs.get_r12(),
2639                 gregs.get_rbp(),
2640                 gregs.get_rbx(),
2641                 gregs.get_r11(),
2642                 gregs.get_r10(),
2643             ];
2644             let regs2 = [
2645                 gregs.get_r9(),
2646                 gregs.get_r8(),
2647                 gregs.get_rax(),
2648                 gregs.get_rcx(),
2649                 gregs.get_rdx(),
2650                 gregs.get_rsi(),
2651                 gregs.get_rdi(),
2652                 orig_rax,
2653             ];
2654 
2655             let sregs = self.vcpus[usize::from(vcpu_id)]
2656                 .lock()
2657                 .unwrap()
2658                 .vcpu
2659                 .get_sregs()
2660                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2661 
2662             debug!(
2663                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2664                 gregs.get_rip(),
2665                 gregs.get_rsp(),
2666                 sregs.gs.base,
2667                 sregs.cs.selector,
2668                 sregs.ss.selector,
2669                 sregs.ds.selector,
2670             );
2671 
2672             let regs = X86_64UserRegs {
2673                 regs1,
2674                 regs2,
2675                 rip: gregs.get_rip(),
2676                 cs: sregs.cs.selector as u64,
2677                 eflags: gregs.get_rflags(),
2678                 rsp: gregs.get_rsp(),
2679                 ss: sregs.ss.selector as u64,
2680                 fs_base: sregs.fs.base,
2681                 gs_base: sregs.gs.base,
2682                 ds: sregs.ds.selector as u64,
2683                 es: sregs.es.selector as u64,
2684                 fs: sregs.fs.selector as u64,
2685                 gs: sregs.gs.selector as u64,
2686             };
2687 
2689             let bytes: &[u8] = regs.as_slice();
2690             buf.resize(note_size as usize, 0);
2691             buf.splice(pos.., bytes.to_vec());
2692             buf.resize(note_size as usize, 0);
2693 
2694             coredump_file
2695                 .write_all(&buf)
2696                 .map_err(GuestDebuggableError::CoredumpFile)?;
2697         }
2698 
2699         Ok(())
2700     }
2701 
2702     fn cpu_write_vmm_note(
2703         &mut self,
2704         dump_state: &DumpState,
2705     ) -> std::result::Result<(), GuestDebuggableError> {
2706         let mut coredump_file = dump_state.file.as_ref().unwrap();
2707         for vcpu in &self.vcpus {
2708             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2709             let mut pos: usize = 0;
2710             let mut buf = vec![0; note_size as usize];
2711             let descsz = size_of::<DumpCpusState>();
2712             let vcpu_id = vcpu.lock().unwrap().id;
2713 
2714             let note = Elf64_Nhdr {
2715                 n_namesz: COREDUMP_NAME_SIZE,
2716                 n_descsz: descsz as u32,
2717                 n_type: 0,
2718             };
2719 
2720             let bytes: &[u8] = note.as_slice();
2721             buf.splice(0.., bytes.to_vec());
2722             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2723 
2724             buf.resize(pos + 4, 0);
2725             buf.splice(pos.., "QEMU".to_string().into_bytes());
2726 
2727             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2728 
2729             let gregs = self.vcpus[usize::from(vcpu_id)]
2730                 .lock()
2731                 .unwrap()
2732                 .vcpu
2733                 .get_regs()
2734                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2735 
2736             let regs1 = [
2737                 gregs.get_rax(),
2738                 gregs.get_rbx(),
2739                 gregs.get_rcx(),
2740                 gregs.get_rdx(),
2741                 gregs.get_rsi(),
2742                 gregs.get_rdi(),
2743                 gregs.get_rsp(),
2744                 gregs.get_rbp(),
2745             ];
2746 
2747             let regs2 = [
2748                 gregs.get_r8(),
2749                 gregs.get_r9(),
2750                 gregs.get_r10(),
2751                 gregs.get_r11(),
2752                 gregs.get_r12(),
2753                 gregs.get_r13(),
2754                 gregs.get_r14(),
2755                 gregs.get_r15(),
2756             ];
2757 
2758             let sregs = self.vcpus[usize::from(vcpu_id)]
2759                 .lock()
2760                 .unwrap()
2761                 .vcpu
2762                 .get_sregs()
2763                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2764 
2765             let mut msrs = vec![MsrEntry {
2766                 index: msr_index::MSR_KERNEL_GS_BASE,
2767                 ..Default::default()
2768             }];
2769 
2770             self.vcpus[vcpu_id as usize]
2771                 .lock()
2772                 .unwrap()
2773                 .vcpu
2774                 .get_msrs(&mut msrs)
2775                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2776             let kernel_gs_base = msrs[0].data;
2777 
2778             let cs = CpuSegment::new(sregs.cs);
2779             let ds = CpuSegment::new(sregs.ds);
2780             let es = CpuSegment::new(sregs.es);
2781             let fs = CpuSegment::new(sregs.fs);
2782             let gs = CpuSegment::new(sregs.gs);
2783             let ss = CpuSegment::new(sregs.ss);
2784             let ldt = CpuSegment::new(sregs.ldt);
2785             let tr = CpuSegment::new(sregs.tr);
2786             let gdt = CpuSegment::new_from_table(sregs.gdt);
2787             let idt = CpuSegment::new_from_table(sregs.idt);
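                 // Control registers in QEMUCPUState order: CR0, CR8, CR2, CR3, CR4.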
2788             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2789             let regs = DumpCpusState {
2790                 version: 1,
2791                 size: size_of::<DumpCpusState>() as u32,
2792                 regs1,
2793                 regs2,
2794                 rip: gregs.get_rip(),
2795                 rflags: gregs.get_rflags(),
2796                 cs,
2797                 ds,
2798                 es,
2799                 fs,
2800                 gs,
2801                 ss,
2802                 ldt,
2803                 tr,
2804                 gdt,
2805                 idt,
2806                 cr,
2807                 kernel_gs_base,
2808             };
2809 
2810             let bytes: &[u8] = regs.as_slice();
2811             buf.resize(note_size as usize, 0);
2812             buf.splice(pos.., bytes.to_vec());
2813             buf.resize(note_size as usize, 0);
2814 
2815             coredump_file
2816                 .write_all(&buf)
2817                 .map_err(GuestDebuggableError::CoredumpFile)?;
2818         }
2819 
2820         Ok(())
2821     }
2822 }
2823 
2824 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2825 #[cfg(test)]
2826 mod tests {
2827     use arch::layout::{BOOT_STACK_POINTER, ZERO_PAGE_START};
2828     use arch::x86_64::interrupts::*;
2829     use arch::x86_64::regs::*;
2830     use hypervisor::arch::x86::{FpuState, LapicState};
2831     use hypervisor::StandardRegisters;
2832     use linux_loader::loader::bootparam::setup_header;
2833 
2834     #[test]
2835     fn test_setlint() {
2836         let hv = hypervisor::new().unwrap();
2837         let vm = hv.create_vm().expect("new VM fd creation failed");
2838         hv.check_required_extensions().unwrap();
2839         // Calling get_lapic will fail if no irqchip has been created beforehand.
2840         vm.create_irq_chip().unwrap();
2841         let vcpu = vm.create_vcpu(0, None).unwrap();
2842         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2843 
2844         // Compute the value that is expected to represent LVT0 and LVT1.
2845         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2846         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2847         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2848         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2849 
2850         set_lint(&vcpu).unwrap();
2851 
2852         // Compute the value that represents LVT0 and LVT1 after set_lint.
2853         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2854         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2855         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2856         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2857         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2858     }
2859 
2860     #[test]
2861     fn test_setup_fpu() {
2862         let hv = hypervisor::new().unwrap();
2863         let vm = hv.create_vm().expect("new VM fd creation failed");
2864         let vcpu = vm.create_vcpu(0, None).unwrap();
2865         setup_fpu(&vcpu).unwrap();
2866 
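             // 0x37f and 0x1f80 are the architectural reset defaults for the x87
             // control word (FCW) and MXCSR respectively.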
2867         let expected_fpu: FpuState = FpuState {
2868             fcw: 0x37f,
2869             mxcsr: 0x1f80,
2870             ..Default::default()
2871         };
2872         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2873         // TODO: auto-generate KVM-related structures with PartialEq derived.
2874         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2875         // Setting the mxcsr register from FpuState inside setup_fpu has no effect:
2876         // see 'kvm_arch_vcpu_ioctl_set_fpu' in arch/x86/kvm/x86.c. The value read
2877         // back stays 0, so the assert below would fail. TODO: decide whether the
2878         // mxcsr check should be dropped entirely.
2879         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2880     }
2881 
2882     #[test]
2883     fn test_setup_msrs() {
2884         use hypervisor::arch::x86::{msr_index, MsrEntry};
2885 
2886         let hv = hypervisor::new().unwrap();
2887         let vm = hv.create_vm().expect("new VM fd creation failed");
2888         let vcpu = vm.create_vcpu(0, None).unwrap();
2889         setup_msrs(&vcpu).unwrap();
2890 
2891         // This test checks against the last MSR entry configured (the tenth one).
2892         // See boot_msr_entries for details.
2893         let mut msrs = vec![MsrEntry {
2894             index: msr_index::MSR_IA32_MISC_ENABLE,
2895             ..Default::default()
2896         }];
2897 
2898         // get_msrs returns the number of MSRs it succeeded in reading. We only want
2899         // to read one in this test.
2900         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2901         assert_eq!(read_msrs, 1);
2902 
2903         // These entries were set up by setup_msrs. We need to assert that the tenth
2904         // one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data
2905         // we expect.
2906         let entry_vec = vcpu.boot_msr_entries();
2907         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2908     }
2909 
2910     #[test]
2911     fn test_setup_regs_for_pvh() {
2912         let hv = hypervisor::new().unwrap();
2913         let vm = hv.create_vm().expect("new VM fd creation failed");
2914         let vcpu = vm.create_vcpu(0, None).unwrap();
2915 
2916         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
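             // Bit 1 of RFLAGS is reserved and always set; the PVH boot ABI also
             // expects RBX to hold the start_info structure's address.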
2917         expected_regs.set_rflags(0x0000000000000002u64);
2918         expected_regs.set_rbx(arch::layout::PVH_INFO_START.0);
2919         expected_regs.set_rip(1);
2920 
2921         setup_regs(
2922             &vcpu,
2923             arch::EntryPoint {
2924                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2925                 setup_header: None,
2926             },
2927         )
2928         .unwrap();
2929 
2930         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2931         assert_eq!(actual_regs, expected_regs);
2932     }
2933 
2934     #[test]
2935     fn test_setup_regs_for_bzimage() {
2936         let hv = hypervisor::new().unwrap();
2937         let vm = hv.create_vm().expect("new VM fd creation failed");
2938         let vcpu = vm.create_vcpu(0, None).unwrap();
2939 
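             // The Linux boot protocol expects RSI to point at the zero page
             // (struct boot_params) and RSP at a usable boot stack.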
2940         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2941         expected_regs.set_rflags(0x0000000000000002u64);
2942         expected_regs.set_rip(1);
2943         expected_regs.set_rsp(BOOT_STACK_POINTER.0);
2944         expected_regs.set_rsi(ZERO_PAGE_START.0);
2945 
2946         setup_regs(
2947             &vcpu,
2948             arch::EntryPoint {
2949                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2950                 setup_header: Some(setup_header {
2951                     ..Default::default()
2952                 }),
2953             },
2954         )
2955         .unwrap();
2956 
2957         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2958         assert_eq!(actual_regs, expected_regs);
2959     }
2960 }
2961 
2962 #[cfg(target_arch = "aarch64")]
2963 #[cfg(test)]
2964 mod tests {
2965     #[cfg(feature = "kvm")]
2966     use std::mem;
2967 
2968     use arch::layout;
2969     use hypervisor::arch::aarch64::regs::MPIDR_EL1;
2970     #[cfg(feature = "kvm")]
2971     use hypervisor::kvm::aarch64::is_system_register;
2972     #[cfg(feature = "kvm")]
2973     use hypervisor::kvm::kvm_bindings::{
2974         user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2975     };
2976     use hypervisor::HypervisorCpuError;
2977     #[cfg(feature = "kvm")]
2978     use hypervisor::{arm64_core_reg_id, offset_of};
2979 
2980     #[test]
2981     fn test_setup_regs() {
2982         let hv = hypervisor::new().unwrap();
2983         let vm = hv.create_vm().unwrap();
2984         let vcpu = vm.create_vcpu(0, None).unwrap();
2985 
2986         // Must fail while the vCPU is not yet initialized.
2987         vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap_err();
2988 
2989         let mut kvi = vcpu.create_vcpu_init();
2990         vm.get_preferred_target(&mut kvi).unwrap();
2991         vcpu.vcpu_init(&kvi).unwrap();
2992 
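             // Per the arm64 Linux boot protocol, x0 carries the DTB (FDT) address.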
2993         vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap();
2994     }
2995 
2996     #[test]
2997     fn test_read_mpidr() {
2998         let hv = hypervisor::new().unwrap();
2999         let vm = hv.create_vm().unwrap();
3000         let vcpu = vm.create_vcpu(0, None).unwrap();
3001         let mut kvi = vcpu.create_vcpu_init();
3002         vm.get_preferred_target(&mut kvi).unwrap();
3003 
3004         // Must fail while the vCPU is not yet initialized.
3005         vcpu.get_sys_reg(MPIDR_EL1).unwrap_err();
3006 
3007         vcpu.vcpu_init(&kvi).unwrap();
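             // Bit 31 of MPIDR_EL1 is RES1 and the affinity fields of vCPU 0 are
             // zero, hence the expected value 0x80000000.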
3008         assert_eq!(vcpu.get_sys_reg(MPIDR_EL1).unwrap(), 0x80000000);
3009     }
3010 
3011     #[cfg(feature = "kvm")]
3012     #[test]
3013     fn test_is_system_register() {
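             // Core register ids encode KVM_REG_ARM_CORE plus an offset into
             // user_pt_regs; system register ids carry KVM_REG_ARM64_SYSREG.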
3014         let offset = offset_of!(user_pt_regs, pc);
3015         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
3016         assert!(!is_system_register(regid));
3017         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
3018         assert!(is_system_register(regid));
3019     }
3020 
3021     #[test]
3022     fn test_save_restore_core_regs() {
3023         let hv = hypervisor::new().unwrap();
3024         let vm = hv.create_vm().unwrap();
3025         let vcpu = vm.create_vcpu(0, None).unwrap();
3026         let mut kvi = vcpu.create_vcpu_init();
3027         vm.get_preferred_target(&mut kvi).unwrap();
3028 
3029         fn hypervisor_cpu_error_to_raw_os_error(error: &anyhow::Error) -> libc::c_int {
3030             let cause = error.chain().next().expect("should have root cause");
3031             cause
3032                 .downcast_ref::<vmm_sys_util::errno::Error>()
3033                 .unwrap_or_else(|| panic!("should be errno::Error but is: {cause:?}"))
3034                 .errno() as libc::c_int
3035         }
3036 
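             // KVM rejects vCPU ioctls with ENOEXEC until KVM_ARM_VCPU_INIT has
             // been performed.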
3037         // test get_regs
3038         {
3039             let error = vcpu
3040                 .get_regs()
3041                 .expect_err("should fail as vCPU is not initialized");
3042             let io_error_raw = if let HypervisorCpuError::GetAarchCoreRegister(error) = error {
3043                 hypervisor_cpu_error_to_raw_os_error(&error)
3044             } else {
3045                 panic!("get_regs() must fail with error HypervisorCpuError::GetAarchCoreRegister");
3046             };
3047             assert_eq!(io_error_raw, libc::ENOEXEC);
3048         }
3049 
3050         // test set_regs
3051         let mut state = vcpu.create_standard_regs();
3052         {
3053             let error = vcpu
3054                 .set_regs(&state)
3055                 .expect_err("should fail as vCPU is not initialized");
3056             let io_error_raw = if let HypervisorCpuError::SetAarchCoreRegister(error) = error {
3057                 hypervisor_cpu_error_to_raw_os_error(&error)
3058             } else {
3059                 panic!("set_regs() must fail with error HypervisorCpuError::SetAarchCoreRegister");
3060             };
3061             assert_eq!(io_error_raw, libc::ENOEXEC);
3062         }
3063 
3064         vcpu.vcpu_init(&kvi).unwrap();
3065         state = vcpu.get_regs().unwrap();
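             // 0x3C5 = PSR_MODE_EL1h with the D, A, I and F bits masked, the reset
             // PSTATE value KVM programs for a newly initialized vCPU.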
3066         assert_eq!(state.get_pstate(), 0x3C5);
3067 
3068         vcpu.set_regs(&state).unwrap();
3069     }
3070 
3071     #[test]
3072     fn test_get_set_mpstate() {
3073         let hv = hypervisor::new().unwrap();
3074         let vm = hv.create_vm().unwrap();
3075         let vcpu = vm.create_vcpu(0, None).unwrap();
3076         let mut kvi = vcpu.create_vcpu_init();
3077         vm.get_preferred_target(&mut kvi).unwrap();
3078 
3079         let state = vcpu.get_mp_state().unwrap();
3080         vcpu.set_mp_state(state).unwrap();
3081     }
3082 }
3083