xref: /cloud-hypervisor/vmm/src/cpu.rs (revision 5a9dd7489cdd27ad83b278ae94609214350b07b1)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::CpusConfig;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use crate::coredump::{
17     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
18     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
19     NT_PRSTATUS,
20 };
21 #[cfg(feature = "guest_debug")]
22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
23 #[cfg(target_arch = "x86_64")]
24 use crate::memory_manager::MemoryManager;
25 use crate::seccomp_filters::{get_seccomp_filter, Thread};
26 #[cfg(target_arch = "x86_64")]
27 use crate::vm::physical_bits;
28 use crate::GuestMemoryMmap;
29 use crate::CPU_MANAGER_SNAPSHOT_ID;
30 use acpi_tables::{aml, sdt::Sdt, Aml};
31 use anyhow::anyhow;
32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
33 use arch::aarch64::regs;
34 use arch::EntryPoint;
35 use arch::NumaNodes;
36 #[cfg(target_arch = "aarch64")]
37 use devices::gic::Gic;
38 use devices::interrupt_controller::InterruptController;
39 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
40 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
41 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
42 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
43 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
44 use hypervisor::aarch64::StandardRegisters;
45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
46 use hypervisor::arch::x86::msr_index;
47 #[cfg(target_arch = "x86_64")]
48 use hypervisor::arch::x86::CpuIdEntry;
49 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
50 use hypervisor::arch::x86::MsrEntry;
51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
52 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
53 #[cfg(target_arch = "aarch64")]
54 use hypervisor::kvm::kvm_bindings;
55 #[cfg(all(target_arch = "aarch64", feature = "kvm"))]
56 use hypervisor::kvm::kvm_ioctls::Cap;
57 #[cfg(feature = "tdx")]
58 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
59 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
60 use libc::{c_void, siginfo_t};
61 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
62 use linux_loader::elf::Elf64_Nhdr;
63 use seccompiler::{apply_filter, SeccompAction};
64 use std::collections::BTreeMap;
65 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
66 use std::io::Write;
67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
68 use std::mem::size_of;
69 use std::os::unix::thread::JoinHandleExt;
70 use std::sync::atomic::{AtomicBool, Ordering};
71 use std::sync::{Arc, Barrier, Mutex};
72 use std::{cmp, io, result, thread};
73 use thiserror::Error;
74 use tracer::trace_scoped;
75 use vm_device::BusDevice;
76 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
77 use vm_memory::ByteValued;
78 #[cfg(feature = "guest_debug")]
79 use vm_memory::{Bytes, GuestAddressSpace};
80 use vm_memory::{GuestAddress, GuestMemoryAtomic};
81 use vm_migration::{
82     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
83     Transportable,
84 };
85 use vmm_sys_util::eventfd::EventFd;
86 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
87 use zerocopy::AsBytes;
88 
89 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
90 /// Extract the specified bits of a 64-bit integer.
91 /// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
92 /// the following expression should return 3 (`0b11`):
93 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
94 ///
95 macro_rules! extract_bits_64 {
96     ($value: tt, $offset: tt, $length: tt) => {
97         ($value >> $offset) & (!0u64 >> (64 - $length))
98     };
99 }
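// Worked example (illustrative only, not part of the original sources): extracting
// 2 bits starting at offset 1 of 0b0000_0110 first shifts the value right by one
// (giving 0b11) and then masks it with `!0u64 >> 62` (also 0b11), so:
//
//     assert_eq!(extract_bits_64!(0b0000_0110u64, 1, 2), 0b11);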
100 
101 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
102 
103 #[derive(Debug, Error)]
104 pub enum Error {
105     #[error("Error creating vCPU: {0}")]
106     VcpuCreate(#[source] anyhow::Error),
107 
108     #[error("Error running bCPU: {0}")]
109     VcpuRun(#[source] anyhow::Error),
110 
111     #[error("Error spawning vCPU thread: {0}")]
112     VcpuSpawn(#[source] io::Error),
113 
114     #[error("Error generating common CPUID: {0}")]
115     CommonCpuId(#[source] arch::Error),
116 
117     #[error("Error configuring vCPU: {0}")]
118     VcpuConfiguration(#[source] arch::Error),
119 
120     #[cfg(target_arch = "aarch64")]
121     #[error("Error fetching preferred target: {0}")]
122     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
123 
124     #[cfg(target_arch = "aarch64")]
125     #[error("Error initialising vCPU: {0}")]
126     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
127 
128     #[error("Failed to join on vCPU threads: {0:?}")]
129     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
130 
131     #[error("Error adding CpuManager to MMIO bus: {0}")]
132     BusError(#[source] vm_device::BusError),
133 
134     #[error("Requested vCPUs exceed maximum")]
135     DesiredVCpuCountExceedsMax,
136 
137     #[error("Cannot create seccomp filter: {0}")]
138     CreateSeccompFilter(#[source] seccompiler::Error),
139 
140     #[error("Cannot apply seccomp filter: {0}")]
141     ApplySeccompFilter(#[source] seccompiler::Error),
142 
143     #[error("Error starting vCPU after restore: {0}")]
144     StartRestoreVcpu(#[source] anyhow::Error),
145 
146     #[error("Unexpected VmExit")]
147     UnexpectedVmExit,
148 
149     #[error("Failed to allocate MMIO address for CpuManager")]
150     AllocateMmmioAddress,
151 
152     #[cfg(feature = "tdx")]
153     #[error("Error initializing TDX: {0}")]
154     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
155 
156     #[cfg(target_arch = "aarch64")]
157     #[error("Error initializing PMU: {0}")]
158     InitPmu(#[source] hypervisor::HypervisorCpuError),
159 
160     #[cfg(feature = "guest_debug")]
161     #[error("Error during CPU debug: {0}")]
162     CpuDebug(#[source] hypervisor::HypervisorCpuError),
163 
164     #[cfg(feature = "guest_debug")]
165     #[error("Error translating virtual address: {0}")]
166     TranslateVirtualAddress(#[source] anyhow::Error),
167 
168     #[cfg(target_arch = "x86_64")]
169     #[error("Error setting up AMX: {0}")]
170     AmxEnable(#[source] anyhow::Error),
171 
172     #[error("Maximum number of vCPUs exceeds host limit")]
173     MaximumVcpusExceeded,
174 }
175 pub type Result<T> = result::Result<T, Error>;
176 
177 #[cfg(target_arch = "x86_64")]
178 #[allow(dead_code)]
179 #[repr(packed)]
180 #[derive(AsBytes)]
181 struct LocalApic {
182     pub r#type: u8,
183     pub length: u8,
184     pub processor_id: u8,
185     pub apic_id: u8,
186     pub flags: u32,
187 }
188 
189 #[allow(dead_code)]
190 #[repr(packed)]
191 #[derive(Default, AsBytes)]
192 struct Ioapic {
193     pub r#type: u8,
194     pub length: u8,
195     pub ioapic_id: u8,
196     _reserved: u8,
197     pub apic_address: u32,
198     pub gsi_base: u32,
199 }
200 
201 #[cfg(target_arch = "aarch64")]
202 #[allow(dead_code)]
203 #[repr(packed)]
204 #[derive(AsBytes)]
205 struct GicC {
206     pub r#type: u8,
207     pub length: u8,
208     pub reserved0: u16,
209     pub cpu_interface_number: u32,
210     pub uid: u32,
211     pub flags: u32,
212     pub parking_version: u32,
213     pub performance_interrupt: u32,
214     pub parked_address: u64,
215     pub base_address: u64,
216     pub gicv_base_address: u64,
217     pub gich_base_address: u64,
218     pub vgic_interrupt: u32,
219     pub gicr_base_address: u64,
220     pub mpidr: u64,
221     pub proc_power_effi_class: u8,
222     pub reserved1: u8,
223     pub spe_overflow_interrupt: u16,
224 }
225 
226 #[cfg(target_arch = "aarch64")]
227 #[allow(dead_code)]
228 #[repr(packed)]
229 #[derive(AsBytes)]
230 struct GicD {
231     pub r#type: u8,
232     pub length: u8,
233     pub reserved0: u16,
234     pub gic_id: u32,
235     pub base_address: u64,
236     pub global_irq_base: u32,
237     pub version: u8,
238     pub reserved1: [u8; 3],
239 }
240 
241 #[cfg(target_arch = "aarch64")]
242 #[allow(dead_code)]
243 #[repr(packed)]
244 #[derive(AsBytes)]
245 struct GicR {
246     pub r#type: u8,
247     pub length: u8,
248     pub reserved: u16,
249     pub base_address: u64,
250     pub range_length: u32,
251 }
252 
253 #[cfg(target_arch = "aarch64")]
254 #[allow(dead_code)]
255 #[repr(packed)]
256 #[derive(AsBytes)]
257 struct GicIts {
258     pub r#type: u8,
259     pub length: u8,
260     pub reserved0: u16,
261     pub translation_id: u32,
262     pub base_address: u64,
263     pub reserved1: u32,
264 }
265 
266 #[cfg(target_arch = "aarch64")]
267 #[allow(dead_code)]
268 #[repr(packed)]
269 #[derive(AsBytes)]
270 struct ProcessorHierarchyNode {
271     pub r#type: u8,
272     pub length: u8,
273     pub reserved: u16,
274     pub flags: u32,
275     pub parent: u32,
276     pub acpi_processor_id: u32,
277     pub num_private_resources: u32,
278 }
279 
280 #[allow(dead_code)]
281 #[repr(packed)]
282 #[derive(Default, AsBytes)]
283 struct InterruptSourceOverride {
284     pub r#type: u8,
285     pub length: u8,
286     pub bus: u8,
287     pub source: u8,
288     pub gsi: u32,
289     pub flags: u16,
290 }
291 
292 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
293 macro_rules! round_up {
294     ($n:expr,$d:expr) => {
295         (($n / ($d + 1)) + 1) * $d
296     };
297 }
298 
299 /// A wrapper around creating and using a hypervisor-based vCPU.
300 pub struct Vcpu {
301     // The hypervisor abstracted CPU.
302     vcpu: Arc<dyn hypervisor::Vcpu>,
303     id: u8,
304     #[cfg(target_arch = "aarch64")]
305     mpidr: u64,
306     saved_state: Option<CpuState>,
307 }
308 
309 impl Vcpu {
310     /// Constructs a new VCPU for `vm`.
311     ///
312     /// # Arguments
313     ///
314     /// * `id` - Represents the CPU number between [0, max vcpus).
315     /// * `vm` - The virtual machine this vcpu will get attached to.
316     /// * `vm_ops` - Optional object for exit handling.
317     pub fn new(
318         id: u8,
319         vm: &Arc<dyn hypervisor::Vm>,
320         vm_ops: Option<Arc<dyn VmOps>>,
321     ) -> Result<Self> {
322         let vcpu = vm
323             .create_vcpu(id, vm_ops)
324             .map_err(|e| Error::VcpuCreate(e.into()))?;
325         // Initially the cpuid per vCPU is the one supported by this VM.
326         Ok(Vcpu {
327             vcpu,
328             id,
329             #[cfg(target_arch = "aarch64")]
330             mpidr: 0,
331             saved_state: None,
332         })
333     }
334 
335     /// Configures a vCPU; should be called once per vCPU after it is created.
336     ///
337     /// # Arguments
338     ///
339     /// * `boot_setup` - Optional kernel entry point and the guest memory to boot from.
340     /// * `cpuid` - (x86_64) CPUID entries exposed to the guest.
341     /// * `kvm_hyperv` - (x86_64) Whether KVM Hyper-V emulation is enabled.
342     pub fn configure(
343         &mut self,
344         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
345         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
346         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
347         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
348     ) -> Result<()> {
349         #[cfg(target_arch = "aarch64")]
350         {
351             self.init(vm)?;
352             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
353                 .map_err(Error::VcpuConfiguration)?;
354         }
355         info!("Configuring vCPU: cpu_id = {}", self.id);
356         #[cfg(target_arch = "x86_64")]
357         arch::configure_vcpu(&self.vcpu, self.id, boot_setup, cpuid, kvm_hyperv)
358             .map_err(Error::VcpuConfiguration)?;
359 
360         Ok(())
361     }
362 
363     /// Gets the MPIDR register value.
364     #[cfg(target_arch = "aarch64")]
365     pub fn get_mpidr(&self) -> u64 {
366         self.mpidr
367     }
368 
369     /// Gets the saved vCPU state.
370     #[cfg(target_arch = "aarch64")]
371     pub fn get_saved_state(&self) -> Option<CpuState> {
372         self.saved_state.clone()
373     }
374 
375     /// Initializes an aarch64 specific vcpu for booting Linux.
376     #[cfg(target_arch = "aarch64")]
377     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
378         let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
379 
380         // This reads back the kernel's preferred target type.
381         vm.get_preferred_target(&mut kvi)
382             .map_err(Error::VcpuArmPreferredTarget)?;
383         // We already checked that the capability is supported.
384         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
385         if vm
386             .as_any()
387             .downcast_ref::<hypervisor::kvm::KvmVm>()
388             .unwrap()
389             .check_extension(Cap::ArmPmuV3)
390         {
391             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
392         }
393         // Non-boot cpus are powered off initially.
394         if self.id > 0 {
395             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
396         }
397         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
398     }
399 
400     /// Runs the VCPU until it exits, returning the reason.
401     ///
402     /// Note that the state of the VCPU and associated VM must be set up first for this to do
403     /// anything useful.
404     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
405         self.vcpu.run()
406     }
407 }
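// Typical lifecycle sketch for the wrapper above (x86_64, illustrative only and not
// part of the original sources; `vm`, `entry_point`, `guest_memory` and `cpuid` are
// assumed to have been prepared elsewhere, as CpuManager does below):
//
//     let mut vcpu = Vcpu::new(0, &vm, None)?;
//     vcpu.configure(Some((entry_point, &guest_memory)), cpuid.clone(), false)?;
//     loop {
//         match vcpu.run() {
//             Ok(VmExit::Reset) | Ok(VmExit::Shutdown) => break,
//             Ok(_) => continue,
//             Err(e) => return Err(Error::VcpuRun(e.into())),
//         }
//     }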
408 
409 impl Pausable for Vcpu {}
410 impl Snapshottable for Vcpu {
411     fn id(&self) -> String {
412         self.id.to_string()
413     }
414 
415     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
416         let saved_state = self
417             .vcpu
418             .state()
419             .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?;
420 
421         self.saved_state = Some(saved_state.clone());
422 
423         Ok(Snapshot::from_data(SnapshotData::new_from_state(
424             &saved_state,
425         )?))
426     }
427 }
428 
429 pub struct CpuManager {
430     hypervisor_type: HypervisorType,
431     config: CpusConfig,
432     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
433     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
434     #[cfg(target_arch = "x86_64")]
435     cpuid: Vec<CpuIdEntry>,
436     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
437     vm: Arc<dyn hypervisor::Vm>,
438     vcpus_kill_signalled: Arc<AtomicBool>,
439     vcpus_pause_signalled: Arc<AtomicBool>,
440     exit_evt: EventFd,
441     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
442     reset_evt: EventFd,
443     #[cfg(feature = "guest_debug")]
444     vm_debug_evt: EventFd,
445     vcpu_states: Vec<VcpuState>,
446     selected_cpu: u8,
447     vcpus: Vec<Arc<Mutex<Vcpu>>>,
448     seccomp_action: SeccompAction,
449     vm_ops: Arc<dyn VmOps>,
450     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
451     acpi_address: Option<GuestAddress>,
452     proximity_domain_per_cpu: BTreeMap<u8, u32>,
453     affinity: BTreeMap<u8, Vec<u8>>,
454     dynamic: bool,
455 }
456 
457 const CPU_ENABLE_FLAG: usize = 0;
458 const CPU_INSERTING_FLAG: usize = 1;
459 const CPU_REMOVING_FLAG: usize = 2;
460 const CPU_EJECT_FLAG: usize = 3;
461 
462 const CPU_STATUS_OFFSET: u64 = 4;
463 const CPU_SELECTION_OFFSET: u64 = 0;
464 
465 impl BusDevice for CpuManager {
466     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
467         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
468         data.fill(0);
469 
470         match offset {
471             CPU_SELECTION_OFFSET => {
472                 data[0] = self.selected_cpu;
473             }
474             CPU_STATUS_OFFSET => {
475                 if self.selected_cpu < self.max_vcpus() {
476                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
477                     if state.active() {
478                         data[0] |= 1 << CPU_ENABLE_FLAG;
479                     }
480                     if state.inserting {
481                         data[0] |= 1 << CPU_INSERTING_FLAG;
482                     }
483                     if state.removing {
484                         data[0] |= 1 << CPU_REMOVING_FLAG;
485                     }
486                 } else {
487                     warn!("Out of range vCPU id: {}", self.selected_cpu);
488                 }
489             }
490             _ => {
491                 warn!(
492                     "Unexpected offset for accessing CPU manager device: {:#}",
493                     offset
494                 );
495             }
496         }
497     }
498 
499     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
500         match offset {
501             CPU_SELECTION_OFFSET => {
502                 self.selected_cpu = data[0];
503             }
504             CPU_STATUS_OFFSET => {
505                 if self.selected_cpu < self.max_vcpus() {
506                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
507                     // The ACPI code writes back a 1 to acknowledge the insertion
508                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
509                         && state.inserting
510                     {
511                         state.inserting = false;
512                     }
513                     // Ditto for removal
514                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
515                         && state.removing
516                     {
517                         state.removing = false;
518                     }
519                     // Trigger removal of vCPU
520                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
521                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
522                             error!("Error removing vCPU: {:?}", e);
523                         }
524                     }
525                 } else {
526                     warn!("Out of range vCPU id: {}", self.selected_cpu);
527                 }
528             }
529             _ => {
530                 warn!(
531                     "Unexpected offset for accessing CPU manager device: {:#}",
532                     offset
533                 );
534             }
535         }
536         None
537     }
538 }
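// Illustrative sketch of the register interface implemented above (not part of the
// original sources): the guest's ACPI methods reach these handlers through the MMIO
// bus. Selecting vCPU 1, decoding its status byte and acknowledging a pending
// insertion would look roughly like:
//
//     let mut status = [0u8; 1];
//     cpu_manager.write(0, CPU_SELECTION_OFFSET, &[1]);
//     cpu_manager.read(0, CPU_STATUS_OFFSET, &mut status);
//     let enabled = status[0] & (1 << CPU_ENABLE_FLAG) != 0;
//     if status[0] & (1 << CPU_INSERTING_FLAG) != 0 {
//         cpu_manager.write(0, CPU_STATUS_OFFSET, &[1 << CPU_INSERTING_FLAG]);
//     }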
539 
540 #[derive(Default)]
541 struct VcpuState {
542     inserting: bool,
543     removing: bool,
544     handle: Option<thread::JoinHandle<()>>,
545     kill: Arc<AtomicBool>,
546     vcpu_run_interrupted: Arc<AtomicBool>,
547 }
548 
549 impl VcpuState {
550     fn active(&self) -> bool {
551         self.handle.is_some()
552     }
553 
554     fn signal_thread(&self) {
555         if let Some(handle) = self.handle.as_ref() {
556             loop {
557                 // SAFETY: FFI call with correct arguments
558                 unsafe {
559                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
560                 }
561                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
562                     break;
563                 } else {
564                     // This is more effective than thread::yield_now() at
565                     // avoiding a priority inversion with the vCPU thread
566                     thread::sleep(std::time::Duration::from_millis(1));
567                 }
568             }
569         }
570     }
571 
572     fn join_thread(&mut self) -> Result<()> {
573         if let Some(handle) = self.handle.take() {
574             handle.join().map_err(Error::ThreadCleanup)?
575         }
576 
577         Ok(())
578     }
579 
580     fn unpark_thread(&self) {
581         if let Some(handle) = self.handle.as_ref() {
582             handle.thread().unpark()
583         }
584     }
585 }
586 
587 impl CpuManager {
588     #[allow(unused_variables)]
589     #[allow(clippy::too_many_arguments)]
590     pub fn new(
591         config: &CpusConfig,
592         vm: Arc<dyn hypervisor::Vm>,
593         exit_evt: EventFd,
594         reset_evt: EventFd,
595         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
596         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
597         seccomp_action: SeccompAction,
598         vm_ops: Arc<dyn VmOps>,
599         #[cfg(feature = "tdx")] tdx_enabled: bool,
600         numa_nodes: &NumaNodes,
601     ) -> Result<Arc<Mutex<CpuManager>>> {
602         if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
603             return Err(Error::MaximumVcpusExceeded);
604         }
605 
606         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
607         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
608         let hypervisor_type = hypervisor.hypervisor_type();
609 
610         #[cfg(target_arch = "x86_64")]
611         if config.features.amx {
612             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
613             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
614             const XFEATURE_XTILEDATA: usize = 18;
615             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
616 
617             // SAFETY: the syscall is only modifying kernel internal
618             // data structures that the kernel is itself expected to safeguard.
619             let amx_tile = unsafe {
620                 libc::syscall(
621                     libc::SYS_arch_prctl,
622                     ARCH_REQ_XCOMP_GUEST_PERM,
623                     XFEATURE_XTILEDATA,
624                 )
625             };
626 
627             if amx_tile != 0 {
628                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
629             } else {
630                 let mask: usize = 0;
631                 // SAFETY: the mask is only modified inside the unsafe block (which is
632                 // why it isn't marked mutable, as is permitted) and isn't in use elsewhere.
633                 let result = unsafe {
634                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
635                 };
636                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
637                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
638                 }
639             }
640         }
641 
642         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
643             let mut cpu_list = Vec::new();
644             for (proximity_domain, numa_node) in numa_nodes.iter() {
645                 for cpu in numa_node.cpus.iter() {
646                     cpu_list.push((*cpu, *proximity_domain))
647                 }
648             }
649             cpu_list
650         }
651         .into_iter()
652         .collect();
653 
654         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
655             cpu_affinity
656                 .iter()
657                 .map(|a| (a.vcpu, a.host_cpus.clone()))
658                 .collect()
659         } else {
660             BTreeMap::new()
661         };
662 
663         #[cfg(feature = "tdx")]
664         let dynamic = !tdx_enabled;
665         #[cfg(not(feature = "tdx"))]
666         let dynamic = true;
667 
668         Ok(Arc::new(Mutex::new(CpuManager {
669             hypervisor_type,
670             config: config.clone(),
671             interrupt_controller: None,
672             #[cfg(target_arch = "x86_64")]
673             cpuid: Vec::new(),
674             vm,
675             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
676             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
677             vcpu_states,
678             exit_evt,
679             reset_evt,
680             #[cfg(feature = "guest_debug")]
681             vm_debug_evt,
682             selected_cpu: 0,
683             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
684             seccomp_action,
685             vm_ops,
686             acpi_address: None,
687             proximity_domain_per_cpu,
688             affinity,
689             dynamic,
690         })))
691     }
692 
693     #[cfg(target_arch = "x86_64")]
694     pub fn populate_cpuid(
695         &mut self,
696         memory_manager: &Arc<Mutex<MemoryManager>>,
697         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
698         #[cfg(feature = "tdx")] tdx_enabled: bool,
699     ) -> Result<()> {
700         let sgx_epc_sections = memory_manager
701             .lock()
702             .unwrap()
703             .sgx_epc_region()
704             .as_ref()
705             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
706 
707         let topology = self.config.topology.clone().map_or_else(
708             || {
709                 #[cfg(feature = "mshv")]
710                 if matches!(hypervisor.hypervisor_type(), HypervisorType::Mshv) {
711                     return Some((1, self.boot_vcpus(), 1));
712                 }
713                 None
714             },
715             |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
716         );
717 
718         self.cpuid = {
719             let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
720             arch::generate_common_cpuid(
721                 hypervisor,
722                 topology,
723                 sgx_epc_sections,
724                 phys_bits,
725                 self.config.kvm_hyperv,
726                 #[cfg(feature = "tdx")]
727                 tdx_enabled,
728             )
729             .map_err(Error::CommonCpuId)?
730         };
731 
732         Ok(())
733     }
734 
735     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
736         info!("Creating vCPU: cpu_id = {}", cpu_id);
737 
738         let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?;
739 
740         if let Some(snapshot) = snapshot {
741             // AArch64 vCPUs should be initialized after being created.
742             #[cfg(target_arch = "aarch64")]
743             vcpu.init(&self.vm)?;
744 
745             let state: CpuState = snapshot.to_state().map_err(|e| {
746                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
747             })?;
748             vcpu.vcpu
749                 .set_state(&state)
750                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
751 
752             vcpu.saved_state = Some(state);
753         }
754 
755         let vcpu = Arc::new(Mutex::new(vcpu));
756 
757         // Adding vCPU to the CpuManager's vCPU list.
758         self.vcpus.push(vcpu.clone());
759 
760         Ok(vcpu)
761     }
762 
763     pub fn configure_vcpu(
764         &self,
765         vcpu: Arc<Mutex<Vcpu>>,
766         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
767     ) -> Result<()> {
768         let mut vcpu = vcpu.lock().unwrap();
769 
770         #[cfg(target_arch = "x86_64")]
771         assert!(!self.cpuid.is_empty());
772 
773         #[cfg(target_arch = "x86_64")]
774         vcpu.configure(boot_setup, self.cpuid.clone(), self.config.kvm_hyperv)?;
775 
776         #[cfg(target_arch = "aarch64")]
777         vcpu.configure(&self.vm, boot_setup)?;
778 
779         Ok(())
780     }
781 
782     /// Only create new vCPUs if there aren't any inactive ones to reuse
783     fn create_vcpus(
784         &mut self,
785         desired_vcpus: u8,
786         snapshot: Option<Snapshot>,
787     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
788         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
789         info!(
790             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
791             desired_vcpus,
792             self.config.max_vcpus,
793             self.vcpus.len(),
794             self.present_vcpus()
795         );
796 
797         if desired_vcpus > self.config.max_vcpus {
798             return Err(Error::DesiredVCpuCountExceedsMax);
799         }
800 
801         // Only create vCPUs in excess of all the allocated vCPUs.
802         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
803             vcpus.push(self.create_vcpu(
804                 cpu_id,
805                 // TODO: The special format of the CPU id can be removed once
806                 // ready to break live upgrade.
807                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
808             )?);
809         }
810 
811         Ok(vcpus)
812     }
813 
814     #[cfg(target_arch = "aarch64")]
815     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
816         for cpu in self.vcpus.iter() {
817             let cpu = cpu.lock().unwrap();
818             // Check if the PMU attribute is available; if not, log the information.
819             if cpu.vcpu.has_pmu_support() {
820                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
821             } else {
822                 debug!(
823                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
824                     cpu.id
825                 );
826                 return Ok(false);
827             }
828         }
829 
830         Ok(true)
831     }
832 
833     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
834         self.vcpus.clone()
835     }
836 
837     fn start_vcpu(
838         &mut self,
839         vcpu: Arc<Mutex<Vcpu>>,
840         vcpu_id: u8,
841         vcpu_thread_barrier: Arc<Barrier>,
842         inserting: bool,
843     ) -> Result<()> {
844         let reset_evt = self.reset_evt.try_clone().unwrap();
845         let exit_evt = self.exit_evt.try_clone().unwrap();
846         #[cfg(feature = "kvm")]
847         let hypervisor_type = self.hypervisor_type;
848         #[cfg(feature = "guest_debug")]
849         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
850         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
851         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
852         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
853 
854         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
855         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
856             .vcpu_run_interrupted
857             .clone();
858         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
859 
860         // Prepare the CPU set the current vCPU is expected to run on.
861         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
862             // SAFETY: all zeros is a valid pattern
863             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
864             // SAFETY: FFI call, trivially safe
865             unsafe { libc::CPU_ZERO(&mut cpuset) };
866             for host_cpu in host_cpus {
867                 // SAFETY: FFI call, trivially safe
868                 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) };
869             }
870             cpuset
871         });
872 
873         // Retrieve seccomp filter for vcpu thread
874         let vcpu_seccomp_filter =
875             get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type)
876                 .map_err(Error::CreateSeccompFilter)?;
877 
878         #[cfg(target_arch = "x86_64")]
879         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
880 
881         info!("Starting vCPU: cpu_id = {}", vcpu_id);
882 
883         let handle = Some(
884             thread::Builder::new()
885                 .name(format!("vcpu{vcpu_id}"))
886                 .spawn(move || {
887                     // Schedule the thread to run on the expected CPU set
888                     if let Some(cpuset) = cpuset.as_ref() {
889                         // SAFETY: FFI call with correct arguments
890                         let ret = unsafe {
891                             libc::sched_setaffinity(
892                                 0,
893                                 std::mem::size_of::<libc::cpu_set_t>(),
894                                 cpuset as *const libc::cpu_set_t,
895                             )
896                         };
897 
898                         if ret != 0 {
899                             error!(
900                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
901                                 vcpu_id,
902                                 io::Error::last_os_error()
903                             );
904                             return;
905                         }
906                     }
907 
908                     // Apply seccomp filter for vcpu thread.
909                     if !vcpu_seccomp_filter.is_empty() {
910                         if let Err(e) =
911                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
912                         {
913                             error!("Error applying seccomp filter: {:?}", e);
914                             return;
915                         }
916                     }
917                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
918                     // This uses an async-signal-safe handler to interrupt the vCPU threads when killing them.
919                     register_signal_handler(SIGRTMIN(), handle_signal)
920                         .expect("Failed to register vcpu signal handler");
921                     // Block until all CPUs are ready.
922                     vcpu_thread_barrier.wait();
923 
924                     std::panic::catch_unwind(move || {
925                         loop {
926                             // If we are being told to pause, we park the thread
927                             // until the pause boolean is toggled.
928                             // The resume operation is responsible for toggling
929                             // the boolean and unparking the thread.
930                             // We enter a loop because park() could spuriously
931                             // return. We will then park() again unless the
932                             // pause boolean has been toggled.
933 
934                             // Need to use Ordering::SeqCst as we have multiple
935                             // loads and stores to different atomics and we need
936                             // to see them in a consistent order in all threads
937 
938                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
939                                 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
940                                 // completed by returning to KVM_RUN. From the kernel docs:
941                                 //
942                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
943                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
944                                 // operations are complete (and guest state is consistent) only after userspace
945                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
946                                 // incomplete operations and then check for pending signals.
947                                 // The pending state of the operation is not preserved in state which is
948                                 // visible to userspace, thus userspace should ensure that the operation is
949                                 // completed before performing a live migration.  Userspace can re-enter the
950                                 // guest with an unmasked signal pending or with the immediate_exit field set
951                                 // to complete pending operations without allowing any further instructions
952                                 // to be executed.
953 
954                                 #[cfg(feature = "kvm")]
955                                 if matches!(hypervisor_type, HypervisorType::Kvm) {
956                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
957                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
958                                         error!("Unexpected VM exit on \"immediate_exit\" run");
959                                         break;
960                                     }
961                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
962                                 }
963 
964                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
965                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
966                                     thread::park();
967                                 }
968                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
969                             }
970 
971                             // We've been told to terminate
972                             if vcpu_kill_signalled.load(Ordering::SeqCst)
973                                 || vcpu_kill.load(Ordering::SeqCst)
974                             {
975                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
976                                 break;
977                             }
978 
979                             #[cfg(feature = "tdx")]
980                             let mut vcpu = vcpu.lock().unwrap();
981                             #[cfg(not(feature = "tdx"))]
982                             let vcpu = vcpu.lock().unwrap();
983                             // vcpu.run() reports a guest triple fault as a reset exit, so trigger a reset
984                             match vcpu.run() {
985                                 Ok(run) => match run {
986                                     #[cfg(feature = "kvm")]
987                                     VmExit::Debug => {
988                                         info!("VmExit::Debug");
989                                         #[cfg(feature = "guest_debug")]
990                                         {
991                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
992                                             let raw_tid = get_raw_tid(vcpu_id as usize);
993                                             vm_debug_evt.write(raw_tid as u64).unwrap();
994                                         }
995                                     }
996                                     #[cfg(target_arch = "x86_64")]
997                                     VmExit::IoapicEoi(vector) => {
998                                         if let Some(interrupt_controller) =
999                                             &interrupt_controller_clone
1000                                         {
1001                                             interrupt_controller
1002                                                 .lock()
1003                                                 .unwrap()
1004                                                 .end_of_interrupt(vector);
1005                                         }
1006                                     }
1007                                     VmExit::Ignore => {}
1008                                     VmExit::Hyperv => {}
1009                                     VmExit::Reset => {
1010                                         info!("VmExit::Reset");
1011                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1012                                         reset_evt.write(1).unwrap();
1013                                         break;
1014                                     }
1015                                     VmExit::Shutdown => {
1016                                         info!("VmExit::Shutdown");
1017                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1018                                         exit_evt.write(1).unwrap();
1019                                         break;
1020                                     }
1021                                     #[cfg(feature = "tdx")]
1022                                     VmExit::Tdx => {
1023                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1024                                             match vcpu.get_tdx_exit_details() {
1025                                                 Ok(details) => match details {
1026                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1027                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1028                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1029                                                     }
1030                                                 },
1031                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1032                                             }
1033                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1034                                         } else {
1035                                             // We should never reach this code as
1036                                             // this means the design from the code
1037                                             // is wrong.
1038                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1039                                         }
1040                                     }
1041                                     _ => {
1042                                         error!(
1043                                             "VCPU generated error: {:?}",
1044                                             Error::UnexpectedVmExit
1045                                         );
1046                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1047                                         exit_evt.write(1).unwrap();
1048                                         break;
1049                                     }
1050                                 },
1051 
1052                                 Err(e) => {
1053                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1054                                     vcpu_run_interrupted.store(true, Ordering::SeqCst);
1055                                     exit_evt.write(1).unwrap();
1056                                     break;
1057                                 }
1058                             }
1059 
1060                             // We've been told to terminate
1061                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1062                                 || vcpu_kill.load(Ordering::SeqCst)
1063                             {
1064                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1065                                 break;
1066                             }
1067                         }
1068                     })
1069                     .or_else(|_| {
1070                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1071                         error!("vCPU thread panicked");
1072                         panic_exit_evt.write(1)
1073                     })
1074                     .ok();
1075                 })
1076                 .map_err(Error::VcpuSpawn)?,
1077         );
1078 
1079         // On hot plug, calls into this function carry no entry point. It is for
1080         // those hotplug CPU additions that we need to set the inserting flag.
1081         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1082         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1083 
1084         Ok(())
1085     }
1086 
1087     /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
1088     fn activate_vcpus(
1089         &mut self,
1090         desired_vcpus: u8,
1091         inserting: bool,
1092         paused: Option<bool>,
1093     ) -> Result<()> {
1094         if desired_vcpus > self.config.max_vcpus {
1095             return Err(Error::DesiredVCpuCountExceedsMax);
1096         }
1097 
1098         let vcpu_thread_barrier = Arc::new(Barrier::new(
1099             (desired_vcpus - self.present_vcpus() + 1) as usize,
1100         ));
1101 
1102         if let Some(paused) = paused {
1103             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1104         }
1105 
1106         info!(
1107             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1108             desired_vcpus,
1109             self.vcpus.len(),
1110             self.present_vcpus(),
1111             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1112         );
1113 
1114         // This reuses any inactive vCPUs as well as any that were newly created
1115         for vcpu_id in self.present_vcpus()..desired_vcpus {
1116             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1117             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1118         }
1119 
1120         // Unblock all CPU threads.
1121         vcpu_thread_barrier.wait();
1122         Ok(())
1123     }
1124 
1125     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1126         // Mark vCPUs for removal, actual removal happens on ejection
1127         for cpu_id in desired_vcpus..self.present_vcpus() {
1128             self.vcpu_states[usize::from(cpu_id)].removing = true;
1129         }
1130     }
1131 
1132     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1133         info!("Removing vCPU: cpu_id = {}", cpu_id);
1134         let state = &mut self.vcpu_states[usize::from(cpu_id)];
1135         state.kill.store(true, Ordering::SeqCst);
1136         state.signal_thread();
1137         state.join_thread()?;
1138         state.handle = None;
1139 
1140         // Once the thread has exited, clear the "kill" so that it can be reused
1141         state.kill.store(false, Ordering::SeqCst);
1142 
1143         Ok(())
1144     }
1145 
1146     pub fn create_boot_vcpus(
1147         &mut self,
1148         snapshot: Option<Snapshot>,
1149     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1150         trace_scoped!("create_boot_vcpus");
1151 
1152         self.create_vcpus(self.boot_vcpus(), snapshot)
1153     }
1154 
1155     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1156     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1157         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1158     }
1159 
1160     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1161         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1162             .map_err(|e| {
1163                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1164             })?;
1165 
1166         Ok(())
1167     }
1168 
1169     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1170         if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
1171             return Ok(false);
1172         }
1173 
1174         if !self.dynamic {
1175             return Ok(false);
1176         }
1177 
1178         match desired_vcpus.cmp(&self.present_vcpus()) {
1179             cmp::Ordering::Greater => {
1180                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1181                 for vcpu in vcpus {
1182                     self.configure_vcpu(vcpu, None)?
1183                 }
1184                 self.activate_vcpus(desired_vcpus, true, None)?;
1185                 Ok(true)
1186             }
1187             cmp::Ordering::Less => {
1188                 self.mark_vcpus_for_removal(desired_vcpus);
1189                 Ok(true)
1190             }
1191             _ => Ok(false),
1192         }
1193     }
1194 
1195     pub fn shutdown(&mut self) -> Result<()> {
1196         // Tell the vCPUs to stop themselves next time they go through the loop
1197         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1198 
1199         // Toggle the vCPUs pause boolean
1200         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1201 
1202         // Unpark all the VCPU threads.
1203         for state in self.vcpu_states.iter() {
1204             state.unpark_thread();
1205         }
1206 
1207         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1208         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1209         // above.
1210         for state in self.vcpu_states.iter() {
1211             state.signal_thread();
1212         }
1213 
1214         // Wait for all the threads to finish. This removes the state from the vector.
1215         for mut state in self.vcpu_states.drain(..) {
1216             state.join_thread()?;
1217         }
1218 
1219         Ok(())
1220     }
1221 
1222     #[cfg(feature = "tdx")]
1223     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1224         for vcpu in &self.vcpus {
1225             vcpu.lock()
1226                 .unwrap()
1227                 .vcpu
1228                 .tdx_init(hob_address)
1229                 .map_err(Error::InitializeTdx)?;
1230         }
1231         Ok(())
1232     }
1233 
1234     pub fn boot_vcpus(&self) -> u8 {
1235         self.config.boot_vcpus
1236     }
1237 
1238     pub fn max_vcpus(&self) -> u8 {
1239         self.config.max_vcpus
1240     }
1241 
1242     #[cfg(target_arch = "x86_64")]
1243     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1244         assert!(!self.cpuid.is_empty());
1245         self.cpuid.clone()
1246     }
1247 
1248     fn present_vcpus(&self) -> u8 {
1249         self.vcpu_states
1250             .iter()
1251             .fold(0, |acc, state| acc + state.active() as u8)
1252     }
1253 
1254     #[cfg(target_arch = "aarch64")]
1255     pub fn get_mpidrs(&self) -> Vec<u64> {
1256         self.vcpus
1257             .iter()
1258             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1259             .collect()
1260     }
1261 
1262     #[cfg(target_arch = "aarch64")]
1263     pub fn get_saved_states(&self) -> Vec<CpuState> {
1264         self.vcpus
1265             .iter()
1266             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1267             .collect()
1268     }
1269 
1270     #[cfg(target_arch = "aarch64")]
1271     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1272         self.config
1273             .topology
1274             .clone()
1275             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1276     }
1277 
1278     pub fn create_madt(&self) -> Sdt {
1279         use crate::acpi;
1280         // This is also checked in the commandline parsing.
1281         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1282 
1283         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1284         #[cfg(target_arch = "x86_64")]
1285         {
1286             madt.write(36, arch::layout::APIC_START.0);
1287 
1288             for cpu in 0..self.config.max_vcpus {
1289                 let lapic = LocalApic {
1290                     r#type: acpi::ACPI_APIC_PROCESSOR,
1291                     length: 8,
1292                     processor_id: cpu,
1293                     apic_id: cpu,
1294                     flags: if cpu < self.config.boot_vcpus {
1295                         1 << MADT_CPU_ENABLE_FLAG
1296                     } else {
1297                         0
1298                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1299                 };
1300                 madt.append(lapic);
1301             }
1302 
1303             madt.append(Ioapic {
1304                 r#type: acpi::ACPI_APIC_IO,
1305                 length: 12,
1306                 ioapic_id: 0,
1307                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1308                 gsi_base: 0,
1309                 ..Default::default()
1310             });
1311 
1312             madt.append(InterruptSourceOverride {
1313                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1314                 length: 10,
1315                 bus: 0,
1316                 source: 4,
1317                 gsi: 4,
1318                 flags: 0,
1319             });
1320         }
1321 
1322         #[cfg(target_arch = "aarch64")]
1323         {
1324             /* Notes:
1325              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1326              */
1327 
1328             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1329             for cpu in 0..self.config.boot_vcpus {
1330                 let vcpu = &self.vcpus[cpu as usize];
1331                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1332                 /* ARMv8 MPIDR format:
1333                      Bits [63:40] Must be zero
1334                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1335                      Bits [31:24] Must be zero
1336                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1337                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1338                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1339                 */
1340                 let mpidr_mask = 0xff_00ff_ffff;
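                // For example (illustrative only): a raw MPIDR of 0x0000_0001_8000_0100
                // (Aff3 = 0x01, RES1 bit 31 set, Aff1 = 0x01) masked with 0xff_00ff_ffff
                // becomes 0x0000_0001_0000_0100, keeping only the Aff3..Aff0 fields the
                // GICC entry expects.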
1341                 let gicc = GicC {
1342                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1343                     length: 80,
1344                     reserved0: 0,
1345                     cpu_interface_number: cpu as u32,
1346                     uid: cpu as u32,
1347                     flags: 1,
1348                     parking_version: 0,
1349                     performance_interrupt: 0,
1350                     parked_address: 0,
1351                     base_address: 0,
1352                     gicv_base_address: 0,
1353                     gich_base_address: 0,
1354                     vgic_interrupt: 0,
1355                     gicr_base_address: 0,
1356                     mpidr: mpidr & mpidr_mask,
1357                     proc_power_effi_class: 0,
1358                     reserved1: 0,
1359                     spe_overflow_interrupt: 0,
1360                 };
1361 
1362                 madt.append(gicc);
1363             }
1364             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1365 
1366             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1367             let gicd = GicD {
1368                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1369                 length: 24,
1370                 reserved0: 0,
1371                 gic_id: 0,
1372                 base_address: vgic_config.dist_addr,
1373                 global_irq_base: 0,
1374                 version: 3,
1375                 reserved1: [0; 3],
1376             };
1377             madt.append(gicd);
1378 
1379             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1380             let gicr = GicR {
1381                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1382                 length: 16,
1383                 reserved: 0,
1384                 base_address: vgic_config.redists_addr,
1385                 range_length: vgic_config.redists_size as u32,
1386             };
1387             madt.append(gicr);
1388 
1389             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1390             let gicits = GicIts {
1391                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1392                 length: 20,
1393                 reserved0: 0,
1394                 translation_id: 0,
1395                 base_address: vgic_config.msi_addr,
1396                 reserved1: 0,
1397             };
1398             madt.append(gicits);
1399 
1400             madt.update_checksum();
1401         }
1402 
1403         madt
1404     }
1405 
1406     #[cfg(target_arch = "aarch64")]
1407     pub fn create_pptt(&self) -> Sdt {
1408         let pptt_start = 0;
1409         let mut cpus = 0;
1410         let mut uid = 0;
1411         // If topology is not specified, the default setting is:
1412         // 1 package, multiple cores, 1 thread per core
1413         // This is also the behavior when PPTT is missing.
1414         let (threads_per_core, cores_per_package, packages) =
1415             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1416 
1417         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1418 
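        // Processor Hierarchy Node flags (per the ACPI PPTT definition):
        //   bit 0 = physical package, bit 1 = ACPI Processor ID valid,
        //   bit 2 = processor is a thread, bit 3 = node is a leaf.
        // So 0x2 below marks a non-leaf cluster/core, 0xA a leaf core and 0xE a leaf thread.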
1419         for cluster_idx in 0..packages {
1420             if cpus < self.config.boot_vcpus as usize {
1421                 let cluster_offset = pptt.len() - pptt_start;
1422                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1423                     r#type: 0,
1424                     length: 20,
1425                     reserved: 0,
1426                     flags: 0x2,
1427                     parent: 0,
1428                     acpi_processor_id: cluster_idx as u32,
1429                     num_private_resources: 0,
1430                 };
1431                 pptt.append(cluster_hierarchy_node);
1432 
1433                 for core_idx in 0..cores_per_package {
1434                     let core_offset = pptt.len() - pptt_start;
1435 
1436                     if threads_per_core > 1 {
1437                         let core_hierarchy_node = ProcessorHierarchyNode {
1438                             r#type: 0,
1439                             length: 20,
1440                             reserved: 0,
1441                             flags: 0x2,
1442                             parent: cluster_offset as u32,
1443                             acpi_processor_id: core_idx as u32,
1444                             num_private_resources: 0,
1445                         };
1446                         pptt.append(core_hierarchy_node);
1447 
1448                         for _thread_idx in 0..threads_per_core {
1449                             let thread_hierarchy_node = ProcessorHierarchyNode {
1450                                 r#type: 0,
1451                                 length: 20,
1452                                 reserved: 0,
1453                                 flags: 0xE,
1454                                 parent: core_offset as u32,
1455                                 acpi_processor_id: uid as u32,
1456                                 num_private_resources: 0,
1457                             };
1458                             pptt.append(thread_hierarchy_node);
1459                             uid += 1;
1460                         }
1461                     } else {
1462                         let thread_hierarchy_node = ProcessorHierarchyNode {
1463                             r#type: 0,
1464                             length: 20,
1465                             reserved: 0,
1466                             flags: 0xA,
1467                             parent: cluster_offset as u32,
1468                             acpi_processor_id: uid as u32,
1469                             num_private_resources: 0,
1470                         };
1471                         pptt.append(thread_hierarchy_node);
1472                         uid += 1;
1473                     }
1474                 }
1475                 cpus += (cores_per_package * threads_per_core) as usize;
1476             }
1477         }
1478 
1479         pptt.update_checksum();
1480         pptt
1481     }
1482 
1483     #[cfg(feature = "guest_debug")]
1484     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1485         self.vcpus[usize::from(cpu_id)]
1486             .lock()
1487             .unwrap()
1488             .vcpu
1489             .get_regs()
1490             .map_err(Error::CpuDebug)
1491     }
1492 
1493     #[cfg(feature = "guest_debug")]
1494     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1495         self.vcpus[usize::from(cpu_id)]
1496             .lock()
1497             .unwrap()
1498             .vcpu
1499             .set_regs(regs)
1500             .map_err(Error::CpuDebug)
1501     }
1502 
1503     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1504     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1505         self.vcpus[usize::from(cpu_id)]
1506             .lock()
1507             .unwrap()
1508             .vcpu
1509             .get_sregs()
1510             .map_err(Error::CpuDebug)
1511     }
1512 
1513     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1514     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1515         self.vcpus[usize::from(cpu_id)]
1516             .lock()
1517             .unwrap()
1518             .vcpu
1519             .set_sregs(sregs)
1520             .map_err(Error::CpuDebug)
1521     }
1522 
1523     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1524     fn translate_gva(
1525         &self,
1526         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1527         cpu_id: u8,
1528         gva: u64,
1529     ) -> Result<u64> {
1530         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1531             .lock()
1532             .unwrap()
1533             .vcpu
1534             .translate_gva(gva, /* flags: unused */ 0)
1535             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1536         Ok(gpa)
1537     }
1538 
1539     ///
1540     /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
1541     /// it in the VMM by walking the translation tables.
1542     ///
1543     /// Address translation is a big topic; here we only focus on the scenario
1544     /// that happens in the VMM while debugging the kernel. This `translate_gva`
1545     /// implementation is restricted to:
1546     /// - Exception Level 1
1547     /// - Translate high address range only (kernel space)
1548     ///
1549     /// This implementation supports the following Armv8-A features related to
1550     /// address translation:
1551     /// - FEAT_LPA
1552     /// - FEAT_LVA
1553     /// - FEAT_LPA2
1554     ///
1555     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1556     fn translate_gva(
1557         &self,
1558         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1559         cpu_id: u8,
1560         gva: u64,
1561     ) -> Result<u64> {
1562         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1563             .lock()
1564             .unwrap()
1565             .vcpu
1566             .get_sys_reg(regs::TCR_EL1)
1567             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1568         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1569             .lock()
1570             .unwrap()
1571             .vcpu
1572             .get_sys_reg(regs::TTBR1_EL1)
1573             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1574         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1575             .lock()
1576             .unwrap()
1577             .vcpu
1578             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1579             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1580 
1581         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1582         // or low (0x000xxx...).
1583         let high_range = extract_bits_64!(gva, 55, 1);
1584         if high_range == 0 {
1585             info!("VA (0x{:x}) range is not supported!", gva);
1586             return Ok(gva);
1587         }
1588 
1589         // High range size offset
1590         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1591         // Granule size
1592         let tg = extract_bits_64!(tcr_el1, 30, 2);
1593         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1594         let ds = extract_bits_64!(tcr_el1, 59, 1);
1595 
1596         if tsz == 0 {
1597             info!("VA translation is not ready!");
1598             return Ok(gva);
1599         }
1600 
1601         // VA size is determined by TCR_EL1.T1SZ
1602         let va_size = 64 - tsz;
1603         // Number of bits in VA consumed in each level of translation
1604         let stride = match tg {
1605             3 => 13, // 64KB granule size
1606             1 => 11, // 16KB granule size
1607             _ => 9,  // 4KB, default
1608         };
1609         // Starting level of walking
1610         let mut level = 4 - (va_size - 4) / stride;
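        // For example, with a 4KB granule (stride = 9) and T1SZ = 16 (va_size = 48),
        // the walk starts at level 4 - 44 / 9 = 0, i.e. a full 4-level walk.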
1611 
1612         // Determine the PA or IPA size
1613         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1614         #[allow(clippy::identity_op)]
1615         let pa_range = extract_bits_64!(id_aa64mmfr0_el1, 0, 4);
1616         // The IPA size in TCR_EL1 and PA Range in ID_AA64MMFR0_EL1 should match.
1617         // To be safe, we use the minimum value if they are different.
1618         let pa_range = std::cmp::min(tcr_ips, pa_range);
1619         // PA size in bits
1620         let pa_size = match pa_range {
1621             0 => 32,
1622             1 => 36,
1623             2 => 40,
1624             3 => 42,
1625             4 => 44,
1626             5 => 48,
1627             6 => 52,
1628             _ => {
1629                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1630                     "PA range not supported {pa_range}"
1631                 ))))
1632             }
1633         };
1634 
1635         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1636         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1637         // If FEAT_LPA2 is present, the translation table descriptor holds
1638         // 50 bits of the table address of the next level.
1639         // Otherwise, it is 48 bits.
1640         let descaddrmask = if ds == 1 {
1641             !0u64 >> (64 - 50) // mask with 50 least significant bits
1642         } else {
1643             !0u64 >> (64 - 48) // mask with 48 least significant bits
1644         };
1645         let descaddrmask = descaddrmask & !indexmask_grainsize;
1646 
1647         // Translation table base address
1648         #[allow(clippy::identity_op)]
1649         let mut descaddr: u64 = extract_bits_64!(ttbr1_el1, 0, 48);
1650         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1651         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1652         if pa_size == 52 {
1653             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1654         }
1655 
1656         // Loop through tables of each level
1657         loop {
1658             // Table offset for current level
1659             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1660             descaddr |= table_offset;
1661             descaddr &= !7u64;
1662 
1663             let mut buf = [0; 8];
1664             guest_memory
1665                 .memory()
1666                 .read(&mut buf, GuestAddress(descaddr))
1667                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1668             let descriptor = u64::from_le_bytes(buf);
1669 
1670             descaddr = descriptor & descaddrmask;
1671             // In the case of FEAT_LPA, the next-level translation table address
1672             // bits [48:51] come from bits [12:15] of the current descriptor.
1673             // For FEAT_LPA2, the next-level translation table address
1674             // bits [50:51] come from bits [8:9] of the current descriptor,
1675             // bits [48:49] come from bits [48:49] of the descriptor that was
1676             // handled previously.
1677             if pa_size == 52 {
1678                 if ds == 1 {
1679                     // FEAT_LPA2
1680                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1681                 } else {
1682                     // FEAT_LPA
1683                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1684                 }
1685             }
1686 
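            // Bit 1 of the descriptor distinguishes a table descriptor (walk continues)
            // from a block descriptor; at level 3 it marks a page descriptor, so the
            // walk always stops there.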
1687             if (descriptor & 2) != 0 && (level < 3) {
1688                 // This is a table entry. Go down to next level.
1689                 level += 1;
1690                 indexmask = indexmask_grainsize;
1691                 continue;
1692             }
1693 
1694             break;
1695         }
1696 
1697         // We have reached either:
1698         // - a page entry at level 3 or
1699         // - a block entry at level 1 or 2
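        // With a 4KB granule (stride 9) this yields a 4KB page at level 3,
        // a 2MB block at level 2 and a 1GB block at level 1.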
1700         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1701         descaddr &= !(page_size - 1);
1702         descaddr |= gva & (page_size - 1);
1703 
1704         Ok(descaddr)
1705     }
1706 
1707     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1708         self.acpi_address = Some(acpi_address);
1709     }
1710 
1711     pub(crate) fn set_interrupt_controller(
1712         &mut self,
1713         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1714     ) {
1715         self.interrupt_controller = Some(interrupt_controller);
1716     }
1717 }
1718 
1719 struct Cpu {
1720     cpu_id: u8,
1721     proximity_domain: u32,
1722     dynamic: bool,
1723 }
1724 
1725 #[cfg(target_arch = "x86_64")]
1726 const MADT_CPU_ENABLE_FLAG: usize = 0;
1727 
1728 #[cfg(target_arch = "x86_64")]
1729 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1730 
1731 impl Cpu {
1732     #[cfg(target_arch = "x86_64")]
1733     fn generate_mat(&self) -> Vec<u8> {
1734         let lapic = LocalApic {
1735             r#type: 0,
1736             length: 8,
1737             processor_id: self.cpu_id,
1738             apic_id: self.cpu_id,
1739             flags: 1 << MADT_CPU_ENABLE_FLAG,
1740         };
1741 
1742         let mut mat_data: Vec<u8> = Vec::new();
1743         mat_data.resize(std::mem::size_of_val(&lapic), 0);
1744         // SAFETY: mat_data is large enough to hold lapic
1745         unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic };
1746 
1747         mat_data
1748     }
1749 }
1750 
1751 impl Aml for Cpu {
1752     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1753         #[cfg(target_arch = "x86_64")]
1754         let mat_data: Vec<u8> = self.generate_mat();
1755         #[allow(clippy::if_same_then_else)]
1756         if self.dynamic {
1757             aml::Device::new(
1758                 format!("C{:03}", self.cpu_id).as_str().into(),
1759                 vec![
1760                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1761                     &aml::Name::new("_UID".into(), &self.cpu_id),
1762                     // Currently, AArch64 cannot support the following fields.
1763                     /*
1764                     _STA return value:
1765                     Bit [0] – Set if the device is present.
1766                     Bit [1] – Set if the device is enabled and decoding its resources.
1767                     Bit [2] – Set if the device should be shown in the UI.
1768                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1769                     Bit [4] – Set if the battery is present.
1770                     Bits [31:5] – Reserved (must be cleared).
1771                     */
1772                     #[cfg(target_arch = "x86_64")]
1773                     &aml::Method::new(
1774                         "_STA".into(),
1775                         0,
1776                         false,
1777                         // Call into CSTA method which will interrogate device
1778                         vec![&aml::Return::new(&aml::MethodCall::new(
1779                             "CSTA".into(),
1780                             vec![&self.cpu_id],
1781                         ))],
1782                     ),
1783                     &aml::Method::new(
1784                         "_PXM".into(),
1785                         0,
1786                         false,
1787                         vec![&aml::Return::new(&self.proximity_domain)],
1788                     ),
1789                     // The Linux kernel expects every CPU device to have a _MAT entry
1790                     // containing the LAPIC for this processor with the enabled bit set
1791                     // even if it is disabled in the MADT (non-boot CPU)
1792                     #[cfg(target_arch = "x86_64")]
1793                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1794                     // Trigger CPU ejection
1795                     #[cfg(target_arch = "x86_64")]
1796                     &aml::Method::new(
1797                         "_EJ0".into(),
1798                         1,
1799                         false,
1800                         // Call into CEJ0 method which will actually eject device
1801                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1802                     ),
1803                 ],
1804             )
1805             .to_aml_bytes(sink);
1806         } else {
1807             aml::Device::new(
1808                 format!("C{:03}", self.cpu_id).as_str().into(),
1809                 vec![
1810                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1811                     &aml::Name::new("_UID".into(), &self.cpu_id),
1812                     #[cfg(target_arch = "x86_64")]
1813                     &aml::Method::new(
1814                         "_STA".into(),
1815                         0,
1816                         false,
1817                         // Mark CPU as present; see the CSTA implementation
1818                         vec![&aml::Return::new(&0xfu8)],
1819                     ),
1820                     &aml::Method::new(
1821                         "_PXM".into(),
1822                         0,
1823                         false,
1824                         vec![&aml::Return::new(&self.proximity_domain)],
1825                     ),
1826                     // The Linux kernel expects every CPU device to have a _MAT entry
1827                     // containing the LAPIC for this processor with the enabled bit set
1828                     // even if it is disabled in the MADT (non-boot CPU)
1829                     #[cfg(target_arch = "x86_64")]
1830                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1831                 ],
1832             )
1833             .to_aml_bytes(sink);
1834         }
1835     }
1836 }
1837 
1838 struct CpuNotify {
1839     cpu_id: u8,
1840 }
1841 
1842 impl Aml for CpuNotify {
1843     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1844         let object = aml::Path::new(&format!("C{:03}", self.cpu_id));
1845         aml::If::new(
1846             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
1847             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1848         )
1849         .to_aml_bytes(sink)
1850     }
1851 }
1852 
1853 struct CpuMethods {
1854     max_vcpus: u8,
1855     dynamic: bool,
1856 }
1857 
1858 impl Aml for CpuMethods {
1859     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1860         if self.dynamic {
1861             // CPU status method
1862             aml::Method::new(
1863                 "CSTA".into(),
1864                 1,
1865                 true,
1866                 vec![
1867                     // Take lock defined above
1868                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1869                     // Write CPU number (in first argument) to I/O port via field
1870                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1871                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1872                     // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for the meaning)
1873                     &aml::If::new(
1874                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
1875                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
1876                     ),
1877                     // Release lock
1878                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1879                     // Return 0 or 0xf
1880                     &aml::Return::new(&aml::Local(0)),
1881                 ],
1882             )
1883             .to_aml_bytes(sink);
1884 
1885             let mut cpu_notifies = Vec::new();
1886             for cpu_id in 0..self.max_vcpus {
1887                 cpu_notifies.push(CpuNotify { cpu_id });
1888             }
1889 
1890             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
1891             for cpu_id in 0..self.max_vcpus {
1892                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
1893             }
1894 
1895             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
1896 
1897             aml::Method::new(
1898                 "CEJ0".into(),
1899                 1,
1900                 true,
1901                 vec![
1902                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1903                     // Write CPU number (in first argument) to I/O port via field
1904                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1905                     // Set CEJ0 bit
1906                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
1907                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1908                 ],
1909             )
1910             .to_aml_bytes(sink);
1911 
1912             aml::Method::new(
1913                 "CSCN".into(),
1914                 0,
1915                 true,
1916                 vec![
1917                     // Take lock defined above
1918                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1919                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1920                     &aml::While::new(
1921                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
1922                         vec![
1923                             // Write CPU number (in first argument) to I/O port via field
1924                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
1925                             // Check if CINS bit is set
1926                             &aml::If::new(
1927                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
1928                                 // Notify device if it is
1929                                 vec![
1930                                     &aml::MethodCall::new(
1931                                         "CTFY".into(),
1932                                         vec![&aml::Local(0), &aml::ONE],
1933                                     ),
1934                                     // Reset CINS bit
1935                                     &aml::Store::new(
1936                                         &aml::Path::new("\\_SB_.PRES.CINS"),
1937                                         &aml::ONE,
1938                                     ),
1939                                 ],
1940                             ),
1941                             // Check if CRMV bit is set
1942                             &aml::If::new(
1943                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
1944                                 // Notify device if it is (with the eject constant 0x3)
1945                                 vec![
1946                                     &aml::MethodCall::new(
1947                                         "CTFY".into(),
1948                                         vec![&aml::Local(0), &3u8],
1949                                     ),
1950                                     // Reset CRMV bit
1951                                     &aml::Store::new(
1952                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
1953                                         &aml::ONE,
1954                                     ),
1955                                 ],
1956                             ),
1957                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
1958                         ],
1959                     ),
1960                     // Release lock
1961                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1962                 ],
1963             )
1964             .to_aml_bytes(sink)
1965         } else {
1966             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
1967         }
1968     }
1969 }
1970 
1971 impl Aml for CpuManager {
1972     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1973         #[cfg(target_arch = "x86_64")]
1974         if let Some(acpi_address) = self.acpi_address {
1975             // CPU hotplug controller
1976             aml::Device::new(
1977                 "_SB_.PRES".into(),
1978                 vec![
1979                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
1980                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
1981                     // Mutex to protect concurrent access as we write to choose CPU and then read back status
1982                     &aml::Mutex::new("CPLK".into(), 0),
1983                     &aml::Name::new(
1984                         "_CRS".into(),
1985                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
1986                             aml::AddressSpaceCachable::NotCacheable,
1987                             true,
1988                             acpi_address.0,
1989                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
1990                             None,
1991                         )]),
1992                     ),
1993                     // OpRegion and Fields map MMIO range into individual field values
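                    // The byte offsets implied below (CSEL at bytes 0-3, the CPEN/CINS/
                    // CRMV/CEJ0 bits in byte 4, CCMD at byte 5, CDAT at bytes 8-11) must
                    // stay in sync with CpuManager's BusDevice read/write implementation.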
1994                     &aml::OpRegion::new(
1995                         "PRST".into(),
1996                         aml::OpRegionSpace::SystemMemory,
1997                         &(acpi_address.0 as usize),
1998                         &CPU_MANAGER_ACPI_SIZE,
1999                     ),
2000                     &aml::Field::new(
2001                         "PRST".into(),
2002                         aml::FieldAccessType::Byte,
2003                         aml::FieldLockRule::NoLock,
2004                         aml::FieldUpdateRule::WriteAsZeroes,
2005                         vec![
2006                             aml::FieldEntry::Reserved(32),
2007                             aml::FieldEntry::Named(*b"CPEN", 1),
2008                             aml::FieldEntry::Named(*b"CINS", 1),
2009                             aml::FieldEntry::Named(*b"CRMV", 1),
2010                             aml::FieldEntry::Named(*b"CEJ0", 1),
2011                             aml::FieldEntry::Reserved(4),
2012                             aml::FieldEntry::Named(*b"CCMD", 8),
2013                         ],
2014                     ),
2015                     &aml::Field::new(
2016                         "PRST".into(),
2017                         aml::FieldAccessType::DWord,
2018                         aml::FieldLockRule::NoLock,
2019                         aml::FieldUpdateRule::Preserve,
2020                         vec![
2021                             aml::FieldEntry::Named(*b"CSEL", 32),
2022                             aml::FieldEntry::Reserved(32),
2023                             aml::FieldEntry::Named(*b"CDAT", 32),
2024                         ],
2025                     ),
2026                 ],
2027             )
2028             .to_aml_bytes(sink);
2029         }
2030 
2031         // CPU devices
2032         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2033         let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2034         // Bundle methods together under a common object
2035         let methods = CpuMethods {
2036             max_vcpus: self.config.max_vcpus,
2037             dynamic: self.dynamic,
2038         };
2039         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];
2040 
2041         let mut cpu_devices = Vec::new();
2042         for cpu_id in 0..self.config.max_vcpus {
2043             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2044             let cpu_device = Cpu {
2045                 cpu_id,
2046                 proximity_domain,
2047                 dynamic: self.dynamic,
2048             };
2049 
2050             cpu_devices.push(cpu_device);
2051         }
2052 
2053         for cpu_device in cpu_devices.iter() {
2054             cpu_data_inner.push(cpu_device);
2055         }
2056 
2057         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2058     }
2059 }
2060 
2061 impl Pausable for CpuManager {
2062     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2063         // Tell the vCPUs to pause themselves next time they exit
2064         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2065 
2066         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2067         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2068         // above.
2069         for state in self.vcpu_states.iter() {
2070             state.signal_thread();
2071         }
2072 
2073         for vcpu in self.vcpus.iter() {
2074             let mut vcpu = vcpu.lock().unwrap();
2075             vcpu.pause()?;
2076             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2077             if !self.config.kvm_hyperv {
2078                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2079                     MigratableError::Pause(anyhow!(
2080                         "Could not notify guest it has been paused {:?}",
2081                         e
2082                     ))
2083                 })?;
2084             }
2085         }
2086 
2087         Ok(())
2088     }
2089 
2090     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2091         for vcpu in self.vcpus.iter() {
2092             vcpu.lock().unwrap().resume()?;
2093         }
2094 
2095         // Toggle the vCPUs pause boolean
2096         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2097 
2098         // Unpark all the VCPU threads.
2099         // Once unparked, the next thing they will do is check the pause
2100         // boolean. Since it'll be set to false, they will exit their pause loop
2101         // and go back to VMX root.
2102         for state in self.vcpu_states.iter() {
2103             state.unpark_thread();
2104         }
2105         Ok(())
2106     }
2107 }
2108 
2109 impl Snapshottable for CpuManager {
2110     fn id(&self) -> String {
2111         CPU_MANAGER_SNAPSHOT_ID.to_string()
2112     }
2113 
2114     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2115         let mut cpu_manager_snapshot = Snapshot::default();
2116 
2117         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2118         for vcpu in &self.vcpus {
2119             let mut vcpu = vcpu.lock().unwrap();
2120             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2121         }
2122 
2123         Ok(cpu_manager_snapshot)
2124     }
2125 }
2126 
2127 impl Transportable for CpuManager {}
2128 impl Migratable for CpuManager {}
2129 
2130 #[cfg(feature = "guest_debug")]
2131 impl Debuggable for CpuManager {
2132     #[cfg(feature = "kvm")]
2133     fn set_guest_debug(
2134         &self,
2135         cpu_id: usize,
2136         addrs: &[GuestAddress],
2137         singlestep: bool,
2138     ) -> std::result::Result<(), DebuggableError> {
2139         self.vcpus[cpu_id]
2140             .lock()
2141             .unwrap()
2142             .vcpu
2143             .set_guest_debug(addrs, singlestep)
2144             .map_err(DebuggableError::SetDebug)
2145     }
2146 
2147     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2148         Ok(())
2149     }
2150 
2151     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2152         Ok(())
2153     }
2154 
2155     #[cfg(target_arch = "x86_64")]
2156     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2157         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, R8-R15
2158         let gregs = self
2159             .get_regs(cpu_id as u8)
2160             .map_err(DebuggableError::ReadRegs)?;
2161         let regs = [
2162             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
2163             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
2164         ];
2165 
2166         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2167         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2168         let eflags = gregs.rflags as u32;
2169         let rip = gregs.rip;
2170 
2171         // Segment registers: CS, SS, DS, ES, FS, GS
2172         let sregs = self
2173             .get_sregs(cpu_id as u8)
2174             .map_err(DebuggableError::ReadRegs)?;
2175         let segments = X86SegmentRegs {
2176             cs: sregs.cs.selector as u32,
2177             ss: sregs.ss.selector as u32,
2178             ds: sregs.ds.selector as u32,
2179             es: sregs.es.selector as u32,
2180             fs: sregs.fs.selector as u32,
2181             gs: sregs.gs.selector as u32,
2182         };
2183 
2184         // TODO: Add other registers
2185 
2186         Ok(CoreRegs {
2187             regs,
2188             eflags,
2189             rip,
2190             segments,
2191             ..Default::default()
2192         })
2193     }
2194 
2195     #[cfg(target_arch = "aarch64")]
2196     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2197         let gregs = self
2198             .get_regs(cpu_id as u8)
2199             .map_err(DebuggableError::ReadRegs)?;
2200         Ok(CoreRegs {
2201             x: gregs.regs.regs,
2202             sp: gregs.regs.sp,
2203             pc: gregs.regs.pc,
2204             ..Default::default()
2205         })
2206     }
2207 
2208     #[cfg(target_arch = "x86_64")]
2209     fn write_regs(
2210         &self,
2211         cpu_id: usize,
2212         regs: &CoreRegs,
2213     ) -> std::result::Result<(), DebuggableError> {
2214         let orig_gregs = self
2215             .get_regs(cpu_id as u8)
2216             .map_err(DebuggableError::ReadRegs)?;
2217         let gregs = StandardRegisters {
2218             rax: regs.regs[0],
2219             rbx: regs.regs[1],
2220             rcx: regs.regs[2],
2221             rdx: regs.regs[3],
2222             rsi: regs.regs[4],
2223             rdi: regs.regs[5],
2224             rbp: regs.regs[6],
2225             rsp: regs.regs[7],
2226             r8: regs.regs[8],
2227             r9: regs.regs[9],
2228             r10: regs.regs[10],
2229             r11: regs.regs[11],
2230             r12: regs.regs[12],
2231             r13: regs.regs[13],
2232             r14: regs.regs[14],
2233             r15: regs.regs[15],
2234             rip: regs.rip,
2235             // Update the lower 32-bit of rflags.
2236             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2237         };
2238 
2239         self.set_regs(cpu_id as u8, &gregs)
2240             .map_err(DebuggableError::WriteRegs)?;
2241 
2242         // Segment registers: CS, SS, DS, ES, FS, GS
2243         // Since GDB cares only about selectors, we call get_sregs() first.
2244         let mut sregs = self
2245             .get_sregs(cpu_id as u8)
2246             .map_err(DebuggableError::ReadRegs)?;
2247         sregs.cs.selector = regs.segments.cs as u16;
2248         sregs.ss.selector = regs.segments.ss as u16;
2249         sregs.ds.selector = regs.segments.ds as u16;
2250         sregs.es.selector = regs.segments.es as u16;
2251         sregs.fs.selector = regs.segments.fs as u16;
2252         sregs.gs.selector = regs.segments.gs as u16;
2253 
2254         self.set_sregs(cpu_id as u8, &sregs)
2255             .map_err(DebuggableError::WriteRegs)?;
2256 
2257         // TODO: Add other registers
2258 
2259         Ok(())
2260     }
2261 
2262     #[cfg(target_arch = "aarch64")]
2263     fn write_regs(
2264         &self,
2265         cpu_id: usize,
2266         regs: &CoreRegs,
2267     ) -> std::result::Result<(), DebuggableError> {
2268         let mut gregs = self
2269             .get_regs(cpu_id as u8)
2270             .map_err(DebuggableError::ReadRegs)?;
2271 
2272         gregs.regs.regs = regs.x;
2273         gregs.regs.sp = regs.sp;
2274         gregs.regs.pc = regs.pc;
2275 
2276         self.set_regs(cpu_id as u8, &gregs)
2277             .map_err(DebuggableError::WriteRegs)?;
2278 
2279         Ok(())
2280     }
2281 
2282     fn read_mem(
2283         &self,
2284         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2285         cpu_id: usize,
2286         vaddr: GuestAddress,
2287         len: usize,
2288     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2289         let mut buf = vec![0; len];
2290         let mut total_read = 0_u64;
2291 
2292         while total_read < len as u64 {
2293             let gaddr = vaddr.0 + total_read;
2294             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2295                 Ok(paddr) => paddr,
2296                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2297                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2298             };
2299             let psize = arch::PAGE_SIZE as u64;
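            // Read at most up to the next page boundary: contiguous GVAs may map to
            // non-contiguous GPAs, so every page is translated separately.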
2300             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2301             guest_memory
2302                 .memory()
2303                 .read(
2304                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2305                     GuestAddress(paddr),
2306                 )
2307                 .map_err(DebuggableError::ReadMem)?;
2308             total_read += read_len;
2309         }
2310         Ok(buf)
2311     }
2312 
2313     fn write_mem(
2314         &self,
2315         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2316         cpu_id: usize,
2317         vaddr: &GuestAddress,
2318         data: &[u8],
2319     ) -> std::result::Result<(), DebuggableError> {
2320         let mut total_written = 0_u64;
2321 
2322         while total_written < data.len() as u64 {
2323             let gaddr = vaddr.0 + total_written;
2324             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2325                 Ok(paddr) => paddr,
2326                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2327                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2328             };
2329             let psize = arch::PAGE_SIZE as u64;
2330             let write_len = std::cmp::min(
2331                 data.len() as u64 - total_written,
2332                 psize - (paddr & (psize - 1)),
2333             );
2334             guest_memory
2335                 .memory()
2336                 .write(
2337                     &data[total_written as usize..total_written as usize + write_len as usize],
2338                     GuestAddress(paddr),
2339                 )
2340                 .map_err(DebuggableError::WriteMem)?;
2341             total_written += write_len;
2342         }
2343         Ok(())
2344     }
2345 
2346     fn active_vcpus(&self) -> usize {
2347         self.present_vcpus() as usize
2348     }
2349 }
2350 
2351 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2352 impl Elf64Writable for CpuManager {}
2353 
2354 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2355 impl CpuElf64Writable for CpuManager {
2356     fn cpu_write_elf64_note(
2357         &mut self,
2358         dump_state: &DumpState,
2359     ) -> std::result::Result<(), GuestDebuggableError> {
2360         let mut coredump_file = dump_state.file.as_ref().unwrap();
2361         for vcpu in &self.vcpus {
2362             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2363             let mut pos: usize = 0;
2364             let mut buf = vec![0; note_size as usize];
2365             let descsz = size_of::<X86_64ElfPrStatus>();
2366             let vcpu_id = vcpu.lock().unwrap().id;
2367 
2368             let note = Elf64_Nhdr {
2369                 n_namesz: COREDUMP_NAME_SIZE,
2370                 n_descsz: descsz as u32,
2371                 n_type: NT_PRSTATUS,
2372             };
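            // Note layout: Elf64_Nhdr, then the 4-byte-aligned name ("CORE"), then the
            // prstatus descriptor; pr_pid sits at offset 32 into the descriptor and the
            // user registers are written near its end.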
2373 
2374             let bytes: &[u8] = note.as_slice();
2375             buf.splice(0.., bytes.to_vec());
2376             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2377             buf.resize(pos + 4, 0);
2378             buf.splice(pos.., "CORE".to_string().into_bytes());
2379 
2380             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2381             buf.resize(pos + 32 + 4, 0);
2382             let pid = vcpu_id as u64;
2383             let bytes: &[u8] = pid.as_slice();
2384             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2385 
2386             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2387 
2388             let orig_rax: u64 = 0;
2389             let gregs = self.vcpus[usize::from(vcpu_id)]
2390                 .lock()
2391                 .unwrap()
2392                 .vcpu
2393                 .get_regs()
2394                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2395 
2396             let regs1 = [
2397                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2398                 gregs.r10,
2399             ];
2400             let regs2 = [
2401                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2402             ];
2403 
2404             let sregs = self.vcpus[usize::from(vcpu_id)]
2405                 .lock()
2406                 .unwrap()
2407                 .vcpu
2408                 .get_sregs()
2409                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2410 
2411             debug!(
2412                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2413                 gregs.rip,
2414                 gregs.rsp,
2415                 sregs.gs.base,
2416                 sregs.cs.selector,
2417                 sregs.ss.selector,
2418                 sregs.ds.selector,
2419             );
2420 
2421             let regs = X86_64UserRegs {
2422                 regs1,
2423                 regs2,
2424                 rip: gregs.rip,
2425                 cs: sregs.cs.selector as u64,
2426                 eflags: gregs.rflags,
2427                 rsp: gregs.rsp,
2428                 ss: sregs.ss.selector as u64,
2429                 fs_base: sregs.fs.base,
2430                 gs_base: sregs.gs.base,
2431                 ds: sregs.ds.selector as u64,
2432                 es: sregs.es.selector as u64,
2433                 fs: sregs.fs.selector as u64,
2434                 gs: sregs.gs.selector as u64,
2435             };
2436 
2437             // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
2438             let bytes: &[u8] = regs.as_slice();
2439             buf.resize(note_size as usize, 0);
2440             buf.splice(pos.., bytes.to_vec());
2441             buf.resize(note_size as usize, 0);
2442 
2443             coredump_file
2444                 .write(&buf)
2445                 .map_err(GuestDebuggableError::CoredumpFile)?;
2446         }
2447 
2448         Ok(())
2449     }
2450 
2451     fn cpu_write_vmm_note(
2452         &mut self,
2453         dump_state: &DumpState,
2454     ) -> std::result::Result<(), GuestDebuggableError> {
2455         let mut coredump_file = dump_state.file.as_ref().unwrap();
2456         for vcpu in &self.vcpus {
2457             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2458             let mut pos: usize = 0;
2459             let mut buf = vec![0; note_size as usize];
2460             let descsz = size_of::<DumpCpusState>();
2461             let vcpu_id = vcpu.lock().unwrap().id;
2462 
2463             let note = Elf64_Nhdr {
2464                 n_namesz: COREDUMP_NAME_SIZE,
2465                 n_descsz: descsz as u32,
2466                 n_type: 0,
2467             };
2468 
2469             let bytes: &[u8] = note.as_slice();
2470             buf.splice(0.., bytes.to_vec());
2471             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2472 
2473             buf.resize(pos + 4, 0);
2474             buf.splice(pos.., "QEMU".to_string().into_bytes());
2475 
2476             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2477 
2478             let gregs = self.vcpus[usize::from(vcpu_id)]
2479                 .lock()
2480                 .unwrap()
2481                 .vcpu
2482                 .get_regs()
2483                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2484 
2485             let regs1 = [
2486                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2487                 gregs.rbp,
2488             ];
2489 
2490             let regs2 = [
2491                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2492                 gregs.r15,
2493             ];
2494 
2495             let sregs = self.vcpus[usize::from(vcpu_id)]
2496                 .lock()
2497                 .unwrap()
2498                 .vcpu
2499                 .get_sregs()
2500                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2501 
2502             let mut msrs = vec![MsrEntry {
2503                 index: msr_index::MSR_KERNEL_GS_BASE,
2504                 ..Default::default()
2505             }];
2506 
2507             self.vcpus[vcpu_id as usize]
2508                 .lock()
2509                 .unwrap()
2510                 .vcpu
2511                 .get_msrs(&mut msrs)
2512                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2513             let kernel_gs_base = msrs[0].data;
2514 
2515             let cs = CpuSegment::new(sregs.cs);
2516             let ds = CpuSegment::new(sregs.ds);
2517             let es = CpuSegment::new(sregs.es);
2518             let fs = CpuSegment::new(sregs.fs);
2519             let gs = CpuSegment::new(sregs.gs);
2520             let ss = CpuSegment::new(sregs.ss);
2521             let ldt = CpuSegment::new(sregs.ldt);
2522             let tr = CpuSegment::new(sregs.tr);
2523             let gdt = CpuSegment::new_from_table(sregs.gdt);
2524             let idt = CpuSegment::new_from_table(sregs.idt);
2525             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2526             let regs = DumpCpusState {
2527                 version: 1,
2528                 size: size_of::<DumpCpusState>() as u32,
2529                 regs1,
2530                 regs2,
2531                 rip: gregs.rip,
2532                 rflags: gregs.rflags,
2533                 cs,
2534                 ds,
2535                 es,
2536                 fs,
2537                 gs,
2538                 ss,
2539                 ldt,
2540                 tr,
2541                 gdt,
2542                 idt,
2543                 cr,
2544                 kernel_gs_base,
2545             };
2546 
2547             let bytes: &[u8] = regs.as_slice();
2548             buf.resize(note_size as usize, 0);
2549             buf.splice(pos.., bytes.to_vec());
2550             buf.resize(note_size as usize, 0);
2551 
2552             coredump_file
2553                 .write(&buf)
2554                 .map_err(GuestDebuggableError::CoredumpFile)?;
2555         }
2556 
2557         Ok(())
2558     }
2559 }
2560 
2561 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2562 #[cfg(test)]
2563 mod tests {
2564     use arch::x86_64::interrupts::*;
2565     use arch::x86_64::regs::*;
2566     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2567 
2568     #[test]
2569     fn test_setlint() {
2570         let hv = hypervisor::new().unwrap();
2571         let vm = hv.create_vm().expect("new VM fd creation failed");
2572         assert!(hv.check_required_extensions().is_ok());
2573         // Calling get_lapic will fail if there is no irqchip beforehand.
2574         assert!(vm.create_irq_chip().is_ok());
2575         let vcpu = vm.create_vcpu(0, None).unwrap();
2576         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2577 
2578         // Compute the value that is expected to represent LVT0 and LVT1.
2579         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2580         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2581         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2582         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2583 
2584         set_lint(&vcpu).unwrap();
2585 
2586         // Compute the value that represents LVT0 and LVT1 after set_lint.
2587         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2588         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2589         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2590         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2591         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2592     }
2593 
2594     #[test]
2595     fn test_setup_fpu() {
2596         let hv = hypervisor::new().unwrap();
2597         let vm = hv.create_vm().expect("new VM fd creation failed");
2598         let vcpu = vm.create_vcpu(0, None).unwrap();
2599         setup_fpu(&vcpu).unwrap();
2600 
2601         let expected_fpu: FpuState = FpuState {
2602             fcw: 0x37f,
2603             mxcsr: 0x1f80,
2604             ..Default::default()
2605         };
2606         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2607         // TODO: auto-generate kvm related structures with PartialEq on.
2608         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2609         // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
2610         // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
2611         // The mxcsr will stay 0 and the assert below fails. Decide whether we should
2612         // remove it altogether.
2613         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2614     }
2615 
2616     #[test]
2617     fn test_setup_msrs() {
2618         use hypervisor::arch::x86::{msr_index, MsrEntry};
2619 
2620         let hv = hypervisor::new().unwrap();
2621         let vm = hv.create_vm().expect("new VM fd creation failed");
2622         let vcpu = vm.create_vcpu(0, None).unwrap();
2623         setup_msrs(&vcpu).unwrap();
2624 
2625         // This test will check against the last MSR entry configured (the tenth one).
2626         // See create_msr_entries for details.
2627         let mut msrs = vec![MsrEntry {
2628             index: msr_index::MSR_IA32_MISC_ENABLE,
2629             ..Default::default()
2630         }];
2631 
2632         // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read 1
2633         // in this test case scenario.
2634         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2635         assert_eq!(read_msrs, 1);
2636 
2637         // Official entries that were set up when we called setup_msrs. We need to assert that the
2638         // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
2639         // expect.
2640         let entry_vec = vcpu.boot_msr_entries();
2641         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2642     }
2643 
2644     #[test]
2645     fn test_setup_regs() {
2646         let hv = hypervisor::new().unwrap();
2647         let vm = hv.create_vm().expect("new VM fd creation failed");
2648         let vcpu = vm.create_vcpu(0, None).unwrap();
2649 
2650         let expected_regs: StandardRegisters = StandardRegisters {
2651             rflags: 0x0000000000000002u64,
2652             rbx: arch::layout::PVH_INFO_START.0,
2653             rip: 1,
2654             ..Default::default()
2655         };
2656 
2657         setup_regs(&vcpu, expected_regs.rip).unwrap();
2658 
2659         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2660         assert_eq!(actual_regs, expected_regs);
2661     }
2662 }
2663 
2664 #[cfg(target_arch = "aarch64")]
2665 #[cfg(test)]
2666 mod tests {
2667     use arch::{aarch64::regs, layout};
2668     use hypervisor::kvm::aarch64::is_system_register;
2669     use hypervisor::kvm::kvm_bindings::{
2670         kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2671         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2672     };
2673     use hypervisor::{arm64_core_reg_id, offset_of};
2674     use std::mem;
2675 
2676     #[test]
2677     fn test_setup_regs() {
2678         let hv = hypervisor::new().unwrap();
2679         let vm = hv.create_vm().unwrap();
2680         let vcpu = vm.create_vcpu(0, None).unwrap();
2681 
2682         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2683         // Must fail when vcpu is not initialized yet.
2684         assert!(res.is_err());
2685 
2686         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2687         vm.get_preferred_target(&mut kvi).unwrap();
2688         vcpu.vcpu_init(&kvi).unwrap();
2689 
2690         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2691     }
2692 
2693     #[test]
2694     fn test_read_mpidr() {
2695         let hv = hypervisor::new().unwrap();
2696         let vm = hv.create_vm().unwrap();
2697         let vcpu = vm.create_vcpu(0, None).unwrap();
2698         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2699         vm.get_preferred_target(&mut kvi).unwrap();
2700 
2701         // Must fail when vcpu is not initialized yet.
2702         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2703 
2704         vcpu.vcpu_init(&kvi).unwrap();
2705         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2706     }
2707 
2708     #[test]
2709     fn test_is_system_register() {
2710         let offset = offset_of!(user_pt_regs, pc);
2711         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2712         assert!(!is_system_register(regid));
2713         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2714         assert!(is_system_register(regid));
2715     }
2716 
2717     #[test]
2718     fn test_save_restore_core_regs() {
2719         let hv = hypervisor::new().unwrap();
2720         let vm = hv.create_vm().unwrap();
2721         let vcpu = vm.create_vcpu(0, None).unwrap();
2722         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2723         vm.get_preferred_target(&mut kvi).unwrap();
2724 
2725         // Must fail when vcpu is not initialized yet.
2726         let res = vcpu.get_regs();
2727         assert!(res.is_err());
2728         assert_eq!(
2729             format!("{}", res.unwrap_err()),
2730             "Failed to get core register: Exec format error (os error 8)"
2731         );
2732 
2733         let mut state = kvm_regs::default();
2734         let res = vcpu.set_regs(&state);
2735         assert!(res.is_err());
2736         assert_eq!(
2737             format!("{}", res.unwrap_err()),
2738             "Failed to set core register: Exec format error (os error 8)"
2739         );
2740 
2741         vcpu.vcpu_init(&kvi).unwrap();
2742         let res = vcpu.get_regs();
2743         assert!(res.is_ok());
2744         state = res.unwrap();
2745         assert_eq!(state.regs.pstate, 0x3C5);
2746 
2747         assert!(vcpu.set_regs(&state).is_ok());
2748     }
2749 
2750     #[test]
2751     fn test_get_set_mpstate() {
2752         let hv = hypervisor::new().unwrap();
2753         let vm = hv.create_vm().unwrap();
2754         let vcpu = vm.create_vcpu(0, None).unwrap();
2755         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2756         vm.get_preferred_target(&mut kvi).unwrap();
2757 
2758         let res = vcpu.get_mp_state();
2759         assert!(res.is_ok());
2760         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2761     }
2762 }
2763