xref: /cloud-hypervisor/vmm/src/cpu.rs (revision 5e52729453cb62edbe4fb3a4aa24f8cca31e667e)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::CpusConfig;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use crate::coredump::{
17     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
18     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
19     NT_PRSTATUS,
20 };
21 #[cfg(feature = "guest_debug")]
22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
23 #[cfg(target_arch = "x86_64")]
24 use crate::memory_manager::MemoryManager;
25 use crate::seccomp_filters::{get_seccomp_filter, Thread};
26 #[cfg(target_arch = "x86_64")]
27 use crate::vm::physical_bits;
28 use crate::GuestMemoryMmap;
29 use crate::CPU_MANAGER_SNAPSHOT_ID;
30 use acpi_tables::{aml, aml::Aml, sdt::Sdt};
31 use anyhow::anyhow;
32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
33 use arch::aarch64::regs;
34 use arch::EntryPoint;
35 use arch::NumaNodes;
36 #[cfg(target_arch = "aarch64")]
37 use devices::gic::Gic;
38 use devices::interrupt_controller::InterruptController;
39 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
40 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
41 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
42 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
43 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
44 use hypervisor::aarch64::StandardRegisters;
45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
46 use hypervisor::arch::x86::msr_index;
47 #[cfg(target_arch = "x86_64")]
48 use hypervisor::arch::x86::CpuIdEntry;
49 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
50 use hypervisor::arch::x86::MsrEntry;
51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
52 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
53 #[cfg(target_arch = "aarch64")]
54 use hypervisor::kvm::kvm_bindings;
55 #[cfg(feature = "tdx")]
56 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
57 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
58 use libc::{c_void, siginfo_t};
59 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
60 use linux_loader::elf::Elf64_Nhdr;
61 use seccompiler::{apply_filter, SeccompAction};
62 use std::collections::BTreeMap;
63 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
64 use std::io::Write;
65 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
66 use std::mem::size_of;
67 use std::os::unix::thread::JoinHandleExt;
68 use std::sync::atomic::{AtomicBool, Ordering};
69 use std::sync::{Arc, Barrier, Mutex};
70 use std::{cmp, io, result, thread};
71 use thiserror::Error;
72 use tracer::trace_scoped;
73 use vm_device::BusDevice;
74 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
75 use vm_memory::ByteValued;
76 #[cfg(feature = "guest_debug")]
77 use vm_memory::{Bytes, GuestAddressSpace};
78 use vm_memory::{GuestAddress, GuestMemoryAtomic};
79 use vm_migration::{
80     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
81     Transportable,
82 };
83 use vmm_sys_util::eventfd::EventFd;
84 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
85 
86 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
87 /// Extract the specified bits of a 64-bit integer.
88 /// For example, to extract 2 bits from offset 1 (zero-based) of `6u64`,
89 /// the following expression should return 3 (`0b11`):
90 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
91 ///
92 macro_rules! extract_bits_64 {
93     ($value: tt, $offset: tt, $length: tt) => {
94         ($value >> $offset) & (!0u64 >> (64 - $length))
95     };
96 }
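
// Illustrative check of the doc-comment example above; an editor's sketch,
// not part of the upstream source, gated the same way as the macro itself.
#[cfg(all(target_arch = "aarch64", feature = "guest_debug", test))]
mod extract_bits_64_example {
    #[test]
    fn doc_comment_example() {
        // (0b0000_0110u64 >> 1) & (!0u64 >> 62) == 0b11 == 3
        assert_eq!(extract_bits_64!(0b0000_0110u64, 1, 2), 3);
    }
}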
97 
98 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
99 
100 #[derive(Debug, Error)]
101 pub enum Error {
102     #[error("Error creating vCPU: {0}")]
103     VcpuCreate(#[source] anyhow::Error),
104 
105     #[error("Error running vCPU: {0}")]
106     VcpuRun(#[source] anyhow::Error),
107 
108     #[error("Error spawning vCPU thread: {0}")]
109     VcpuSpawn(#[source] io::Error),
110 
111     #[error("Error generating common CPUID: {0}")]
112     CommonCpuId(#[source] arch::Error),
113 
114     #[error("Error configuring vCPU: {0}")]
115     VcpuConfiguration(#[source] arch::Error),
116 
117     #[cfg(target_arch = "aarch64")]
118     #[error("Error fetching preferred target: {0}")]
119     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
120 
121     #[cfg(target_arch = "aarch64")]
122     #[error("Error initialising vCPU: {0}")]
123     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
124 
125     #[error("Failed to join on vCPU threads: {0:?}")]
126     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
127 
128     #[error("Error adding CpuManager to MMIO bus: {0}")]
129     BusError(#[source] vm_device::BusError),
130 
131     #[error("Requested vCPUs exceed maximum")]
132     DesiredVCpuCountExceedsMax,
133 
134     #[error("Cannot create seccomp filter: {0}")]
135     CreateSeccompFilter(#[source] seccompiler::Error),
136 
137     #[error("Cannot apply seccomp filter: {0}")]
138     ApplySeccompFilter(#[source] seccompiler::Error),
139 
140     #[error("Error starting vCPU after restore: {0}")]
141     StartRestoreVcpu(#[source] anyhow::Error),
142 
143     #[error("Unexpected VmExit")]
144     UnexpectedVmExit,
145 
146     #[error("Failed to allocate MMIO address for CpuManager")]
147     AllocateMmmioAddress,
148 
149     #[cfg(feature = "tdx")]
150     #[error("Error initializing TDX: {0}")]
151     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
152 
153     #[cfg(target_arch = "aarch64")]
154     #[error("Error initializing PMU: {0}")]
155     InitPmu(#[source] hypervisor::HypervisorCpuError),
156 
157     #[cfg(feature = "guest_debug")]
158     #[error("Error during CPU debug: {0}")]
159     CpuDebug(#[source] hypervisor::HypervisorCpuError),
160 
161     #[cfg(feature = "guest_debug")]
162     #[error("Error translating virtual address: {0}")]
163     TranslateVirtualAddress(#[source] anyhow::Error),
164 
165     #[cfg(target_arch = "x86_64")]
166     #[error("Error setting up AMX: {0}")]
167     AmxEnable(#[source] anyhow::Error),
168 }
169 pub type Result<T> = result::Result<T, Error>;
170 
171 #[cfg(target_arch = "x86_64")]
172 #[allow(dead_code)]
173 #[repr(packed)]
174 struct LocalApic {
175     pub r#type: u8,
176     pub length: u8,
177     pub processor_id: u8,
178     pub apic_id: u8,
179     pub flags: u32,
180 }
181 
182 #[allow(dead_code)]
183 #[repr(packed)]
184 #[derive(Default)]
185 struct Ioapic {
186     pub r#type: u8,
187     pub length: u8,
188     pub ioapic_id: u8,
189     _reserved: u8,
190     pub apic_address: u32,
191     pub gsi_base: u32,
192 }
193 
194 #[cfg(target_arch = "aarch64")]
195 #[allow(dead_code)]
196 #[repr(packed)]
197 struct GicC {
198     pub r#type: u8,
199     pub length: u8,
200     pub reserved0: u16,
201     pub cpu_interface_number: u32,
202     pub uid: u32,
203     pub flags: u32,
204     pub parking_version: u32,
205     pub performance_interrupt: u32,
206     pub parked_address: u64,
207     pub base_address: u64,
208     pub gicv_base_address: u64,
209     pub gich_base_address: u64,
210     pub vgic_interrupt: u32,
211     pub gicr_base_address: u64,
212     pub mpidr: u64,
213     pub proc_power_effi_class: u8,
214     pub reserved1: u8,
215     pub spe_overflow_interrupt: u16,
216 }
217 
218 #[cfg(target_arch = "aarch64")]
219 #[allow(dead_code)]
220 #[repr(packed)]
221 struct GicD {
222     pub r#type: u8,
223     pub length: u8,
224     pub reserved0: u16,
225     pub gic_id: u32,
226     pub base_address: u64,
227     pub global_irq_base: u32,
228     pub version: u8,
229     pub reserved1: [u8; 3],
230 }
231 
232 #[cfg(target_arch = "aarch64")]
233 #[allow(dead_code)]
234 #[repr(packed)]
235 struct GicR {
236     pub r#type: u8,
237     pub length: u8,
238     pub reserved: u16,
239     pub base_address: u64,
240     pub range_length: u32,
241 }
242 
243 #[cfg(target_arch = "aarch64")]
244 #[allow(dead_code)]
245 #[repr(packed)]
246 struct GicIts {
247     pub r#type: u8,
248     pub length: u8,
249     pub reserved0: u16,
250     pub translation_id: u32,
251     pub base_address: u64,
252     pub reserved1: u32,
253 }
254 
255 #[cfg(target_arch = "aarch64")]
256 #[allow(dead_code)]
257 #[repr(packed)]
258 struct ProcessorHierarchyNode {
259     pub r#type: u8,
260     pub length: u8,
261     pub reserved: u16,
262     pub flags: u32,
263     pub parent: u32,
264     pub acpi_processor_id: u32,
265     pub num_private_resources: u32,
266 }
267 
268 #[allow(dead_code)]
269 #[repr(packed)]
270 #[derive(Default)]
271 struct InterruptSourceOverride {
272     pub r#type: u8,
273     pub length: u8,
274     pub bus: u8,
275     pub source: u8,
276     pub gsi: u32,
277     pub flags: u16,
278 }
279 
280 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
281 macro_rules! round_up {
282     ($n:expr,$d:expr) => {
283         (($n / ($d + 1)) + 1) * $d
284     };
285 }
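
// Illustrative expansion (editor's note, not in the upstream source):
// `round_up!(10, 4)` expands to `((10 / (4 + 1)) + 1) * 4`, i.e. (2 + 1) * 4 == 12.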
286 
287 /// A wrapper around creating and using a kvm-based VCPU.
288 pub struct Vcpu {
289     // The hypervisor abstracted CPU.
290     vcpu: Arc<dyn hypervisor::Vcpu>,
291     id: u8,
292     #[cfg(target_arch = "aarch64")]
293     mpidr: u64,
294     saved_state: Option<CpuState>,
295 }
296 
297 impl Vcpu {
298     /// Constructs a new VCPU for `vm`.
299     ///
300     /// # Arguments
301     ///
302     /// * `id` - Represents the CPU number between [0, max vcpus).
303     /// * `vm` - The virtual machine this vcpu will get attached to.
304     /// * `vm_ops` - Optional object for exit handling.
305     pub fn new(
306         id: u8,
307         vm: &Arc<dyn hypervisor::Vm>,
308         vm_ops: Option<Arc<dyn VmOps>>,
309     ) -> Result<Self> {
310         let vcpu = vm
311             .create_vcpu(id, vm_ops)
312             .map_err(|e| Error::VcpuCreate(e.into()))?;
313         // Initially the cpuid per vCPU is the one supported by this VM.
314         Ok(Vcpu {
315             vcpu,
316             id,
317             #[cfg(target_arch = "aarch64")]
318             mpidr: 0,
319             saved_state: None,
320         })
321     }
322 
323     /// Configures a vcpu and should be called once per vcpu when created.
324     ///
325     /// # Arguments
326     ///
327     /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
328     /// * `guest_memory` - Guest memory.
329     /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
330     pub fn configure(
331         &mut self,
332         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
333         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
334         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
335         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
336     ) -> Result<()> {
337         #[cfg(target_arch = "aarch64")]
338         {
339             self.init(vm)?;
340             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
341                 .map_err(Error::VcpuConfiguration)?;
342         }
343         info!("Configuring vCPU: cpu_id = {}", self.id);
344         #[cfg(target_arch = "x86_64")]
345         arch::configure_vcpu(&self.vcpu, self.id, boot_setup, cpuid, kvm_hyperv)
346             .map_err(Error::VcpuConfiguration)?;
347 
348         Ok(())
349     }
350 
351     /// Gets the MPIDR register value.
352     #[cfg(target_arch = "aarch64")]
353     pub fn get_mpidr(&self) -> u64 {
354         self.mpidr
355     }
356 
357     /// Gets the saved vCPU state.
358     #[cfg(target_arch = "aarch64")]
359     pub fn get_saved_state(&self) -> Option<CpuState> {
360         self.saved_state.clone()
361     }
362 
363     /// Initializes an aarch64 specific vcpu for booting Linux.
364     #[cfg(target_arch = "aarch64")]
365     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
366         let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
367 
368         // This reads back the kernel's preferred target type.
369         vm.get_preferred_target(&mut kvi)
370             .map_err(Error::VcpuArmPreferredTarget)?;
371         // We already checked that the capability is supported.
372         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
373         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
374         // Non-boot cpus are powered off initially.
375         if self.id > 0 {
376             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
377         }
378         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
379     }
380 
381     /// Runs the VCPU until it exits, returning the reason.
382     ///
383     /// Note that the state of the VCPU and associated VM must be setup first for this to do
384     /// anything useful.
385     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
386         self.vcpu.run()
387     }
388 }
389 
390 impl Pausable for Vcpu {}
391 impl Snapshottable for Vcpu {
392     fn id(&self) -> String {
393         self.id.to_string()
394     }
395 
396     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
397         let saved_state = self
398             .vcpu
399             .state()
400             .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?;
401 
402         self.saved_state = Some(saved_state.clone());
403 
404         Ok(Snapshot::from_data(SnapshotData::new_from_state(
405             &saved_state,
406         )?))
407     }
408 }
409 
410 pub struct CpuManager {
411     hypervisor_type: HypervisorType,
412     config: CpusConfig,
413     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
414     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
415     #[cfg(target_arch = "x86_64")]
416     cpuid: Vec<CpuIdEntry>,
417     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
418     vm: Arc<dyn hypervisor::Vm>,
419     vcpus_kill_signalled: Arc<AtomicBool>,
420     vcpus_pause_signalled: Arc<AtomicBool>,
421     exit_evt: EventFd,
422     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
423     reset_evt: EventFd,
424     #[cfg(feature = "guest_debug")]
425     vm_debug_evt: EventFd,
426     vcpu_states: Vec<VcpuState>,
427     selected_cpu: u8,
428     vcpus: Vec<Arc<Mutex<Vcpu>>>,
429     seccomp_action: SeccompAction,
430     vm_ops: Arc<dyn VmOps>,
431     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
432     acpi_address: Option<GuestAddress>,
433     proximity_domain_per_cpu: BTreeMap<u8, u32>,
434     affinity: BTreeMap<u8, Vec<u8>>,
435     dynamic: bool,
436 }
437 
438 const CPU_ENABLE_FLAG: usize = 0;
439 const CPU_INSERTING_FLAG: usize = 1;
440 const CPU_REMOVING_FLAG: usize = 2;
441 const CPU_EJECT_FLAG: usize = 3;
442 
443 const CPU_STATUS_OFFSET: u64 = 4;
444 const CPU_SELECTION_OFFSET: u64 = 0;
445 
446 impl BusDevice for CpuManager {
447     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
448         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
449         data.fill(0);
450 
451         match offset {
452             CPU_SELECTION_OFFSET => {
453                 data[0] = self.selected_cpu;
454             }
455             CPU_STATUS_OFFSET => {
456                 if self.selected_cpu < self.max_vcpus() {
457                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
458                     if state.active() {
459                         data[0] |= 1 << CPU_ENABLE_FLAG;
460                     }
461                     if state.inserting {
462                         data[0] |= 1 << CPU_INSERTING_FLAG;
463                     }
464                     if state.removing {
465                         data[0] |= 1 << CPU_REMOVING_FLAG;
466                     }
467                 } else {
468                     warn!("Out of range vCPU id: {}", self.selected_cpu);
469                 }
470             }
471             _ => {
472                 warn!(
473                     "Unexpected offset for accessing CPU manager device: {:#}",
474                     offset
475                 );
476             }
477         }
478     }
479 
480     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
481         match offset {
482             CPU_SELECTION_OFFSET => {
483                 self.selected_cpu = data[0];
484             }
485             CPU_STATUS_OFFSET => {
486                 if self.selected_cpu < self.max_vcpus() {
487                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
488                     // The ACPI code writes back a 1 to acknowledge the insertion
489                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
490                         && state.inserting
491                     {
492                         state.inserting = false;
493                     }
494                     // Ditto for removal
495                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
496                         && state.removing
497                     {
498                         state.removing = false;
499                     }
500                     // Trigger removal of vCPU
501                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
502                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
503                             error!("Error removing vCPU: {:?}", e);
504                         }
505                     }
506                 } else {
507                     warn!("Out of range vCPU id: {}", self.selected_cpu);
508                 }
509             }
510             _ => {
511                 warn!(
512                     "Unexpected offset for accessing CPU manager device: {:#}",
513                     offset
514                 );
515             }
516         }
517         None
518     }
519 }
520 
521 #[derive(Default)]
522 struct VcpuState {
523     inserting: bool,
524     removing: bool,
525     handle: Option<thread::JoinHandle<()>>,
526     kill: Arc<AtomicBool>,
527     vcpu_run_interrupted: Arc<AtomicBool>,
528 }
529 
530 impl VcpuState {
531     fn active(&self) -> bool {
532         self.handle.is_some()
533     }
534 
535     fn signal_thread(&self) {
536         if let Some(handle) = self.handle.as_ref() {
537             loop {
538                 // SAFETY: FFI call with correct arguments
539                 unsafe {
540                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
541                 }
542                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
543                     break;
544                 } else {
545                     // This is more effective than thread::yield_now() at
546                     // avoiding a priority inversion with the vCPU thread
547                     thread::sleep(std::time::Duration::from_millis(1));
548                 }
549             }
550         }
551     }
552 
553     fn join_thread(&mut self) -> Result<()> {
554         if let Some(handle) = self.handle.take() {
555             handle.join().map_err(Error::ThreadCleanup)?
556         }
557 
558         Ok(())
559     }
560 
561     fn unpark_thread(&self) {
562         if let Some(handle) = self.handle.as_ref() {
563             handle.thread().unpark()
564         }
565     }
566 }
567 
568 impl CpuManager {
569     #[allow(unused_variables)]
570     #[allow(clippy::too_many_arguments)]
571     pub fn new(
572         config: &CpusConfig,
573         vm: Arc<dyn hypervisor::Vm>,
574         exit_evt: EventFd,
575         reset_evt: EventFd,
576         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
577         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
578         seccomp_action: SeccompAction,
579         vm_ops: Arc<dyn VmOps>,
580         #[cfg(feature = "tdx")] tdx_enabled: bool,
581         numa_nodes: &NumaNodes,
582     ) -> Result<Arc<Mutex<CpuManager>>> {
583         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
584         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
585         let hypervisor_type = hypervisor.hypervisor_type();
586 
587         #[cfg(target_arch = "x86_64")]
588         if config.features.amx {
589             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
590             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
591             const XFEATURE_XTILEDATA: usize = 18;
592             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
593 
594             // SAFETY: the syscall is only modifying kernel internal
595             // data structures that the kernel is itself expected to safeguard.
596             let amx_tile = unsafe {
597                 libc::syscall(
598                     libc::SYS_arch_prctl,
599                     ARCH_REQ_XCOMP_GUEST_PERM,
600                     XFEATURE_XTILEDATA,
601                 )
602             };
603 
604             if amx_tile != 0 {
605                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
606             } else {
607                 let mask: usize = 0;
608                 // SAFETY: the mask being modified (not marked mutable as it is
609                 // only modified inside the unsafe block, which is permitted) isn't in use elsewhere.
610                 let result = unsafe {
611                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
612                 };
613                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
614                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
615                 }
616             }
617         }
618 
619         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
620             let mut cpu_list = Vec::new();
621             for (proximity_domain, numa_node) in numa_nodes.iter() {
622                 for cpu in numa_node.cpus.iter() {
623                     cpu_list.push((*cpu, *proximity_domain))
624                 }
625             }
626             cpu_list
627         }
628         .into_iter()
629         .collect();
630 
631         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
632             cpu_affinity
633                 .iter()
634                 .map(|a| (a.vcpu, a.host_cpus.clone()))
635                 .collect()
636         } else {
637             BTreeMap::new()
638         };
639 
640         #[cfg(feature = "tdx")]
641         let dynamic = !tdx_enabled;
642         #[cfg(not(feature = "tdx"))]
643         let dynamic = true;
644 
645         Ok(Arc::new(Mutex::new(CpuManager {
646             hypervisor_type,
647             config: config.clone(),
648             interrupt_controller: None,
649             #[cfg(target_arch = "x86_64")]
650             cpuid: Vec::new(),
651             vm,
652             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
653             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
654             vcpu_states,
655             exit_evt,
656             reset_evt,
657             #[cfg(feature = "guest_debug")]
658             vm_debug_evt,
659             selected_cpu: 0,
660             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
661             seccomp_action,
662             vm_ops,
663             acpi_address: None,
664             proximity_domain_per_cpu,
665             affinity,
666             dynamic,
667         })))
668     }
669 
670     #[cfg(target_arch = "x86_64")]
671     pub fn populate_cpuid(
672         &mut self,
673         memory_manager: &Arc<Mutex<MemoryManager>>,
674         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
675         #[cfg(feature = "tdx")] tdx_enabled: bool,
676     ) -> Result<()> {
677         let sgx_epc_sections = memory_manager
678             .lock()
679             .unwrap()
680             .sgx_epc_region()
681             .as_ref()
682             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
683         self.cpuid = {
684             let phys_bits = physical_bits(self.config.max_phys_bits);
685             arch::generate_common_cpuid(
686                 hypervisor,
687                 self.config
688                     .topology
689                     .clone()
690                     .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)),
691                 sgx_epc_sections,
692                 phys_bits,
693                 self.config.kvm_hyperv,
694                 #[cfg(feature = "tdx")]
695                 tdx_enabled,
696             )
697             .map_err(Error::CommonCpuId)?
698         };
699 
700         Ok(())
701     }
702 
703     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
704         info!("Creating vCPU: cpu_id = {}", cpu_id);
705 
706         let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?;
707 
708         if let Some(snapshot) = snapshot {
709             // AArch64 vCPUs should be initialized after being created.
710             #[cfg(target_arch = "aarch64")]
711             vcpu.init(&self.vm)?;
712 
713             let state: CpuState = snapshot.to_state().map_err(|e| {
714                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
715             })?;
716             vcpu.vcpu
717                 .set_state(&state)
718                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
719 
720             vcpu.saved_state = Some(state);
721         }
722 
723         let vcpu = Arc::new(Mutex::new(vcpu));
724 
725         // Adding vCPU to the CpuManager's vCPU list.
726         self.vcpus.push(vcpu.clone());
727 
728         Ok(vcpu)
729     }
730 
731     pub fn configure_vcpu(
732         &self,
733         vcpu: Arc<Mutex<Vcpu>>,
734         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
735     ) -> Result<()> {
736         let mut vcpu = vcpu.lock().unwrap();
737 
738         #[cfg(target_arch = "x86_64")]
739         assert!(!self.cpuid.is_empty());
740 
741         #[cfg(target_arch = "x86_64")]
742         vcpu.configure(boot_setup, self.cpuid.clone(), self.config.kvm_hyperv)?;
743 
744         #[cfg(target_arch = "aarch64")]
745         vcpu.configure(&self.vm, boot_setup)?;
746 
747         Ok(())
748     }
749 
750     /// Only create new vCPUs if there aren't any inactive ones to reuse
751     fn create_vcpus(
752         &mut self,
753         desired_vcpus: u8,
754         snapshot: Option<Snapshot>,
755     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
756         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
757         info!(
758             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
759             desired_vcpus,
760             self.config.max_vcpus,
761             self.vcpus.len(),
762             self.present_vcpus()
763         );
764 
765         if desired_vcpus > self.config.max_vcpus {
766             return Err(Error::DesiredVCpuCountExceedsMax);
767         }
768 
769         // Only create vCPUs in excess of all the allocated vCPUs.
770         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
771             vcpus.push(self.create_vcpu(
772                 cpu_id,
773                 // TODO: The special format of the CPU id can be removed once
774                 // ready to break live upgrade.
775                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
776             )?);
777         }
778 
779         Ok(vcpus)
780     }
781 
782     #[cfg(target_arch = "aarch64")]
783     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
784         for cpu in self.vcpus.iter() {
785             let cpu = cpu.lock().unwrap();
786             // Check if PMU attr is available, if not, log the information.
787             if cpu.vcpu.has_pmu_support() {
788                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
789             } else {
790                 debug!(
791                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
792                     cpu.id
793                 );
794                 return Ok(false);
795             }
796         }
797 
798         Ok(true)
799     }
800 
801     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
802         self.vcpus.clone()
803     }
804 
805     fn start_vcpu(
806         &mut self,
807         vcpu: Arc<Mutex<Vcpu>>,
808         vcpu_id: u8,
809         vcpu_thread_barrier: Arc<Barrier>,
810         inserting: bool,
811     ) -> Result<()> {
812         let reset_evt = self.reset_evt.try_clone().unwrap();
813         let exit_evt = self.exit_evt.try_clone().unwrap();
814         #[cfg(feature = "guest_debug")]
815         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
816         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
817         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
818         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
819 
820         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
821         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
822             .vcpu_run_interrupted
823             .clone();
824         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
825 
826         // Prepare the CPU set the current vCPU is expected to run on.
827         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
828             // SAFETY: all zeros is a valid pattern
829             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
830             // SAFETY: FFI call, trivially safe
831             unsafe { libc::CPU_ZERO(&mut cpuset) };
832             for host_cpu in host_cpus {
833                 // SAFETY: FFI call, trivially safe
834                 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) };
835             }
836             cpuset
837         });
838 
839         // Retrieve seccomp filter for vcpu thread
840         let vcpu_seccomp_filter =
841             get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type)
842                 .map_err(Error::CreateSeccompFilter)?;
843 
844         #[cfg(target_arch = "x86_64")]
845         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
846 
847         info!("Starting vCPU: cpu_id = {}", vcpu_id);
848 
849         let handle = Some(
850             thread::Builder::new()
851                 .name(format!("vcpu{vcpu_id}"))
852                 .spawn(move || {
853                     // Schedule the thread to run on the expected CPU set
854                     if let Some(cpuset) = cpuset.as_ref() {
855                         // SAFETY: FFI call with correct arguments
856                         let ret = unsafe {
857                             libc::sched_setaffinity(
858                                 0,
859                                 std::mem::size_of::<libc::cpu_set_t>(),
860                                 cpuset as *const libc::cpu_set_t,
861                             )
862                         };
863 
864                         if ret != 0 {
865                             error!(
866                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
867                                 vcpu_id,
868                                 io::Error::last_os_error()
869                             );
870                             return;
871                         }
872                     }
873 
874                     // Apply seccomp filter for vcpu thread.
875                     if !vcpu_seccomp_filter.is_empty() {
876                         if let Err(e) =
877                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
878                         {
879                             error!("Error applying seccomp filter: {:?}", e);
880                             return;
881                         }
882                     }
883                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
884                     // This uses an async-signal-safe no-op handler so the signal can interrupt the vCPU threads.
885                     register_signal_handler(SIGRTMIN(), handle_signal)
886                         .expect("Failed to register vcpu signal handler");
887                     // Block until all CPUs are ready.
888                     vcpu_thread_barrier.wait();
889 
890                     std::panic::catch_unwind(move || {
891                         loop {
892                             // If we are being told to pause, we park the thread
893                             // until the pause boolean is toggled.
894                             // The resume operation is responsible for toggling
895                             // the boolean and unpark the thread.
896                             // We enter a loop because park() could spuriously
897                             // return. We will then park() again unless the
898                             // pause boolean has been toggled.
899 
900                             // Need to use Ordering::SeqCst as we have multiple
901                             // loads and stores to different atomics and we need
902                             // to see them in a consistent order in all threads
903 
904                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
905                                 // As a pause can be caused by PIO & MMIO exits, we need to ensure they are
906                                 // completed by returning to KVM_RUN. From the kernel docs:
907                                 //
908                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
909                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
910                                 // operations are complete (and guest state is consistent) only after userspace
911                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
912                                 // incomplete operations and then check for pending signals.
913                                 // The pending state of the operation is not preserved in state which is
914                                 // visible to userspace, thus userspace should ensure that the operation is
915                                 // completed before performing a live migration.  Userspace can re-enter the
916                                 // guest with an unmasked signal pending or with the immediate_exit field set
917                                 // to complete pending operations without allowing any further instructions
918                                 // to be executed.
919 
920                                 #[cfg(feature = "kvm")]
921                                 {
922                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
923                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
924                                         error!("Unexpected VM exit on \"immediate_exit\" run");
925                                         break;
926                                     }
927                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
928                                 }
929 
930                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
931                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
932                                     thread::park();
933                                 }
934                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
935                             }
936 
937                             // We've been told to terminate
938                             if vcpu_kill_signalled.load(Ordering::SeqCst)
939                                 || vcpu_kill.load(Ordering::SeqCst)
940                             {
941                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
942                                 break;
943                             }
944 
945                             #[cfg(feature = "tdx")]
946                             let mut vcpu = vcpu.lock().unwrap();
947                             #[cfg(not(feature = "tdx"))]
948                             let vcpu = vcpu.lock().unwrap();
949                             // vcpu.run() returns VmExit::Reset on a triple-fault, so trigger a reset
950                             match vcpu.run() {
951                                 Ok(run) => match run {
952                                     #[cfg(feature = "kvm")]
953                                     VmExit::Debug => {
954                                         info!("VmExit::Debug");
955                                         #[cfg(feature = "guest_debug")]
956                                         {
957                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
958                                             let raw_tid = get_raw_tid(vcpu_id as usize);
959                                             vm_debug_evt.write(raw_tid as u64).unwrap();
960                                         }
961                                     }
962                                     #[cfg(target_arch = "x86_64")]
963                                     VmExit::IoapicEoi(vector) => {
964                                         if let Some(interrupt_controller) =
965                                             &interrupt_controller_clone
966                                         {
967                                             interrupt_controller
968                                                 .lock()
969                                                 .unwrap()
970                                                 .end_of_interrupt(vector);
971                                         }
972                                     }
973                                     VmExit::Ignore => {}
974                                     VmExit::Hyperv => {}
975                                     VmExit::Reset => {
976                                         info!("VmExit::Reset");
977                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
978                                         reset_evt.write(1).unwrap();
979                                         break;
980                                     }
981                                     VmExit::Shutdown => {
982                                         info!("VmExit::Shutdown");
983                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
984                                         exit_evt.write(1).unwrap();
985                                         break;
986                                     }
987                                     #[cfg(feature = "tdx")]
988                                     VmExit::Tdx => {
989                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
990                                             match vcpu.get_tdx_exit_details() {
991                                                 Ok(details) => match details {
992                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
993                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
994                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
995                                                     }
996                                                 },
997                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
998                                             }
999                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1000                                         } else {
1001                                             // We should never reach this code as
1002                                             // this means the design from the code
1003                                             // that would mean the design of the code
1004                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1005                                         }
1006                                     }
1007                                     _ => {
1008                                         error!(
1009                                             "VCPU generated error: {:?}",
1010                                             Error::UnexpectedVmExit
1011                                         );
1012                                         break;
1013                                     }
1014                                 },
1015 
1016                                 Err(e) => {
1017                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1018                                     break;
1019                                 }
1020                             }
1021 
1022                             // We've been told to terminate
1023                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1024                                 || vcpu_kill.load(Ordering::SeqCst)
1025                             {
1026                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1027                                 break;
1028                             }
1029                         }
1030                     })
1031                     .or_else(|_| {
1032                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1033                         error!("vCPU thread panicked");
1034                         panic_exit_evt.write(1)
1035                     })
1036                     .ok();
1037                 })
1038                 .map_err(Error::VcpuSpawn)?,
1039         );
1040 
1041         // On hot plug, calls into this function have entry_point set to None. It is for
1042         // those hotplug CPU additions that we need to set the inserting flag.
1043         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1044         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1045 
1046         Ok(())
1047     }
1048 
1049     /// Start up as many vCPU threads as needed to reach `desired_vcpus`
1050     fn activate_vcpus(
1051         &mut self,
1052         desired_vcpus: u8,
1053         inserting: bool,
1054         paused: Option<bool>,
1055     ) -> Result<()> {
1056         if desired_vcpus > self.config.max_vcpus {
1057             return Err(Error::DesiredVCpuCountExceedsMax);
1058         }
1059 
1060         let vcpu_thread_barrier = Arc::new(Barrier::new(
1061             (desired_vcpus - self.present_vcpus() + 1) as usize,
1062         ));
1063 
1064         if let Some(paused) = paused {
1065             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1066         }
1067 
1068         info!(
1069             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1070             desired_vcpus,
1071             self.vcpus.len(),
1072             self.present_vcpus(),
1073             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1074         );
1075 
1076         // This reuses any inactive vCPUs as well as any that were newly created
1077         for vcpu_id in self.present_vcpus()..desired_vcpus {
1078             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1079             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1080         }
1081 
1082         // Unblock all CPU threads.
1083         vcpu_thread_barrier.wait();
1084         Ok(())
1085     }
1086 
1087     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1088         // Mark vCPUs for removal, actual removal happens on ejection
1089         for cpu_id in desired_vcpus..self.present_vcpus() {
1090             self.vcpu_states[usize::from(cpu_id)].removing = true;
1091         }
1092     }
1093 
1094     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1095         info!("Removing vCPU: cpu_id = {}", cpu_id);
1096         let mut state = &mut self.vcpu_states[usize::from(cpu_id)];
1097         state.kill.store(true, Ordering::SeqCst);
1098         state.signal_thread();
1099         state.join_thread()?;
1100         state.handle = None;
1101 
1102         // Once the thread has exited, clear the "kill" so that it can be reused
1103         state.kill.store(false, Ordering::SeqCst);
1104 
1105         Ok(())
1106     }
1107 
1108     pub fn create_boot_vcpus(
1109         &mut self,
1110         snapshot: Option<Snapshot>,
1111     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1112         trace_scoped!("create_boot_vcpus");
1113 
1114         self.create_vcpus(self.boot_vcpus(), snapshot)
1115     }
1116 
1117     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1118     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1119         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1120     }
1121 
1122     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1123         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1124             .map_err(|e| {
1125                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1126             })?;
1127 
1128         Ok(())
1129     }
1130 
1131     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1132         if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
1133             return Ok(false);
1134         }
1135 
1136         if !self.dynamic {
1137             return Ok(false);
1138         }
1139 
1140         match desired_vcpus.cmp(&self.present_vcpus()) {
1141             cmp::Ordering::Greater => {
1142                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1143                 for vcpu in vcpus {
1144                     self.configure_vcpu(vcpu, None)?
1145                 }
1146                 self.activate_vcpus(desired_vcpus, true, None)?;
1147                 Ok(true)
1148             }
1149             cmp::Ordering::Less => {
1150                 self.mark_vcpus_for_removal(desired_vcpus);
1151                 Ok(true)
1152             }
1153             _ => Ok(false),
1154         }
1155     }
1156 
1157     pub fn shutdown(&mut self) -> Result<()> {
1158         // Tell the vCPUs to stop themselves next time they go through the loop
1159         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1160 
1161         // Toggle the vCPUs pause boolean
1162         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1163 
1164         // Unpark all the VCPU threads.
1165         for state in self.vcpu_states.iter() {
1166             state.unpark_thread();
1167         }
1168 
1169         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1170         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1171         // above.
1172         for state in self.vcpu_states.iter() {
1173             state.signal_thread();
1174         }
1175 
1176         // Wait for all the threads to finish. This removes the state from the vector.
1177         for mut state in self.vcpu_states.drain(..) {
1178             state.join_thread()?;
1179         }
1180 
1181         Ok(())
1182     }
1183 
1184     #[cfg(feature = "tdx")]
1185     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1186         for vcpu in &self.vcpus {
1187             vcpu.lock()
1188                 .unwrap()
1189                 .vcpu
1190                 .tdx_init(hob_address)
1191                 .map_err(Error::InitializeTdx)?;
1192         }
1193         Ok(())
1194     }
1195 
1196     pub fn boot_vcpus(&self) -> u8 {
1197         self.config.boot_vcpus
1198     }
1199 
1200     pub fn max_vcpus(&self) -> u8 {
1201         self.config.max_vcpus
1202     }
1203 
1204     #[cfg(target_arch = "x86_64")]
1205     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1206         assert!(!self.cpuid.is_empty());
1207         self.cpuid.clone()
1208     }
1209 
1210     fn present_vcpus(&self) -> u8 {
1211         self.vcpu_states
1212             .iter()
1213             .fold(0, |acc, state| acc + state.active() as u8)
1214     }
1215 
1216     #[cfg(target_arch = "aarch64")]
1217     pub fn get_mpidrs(&self) -> Vec<u64> {
1218         self.vcpus
1219             .iter()
1220             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1221             .collect()
1222     }
1223 
1224     #[cfg(target_arch = "aarch64")]
1225     pub fn get_saved_states(&self) -> Vec<CpuState> {
1226         self.vcpus
1227             .iter()
1228             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1229             .collect()
1230     }
1231 
1232     #[cfg(target_arch = "aarch64")]
1233     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1234         self.config
1235             .topology
1236             .clone()
1237             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1238     }
1239 
1240     pub fn create_madt(&self) -> Sdt {
1241         use crate::acpi;
1242         // This is also checked in the commandline parsing.
1243         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1244 
1245         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1246         #[cfg(target_arch = "x86_64")]
1247         {
1248             madt.write(36, arch::layout::APIC_START);
1249 
1250             for cpu in 0..self.config.max_vcpus {
1251                 let lapic = LocalApic {
1252                     r#type: acpi::ACPI_APIC_PROCESSOR,
1253                     length: 8,
1254                     processor_id: cpu,
1255                     apic_id: cpu,
1256                     flags: if cpu < self.config.boot_vcpus {
1257                         1 << MADT_CPU_ENABLE_FLAG
1258                     } else {
1259                         0
1260                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1261                 };
1262                 madt.append(lapic);
1263             }
1264 
1265             madt.append(Ioapic {
1266                 r#type: acpi::ACPI_APIC_IO,
1267                 length: 12,
1268                 ioapic_id: 0,
1269                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1270                 gsi_base: 0,
1271                 ..Default::default()
1272             });
1273 
1274             madt.append(InterruptSourceOverride {
1275                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1276                 length: 10,
1277                 bus: 0,
1278                 source: 4,
1279                 gsi: 4,
1280                 flags: 0,
1281             });
1282         }
1283 
1284         #[cfg(target_arch = "aarch64")]
1285         {
1286             /* Notes:
1287              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1288              */
1289 
1290             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1291             for cpu in 0..self.config.boot_vcpus {
1292                 let vcpu = &self.vcpus[cpu as usize];
1293                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1294                 /* ARMv8 MPIDR format:
1295                      Bits [63:40] Must be zero
1296                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1297                      Bits [31:24] Must be zero
1298                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1299                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1300                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1301                 */
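                // For example (illustrative, not in the upstream source): a raw
                // MPIDR_EL1 value of 0x8000_0003 (RES1 bit 31 set) is masked down
                // to 0x0000_0003, keeping only Aff0 = 3 as the format above requires.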
1302                 let mpidr_mask = 0xff_00ff_ffff;
1303                 let gicc = GicC {
1304                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1305                     length: 80,
1306                     reserved0: 0,
1307                     cpu_interface_number: cpu as u32,
1308                     uid: cpu as u32,
1309                     flags: 1,
1310                     parking_version: 0,
1311                     performance_interrupt: 0,
1312                     parked_address: 0,
1313                     base_address: 0,
1314                     gicv_base_address: 0,
1315                     gich_base_address: 0,
1316                     vgic_interrupt: 0,
1317                     gicr_base_address: 0,
1318                     mpidr: mpidr & mpidr_mask,
1319                     proc_power_effi_class: 0,
1320                     reserved1: 0,
1321                     spe_overflow_interrupt: 0,
1322                 };
1323 
1324                 madt.append(gicc);
1325             }
1326             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1327 
1328             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1329             let gicd = GicD {
1330                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1331                 length: 24,
1332                 reserved0: 0,
1333                 gic_id: 0,
1334                 base_address: vgic_config.dist_addr,
1335                 global_irq_base: 0,
1336                 version: 3,
1337                 reserved1: [0; 3],
1338             };
1339             madt.append(gicd);
1340 
1341             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1342             let gicr = GicR {
1343                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1344                 length: 16,
1345                 reserved: 0,
1346                 base_address: vgic_config.redists_addr,
1347                 range_length: vgic_config.redists_size as u32,
1348             };
1349             madt.append(gicr);
1350 
1351             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1352             let gicits = GicIts {
1353                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1354                 length: 20,
1355                 reserved0: 0,
1356                 translation_id: 0,
1357                 base_address: vgic_config.msi_addr,
1358                 reserved1: 0,
1359             };
1360             madt.append(gicits);
1361 
1362             madt.update_checksum();
1363         }
1364 
1365         madt
1366     }
1367 
1368     #[cfg(target_arch = "aarch64")]
1369     pub fn create_pptt(&self) -> Sdt {
1370         let pptt_start = 0;
1371         let mut cpus = 0;
1372         let mut uid = 0;
1373         // If topology is not specified, the default setting is:
1374         // 1 package, multiple cores, 1 thread per core
1375         // This is also the behavior when PPTT is missing.
1376         let (threads_per_core, cores_per_package, packages) =
1377             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1378 
1379         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1380 
1381         for cluster_idx in 0..packages {
1382             if cpus < self.config.boot_vcpus as usize {
1383                 let cluster_offset = pptt.len() - pptt_start;
1384                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1385                     r#type: 0,
1386                     length: 20,
1387                     reserved: 0,
1388                     flags: 0x2,
1389                     parent: 0,
1390                     acpi_processor_id: cluster_idx as u32,
1391                     num_private_resources: 0,
1392                 };
1393                 pptt.append(cluster_hierarchy_node);
1394 
1395                 for core_idx in 0..cores_per_package {
1396                     let core_offset = pptt.len() - pptt_start;
1397 
1398                     if threads_per_core > 1 {
1399                         let core_hierarchy_node = ProcessorHierarchyNode {
1400                             r#type: 0,
1401                             length: 20,
1402                             reserved: 0,
1403                             flags: 0x2,
1404                             parent: cluster_offset as u32,
1405                             acpi_processor_id: core_idx as u32,
1406                             num_private_resources: 0,
1407                         };
1408                         pptt.append(core_hierarchy_node);
1409 
1410                         for _thread_idx in 0..threads_per_core {
1411                             let thread_hierarchy_node = ProcessorHierarchyNode {
1412                                 r#type: 0,
1413                                 length: 20,
1414                                 reserved: 0,
1415                                 flags: 0xE,
1416                                 parent: core_offset as u32,
1417                                 acpi_processor_id: uid as u32,
1418                                 num_private_resources: 0,
1419                             };
1420                             pptt.append(thread_hierarchy_node);
1421                             uid += 1;
1422                         }
1423                     } else {
1424                         let thread_hierarchy_node = ProcessorHierarchyNode {
1425                             r#type: 0,
1426                             length: 20,
1427                             reserved: 0,
1428                             flags: 0xA,
1429                             parent: cluster_offset as u32,
1430                             acpi_processor_id: uid as u32,
1431                             num_private_resources: 0,
1432                         };
1433                         pptt.append(thread_hierarchy_node);
1434                         uid += 1;
1435                     }
1436                 }
1437                 cpus += (cores_per_package * threads_per_core) as usize;
1438             }
1439         }
1440 
1441         pptt.update_checksum();
1442         pptt
1443     }
1444 
1445     #[cfg(feature = "guest_debug")]
1446     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1447         self.vcpus[usize::from(cpu_id)]
1448             .lock()
1449             .unwrap()
1450             .vcpu
1451             .get_regs()
1452             .map_err(Error::CpuDebug)
1453     }
1454 
1455     #[cfg(feature = "guest_debug")]
1456     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1457         self.vcpus[usize::from(cpu_id)]
1458             .lock()
1459             .unwrap()
1460             .vcpu
1461             .set_regs(regs)
1462             .map_err(Error::CpuDebug)
1463     }
1464 
1465     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1466     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1467         self.vcpus[usize::from(cpu_id)]
1468             .lock()
1469             .unwrap()
1470             .vcpu
1471             .get_sregs()
1472             .map_err(Error::CpuDebug)
1473     }
1474 
1475     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1476     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1477         self.vcpus[usize::from(cpu_id)]
1478             .lock()
1479             .unwrap()
1480             .vcpu
1481             .set_sregs(sregs)
1482             .map_err(Error::CpuDebug)
1483     }
1484 
1485     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1486     fn translate_gva(
1487         &self,
1488         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1489         cpu_id: u8,
1490         gva: u64,
1491     ) -> Result<u64> {
1492         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1493             .lock()
1494             .unwrap()
1495             .vcpu
1496             .translate_gva(gva, /* flags: unused */ 0)
1497             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1498         Ok(gpa)
1499     }
1500 
1501     ///
1502     /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
1503     /// it in the VMM by walking through the translation tables.
1504     ///
1505     /// Address translation is a big topic. Here we only focus on the scenario
1506     /// that arises in the VMM while debugging the guest kernel. This `translate_gva`
1507     /// implementation is restricted to:
1508     /// - Exception Level 1
1509     /// - Translate high address range only (kernel space)
1510     ///
1511     /// This implementation supports the following Armv8-A features related to
1512     /// address translation:
1513     /// - FEAT_LPA
1514     /// - FEAT_LVA
1515     /// - FEAT_LPA2
1516     ///
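         /// For illustration, the debug path could resolve a kernel-space address
         /// on vCPU 0 with, e.g.:
         /// `let gpa = self.translate_gva(&guest_memory, 0, 0xffff_8000_0800_0000)?;`
         ///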
1517     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1518     fn translate_gva(
1519         &self,
1520         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1521         cpu_id: u8,
1522         gva: u64,
1523     ) -> Result<u64> {
1524         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1525             .lock()
1526             .unwrap()
1527             .vcpu
1528             .get_sys_reg(regs::TCR_EL1)
1529             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1530         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1531             .lock()
1532             .unwrap()
1533             .vcpu
1534             .get_sys_reg(regs::TTBR1_EL1)
1535             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1536         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1537             .lock()
1538             .unwrap()
1539             .vcpu
1540             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1541             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1542 
1543         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1544         // or low (0x000xxx...).
1545         let high_range = extract_bits_64!(gva, 55, 1);
1546         if high_range == 0 {
1547             info!("VA (0x{:x}) range is not supported!", gva);
1548             return Ok(gva);
1549         }
1550 
1551         // High range size offset
1552         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1553         // Granule size
1554         let tg = extract_bits_64!(tcr_el1, 30, 2);
1555         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1556         let ds = extract_bits_64!(tcr_el1, 59, 1);
1557 
1558         if tsz == 0 {
1559             info!("VA translation is not ready!");
1560             return Ok(gva);
1561         }
1562 
1563         // VA size is determined by TCR_EL1.T1SZ
1564         let va_size = 64 - tsz;
1565         // Number of bits in VA consumed in each level of translation
1566         let stride = match tg {
1567             3 => 13, // 64KB granule size
1568             1 => 11, // 16KB granule size
1569             _ => 9,  // 4KB, default
1570         };
1571         // Starting level of walking
1572         let mut level = 4 - (va_size - 4) / stride;
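             // Worked example (illustrative): with a 4 KiB granule (stride = 9) and
             // T1SZ = 16, va_size = 48 and the walk starts at
             // level = 4 - (48 - 4) / 9 = 0, i.e. a full 4-level walk.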
1573 
1574         // Determine the PA or IPA size
1575         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1576         #[allow(clippy::identity_op)]
1577         let pa_range = extract_bits_64!(id_aa64mmfr0_el1, 0, 4);
1578         // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should match.
1579         // To be safe, we use the minimum value if they are different.
1580         let pa_range = std::cmp::min(tcr_ips, pa_range);
1581         // PA size in bits
1582         let pa_size = match pa_range {
1583             0 => 32,
1584             1 => 36,
1585             2 => 40,
1586             3 => 42,
1587             4 => 44,
1588             5 => 48,
1589             6 => 52,
1590             _ => {
1591                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1592                     "PA range not supported {pa_range}"
1593                 ))))
1594             }
1595         };
1596 
1597         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1598         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1599         // If FEAT_LPA2 is present, the translation table descriptor holds
1600         // 50 bits of the table address of the next level.
1601         // Otherwise, it is 48 bits.
1602         let descaddrmask = if ds == 1 {
1603             !0u64 >> (64 - 50) // mask with 50 least significant bits
1604         } else {
1605             !0u64 >> (64 - 48) // mask with 48 least significant bits
1606         };
1607         let descaddrmask = descaddrmask & !indexmask_grainsize;
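             // Continuing the example above (4 KiB granule, va_size = 48, level 0):
             // indexmask_grainsize covers the low 12 bits (0xFFF), indexmask covers
             // 48 - 9 * 4 = 12 bits as well, and without FEAT_LPA2 descaddrmask
             // selects descriptor bits [47:12].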
1608 
1609         // Translation table base address
1610         #[allow(clippy::identity_op)]
1611         let mut descaddr: u64 = extract_bits_64!(ttbr1_el1, 0, 48);
1612         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1613         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1614         if pa_size == 52 {
1615             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1616         }
1617 
1618         // Loop through tables of each level
1619         loop {
1620             // Table offset for current level
1621             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1622             descaddr |= table_offset;
1623             descaddr &= !7u64;
1624 
1625             let mut buf = [0; 8];
1626             guest_memory
1627                 .memory()
1628                 .read(&mut buf, GuestAddress(descaddr))
1629                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1630             let descriptor = u64::from_le_bytes(buf);
1631 
1632             descaddr = descriptor & descaddrmask;
1633             // In the case of FEAT_LPA, the next-level translation table address
1634             // bits [48:51] come from bits [12:15] of the current descriptor.
1635             // For FEAT_LPA2, the next-level translation table address
1636             // bits [50:51] come from bits [8:9] of the current descriptor, and
1637             // bits [48:49] come from bits [48:49] of the descriptor handled
1638             // previously.
1639             if pa_size == 52 {
1640                 if ds == 1 {
1641                     // FEAT_LPA2
1642                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1643                 } else {
1644                     // FEAT_LPA
1645                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1646                 }
1647             }
1648 
1649             if (descriptor & 2) != 0 && (level < 3) {
1650                 // This is a table entry. Go down to next level.
1651                 level += 1;
1652                 indexmask = indexmask_grainsize;
1653                 continue;
1654             }
1655 
1656             break;
1657         }
1658 
1659         // We have reached either:
1660         // - a page entry at level 3 or
1661         // - a block entry at level 1 or 2
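             // For example, with a 4 KiB granule this yields a 4 KiB page at level 3
             // (1 << (9 * 1 + 3)) or a 2 MiB block at level 2 (1 << (9 * 2 + 3)).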
1662         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1663         descaddr &= !(page_size - 1);
1664         descaddr |= gva & (page_size - 1);
1665 
1666         Ok(descaddr)
1667     }
1668 
1669     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1670         self.acpi_address = Some(acpi_address);
1671     }
1672 
1673     pub(crate) fn set_interrupt_controller(
1674         &mut self,
1675         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1676     ) {
1677         self.interrupt_controller = Some(interrupt_controller);
1678     }
1679 }
1680 
1681 struct Cpu {
1682     cpu_id: u8,
1683     proximity_domain: u32,
1684     dynamic: bool,
1685 }
1686 
1687 #[cfg(target_arch = "x86_64")]
1688 const MADT_CPU_ENABLE_FLAG: usize = 0;
1689 
1690 #[cfg(target_arch = "x86_64")]
1691 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1692 
1693 impl Cpu {
1694     #[cfg(target_arch = "x86_64")]
1695     fn generate_mat(&self) -> Vec<u8> {
1696         let lapic = LocalApic {
1697             r#type: 0,
1698             length: 8,
1699             processor_id: self.cpu_id,
1700             apic_id: self.cpu_id,
1701             flags: 1 << MADT_CPU_ENABLE_FLAG,
1702         };
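             // With the packed 8-byte layout implied by `length: 8`, the buffer built
             // below would be, e.g. for cpu_id 1:
             // [0x00, 0x08, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00]
             // (type, length, processor_id, apic_id, flags as a little-endian u32).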
1703 
1704         let mut mat_data: Vec<u8> = Vec::new();
1705         mat_data.resize(std::mem::size_of_val(&lapic), 0);
1706         // SAFETY: mat_data is large enough to hold lapic
1707         unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic };
1708 
1709         mat_data
1710     }
1711 }
1712 
1713 impl Aml for Cpu {
1714     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1715         #[cfg(target_arch = "x86_64")]
1716         let mat_data: Vec<u8> = self.generate_mat();
1717         #[allow(clippy::if_same_then_else)]
1718         if self.dynamic {
1719             aml::Device::new(
1720                 format!("C{:03}", self.cpu_id).as_str().into(),
1721                 vec![
1722                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1723                     &aml::Name::new("_UID".into(), &self.cpu_id),
1724                     // Currently, AArch64 does not support the following fields.
1725                     /*
1726                     _STA return value:
1727                     Bit [0] – Set if the device is present.
1728                     Bit [1] – Set if the device is enabled and decoding its resources.
1729                     Bit [2] – Set if the device should be shown in the UI.
1730                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1731                     Bit [4] – Set if the battery is present.
1732                     Bits [31:5] – Reserved (must be cleared).
1733                     */
1734                     #[cfg(target_arch = "x86_64")]
1735                     &aml::Method::new(
1736                         "_STA".into(),
1737                         0,
1738                         false,
1739                         // Call into the CSTA method, which will interrogate the device
1740                         vec![&aml::Return::new(&aml::MethodCall::new(
1741                             "CSTA".into(),
1742                             vec![&self.cpu_id],
1743                         ))],
1744                     ),
1745                     &aml::Method::new(
1746                         "_PXM".into(),
1747                         0,
1748                         false,
1749                         vec![&aml::Return::new(&self.proximity_domain)],
1750                     ),
1751                     // The Linux kernel expects every CPU device to have a _MAT entry
1752                     // containing the LAPIC for this processor with the enabled bit set
1753                     // even if it is disabled in the MADT (non-boot CPU)
1754                     #[cfg(target_arch = "x86_64")]
1755                     &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
1756                     // Trigger CPU ejection
1757                     #[cfg(target_arch = "x86_64")]
1758                     &aml::Method::new(
1759                         "_EJ0".into(),
1760                         1,
1761                         false,
1762                         // Call into the CEJ0 method, which will actually eject the device
1763                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1764                     ),
1765                 ],
1766             )
1767             .append_aml_bytes(bytes);
1768         } else {
1769             aml::Device::new(
1770                 format!("C{:03}", self.cpu_id).as_str().into(),
1771                 vec![
1772                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1773                     &aml::Name::new("_UID".into(), &self.cpu_id),
1774                     #[cfg(target_arch = "x86_64")]
1775                     &aml::Method::new(
1776                         "_STA".into(),
1777                         0,
1778                         false,
1779                         // Mark the CPU as present; see the CSTA implementation
1780                         vec![&aml::Return::new(&0xfu8)],
1781                     ),
1782                     &aml::Method::new(
1783                         "_PXM".into(),
1784                         0,
1785                         false,
1786                         vec![&aml::Return::new(&self.proximity_domain)],
1787                     ),
1788                     // The Linux kernel expects every CPU device to have a _MAT entry
1789                     // containing the LAPIC for this processor with the enabled bit set
1790                     // even if it is disabled in the MADT (non-boot CPU)
1791                     #[cfg(target_arch = "x86_64")]
1792                     &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
1793                 ],
1794             )
1795             .append_aml_bytes(bytes);
1796         }
1797     }
1798 }
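     // Approximate ASL for a dynamic CPU device built above (a sketch for
     // illustration, not the exact bytes emitted), e.g. for cpu_id 1 on x86_64:
     //
     //     Device (C001)
     //     {
     //         Name (_HID, "ACPI0007")
     //         Name (_UID, One)
     //         Method (_STA, 0) { Return (CSTA (One)) }
     //         Method (_PXM, 0) { Return (Zero) }        // proximity domain
     //         Name (_MAT, Buffer (8) { ... })           // LAPIC entry, enabled bit set
     //         Method (_EJ0, 1) { CEJ0 (One) }
     //     }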
1799 
1800 struct CpuNotify {
1801     cpu_id: u8,
1802 }
1803 
1804 impl Aml for CpuNotify {
1805     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1806         let object = aml::Path::new(&format!("C{:03}", self.cpu_id));
1807         aml::If::new(
1808             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
1809             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1810         )
1811         .append_aml_bytes(bytes)
1812     }
1813 }
1814 
1815 struct CpuMethods {
1816     max_vcpus: u8,
1817     dynamic: bool,
1818 }
1819 
1820 impl Aml for CpuMethods {
1821     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1822         if self.dynamic {
1823             // CPU status method
1824             aml::Method::new(
1825                 "CSTA".into(),
1826                 1,
1827                 true,
1828                 vec![
1829                     // Take lock defined above
1830                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1831                     // Write CPU number (in first argument) to I/O port via field
1832                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1833                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1834                     // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for the meaning)
1835                     &aml::If::new(
1836                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
1837                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
1838                     ),
1839                     // Release lock
1840                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1841                     // Return 0 or 0xf
1842                     &aml::Return::new(&aml::Local(0)),
1843                 ],
1844             )
1845             .append_aml_bytes(bytes);
1846 
1847             let mut cpu_notifies = Vec::new();
1848             for cpu_id in 0..self.max_vcpus {
1849                 cpu_notifies.push(CpuNotify { cpu_id });
1850             }
1851 
1852             let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
1853             for cpu_id in 0..self.max_vcpus {
1854                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
1855             }
1856 
1857             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).append_aml_bytes(bytes);
1858 
1859             aml::Method::new(
1860                 "CEJ0".into(),
1861                 1,
1862                 true,
1863                 vec![
1864                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1865                     // Write CPU number (in first argument) to I/O port via field
1866                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1867                     // Set CEJ0 bit
1868                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
1869                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1870                 ],
1871             )
1872             .append_aml_bytes(bytes);
1873 
1874             aml::Method::new(
1875                 "CSCN".into(),
1876                 0,
1877                 true,
1878                 vec![
1879                     // Take lock defined above
1880                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1881                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1882                     &aml::While::new(
1883                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
1884                         vec![
1885                             // Write CPU number (in first argument) to I/O port via field
1886                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
1887                             // Check if CINS bit is set
1888                             &aml::If::new(
1889                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
1890                                 // Notify device if it is
1891                                 vec![
1892                                     &aml::MethodCall::new(
1893                                         "CTFY".into(),
1894                                         vec![&aml::Local(0), &aml::ONE],
1895                                     ),
1896                                     // Reset CINS bit
1897                                     &aml::Store::new(
1898                                         &aml::Path::new("\\_SB_.PRES.CINS"),
1899                                         &aml::ONE,
1900                                     ),
1901                                 ],
1902                             ),
1903                             // Check if CRMV bit is set
1904                             &aml::If::new(
1905                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
1906                                 // Notify device if it is (with the eject constant 0x3)
1907                                 vec![
1908                                     &aml::MethodCall::new(
1909                                         "CTFY".into(),
1910                                         vec![&aml::Local(0), &3u8],
1911                                     ),
1912                                     // Reset CRMV bit
1913                                     &aml::Store::new(
1914                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
1915                                         &aml::ONE,
1916                                     ),
1917                                 ],
1918                             ),
1919                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
1920                         ],
1921                     ),
1922                     // Release lock
1923                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1924                 ],
1925             )
1926             .append_aml_bytes(bytes)
1927         } else {
1928             aml::Method::new("CSCN".into(), 0, true, vec![]).append_aml_bytes(bytes)
1929         }
1930     }
1931 }
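     // Hotplug notification flow implied by the methods above (a guest-side sketch,
     // assuming the VMM raises an ACPI event after setting CINS or CRMV for a vCPU):
     // 1. CSCN takes CPLK and iterates over all possible vCPUs via CSEL.
     // 2. If CINS is set, CTFY issues Notify(Cxxx, 1) (device check) and the bit is
     //    cleared by writing ONE back; if CRMV is set, Notify(Cxxx, 3) (eject request)
     //    is issued instead.
     // 3. The OS then re-evaluates _STA/_MAT (or calls _EJ0) on the notified device.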
1932 
1933 impl Aml for CpuManager {
1934     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1935         #[cfg(target_arch = "x86_64")]
1936         if let Some(acpi_address) = self.acpi_address {
1937             // CPU hotplug controller
1938             aml::Device::new(
1939                 "_SB_.PRES".into(),
1940                 vec![
1941                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
1942                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
1943                     // Mutex to protect concurrent access, as we write to choose a CPU and then read back its status
1944                     &aml::Mutex::new("CPLK".into(), 0),
1945                     &aml::Name::new(
1946                         "_CRS".into(),
1947                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
1948                             aml::AddressSpaceCachable::NotCacheable,
1949                             true,
1950                             acpi_address.0,
1951                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
1952                         )]),
1953                     ),
1954                     // OpRegion and Fields map MMIO range into individual field values
1955                     &aml::OpRegion::new(
1956                         "PRST".into(),
1957                         aml::OpRegionSpace::SystemMemory,
1958                         acpi_address.0 as usize,
1959                         CPU_MANAGER_ACPI_SIZE,
1960                     ),
1961                     &aml::Field::new(
1962                         "PRST".into(),
1963                         aml::FieldAccessType::Byte,
1964                         aml::FieldUpdateRule::WriteAsZeroes,
1965                         vec![
1966                             aml::FieldEntry::Reserved(32),
1967                             aml::FieldEntry::Named(*b"CPEN", 1),
1968                             aml::FieldEntry::Named(*b"CINS", 1),
1969                             aml::FieldEntry::Named(*b"CRMV", 1),
1970                             aml::FieldEntry::Named(*b"CEJ0", 1),
1971                             aml::FieldEntry::Reserved(4),
1972                             aml::FieldEntry::Named(*b"CCMD", 8),
1973                         ],
1974                     ),
1975                     &aml::Field::new(
1976                         "PRST".into(),
1977                         aml::FieldAccessType::DWord,
1978                         aml::FieldUpdateRule::Preserve,
1979                         vec![
1980                             aml::FieldEntry::Named(*b"CSEL", 32),
1981                             aml::FieldEntry::Reserved(32),
1982                             aml::FieldEntry::Named(*b"CDAT", 32),
1983                         ],
1984                     ),
1985                 ],
1986             )
1987             .append_aml_bytes(bytes);
1988         }
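             // Guest-visible register layout implied by the Field definitions above,
             // as offsets within the CPU_MANAGER_ACPI_SIZE MMIO window:
             //   bytes 0-3 : CSEL - selects the vCPU the flag bits refer to
             //   byte  4   : bit 0 CPEN, bit 1 CINS, bit 2 CRMV, bit 3 CEJ0
             //   byte  5   : CCMD - command byte
             //   bytes 8-11: CDAT - command data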
1989 
1990         // CPU devices
1991         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
1992         let uid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05"));
1993         // Bundle methods together under a common object
1994         let methods = CpuMethods {
1995             max_vcpus: self.config.max_vcpus,
1996             dynamic: self.dynamic,
1997         };
1998         let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &uid, &methods];
1999 
2000         let mut cpu_devices = Vec::new();
2001         for cpu_id in 0..self.config.max_vcpus {
2002             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2003             let cpu_device = Cpu {
2004                 cpu_id,
2005                 proximity_domain,
2006                 dynamic: self.dynamic,
2007             };
2008 
2009             cpu_devices.push(cpu_device);
2010         }
2011 
2012         for cpu_device in cpu_devices.iter() {
2013             cpu_data_inner.push(cpu_device);
2014         }
2015 
2016         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).append_aml_bytes(bytes)
2017     }
2018 }
2019 
2020 impl Pausable for CpuManager {
2021     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2022         // Tell the vCPUs to pause themselves next time they exit
2023         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2024 
2025         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2026         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2027         // above.
2028         for state in self.vcpu_states.iter() {
2029             state.signal_thread();
2030         }
2031 
2032         for vcpu in self.vcpus.iter() {
2033             let mut vcpu = vcpu.lock().unwrap();
2034             vcpu.pause()?;
2035             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2036             if !self.config.kvm_hyperv {
2037                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2038                     MigratableError::Pause(anyhow!(
2039                         "Could not notify guest it has been paused {:?}",
2040                         e
2041                     ))
2042                 })?;
2043             }
2044         }
2045 
2046         Ok(())
2047     }
2048 
2049     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2050         for vcpu in self.vcpus.iter() {
2051             vcpu.lock().unwrap().resume()?;
2052         }
2053 
2054         // Toggle the vCPUs pause boolean
2055         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2056 
2057         // Unpark all the VCPU threads.
2058         // Once unparked, the next thing they will do is check the pause
2059         // boolean. Since it will be set to false, they will exit their pause loop
2060         // and go back to VMX root.
2061         for state in self.vcpu_states.iter() {
2062             state.unpark_thread();
2063         }
2064         Ok(())
2065     }
2066 }
2067 
2068 impl Snapshottable for CpuManager {
2069     fn id(&self) -> String {
2070         CPU_MANAGER_SNAPSHOT_ID.to_string()
2071     }
2072 
2073     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2074         let mut cpu_manager_snapshot = Snapshot::default();
2075 
2076         // The CpuManager snapshot is a collection of the snapshots of all vCPUs.
2077         for vcpu in &self.vcpus {
2078             let mut vcpu = vcpu.lock().unwrap();
2079             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2080         }
2081 
2082         Ok(cpu_manager_snapshot)
2083     }
2084 }
2085 
2086 impl Transportable for CpuManager {}
2087 impl Migratable for CpuManager {}
2088 
2089 #[cfg(feature = "guest_debug")]
2090 impl Debuggable for CpuManager {
2091     #[cfg(feature = "kvm")]
2092     fn set_guest_debug(
2093         &self,
2094         cpu_id: usize,
2095         addrs: &[GuestAddress],
2096         singlestep: bool,
2097     ) -> std::result::Result<(), DebuggableError> {
2098         self.vcpus[cpu_id]
2099             .lock()
2100             .unwrap()
2101             .vcpu
2102             .set_guest_debug(addrs, singlestep)
2103             .map_err(DebuggableError::SetDebug)
2104     }
2105 
2106     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2107         Ok(())
2108     }
2109 
2110     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2111         Ok(())
2112     }
2113 
2114     #[cfg(target_arch = "x86_64")]
2115     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2116         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2117         let gregs = self
2118             .get_regs(cpu_id as u8)
2119             .map_err(DebuggableError::ReadRegs)?;
2120         let regs = [
2121             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
2122             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
2123         ];
2124 
2125         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2126         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2127         let eflags = gregs.rflags as u32;
2128         let rip = gregs.rip;
2129 
2130         // Segment registers: CS, SS, DS, ES, FS, GS
2131         let sregs = self
2132             .get_sregs(cpu_id as u8)
2133             .map_err(DebuggableError::ReadRegs)?;
2134         let segments = X86SegmentRegs {
2135             cs: sregs.cs.selector as u32,
2136             ss: sregs.ss.selector as u32,
2137             ds: sregs.ds.selector as u32,
2138             es: sregs.es.selector as u32,
2139             fs: sregs.fs.selector as u32,
2140             gs: sregs.gs.selector as u32,
2141         };
2142 
2143         // TODO: Add other registers
2144 
2145         Ok(CoreRegs {
2146             regs,
2147             eflags,
2148             rip,
2149             segments,
2150             ..Default::default()
2151         })
2152     }
2153 
2154     #[cfg(target_arch = "aarch64")]
2155     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2156         let gregs = self
2157             .get_regs(cpu_id as u8)
2158             .map_err(DebuggableError::ReadRegs)?;
2159         Ok(CoreRegs {
2160             x: gregs.regs.regs,
2161             sp: gregs.regs.sp,
2162             pc: gregs.regs.pc,
2163             ..Default::default()
2164         })
2165     }
2166 
2167     #[cfg(target_arch = "x86_64")]
2168     fn write_regs(
2169         &self,
2170         cpu_id: usize,
2171         regs: &CoreRegs,
2172     ) -> std::result::Result<(), DebuggableError> {
2173         let orig_gregs = self
2174             .get_regs(cpu_id as u8)
2175             .map_err(DebuggableError::ReadRegs)?;
2176         let gregs = StandardRegisters {
2177             rax: regs.regs[0],
2178             rbx: regs.regs[1],
2179             rcx: regs.regs[2],
2180             rdx: regs.regs[3],
2181             rsi: regs.regs[4],
2182             rdi: regs.regs[5],
2183             rbp: regs.regs[6],
2184             rsp: regs.regs[7],
2185             r8: regs.regs[8],
2186             r9: regs.regs[9],
2187             r10: regs.regs[10],
2188             r11: regs.regs[11],
2189             r12: regs.regs[12],
2190             r13: regs.regs[13],
2191             r14: regs.regs[14],
2192             r15: regs.regs[15],
2193             rip: regs.rip,
2194             // Update the lower 32 bits of rflags.
2195             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2196         };
2197 
2198         self.set_regs(cpu_id as u8, &gregs)
2199             .map_err(DebuggableError::WriteRegs)?;
2200 
2201         // Segment registers: CS, SS, DS, ES, FS, GS
2202         // Since GDB cares only about selectors, we call get_sregs() first.
2203         let mut sregs = self
2204             .get_sregs(cpu_id as u8)
2205             .map_err(DebuggableError::ReadRegs)?;
2206         sregs.cs.selector = regs.segments.cs as u16;
2207         sregs.ss.selector = regs.segments.ss as u16;
2208         sregs.ds.selector = regs.segments.ds as u16;
2209         sregs.es.selector = regs.segments.es as u16;
2210         sregs.fs.selector = regs.segments.fs as u16;
2211         sregs.gs.selector = regs.segments.gs as u16;
2212 
2213         self.set_sregs(cpu_id as u8, &sregs)
2214             .map_err(DebuggableError::WriteRegs)?;
2215 
2216         // TODO: Add other registers
2217 
2218         Ok(())
2219     }
2220 
2221     #[cfg(target_arch = "aarch64")]
2222     fn write_regs(
2223         &self,
2224         cpu_id: usize,
2225         regs: &CoreRegs,
2226     ) -> std::result::Result<(), DebuggableError> {
2227         let mut gregs = self
2228             .get_regs(cpu_id as u8)
2229             .map_err(DebuggableError::ReadRegs)?;
2230 
2231         gregs.regs.regs = regs.x;
2232         gregs.regs.sp = regs.sp;
2233         gregs.regs.pc = regs.pc;
2234 
2235         self.set_regs(cpu_id as u8, &gregs)
2236             .map_err(DebuggableError::WriteRegs)?;
2237 
2238         Ok(())
2239     }
2240 
2241     fn read_mem(
2242         &self,
2243         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2244         cpu_id: usize,
2245         vaddr: GuestAddress,
2246         len: usize,
2247     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2248         let mut buf = vec![0; len];
2249         let mut total_read = 0_u64;
2250 
2251         while total_read < len as u64 {
2252             let gaddr = vaddr.0 + total_read;
2253             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2254                 Ok(paddr) => paddr,
2255                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2256                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2257             };
2258             let psize = arch::PAGE_SIZE as u64;
2259             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
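                 // Example (illustrative): with 4 KiB pages, paddr = 0x1_0F80 and
                 // 0x200 bytes still to read, this copies min(0x200, 0x80) = 0x80
                 // bytes, so no access crosses a guest page boundary.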
2260             guest_memory
2261                 .memory()
2262                 .read(
2263                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2264                     GuestAddress(paddr),
2265                 )
2266                 .map_err(DebuggableError::ReadMem)?;
2267             total_read += read_len;
2268         }
2269         Ok(buf)
2270     }
2271 
2272     fn write_mem(
2273         &self,
2274         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2275         cpu_id: usize,
2276         vaddr: &GuestAddress,
2277         data: &[u8],
2278     ) -> std::result::Result<(), DebuggableError> {
2279         let mut total_written = 0_u64;
2280 
2281         while total_written < data.len() as u64 {
2282             let gaddr = vaddr.0 + total_written;
2283             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2284                 Ok(paddr) => paddr,
2285                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2286                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2287             };
2288             let psize = arch::PAGE_SIZE as u64;
2289             let write_len = std::cmp::min(
2290                 data.len() as u64 - total_written,
2291                 psize - (paddr & (psize - 1)),
2292             );
2293             guest_memory
2294                 .memory()
2295                 .write(
2296                     &data[total_written as usize..total_written as usize + write_len as usize],
2297                     GuestAddress(paddr),
2298                 )
2299                 .map_err(DebuggableError::WriteMem)?;
2300             total_written += write_len;
2301         }
2302         Ok(())
2303     }
2304 
2305     fn active_vcpus(&self) -> usize {
2306         self.present_vcpus() as usize
2307     }
2308 }
2309 
2310 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2311 impl Elf64Writable for CpuManager {}
2312 
2313 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2314 impl CpuElf64Writable for CpuManager {
2315     fn cpu_write_elf64_note(
2316         &mut self,
2317         dump_state: &DumpState,
2318     ) -> std::result::Result<(), GuestDebuggableError> {
2319         let mut coredump_file = dump_state.file.as_ref().unwrap();
2320         for vcpu in &self.vcpus {
2321             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2322             let mut pos: usize = 0;
2323             let mut buf = vec![0; note_size as usize];
2324             let descsz = size_of::<X86_64ElfPrStatus>();
2325             let vcpu_id = vcpu.lock().unwrap().id;
2326 
2327             let note = Elf64_Nhdr {
2328                 n_namesz: COREDUMP_NAME_SIZE,
2329                 n_descsz: descsz as u32,
2330                 n_type: NT_PRSTATUS,
2331             };
2332 
2333             let bytes: &[u8] = note.as_slice();
2334             buf.splice(0.., bytes.to_vec());
2335             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2336             buf.resize(pos + 4, 0);
2337             buf.splice(pos.., "CORE".to_string().into_bytes());
2338 
2339             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2340             buf.resize(pos + 32 + 4, 0);
2341             let pid = vcpu_id as u64;
2342             let bytes: &[u8] = pid.as_slice();
2343             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2344 
2345             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2346 
2347             let orig_rax: u64 = 0;
2348             let gregs = self.vcpus[usize::from(vcpu_id)]
2349                 .lock()
2350                 .unwrap()
2351                 .vcpu
2352                 .get_regs()
2353                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2354 
2355             let regs1 = [
2356                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2357                 gregs.r10,
2358             ];
2359             let regs2 = [
2360                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2361             ];
2362 
2363             let sregs = self.vcpus[usize::from(vcpu_id)]
2364                 .lock()
2365                 .unwrap()
2366                 .vcpu
2367                 .get_sregs()
2368                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2369 
2370             debug!(
2371                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2372                 gregs.rip,
2373                 gregs.rsp,
2374                 sregs.gs.base,
2375                 sregs.cs.selector,
2376                 sregs.ss.selector,
2377                 sregs.ds.selector,
2378             );
2379 
2380             let regs = X86_64UserRegs {
2381                 regs1,
2382                 regs2,
2383                 rip: gregs.rip,
2384                 cs: sregs.cs.selector as u64,
2385                 eflags: gregs.rflags,
2386                 rsp: gregs.rsp,
2387                 ss: sregs.ss.selector as u64,
2388                 fs_base: sregs.fs.base,
2389                 gs_base: sregs.gs.base,
2390                 ds: sregs.ds.selector as u64,
2391                 es: sregs.es.selector as u64,
2392                 fs: sregs.fs.selector as u64,
2393                 gs: sregs.gs.selector as u64,
2394             };
2395 
2397             let bytes: &[u8] = regs.as_slice();
2398             buf.resize(note_size as usize, 0);
2399             buf.splice(pos.., bytes.to_vec());
2400             buf.resize(note_size as usize, 0);
2401 
2402             coredump_file
2403                 .write(&buf)
2404                 .map_err(GuestDebuggableError::CoredumpFile)?;
2405         }
2406 
2407         Ok(())
2408     }
2409 
2410     fn cpu_write_vmm_note(
2411         &mut self,
2412         dump_state: &DumpState,
2413     ) -> std::result::Result<(), GuestDebuggableError> {
2414         let mut coredump_file = dump_state.file.as_ref().unwrap();
2415         for vcpu in &self.vcpus {
2416             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2417             let mut pos: usize = 0;
2418             let mut buf = vec![0; note_size as usize];
2419             let descsz = size_of::<DumpCpusState>();
2420             let vcpu_id = vcpu.lock().unwrap().id;
2421 
2422             let note = Elf64_Nhdr {
2423                 n_namesz: COREDUMP_NAME_SIZE,
2424                 n_descsz: descsz as u32,
2425                 n_type: 0,
2426             };
2427 
2428             let bytes: &[u8] = note.as_slice();
2429             buf.splice(0.., bytes.to_vec());
2430             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2431 
2432             buf.resize(pos + 4, 0);
2433             buf.splice(pos.., "QEMU".to_string().into_bytes());
2434 
2435             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2436 
2437             let gregs = self.vcpus[usize::from(vcpu_id)]
2438                 .lock()
2439                 .unwrap()
2440                 .vcpu
2441                 .get_regs()
2442                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2443 
2444             let regs1 = [
2445                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2446                 gregs.rbp,
2447             ];
2448 
2449             let regs2 = [
2450                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2451                 gregs.r15,
2452             ];
2453 
2454             let sregs = self.vcpus[usize::from(vcpu_id)]
2455                 .lock()
2456                 .unwrap()
2457                 .vcpu
2458                 .get_sregs()
2459                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2460 
2461             let mut msrs = vec![MsrEntry {
2462                 index: msr_index::MSR_KERNEL_GS_BASE,
2463                 ..Default::default()
2464             }];
2465 
2466             self.vcpus[vcpu_id as usize]
2467                 .lock()
2468                 .unwrap()
2469                 .vcpu
2470                 .get_msrs(&mut msrs)
2471                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2472             let kernel_gs_base = msrs[0].data;
2473 
2474             let cs = CpuSegment::new(sregs.cs);
2475             let ds = CpuSegment::new(sregs.ds);
2476             let es = CpuSegment::new(sregs.es);
2477             let fs = CpuSegment::new(sregs.fs);
2478             let gs = CpuSegment::new(sregs.gs);
2479             let ss = CpuSegment::new(sregs.ss);
2480             let ldt = CpuSegment::new(sregs.ldt);
2481             let tr = CpuSegment::new(sregs.tr);
2482             let gdt = CpuSegment::new_from_table(sregs.gdt);
2483             let idt = CpuSegment::new_from_table(sregs.idt);
2484             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2485             let regs = DumpCpusState {
2486                 version: 1,
2487                 size: size_of::<DumpCpusState>() as u32,
2488                 regs1,
2489                 regs2,
2490                 rip: gregs.rip,
2491                 rflags: gregs.rflags,
2492                 cs,
2493                 ds,
2494                 es,
2495                 fs,
2496                 gs,
2497                 ss,
2498                 ldt,
2499                 tr,
2500                 gdt,
2501                 idt,
2502                 cr,
2503                 kernel_gs_base,
2504             };
2505 
2506             let bytes: &[u8] = regs.as_slice();
2507             buf.resize(note_size as usize, 0);
2508             buf.splice(pos.., bytes.to_vec());
2509             buf.resize(note_size as usize, 0);
2510 
2511             coredump_file
2512                 .write(&buf)
2513                 .map_err(GuestDebuggableError::CoredumpFile)?;
2514         }
2515 
2516         Ok(())
2517     }
2518 }
2519 
2520 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2521 #[cfg(test)]
2522 mod tests {
2523     use arch::x86_64::interrupts::*;
2524     use arch::x86_64::regs::*;
2525     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2526 
2527     #[test]
2528     fn test_setlint() {
2529         let hv = hypervisor::new().unwrap();
2530         let vm = hv.create_vm().expect("new VM fd creation failed");
2531         assert!(hv.check_required_extensions().is_ok());
2532         // Calling get_lapic will fail if there is no irqchip beforehand.
2533         assert!(vm.create_irq_chip().is_ok());
2534         let vcpu = vm.create_vcpu(0, None).unwrap();
2535         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2536 
2537         // Compute the value that is expected to represent LVT0 and LVT1.
2538         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2539         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2540         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2541         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2542 
2543         set_lint(&vcpu).unwrap();
2544 
2545         // Compute the value that represents LVT0 and LVT1 after set_lint.
2546         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2547         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2548         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2549         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2550         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2551     }
2552 
2553     #[test]
2554     fn test_setup_fpu() {
2555         let hv = hypervisor::new().unwrap();
2556         let vm = hv.create_vm().expect("new VM fd creation failed");
2557         let vcpu = vm.create_vcpu(0, None).unwrap();
2558         setup_fpu(&vcpu).unwrap();
2559 
2560         let expected_fpu: FpuState = FpuState {
2561             fcw: 0x37f,
2562             mxcsr: 0x1f80,
2563             ..Default::default()
2564         };
2565         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2566         // TODO: auto-generate kvm related structures with PartialEq on.
2567         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2568         // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
2569         // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
2570         // The mxcsr will stay 0 and the assert below fails. Decide whether we should
2571         // remove it altogether.
2572         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2573     }
2574 
2575     #[test]
2576     fn test_setup_msrs() {
2577         use hypervisor::arch::x86::{msr_index, MsrEntry};
2578 
2579         let hv = hypervisor::new().unwrap();
2580         let vm = hv.create_vm().expect("new VM fd creation failed");
2581         let vcpu = vm.create_vcpu(0, None).unwrap();
2582         setup_msrs(&vcpu).unwrap();
2583 
2584         // This test will check against the last MSR entry configured (the tenth one).
2585         // See create_msr_entries for details.
2586         let mut msrs = vec![MsrEntry {
2587             index: msr_index::MSR_IA32_MISC_ENABLE,
2588             ..Default::default()
2589         }];
2590 
2591         // get_msrs returns the number of MSRs that it succeeded in reading. We only want to read 1
2592         // in this test case.
2593         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2594         assert_eq!(read_msrs, 1);
2595 
2596         // Official entries that were set up when we called setup_msrs. We need to assert that the
2597         // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
2598         // expect.
2599         let entry_vec = vcpu.boot_msr_entries();
2600         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2601     }
2602 
2603     #[test]
2604     fn test_setup_regs() {
2605         let hv = hypervisor::new().unwrap();
2606         let vm = hv.create_vm().expect("new VM fd creation failed");
2607         let vcpu = vm.create_vcpu(0, None).unwrap();
2608 
2609         let expected_regs: StandardRegisters = StandardRegisters {
2610             rflags: 0x0000000000000002u64,
2611             rbx: arch::layout::PVH_INFO_START.0,
2612             rip: 1,
2613             ..Default::default()
2614         };
2615 
2616         setup_regs(&vcpu, expected_regs.rip).unwrap();
2617 
2618         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2619         assert_eq!(actual_regs, expected_regs);
2620     }
2621 }
2622 
2623 #[cfg(target_arch = "aarch64")]
2624 #[cfg(test)]
2625 mod tests {
2626     use arch::{aarch64::regs, layout};
2627     use hypervisor::kvm::aarch64::is_system_register;
2628     use hypervisor::kvm::kvm_bindings::{
2629         kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2630         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2631     };
2632     use hypervisor::{arm64_core_reg_id, offset__of};
2633     use std::mem;
2634 
2635     #[test]
2636     fn test_setup_regs() {
2637         let hv = hypervisor::new().unwrap();
2638         let vm = hv.create_vm().unwrap();
2639         let vcpu = vm.create_vcpu(0, None).unwrap();
2640 
2641         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2642         // Must fail when vcpu is not initialized yet.
2643         assert!(res.is_err());
2644 
2645         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2646         vm.get_preferred_target(&mut kvi).unwrap();
2647         vcpu.vcpu_init(&kvi).unwrap();
2648 
2649         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2650     }
2651 
2652     #[test]
2653     fn test_read_mpidr() {
2654         let hv = hypervisor::new().unwrap();
2655         let vm = hv.create_vm().unwrap();
2656         let vcpu = vm.create_vcpu(0, None).unwrap();
2657         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2658         vm.get_preferred_target(&mut kvi).unwrap();
2659 
2660         // Must fail when vcpu is not initialized yet.
2661         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2662 
2663         vcpu.vcpu_init(&kvi).unwrap();
2664         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2665     }
2666 
2667     #[test]
2668     fn test_is_system_register() {
2669         let offset = offset__of!(user_pt_regs, pc);
2670         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2671         assert!(!is_system_register(regid));
2672         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2673         assert!(is_system_register(regid));
2674     }
2675 
2676     #[test]
2677     fn test_save_restore_core_regs() {
2678         let hv = hypervisor::new().unwrap();
2679         let vm = hv.create_vm().unwrap();
2680         let vcpu = vm.create_vcpu(0, None).unwrap();
2681         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2682         vm.get_preferred_target(&mut kvi).unwrap();
2683 
2684         // Must fail when vcpu is not initialized yet.
2685         let res = vcpu.get_regs();
2686         assert!(res.is_err());
2687         assert_eq!(
2688             format!("{}", res.unwrap_err()),
2689             "Failed to get core register: Exec format error (os error 8)"
2690         );
2691 
2692         let mut state = kvm_regs::default();
2693         let res = vcpu.set_regs(&state);
2694         assert!(res.is_err());
2695         assert_eq!(
2696             format!("{}", res.unwrap_err()),
2697             "Failed to set core register: Exec format error (os error 8)"
2698         );
2699 
2700         vcpu.vcpu_init(&kvi).unwrap();
2701         let res = vcpu.get_regs();
2702         assert!(res.is_ok());
2703         state = res.unwrap();
2704         assert_eq!(state.regs.pstate, 0x3C5);
2705 
2706         assert!(vcpu.set_regs(&state).is_ok());
2707     }
2708 
2709     #[test]
2710     fn test_get_set_mpstate() {
2711         let hv = hypervisor::new().unwrap();
2712         let vm = hv.create_vm().unwrap();
2713         let vcpu = vm.create_vcpu(0, None).unwrap();
2714         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2715         vm.get_preferred_target(&mut kvi).unwrap();
2716 
2717         let res = vcpu.get_mp_state();
2718         assert!(res.is_ok());
2719         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2720     }
2721 }
2722