xref: /cloud-hypervisor/vmm/src/cpu.rs (revision 99a98f270d8ac3acaff5a24d126e442350469bca)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::CpusConfig;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use crate::coredump::{
17     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
18     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
19     NT_PRSTATUS,
20 };
21 #[cfg(feature = "guest_debug")]
22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
23 #[cfg(target_arch = "x86_64")]
24 use crate::memory_manager::MemoryManager;
25 use crate::seccomp_filters::{get_seccomp_filter, Thread};
26 #[cfg(target_arch = "x86_64")]
27 use crate::vm::physical_bits;
28 use crate::GuestMemoryMmap;
29 use crate::CPU_MANAGER_SNAPSHOT_ID;
30 use acpi_tables::{aml, sdt::Sdt, Aml};
31 use anyhow::anyhow;
32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
33 use arch::aarch64::regs;
34 use arch::EntryPoint;
35 use arch::NumaNodes;
36 #[cfg(target_arch = "aarch64")]
37 use devices::gic::Gic;
38 use devices::interrupt_controller::InterruptController;
39 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
40 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
41 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
42 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
43 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
44 use hypervisor::aarch64::StandardRegisters;
45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
46 use hypervisor::arch::x86::msr_index;
47 #[cfg(target_arch = "x86_64")]
48 use hypervisor::arch::x86::CpuIdEntry;
49 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
50 use hypervisor::arch::x86::MsrEntry;
51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
52 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
53 #[cfg(target_arch = "aarch64")]
54 use hypervisor::kvm::kvm_bindings;
55 #[cfg(all(target_arch = "aarch64", feature = "kvm"))]
56 use hypervisor::kvm::kvm_ioctls::Cap;
57 #[cfg(feature = "tdx")]
58 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
59 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
60 use libc::{c_void, siginfo_t};
61 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
62 use linux_loader::elf::Elf64_Nhdr;
63 use seccompiler::{apply_filter, SeccompAction};
64 use std::collections::BTreeMap;
65 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
66 use std::io::Write;
67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
68 use std::mem::size_of;
69 use std::os::unix::thread::JoinHandleExt;
70 use std::sync::atomic::{AtomicBool, Ordering};
71 use std::sync::{Arc, Barrier, Mutex};
72 use std::{cmp, io, result, thread};
73 use thiserror::Error;
74 use tracer::trace_scoped;
75 use vm_device::BusDevice;
76 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
77 use vm_memory::ByteValued;
78 #[cfg(feature = "guest_debug")]
79 use vm_memory::{Bytes, GuestAddressSpace};
80 use vm_memory::{GuestAddress, GuestMemoryAtomic};
81 use vm_migration::{
82     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
83     Transportable,
84 };
85 use vmm_sys_util::eventfd::EventFd;
86 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
87 use zerocopy::AsBytes;
88 
89 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
90 /// Extract the specified bits of a 64-bit integer.
91 /// For example, to extract 2 bits from offset 1 (zero-based) of `6u64`,
92 /// the following expression should return 3 (`0b11`):
93 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
94 ///
95 macro_rules! extract_bits_64 {
96     ($value: tt, $offset: tt, $length: tt) => {
97         ($value >> $offset) & (!0u64 >> (64 - $length))
98     };
99 }
100 
101 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
102 macro_rules! extract_bits_64_without_offset {
103     ($value: tt, $length: tt) => {
104         $value & (!0u64 >> (64 - $length))
105     };
106 }
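// Illustrative usage of the two helper macros above (not part of the original
// source); both operate on `u64` values:
//   extract_bits_64!(0b0000_0110u64, 1, 2)        == 0b11  (bits [2:1])
//   extract_bits_64_without_offset!(0b0110u64, 2) == 0b10  (bits [1:0])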
107 
108 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
109 
110 #[derive(Debug, Error)]
111 pub enum Error {
112     #[error("Error creating vCPU: {0}")]
113     VcpuCreate(#[source] anyhow::Error),
114 
115     #[error("Error running vCPU: {0}")]
116     VcpuRun(#[source] anyhow::Error),
117 
118     #[error("Error spawning vCPU thread: {0}")]
119     VcpuSpawn(#[source] io::Error),
120 
121     #[error("Error generating common CPUID: {0}")]
122     CommonCpuId(#[source] arch::Error),
123 
124     #[error("Error configuring vCPU: {0}")]
125     VcpuConfiguration(#[source] arch::Error),
126 
127     #[cfg(target_arch = "aarch64")]
128     #[error("Error fetching preferred target: {0}")]
129     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
130 
131     #[cfg(target_arch = "aarch64")]
132     #[error("Error initialising vCPU: {0}")]
133     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
134 
135     #[error("Failed to join on vCPU threads: {0:?}")]
136     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
137 
138     #[error("Error adding CpuManager to MMIO bus: {0}")]
139     BusError(#[source] vm_device::BusError),
140 
141     #[error("Requested vCPUs exceed maximum")]
142     DesiredVCpuCountExceedsMax,
143 
144     #[error("Cannot create seccomp filter: {0}")]
145     CreateSeccompFilter(#[source] seccompiler::Error),
146 
147     #[error("Cannot apply seccomp filter: {0}")]
148     ApplySeccompFilter(#[source] seccompiler::Error),
149 
150     #[error("Error starting vCPU after restore: {0}")]
151     StartRestoreVcpu(#[source] anyhow::Error),
152 
153     #[error("Unexpected VmExit")]
154     UnexpectedVmExit,
155 
156     #[error("Failed to allocate MMIO address for CpuManager")]
157     AllocateMmmioAddress,
158 
159     #[cfg(feature = "tdx")]
160     #[error("Error initializing TDX: {0}")]
161     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
162 
163     #[cfg(target_arch = "aarch64")]
164     #[error("Error initializing PMU: {0}")]
165     InitPmu(#[source] hypervisor::HypervisorCpuError),
166 
167     #[cfg(feature = "guest_debug")]
168     #[error("Error during CPU debug: {0}")]
169     CpuDebug(#[source] hypervisor::HypervisorCpuError),
170 
171     #[cfg(feature = "guest_debug")]
172     #[error("Error translating virtual address: {0}")]
173     TranslateVirtualAddress(#[source] anyhow::Error),
174 
175     #[cfg(target_arch = "x86_64")]
176     #[error("Error setting up AMX: {0}")]
177     AmxEnable(#[source] anyhow::Error),
178 
179     #[error("Maximum number of vCPUs exceeds host limit")]
180     MaximumVcpusExceeded,
181 }
182 pub type Result<T> = result::Result<T, Error>;
183 
184 #[cfg(target_arch = "x86_64")]
185 #[allow(dead_code)]
186 #[repr(packed)]
187 #[derive(AsBytes)]
188 struct LocalApic {
189     pub r#type: u8,
190     pub length: u8,
191     pub processor_id: u8,
192     pub apic_id: u8,
193     pub flags: u32,
194 }
195 
196 #[allow(dead_code)]
197 #[repr(packed)]
198 #[derive(Default, AsBytes)]
199 struct Ioapic {
200     pub r#type: u8,
201     pub length: u8,
202     pub ioapic_id: u8,
203     _reserved: u8,
204     pub apic_address: u32,
205     pub gsi_base: u32,
206 }
207 
208 #[cfg(target_arch = "aarch64")]
209 #[allow(dead_code)]
210 #[repr(packed)]
211 #[derive(AsBytes)]
212 struct GicC {
213     pub r#type: u8,
214     pub length: u8,
215     pub reserved0: u16,
216     pub cpu_interface_number: u32,
217     pub uid: u32,
218     pub flags: u32,
219     pub parking_version: u32,
220     pub performance_interrupt: u32,
221     pub parked_address: u64,
222     pub base_address: u64,
223     pub gicv_base_address: u64,
224     pub gich_base_address: u64,
225     pub vgic_interrupt: u32,
226     pub gicr_base_address: u64,
227     pub mpidr: u64,
228     pub proc_power_effi_class: u8,
229     pub reserved1: u8,
230     pub spe_overflow_interrupt: u16,
231 }
232 
233 #[cfg(target_arch = "aarch64")]
234 #[allow(dead_code)]
235 #[repr(packed)]
236 #[derive(AsBytes)]
237 struct GicD {
238     pub r#type: u8,
239     pub length: u8,
240     pub reserved0: u16,
241     pub gic_id: u32,
242     pub base_address: u64,
243     pub global_irq_base: u32,
244     pub version: u8,
245     pub reserved1: [u8; 3],
246 }
247 
248 #[cfg(target_arch = "aarch64")]
249 #[allow(dead_code)]
250 #[repr(packed)]
251 #[derive(AsBytes)]
252 struct GicR {
253     pub r#type: u8,
254     pub length: u8,
255     pub reserved: u16,
256     pub base_address: u64,
257     pub range_length: u32,
258 }
259 
260 #[cfg(target_arch = "aarch64")]
261 #[allow(dead_code)]
262 #[repr(packed)]
263 #[derive(AsBytes)]
264 struct GicIts {
265     pub r#type: u8,
266     pub length: u8,
267     pub reserved0: u16,
268     pub translation_id: u32,
269     pub base_address: u64,
270     pub reserved1: u32,
271 }
272 
273 #[cfg(target_arch = "aarch64")]
274 #[allow(dead_code)]
275 #[repr(packed)]
276 #[derive(AsBytes)]
277 struct ProcessorHierarchyNode {
278     pub r#type: u8,
279     pub length: u8,
280     pub reserved: u16,
281     pub flags: u32,
282     pub parent: u32,
283     pub acpi_processor_id: u32,
284     pub num_private_resources: u32,
285 }
286 
287 #[allow(dead_code)]
288 #[repr(packed)]
289 #[derive(Default, AsBytes)]
290 struct InterruptSourceOverride {
291     pub r#type: u8,
292     pub length: u8,
293     pub bus: u8,
294     pub source: u8,
295     pub gsi: u32,
296     pub flags: u16,
297 }
298 
299 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
300 macro_rules! round_up {
301     ($n:expr,$d:expr) => {
302         (($n + $d - 1) / $d) * $d
303     };
304 }
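// Illustrative behaviour of the rounding macro above, assuming a non-zero
// divisor: round_up!(9usize, 4usize) == 12 and round_up!(8usize, 4usize) == 8,
// i.e. `$n` is rounded up to the next multiple of `$d`.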
305 
306 /// A wrapper around creating and using a hypervisor-abstracted vCPU.
307 pub struct Vcpu {
308     // The hypervisor abstracted CPU.
309     vcpu: Arc<dyn hypervisor::Vcpu>,
310     id: u8,
311     #[cfg(target_arch = "aarch64")]
312     mpidr: u64,
313     saved_state: Option<CpuState>,
314 }
315 
316 impl Vcpu {
317     /// Constructs a new VCPU for `vm`.
318     ///
319     /// # Arguments
320     ///
321     /// * `id` - Represents the CPU number between [0, max vcpus).
322     /// * `vm` - The virtual machine this vcpu will get attached to.
323     /// * `vm_ops` - Optional object for exit handling.
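    /// # Example (illustrative sketch)
    ///
    /// ```ignore
    /// // Assumes an existing `vm: Arc<dyn hypervisor::Vm>`; exit handling is
    /// // skipped here by passing `None` for `vm_ops`.
    /// let vcpu = Vcpu::new(0, &vm, None)?;
    /// ```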
324     pub fn new(
325         id: u8,
326         vm: &Arc<dyn hypervisor::Vm>,
327         vm_ops: Option<Arc<dyn VmOps>>,
328     ) -> Result<Self> {
329         let vcpu = vm
330             .create_vcpu(id, vm_ops)
331             .map_err(|e| Error::VcpuCreate(e.into()))?;
332         // Initially the cpuid per vCPU is the one supported by this VM.
333         Ok(Vcpu {
334             vcpu,
335             id,
336             #[cfg(target_arch = "aarch64")]
337             mpidr: 0,
338             saved_state: None,
339         })
340     }
341 
342     /// Configures a vCPU. This should be called once per vCPU, right after it is created.
343     ///
344     /// # Arguments
345     ///
346     /// * `boot_setup` - Optional kernel entry point (address in guest memory and boot
347     ///   protocol used) together with the guest memory it refers to.
348     /// * `cpuid` - (x86_64) The CPUID entries to expose to the vCPU.
349     pub fn configure(
350         &mut self,
351         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
352         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
353         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
354         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
355     ) -> Result<()> {
356         #[cfg(target_arch = "aarch64")]
357         {
358             self.init(vm)?;
359             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
360                 .map_err(Error::VcpuConfiguration)?;
361         }
362         info!("Configuring vCPU: cpu_id = {}", self.id);
363         #[cfg(target_arch = "x86_64")]
364         arch::configure_vcpu(&self.vcpu, self.id, boot_setup, cpuid, kvm_hyperv)
365             .map_err(Error::VcpuConfiguration)?;
366 
367         Ok(())
368     }
369 
370     /// Gets the MPIDR register value.
371     #[cfg(target_arch = "aarch64")]
372     pub fn get_mpidr(&self) -> u64 {
373         self.mpidr
374     }
375 
376     /// Gets the saved vCPU state.
377     #[cfg(target_arch = "aarch64")]
378     pub fn get_saved_state(&self) -> Option<CpuState> {
379         self.saved_state.clone()
380     }
381 
382     /// Initializes an aarch64 specific vcpu for booting Linux.
383     #[cfg(target_arch = "aarch64")]
384     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
385         let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
386 
387         // This reads back the kernel's preferred target type.
388         vm.get_preferred_target(&mut kvi)
389             .map_err(Error::VcpuArmPreferredTarget)?;
390         // We already checked that the capability is supported.
391         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
392         if vm
393             .as_any()
394             .downcast_ref::<hypervisor::kvm::KvmVm>()
395             .unwrap()
396             .check_extension(Cap::ArmPmuV3)
397         {
398             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
399         }
400         // Non-boot cpus are powered off initially.
401         if self.id > 0 {
402             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
403         }
404         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
405     }
406 
407     /// Runs the vCPU until it exits, returning the reason.
408     ///
409     /// Note that the state of the vCPU and its associated VM must be set up first for this to do
410     /// anything useful.
411     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
412         self.vcpu.run()
413     }
414 }
415 
416 impl Pausable for Vcpu {}
417 impl Snapshottable for Vcpu {
418     fn id(&self) -> String {
419         self.id.to_string()
420     }
421 
422     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
423         let saved_state = self
424             .vcpu
425             .state()
426             .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?;
427 
428         self.saved_state = Some(saved_state.clone());
429 
430         Ok(Snapshot::from_data(SnapshotData::new_from_state(
431             &saved_state,
432         )?))
433     }
434 }
435 
436 pub struct CpuManager {
437     hypervisor_type: HypervisorType,
438     config: CpusConfig,
439     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
440     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
441     #[cfg(target_arch = "x86_64")]
442     cpuid: Vec<CpuIdEntry>,
443     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
444     vm: Arc<dyn hypervisor::Vm>,
445     vcpus_kill_signalled: Arc<AtomicBool>,
446     vcpus_pause_signalled: Arc<AtomicBool>,
447     exit_evt: EventFd,
448     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
449     reset_evt: EventFd,
450     #[cfg(feature = "guest_debug")]
451     vm_debug_evt: EventFd,
452     vcpu_states: Vec<VcpuState>,
453     selected_cpu: u8,
454     vcpus: Vec<Arc<Mutex<Vcpu>>>,
455     seccomp_action: SeccompAction,
456     vm_ops: Arc<dyn VmOps>,
457     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
458     acpi_address: Option<GuestAddress>,
459     proximity_domain_per_cpu: BTreeMap<u8, u32>,
460     affinity: BTreeMap<u8, Vec<u8>>,
461     dynamic: bool,
462 }
463 
464 const CPU_ENABLE_FLAG: usize = 0;
465 const CPU_INSERTING_FLAG: usize = 1;
466 const CPU_REMOVING_FLAG: usize = 2;
467 const CPU_EJECT_FLAG: usize = 3;
468 
469 const CPU_STATUS_OFFSET: u64 = 4;
470 const CPU_SELECTION_OFFSET: u64 = 0;
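// Register layout of the CPU manager device as implemented by the BusDevice
// code below (offsets in bytes):
//   CPU_SELECTION_OFFSET (0): id of the currently selected vCPU (read/write)
//   CPU_STATUS_OFFSET (4):    status/control bits for the selected vCPU, where
//     bit 0 = enabled, bit 1 = inserting, bit 2 = removing, bit 3 = eject request (write)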
471 
472 impl BusDevice for CpuManager {
473     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
474         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
475         data.fill(0);
476 
477         match offset {
478             CPU_SELECTION_OFFSET => {
479                 data[0] = self.selected_cpu;
480             }
481             CPU_STATUS_OFFSET => {
482                 if self.selected_cpu < self.max_vcpus() {
483                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
484                     if state.active() {
485                         data[0] |= 1 << CPU_ENABLE_FLAG;
486                     }
487                     if state.inserting {
488                         data[0] |= 1 << CPU_INSERTING_FLAG;
489                     }
490                     if state.removing {
491                         data[0] |= 1 << CPU_REMOVING_FLAG;
492                     }
493                 } else {
494                     warn!("Out of range vCPU id: {}", self.selected_cpu);
495                 }
496             }
497             _ => {
498                 warn!(
499                     "Unexpected offset for accessing CPU manager device: {:#x}",
500                     offset
501                 );
502             }
503         }
504     }
505 
506     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
507         match offset {
508             CPU_SELECTION_OFFSET => {
509                 self.selected_cpu = data[0];
510             }
511             CPU_STATUS_OFFSET => {
512                 if self.selected_cpu < self.max_vcpus() {
513                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
514                     // The ACPI code writes back a 1 to acknowledge the insertion
515                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
516                         && state.inserting
517                     {
518                         state.inserting = false;
519                     }
520                     // Ditto for removal
521                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
522                         && state.removing
523                     {
524                         state.removing = false;
525                     }
526                     // Trigger removal of vCPU
527                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
528                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
529                             error!("Error removing vCPU: {:?}", e);
530                         }
531                     }
532                 } else {
533                     warn!("Out of range vCPU id: {}", self.selected_cpu);
534                 }
535             }
536             _ => {
537                 warn!(
538                     "Unexpected offset for accessing CPU manager device: {:#x}",
539                     offset
540                 );
541             }
542         }
543         None
544     }
545 }
546 
547 #[derive(Default)]
548 struct VcpuState {
549     inserting: bool,
550     removing: bool,
551     handle: Option<thread::JoinHandle<()>>,
552     kill: Arc<AtomicBool>,
553     vcpu_run_interrupted: Arc<AtomicBool>,
554 }
555 
556 impl VcpuState {
557     fn active(&self) -> bool {
558         self.handle.is_some()
559     }
560 
561     fn signal_thread(&self) {
562         if let Some(handle) = self.handle.as_ref() {
563             loop {
564                 // SAFETY: FFI call with correct arguments
565                 unsafe {
566                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
567                 }
568                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
569                     break;
570                 } else {
571                     // This is more effective than thread::yield_now() at
572                     // avoiding a priority inversion with the vCPU thread
573                     thread::sleep(std::time::Duration::from_millis(1));
574                 }
575             }
576         }
577     }
578 
579     fn join_thread(&mut self) -> Result<()> {
580         if let Some(handle) = self.handle.take() {
581             handle.join().map_err(Error::ThreadCleanup)?
582         }
583 
584         Ok(())
585     }
586 
587     fn unpark_thread(&self) {
588         if let Some(handle) = self.handle.as_ref() {
589             handle.thread().unpark()
590         }
591     }
592 }
593 
594 impl CpuManager {
595     #[allow(unused_variables)]
596     #[allow(clippy::too_many_arguments)]
597     pub fn new(
598         config: &CpusConfig,
599         vm: Arc<dyn hypervisor::Vm>,
600         exit_evt: EventFd,
601         reset_evt: EventFd,
602         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
603         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
604         seccomp_action: SeccompAction,
605         vm_ops: Arc<dyn VmOps>,
606         #[cfg(feature = "tdx")] tdx_enabled: bool,
607         numa_nodes: &NumaNodes,
608     ) -> Result<Arc<Mutex<CpuManager>>> {
609         if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
610             return Err(Error::MaximumVcpusExceeded);
611         }
612 
613         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
614         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
615         let hypervisor_type = hypervisor.hypervisor_type();
616 
617         #[cfg(target_arch = "x86_64")]
618         if config.features.amx {
619             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
620             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
621             const XFEATURE_XTILEDATA: usize = 18;
622             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
623 
624             // SAFETY: the syscall only modifies kernel-internal
625             // data structures that the kernel itself is expected to safeguard.
626             let amx_tile = unsafe {
627                 libc::syscall(
628                     libc::SYS_arch_prctl,
629                     ARCH_REQ_XCOMP_GUEST_PERM,
630                     XFEATURE_XTILEDATA,
631                 )
632             };
633 
634             if amx_tile != 0 {
635                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
636             } else {
637                 let mask: usize = 0;
638                 // SAFETY: the mask being modified (not marked mutable, as it is
639                 // only modified inside the unsafe block, which is permitted) isn't in use elsewhere.
640                 let result = unsafe {
641                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
642                 };
643                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
644                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
645                 }
646             }
647         }
648 
649         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
650             let mut cpu_list = Vec::new();
651             for (proximity_domain, numa_node) in numa_nodes.iter() {
652                 for cpu in numa_node.cpus.iter() {
653                     cpu_list.push((*cpu, *proximity_domain))
654                 }
655             }
656             cpu_list
657         }
658         .into_iter()
659         .collect();
660 
661         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
662             cpu_affinity
663                 .iter()
664                 .map(|a| (a.vcpu, a.host_cpus.clone()))
665                 .collect()
666         } else {
667             BTreeMap::new()
668         };
669 
670         #[cfg(feature = "tdx")]
671         let dynamic = !tdx_enabled;
672         #[cfg(not(feature = "tdx"))]
673         let dynamic = true;
674 
675         Ok(Arc::new(Mutex::new(CpuManager {
676             hypervisor_type,
677             config: config.clone(),
678             interrupt_controller: None,
679             #[cfg(target_arch = "x86_64")]
680             cpuid: Vec::new(),
681             vm,
682             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
683             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
684             vcpu_states,
685             exit_evt,
686             reset_evt,
687             #[cfg(feature = "guest_debug")]
688             vm_debug_evt,
689             selected_cpu: 0,
690             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
691             seccomp_action,
692             vm_ops,
693             acpi_address: None,
694             proximity_domain_per_cpu,
695             affinity,
696             dynamic,
697         })))
698     }
699 
700     #[cfg(target_arch = "x86_64")]
701     pub fn populate_cpuid(
702         &mut self,
703         memory_manager: &Arc<Mutex<MemoryManager>>,
704         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
705         #[cfg(feature = "tdx")] tdx_enabled: bool,
706     ) -> Result<()> {
707         let sgx_epc_sections = memory_manager
708             .lock()
709             .unwrap()
710             .sgx_epc_region()
711             .as_ref()
712             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
713 
714         let topology = self.config.topology.clone().map_or_else(
715             || {
716                 #[cfg(feature = "mshv")]
717                 if matches!(hypervisor.hypervisor_type(), HypervisorType::Mshv) {
718                     return Some((1, self.boot_vcpus(), 1));
719                 }
720                 None
721             },
722             |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
723         );
724 
725         self.cpuid = {
726             let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
727             arch::generate_common_cpuid(
728                 hypervisor,
729                 topology,
730                 sgx_epc_sections,
731                 phys_bits,
732                 self.config.kvm_hyperv,
733                 #[cfg(feature = "tdx")]
734                 tdx_enabled,
735             )
736             .map_err(Error::CommonCpuId)?
737         };
738 
739         Ok(())
740     }
741 
742     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
743         info!("Creating vCPU: cpu_id = {}", cpu_id);
744 
745         let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?;
746 
747         if let Some(snapshot) = snapshot {
748             // AArch64 vCPUs should be initialized after being created.
749             #[cfg(target_arch = "aarch64")]
750             vcpu.init(&self.vm)?;
751 
752             let state: CpuState = snapshot.to_state().map_err(|e| {
753                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
754             })?;
755             vcpu.vcpu
756                 .set_state(&state)
757                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
758 
759             vcpu.saved_state = Some(state);
760         }
761 
762         let vcpu = Arc::new(Mutex::new(vcpu));
763 
764         // Adding vCPU to the CpuManager's vCPU list.
765         self.vcpus.push(vcpu.clone());
766 
767         Ok(vcpu)
768     }
769 
770     pub fn configure_vcpu(
771         &self,
772         vcpu: Arc<Mutex<Vcpu>>,
773         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
774     ) -> Result<()> {
775         let mut vcpu = vcpu.lock().unwrap();
776 
777         #[cfg(target_arch = "x86_64")]
778         assert!(!self.cpuid.is_empty());
779 
780         #[cfg(target_arch = "x86_64")]
781         vcpu.configure(boot_setup, self.cpuid.clone(), self.config.kvm_hyperv)?;
782 
783         #[cfg(target_arch = "aarch64")]
784         vcpu.configure(&self.vm, boot_setup)?;
785 
786         Ok(())
787     }
788 
789     /// Only create new vCPUs if there aren't any inactive ones to reuse
790     fn create_vcpus(
791         &mut self,
792         desired_vcpus: u8,
793         snapshot: Option<Snapshot>,
794     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
795         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
796         info!(
797             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
798             desired_vcpus,
799             self.config.max_vcpus,
800             self.vcpus.len(),
801             self.present_vcpus()
802         );
803 
804         if desired_vcpus > self.config.max_vcpus {
805             return Err(Error::DesiredVCpuCountExceedsMax);
806         }
807 
808         // Only create vCPUs in excess of all the allocated vCPUs.
809         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
810             vcpus.push(self.create_vcpu(
811                 cpu_id,
812                 // TODO: The special format of the CPU id can be removed once
813                 // ready to break live upgrade.
814                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
815             )?);
816         }
817 
818         Ok(vcpus)
819     }
820 
821     #[cfg(target_arch = "aarch64")]
822     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
823         for cpu in self.vcpus.iter() {
824             let cpu = cpu.lock().unwrap();
825             // Check whether the PMU attribute is available; if not, log it and skip PMU init.
826             if cpu.vcpu.has_pmu_support() {
827                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
828             } else {
829                 debug!(
830                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
831                     cpu.id
832                 );
833                 return Ok(false);
834             }
835         }
836 
837         Ok(true)
838     }
839 
840     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
841         self.vcpus.clone()
842     }
843 
844     fn start_vcpu(
845         &mut self,
846         vcpu: Arc<Mutex<Vcpu>>,
847         vcpu_id: u8,
848         vcpu_thread_barrier: Arc<Barrier>,
849         inserting: bool,
850     ) -> Result<()> {
851         let reset_evt = self.reset_evt.try_clone().unwrap();
852         let exit_evt = self.exit_evt.try_clone().unwrap();
853         #[cfg(feature = "kvm")]
854         let hypervisor_type = self.hypervisor_type;
855         #[cfg(feature = "guest_debug")]
856         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
857         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
858         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
859         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
860 
861         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
862         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
863             .vcpu_run_interrupted
864             .clone();
865         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
866 
867         // Prepare the CPU set the current vCPU is expected to run on.
868         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
869             // SAFETY: all zeros is a valid pattern
870             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
871             // SAFETY: FFI call, trivially safe
872             unsafe { libc::CPU_ZERO(&mut cpuset) };
873             for host_cpu in host_cpus {
874                 // SAFETY: FFI call, trivially safe
875                 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) };
876             }
877             cpuset
878         });
879 
880         // Retrieve seccomp filter for vcpu thread
881         let vcpu_seccomp_filter =
882             get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type)
883                 .map_err(Error::CreateSeccompFilter)?;
884 
885         #[cfg(target_arch = "x86_64")]
886         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
887 
888         info!("Starting vCPU: cpu_id = {}", vcpu_id);
889 
890         let handle = Some(
891             thread::Builder::new()
892                 .name(format!("vcpu{vcpu_id}"))
893                 .spawn(move || {
894                     // Schedule the thread to run on the expected CPU set
895                     if let Some(cpuset) = cpuset.as_ref() {
896                         // SAFETY: FFI call with correct arguments
897                         let ret = unsafe {
898                             libc::sched_setaffinity(
899                                 0,
900                                 std::mem::size_of::<libc::cpu_set_t>(),
901                                 cpuset as *const libc::cpu_set_t,
902                             )
903                         };
904 
905                         if ret != 0 {
906                             error!(
907                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
908                                 vcpu_id,
909                                 io::Error::last_os_error()
910                             );
911                             return;
912                         }
913                     }
914 
915                     // Apply seccomp filter for vcpu thread.
916                     if !vcpu_seccomp_filter.is_empty() {
917                         if let Err(e) =
918                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
919                         {
920                             error!("Error applying seccomp filter: {:?}", e);
921                             return;
922                         }
923                     }
924                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
925                     // This registers an async-signal-safe (no-op) handler so the vCPU thread can be interrupted by a signal.
926                     register_signal_handler(SIGRTMIN(), handle_signal)
927                         .expect("Failed to register vcpu signal handler");
928                     // Block until all CPUs are ready.
929                     vcpu_thread_barrier.wait();
930 
931                     std::panic::catch_unwind(move || {
932                         loop {
933                             // If we are being told to pause, we park the thread
934                             // until the pause boolean is toggled.
935                             // The resume operation is responsible for toggling
936                             // the boolean and unparking the thread.
937                             // We enter a loop because park() could spuriously
938                             // return. We will then park() again unless the
939                             // pause boolean has been toggled.
940 
941                             // Need to use Ordering::SeqCst as we have multiple
942                             // loads and stores to different atomics and we need
943                             // to see them in a consistent order in all threads
944 
945                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
946                                 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
947                                 // completed by returning to KVM_RUN. From the kernel docs:
948                                 //
949                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
950                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
951                                 // operations are complete (and guest state is consistent) only after userspace
952                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
953                                 // incomplete operations and then check for pending signals.
954                                 // The pending state of the operation is not preserved in state which is
955                                 // visible to userspace, thus userspace should ensure that the operation is
956                                 // completed before performing a live migration.  Userspace can re-enter the
957                                 // guest with an unmasked signal pending or with the immediate_exit field set
958                                 // to complete pending operations without allowing any further instructions
959                                 // to be executed.
960 
961                                 #[cfg(feature = "kvm")]
962                                 if matches!(hypervisor_type, HypervisorType::Kvm) {
963                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
964                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
965                                         error!("Unexpected VM exit on \"immediate_exit\" run");
966                                         break;
967                                     }
968                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
969                                 }
970 
971                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
972                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
973                                     thread::park();
974                                 }
975                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
976                             }
977 
978                             // We've been told to terminate
979                             if vcpu_kill_signalled.load(Ordering::SeqCst)
980                                 || vcpu_kill.load(Ordering::SeqCst)
981                             {
982                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
983                                 break;
984                             }
985 
986                             #[cfg(feature = "tdx")]
987                             let mut vcpu = vcpu.lock().unwrap();
988                             #[cfg(not(feature = "tdx"))]
989                             let vcpu = vcpu.lock().unwrap();
990                             // A triple fault from the guest surfaces as a reset exit from vcpu.run(), so trigger a reset
991                             match vcpu.run() {
992                                 Ok(run) => match run {
993                                     #[cfg(feature = "kvm")]
994                                     VmExit::Debug => {
995                                         info!("VmExit::Debug");
996                                         #[cfg(feature = "guest_debug")]
997                                         {
998                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
999                                             let raw_tid = get_raw_tid(vcpu_id as usize);
1000                                             vm_debug_evt.write(raw_tid as u64).unwrap();
1001                                         }
1002                                     }
1003                                     #[cfg(target_arch = "x86_64")]
1004                                     VmExit::IoapicEoi(vector) => {
1005                                         if let Some(interrupt_controller) =
1006                                             &interrupt_controller_clone
1007                                         {
1008                                             interrupt_controller
1009                                                 .lock()
1010                                                 .unwrap()
1011                                                 .end_of_interrupt(vector);
1012                                         }
1013                                     }
1014                                     VmExit::Ignore => {}
1015                                     VmExit::Hyperv => {}
1016                                     VmExit::Reset => {
1017                                         info!("VmExit::Reset");
1018                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1019                                         reset_evt.write(1).unwrap();
1020                                         break;
1021                                     }
1022                                     VmExit::Shutdown => {
1023                                         info!("VmExit::Shutdown");
1024                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1025                                         exit_evt.write(1).unwrap();
1026                                         break;
1027                                     }
1028                                     #[cfg(feature = "tdx")]
1029                                     VmExit::Tdx => {
1030                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1031                                             match vcpu.get_tdx_exit_details() {
1032                                                 Ok(details) => match details {
1033                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1034                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1035                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1036                                                     }
1037                                                 },
1038                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1039                                             }
1040                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1041                                         } else {
1042                                             // We should never reach this code; getting
1043                                             // here would mean the design of this code
1044                                             // is wrong.
1045                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1046                                         }
1047                                     }
1048                                     _ => {
1049                                         error!(
1050                                             "VCPU generated error: {:?}",
1051                                             Error::UnexpectedVmExit
1052                                         );
1053                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1054                                         exit_evt.write(1).unwrap();
1055                                         break;
1056                                     }
1057                                 },
1058 
1059                                 Err(e) => {
1060                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1061                                     vcpu_run_interrupted.store(true, Ordering::SeqCst);
1062                                     exit_evt.write(1).unwrap();
1063                                     break;
1064                                 }
1065                             }
1066 
1067                             // We've been told to terminate
1068                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1069                                 || vcpu_kill.load(Ordering::SeqCst)
1070                             {
1071                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1072                                 break;
1073                             }
1074                         }
1075                     })
1076                     .or_else(|_| {
1077                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1078                         error!("vCPU thread panicked");
1079                         panic_exit_evt.write(1)
1080                     })
1081                     .ok();
1082                 })
1083                 .map_err(Error::VcpuSpawn)?,
1084         );
1085 
1086         // On hotplug calls into this function, entry_point is None. It is for
1087         // those hotplugged CPU additions that we need to set the inserting flag.
1088         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1089         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1090 
1091         Ok(())
1092     }
1093 
1094     /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
1095     fn activate_vcpus(
1096         &mut self,
1097         desired_vcpus: u8,
1098         inserting: bool,
1099         paused: Option<bool>,
1100     ) -> Result<()> {
1101         if desired_vcpus > self.config.max_vcpus {
1102             return Err(Error::DesiredVCpuCountExceedsMax);
1103         }
1104 
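        // Note: the "+ 1" in the barrier size below accounts for the current thread,
        // which also waits on the barrier (see the wait() at the end of this function),
        // so that all newly spawned vCPU threads are released together.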
1105         let vcpu_thread_barrier = Arc::new(Barrier::new(
1106             (desired_vcpus - self.present_vcpus() + 1) as usize,
1107         ));
1108 
1109         if let Some(paused) = paused {
1110             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1111         }
1112 
1113         info!(
1114             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1115             desired_vcpus,
1116             self.vcpus.len(),
1117             self.present_vcpus(),
1118             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1119         );
1120 
1121         // This reuses any inactive vCPUs as well as any that were newly created
1122         for vcpu_id in self.present_vcpus()..desired_vcpus {
1123             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1124             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1125         }
1126 
1127         // Unblock all CPU threads.
1128         vcpu_thread_barrier.wait();
1129         Ok(())
1130     }
1131 
1132     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1133         // Mark vCPUs for removal, actual removal happens on ejection
1134         for cpu_id in desired_vcpus..self.present_vcpus() {
1135             self.vcpu_states[usize::from(cpu_id)].removing = true;
1136         }
1137     }
1138 
1139     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1140         info!("Removing vCPU: cpu_id = {}", cpu_id);
1141         let state = &mut self.vcpu_states[usize::from(cpu_id)];
1142         state.kill.store(true, Ordering::SeqCst);
1143         state.signal_thread();
1144         state.join_thread()?;
1145         state.handle = None;
1146 
1147         // Once the thread has exited, clear the "kill" so that it can be reused
1148         state.kill.store(false, Ordering::SeqCst);
1149 
1150         Ok(())
1151     }
1152 
1153     pub fn create_boot_vcpus(
1154         &mut self,
1155         snapshot: Option<Snapshot>,
1156     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1157         trace_scoped!("create_boot_vcpus");
1158 
1159         self.create_vcpus(self.boot_vcpus(), snapshot)
1160     }
1161 
1162     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1163     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1164         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1165     }
1166 
1167     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1168         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1169             .map_err(|e| {
1170                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1171             })?;
1172 
1173         Ok(())
1174     }
1175 
1176     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1177         if desired_vcpus == self.present_vcpus() {
1178             return Ok(false);
1179         }
1180 
1181         if !self.dynamic {
1182             return Ok(false);
1183         }
1184 
1185         match desired_vcpus.cmp(&self.present_vcpus()) {
1186             cmp::Ordering::Greater => {
1187                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1188                 for vcpu in vcpus {
1189                     self.configure_vcpu(vcpu, None)?
1190                 }
1191                 self.activate_vcpus(desired_vcpus, true, None)?;
1192                 Ok(true)
1193             }
1194             cmp::Ordering::Less => {
1195                 self.mark_vcpus_for_removal(desired_vcpus);
1196                 Ok(true)
1197             }
1198             _ => Ok(false),
1199         }
1200     }
1201 
1202     pub fn shutdown(&mut self) -> Result<()> {
1203         // Tell the vCPUs to stop themselves next time they go through the loop
1204         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1205 
1206         // Toggle the vCPUs pause boolean
1207         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1208 
1209         // Unpark all the VCPU threads.
1210         for state in self.vcpu_states.iter() {
1211             state.unpark_thread();
1212         }
1213 
1214         // Signal the spawned vCPU threads. For the vCPU threads this will interrupt
1215         // the KVM_RUN ioctl(), allowing the loop to check the boolean set
1216         // above.
1217         for state in self.vcpu_states.iter() {
1218             state.signal_thread();
1219         }
1220 
1221         // Wait for all the threads to finish. This removes the state from the vector.
1222         for mut state in self.vcpu_states.drain(..) {
1223             state.join_thread()?;
1224         }
1225 
1226         Ok(())
1227     }
1228 
1229     #[cfg(feature = "tdx")]
1230     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1231         for vcpu in &self.vcpus {
1232             vcpu.lock()
1233                 .unwrap()
1234                 .vcpu
1235                 .tdx_init(hob_address)
1236                 .map_err(Error::InitializeTdx)?;
1237         }
1238         Ok(())
1239     }
1240 
1241     pub fn boot_vcpus(&self) -> u8 {
1242         self.config.boot_vcpus
1243     }
1244 
1245     pub fn max_vcpus(&self) -> u8 {
1246         self.config.max_vcpus
1247     }
1248 
1249     #[cfg(target_arch = "x86_64")]
1250     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1251         assert!(!self.cpuid.is_empty());
1252         self.cpuid.clone()
1253     }
1254 
1255     fn present_vcpus(&self) -> u8 {
1256         self.vcpu_states
1257             .iter()
1258             .fold(0, |acc, state| acc + state.active() as u8)
1259     }
1260 
1261     #[cfg(target_arch = "aarch64")]
1262     pub fn get_mpidrs(&self) -> Vec<u64> {
1263         self.vcpus
1264             .iter()
1265             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1266             .collect()
1267     }
1268 
1269     #[cfg(target_arch = "aarch64")]
1270     pub fn get_saved_states(&self) -> Vec<CpuState> {
1271         self.vcpus
1272             .iter()
1273             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1274             .collect()
1275     }
1276 
1277     #[cfg(target_arch = "aarch64")]
1278     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1279         self.config
1280             .topology
1281             .clone()
1282             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1283     }
1284 
1285     pub fn create_madt(&self) -> Sdt {
1286         use crate::acpi;
1287         // This is also checked during command-line parsing.
1288         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1289 
1290         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1291         #[cfg(target_arch = "x86_64")]
1292         {
1293             madt.write(36, arch::layout::APIC_START.0);
1294 
1295             for cpu in 0..self.config.max_vcpus {
1296                 let lapic = LocalApic {
1297                     r#type: acpi::ACPI_APIC_PROCESSOR,
1298                     length: 8,
1299                     processor_id: cpu,
1300                     apic_id: cpu,
1301                     flags: if cpu < self.config.boot_vcpus {
1302                         1 << MADT_CPU_ENABLE_FLAG
1303                     } else {
1304                         0
1305                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1306                 };
1307                 madt.append(lapic);
1308             }
1309 
1310             madt.append(Ioapic {
1311                 r#type: acpi::ACPI_APIC_IO,
1312                 length: 12,
1313                 ioapic_id: 0,
1314                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1315                 gsi_base: 0,
1316                 ..Default::default()
1317             });
1318 
1319             madt.append(InterruptSourceOverride {
1320                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1321                 length: 10,
1322                 bus: 0,
1323                 source: 4,
1324                 gsi: 4,
1325                 flags: 0,
1326             });
1327         }
1328 
1329         #[cfg(target_arch = "aarch64")]
1330         {
1331             /* Notes:
1332              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1333              */
1334 
1335             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1336             for cpu in 0..self.config.boot_vcpus {
1337                 let vcpu = &self.vcpus[cpu as usize];
1338                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1339                 /* ARMv8 MPIDR format:
1340                      Bits [63:40] Must be zero
1341                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1342                      Bits [31:24] Must be zero
1343                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1344                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1345                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1346                 */
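                // The mask below keeps Aff3 (bits [39:32]) and Aff2..Aff0 (bits [23:0])
                // and clears the reserved bits, per the MPIDR format described above.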
1347                 let mpidr_mask = 0xff_00ff_ffff;
1348                 let gicc = GicC {
1349                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1350                     length: 80,
1351                     reserved0: 0,
1352                     cpu_interface_number: cpu as u32,
1353                     uid: cpu as u32,
1354                     flags: 1,
1355                     parking_version: 0,
1356                     performance_interrupt: 0,
1357                     parked_address: 0,
1358                     base_address: 0,
1359                     gicv_base_address: 0,
1360                     gich_base_address: 0,
1361                     vgic_interrupt: 0,
1362                     gicr_base_address: 0,
1363                     mpidr: mpidr & mpidr_mask,
1364                     proc_power_effi_class: 0,
1365                     reserved1: 0,
1366                     spe_overflow_interrupt: 0,
1367                 };
1368 
1369                 madt.append(gicc);
1370             }
1371             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1372 
1373             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1374             let gicd = GicD {
1375                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1376                 length: 24,
1377                 reserved0: 0,
1378                 gic_id: 0,
1379                 base_address: vgic_config.dist_addr,
1380                 global_irq_base: 0,
1381                 version: 3,
1382                 reserved1: [0; 3],
1383             };
1384             madt.append(gicd);
1385 
1386             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1387             let gicr = GicR {
1388                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1389                 length: 16,
1390                 reserved: 0,
1391                 base_address: vgic_config.redists_addr,
1392                 range_length: vgic_config.redists_size as u32,
1393             };
1394             madt.append(gicr);
1395 
1396             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1397             let gicits = GicIts {
1398                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1399                 length: 20,
1400                 reserved0: 0,
1401                 translation_id: 0,
1402                 base_address: vgic_config.msi_addr,
1403                 reserved1: 0,
1404             };
1405             madt.append(gicits);
1406 
1407             madt.update_checksum();
1408         }
1409 
1410         madt
1411     }
1412 
1413     #[cfg(target_arch = "aarch64")]
1414     pub fn create_pptt(&self) -> Sdt {
1415         let pptt_start = 0;
1416         let mut cpus = 0;
1417         let mut uid = 0;
1418         // If topology is not specified, the default setting is:
1419         // 1 package, multiple cores, 1 thread per core
1420         // This is also the behavior when PPTT is missing.
1421         let (threads_per_core, cores_per_package, packages) =
1422             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1423 
1424         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1425 
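             // One hierarchy node is emitted per package (cluster), then per core
             // when SMT is enabled, and finally one leaf node per thread/vCPU.
             // Flag bits used below (per the PPTT structure definition):
             // 0x2 = "ACPI Processor ID valid", 0xA additionally marks a leaf node,
             // 0xE additionally marks a leaf node that is a thread.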
1426         for cluster_idx in 0..packages {
1427             if cpus < self.config.boot_vcpus as usize {
1428                 let cluster_offset = pptt.len() - pptt_start;
1429                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1430                     r#type: 0,
1431                     length: 20,
1432                     reserved: 0,
1433                     flags: 0x2,
1434                     parent: 0,
1435                     acpi_processor_id: cluster_idx as u32,
1436                     num_private_resources: 0,
1437                 };
1438                 pptt.append(cluster_hierarchy_node);
1439 
1440                 for core_idx in 0..cores_per_package {
1441                     let core_offset = pptt.len() - pptt_start;
1442 
1443                     if threads_per_core > 1 {
1444                         let core_hierarchy_node = ProcessorHierarchyNode {
1445                             r#type: 0,
1446                             length: 20,
1447                             reserved: 0,
1448                             flags: 0x2,
1449                             parent: cluster_offset as u32,
1450                             acpi_processor_id: core_idx as u32,
1451                             num_private_resources: 0,
1452                         };
1453                         pptt.append(core_hierarchy_node);
1454 
1455                         for _thread_idx in 0..threads_per_core {
1456                             let thread_hierarchy_node = ProcessorHierarchyNode {
1457                                 r#type: 0,
1458                                 length: 20,
1459                                 reserved: 0,
1460                                 flags: 0xE,
1461                                 parent: core_offset as u32,
1462                                 acpi_processor_id: uid as u32,
1463                                 num_private_resources: 0,
1464                             };
1465                             pptt.append(thread_hierarchy_node);
1466                             uid += 1;
1467                         }
1468                     } else {
1469                         let thread_hierarchy_node = ProcessorHierarchyNode {
1470                             r#type: 0,
1471                             length: 20,
1472                             reserved: 0,
1473                             flags: 0xA,
1474                             parent: cluster_offset as u32,
1475                             acpi_processor_id: uid as u32,
1476                             num_private_resources: 0,
1477                         };
1478                         pptt.append(thread_hierarchy_node);
1479                         uid += 1;
1480                     }
1481                 }
1482                 cpus += (cores_per_package * threads_per_core) as usize;
1483             }
1484         }
1485 
1486         pptt.update_checksum();
1487         pptt
1488     }
1489 
1490     #[cfg(feature = "guest_debug")]
1491     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1492         self.vcpus[usize::from(cpu_id)]
1493             .lock()
1494             .unwrap()
1495             .vcpu
1496             .get_regs()
1497             .map_err(Error::CpuDebug)
1498     }
1499 
1500     #[cfg(feature = "guest_debug")]
1501     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1502         self.vcpus[usize::from(cpu_id)]
1503             .lock()
1504             .unwrap()
1505             .vcpu
1506             .set_regs(regs)
1507             .map_err(Error::CpuDebug)
1508     }
1509 
1510     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1511     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1512         self.vcpus[usize::from(cpu_id)]
1513             .lock()
1514             .unwrap()
1515             .vcpu
1516             .get_sregs()
1517             .map_err(Error::CpuDebug)
1518     }
1519 
1520     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1521     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1522         self.vcpus[usize::from(cpu_id)]
1523             .lock()
1524             .unwrap()
1525             .vcpu
1526             .set_sregs(sregs)
1527             .map_err(Error::CpuDebug)
1528     }
1529 
1530     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1531     fn translate_gva(
1532         &self,
1533         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1534         cpu_id: u8,
1535         gva: u64,
1536     ) -> Result<u64> {
1537         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1538             .lock()
1539             .unwrap()
1540             .vcpu
1541             .translate_gva(gva, /* flags: unused */ 0)
1542             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1543         Ok(gpa)
1544     }
1545 
1546     ///
1547     /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
1548     /// it in the VMM by walking through the translation tables.
1549     ///
1550     /// Address translation is a big topic; here we only focus on the scenario
1551     /// that happens in the VMM while debugging the kernel. This `translate_gva`
1552     /// implementation is restricted to:
1553     /// - Exception Level 1
1554     /// - Translating the high address range only (kernel space)
1555     ///
1556     /// This implementation supports the following Armv8-A features related to
1557     /// address translation:
1558     /// - FEAT_LPA
1559     /// - FEAT_LVA
1560     /// - FEAT_LPA2
1561     ///
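         /// At a high level, the walk below reads TCR_EL1, TTBR1_EL1 and
         /// ID_AA64MMFR0_EL1 from the vCPU, derives the granule size, starting
         /// level and PA size from them, and then follows the translation tables
         /// in guest memory until a block or page descriptor is reached.
         ///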
1562     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1563     fn translate_gva(
1564         &self,
1565         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1566         cpu_id: u8,
1567         gva: u64,
1568     ) -> Result<u64> {
1569         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1570             .lock()
1571             .unwrap()
1572             .vcpu
1573             .get_sys_reg(regs::TCR_EL1)
1574             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1575         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1576             .lock()
1577             .unwrap()
1578             .vcpu
1579             .get_sys_reg(regs::TTBR1_EL1)
1580             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1581         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1582             .lock()
1583             .unwrap()
1584             .vcpu
1585             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1586             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1587 
1588         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1589         // or low (0x000xxx...).
1590         let high_range = extract_bits_64!(gva, 55, 1);
1591         if high_range == 0 {
1592             info!("VA (0x{:x}) range is not supported!", gva);
1593             return Ok(gva);
1594         }
1595 
1596         // High range size offset
1597         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1598         // Granule size
1599         let tg = extract_bits_64!(tcr_el1, 30, 2);
1600         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1601         let ds = extract_bits_64!(tcr_el1, 59, 1);
1602 
1603         if tsz == 0 {
1604             info!("VA translation is not ready!");
1605             return Ok(gva);
1606         }
1607 
1608         // VA size is determined by TCR_EL1.T1SZ
1609         let va_size = 64 - tsz;
1610         // Number of bits in VA consumed in each level of translation
1611         let stride = match tg {
1612             3 => 13, // 64KB granule size
1613             1 => 11, // 16KB granule size
1614             _ => 9,  // 4KB, default
1615         };
1616         // Starting level of walking
1617         let mut level = 4 - (va_size - 4) / stride;
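             // e.g. a 4KB granule (stride 9) with a 48-bit VA gives
             // level = 4 - 44 / 9 = 0, i.e. a four-level walk.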
1618 
1619         // PA or IPA size is determined from TCR_EL1.IPS and ID_AA64MMFR0_EL1.PARange
1620         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1621         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1622         // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should match.
1623         // To be safe, we use the minimum value if they are different.
1624         let pa_range = std::cmp::min(tcr_ips, pa_range);
1625         // PA size in bits
1626         let pa_size = match pa_range {
1627             0 => 32,
1628             1 => 36,
1629             2 => 40,
1630             3 => 42,
1631             4 => 44,
1632             5 => 48,
1633             6 => 52,
1634             _ => {
1635                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1636                     "PA range not supported {pa_range}"
1637                 ))))
1638             }
1639         };
1640 
1641         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1642         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1643         // If FEAT_LPA2 is present, the translation table descriptor holds
1644         // 50 bits of the table address of the next level.
1645         // Otherwise, it is 48 bits.
1646         let descaddrmask = if ds == 1 {
1647             !0u64 >> (64 - 50) // mask with 50 least significant bits
1648         } else {
1649             !0u64 >> (64 - 48) // mask with 48 least significant bits
1650         };
1651         let descaddrmask = descaddrmask & !indexmask_grainsize;
1652 
1653         // Translation table base address
1654         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1655         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1656         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1657         if pa_size == 52 {
1658             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1659         }
1660 
1661         // Loop through tables of each level
1662         loop {
1663             // Table offset for current level
1664             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
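                 // The shift leaves this level's index already multiplied by the
                 // 8-byte descriptor size; the stray low 3 bits are cleared below
                 // so descaddr ends up 8-byte aligned.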
1665             descaddr |= table_offset;
1666             descaddr &= !7u64;
1667 
1668             let mut buf = [0; 8];
1669             guest_memory
1670                 .memory()
1671                 .read(&mut buf, GuestAddress(descaddr))
1672                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1673             let descriptor = u64::from_le_bytes(buf);
1674 
1675             descaddr = descriptor & descaddrmask;
1676             // In the case of FEAT_LPA, the next-level translation table address
1677             // bits [48:51] come from bits [12:15] of the current descriptor.
1678             // For FEAT_LPA2, the next-level translation table address
1679             // bits [50:51] come from bits [8:9] of the current descriptor,
1680             // and bits [48:49] come from bits [48:49] of the descriptor, which
1681             // were already kept by the descriptor address mask above.
1682             if pa_size == 52 {
1683                 if ds == 1 {
1684                     // FEAT_LPA2
1685                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1686                 } else {
1687                     // FEAT_LPA
1688                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1689                 }
1690             }
1691 
1692             if (descriptor & 2) != 0 && (level < 3) {
1693                 // This is a table entry. Go down to next level.
1694                 level += 1;
1695                 indexmask = indexmask_grainsize;
1696                 continue;
1697             }
1698 
1699             break;
1700         }
1701 
1702         // We have reached either:
1703         // - a page entry at level 3 or
1704         // - a block entry at level 1 or 2
1705         let page_size = 1u64 << ((stride * (4 - level)) + 3);
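             // e.g. with a 4KB granule: level 3 => 4KiB page, level 2 => 2MiB
             // block, level 1 => 1GiB block.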
1706         descaddr &= !(page_size - 1);
1707         descaddr |= gva & (page_size - 1);
1708 
1709         Ok(descaddr)
1710     }
1711 
1712     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1713         self.acpi_address = Some(acpi_address);
1714     }
1715 
1716     pub(crate) fn set_interrupt_controller(
1717         &mut self,
1718         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1719     ) {
1720         self.interrupt_controller = Some(interrupt_controller);
1721     }
1722 }
1723 
1724 struct Cpu {
1725     cpu_id: u8,
1726     proximity_domain: u32,
1727     dynamic: bool,
1728 }
1729 
1730 #[cfg(target_arch = "x86_64")]
1731 const MADT_CPU_ENABLE_FLAG: usize = 0;
1732 
1733 #[cfg(target_arch = "x86_64")]
1734 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1735 
1736 impl Cpu {
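         // Builds the byte buffer returned by this CPU device's _MAT object: a
         // single MADT Local APIC entry for the vCPU with the Enabled flag set.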
1737     #[cfg(target_arch = "x86_64")]
1738     fn generate_mat(&self) -> Vec<u8> {
1739         let lapic = LocalApic {
1740             r#type: 0,
1741             length: 8,
1742             processor_id: self.cpu_id,
1743             apic_id: self.cpu_id,
1744             flags: 1 << MADT_CPU_ENABLE_FLAG,
1745         };
1746 
1747         let mut mat_data: Vec<u8> = Vec::new();
1748         mat_data.resize(std::mem::size_of_val(&lapic), 0);
1749         // SAFETY: mat_data is large enough to hold lapic
1750         unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic };
1751 
1752         mat_data
1753     }
1754 }
1755 
1756 impl Aml for Cpu {
1757     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1758         #[cfg(target_arch = "x86_64")]
1759         let mat_data: Vec<u8> = self.generate_mat();
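             // When the vCPU is hot(un)pluggable (`dynamic`), _STA and _EJ0 call
             // into the CSTA/CEJ0 methods of the hotplug controller; otherwise a
             // static _STA returning 0xf (present and enabled) is emitted.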
1760         #[allow(clippy::if_same_then_else)]
1761         if self.dynamic {
1762             aml::Device::new(
1763                 format!("C{:03}", self.cpu_id).as_str().into(),
1764                 vec![
1765                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1766                     &aml::Name::new("_UID".into(), &self.cpu_id),
1767                     // Currently, AArch64 does not support the following fields.
1768                     /*
1769                     _STA return value:
1770                     Bit [0] – Set if the device is present.
1771                     Bit [1] – Set if the device is enabled and decoding its resources.
1772                     Bit [2] – Set if the device should be shown in the UI.
1773                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1774                     Bit [4] – Set if the battery is present.
1775                     Bits [31:5] – Reserved (must be cleared).
1776                     */
1777                     #[cfg(target_arch = "x86_64")]
1778                     &aml::Method::new(
1779                         "_STA".into(),
1780                         0,
1781                         false,
1782                         // Call into CSTA method which will interrogate device
1783                         vec![&aml::Return::new(&aml::MethodCall::new(
1784                             "CSTA".into(),
1785                             vec![&self.cpu_id],
1786                         ))],
1787                     ),
1788                     &aml::Method::new(
1789                         "_PXM".into(),
1790                         0,
1791                         false,
1792                         vec![&aml::Return::new(&self.proximity_domain)],
1793                     ),
1794                     // The Linux kernel expects every CPU device to have a _MAT entry
1795                     // containing the LAPIC for this processor with the enabled bit set
1796                     // even if it is disabled in the MADT (non-boot CPU)
1797                     #[cfg(target_arch = "x86_64")]
1798                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1799                     // Trigger CPU ejection
1800                     #[cfg(target_arch = "x86_64")]
1801                     &aml::Method::new(
1802                         "_EJ0".into(),
1803                         1,
1804                         false,
1805                         // Call into CEJ0 method which will actually eject device
1806                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1807                     ),
1808                 ],
1809             )
1810             .to_aml_bytes(sink);
1811         } else {
1812             aml::Device::new(
1813                 format!("C{:03}", self.cpu_id).as_str().into(),
1814                 vec![
1815                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1816                     &aml::Name::new("_UID".into(), &self.cpu_id),
1817                     #[cfg(target_arch = "x86_64")]
1818                     &aml::Method::new(
1819                         "_STA".into(),
1820                         0,
1821                         false,
1822                         // Mark the CPU as present; see the CSTA implementation
1823                         vec![&aml::Return::new(&0xfu8)],
1824                     ),
1825                     &aml::Method::new(
1826                         "_PXM".into(),
1827                         0,
1828                         false,
1829                         vec![&aml::Return::new(&self.proximity_domain)],
1830                     ),
1831                     // The Linux kernel expects every CPU device to have a _MAT entry
1832                     // containing the LAPIC for this processor with the enabled bit set
1833                     // even if it is disabled in the MADT (non-boot CPU)
1834                     #[cfg(target_arch = "x86_64")]
1835                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1836                 ],
1837             )
1838             .to_aml_bytes(sink);
1839         }
1840     }
1841 }
1842 
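     // Emits an `If (Arg0 == cpu_id) { Notify(Cxxx, Arg1) }` AML fragment; the
     // CTFY method below is built from one such fragment per possible vCPU.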
1843 struct CpuNotify {
1844     cpu_id: u8,
1845 }
1846 
1847 impl Aml for CpuNotify {
1848     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1849         let object = aml::Path::new(&format!("C{:03}", self.cpu_id));
1850         aml::If::new(
1851             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
1852             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1853         )
1854         .to_aml_bytes(sink)
1855     }
1856 }
1857 
1858 struct CpuMethods {
1859     max_vcpus: u8,
1860     dynamic: bool,
1861 }
1862 
1863 impl Aml for CpuMethods {
1864     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1865         if self.dynamic {
1866             // CPU status method
1867             aml::Method::new(
1868                 "CSTA".into(),
1869                 1,
1870                 true,
1871                 vec![
1872                     // Take lock defined above
1873                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1874                     // Write CPU number (in first argument) to I/O port via field
1875                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1876                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1877                     // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for the meaning)
1878                     &aml::If::new(
1879                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
1880                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
1881                     ),
1882                     // Release lock
1883                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1884                     // Return 0 or 0xf
1885                     &aml::Return::new(&aml::Local(0)),
1886                 ],
1887             )
1888             .to_aml_bytes(sink);
1889 
1890             let mut cpu_notifies = Vec::new();
1891             for cpu_id in 0..self.max_vcpus {
1892                 cpu_notifies.push(CpuNotify { cpu_id });
1893             }
1894 
1895             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
1896             for cpu_id in 0..self.max_vcpus {
1897                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
1898             }
1899 
1900             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
1901 
1902             aml::Method::new(
1903                 "CEJ0".into(),
1904                 1,
1905                 true,
1906                 vec![
1907                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1908                     // Write CPU number (in first argument) to I/O port via field
1909                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1910                     // Set CEJ0 bit
1911                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
1912                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1913                 ],
1914             )
1915             .to_aml_bytes(sink);
1916 
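                 // CPU scan method: walk every possible vCPU, check the CINS
                 // (inserting) and CRMV (removing) bits exposed by the hotplug
                 // controller, and send the matching Notify to the CPU object.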
1917             aml::Method::new(
1918                 "CSCN".into(),
1919                 0,
1920                 true,
1921                 vec![
1922                     // Take lock defined above
1923                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1924                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1925                     &aml::While::new(
1926                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
1927                         vec![
1928                             // Write the CPU number (the loop counter in Local0) to the selection field
1929                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
1930                             // Check if CINS bit is set
1931                             &aml::If::new(
1932                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
1933                                 // Notify device if it is
1934                                 vec![
1935                                     &aml::MethodCall::new(
1936                                         "CTFY".into(),
1937                                         vec![&aml::Local(0), &aml::ONE],
1938                                     ),
1939                                     // Reset CINS bit
1940                                     &aml::Store::new(
1941                                         &aml::Path::new("\\_SB_.PRES.CINS"),
1942                                         &aml::ONE,
1943                                     ),
1944                                 ],
1945                             ),
1946                             // Check if CRMV bit is set
1947                             &aml::If::new(
1948                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
1949                                 // Notify device if it is (with the eject constant 0x3)
1950                                 vec![
1951                                     &aml::MethodCall::new(
1952                                         "CTFY".into(),
1953                                         vec![&aml::Local(0), &3u8],
1954                                     ),
1955                                     // Reset CRMV bit
1956                                     &aml::Store::new(
1957                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
1958                                         &aml::ONE,
1959                                     ),
1960                                 ],
1961                             ),
1962                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
1963                         ],
1964                     ),
1965                     // Release lock
1966                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1967                 ],
1968             )
1969             .to_aml_bytes(sink)
1970         } else {
1971             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
1972         }
1973     }
1974 }
1975 
1976 impl Aml for CpuManager {
1977     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1978         #[cfg(target_arch = "x86_64")]
1979         if let Some(acpi_address) = self.acpi_address {
1980             // CPU hotplug controller
1981             aml::Device::new(
1982                 "_SB_.PRES".into(),
1983                 vec![
1984                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
1985                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
1986                     // Mutex to protect concurrent access as we write to choose CPU and then read back status
1987                     &aml::Mutex::new("CPLK".into(), 0),
1988                     &aml::Name::new(
1989                         "_CRS".into(),
1990                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
1991                             aml::AddressSpaceCachable::NotCacheable,
1992                             true,
1993                             acpi_address.0,
1994                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
1995                             None,
1996                         )]),
1997                     ),
1998                     // OpRegion and Fields map MMIO range into individual field values
1999                     &aml::OpRegion::new(
2000                         "PRST".into(),
2001                         aml::OpRegionSpace::SystemMemory,
2002                         &(acpi_address.0 as usize),
2003                         &CPU_MANAGER_ACPI_SIZE,
2004                     ),
2005                     &aml::Field::new(
2006                         "PRST".into(),
2007                         aml::FieldAccessType::Byte,
2008                         aml::FieldLockRule::NoLock,
2009                         aml::FieldUpdateRule::WriteAsZeroes,
2010                         vec![
2011                             aml::FieldEntry::Reserved(32),
2012                             aml::FieldEntry::Named(*b"CPEN", 1),
2013                             aml::FieldEntry::Named(*b"CINS", 1),
2014                             aml::FieldEntry::Named(*b"CRMV", 1),
2015                             aml::FieldEntry::Named(*b"CEJ0", 1),
2016                             aml::FieldEntry::Reserved(4),
2017                             aml::FieldEntry::Named(*b"CCMD", 8),
2018                         ],
2019                     ),
2020                     &aml::Field::new(
2021                         "PRST".into(),
2022                         aml::FieldAccessType::DWord,
2023                         aml::FieldLockRule::NoLock,
2024                         aml::FieldUpdateRule::Preserve,
2025                         vec![
2026                             aml::FieldEntry::Named(*b"CSEL", 32),
2027                             aml::FieldEntry::Reserved(32),
2028                             aml::FieldEntry::Named(*b"CDAT", 32),
2029                         ],
2030                     ),
2031                 ],
2032             )
2033             .to_aml_bytes(sink);
2034         }
2035 
2036         // CPU devices
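             // "ACPI0010" is the Processor Container Device _HID; all the Cxxx CPU
             // objects and the CSTA/CEJ0/CSCN/CTFY helper methods are grouped under
             // the \_SB.CPUS container defined at the end of this function.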
2037         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2038         let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2039         // Bundle methods together under a common object
2040         let methods = CpuMethods {
2041             max_vcpus: self.config.max_vcpus,
2042             dynamic: self.dynamic,
2043         };
2044         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];
2045 
2046         let mut cpu_devices = Vec::new();
2047         for cpu_id in 0..self.config.max_vcpus {
2048             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2049             let cpu_device = Cpu {
2050                 cpu_id,
2051                 proximity_domain,
2052                 dynamic: self.dynamic,
2053             };
2054 
2055             cpu_devices.push(cpu_device);
2056         }
2057 
2058         for cpu_device in cpu_devices.iter() {
2059             cpu_data_inner.push(cpu_device);
2060         }
2061 
2062         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2063     }
2064 }
2065 
2066 impl Pausable for CpuManager {
2067     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2068         // Tell the vCPUs to pause themselves next time they exit
2069         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2070 
2071         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2072         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2073         // above.
2074         for state in self.vcpu_states.iter() {
2075             state.signal_thread();
2076         }
2077 
2078         for vcpu in self.vcpus.iter() {
2079             let mut vcpu = vcpu.lock().unwrap();
2080             vcpu.pause()?;
2081             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2082             if !self.config.kvm_hyperv {
2083                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2084                     MigratableError::Pause(anyhow!(
2085                         "Could not notify guest it has been paused {:?}",
2086                         e
2087                     ))
2088                 })?;
2089             }
2090         }
2091 
2092         Ok(())
2093     }
2094 
2095     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2096         for vcpu in self.vcpus.iter() {
2097             vcpu.lock().unwrap().resume()?;
2098         }
2099 
2100         // Toggle the vCPUs pause boolean
2101         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2102 
2103         // Unpark all the VCPU threads.
2104         // Once unparked, the next thing they will do is check the pause
2105         // boolean. Since it is now false, they will exit their pause loop
2106         // and go back to running the guest.
2107         for state in self.vcpu_states.iter() {
2108             state.unpark_thread();
2109         }
2110         Ok(())
2111     }
2112 }
2113 
2114 impl Snapshottable for CpuManager {
2115     fn id(&self) -> String {
2116         CPU_MANAGER_SNAPSHOT_ID.to_string()
2117     }
2118 
2119     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2120         let mut cpu_manager_snapshot = Snapshot::default();
2121 
2122         // The CpuManager snapshot is a collection of all the vCPU snapshots.
2123         for vcpu in &self.vcpus {
2124             let mut vcpu = vcpu.lock().unwrap();
2125             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2126         }
2127 
2128         Ok(cpu_manager_snapshot)
2129     }
2130 }
2131 
2132 impl Transportable for CpuManager {}
2133 impl Migratable for CpuManager {}
2134 
2135 #[cfg(feature = "guest_debug")]
2136 impl Debuggable for CpuManager {
2137     #[cfg(feature = "kvm")]
2138     fn set_guest_debug(
2139         &self,
2140         cpu_id: usize,
2141         addrs: &[GuestAddress],
2142         singlestep: bool,
2143     ) -> std::result::Result<(), DebuggableError> {
2144         self.vcpus[cpu_id]
2145             .lock()
2146             .unwrap()
2147             .vcpu
2148             .set_guest_debug(addrs, singlestep)
2149             .map_err(DebuggableError::SetDebug)
2150     }
2151 
2152     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2153         Ok(())
2154     }
2155 
2156     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2157         Ok(())
2158     }
2159 
2160     #[cfg(target_arch = "x86_64")]
2161     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2162         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2163         let gregs = self
2164             .get_regs(cpu_id as u8)
2165             .map_err(DebuggableError::ReadRegs)?;
2166         let regs = [
2167             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
2168             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
2169         ];
2170 
2171         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2172         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2173         let eflags = gregs.rflags as u32;
2174         let rip = gregs.rip;
2175 
2176         // Segment registers: CS, SS, DS, ES, FS, GS
2177         let sregs = self
2178             .get_sregs(cpu_id as u8)
2179             .map_err(DebuggableError::ReadRegs)?;
2180         let segments = X86SegmentRegs {
2181             cs: sregs.cs.selector as u32,
2182             ss: sregs.ss.selector as u32,
2183             ds: sregs.ds.selector as u32,
2184             es: sregs.es.selector as u32,
2185             fs: sregs.fs.selector as u32,
2186             gs: sregs.gs.selector as u32,
2187         };
2188 
2189         // TODO: Add other registers
2190 
2191         Ok(CoreRegs {
2192             regs,
2193             eflags,
2194             rip,
2195             segments,
2196             ..Default::default()
2197         })
2198     }
2199 
2200     #[cfg(target_arch = "aarch64")]
2201     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2202         let gregs = self
2203             .get_regs(cpu_id as u8)
2204             .map_err(DebuggableError::ReadRegs)?;
2205         Ok(CoreRegs {
2206             x: gregs.regs.regs,
2207             sp: gregs.regs.sp,
2208             pc: gregs.regs.pc,
2209             ..Default::default()
2210         })
2211     }
2212 
2213     #[cfg(target_arch = "x86_64")]
2214     fn write_regs(
2215         &self,
2216         cpu_id: usize,
2217         regs: &CoreRegs,
2218     ) -> std::result::Result<(), DebuggableError> {
2219         let orig_gregs = self
2220             .get_regs(cpu_id as u8)
2221             .map_err(DebuggableError::ReadRegs)?;
2222         let gregs = StandardRegisters {
2223             rax: regs.regs[0],
2224             rbx: regs.regs[1],
2225             rcx: regs.regs[2],
2226             rdx: regs.regs[3],
2227             rsi: regs.regs[4],
2228             rdi: regs.regs[5],
2229             rbp: regs.regs[6],
2230             rsp: regs.regs[7],
2231             r8: regs.regs[8],
2232             r9: regs.regs[9],
2233             r10: regs.regs[10],
2234             r11: regs.regs[11],
2235             r12: regs.regs[12],
2236             r13: regs.regs[13],
2237             r14: regs.regs[14],
2238             r15: regs.regs[15],
2239             rip: regs.rip,
2240             // Only update the lower 32 bits of rflags.
2241             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2242         };
2243 
2244         self.set_regs(cpu_id as u8, &gregs)
2245             .map_err(DebuggableError::WriteRegs)?;
2246 
2247         // Segment registers: CS, SS, DS, ES, FS, GS
2248         // Since GDB only cares about the selectors, we call get_sregs() first.
2249         let mut sregs = self
2250             .get_sregs(cpu_id as u8)
2251             .map_err(DebuggableError::ReadRegs)?;
2252         sregs.cs.selector = regs.segments.cs as u16;
2253         sregs.ss.selector = regs.segments.ss as u16;
2254         sregs.ds.selector = regs.segments.ds as u16;
2255         sregs.es.selector = regs.segments.es as u16;
2256         sregs.fs.selector = regs.segments.fs as u16;
2257         sregs.gs.selector = regs.segments.gs as u16;
2258 
2259         self.set_sregs(cpu_id as u8, &sregs)
2260             .map_err(DebuggableError::WriteRegs)?;
2261 
2262         // TODO: Add other registers
2263 
2264         Ok(())
2265     }
2266 
2267     #[cfg(target_arch = "aarch64")]
2268     fn write_regs(
2269         &self,
2270         cpu_id: usize,
2271         regs: &CoreRegs,
2272     ) -> std::result::Result<(), DebuggableError> {
2273         let mut gregs = self
2274             .get_regs(cpu_id as u8)
2275             .map_err(DebuggableError::ReadRegs)?;
2276 
2277         gregs.regs.regs = regs.x;
2278         gregs.regs.sp = regs.sp;
2279         gregs.regs.pc = regs.pc;
2280 
2281         self.set_regs(cpu_id as u8, &gregs)
2282             .map_err(DebuggableError::WriteRegs)?;
2283 
2284         Ok(())
2285     }
2286 
2287     fn read_mem(
2288         &self,
2289         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2290         cpu_id: usize,
2291         vaddr: GuestAddress,
2292         len: usize,
2293     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2294         let mut buf = vec![0; len];
2295         let mut total_read = 0_u64;
2296 
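             // Translate and read one page at a time: virtually contiguous
             // addresses are not guaranteed to be physically contiguous, so each
             // read is capped at the next page boundary.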
2297         while total_read < len as u64 {
2298             let gaddr = vaddr.0 + total_read;
2299             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2300                 Ok(paddr) => paddr,
2301                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2302                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2303             };
2304             let psize = arch::PAGE_SIZE as u64;
2305             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2306             guest_memory
2307                 .memory()
2308                 .read(
2309                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2310                     GuestAddress(paddr),
2311                 )
2312                 .map_err(DebuggableError::ReadMem)?;
2313             total_read += read_len;
2314         }
2315         Ok(buf)
2316     }
2317 
2318     fn write_mem(
2319         &self,
2320         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2321         cpu_id: usize,
2322         vaddr: &GuestAddress,
2323         data: &[u8],
2324     ) -> std::result::Result<(), DebuggableError> {
2325         let mut total_written = 0_u64;
2326 
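             // Same page-by-page approach as read_mem() above, but for writes.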
2327         while total_written < data.len() as u64 {
2328             let gaddr = vaddr.0 + total_written;
2329             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2330                 Ok(paddr) => paddr,
2331                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2332                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2333             };
2334             let psize = arch::PAGE_SIZE as u64;
2335             let write_len = std::cmp::min(
2336                 data.len() as u64 - total_written,
2337                 psize - (paddr & (psize - 1)),
2338             );
2339             guest_memory
2340                 .memory()
2341                 .write(
2342                     &data[total_written as usize..total_written as usize + write_len as usize],
2343                     GuestAddress(paddr),
2344                 )
2345                 .map_err(DebuggableError::WriteMem)?;
2346             total_written += write_len;
2347         }
2348         Ok(())
2349     }
2350 
2351     fn active_vcpus(&self) -> usize {
2352         self.present_vcpus() as usize
2353     }
2354 }
2355 
2356 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2357 impl Elf64Writable for CpuManager {}
2358 
2359 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2360 impl CpuElf64Writable for CpuManager {
2361     fn cpu_write_elf64_note(
2362         &mut self,
2363         dump_state: &DumpState,
2364     ) -> std::result::Result<(), GuestDebuggableError> {
2365         let mut coredump_file = dump_state.file.as_ref().unwrap();
2366         for vcpu in &self.vcpus {
2367             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2368             let mut pos: usize = 0;
2369             let mut buf = vec![0; note_size as usize];
2370             let descsz = size_of::<X86_64ElfPrStatus>();
2371             let vcpu_id = vcpu.lock().unwrap().id;
2372 
2373             let note = Elf64_Nhdr {
2374                 n_namesz: COREDUMP_NAME_SIZE,
2375                 n_descsz: descsz as u32,
2376                 n_type: NT_PRSTATUS,
2377             };
2378 
2379             let bytes: &[u8] = note.as_slice();
2380             buf.splice(0.., bytes.to_vec());
2381             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2382             buf.resize(pos + 4, 0);
2383             buf.splice(pos.., "CORE".to_string().into_bytes());
2384 
2385             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2386             buf.resize(pos + 32 + 4, 0);
2387             let pid = vcpu_id as u64;
2388             let bytes: &[u8] = pid.as_slice();
2389             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2390 
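                 // Advance to where the register dump lives: it sits near the end
                 // of the prstatus descriptor, with one trailing u64 after it.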
2391             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2392 
2393             let orig_rax: u64 = 0;
2394             let gregs = self.vcpus[usize::from(vcpu_id)]
2395                 .lock()
2396                 .unwrap()
2397                 .vcpu
2398                 .get_regs()
2399                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2400 
2401             let regs1 = [
2402                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2403                 gregs.r10,
2404             ];
2405             let regs2 = [
2406                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2407             ];
2408 
2409             let sregs = self.vcpus[usize::from(vcpu_id)]
2410                 .lock()
2411                 .unwrap()
2412                 .vcpu
2413                 .get_sregs()
2414                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2415 
2416             debug!(
2417                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2418                 gregs.rip,
2419                 gregs.rsp,
2420                 sregs.gs.base,
2421                 sregs.cs.selector,
2422                 sregs.ss.selector,
2423                 sregs.ds.selector,
2424             );
2425 
2426             let regs = X86_64UserRegs {
2427                 regs1,
2428                 regs2,
2429                 rip: gregs.rip,
2430                 cs: sregs.cs.selector as u64,
2431                 eflags: gregs.rflags,
2432                 rsp: gregs.rsp,
2433                 ss: sregs.ss.selector as u64,
2434                 fs_base: sregs.fs.base,
2435                 gs_base: sregs.gs.base,
2436                 ds: sregs.ds.selector as u64,
2437                 es: sregs.es.selector as u64,
2438                 fs: sregs.fs.selector as u64,
2439                 gs: sregs.gs.selector as u64,
2440             };
2441 
2442             // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
2443             let bytes: &[u8] = regs.as_slice();
2444             buf.resize(note_size as usize, 0);
2445             buf.splice(pos.., bytes.to_vec());
2446             buf.resize(note_size as usize, 0);
2447 
2448             coredump_file
2449                 .write(&buf)
2450                 .map_err(GuestDebuggableError::CoredumpFile)?;
2451         }
2452 
2453         Ok(())
2454     }
2455 
2456     fn cpu_write_vmm_note(
2457         &mut self,
2458         dump_state: &DumpState,
2459     ) -> std::result::Result<(), GuestDebuggableError> {
2460         let mut coredump_file = dump_state.file.as_ref().unwrap();
2461         for vcpu in &self.vcpus {
2462             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2463             let mut pos: usize = 0;
2464             let mut buf = vec![0; note_size as usize];
2465             let descsz = size_of::<DumpCpusState>();
2466             let vcpu_id = vcpu.lock().unwrap().id;
2467 
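                 // This VMM-specific note is written with the "QEMU" note name so
                 // that it matches the layout used by QEMU guest core dumps.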
2468             let note = Elf64_Nhdr {
2469                 n_namesz: COREDUMP_NAME_SIZE,
2470                 n_descsz: descsz as u32,
2471                 n_type: 0,
2472             };
2473 
2474             let bytes: &[u8] = note.as_slice();
2475             buf.splice(0.., bytes.to_vec());
2476             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2477 
2478             buf.resize(pos + 4, 0);
2479             buf.splice(pos.., "QEMU".to_string().into_bytes());
2480 
2481             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2482 
2483             let gregs = self.vcpus[usize::from(vcpu_id)]
2484                 .lock()
2485                 .unwrap()
2486                 .vcpu
2487                 .get_regs()
2488                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2489 
2490             let regs1 = [
2491                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2492                 gregs.rbp,
2493             ];
2494 
2495             let regs2 = [
2496                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2497                 gregs.r15,
2498             ];
2499 
2500             let sregs = self.vcpus[usize::from(vcpu_id)]
2501                 .lock()
2502                 .unwrap()
2503                 .vcpu
2504                 .get_sregs()
2505                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2506 
2507             let mut msrs = vec![MsrEntry {
2508                 index: msr_index::MSR_KERNEL_GS_BASE,
2509                 ..Default::default()
2510             }];
2511 
2512             self.vcpus[vcpu_id as usize]
2513                 .lock()
2514                 .unwrap()
2515                 .vcpu
2516                 .get_msrs(&mut msrs)
2517                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2518             let kernel_gs_base = msrs[0].data;
2519 
2520             let cs = CpuSegment::new(sregs.cs);
2521             let ds = CpuSegment::new(sregs.ds);
2522             let es = CpuSegment::new(sregs.es);
2523             let fs = CpuSegment::new(sregs.fs);
2524             let gs = CpuSegment::new(sregs.gs);
2525             let ss = CpuSegment::new(sregs.ss);
2526             let ldt = CpuSegment::new(sregs.ldt);
2527             let tr = CpuSegment::new(sregs.tr);
2528             let gdt = CpuSegment::new_from_table(sregs.gdt);
2529             let idt = CpuSegment::new_from_table(sregs.idt);
2530             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2531             let regs = DumpCpusState {
2532                 version: 1,
2533                 size: size_of::<DumpCpusState>() as u32,
2534                 regs1,
2535                 regs2,
2536                 rip: gregs.rip,
2537                 rflags: gregs.rflags,
2538                 cs,
2539                 ds,
2540                 es,
2541                 fs,
2542                 gs,
2543                 ss,
2544                 ldt,
2545                 tr,
2546                 gdt,
2547                 idt,
2548                 cr,
2549                 kernel_gs_base,
2550             };
2551 
2552             let bytes: &[u8] = regs.as_slice();
2553             buf.resize(note_size as usize, 0);
2554             buf.splice(pos.., bytes.to_vec());
2555             buf.resize(note_size as usize, 0);
2556 
2557             coredump_file
2558                 .write(&buf)
2559                 .map_err(GuestDebuggableError::CoredumpFile)?;
2560         }
2561 
2562         Ok(())
2563     }
2564 }
2565 
2566 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2567 #[cfg(test)]
2568 mod tests {
2569     use arch::x86_64::interrupts::*;
2570     use arch::x86_64::regs::*;
2571     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2572 
2573     #[test]
2574     fn test_setlint() {
2575         let hv = hypervisor::new().unwrap();
2576         let vm = hv.create_vm().expect("new VM fd creation failed");
2577         assert!(hv.check_required_extensions().is_ok());
2578         // Calling get_lapic will fail if there is no irqchip created beforehand.
2579         assert!(vm.create_irq_chip().is_ok());
2580         let vcpu = vm.create_vcpu(0, None).unwrap();
2581         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2582 
2583         // Compute the value that is expected to represent LVT0 and LVT1.
2584         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2585         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2586         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2587         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2588 
2589         set_lint(&vcpu).unwrap();
2590 
2591         // Compute the value that represents LVT0 and LVT1 after set_lint.
2592         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2593         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2594         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2595         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2596         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2597     }
2598 
2599     #[test]
2600     fn test_setup_fpu() {
2601         let hv = hypervisor::new().unwrap();
2602         let vm = hv.create_vm().expect("new VM fd creation failed");
2603         let vcpu = vm.create_vcpu(0, None).unwrap();
2604         setup_fpu(&vcpu).unwrap();
2605 
2606         let expected_fpu: FpuState = FpuState {
2607             fcw: 0x37f,
2608             mxcsr: 0x1f80,
2609             ..Default::default()
2610         };
2611         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2612         // TODO: auto-generate kvm related structures with PartialEq on.
2613         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2614         // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
2615         // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
2616         // The mxcsr will stay 0 and the assert below would fail, which is why it is
2617         // commented out. Decide whether we should remove it altogether.
2618         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2619     }
2620 
2621     #[test]
2622     fn test_setup_msrs() {
2623         use hypervisor::arch::x86::{msr_index, MsrEntry};
2624 
2625         let hv = hypervisor::new().unwrap();
2626         let vm = hv.create_vm().expect("new VM fd creation failed");
2627         let vcpu = vm.create_vcpu(0, None).unwrap();
2628         setup_msrs(&vcpu).unwrap();
2629 
2630         // This test will check against the last MSR entry configured (the tenth one).
2631         // See create_msr_entries for details.
2632         let mut msrs = vec![MsrEntry {
2633             index: msr_index::MSR_IA32_MISC_ENABLE,
2634             ..Default::default()
2635         }];
2636 
2637         // get_msrs returns the number of MSRs that it succeeded in reading. We only want to
2638         // read 1 in this test case.
2639         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2640         assert_eq!(read_msrs, 1);
2641 
2642         // Official entries that were set up when we called setup_msrs. We need to assert that
2643         // the tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data
2644         // we expect.
2645         let entry_vec = vcpu.boot_msr_entries();
2646         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2647     }
2648 
2649     #[test]
2650     fn test_setup_regs() {
2651         let hv = hypervisor::new().unwrap();
2652         let vm = hv.create_vm().expect("new VM fd creation failed");
2653         let vcpu = vm.create_vcpu(0, None).unwrap();
2654 
2655         let expected_regs: StandardRegisters = StandardRegisters {
2656             rflags: 0x0000000000000002u64,
2657             rbx: arch::layout::PVH_INFO_START.0,
2658             rip: 1,
2659             ..Default::default()
2660         };
2661 
2662         setup_regs(&vcpu, expected_regs.rip).unwrap();
2663 
2664         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2665         assert_eq!(actual_regs, expected_regs);
2666     }
2667 }
2668 
2669 #[cfg(target_arch = "aarch64")]
2670 #[cfg(test)]
2671 mod tests {
2672     use arch::{aarch64::regs, layout};
2673     use hypervisor::kvm::aarch64::is_system_register;
2674     use hypervisor::kvm::kvm_bindings::{
2675         kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2676         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2677     };
2678     use hypervisor::{arm64_core_reg_id, offset_of};
2679     use std::mem;
2680 
2681     #[test]
2682     fn test_setup_regs() {
2683         let hv = hypervisor::new().unwrap();
2684         let vm = hv.create_vm().unwrap();
2685         let vcpu = vm.create_vcpu(0, None).unwrap();
2686 
2687         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2688         // Must fail when vcpu is not initialized yet.
2689         assert!(res.is_err());
2690 
2691         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2692         vm.get_preferred_target(&mut kvi).unwrap();
2693         vcpu.vcpu_init(&kvi).unwrap();
2694 
2695         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2696     }
2697 
2698     #[test]
2699     fn test_read_mpidr() {
2700         let hv = hypervisor::new().unwrap();
2701         let vm = hv.create_vm().unwrap();
2702         let vcpu = vm.create_vcpu(0, None).unwrap();
2703         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2704         vm.get_preferred_target(&mut kvi).unwrap();
2705 
2706         // Must fail when vcpu is not initialized yet.
2707         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2708 
2709         vcpu.vcpu_init(&kvi).unwrap();
2710         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2711     }
2712 
2713     #[test]
2714     fn test_is_system_register() {
2715         let offset = offset_of!(user_pt_regs, pc);
2716         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2717         assert!(!is_system_register(regid));
2718         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2719         assert!(is_system_register(regid));
2720     }
2721 
2722     #[test]
2723     fn test_save_restore_core_regs() {
2724         let hv = hypervisor::new().unwrap();
2725         let vm = hv.create_vm().unwrap();
2726         let vcpu = vm.create_vcpu(0, None).unwrap();
2727         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2728         vm.get_preferred_target(&mut kvi).unwrap();
2729 
2730         // Must fail when vcpu is not initialized yet.
2731         let res = vcpu.get_regs();
2732         assert!(res.is_err());
2733         assert_eq!(
2734             format!("{}", res.unwrap_err()),
2735             "Failed to get core register: Exec format error (os error 8)"
2736         );
2737 
2738         let mut state = kvm_regs::default();
2739         let res = vcpu.set_regs(&state);
2740         assert!(res.is_err());
2741         assert_eq!(
2742             format!("{}", res.unwrap_err()),
2743             "Failed to set core register: Exec format error (os error 8)"
2744         );
2745 
2746         vcpu.vcpu_init(&kvi).unwrap();
2747         let res = vcpu.get_regs();
2748         assert!(res.is_ok());
2749         state = res.unwrap();
2750         assert_eq!(state.regs.pstate, 0x3C5);
2751 
2752         assert!(vcpu.set_regs(&state).is_ok());
2753     }
2754 
2755     #[test]
2756     fn test_get_set_mpstate() {
2757         let hv = hypervisor::new().unwrap();
2758         let vm = hv.create_vm().unwrap();
2759         let vcpu = vm.create_vcpu(0, None).unwrap();
2760         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2761         vm.get_preferred_target(&mut kvi).unwrap();
2762 
2763         let res = vcpu.get_mp_state();
2764         assert!(res.is_ok());
2765         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2766     }
2767 }
2768