xref: /cloud-hypervisor/vmm/src/cpu.rs (revision fa7a000dbe9637eb256af18ae8c3c4a8d5bf9c8f)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::CpusConfig;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use crate::coredump::{
17     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
18     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
19     NT_PRSTATUS,
20 };
21 #[cfg(feature = "guest_debug")]
22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
23 #[cfg(target_arch = "x86_64")]
24 use crate::memory_manager::MemoryManager;
25 use crate::seccomp_filters::{get_seccomp_filter, Thread};
26 #[cfg(target_arch = "x86_64")]
27 use crate::vm::physical_bits;
28 use crate::GuestMemoryMmap;
29 use crate::CPU_MANAGER_SNAPSHOT_ID;
30 use acpi_tables::{aml, sdt::Sdt, Aml};
31 use anyhow::anyhow;
32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
33 use arch::aarch64::regs;
34 #[cfg(target_arch = "x86_64")]
35 use arch::x86_64::get_x2apic_id;
36 use arch::EntryPoint;
37 use arch::NumaNodes;
38 #[cfg(target_arch = "aarch64")]
39 use devices::gic::Gic;
40 use devices::interrupt_controller::InterruptController;
41 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
42 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
44 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
45 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
46 use hypervisor::aarch64::StandardRegisters;
47 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
48 use hypervisor::arch::x86::msr_index;
49 #[cfg(target_arch = "x86_64")]
50 use hypervisor::arch::x86::CpuIdEntry;
51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
52 use hypervisor::arch::x86::MsrEntry;
53 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
54 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
55 #[cfg(target_arch = "aarch64")]
56 use hypervisor::kvm::kvm_bindings;
57 #[cfg(all(target_arch = "aarch64", feature = "kvm"))]
58 use hypervisor::kvm::kvm_ioctls::Cap;
59 #[cfg(feature = "tdx")]
60 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
61 #[cfg(target_arch = "x86_64")]
62 use hypervisor::CpuVendor;
63 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
64 use libc::{c_void, siginfo_t};
65 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
66 use linux_loader::elf::Elf64_Nhdr;
67 use seccompiler::{apply_filter, SeccompAction};
68 use std::collections::BTreeMap;
69 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
70 use std::io::Write;
71 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
72 use std::mem::size_of;
73 use std::os::unix::thread::JoinHandleExt;
74 use std::sync::atomic::{AtomicBool, Ordering};
75 use std::sync::{Arc, Barrier, Mutex};
76 use std::{cmp, io, result, thread};
77 use thiserror::Error;
78 use tracer::trace_scoped;
79 use vm_device::BusDevice;
80 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
81 use vm_memory::ByteValued;
82 #[cfg(feature = "guest_debug")]
83 use vm_memory::{Bytes, GuestAddressSpace};
84 use vm_memory::{GuestAddress, GuestMemoryAtomic};
85 use vm_migration::{
86     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
87     Transportable,
88 };
89 use vmm_sys_util::eventfd::EventFd;
90 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
91 use zerocopy::AsBytes;
92 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
93 /// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
/// the following expression should return 3 (`0b11`):
96 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
97 ///
98 macro_rules! extract_bits_64 {
99     ($value: tt, $offset: tt, $length: tt) => {
100         ($value >> $offset) & (!0u64 >> (64 - $length))
101     };
102 }
103 
104 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
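/// Extract the lowest `length` bits of a 64-bit integer. For example,
/// `extract_bits_64_without_offset!(0b0110u64, 2)` should return 2 (`0b10`).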
105 macro_rules! extract_bits_64_without_offset {
106     ($value: tt, $length: tt) => {
107         $value & (!0u64 >> (64 - $length))
108     };
109 }
110 
111 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
112 
113 #[derive(Debug, Error)]
114 pub enum Error {
115     #[error("Error creating vCPU: {0}")]
116     VcpuCreate(#[source] anyhow::Error),
117 
118     #[error("Error running bCPU: {0}")]
119     VcpuRun(#[source] anyhow::Error),
120 
121     #[error("Error spawning vCPU thread: {0}")]
122     VcpuSpawn(#[source] io::Error),
123 
124     #[error("Error generating common CPUID: {0}")]
125     CommonCpuId(#[source] arch::Error),
126 
127     #[error("Error configuring vCPU: {0}")]
128     VcpuConfiguration(#[source] arch::Error),
129 
130     #[error("Still pending removed vcpu")]
131     VcpuPendingRemovedVcpu,
132 
133     #[cfg(target_arch = "aarch64")]
134     #[error("Error fetching preferred target: {0}")]
135     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
136 
137     #[cfg(target_arch = "aarch64")]
138     #[error("Error initialising vCPU: {0}")]
139     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
140 
141     #[error("Failed to join on vCPU threads: {0:?}")]
142     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
143 
144     #[error("Error adding CpuManager to MMIO bus: {0}")]
145     BusError(#[source] vm_device::BusError),
146 
147     #[error("Requested vCPUs exceed maximum")]
148     DesiredVCpuCountExceedsMax,
149 
150     #[error("Cannot create seccomp filter: {0}")]
151     CreateSeccompFilter(#[source] seccompiler::Error),
152 
153     #[error("Cannot apply seccomp filter: {0}")]
154     ApplySeccompFilter(#[source] seccompiler::Error),
155 
156     #[error("Error starting vCPU after restore: {0}")]
157     StartRestoreVcpu(#[source] anyhow::Error),
158 
159     #[error("Unexpected VmExit")]
160     UnexpectedVmExit,
161 
162     #[error("Failed to allocate MMIO address for CpuManager")]
163     AllocateMmmioAddress,
164 
165     #[cfg(feature = "tdx")]
166     #[error("Error initializing TDX: {0}")]
167     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
168 
169     #[cfg(target_arch = "aarch64")]
170     #[error("Error initializing PMU: {0}")]
171     InitPmu(#[source] hypervisor::HypervisorCpuError),
172 
173     #[cfg(feature = "guest_debug")]
174     #[error("Error during CPU debug: {0}")]
175     CpuDebug(#[source] hypervisor::HypervisorCpuError),
176 
177     #[cfg(feature = "guest_debug")]
178     #[error("Error translating virtual address: {0}")]
179     TranslateVirtualAddress(#[source] anyhow::Error),
180 
181     #[cfg(target_arch = "x86_64")]
182     #[error("Error setting up AMX: {0}")]
183     AmxEnable(#[source] anyhow::Error),
184 
185     #[error("Maximum number of vCPUs exceeds host limit")]
186     MaximumVcpusExceeded,
187 
188     #[cfg(feature = "sev_snp")]
189     #[error("Failed to set sev control register: {0}")]
190     SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),
191 
192     #[cfg(target_arch = "x86_64")]
193     #[error("Failed to inject NMI")]
194     NmiError(hypervisor::HypervisorCpuError),
195 }
196 pub type Result<T> = result::Result<T, Error>;
197 
198 #[cfg(target_arch = "x86_64")]
199 #[allow(dead_code)]
200 #[repr(packed)]
201 #[derive(AsBytes)]
202 struct LocalX2Apic {
203     pub r#type: u8,
204     pub length: u8,
205     pub _reserved: u16,
206     pub apic_id: u32,
207     pub flags: u32,
208     pub processor_id: u32,
209 }
210 
211 #[allow(dead_code)]
212 #[repr(packed)]
213 #[derive(Default, AsBytes)]
214 struct Ioapic {
215     pub r#type: u8,
216     pub length: u8,
217     pub ioapic_id: u8,
218     _reserved: u8,
219     pub apic_address: u32,
220     pub gsi_base: u32,
221 }
222 
223 #[cfg(target_arch = "aarch64")]
224 #[allow(dead_code)]
225 #[repr(packed)]
226 #[derive(AsBytes)]
227 struct GicC {
228     pub r#type: u8,
229     pub length: u8,
230     pub reserved0: u16,
231     pub cpu_interface_number: u32,
232     pub uid: u32,
233     pub flags: u32,
234     pub parking_version: u32,
235     pub performance_interrupt: u32,
236     pub parked_address: u64,
237     pub base_address: u64,
238     pub gicv_base_address: u64,
239     pub gich_base_address: u64,
240     pub vgic_interrupt: u32,
241     pub gicr_base_address: u64,
242     pub mpidr: u64,
243     pub proc_power_effi_class: u8,
244     pub reserved1: u8,
245     pub spe_overflow_interrupt: u16,
246 }
247 
248 #[cfg(target_arch = "aarch64")]
249 #[allow(dead_code)]
250 #[repr(packed)]
251 #[derive(AsBytes)]
252 struct GicD {
253     pub r#type: u8,
254     pub length: u8,
255     pub reserved0: u16,
256     pub gic_id: u32,
257     pub base_address: u64,
258     pub global_irq_base: u32,
259     pub version: u8,
260     pub reserved1: [u8; 3],
261 }
262 
263 #[cfg(target_arch = "aarch64")]
264 #[allow(dead_code)]
265 #[repr(packed)]
266 #[derive(AsBytes)]
267 struct GicR {
268     pub r#type: u8,
269     pub length: u8,
270     pub reserved: u16,
271     pub base_address: u64,
272     pub range_length: u32,
273 }
274 
275 #[cfg(target_arch = "aarch64")]
276 #[allow(dead_code)]
277 #[repr(packed)]
278 #[derive(AsBytes)]
279 struct GicIts {
280     pub r#type: u8,
281     pub length: u8,
282     pub reserved0: u16,
283     pub translation_id: u32,
284     pub base_address: u64,
285     pub reserved1: u32,
286 }
287 
288 #[cfg(target_arch = "aarch64")]
289 #[allow(dead_code)]
290 #[repr(packed)]
291 #[derive(AsBytes)]
292 struct ProcessorHierarchyNode {
293     pub r#type: u8,
294     pub length: u8,
295     pub reserved: u16,
296     pub flags: u32,
297     pub parent: u32,
298     pub acpi_processor_id: u32,
299     pub num_private_resources: u32,
300 }
301 
302 #[allow(dead_code)]
303 #[repr(packed)]
304 #[derive(Default, AsBytes)]
305 struct InterruptSourceOverride {
306     pub r#type: u8,
307     pub length: u8,
308     pub bus: u8,
309     pub source: u8,
310     pub gsi: u32,
311     pub flags: u16,
312 }
313 
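// Round `$n` up to the next multiple of `$d`, e.g. round_up!(9, 4) == 12.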
314 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
315 macro_rules! round_up {
316     ($n:expr,$d:expr) => {
        (($n + $d - 1) / $d) * $d
318     };
319 }
320 
/// A wrapper around creating and using a hypervisor-backed vCPU.
322 pub struct Vcpu {
323     // The hypervisor abstracted CPU.
324     vcpu: Arc<dyn hypervisor::Vcpu>,
325     id: u8,
326     #[cfg(target_arch = "aarch64")]
327     mpidr: u64,
328     saved_state: Option<CpuState>,
329     #[cfg(target_arch = "x86_64")]
330     vendor: CpuVendor,
331 }
332 
333 impl Vcpu {
334     /// Constructs a new VCPU for `vm`.
335     ///
336     /// # Arguments
337     ///
338     /// * `id` - Represents the CPU number between [0, max vcpus).
339     /// * `vm` - The virtual machine this vcpu will get attached to.
340     /// * `vm_ops` - Optional object for exit handling.
341     /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
342     pub fn new(
343         id: u8,
344         apic_id: u8,
345         vm: &Arc<dyn hypervisor::Vm>,
346         vm_ops: Option<Arc<dyn VmOps>>,
347         #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
348     ) -> Result<Self> {
349         let vcpu = vm
350             .create_vcpu(apic_id, vm_ops)
351             .map_err(|e| Error::VcpuCreate(e.into()))?;
352         // Initially the cpuid per vCPU is the one supported by this VM.
353         Ok(Vcpu {
354             vcpu,
355             id,
356             #[cfg(target_arch = "aarch64")]
357             mpidr: 0,
358             saved_state: None,
359             #[cfg(target_arch = "x86_64")]
360             vendor: cpu_vendor,
361         })
362     }
363 
    /// Configures a vCPU; should be called once per vCPU when it is created.
365     ///
366     /// # Arguments
367     ///
    /// * `boot_setup` - Optional kernel entry point (and boot protocol) plus guest memory.
370     /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
371     pub fn configure(
372         &mut self,
373         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
374         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
375         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
376         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
377         #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
378     ) -> Result<()> {
379         #[cfg(target_arch = "aarch64")]
380         {
381             self.init(vm)?;
382             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
383                 .map_err(Error::VcpuConfiguration)?;
384         }
385         info!("Configuring vCPU: cpu_id = {}", self.id);
386         #[cfg(target_arch = "x86_64")]
387         arch::configure_vcpu(
388             &self.vcpu,
389             self.id,
390             boot_setup,
391             cpuid,
392             kvm_hyperv,
393             self.vendor,
394             topology,
395         )
396         .map_err(Error::VcpuConfiguration)?;
397 
398         Ok(())
399     }
400 
401     /// Gets the MPIDR register value.
402     #[cfg(target_arch = "aarch64")]
403     pub fn get_mpidr(&self) -> u64 {
404         self.mpidr
405     }
406 
407     /// Gets the saved vCPU state.
408     #[cfg(target_arch = "aarch64")]
409     pub fn get_saved_state(&self) -> Option<CpuState> {
410         self.saved_state.clone()
411     }
412 
413     /// Initializes an aarch64 specific vcpu for booting Linux.
414     #[cfg(target_arch = "aarch64")]
415     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
416         let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
417 
418         // This reads back the kernel's preferred target type.
419         vm.get_preferred_target(&mut kvi)
420             .map_err(Error::VcpuArmPreferredTarget)?;
421         // We already checked that the capability is supported.
422         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
423         if vm
424             .as_any()
425             .downcast_ref::<hypervisor::kvm::KvmVm>()
426             .unwrap()
427             .check_extension(Cap::ArmPmuV3)
428         {
429             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
430         }
431         // Non-boot cpus are powered off initially.
432         if self.id > 0 {
433             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
434         }
435         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
436     }
437 
438     /// Runs the VCPU until it exits, returning the reason.
439     ///
    /// Note that the state of the VCPU and associated VM must be set up first for this to do
441     /// anything useful.
442     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
443         self.vcpu.run()
444     }
445 
446     #[cfg(feature = "sev_snp")]
447     pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
448         self.vcpu
449             .set_sev_control_register(vmsa_pfn)
450             .map_err(Error::SetSevControlRegister)
451     }
452 }
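
// A minimal usage sketch (assuming an existing `vm: Arc<dyn hypervisor::Vm>`,
// plus `cpu_vendor`, `cpuid`, `topology` and an optional `boot_setup` prepared
// by the caller, as on the x86_64 path):
//
//     let mut vcpu = Vcpu::new(0, 0, &vm, None, cpu_vendor)?;
//     vcpu.configure(boot_setup, cpuid, false, topology)?;
//     let _exit = vcpu.run()?;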
453 
454 impl Pausable for Vcpu {}
455 impl Snapshottable for Vcpu {
456     fn id(&self) -> String {
457         self.id.to_string()
458     }
459 
460     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
461         let saved_state = self
462             .vcpu
463             .state()
464             .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;
465 
466         self.saved_state = Some(saved_state.clone());
467 
468         Ok(Snapshot::from_data(SnapshotData::new_from_state(
469             &saved_state,
470         )?))
471     }
472 }
473 
474 pub struct CpuManager {
475     config: CpusConfig,
476     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
477     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
478     #[cfg(target_arch = "x86_64")]
479     cpuid: Vec<CpuIdEntry>,
480     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
481     vm: Arc<dyn hypervisor::Vm>,
482     vcpus_kill_signalled: Arc<AtomicBool>,
483     vcpus_pause_signalled: Arc<AtomicBool>,
484     vcpus_kick_signalled: Arc<AtomicBool>,
485     exit_evt: EventFd,
486     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
487     reset_evt: EventFd,
488     #[cfg(feature = "guest_debug")]
489     vm_debug_evt: EventFd,
490     vcpu_states: Vec<VcpuState>,
491     selected_cpu: u8,
492     vcpus: Vec<Arc<Mutex<Vcpu>>>,
493     seccomp_action: SeccompAction,
494     vm_ops: Arc<dyn VmOps>,
495     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
496     acpi_address: Option<GuestAddress>,
497     proximity_domain_per_cpu: BTreeMap<u8, u32>,
498     affinity: BTreeMap<u8, Vec<usize>>,
499     dynamic: bool,
500     hypervisor: Arc<dyn hypervisor::Hypervisor>,
501     #[cfg(feature = "sev_snp")]
502     sev_snp_enabled: bool,
503 }
504 
505 const CPU_ENABLE_FLAG: usize = 0;
506 const CPU_INSERTING_FLAG: usize = 1;
507 const CPU_REMOVING_FLAG: usize = 2;
508 const CPU_EJECT_FLAG: usize = 3;
509 
510 const CPU_STATUS_OFFSET: u64 = 4;
511 const CPU_SELECTION_OFFSET: u64 = 0;
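
// A sketch of the register layout backing the ACPI CPU hotplug handshake
// (CPU_MANAGER_ACPI_SIZE bytes in total):
//   offset 0: CPU selection register; the guest writes a vCPU id here.
//   offset 4: status/control byte for the selected vCPU, using the
//             CPU_*_FLAG bit positions above (read to poll state, write
//             1s to acknowledge insertion/removal or request ejection).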
512 
513 impl BusDevice for CpuManager {
514     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
515         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
516         data.fill(0);
517 
518         match offset {
519             CPU_SELECTION_OFFSET => {
520                 data[0] = self.selected_cpu;
521             }
522             CPU_STATUS_OFFSET => {
523                 if self.selected_cpu < self.max_vcpus() {
524                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
525                     if state.active() {
526                         data[0] |= 1 << CPU_ENABLE_FLAG;
527                     }
528                     if state.inserting {
529                         data[0] |= 1 << CPU_INSERTING_FLAG;
530                     }
531                     if state.removing {
532                         data[0] |= 1 << CPU_REMOVING_FLAG;
533                     }
534                 } else {
535                     warn!("Out of range vCPU id: {}", self.selected_cpu);
536                 }
537             }
538             _ => {
539                 warn!(
540                     "Unexpected offset for accessing CPU manager device: {:#}",
541                     offset
542                 );
543             }
544         }
545     }
546 
547     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
548         match offset {
549             CPU_SELECTION_OFFSET => {
550                 self.selected_cpu = data[0];
551             }
552             CPU_STATUS_OFFSET => {
553                 if self.selected_cpu < self.max_vcpus() {
554                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
555                     // The ACPI code writes back a 1 to acknowledge the insertion
556                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
557                         && state.inserting
558                     {
559                         state.inserting = false;
560                     }
561                     // Ditto for removal
562                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
563                         && state.removing
564                     {
565                         state.removing = false;
566                     }
567                     // Trigger removal of vCPU
568                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
569                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
570                             error!("Error removing vCPU: {:?}", e);
571                         }
572                     }
573                 } else {
574                     warn!("Out of range vCPU id: {}", self.selected_cpu);
575                 }
576             }
577             _ => {
578                 warn!(
579                     "Unexpected offset for accessing CPU manager device: {:#}",
580                     offset
581                 );
582             }
583         }
584         None
585     }
586 }
587 
588 #[derive(Default)]
589 struct VcpuState {
590     inserting: bool,
591     removing: bool,
592     pending_removal: Arc<AtomicBool>,
593     handle: Option<thread::JoinHandle<()>>,
594     kill: Arc<AtomicBool>,
595     vcpu_run_interrupted: Arc<AtomicBool>,
596     paused: Arc<AtomicBool>,
597 }
598 
599 impl VcpuState {
600     fn active(&self) -> bool {
601         self.handle.is_some()
602     }
603 
604     fn signal_thread(&self) {
605         if let Some(handle) = self.handle.as_ref() {
606             loop {
607                 // SAFETY: FFI call with correct arguments
608                 unsafe {
609                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
610                 }
611                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
612                     break;
613                 } else {
614                     // This is more effective than thread::yield_now() at
615                     // avoiding a priority inversion with the vCPU thread
616                     thread::sleep(std::time::Duration::from_millis(1));
617                 }
618             }
619         }
620     }
621 
622     fn join_thread(&mut self) -> Result<()> {
623         if let Some(handle) = self.handle.take() {
624             handle.join().map_err(Error::ThreadCleanup)?
625         }
626 
627         Ok(())
628     }
629 
630     fn unpark_thread(&self) {
631         if let Some(handle) = self.handle.as_ref() {
632             handle.thread().unpark()
633         }
634     }
635 }
636 
637 impl CpuManager {
638     #[allow(unused_variables)]
639     #[allow(clippy::too_many_arguments)]
640     pub fn new(
641         config: &CpusConfig,
642         vm: Arc<dyn hypervisor::Vm>,
643         exit_evt: EventFd,
644         reset_evt: EventFd,
645         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
646         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
647         seccomp_action: SeccompAction,
648         vm_ops: Arc<dyn VmOps>,
649         #[cfg(feature = "tdx")] tdx_enabled: bool,
650         numa_nodes: &NumaNodes,
651         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
652     ) -> Result<Arc<Mutex<CpuManager>>> {
653         if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
654             return Err(Error::MaximumVcpusExceeded);
655         }
656 
657         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
658         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
659         let hypervisor_type = hypervisor.hypervisor_type();
660         #[cfg(target_arch = "x86_64")]
661         let cpu_vendor = hypervisor.get_cpu_vendor();
662 
663         #[cfg(target_arch = "x86_64")]
664         if config.features.amx {
665             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
666             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
667             const XFEATURE_XTILEDATA: usize = 18;
668             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
669 
670             // SAFETY: the syscall is only modifying kernel internal
671             // data structures that the kernel is itself expected to safeguard.
672             let amx_tile = unsafe {
673                 libc::syscall(
674                     libc::SYS_arch_prctl,
675                     ARCH_REQ_XCOMP_GUEST_PERM,
676                     XFEATURE_XTILEDATA,
677                 )
678             };
679 
680             if amx_tile != 0 {
681                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
682             } else {
683                 let mask: usize = 0;
                // SAFETY: `mask` is deliberately not marked mutable: it is only
                // written through this pointer by the kernel, and it isn't in use
                // elsewhere.
686                 let result = unsafe {
687                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
688                 };
689                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
690                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
691                 }
692             }
693         }
694 
695         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
696             let mut cpu_list = Vec::new();
697             for (proximity_domain, numa_node) in numa_nodes.iter() {
698                 for cpu in numa_node.cpus.iter() {
699                     cpu_list.push((*cpu, *proximity_domain))
700                 }
701             }
702             cpu_list
703         }
704         .into_iter()
705         .collect();
706 
707         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
708             cpu_affinity
709                 .iter()
710                 .map(|a| (a.vcpu, a.host_cpus.clone()))
711                 .collect()
712         } else {
713             BTreeMap::new()
714         };
715 
716         #[cfg(feature = "tdx")]
717         let dynamic = !tdx_enabled;
718         #[cfg(not(feature = "tdx"))]
719         let dynamic = true;
720 
721         Ok(Arc::new(Mutex::new(CpuManager {
722             config: config.clone(),
723             interrupt_controller: None,
724             #[cfg(target_arch = "x86_64")]
725             cpuid: Vec::new(),
726             vm,
727             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
728             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
729             vcpus_kick_signalled: Arc::new(AtomicBool::new(false)),
730             vcpu_states,
731             exit_evt,
732             reset_evt,
733             #[cfg(feature = "guest_debug")]
734             vm_debug_evt,
735             selected_cpu: 0,
736             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
737             seccomp_action,
738             vm_ops,
739             acpi_address: None,
740             proximity_domain_per_cpu,
741             affinity,
742             dynamic,
743             hypervisor: hypervisor.clone(),
744             #[cfg(feature = "sev_snp")]
745             sev_snp_enabled,
746         })))
747     }
748 
749     #[cfg(target_arch = "x86_64")]
750     pub fn populate_cpuid(
751         &mut self,
752         memory_manager: &Arc<Mutex<MemoryManager>>,
753         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
754         #[cfg(feature = "tdx")] tdx: bool,
755     ) -> Result<()> {
756         let sgx_epc_sections = memory_manager
757             .lock()
758             .unwrap()
759             .sgx_epc_region()
760             .as_ref()
761             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
762 
763         self.cpuid = {
764             let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
765             arch::generate_common_cpuid(
766                 hypervisor,
767                 &arch::CpuidConfig {
768                     sgx_epc_sections,
769                     phys_bits,
770                     kvm_hyperv: self.config.kvm_hyperv,
771                     #[cfg(feature = "tdx")]
772                     tdx,
773                     amx: self.config.features.amx,
774                 },
775             )
776             .map_err(Error::CommonCpuId)?
777         };
778 
779         Ok(())
780     }
781 
782     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
783         info!("Creating vCPU: cpu_id = {}", cpu_id);
784 
785         #[cfg(target_arch = "x86_64")]
786         let topology = self.get_vcpu_topology();
787         #[cfg(target_arch = "x86_64")]
788         let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
789         #[cfg(target_arch = "aarch64")]
790         let x2apic_id = cpu_id as u32;
791 
792         let mut vcpu = Vcpu::new(
793             cpu_id,
794             x2apic_id as u8,
795             &self.vm,
796             Some(self.vm_ops.clone()),
797             #[cfg(target_arch = "x86_64")]
798             self.hypervisor.get_cpu_vendor(),
799         )?;
800 
801         if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after being created.
803             #[cfg(target_arch = "aarch64")]
804             vcpu.init(&self.vm)?;
805 
806             let state: CpuState = snapshot.to_state().map_err(|e| {
807                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
808             })?;
809             vcpu.vcpu
810                 .set_state(&state)
811                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
812 
813             vcpu.saved_state = Some(state);
814         }
815 
816         let vcpu = Arc::new(Mutex::new(vcpu));
817 
818         // Adding vCPU to the CpuManager's vCPU list.
819         self.vcpus.push(vcpu.clone());
820 
821         Ok(vcpu)
822     }
823 
824     pub fn configure_vcpu(
825         &self,
826         vcpu: Arc<Mutex<Vcpu>>,
827         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
828     ) -> Result<()> {
829         let mut vcpu = vcpu.lock().unwrap();
830 
831         #[cfg(feature = "sev_snp")]
832         if self.sev_snp_enabled {
833             if let Some((kernel_entry_point, _)) = boot_setup {
834                 vcpu.set_sev_control_register(
835                     kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
836                 )?;
837             }
838 
            // The traditional way of configuring a vCPU doesn't work for SEV-SNP
            // guests: all vCPU configuration for an SEV-SNP guest is provided via
            // the VMSA.
841             return Ok(());
842         }
843 
844         #[cfg(target_arch = "x86_64")]
845         assert!(!self.cpuid.is_empty());
846 
847         #[cfg(target_arch = "x86_64")]
848         let topology = self.config.topology.clone().map_or_else(
849             || {
850                 #[cfg(feature = "mshv")]
851                 if matches!(self.hypervisor.hypervisor_type(), HypervisorType::Mshv) {
852                     return Some((1, self.boot_vcpus(), 1));
853                 }
854                 None
855             },
856             |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
857         );
858         #[cfg(target_arch = "x86_64")]
859         vcpu.configure(
860             boot_setup,
861             self.cpuid.clone(),
862             self.config.kvm_hyperv,
863             topology,
864         )?;
865 
866         #[cfg(target_arch = "aarch64")]
867         vcpu.configure(&self.vm, boot_setup)?;
868 
869         Ok(())
870     }
871 
872     /// Only create new vCPUs if there aren't any inactive ones to reuse
873     fn create_vcpus(
874         &mut self,
875         desired_vcpus: u8,
876         snapshot: Option<Snapshot>,
877     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
878         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
879         info!(
880             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
881             desired_vcpus,
882             self.config.max_vcpus,
883             self.vcpus.len(),
884             self.present_vcpus()
885         );
886 
887         if desired_vcpus > self.config.max_vcpus {
888             return Err(Error::DesiredVCpuCountExceedsMax);
889         }
890 
891         // Only create vCPUs in excess of all the allocated vCPUs.
892         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
893             vcpus.push(self.create_vcpu(
894                 cpu_id,
895                 // TODO: The special format of the CPU id can be removed once
896                 // ready to break live upgrade.
897                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
898             )?);
899         }
900 
901         Ok(vcpus)
902     }
903 
904     #[cfg(target_arch = "aarch64")]
905     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
906         for cpu in self.vcpus.iter() {
907             let cpu = cpu.lock().unwrap();
            // Check if the PMU attribute is available; if not, log it and skip PMU init.
909             if cpu.vcpu.has_pmu_support() {
910                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
911             } else {
912                 debug!(
913                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
914                     cpu.id
915                 );
916                 return Ok(false);
917             }
918         }
919 
920         Ok(true)
921     }
922 
923     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
924         self.vcpus.clone()
925     }
926 
927     fn start_vcpu(
928         &mut self,
929         vcpu: Arc<Mutex<Vcpu>>,
930         vcpu_id: u8,
931         vcpu_thread_barrier: Arc<Barrier>,
932         inserting: bool,
933     ) -> Result<()> {
934         let reset_evt = self.reset_evt.try_clone().unwrap();
935         let exit_evt = self.exit_evt.try_clone().unwrap();
936         #[cfg(feature = "kvm")]
937         let hypervisor_type = self.hypervisor.hypervisor_type();
938         #[cfg(feature = "guest_debug")]
939         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
940         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
941         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
942         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
943         let vcpu_kick_signalled = self.vcpus_kick_signalled.clone();
944 
945         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
946         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
947             .vcpu_run_interrupted
948             .clone();
949         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
950         let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();
951 
        // Prepare the CPU set the current vCPU is expected to run on.
953         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
954             // SAFETY: all zeros is a valid pattern
955             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
956             // SAFETY: FFI call, trivially safe
957             unsafe { libc::CPU_ZERO(&mut cpuset) };
958             for host_cpu in host_cpus {
959                 // SAFETY: FFI call, trivially safe
960                 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
961             }
962             cpuset
963         });
964 
965         // Retrieve seccomp filter for vcpu thread
966         let vcpu_seccomp_filter = get_seccomp_filter(
967             &self.seccomp_action,
968             Thread::Vcpu,
969             self.hypervisor.hypervisor_type(),
970         )
971         .map_err(Error::CreateSeccompFilter)?;
972 
973         #[cfg(target_arch = "x86_64")]
974         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
975 
976         info!("Starting vCPU: cpu_id = {}", vcpu_id);
977 
978         let handle = Some(
979             thread::Builder::new()
980                 .name(format!("vcpu{vcpu_id}"))
981                 .spawn(move || {
982                     // Schedule the thread to run on the expected CPU set
983                     if let Some(cpuset) = cpuset.as_ref() {
984                         // SAFETY: FFI call with correct arguments
985                         let ret = unsafe {
986                             libc::sched_setaffinity(
987                                 0,
988                                 std::mem::size_of::<libc::cpu_set_t>(),
989                                 cpuset as *const libc::cpu_set_t,
990                             )
991                         };
992 
993                         if ret != 0 {
994                             error!(
995                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
996                                 vcpu_id,
997                                 io::Error::last_os_error()
998                             );
999                             return;
1000                         }
1001                     }
1002 
1003                     // Apply seccomp filter for vcpu thread.
1004                     if !vcpu_seccomp_filter.is_empty() {
1005                         if let Err(e) =
1006                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
1007                         {
1008                             error!("Error applying seccomp filter: {:?}", e);
1009                             return;
1010                         }
1011                     }
1012                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // Register a no-op, async-signal-safe handler whose only purpose
                    // is to interrupt the vCPU thread's blocking run() call.
1014                     register_signal_handler(SIGRTMIN(), handle_signal)
1015                         .expect("Failed to register vcpu signal handler");
1016                     // Block until all CPUs are ready.
1017                     vcpu_thread_barrier.wait();
1018 
1019                     std::panic::catch_unwind(move || {
1020                         loop {
1021                             // If we are being told to pause, we park the thread
1022                             // until the pause boolean is toggled.
1023                             // The resume operation is responsible for toggling
1024                             // the boolean and unpark the thread.
1025                             // We enter a loop because park() could spuriously
1026                             // return. We will then park() again unless the
1027                             // pause boolean has been toggled.
1028 
1029                             // Need to use Ordering::SeqCst as we have multiple
1030                             // loads and stores to different atomics and we need
1031                             // to see them in a consistent order in all threads
1032 
1033                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
1034                                 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
1035                                 // completed by returning to KVM_RUN. From the kernel docs:
1036                                 //
1037                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
1038                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
1039                                 // operations are complete (and guest state is consistent) only after userspace
1040                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
1041                                 // incomplete operations and then check for pending signals.
1042                                 // The pending state of the operation is not preserved in state which is
1043                                 // visible to userspace, thus userspace should ensure that the operation is
1044                                 // completed before performing a live migration.  Userspace can re-enter the
1045                                 // guest with an unmasked signal pending or with the immediate_exit field set
1046                                 // to complete pending operations without allowing any further instructions
1047                                 // to be executed.
1048 
1049                                 #[cfg(feature = "kvm")]
1050                                 if matches!(hypervisor_type, HypervisorType::Kvm) {
1051                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
1052                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
1053                                         error!("Unexpected VM exit on \"immediate_exit\" run");
1054                                         break;
1055                                     }
1056                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
1057                                 }
1058 
1059                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1060 
1061                                 vcpu_paused.store(true, Ordering::SeqCst);
1062                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
1063                                     thread::park();
1064                                 }
1065                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
1066                             }
1067 
1068                             if vcpu_kick_signalled.load(Ordering::SeqCst) {
1069                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1070                                 #[cfg(target_arch = "x86_64")]
1071                                 match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
1072                                     Ok(()) => {},
1073                                     Err(e) => {
                                        error!("Error injecting NMI: {}", e);
1075                                         break;
1076                                     }
1077                                 }
1078                             }
1079 
1080                             // We've been told to terminate
1081                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1082                                 || vcpu_kill.load(Ordering::SeqCst)
1083                             {
1084                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1085                                 break;
1086                             }
1087 
1088                             #[cfg(feature = "tdx")]
1089                             let mut vcpu = vcpu.lock().unwrap();
1090                             #[cfg(not(feature = "tdx"))]
1091                             let vcpu = vcpu.lock().unwrap();
                            // vcpu.run() returns VmExit::Reset on a triple-fault, which triggers a reset below
1093                             match vcpu.run() {
1094                                 Ok(run) => match run {
1095                                     #[cfg(feature = "kvm")]
1096                                     VmExit::Debug => {
1097                                         info!("VmExit::Debug");
1098                                         #[cfg(feature = "guest_debug")]
1099                                         {
1100                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
1101                                             let raw_tid = get_raw_tid(vcpu_id as usize);
1102                                             vm_debug_evt.write(raw_tid as u64).unwrap();
1103                                         }
1104                                     }
1105                                     #[cfg(target_arch = "x86_64")]
1106                                     VmExit::IoapicEoi(vector) => {
1107                                         if let Some(interrupt_controller) =
1108                                             &interrupt_controller_clone
1109                                         {
1110                                             interrupt_controller
1111                                                 .lock()
1112                                                 .unwrap()
1113                                                 .end_of_interrupt(vector);
1114                                         }
1115                                     }
1116                                     VmExit::Ignore => {}
1117                                     VmExit::Hyperv => {}
1118                                     VmExit::Reset => {
1119                                         info!("VmExit::Reset");
1120                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1121                                         reset_evt.write(1).unwrap();
1122                                         break;
1123                                     }
1124                                     VmExit::Shutdown => {
1125                                         info!("VmExit::Shutdown");
1126                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1127                                         exit_evt.write(1).unwrap();
1128                                         break;
1129                                     }
1130                                     #[cfg(feature = "tdx")]
1131                                     VmExit::Tdx => {
1132                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1133                                             match vcpu.get_tdx_exit_details() {
1134                                                 Ok(details) => match details {
1135                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1136                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1137                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1138                                                     }
1139                                                 },
1140                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1141                                             }
1142                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1143                                         } else {
                                            // We should never reach this code;
                                            // getting here means the surrounding
                                            // design is wrong.
1147                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1148                                         }
1149                                     }
1150                                     _ => {
1151                                         error!(
1152                                             "VCPU generated error: {:?}",
1153                                             Error::UnexpectedVmExit
1154                                         );
1155                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1156                                         exit_evt.write(1).unwrap();
1157                                         break;
1158                                     }
1159                                 },
1160 
1161                                 Err(e) => {
1162                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1163                                     vcpu_run_interrupted.store(true, Ordering::SeqCst);
1164                                     exit_evt.write(1).unwrap();
1165                                     break;
1166                                 }
1167                             }
1168 
1169                             // We've been told to terminate
1170                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1171                                 || vcpu_kill.load(Ordering::SeqCst)
1172                             {
1173                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1174                                 break;
1175                             }
1176                         }
1177                     })
1178                     .or_else(|_| {
1179                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1180                         error!("vCPU thread panicked");
1181                         panic_exit_evt.write(1)
1182                     })
1183                     .ok();
1184                 })
1185                 .map_err(Error::VcpuSpawn)?,
1186         );
1187 
        // On hotplug, calls into this function have no entry point (it is None).
        // It is for those hotplugged CPU additions that we need to set the
        // inserting flag.
1190         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1191         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1192 
1193         Ok(())
1194     }
1195 
1196     /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
1197     fn activate_vcpus(
1198         &mut self,
1199         desired_vcpus: u8,
1200         inserting: bool,
1201         paused: Option<bool>,
1202     ) -> Result<()> {
1203         if desired_vcpus > self.config.max_vcpus {
1204             return Err(Error::DesiredVCpuCountExceedsMax);
1205         }
1206 
1207         let vcpu_thread_barrier = Arc::new(Barrier::new(
1208             (desired_vcpus - self.present_vcpus() + 1) as usize,
1209         ));
1210 
1211         if let Some(paused) = paused {
1212             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1213         }
1214 
1215         info!(
1216             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1217             desired_vcpus,
1218             self.vcpus.len(),
1219             self.present_vcpus(),
1220             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1221         );
1222 
1223         // This reuses any inactive vCPUs as well as any that were newly created
1224         for vcpu_id in self.present_vcpus()..desired_vcpus {
1225             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1226             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1227         }
1228 
1229         // Unblock all CPU threads.
1230         vcpu_thread_barrier.wait();
1231         Ok(())
1232     }
1233 
1234     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1235         // Mark vCPUs for removal, actual removal happens on ejection
1236         for cpu_id in desired_vcpus..self.present_vcpus() {
1237             self.vcpu_states[usize::from(cpu_id)].removing = true;
1238             self.vcpu_states[usize::from(cpu_id)]
1239                 .pending_removal
1240                 .store(true, Ordering::SeqCst);
1241         }
1242     }
1243 
1244     pub fn check_pending_removed_vcpu(&mut self) -> bool {
1245         for state in self.vcpu_states.iter() {
1246             if state.active() && state.pending_removal.load(Ordering::SeqCst) {
1247                 return true;
1248             }
1249         }
1250         false
1251     }
1252 
1253     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1254         info!("Removing vCPU: cpu_id = {}", cpu_id);
1255         let state = &mut self.vcpu_states[usize::from(cpu_id)];
1256         state.kill.store(true, Ordering::SeqCst);
1257         state.signal_thread();
1258         state.join_thread()?;
1259         state.handle = None;
1260 
        // Once the thread has exited, clear the "kill" so that it can be reused
1262         state.kill.store(false, Ordering::SeqCst);
1263         state.pending_removal.store(false, Ordering::SeqCst);
1264 
1265         Ok(())
1266     }
1267 
1268     pub fn create_boot_vcpus(
1269         &mut self,
1270         snapshot: Option<Snapshot>,
1271     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1272         trace_scoped!("create_boot_vcpus");
1273 
1274         self.create_vcpus(self.boot_vcpus(), snapshot)
1275     }
1276 
1277     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1278     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1279         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1280     }
1281 
1282     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1283         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1284             .map_err(|e| {
1285                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1286             })?;
1287 
1288         Ok(())
1289     }
1290 
1291     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus == self.present_vcpus() {
1293             return Ok(false);
1294         }
1295 
1296         if !self.dynamic {
1297             return Ok(false);
1298         }
1299 
1300         if self.check_pending_removed_vcpu() {
1301             return Err(Error::VcpuPendingRemovedVcpu);
1302         }
1303 
1304         match desired_vcpus.cmp(&self.present_vcpus()) {
1305             cmp::Ordering::Greater => {
1306                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1307                 for vcpu in vcpus {
1308                     self.configure_vcpu(vcpu, None)?
1309                 }
1310                 self.activate_vcpus(desired_vcpus, true, None)?;
1311                 Ok(true)
1312             }
1313             cmp::Ordering::Less => {
1314                 self.mark_vcpus_for_removal(desired_vcpus);
1315                 Ok(true)
1316             }
1317             _ => Ok(false),
1318         }
1319     }
1320 
1321     pub fn shutdown(&mut self) -> Result<()> {
1322         // Tell the vCPUs to stop themselves next time they go through the loop
1323         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1324 
1325         // Toggle the vCPUs pause boolean
1326         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1327 
1328         // Unpark all the VCPU threads.
1329         for state in self.vcpu_states.iter() {
1330             state.unpark_thread();
1331         }
1332 
1333         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1334         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1335         // above.
1336         for state in self.vcpu_states.iter() {
1337             state.signal_thread();
1338         }
1339 
1340         // Wait for all the threads to finish. This removes the state from the vector.
1341         for mut state in self.vcpu_states.drain(..) {
1342             state.join_thread()?;
1343         }
1344 
1345         Ok(())
1346     }
1347 
1348     #[cfg(feature = "tdx")]
1349     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1350         for vcpu in &self.vcpus {
1351             vcpu.lock()
1352                 .unwrap()
1353                 .vcpu
1354                 .tdx_init(hob_address)
1355                 .map_err(Error::InitializeTdx)?;
1356         }
1357         Ok(())
1358     }
1359 
1360     pub fn boot_vcpus(&self) -> u8 {
1361         self.config.boot_vcpus
1362     }
1363 
1364     pub fn max_vcpus(&self) -> u8 {
1365         self.config.max_vcpus
1366     }
1367 
1368     #[cfg(target_arch = "x86_64")]
1369     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1370         assert!(!self.cpuid.is_empty());
1371         self.cpuid.clone()
1372     }
1373 
1374     fn present_vcpus(&self) -> u8 {
1375         self.vcpu_states
1376             .iter()
1377             .fold(0, |acc, state| acc + state.active() as u8)
1378     }
1379 
1380     #[cfg(target_arch = "aarch64")]
1381     pub fn get_mpidrs(&self) -> Vec<u64> {
1382         self.vcpus
1383             .iter()
1384             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1385             .collect()
1386     }
1387 
1388     #[cfg(target_arch = "aarch64")]
1389     pub fn get_saved_states(&self) -> Vec<CpuState> {
1390         self.vcpus
1391             .iter()
1392             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1393             .collect()
1394     }
1395 
1396     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1397         self.config
1398             .topology
1399             .clone()
1400             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1401     }
1402 
1403     pub fn create_madt(&self) -> Sdt {
1404         use crate::acpi;
1405         // This is also checked in the command-line parsing.
1406         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1407 
1408         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1409         #[cfg(target_arch = "x86_64")]
1410         {
1411             madt.write(36, arch::layout::APIC_START.0);
1412 
1413             for cpu in 0..self.config.max_vcpus {
1414                 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());
1415 
1416                 let lapic = LocalX2Apic {
1417                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1418                     length: 16,
1419                     processor_id: cpu.into(),
1420                     apic_id: x2apic_id,
1421                     flags: if cpu < self.config.boot_vcpus {
1422                         1 << MADT_CPU_ENABLE_FLAG
1423                     } else {
1424                         0
1425                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1426                     _reserved: 0,
1427                 };
1428                 madt.append(lapic);
1429             }
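                 // For illustration: with boot_vcpus = 2 and max_vcpus = 4, vCPUs
                 // 0 and 1 get flags 0b11 (Enabled | Online Capable) while vCPUs
                 // 2 and 3 get flags 0b10 (Online Capable only), letting the
                 // guest bring them online after hotplug.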
1430 
1431             madt.append(Ioapic {
1432                 r#type: acpi::ACPI_APIC_IO,
1433                 length: 12,
1434                 ioapic_id: 0,
1435                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1436                 gsi_base: 0,
1437                 ..Default::default()
1438             });
1439 
1440             madt.append(InterruptSourceOverride {
1441                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1442                 length: 10,
1443                 bus: 0,
1444                 source: 4,
1445                 gsi: 4,
1446                 flags: 0,
1447             });
1448         }
1449 
1450         #[cfg(target_arch = "aarch64")]
1451         {
1452             /* Notes:
1453              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1454              */
1455 
1456             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1457             for cpu in 0..self.config.boot_vcpus {
1458                 let vcpu = &self.vcpus[cpu as usize];
1459                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1460                 /* ARMv8 MPIDR format:
1461                      Bits [63:40] Must be zero
1462                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1463                      Bits [31:24] Must be zero
1464                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1465                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1466                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1467                 */
1468                 let mpidr_mask = 0xff_00ff_ffff;
1469                 let gicc = GicC {
1470                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1471                     length: 80,
1472                     reserved0: 0,
1473                     cpu_interface_number: cpu as u32,
1474                     uid: cpu as u32,
1475                     flags: 1,
1476                     parking_version: 0,
1477                     performance_interrupt: 0,
1478                     parked_address: 0,
1479                     base_address: 0,
1480                     gicv_base_address: 0,
1481                     gich_base_address: 0,
1482                     vgic_interrupt: 0,
1483                     gicr_base_address: 0,
1484                     mpidr: mpidr & mpidr_mask,
1485                     proc_power_effi_class: 0,
1486                     reserved1: 0,
1487                     spe_overflow_interrupt: 0,
1488                 };
1489 
1490                 madt.append(gicc);
1491             }
1492             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1493 
1494             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1495             let gicd = GicD {
1496                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1497                 length: 24,
1498                 reserved0: 0,
1499                 gic_id: 0,
1500                 base_address: vgic_config.dist_addr,
1501                 global_irq_base: 0,
1502                 version: 3,
1503                 reserved1: [0; 3],
1504             };
1505             madt.append(gicd);
1506 
1507             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1508             let gicr = GicR {
1509                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1510                 length: 16,
1511                 reserved: 0,
1512                 base_address: vgic_config.redists_addr,
1513                 range_length: vgic_config.redists_size as u32,
1514             };
1515             madt.append(gicr);
1516 
1517             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1518             let gicits = GicIts {
1519                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1520                 length: 20,
1521                 reserved0: 0,
1522                 translation_id: 0,
1523                 base_address: vgic_config.msi_addr,
1524                 reserved1: 0,
1525             };
1526             madt.append(gicits);
1527 
1528             madt.update_checksum();
1529         }
1530 
1531         madt
1532     }
1533 
1534     #[cfg(target_arch = "aarch64")]
1535     pub fn create_pptt(&self) -> Sdt {
1536         let pptt_start = 0;
1537         let mut cpus = 0;
1538         let mut uid = 0;
1539         // If topology is not specified, the default setting is:
1540         // 1 package, multiple cores, 1 thread per core
1541         // This is also the behavior when PPTT is missing.
1542         let (threads_per_core, cores_per_package, packages) =
1543             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
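             // For example, with max_vcpus() == 8 and no explicit topology this
             // yields (1, 8, 1): a single package of eight single-thread cores,
             // each emitted below as a leaf node with flags 0xA.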
1544 
1545         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1546 
1547         for cluster_idx in 0..packages {
1548             if cpus < self.config.boot_vcpus as usize {
1549                 let cluster_offset = pptt.len() - pptt_start;
1550                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1551                     r#type: 0,
1552                     length: 20,
1553                     reserved: 0,
1554                     flags: 0x2,
1555                     parent: 0,
1556                     acpi_processor_id: cluster_idx as u32,
1557                     num_private_resources: 0,
1558                 };
1559                 pptt.append(cluster_hierarchy_node);
1560 
1561                 for core_idx in 0..cores_per_package {
1562                     let core_offset = pptt.len() - pptt_start;
1563 
1564                     if threads_per_core > 1 {
1565                         let core_hierarchy_node = ProcessorHierarchyNode {
1566                             r#type: 0,
1567                             length: 20,
1568                             reserved: 0,
1569                             flags: 0x2,
1570                             parent: cluster_offset as u32,
1571                             acpi_processor_id: core_idx as u32,
1572                             num_private_resources: 0,
1573                         };
1574                         pptt.append(core_hierarchy_node);
1575 
1576                         for _thread_idx in 0..threads_per_core {
1577                             let thread_hierarchy_node = ProcessorHierarchyNode {
1578                                 r#type: 0,
1579                                 length: 20,
1580                                 reserved: 0,
1581                                 flags: 0xE,
1582                                 parent: core_offset as u32,
1583                                 acpi_processor_id: uid as u32,
1584                                 num_private_resources: 0,
1585                             };
1586                             pptt.append(thread_hierarchy_node);
1587                             uid += 1;
1588                         }
1589                     } else {
1590                         let thread_hierarchy_node = ProcessorHierarchyNode {
1591                             r#type: 0,
1592                             length: 20,
1593                             reserved: 0,
1594                             flags: 0xA,
1595                             parent: cluster_offset as u32,
1596                             acpi_processor_id: uid as u32,
1597                             num_private_resources: 0,
1598                         };
1599                         pptt.append(thread_hierarchy_node);
1600                         uid += 1;
1601                     }
1602                 }
1603                 cpus += (cores_per_package * threads_per_core) as usize;
1604             }
1605         }
1606 
1607         pptt.update_checksum();
1608         pptt
1609     }
1610 
1611     #[cfg(feature = "guest_debug")]
1612     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1613         self.vcpus[usize::from(cpu_id)]
1614             .lock()
1615             .unwrap()
1616             .vcpu
1617             .get_regs()
1618             .map_err(Error::CpuDebug)
1619     }
1620 
1621     #[cfg(feature = "guest_debug")]
1622     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1623         self.vcpus[usize::from(cpu_id)]
1624             .lock()
1625             .unwrap()
1626             .vcpu
1627             .set_regs(regs)
1628             .map_err(Error::CpuDebug)
1629     }
1630 
1631     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1632     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1633         self.vcpus[usize::from(cpu_id)]
1634             .lock()
1635             .unwrap()
1636             .vcpu
1637             .get_sregs()
1638             .map_err(Error::CpuDebug)
1639     }
1640 
1641     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1642     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1643         self.vcpus[usize::from(cpu_id)]
1644             .lock()
1645             .unwrap()
1646             .vcpu
1647             .set_sregs(sregs)
1648             .map_err(Error::CpuDebug)
1649     }
1650 
1651     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1652     fn translate_gva(
1653         &self,
1654         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1655         cpu_id: u8,
1656         gva: u64,
1657     ) -> Result<u64> {
1658         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1659             .lock()
1660             .unwrap()
1661             .vcpu
1662             .translate_gva(gva, /* flags: unused */ 0)
1663             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1664         Ok(gpa)
1665     }
1666 
1667     ///
1668     /// On AArch64, the `translate_gva` API is not provided by KVM, so we
1669     /// implement it in the VMM by walking the translation tables.
1670     ///
1671     /// Address translation is a big topic; here we only cover the scenario
1672     /// that arises in the VMM while debugging the guest kernel. This
1673     /// `translate_gva` implementation is restricted to:
1674     /// - Exception Level 1
1675     /// - Translating the high address range only (kernel space)
1676     ///
1677     /// This implementation supports the following Armv8-A features related
1678     /// to address translation:
1679     /// - FEAT_LPA
1680     /// - FEAT_LVA
1681     /// - FEAT_LPA2
1682     ///
1683     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1684     fn translate_gva(
1685         &self,
1686         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1687         cpu_id: u8,
1688         gva: u64,
1689     ) -> Result<u64> {
1690         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1691             .lock()
1692             .unwrap()
1693             .vcpu
1694             .get_sys_reg(regs::TCR_EL1)
1695             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1696         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1697             .lock()
1698             .unwrap()
1699             .vcpu
1700             .get_sys_reg(regs::TTBR1_EL1)
1701             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1702         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1703             .lock()
1704             .unwrap()
1705             .vcpu
1706             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1707             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1708 
1709         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1710         // or low (0x000xxx...).
1711         let high_range = extract_bits_64!(gva, 55, 1);
1712         if high_range == 0 {
1713             info!("VA (0x{:x}) range is not supported!", gva);
1714             return Ok(gva);
1715         }
1716 
1717         // High range size offset
1718         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1719         // Granule size
1720         let tg = extract_bits_64!(tcr_el1, 30, 2);
1721         // Indicates 48-bit (0) or 52-bit (1) addressing for FEAT_LPA2
1722         let ds = extract_bits_64!(tcr_el1, 59, 1);
1723 
1724         if tsz == 0 {
1725             info!("VA translation is not ready!");
1726             return Ok(gva);
1727         }
1728 
1729         // VA size is determined by TCR_EL1.T1SZ
1730         let va_size = 64 - tsz;
1731         // Number of bits in VA consumed in each level of translation
1732         let stride = match tg {
1733             3 => 13, // 64KB granule size
1734             1 => 11, // 16KB granule size
1735             _ => 9,  // 4KB, default
1736         };
1737         // Starting level of walking
1738         let mut level = 4 - (va_size - 4) / stride;
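             // Worked example (illustrative): with a 4KB granule (stride = 9)
             // and T1SZ = 16, va_size = 48, so the walk starts at level
             // 4 - (48 - 4) / 9 = 0 and each level consumes 9 VA bits.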
1739 
1740         // Determine the PA or IPA size
1741         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1742         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1743         // The IPA size in TCR_EL1 and the PA Range in ID_AA64MMFR0_EL1 should match.
1744         // To be safe, we use the minimum value if they are different.
1745         let pa_range = std::cmp::min(tcr_ips, pa_range);
1746         // PA size in bits
1747         let pa_size = match pa_range {
1748             0 => 32,
1749             1 => 36,
1750             2 => 40,
1751             3 => 42,
1752             4 => 44,
1753             5 => 48,
1754             6 => 52,
1755             _ => {
1756                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1757                     "PA range not supported {pa_range}"
1758                 ))))
1759             }
1760         };
1761 
1762         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1763         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1764         // If FEAT_LPA2 is present, the translation table descriptor holds
1765         // 50 bits of the table address of the next level.
1766         // Otherwise, it is 48 bits.
1767         let descaddrmask = if ds == 1 {
1768             !0u64 >> (64 - 50) // mask with 50 least significant bits
1769         } else {
1770             !0u64 >> (64 - 48) // mask with 48 least significant bits
1771         };
1772         let descaddrmask = descaddrmask & !indexmask_grainsize;
1773 
1774         // Translation table base address
1775         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1776         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1777         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1778         if pa_size == 52 {
1779             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1780         }
1781 
1782         // Loop through tables of each level
1783         loop {
1784             // Table offset for current level
1785             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1786             descaddr |= table_offset;
1787             descaddr &= !7u64;
1788 
1789             let mut buf = [0; 8];
1790             guest_memory
1791                 .memory()
1792                 .read(&mut buf, GuestAddress(descaddr))
1793                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1794             let descriptor = u64::from_le_bytes(buf);
1795 
1796             descaddr = descriptor & descaddrmask;
1797             // In the case of FEAT_LPA, the next-level translation table address
1798             // bits [48:51] come from bits [12:15] of the current descriptor.
1799             // For FEAT_LPA2, the next-level translation table address
1800             // bits [50:51] come from bits [8:9] of the current descriptor, and
1801             // bits [48:49] come from bits [48:49] of the previously handled
1802             // descriptor.
1803             if pa_size == 52 {
1804                 if ds == 1 {
1805                     // FEAT_LPA2
1806                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1807                 } else {
1808                     // FEAT_LPA
1809                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1810                 }
1811             }
1812 
1813             if (descriptor & 2) != 0 && (level < 3) {
1814                 // This is a table entry. Go down to next level.
1815                 level += 1;
1816                 indexmask = indexmask_grainsize;
1817                 continue;
1818             }
1819 
1820             break;
1821         }
1822 
1823         // We have reached either:
1824         // - a page entry at level 3 or
1825         // - a block entry at level 1 or 2
1826         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1827         descaddr &= !(page_size - 1);
1828         descaddr |= gva & (page_size - 1);
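             // E.g. with a 4KB granule (stride = 9), a level-3 page entry gives
             // page_size = 1 << (9 + 3) = 4KiB, while a level-2 block entry
             // gives 1 << (18 + 3) = 2MiB.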
1829 
1830         Ok(descaddr)
1831     }
1832 
1833     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1834         self.acpi_address = Some(acpi_address);
1835     }
1836 
1837     pub(crate) fn set_interrupt_controller(
1838         &mut self,
1839         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1840     ) {
1841         self.interrupt_controller = Some(interrupt_controller);
1842     }
1843 
1844     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1845         &self.vcpus_kill_signalled
1846     }
1847 
1848     #[cfg(feature = "igvm")]
1849     pub(crate) fn get_cpuid_leaf(
1850         &self,
1851         cpu_id: u8,
1852         eax: u32,
1853         ecx: u32,
1854         xfem: u64,
1855         xss: u64,
1856     ) -> Result<[u32; 4]> {
1857         let leaf_info = self.vcpus[usize::from(cpu_id)]
1858             .lock()
1859             .unwrap()
1860             .vcpu
1861             .get_cpuid_values(eax, ecx, xfem, xss)
1862             .unwrap();
1863         Ok(leaf_info)
1864     }
1865 
1866     #[cfg(feature = "sev_snp")]
1867     pub(crate) fn sev_snp_enabled(&self) -> bool {
1868         self.sev_snp_enabled
1869     }
1870 
1871     pub(crate) fn nmi(&self) -> Result<()> {
1872         self.vcpus_kick_signalled.store(true, Ordering::SeqCst);
1873 
1874         for state in self.vcpu_states.iter() {
1875             state.signal_thread();
1876         }
1877 
1878         self.vcpus_kick_signalled.store(false, Ordering::SeqCst);
1879 
1880         Ok(())
1881     }
1882 }
1883 
1884 struct Cpu {
1885     cpu_id: u8,
1886     proximity_domain: u32,
1887     dynamic: bool,
1888     #[cfg(target_arch = "x86_64")]
1889     topology: Option<(u8, u8, u8)>,
1890 }
1891 
1892 #[cfg(target_arch = "x86_64")]
1893 const MADT_CPU_ENABLE_FLAG: usize = 0;
1894 
1895 #[cfg(target_arch = "x86_64")]
1896 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1897 
1898 impl Cpu {
1899     #[cfg(target_arch = "x86_64")]
1900     fn generate_mat(&self) -> Vec<u8> {
1901         let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);
1902 
1903         let lapic = LocalX2Apic {
1904             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1905             length: 16,
1906             processor_id: self.cpu_id.into(),
1907             apic_id: x2apic_id,
1908             flags: 1 << MADT_CPU_ENABLE_FLAG,
1909             _reserved: 0,
1910         };
1911 
1912         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1913         // SAFETY: mat_data is large enough to hold lapic
1914         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
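             // Note: this raw write also assumes LocalX2Apic has a packed C
             // layout with no padding. A layout-independent alternative (sketch,
             // not the current implementation) would serialize field by field:
             //
             //     mat_data.extend_from_slice(&lapic.apic_id.to_le_bytes());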
1915 
1916         mat_data
1917     }
1918 }
1919 
1920 impl Aml for Cpu {
1921     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1922         #[cfg(target_arch = "x86_64")]
1923         let mat_data: Vec<u8> = self.generate_mat();
1924         #[allow(clippy::if_same_then_else)]
1925         if self.dynamic {
1926             aml::Device::new(
1927                 format!("C{:03X}", self.cpu_id).as_str().into(),
1928                 vec![
1929                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1930                     &aml::Name::new("_UID".into(), &self.cpu_id),
1931                     // Currently, AArch64 does not support the following fields.
1932                     /*
1933                     _STA return value:
1934                     Bit [0] – Set if the device is present.
1935                     Bit [1] – Set if the device is enabled and decoding its resources.
1936                     Bit [2] – Set if the device should be shown in the UI.
1937                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1938                     Bit [4] – Set if the battery is present.
1939                     Bits [31:5] – Reserved (must be cleared).
1940                     */
1941                     #[cfg(target_arch = "x86_64")]
1942                     &aml::Method::new(
1943                         "_STA".into(),
1944                         0,
1945                         false,
1946                         // Call into the CSTA method, which will interrogate the device
1947                         vec![&aml::Return::new(&aml::MethodCall::new(
1948                             "CSTA".into(),
1949                             vec![&self.cpu_id],
1950                         ))],
1951                     ),
1952                     &aml::Method::new(
1953                         "_PXM".into(),
1954                         0,
1955                         false,
1956                         vec![&aml::Return::new(&self.proximity_domain)],
1957                     ),
1958                     // The Linux kernel expects every CPU device to have a _MAT entry
1959                     // containing the LAPIC for this processor with the enabled bit set,
1960                     // even if it is disabled in the MADT (non-boot CPU)
1961                     #[cfg(target_arch = "x86_64")]
1962                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1963                     // Trigger CPU ejection
1964                     #[cfg(target_arch = "x86_64")]
1965                     &aml::Method::new(
1966                         "_EJ0".into(),
1967                         1,
1968                         false,
1969                         // Call into the CEJ0 method, which will actually eject the device
1970                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1971                     ),
1972                 ],
1973             )
1974             .to_aml_bytes(sink);
1975         } else {
1976             aml::Device::new(
1977                 format!("C{:03X}", self.cpu_id).as_str().into(),
1978                 vec![
1979                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1980                     &aml::Name::new("_UID".into(), &self.cpu_id),
1981                     #[cfg(target_arch = "x86_64")]
1982                     &aml::Method::new(
1983                         "_STA".into(),
1984                         0,
1985                         false,
1986                         // Mark the CPU present; see the CSTA implementation
1987                         vec![&aml::Return::new(&0xfu8)],
1988                     ),
1989                     &aml::Method::new(
1990                         "_PXM".into(),
1991                         0,
1992                         false,
1993                         vec![&aml::Return::new(&self.proximity_domain)],
1994                     ),
1995                     // The Linux kernel expects every CPU device to have a _MAT entry
1996                     // containing the LAPIC for this processor with the enabled bit set,
1997                     // even if it is disabled in the MADT (non-boot CPU)
1998                     #[cfg(target_arch = "x86_64")]
1999                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
2000                 ],
2001             )
2002             .to_aml_bytes(sink);
2003         }
2004     }
2005 }
2006 
2007 struct CpuNotify {
2008     cpu_id: u8,
2009 }
2010 
2011 impl Aml for CpuNotify {
2012     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2013         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
2014         aml::If::new(
2015             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
2016             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2017         )
2018         .to_aml_bytes(sink)
2019     }
2020 }
2021 
2022 struct CpuMethods {
2023     max_vcpus: u8,
2024     dynamic: bool,
2025 }
2026 
2027 impl Aml for CpuMethods {
2028     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2029         if self.dynamic {
2030             // CPU status method
2031             aml::Method::new(
2032                 "CSTA".into(),
2033                 1,
2034                 true,
2035                 vec![
2036                     // Take lock defined above
2037                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2038                     // Write CPU number (in first argument) to I/O port via field
2039                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2040                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2041                     // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for the meaning)
2042                     &aml::If::new(
2043                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
2044                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2045                     ),
2046                     // Release lock
2047                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2048                     // Return 0 or 0xf
2049                     &aml::Return::new(&aml::Local(0)),
2050                 ],
2051             )
2052             .to_aml_bytes(sink);
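                 // Roughly equivalent ASL for the method emitted above
                 // (illustrative rendering, not generated output):
                 //
                 //     Method (CSTA, 1, Serialized) {
                 //         Acquire (\_SB.PRES.CPLK, 0xFFFF)
                 //         Store (Arg0, \_SB.PRES.CSEL)
                 //         Store (Zero, Local0)
                 //         If (LEqual (\_SB.PRES.CPEN, One)) {
                 //             Store (0x0F, Local0)
                 //         }
                 //         Release (\_SB.PRES.CPLK)
                 //         Return (Local0)
                 //     }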
2053 
2054             let mut cpu_notifies = Vec::new();
2055             for cpu_id in 0..self.max_vcpus {
2056                 cpu_notifies.push(CpuNotify { cpu_id });
2057             }
2058 
2059             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
2060             for cpu_id in 0..self.max_vcpus {
2061                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
2062             }
2063 
2064             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
2065 
2066             aml::Method::new(
2067                 "CEJ0".into(),
2068                 1,
2069                 true,
2070                 vec![
2071                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2072                     // Write CPU number (in first argument) to I/O port via field
2073                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2074                     // Set CEJ0 bit
2075                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
2076                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2077                 ],
2078             )
2079             .to_aml_bytes(sink);
2080 
2081             aml::Method::new(
2082                 "CSCN".into(),
2083                 0,
2084                 true,
2085                 vec![
2086                     // Take lock defined above
2087                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2088                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2089                     &aml::While::new(
2090                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
2091                         vec![
2092                             // Write CPU number (in first argument) to I/O port via field
2093                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
2094                             // Check if CINS bit is set
2095                             &aml::If::new(
2096                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
2097                                 // Notify device if it is
2098                                 vec![
2099                                     &aml::MethodCall::new(
2100                                         "CTFY".into(),
2101                                         vec![&aml::Local(0), &aml::ONE],
2102                                     ),
2103                                     // Reset CINS bit
2104                                     &aml::Store::new(
2105                                         &aml::Path::new("\\_SB_.PRES.CINS"),
2106                                         &aml::ONE,
2107                                     ),
2108                                 ],
2109                             ),
2110                             // Check if CRMV bit is set
2111                             &aml::If::new(
2112                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
2113                                 // Notify device if it is (with the eject constant 0x3)
2114                                 vec![
2115                                     &aml::MethodCall::new(
2116                                         "CTFY".into(),
2117                                         vec![&aml::Local(0), &3u8],
2118                                     ),
2119                                     // Reset CRMV bit
2120                                     &aml::Store::new(
2121                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
2122                                         &aml::ONE,
2123                                     ),
2124                                 ],
2125                             ),
2126                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2127                         ],
2128                     ),
2129                     // Release lock
2130                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2131                 ],
2132             )
2133             .to_aml_bytes(sink)
2134         } else {
2135             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2136         }
2137     }
2138 }
2139 
2140 impl Aml for CpuManager {
2141     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2142         #[cfg(target_arch = "x86_64")]
2143         if let Some(acpi_address) = self.acpi_address {
2144             // CPU hotplug controller
2145             aml::Device::new(
2146                 "_SB_.PRES".into(),
2147                 vec![
2148                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2149                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2150                     // Mutex to protect concurrent access, as we write to choose the CPU and then read back its status
2151                     &aml::Mutex::new("CPLK".into(), 0),
2152                     &aml::Name::new(
2153                         "_CRS".into(),
2154                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2155                             aml::AddressSpaceCacheable::NotCacheable,
2156                             true,
2157                             acpi_address.0,
2158                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2159                             None,
2160                         )]),
2161                     ),
2162                     // OpRegion and Fields map MMIO range into individual field values
2163                     &aml::OpRegion::new(
2164                         "PRST".into(),
2165                         aml::OpRegionSpace::SystemMemory,
2166                         &(acpi_address.0 as usize),
2167                         &CPU_MANAGER_ACPI_SIZE,
2168                     ),
2169                     &aml::Field::new(
2170                         "PRST".into(),
2171                         aml::FieldAccessType::Byte,
2172                         aml::FieldLockRule::NoLock,
2173                         aml::FieldUpdateRule::WriteAsZeroes,
2174                         vec![
2175                             aml::FieldEntry::Reserved(32),
2176                             aml::FieldEntry::Named(*b"CPEN", 1),
2177                             aml::FieldEntry::Named(*b"CINS", 1),
2178                             aml::FieldEntry::Named(*b"CRMV", 1),
2179                             aml::FieldEntry::Named(*b"CEJ0", 1),
2180                             aml::FieldEntry::Reserved(4),
2181                             aml::FieldEntry::Named(*b"CCMD", 8),
2182                         ],
2183                     ),
2184                     &aml::Field::new(
2185                         "PRST".into(),
2186                         aml::FieldAccessType::DWord,
2187                         aml::FieldLockRule::NoLock,
2188                         aml::FieldUpdateRule::Preserve,
2189                         vec![
2190                             aml::FieldEntry::Named(*b"CSEL", 32),
2191                             aml::FieldEntry::Reserved(32),
2192                             aml::FieldEntry::Named(*b"CDAT", 32),
2193                         ],
2194                     ),
2195                 ],
2196             )
2197             .to_aml_bytes(sink);
2198         }
2199 
2200         // CPU devices
2201         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2202         let cid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2203         // Bundle methods together under a common object
2204         let methods = CpuMethods {
2205             max_vcpus: self.config.max_vcpus,
2206             dynamic: self.dynamic,
2207         };
2208         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &cid, &methods];
2209 
2210         #[cfg(target_arch = "x86_64")]
2211         let topology = self.get_vcpu_topology();
2212         let mut cpu_devices = Vec::new();
2213         for cpu_id in 0..self.config.max_vcpus {
2214             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2215             let cpu_device = Cpu {
2216                 cpu_id,
2217                 proximity_domain,
2218                 dynamic: self.dynamic,
2219                 #[cfg(target_arch = "x86_64")]
2220                 topology,
2221             };
2222 
2223             cpu_devices.push(cpu_device);
2224         }
2225 
2226         for cpu_device in cpu_devices.iter() {
2227             cpu_data_inner.push(cpu_device);
2228         }
2229 
2230         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2231     }
2232 }
2233 
2234 impl Pausable for CpuManager {
2235     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2236         // Tell the vCPUs to pause themselves next time they exit
2237         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2238 
2239         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2240         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2241         // above.
2242         for state in self.vcpu_states.iter() {
2243             state.signal_thread();
2244         }
2245 
2246         for vcpu in self.vcpus.iter() {
2247             let mut vcpu = vcpu.lock().unwrap();
2248             vcpu.pause()?;
2249             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2250             if !self.config.kvm_hyperv {
2251                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2252                     MigratableError::Pause(anyhow!(
2253                         "Could not notify guest it has been paused {:?}",
2254                         e
2255                     ))
2256                 })?;
2257             }
2258         }
2259 
2260         // The vCPU thread will change its paused state before parking; wait here for
2261         // each activated vCPU to change its state, to ensure they have all parked.
2262         for state in self.vcpu_states.iter() {
2263             if state.active() {
2264                 while !state.paused.load(Ordering::SeqCst) {
2265                     // To avoid a priority inversion with the vCPU thread
2266                     thread::sleep(std::time::Duration::from_millis(1));
2267                 }
2268             }
2269         }
2270 
2271         Ok(())
2272     }
2273 
2274     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2275         for vcpu in self.vcpus.iter() {
2276             vcpu.lock().unwrap().resume()?;
2277         }
2278 
2279         // Toggle the vCPUs pause boolean
2280         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2281 
2282         // Unpark all the vCPU threads.
2283         // Once unparked, the next thing they will do is check the pause
2284         // boolean. Since it will be set to false, they will exit their pause loop
2285         // and resume running the guest.
2286         for state in self.vcpu_states.iter() {
2287             state.paused.store(false, Ordering::SeqCst);
2288             state.unpark_thread();
2289         }
2290         Ok(())
2291     }
2292 }
2293 
2294 impl Snapshottable for CpuManager {
2295     fn id(&self) -> String {
2296         CPU_MANAGER_SNAPSHOT_ID.to_string()
2297     }
2298 
2299     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2300         let mut cpu_manager_snapshot = Snapshot::default();
2301 
2302         // The CpuManager snapshot is a collection of all vCPU snapshots.
2303         for vcpu in &self.vcpus {
2304             let mut vcpu = vcpu.lock().unwrap();
2305             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2306         }
2307 
2308         Ok(cpu_manager_snapshot)
2309     }
2310 }
2311 
2312 impl Transportable for CpuManager {}
2313 impl Migratable for CpuManager {}
2314 
2315 #[cfg(feature = "guest_debug")]
2316 impl Debuggable for CpuManager {
2317     #[cfg(feature = "kvm")]
2318     fn set_guest_debug(
2319         &self,
2320         cpu_id: usize,
2321         addrs: &[GuestAddress],
2322         singlestep: bool,
2323     ) -> std::result::Result<(), DebuggableError> {
2324         self.vcpus[cpu_id]
2325             .lock()
2326             .unwrap()
2327             .vcpu
2328             .set_guest_debug(addrs, singlestep)
2329             .map_err(DebuggableError::SetDebug)
2330     }
2331 
2332     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2333         Ok(())
2334     }
2335 
2336     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2337         Ok(())
2338     }
2339 
2340     #[cfg(target_arch = "x86_64")]
2341     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2342         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, R8-R15
2343         let gregs = self
2344             .get_regs(cpu_id as u8)
2345             .map_err(DebuggableError::ReadRegs)?;
2346         let regs = [
2347             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
2348             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
2349         ];
2350 
2351         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2352         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2353         let eflags = gregs.rflags as u32;
2354         let rip = gregs.rip;
2355 
2356         // Segment registers: CS, SS, DS, ES, FS, GS
2357         let sregs = self
2358             .get_sregs(cpu_id as u8)
2359             .map_err(DebuggableError::ReadRegs)?;
2360         let segments = X86SegmentRegs {
2361             cs: sregs.cs.selector as u32,
2362             ss: sregs.ss.selector as u32,
2363             ds: sregs.ds.selector as u32,
2364             es: sregs.es.selector as u32,
2365             fs: sregs.fs.selector as u32,
2366             gs: sregs.gs.selector as u32,
2367         };
2368 
2369         // TODO: Add other registers
2370 
2371         Ok(CoreRegs {
2372             regs,
2373             eflags,
2374             rip,
2375             segments,
2376             ..Default::default()
2377         })
2378     }
2379 
2380     #[cfg(target_arch = "aarch64")]
2381     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2382         let gregs = self
2383             .get_regs(cpu_id as u8)
2384             .map_err(DebuggableError::ReadRegs)?;
2385         Ok(CoreRegs {
2386             x: gregs.regs.regs,
2387             sp: gregs.regs.sp,
2388             pc: gregs.regs.pc,
2389             ..Default::default()
2390         })
2391     }
2392 
2393     #[cfg(target_arch = "x86_64")]
2394     fn write_regs(
2395         &self,
2396         cpu_id: usize,
2397         regs: &CoreRegs,
2398     ) -> std::result::Result<(), DebuggableError> {
2399         let orig_gregs = self
2400             .get_regs(cpu_id as u8)
2401             .map_err(DebuggableError::ReadRegs)?;
2402         let gregs = StandardRegisters {
2403             rax: regs.regs[0],
2404             rbx: regs.regs[1],
2405             rcx: regs.regs[2],
2406             rdx: regs.regs[3],
2407             rsi: regs.regs[4],
2408             rdi: regs.regs[5],
2409             rbp: regs.regs[6],
2410             rsp: regs.regs[7],
2411             r8: regs.regs[8],
2412             r9: regs.regs[9],
2413             r10: regs.regs[10],
2414             r11: regs.regs[11],
2415             r12: regs.regs[12],
2416             r13: regs.regs[13],
2417             r14: regs.regs[14],
2418             r15: regs.regs[15],
2419             rip: regs.rip,
2420             // Update the lower 32 bits of rflags.
2421             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2422         };
2423 
2424         self.set_regs(cpu_id as u8, &gregs)
2425             .map_err(DebuggableError::WriteRegs)?;
2426 
2427         // Segment registers: CS, SS, DS, ES, FS, GS
2428         // Since GDB cares only about the selectors, we call get_sregs() first.
2429         let mut sregs = self
2430             .get_sregs(cpu_id as u8)
2431             .map_err(DebuggableError::ReadRegs)?;
2432         sregs.cs.selector = regs.segments.cs as u16;
2433         sregs.ss.selector = regs.segments.ss as u16;
2434         sregs.ds.selector = regs.segments.ds as u16;
2435         sregs.es.selector = regs.segments.es as u16;
2436         sregs.fs.selector = regs.segments.fs as u16;
2437         sregs.gs.selector = regs.segments.gs as u16;
2438 
2439         self.set_sregs(cpu_id as u8, &sregs)
2440             .map_err(DebuggableError::WriteRegs)?;
2441 
2442         // TODO: Add other registers
2443 
2444         Ok(())
2445     }
2446 
2447     #[cfg(target_arch = "aarch64")]
2448     fn write_regs(
2449         &self,
2450         cpu_id: usize,
2451         regs: &CoreRegs,
2452     ) -> std::result::Result<(), DebuggableError> {
2453         let mut gregs = self
2454             .get_regs(cpu_id as u8)
2455             .map_err(DebuggableError::ReadRegs)?;
2456 
2457         gregs.regs.regs = regs.x;
2458         gregs.regs.sp = regs.sp;
2459         gregs.regs.pc = regs.pc;
2460 
2461         self.set_regs(cpu_id as u8, &gregs)
2462             .map_err(DebuggableError::WriteRegs)?;
2463 
2464         Ok(())
2465     }
2466 
2467     fn read_mem(
2468         &self,
2469         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2470         cpu_id: usize,
2471         vaddr: GuestAddress,
2472         len: usize,
2473     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2474         let mut buf = vec![0; len];
2475         let mut total_read = 0_u64;
2476 
2477         while total_read < len as u64 {
2478             let gaddr = vaddr.0 + total_read;
2479             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2480                 Ok(paddr) => paddr,
2481                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2482                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2483             };
2484             let psize = arch::PAGE_SIZE as u64;
2485             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2486             guest_memory
2487                 .memory()
2488                 .read(
2489                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2490                     GuestAddress(paddr),
2491                 )
2492                 .map_err(DebuggableError::ReadMem)?;
2493             total_read += read_len;
2494         }
2495         Ok(buf)
2496     }
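         // Chunking example for the loop above (illustrative): with 4KiB pages,
         // reading 16 bytes from a GVA whose translated GPA ends in 0xffc is
         // split into a 4-byte read up to the page boundary, then a 12-byte
         // read from the separately translated next page.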
2497 
2498     fn write_mem(
2499         &self,
2500         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2501         cpu_id: usize,
2502         vaddr: &GuestAddress,
2503         data: &[u8],
2504     ) -> std::result::Result<(), DebuggableError> {
2505         let mut total_written = 0_u64;
2506 
2507         while total_written < data.len() as u64 {
2508             let gaddr = vaddr.0 + total_written;
2509             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2510                 Ok(paddr) => paddr,
2511                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2512                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2513             };
2514             let psize = arch::PAGE_SIZE as u64;
2515             let write_len = std::cmp::min(
2516                 data.len() as u64 - total_written,
2517                 psize - (paddr & (psize - 1)),
2518             );
2519             guest_memory
2520                 .memory()
2521                 .write(
2522                     &data[total_written as usize..total_written as usize + write_len as usize],
2523                     GuestAddress(paddr),
2524                 )
2525                 .map_err(DebuggableError::WriteMem)?;
2526             total_written += write_len;
2527         }
2528         Ok(())
2529     }
2530 
2531     fn active_vcpus(&self) -> usize {
2532         self.present_vcpus() as usize
2533     }
2534 }
2535 
2536 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2537 impl Elf64Writable for CpuManager {}
2538 
2539 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2540 impl CpuElf64Writable for CpuManager {
2541     fn cpu_write_elf64_note(
2542         &mut self,
2543         dump_state: &DumpState,
2544     ) -> std::result::Result<(), GuestDebuggableError> {
2545         let mut coredump_file = dump_state.file.as_ref().unwrap();
2546         for vcpu in &self.vcpus {
2547             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2548             let mut pos: usize = 0;
2549             let mut buf = vec![0; note_size as usize];
2550             let descsz = size_of::<X86_64ElfPrStatus>();
2551             let vcpu_id = vcpu.lock().unwrap().id;
2552 
2553             let note = Elf64_Nhdr {
2554                 n_namesz: COREDUMP_NAME_SIZE,
2555                 n_descsz: descsz as u32,
2556                 n_type: NT_PRSTATUS,
2557             };
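                 // The buffer assembled below follows the ELF note layout
                 // (illustrative):
                 //
                 //     [Elf64_Nhdr][name "CORE", padded to 4][desc: X86_64ElfPrStatus]
                 //
                 // pr_pid sits at offset 32 inside the descriptor and the user
                 // registers occupy its tail (followed by one trailing u64),
                 // which is why `pos` is advanced by
                 // descsz - size_of::<X86_64UserRegs>() - size_of::<u64>()
                 // before the registers are spliced in.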
2558 
2559             let bytes: &[u8] = note.as_slice();
2560             buf.splice(0.., bytes.to_vec());
2561             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2562             buf.resize(pos + 4, 0);
2563             buf.splice(pos.., "CORE".to_string().into_bytes());
2564 
2565             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2566             buf.resize(pos + 32 + 4, 0);
2567             let pid = vcpu_id as u64;
2568             let bytes: &[u8] = pid.as_slice();
2569             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2570 
2571             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2572 
2573             let orig_rax: u64 = 0;
2574             let gregs = self.vcpus[usize::from(vcpu_id)]
2575                 .lock()
2576                 .unwrap()
2577                 .vcpu
2578                 .get_regs()
2579                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2580 
2581             let regs1 = [
2582                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2583                 gregs.r10,
2584             ];
2585             let regs2 = [
2586                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2587             ];
2588 
2589             let sregs = self.vcpus[usize::from(vcpu_id)]
2590                 .lock()
2591                 .unwrap()
2592                 .vcpu
2593                 .get_sregs()
2594                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2595 
2596             debug!(
2597                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2598                 gregs.rip,
2599                 gregs.rsp,
2600                 sregs.gs.base,
2601                 sregs.cs.selector,
2602                 sregs.ss.selector,
2603                 sregs.ds.selector,
2604             );
2605 
2606             let regs = X86_64UserRegs {
2607                 regs1,
2608                 regs2,
2609                 rip: gregs.rip,
2610                 cs: sregs.cs.selector as u64,
2611                 eflags: gregs.rflags,
2612                 rsp: gregs.rsp,
2613                 ss: sregs.ss.selector as u64,
2614                 fs_base: sregs.fs.base,
2615                 gs_base: sregs.gs.base,
2616                 ds: sregs.ds.selector as u64,
2617                 es: sregs.es.selector as u64,
2618                 fs: sregs.fs.selector as u64,
2619                 gs: sregs.gs.selector as u64,
2620             };
2621 
2623             let bytes: &[u8] = regs.as_slice();
2624             buf.resize(note_size as usize, 0);
2625             buf.splice(pos.., bytes.to_vec());
2626             buf.resize(note_size as usize, 0);
2627 
2628             coredump_file
2629                 .write_all(&buf)
2630                 .map_err(GuestDebuggableError::CoredumpFile)?;
2631         }
2632 
2633         Ok(())
2634     }
2635 
2636     fn cpu_write_vmm_note(
2637         &mut self,
2638         dump_state: &DumpState,
2639     ) -> std::result::Result<(), GuestDebuggableError> {
2640         let mut coredump_file = dump_state.file.as_ref().unwrap();
2641         for vcpu in &self.vcpus {
2642             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2643             let mut pos: usize = 0;
2644             let mut buf = vec![0; note_size as usize];
2645             let descsz = size_of::<DumpCpusState>();
2646             let vcpu_id = vcpu.lock().unwrap().id;
2647 
2648             let note = Elf64_Nhdr {
2649                 n_namesz: COREDUMP_NAME_SIZE,
2650                 n_descsz: descsz as u32,
2651                 n_type: 0,
2652             };
2653 
2654             let bytes: &[u8] = note.as_slice();
2655             buf.splice(0.., bytes.to_vec());
2656             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2657 
2658             buf.resize(pos + 4, 0);
2659             buf.splice(pos.., "QEMU".to_string().into_bytes());
2660 
2661             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2662 
2663             let gregs = self.vcpus[usize::from(vcpu_id)]
2664                 .lock()
2665                 .unwrap()
2666                 .vcpu
2667                 .get_regs()
2668                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2669 
2670             let regs1 = [
2671                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2672                 gregs.rbp,
2673             ];
2674 
2675             let regs2 = [
2676                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2677                 gregs.r15,
2678             ];
2679 
2680             let sregs = self.vcpus[usize::from(vcpu_id)]
2681                 .lock()
2682                 .unwrap()
2683                 .vcpu
2684                 .get_sregs()
2685                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2686 
2687             let mut msrs = vec![MsrEntry {
2688                 index: msr_index::MSR_KERNEL_GS_BASE,
2689                 ..Default::default()
2690             }];
2691 
2692             self.vcpus[vcpu_id as usize]
2693                 .lock()
2694                 .unwrap()
2695                 .vcpu
2696                 .get_msrs(&mut msrs)
2697                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2698             let kernel_gs_base = msrs[0].data;
2699 
2700             let cs = CpuSegment::new(sregs.cs);
2701             let ds = CpuSegment::new(sregs.ds);
2702             let es = CpuSegment::new(sregs.es);
2703             let fs = CpuSegment::new(sregs.fs);
2704             let gs = CpuSegment::new(sregs.gs);
2705             let ss = CpuSegment::new(sregs.ss);
2706             let ldt = CpuSegment::new(sregs.ldt);
2707             let tr = CpuSegment::new(sregs.tr);
2708             let gdt = CpuSegment::new_from_table(sregs.gdt);
2709             let idt = CpuSegment::new_from_table(sregs.idt);
2710             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2711             let regs = DumpCpusState {
2712                 version: 1,
2713                 size: size_of::<DumpCpusState>() as u32,
2714                 regs1,
2715                 regs2,
2716                 rip: gregs.rip,
2717                 rflags: gregs.rflags,
2718                 cs,
2719                 ds,
2720                 es,
2721                 fs,
2722                 gs,
2723                 ss,
2724                 ldt,
2725                 tr,
2726                 gdt,
2727                 idt,
2728                 cr,
2729                 kernel_gs_base,
2730             };
2731 
2732             let bytes: &[u8] = regs.as_slice();
2733             buf.resize(note_size as usize, 0);
2734             buf.splice(pos.., bytes.to_vec());
2735             buf.resize(note_size as usize, 0);
2736 
2737             coredump_file
2738                 .write_all(&buf)
2739                 .map_err(GuestDebuggableError::CoredumpFile)?;
2740         }
2741 
2742         Ok(())
2743     }
2744 }
2745 
2746 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2747 #[cfg(test)]
2748 mod tests {
2749     use arch::layout::BOOT_STACK_POINTER;
2750     use arch::layout::ZERO_PAGE_START;
2751     use arch::x86_64::interrupts::*;
2752     use arch::x86_64::regs::*;
2753     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2754     use linux_loader::loader::bootparam::setup_header;
2755 
2756     #[test]
2757     fn test_setlint() {
2758         let hv = hypervisor::new().unwrap();
2759         let vm = hv.create_vm().expect("new VM fd creation failed");
2760         assert!(hv.check_required_extensions().is_ok());
2761         // Calling get_lapic will fail if no irqchip has been created beforehand.
2762         assert!(vm.create_irq_chip().is_ok());
2763         let vcpu = vm.create_vcpu(0, None).unwrap();
2764         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2765 
2766         // Compute the values that are expected to represent LVT0 and LVT1.
2767         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2768         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2769         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2770         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2771 
2772         set_lint(&vcpu).unwrap();
2773 
2774         // Compute the values that represent LVT0 and LVT1 after set_lint.
2775         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2776         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2777         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2778         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2779         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2780     }
2781 
2782     #[test]
2783     fn test_setup_fpu() {
2784         let hv = hypervisor::new().unwrap();
2785         let vm = hv.create_vm().expect("new VM fd creation failed");
2786         let vcpu = vm.create_vcpu(0, None).unwrap();
2787         setup_fpu(&vcpu).unwrap();
2788 
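        // 0x37f and 0x1f80 are the architectural reset values of FCW and MXCSR.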
2789         let expected_fpu: FpuState = FpuState {
2790             fcw: 0x37f,
2791             mxcsr: 0x1f80,
2792             ..Default::default()
2793         };
2794         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2795         // TODO: auto-generate KVM-related structures with PartialEq derived.
2796         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2797         // Setting the mxcsr register from FpuState inside setup_fpu has no effect:
2798         // see 'kvm_arch_vcpu_ioctl_set_fpu' in arch/x86/kvm/x86.c.
2799         // The mxcsr stays 0, so the assert below would fail; we still need to decide
2800         // whether the check should be removed entirely.
2801         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2802     }
2803 
2804     #[test]
2805     fn test_setup_msrs() {
2806         use hypervisor::arch::x86::{msr_index, MsrEntry};
2807 
2808         let hv = hypervisor::new().unwrap();
2809         let vm = hv.create_vm().expect("new VM fd creation failed");
2810         let vcpu = vm.create_vcpu(0, None).unwrap();
2811         setup_msrs(&vcpu).unwrap();
2812 
2813         // This test will check against the last MSR entry configured (the tenth one).
2814         // See create_msr_entries for details.
2815         let mut msrs = vec![MsrEntry {
2816             index: msr_index::MSR_IA32_MISC_ENABLE,
2817             ..Default::default()
2818         }];
2819 
2820         // get_msrs returns the number of MSRs it succeeded in reading. We only want
2821         // to read one in this test.
2822         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2823         assert_eq!(read_msrs, 1);
2824 
2825         // Official entries that were set up when we called setup_msrs. We need to
2826         // assert that the tenth one (i.e. the one with index
2827         // msr_index::MSR_IA32_MISC_ENABLE) has the data we expect.
2828         let entry_vec = vcpu.boot_msr_entries();
2829         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2830     }
2831 
2832     #[test]
2833     fn test_setup_regs_for_pvh() {
2834         let hv = hypervisor::new().unwrap();
2835         let vm = hv.create_vm().expect("new VM fd creation failed");
2836         let vcpu = vm.create_vcpu(0, None).unwrap();
2837 
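        // Per the PVH boot ABI, rbx holds the address of the start_info structure and
        // rflags has only its always-one bit (bit 1) set; rip is a dummy entry point.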
2838         let expected_regs: StandardRegisters = StandardRegisters {
2839             rflags: 0x0000000000000002u64,
2840             rbx: arch::layout::PVH_INFO_START.0,
2841             rip: 1,
2842             ..Default::default()
2843         };
2844 
2845         setup_regs(
2846             &vcpu,
2847             arch::EntryPoint {
2848                 entry_addr: vm_memory::GuestAddress(expected_regs.rip),
2849                 setup_header: None,
2850             },
2851         )
2852         .unwrap();
2853 
2854         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2855         assert_eq!(actual_regs, expected_regs);
2856     }
2857 
2858     #[test]
2859     fn test_setup_regs_for_bzimage() {
2860         let hv = hypervisor::new().unwrap();
2861         let vm = hv.create_vm().expect("new VM fd creation failed");
2862         let vcpu = vm.create_vcpu(0, None).unwrap();
2863 
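        // Per the Linux boot protocol, rsi points at the zero page (boot_params) and
        // rsp at the boot stack.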
2864         let expected_regs: StandardRegisters = StandardRegisters {
2865             rflags: 0x0000000000000002u64,
2866             rip: 1,
2867             rsp: BOOT_STACK_POINTER.0,
2868             rsi: ZERO_PAGE_START.0,
2869             ..Default::default()
2870         };
2871 
2872         setup_regs(
2873             &vcpu,
2874             arch::EntryPoint {
2875                 entry_addr: vm_memory::GuestAddress(expected_regs.rip),
2876                 setup_header: Some(setup_header {
2877                     ..Default::default()
2878                 }),
2879             },
2880         )
2881         .unwrap();
2882 
2883         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2884         assert_eq!(actual_regs, expected_regs);
2885     }
2886 }
2887 
2888 #[cfg(target_arch = "aarch64")]
2889 #[cfg(test)]
2890 mod tests {
2891     use arch::{aarch64::regs, layout};
2892     use hypervisor::kvm::aarch64::is_system_register;
2893     use hypervisor::kvm::kvm_bindings::{
2894         kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2895         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2896     };
2897     use hypervisor::{arm64_core_reg_id, offset_of};
2898     use std::mem;
2899 
2900     #[test]
2901     fn test_setup_regs() {
2902         let hv = hypervisor::new().unwrap();
2903         let vm = hv.create_vm().unwrap();
2904         let vcpu = vm.create_vcpu(0, None).unwrap();
2905 
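        // The arguments here are the vCPU index, the boot address and the FDT load
        // address.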
2906         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2907         // Must fail while the vCPU is not yet initialized.
2908         assert!(res.is_err());
2909 
2910         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2911         vm.get_preferred_target(&mut kvi).unwrap();
2912         vcpu.vcpu_init(&kvi).unwrap();
2913 
2914         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2915     }
2916 
2917     #[test]
2918     fn test_read_mpidr() {
2919         let hv = hypervisor::new().unwrap();
2920         let vm = hv.create_vm().unwrap();
2921         let vcpu = vm.create_vcpu(0, None).unwrap();
2922         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2923         vm.get_preferred_target(&mut kvi).unwrap();
2924 
2925         // Must fail while the vCPU is not yet initialized.
2926         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2927 
2928         vcpu.vcpu_init(&kvi).unwrap();
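        // vCPU 0 has all affinity fields zero; bit 31 of MPIDR_EL1 is RES1,
        // hence the expected value 0x80000000.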
2929         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2930     }
2931 
2932     #[test]
2933     fn test_is_system_register() {
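        // An id built from a core-register offset (pc in user_pt_regs) must not be
        // classified as a system register, while an id carrying KVM_REG_ARM64_SYSREG
        // must.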
2934         let offset = offset_of!(user_pt_regs, pc);
2935         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2936         assert!(!is_system_register(regid));
2937         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2938         assert!(is_system_register(regid));
2939     }
2940 
2941     #[test]
2942     fn test_save_restore_core_regs() {
2943         let hv = hypervisor::new().unwrap();
2944         let vm = hv.create_vm().unwrap();
2945         let vcpu = vm.create_vcpu(0, None).unwrap();
2946         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2947         vm.get_preferred_target(&mut kvi).unwrap();
2948 
2949         // Must fail while the vCPU is not yet initialized.
2950         let res = vcpu.get_regs();
2951         assert!(res.is_err());
2952         assert_eq!(
2953             format!("{}", res.unwrap_err()),
2954             "Failed to get core register: Exec format error (os error 8)"
2955         );
2956 
2957         let mut state = kvm_regs::default();
2958         let res = vcpu.set_regs(&state);
2959         assert!(res.is_err());
2960         assert_eq!(
2961             format!("{}", res.unwrap_err()),
2962             "Failed to set core register: Exec format error (os error 8)"
2963         );
2964 
2965         vcpu.vcpu_init(&kvi).unwrap();
2966         let res = vcpu.get_regs();
2967         assert!(res.is_ok());
2968         state = res.unwrap();
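        // 0x3C5 is the reset PSTATE: EL1h (0x5) with the D, A, I and F bits (0x3C0)
        // masked.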
2969         assert_eq!(state.regs.pstate, 0x3C5);
2970 
2971         assert!(vcpu.set_regs(&state).is_ok());
2972     }
2973 
2974     #[test]
2975     fn test_get_set_mpstate() {
2976         let hv = hypervisor::new().unwrap();
2977         let vm = hv.create_vm().unwrap();
2978         let vcpu = vm.create_vcpu(0, None).unwrap();
2979         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2980         vm.get_preferred_target(&mut kvi).unwrap();
2981 
2982         let res = vcpu.get_mp_state();
2983         assert!(res.is_ok());
2984         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2985     }
2986 }
2987