xref: /cloud-hypervisor/vmm/src/cpu.rs (revision 655d512523353961a67cf19cf3bc227d403898f0)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use std::collections::BTreeMap;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use std::io::Write;
17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
18 use std::mem::size_of;
19 use std::os::unix::thread::JoinHandleExt;
20 use std::sync::atomic::{AtomicBool, Ordering};
21 use std::sync::{Arc, Barrier, Mutex};
22 use std::{cmp, io, result, thread};
23 
24 use acpi_tables::sdt::Sdt;
25 use acpi_tables::{aml, Aml};
26 use anyhow::anyhow;
27 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
28 use arch::aarch64::regs;
29 #[cfg(target_arch = "x86_64")]
30 use arch::x86_64::get_x2apic_id;
31 use arch::{EntryPoint, NumaNodes};
32 #[cfg(target_arch = "aarch64")]
33 use devices::gic::Gic;
34 use devices::interrupt_controller::InterruptController;
35 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
36 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
37 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
38 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
39 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
40 use hypervisor::arch::x86::msr_index;
41 #[cfg(target_arch = "x86_64")]
42 use hypervisor::arch::x86::CpuIdEntry;
43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
44 use hypervisor::arch::x86::MsrEntry;
45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
46 use hypervisor::arch::x86::SpecialRegisters;
47 #[cfg(feature = "tdx")]
48 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
49 #[cfg(target_arch = "x86_64")]
50 use hypervisor::CpuVendor;
51 #[cfg(feature = "kvm")]
52 use hypervisor::HypervisorType;
53 #[cfg(feature = "guest_debug")]
54 use hypervisor::StandardRegisters;
55 use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
56 use libc::{c_void, siginfo_t};
57 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
58 use linux_loader::elf::Elf64_Nhdr;
59 use seccompiler::{apply_filter, SeccompAction};
60 use thiserror::Error;
61 use tracer::trace_scoped;
62 use vm_device::BusDevice;
63 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
64 use vm_memory::ByteValued;
65 #[cfg(feature = "guest_debug")]
66 use vm_memory::{Bytes, GuestAddressSpace};
67 use vm_memory::{GuestAddress, GuestMemoryAtomic};
68 use vm_migration::{
69     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
70     Transportable,
71 };
72 use vmm_sys_util::eventfd::EventFd;
73 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
74 use zerocopy::AsBytes;
75 
76 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
77 use crate::coredump::{
78     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
79     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
80     NT_PRSTATUS,
81 };
82 #[cfg(feature = "guest_debug")]
83 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
84 #[cfg(target_arch = "x86_64")]
85 use crate::memory_manager::MemoryManager;
86 use crate::seccomp_filters::{get_seccomp_filter, Thread};
87 #[cfg(target_arch = "x86_64")]
88 use crate::vm::physical_bits;
89 use crate::vm_config::CpusConfig;
90 use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID};
91 
92 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
93 /// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero-based) of `6u64`, the
/// following expression should return 3 (`0b11`):
96 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
97 ///
98 macro_rules! extract_bits_64 {
99     ($value: tt, $offset: tt, $length: tt) => {
100         ($value >> $offset) & (!0u64 >> (64 - $length))
101     };
102 }
103 
104 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
105 macro_rules! extract_bits_64_without_offset {
106     ($value: tt, $length: tt) => {
107         $value & (!0u64 >> (64 - $length))
108     };
109 }
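
// Editor's sketch, not upstream code: a quick sanity check of the two
// bit-extraction macros above, gated on the same cfg as the macros themselves.
#[cfg(all(target_arch = "aarch64", feature = "guest_debug", test))]
mod extract_bits_tests {
    #[test]
    fn extract_bits_examples() {
        // Take 2 bits starting at bit 1 of 0b0110 -> 0b11.
        assert_eq!(extract_bits_64!(0b0000_0110u64, 1, 2), 0b11);
        // Keep only the low 3 bits of 0b1110 -> 0b110.
        assert_eq!(extract_bits_64_without_offset!(0b0000_1110u64, 3), 0b110);
    }
}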
110 
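/// Size in bytes of the MMIO register region exposed by the CpuManager
/// device (see the `BusDevice` implementation below).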
111 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
112 
113 #[derive(Debug, Error)]
114 pub enum Error {
115     #[error("Error creating vCPU: {0}")]
116     VcpuCreate(#[source] anyhow::Error),
117 
    #[error("Error running vCPU: {0}")]
119     VcpuRun(#[source] anyhow::Error),
120 
121     #[error("Error spawning vCPU thread: {0}")]
122     VcpuSpawn(#[source] io::Error),
123 
124     #[error("Error generating common CPUID: {0}")]
125     CommonCpuId(#[source] arch::Error),
126 
127     #[error("Error configuring vCPU: {0}")]
128     VcpuConfiguration(#[source] arch::Error),
129 
    #[error("vCPU removal still pending")]
131     VcpuPendingRemovedVcpu,
132 
133     #[cfg(target_arch = "aarch64")]
134     #[error("Error fetching preferred target: {0}")]
135     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
136 
137     #[cfg(target_arch = "aarch64")]
138     #[error("Error setting vCPU processor features: {0}")]
139     VcpuSetProcessorFeatures(#[source] hypervisor::HypervisorCpuError),
140 
141     #[cfg(target_arch = "aarch64")]
142     #[error("Error initialising vCPU: {0}")]
143     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
144 
145     #[cfg(target_arch = "aarch64")]
146     #[error("Error finalising vCPU: {0}")]
147     VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError),
148 
149     #[error("Failed to join on vCPU threads: {0:?}")]
150     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
151 
152     #[error("Error adding CpuManager to MMIO bus: {0}")]
153     BusError(#[source] vm_device::BusError),
154 
155     #[error("Requested vCPUs exceed maximum")]
156     DesiredVCpuCountExceedsMax,
157 
158     #[error("Cannot create seccomp filter: {0}")]
159     CreateSeccompFilter(#[source] seccompiler::Error),
160 
161     #[error("Cannot apply seccomp filter: {0}")]
162     ApplySeccompFilter(#[source] seccompiler::Error),
163 
164     #[error("Error starting vCPU after restore: {0}")]
165     StartRestoreVcpu(#[source] anyhow::Error),
166 
167     #[error("Unexpected VmExit")]
168     UnexpectedVmExit,
169 
170     #[error("Failed to allocate MMIO address for CpuManager")]
171     AllocateMmmioAddress,
172 
173     #[cfg(feature = "tdx")]
174     #[error("Error initializing TDX: {0}")]
175     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
176 
177     #[cfg(target_arch = "aarch64")]
178     #[error("Error initializing PMU: {0}")]
179     InitPmu(#[source] hypervisor::HypervisorCpuError),
180 
181     #[cfg(feature = "guest_debug")]
182     #[error("Error during CPU debug: {0}")]
183     CpuDebug(#[source] hypervisor::HypervisorCpuError),
184 
185     #[cfg(feature = "guest_debug")]
186     #[error("Error translating virtual address: {0}")]
187     TranslateVirtualAddress(#[source] anyhow::Error),
188 
189     #[cfg(target_arch = "x86_64")]
190     #[error("Error setting up AMX: {0}")]
191     AmxEnable(#[source] anyhow::Error),
192 
193     #[error("Maximum number of vCPUs exceeds host limit")]
194     MaximumVcpusExceeded,
195 
196     #[cfg(feature = "sev_snp")]
197     #[error("Failed to set sev control register: {0}")]
198     SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),
199 
200     #[cfg(target_arch = "x86_64")]
    #[error("Failed to inject NMI: {0}")]
    NmiError(#[source] hypervisor::HypervisorCpuError),
203 }
204 pub type Result<T> = result::Result<T, Error>;
205 
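// ACPI MADT interrupt controller structure definitions. They are
// `#[repr(C, packed)]` and implement `AsBytes` so that create_madt() below
// can append them to the table verbatim.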
206 #[cfg(target_arch = "x86_64")]
207 #[allow(dead_code)]
208 #[repr(C, packed)]
209 #[derive(AsBytes)]
210 struct LocalX2Apic {
211     pub r#type: u8,
212     pub length: u8,
213     pub _reserved: u16,
214     pub apic_id: u32,
215     pub flags: u32,
216     pub processor_id: u32,
217 }
218 
219 #[allow(dead_code)]
220 #[repr(C, packed)]
221 #[derive(Default, AsBytes)]
222 struct Ioapic {
223     pub r#type: u8,
224     pub length: u8,
225     pub ioapic_id: u8,
226     _reserved: u8,
227     pub apic_address: u32,
228     pub gsi_base: u32,
229 }
230 
231 #[cfg(target_arch = "aarch64")]
232 #[allow(dead_code)]
233 #[repr(C, packed)]
234 #[derive(AsBytes)]
235 struct GicC {
236     pub r#type: u8,
237     pub length: u8,
238     pub reserved0: u16,
239     pub cpu_interface_number: u32,
240     pub uid: u32,
241     pub flags: u32,
242     pub parking_version: u32,
243     pub performance_interrupt: u32,
244     pub parked_address: u64,
245     pub base_address: u64,
246     pub gicv_base_address: u64,
247     pub gich_base_address: u64,
248     pub vgic_interrupt: u32,
249     pub gicr_base_address: u64,
250     pub mpidr: u64,
251     pub proc_power_effi_class: u8,
252     pub reserved1: u8,
253     pub spe_overflow_interrupt: u16,
254 }
255 
256 #[cfg(target_arch = "aarch64")]
257 #[allow(dead_code)]
258 #[repr(C, packed)]
259 #[derive(AsBytes)]
260 struct GicD {
261     pub r#type: u8,
262     pub length: u8,
263     pub reserved0: u16,
264     pub gic_id: u32,
265     pub base_address: u64,
266     pub global_irq_base: u32,
267     pub version: u8,
268     pub reserved1: [u8; 3],
269 }
270 
271 #[cfg(target_arch = "aarch64")]
272 #[allow(dead_code)]
273 #[repr(C, packed)]
274 #[derive(AsBytes)]
275 struct GicR {
276     pub r#type: u8,
277     pub length: u8,
278     pub reserved: u16,
279     pub base_address: u64,
280     pub range_length: u32,
281 }
282 
283 #[cfg(target_arch = "aarch64")]
284 #[allow(dead_code)]
285 #[repr(C, packed)]
286 #[derive(AsBytes)]
287 struct GicIts {
288     pub r#type: u8,
289     pub length: u8,
290     pub reserved0: u16,
291     pub translation_id: u32,
292     pub base_address: u64,
293     pub reserved1: u32,
294 }
295 
296 #[cfg(target_arch = "aarch64")]
297 #[allow(dead_code)]
298 #[repr(C, packed)]
299 #[derive(AsBytes)]
300 struct ProcessorHierarchyNode {
301     pub r#type: u8,
302     pub length: u8,
303     pub reserved: u16,
304     pub flags: u32,
305     pub parent: u32,
306     pub acpi_processor_id: u32,
307     pub num_private_resources: u32,
308 }
309 
310 #[allow(dead_code)]
311 #[repr(C, packed)]
312 #[derive(Default, AsBytes)]
313 struct InterruptSourceOverride {
314     pub r#type: u8,
315     pub length: u8,
316     pub bus: u8,
317     pub source: u8,
318     pub gsi: u32,
319     pub flags: u16,
320 }
321 
322 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
323 macro_rules! round_up {
324     ($n:expr,$d:expr) => {
        // Round $n up to the next multiple of $d.
        (($n + $d - 1) / $d) * $d
326     };
327 }
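
// Editor's sketch, not upstream code: sanity checks for round_up!, which
// rounds its first argument up to the next multiple of its second.
#[cfg(all(target_arch = "x86_64", feature = "guest_debug", test))]
mod round_up_tests {
    #[test]
    fn round_up_examples() {
        assert_eq!(round_up!(0u64, 4u64), 0);
        assert_eq!(round_up!(9u64, 4u64), 12);
        assert_eq!(round_up!(12u64, 4u64), 12);
    }
}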
328 
/// A wrapper around creating and using a hypervisor-agnostic vCPU.
330 pub struct Vcpu {
331     // The hypervisor abstracted CPU.
332     vcpu: Arc<dyn hypervisor::Vcpu>,
333     id: u8,
334     #[cfg(target_arch = "aarch64")]
335     mpidr: u64,
336     saved_state: Option<CpuState>,
337     #[cfg(target_arch = "x86_64")]
338     vendor: CpuVendor,
339 }
340 
341 impl Vcpu {
342     /// Constructs a new VCPU for `vm`.
343     ///
344     /// # Arguments
345     ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `apic_id` - APIC ID used to register the vCPU with the hypervisor.
    /// * `vm` - The virtual machine this vCPU will get attached to.
    /// * `vm_ops` - Optional object for exit handling.
    /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0).
350     pub fn new(
351         id: u8,
352         apic_id: u8,
353         vm: &Arc<dyn hypervisor::Vm>,
354         vm_ops: Option<Arc<dyn VmOps>>,
355         #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
356     ) -> Result<Self> {
357         let vcpu = vm
358             .create_vcpu(apic_id, vm_ops)
359             .map_err(|e| Error::VcpuCreate(e.into()))?;
        // The per-vCPU CPUID is derived later, via configure(), from the one
        // supported by this VM.
361         Ok(Vcpu {
362             vcpu,
363             id,
364             #[cfg(target_arch = "aarch64")]
365             mpidr: 0,
366             saved_state: None,
367             #[cfg(target_arch = "x86_64")]
368             vendor: cpu_vendor,
369         })
370     }
371 
    /// Configures a vCPU. This should be called once per vCPU, right after creation.
373     ///
374     /// # Arguments
375     ///
376     /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
377     /// * `guest_memory` - Guest memory.
    /// * `cpuid` - (x86_64) The list of CPUID entries to expose to the guest.
379     pub fn configure(
380         &mut self,
381         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
382         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
383         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
384         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
385         #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
386     ) -> Result<()> {
387         #[cfg(target_arch = "aarch64")]
388         {
389             self.init(vm)?;
390             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
391                 .map_err(Error::VcpuConfiguration)?;
392         }
393         info!("Configuring vCPU: cpu_id = {}", self.id);
394         #[cfg(target_arch = "x86_64")]
395         arch::configure_vcpu(
396             &self.vcpu,
397             self.id,
398             boot_setup,
399             cpuid,
400             kvm_hyperv,
401             self.vendor,
402             topology,
403         )
404         .map_err(Error::VcpuConfiguration)?;
405 
406         Ok(())
407     }
408 
409     /// Gets the MPIDR register value.
410     #[cfg(target_arch = "aarch64")]
411     pub fn get_mpidr(&self) -> u64 {
412         self.mpidr
413     }
414 
415     /// Gets the saved vCPU state.
416     #[cfg(target_arch = "aarch64")]
417     pub fn get_saved_state(&self) -> Option<CpuState> {
418         self.saved_state.clone()
419     }
420 
421     /// Initializes an aarch64 specific vcpu for booting Linux.
422     #[cfg(target_arch = "aarch64")]
423     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
424         use std::arch::is_aarch64_feature_detected;
425         #[allow(clippy::nonminimal_bool)]
426         let sve_supported =
427             is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2");
428         let mut kvi = self.vcpu.create_vcpu_init();
429 
430         // This reads back the kernel's preferred target type.
431         vm.get_preferred_target(&mut kvi)
432             .map_err(Error::VcpuArmPreferredTarget)?;
433 
434         self.vcpu
435             .vcpu_set_processor_features(vm, &mut kvi, self.id)
436             .map_err(Error::VcpuSetProcessorFeatures)?;
437 
438         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?;
439 
440         if sve_supported {
441             let finalized_features = self.vcpu.vcpu_get_finalized_features();
442             self.vcpu
443                 .vcpu_finalize(finalized_features)
444                 .map_err(Error::VcpuArmFinalize)?;
445         }
446         Ok(())
447     }
448 
449     /// Runs the VCPU until it exits, returning the reason.
450     ///
    /// Note that the state of the vCPU and associated VM must be set up first for this to do
452     /// anything useful.
453     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
454         self.vcpu.run()
455     }
456 
457     #[cfg(feature = "sev_snp")]
458     pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
459         self.vcpu
460             .set_sev_control_register(vmsa_pfn)
461             .map_err(Error::SetSevControlRegister)
462     }
463 }
464 
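// Editor's sketch, not upstream code: the typical x86_64 flow for creating
// and configuring a single vCPU with the API above. `vm`, `vm_ops`, `cpuid`
// and `cpu_vendor` are assumed to come from the surrounding VMM setup;
// `example_create_and_configure_vcpu` itself is hypothetical.
#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
fn example_create_and_configure_vcpu(
    vm: &Arc<dyn hypervisor::Vm>,
    vm_ops: Arc<dyn VmOps>,
    cpuid: Vec<CpuIdEntry>,
    cpu_vendor: CpuVendor,
) -> Result<Vcpu> {
    // With the default (flat) topology the APIC ID equals the vCPU id;
    // otherwise it would come from arch::x86_64::get_x2apic_id().
    let mut vcpu = Vcpu::new(0, 0, vm, Some(vm_ops), cpu_vendor)?;
    // No boot setup (as on the hotplug path) and no custom topology here:
    // this sets up CPUID and registers only.
    vcpu.configure(None, cpuid, false, None)?;
    Ok(vcpu)
}
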
465 impl Pausable for Vcpu {}
466 impl Snapshottable for Vcpu {
467     fn id(&self) -> String {
468         self.id.to_string()
469     }
470 
471     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
472         let saved_state = self
473             .vcpu
474             .state()
475             .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;
476 
477         self.saved_state = Some(saved_state.clone());
478 
479         Ok(Snapshot::from_data(SnapshotData::new_from_state(
480             &saved_state,
481         )?))
482     }
483 }
484 
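/// Manages the vCPUs of the VM: creation, configuration, the per-vCPU run
/// threads, pause/resume, hotplug and shutdown. Also mapped on the MMIO bus
/// to back the ACPI CPU hotplug protocol (see the `BusDevice` implementation).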
485 pub struct CpuManager {
486     config: CpusConfig,
487     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
488     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
489     #[cfg(target_arch = "x86_64")]
490     cpuid: Vec<CpuIdEntry>,
491     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
492     vm: Arc<dyn hypervisor::Vm>,
493     vcpus_kill_signalled: Arc<AtomicBool>,
494     vcpus_pause_signalled: Arc<AtomicBool>,
495     vcpus_kick_signalled: Arc<AtomicBool>,
496     exit_evt: EventFd,
497     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
498     reset_evt: EventFd,
499     #[cfg(feature = "guest_debug")]
500     vm_debug_evt: EventFd,
501     vcpu_states: Vec<VcpuState>,
502     selected_cpu: u8,
503     vcpus: Vec<Arc<Mutex<Vcpu>>>,
504     seccomp_action: SeccompAction,
505     vm_ops: Arc<dyn VmOps>,
506     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
507     acpi_address: Option<GuestAddress>,
508     proximity_domain_per_cpu: BTreeMap<u8, u32>,
509     affinity: BTreeMap<u8, Vec<usize>>,
510     dynamic: bool,
511     hypervisor: Arc<dyn hypervisor::Hypervisor>,
512     #[cfg(feature = "sev_snp")]
513     sev_snp_enabled: bool,
514 }
515 
516 const CPU_ENABLE_FLAG: usize = 0;
517 const CPU_INSERTING_FLAG: usize = 1;
518 const CPU_REMOVING_FLAG: usize = 2;
519 const CPU_EJECT_FLAG: usize = 3;
520 
521 const CPU_STATUS_OFFSET: u64 = 4;
522 const CPU_SELECTION_OFFSET: u64 = 0;
523 
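// MMIO register interface consumed by the guest's ACPI CPU hotplug code:
// - CPU_SELECTION_OFFSET (0): selects the vCPU that the status register
//   refers to.
// - CPU_STATUS_OFFSET (4): reads report the enable/inserting/removing flags
//   of the selected vCPU; writes acknowledge an insertion or removal, or
//   request ejection via CPU_EJECT_FLAG.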
524 impl BusDevice for CpuManager {
525     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
526         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
527         data.fill(0);
528 
529         match offset {
530             CPU_SELECTION_OFFSET => {
531                 data[0] = self.selected_cpu;
532             }
533             CPU_STATUS_OFFSET => {
534                 if self.selected_cpu < self.max_vcpus() {
535                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
536                     if state.active() {
537                         data[0] |= 1 << CPU_ENABLE_FLAG;
538                     }
539                     if state.inserting {
540                         data[0] |= 1 << CPU_INSERTING_FLAG;
541                     }
542                     if state.removing {
543                         data[0] |= 1 << CPU_REMOVING_FLAG;
544                     }
545                 } else {
546                     warn!("Out of range vCPU id: {}", self.selected_cpu);
547                 }
548             }
549             _ => {
550                 warn!(
                    "Unexpected offset for accessing CPU manager device: {:#x}",
552                     offset
553                 );
554             }
555         }
556     }
557 
558     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
559         match offset {
560             CPU_SELECTION_OFFSET => {
561                 self.selected_cpu = data[0];
562             }
563             CPU_STATUS_OFFSET => {
564                 if self.selected_cpu < self.max_vcpus() {
565                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
566                     // The ACPI code writes back a 1 to acknowledge the insertion
567                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
568                         && state.inserting
569                     {
570                         state.inserting = false;
571                     }
572                     // Ditto for removal
573                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
574                         && state.removing
575                     {
576                         state.removing = false;
577                     }
578                     // Trigger removal of vCPU
579                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
580                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
581                             error!("Error removing vCPU: {:?}", e);
582                         }
583                     }
584                 } else {
585                     warn!("Out of range vCPU id: {}", self.selected_cpu);
586                 }
587             }
588             _ => {
589                 warn!(
                    "Unexpected offset for accessing CPU manager device: {:#x}",
591                     offset
592                 );
593             }
594         }
595         None
596     }
597 }
598 
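/// Bookkeeping for a single vCPU thread: the hotplug handshake flags plus the
/// atomics and join handle shared with the running thread.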
599 #[derive(Default)]
600 struct VcpuState {
601     inserting: bool,
602     removing: bool,
603     pending_removal: Arc<AtomicBool>,
604     handle: Option<thread::JoinHandle<()>>,
605     kill: Arc<AtomicBool>,
606     vcpu_run_interrupted: Arc<AtomicBool>,
607     paused: Arc<AtomicBool>,
608 }
609 
610 impl VcpuState {
611     fn active(&self) -> bool {
612         self.handle.is_some()
613     }
614 
615     fn signal_thread(&self) {
616         if let Some(handle) = self.handle.as_ref() {
617             loop {
618                 // SAFETY: FFI call with correct arguments
619                 unsafe {
620                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
621                 }
622                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
623                     break;
624                 } else {
625                     // This is more effective than thread::yield_now() at
626                     // avoiding a priority inversion with the vCPU thread
627                     thread::sleep(std::time::Duration::from_millis(1));
628                 }
629             }
630         }
631     }
632 
633     fn join_thread(&mut self) -> Result<()> {
634         if let Some(handle) = self.handle.take() {
635             handle.join().map_err(Error::ThreadCleanup)?
636         }
637 
638         Ok(())
639     }
640 
641     fn unpark_thread(&self) {
642         if let Some(handle) = self.handle.as_ref() {
643             handle.thread().unpark()
644         }
645     }
646 }
647 
648 impl CpuManager {
649     #[allow(unused_variables)]
650     #[allow(clippy::too_many_arguments)]
651     pub fn new(
652         config: &CpusConfig,
653         vm: Arc<dyn hypervisor::Vm>,
654         exit_evt: EventFd,
655         reset_evt: EventFd,
656         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
657         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
658         seccomp_action: SeccompAction,
659         vm_ops: Arc<dyn VmOps>,
660         #[cfg(feature = "tdx")] tdx_enabled: bool,
661         numa_nodes: &NumaNodes,
662         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
663     ) -> Result<Arc<Mutex<CpuManager>>> {
664         if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
665             return Err(Error::MaximumVcpusExceeded);
666         }
667 
668         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
669         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
670         let hypervisor_type = hypervisor.hypervisor_type();
671         #[cfg(target_arch = "x86_64")]
672         let cpu_vendor = hypervisor.get_cpu_vendor();
673 
674         #[cfg(target_arch = "x86_64")]
675         if config.features.amx {
676             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
677             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
678             const XFEATURE_XTILEDATA: usize = 18;
679             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
680 
681             // SAFETY: the syscall is only modifying kernel internal
682             // data structures that the kernel is itself expected to safeguard.
683             let amx_tile = unsafe {
684                 libc::syscall(
685                     libc::SYS_arch_prctl,
686                     ARCH_REQ_XCOMP_GUEST_PERM,
687                     XFEATURE_XTILEDATA,
688                 )
689             };
690 
691             if amx_tile != 0 {
692                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
693             } else {
694                 let mask: usize = 0;
695                 // SAFETY: the mask being modified (not marked mutable as it is
696                 // modified in unsafe only which is permitted) isn't in use elsewhere.
697                 let result = unsafe {
698                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
699                 };
700                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
701                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
702                 }
703             }
704         }
705 
706         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
707             let mut cpu_list = Vec::new();
708             for (proximity_domain, numa_node) in numa_nodes.iter() {
709                 for cpu in numa_node.cpus.iter() {
710                     cpu_list.push((*cpu, *proximity_domain))
711                 }
712             }
713             cpu_list
714         }
715         .into_iter()
716         .collect();
717 
718         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
719             cpu_affinity
720                 .iter()
721                 .map(|a| (a.vcpu, a.host_cpus.clone()))
722                 .collect()
723         } else {
724             BTreeMap::new()
725         };
726 
727         #[cfg(feature = "tdx")]
728         let dynamic = !tdx_enabled;
729         #[cfg(not(feature = "tdx"))]
730         let dynamic = true;
731 
732         Ok(Arc::new(Mutex::new(CpuManager {
733             config: config.clone(),
734             interrupt_controller: None,
735             #[cfg(target_arch = "x86_64")]
736             cpuid: Vec::new(),
737             vm,
738             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
739             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
740             vcpus_kick_signalled: Arc::new(AtomicBool::new(false)),
741             vcpu_states,
742             exit_evt,
743             reset_evt,
744             #[cfg(feature = "guest_debug")]
745             vm_debug_evt,
746             selected_cpu: 0,
747             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
748             seccomp_action,
749             vm_ops,
750             acpi_address: None,
751             proximity_domain_per_cpu,
752             affinity,
753             dynamic,
754             hypervisor: hypervisor.clone(),
755             #[cfg(feature = "sev_snp")]
756             sev_snp_enabled,
757         })))
758     }
759 
760     #[cfg(target_arch = "x86_64")]
761     pub fn populate_cpuid(
762         &mut self,
763         memory_manager: &Arc<Mutex<MemoryManager>>,
764         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
765         #[cfg(feature = "tdx")] tdx: bool,
766     ) -> Result<()> {
767         let sgx_epc_sections = memory_manager
768             .lock()
769             .unwrap()
770             .sgx_epc_region()
771             .as_ref()
772             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
773 
774         self.cpuid = {
775             let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
776             arch::generate_common_cpuid(
777                 hypervisor,
778                 &arch::CpuidConfig {
779                     sgx_epc_sections,
780                     phys_bits,
781                     kvm_hyperv: self.config.kvm_hyperv,
782                     #[cfg(feature = "tdx")]
783                     tdx,
784                     amx: self.config.features.amx,
785                 },
786             )
787             .map_err(Error::CommonCpuId)?
788         };
789 
790         Ok(())
791     }
792 
793     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
794         info!("Creating vCPU: cpu_id = {}", cpu_id);
795 
796         #[cfg(target_arch = "x86_64")]
797         let topology = self.get_vcpu_topology();
798         #[cfg(target_arch = "x86_64")]
799         let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
800         #[cfg(target_arch = "aarch64")]
801         let x2apic_id = cpu_id as u32;
802 
803         let mut vcpu = Vcpu::new(
804             cpu_id,
805             x2apic_id as u8,
806             &self.vm,
807             Some(self.vm_ops.clone()),
808             #[cfg(target_arch = "x86_64")]
809             self.hypervisor.get_cpu_vendor(),
810         )?;
811 
812         if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after creation.
814             #[cfg(target_arch = "aarch64")]
815             vcpu.init(&self.vm)?;
816 
817             let state: CpuState = snapshot.to_state().map_err(|e| {
818                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
819             })?;
820             vcpu.vcpu
821                 .set_state(&state)
822                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
823 
824             vcpu.saved_state = Some(state);
825         }
826 
827         let vcpu = Arc::new(Mutex::new(vcpu));
828 
829         // Adding vCPU to the CpuManager's vCPU list.
830         self.vcpus.push(vcpu.clone());
831 
832         Ok(vcpu)
833     }
834 
835     pub fn configure_vcpu(
836         &self,
837         vcpu: Arc<Mutex<Vcpu>>,
838         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
839     ) -> Result<()> {
840         let mut vcpu = vcpu.lock().unwrap();
841 
842         #[cfg(feature = "sev_snp")]
843         if self.sev_snp_enabled {
844             if let Some((kernel_entry_point, _)) = boot_setup {
845                 vcpu.set_sev_control_register(
846                     kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
847                 )?;
848             }
849 
850             // Traditional way to configure vcpu doesn't work for SEV-SNP guests.
851             // All the vCPU configuration for SEV-SNP guest is provided via VMSA.
852             return Ok(());
853         }
854 
855         #[cfg(target_arch = "x86_64")]
856         assert!(!self.cpuid.is_empty());
857 
858         #[cfg(target_arch = "x86_64")]
859         let topology = self.config.topology.clone().map_or_else(
860             || Some((1, self.boot_vcpus(), 1)),
861             |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
862         );
863         #[cfg(target_arch = "x86_64")]
864         vcpu.configure(
865             boot_setup,
866             self.cpuid.clone(),
867             self.config.kvm_hyperv,
868             topology,
869         )?;
870 
871         #[cfg(target_arch = "aarch64")]
872         vcpu.configure(&self.vm, boot_setup)?;
873 
874         Ok(())
875     }
876 
877     /// Only create new vCPUs if there aren't any inactive ones to reuse
878     fn create_vcpus(
879         &mut self,
880         desired_vcpus: u8,
881         snapshot: Option<Snapshot>,
882     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
883         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
884         info!(
885             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
886             desired_vcpus,
887             self.config.max_vcpus,
888             self.vcpus.len(),
889             self.present_vcpus()
890         );
891 
892         if desired_vcpus > self.config.max_vcpus {
893             return Err(Error::DesiredVCpuCountExceedsMax);
894         }
895 
896         // Only create vCPUs in excess of all the allocated vCPUs.
897         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
898             vcpus.push(self.create_vcpu(
899                 cpu_id,
                // TODO: The special format of the CPU id can be removed once
                // we are ready to break live-upgrade compatibility.
902                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
903             )?);
904         }
905 
906         Ok(vcpus)
907     }
908 
909     #[cfg(target_arch = "aarch64")]
910     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
911         for cpu in self.vcpus.iter() {
912             let cpu = cpu.lock().unwrap();
            // Check whether the PMU attribute is available; if not, log it and
            // skip the PMU initialization.
914             if cpu.vcpu.has_pmu_support() {
915                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
916             } else {
917                 debug!(
918                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
919                     cpu.id
920                 );
921                 return Ok(false);
922             }
923         }
924 
925         Ok(true)
926     }
927 
928     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
929         self.vcpus.clone()
930     }
931 
932     fn start_vcpu(
933         &mut self,
934         vcpu: Arc<Mutex<Vcpu>>,
935         vcpu_id: u8,
936         vcpu_thread_barrier: Arc<Barrier>,
937         inserting: bool,
938     ) -> Result<()> {
939         let reset_evt = self.reset_evt.try_clone().unwrap();
940         let exit_evt = self.exit_evt.try_clone().unwrap();
941         #[cfg(feature = "kvm")]
942         let hypervisor_type = self.hypervisor.hypervisor_type();
943         #[cfg(feature = "guest_debug")]
944         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
945         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
946         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
947         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
948         let vcpu_kick_signalled = self.vcpus_kick_signalled.clone();
949 
950         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
951         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
952             .vcpu_run_interrupted
953             .clone();
954         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
955         let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();
956 
        // Prepare the CPU set the current vCPU is expected to run on.
958         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
959             // SAFETY: all zeros is a valid pattern
960             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
961             // SAFETY: FFI call, trivially safe
962             unsafe { libc::CPU_ZERO(&mut cpuset) };
963             for host_cpu in host_cpus {
964                 // SAFETY: FFI call, trivially safe
965                 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
966             }
967             cpuset
968         });
969 
970         // Retrieve seccomp filter for vcpu thread
971         let vcpu_seccomp_filter = get_seccomp_filter(
972             &self.seccomp_action,
973             Thread::Vcpu,
974             self.hypervisor.hypervisor_type(),
975         )
976         .map_err(Error::CreateSeccompFilter)?;
977 
978         #[cfg(target_arch = "x86_64")]
979         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
980 
981         info!("Starting vCPU: cpu_id = {}", vcpu_id);
982 
983         let handle = Some(
984             thread::Builder::new()
985                 .name(format!("vcpu{vcpu_id}"))
986                 .spawn(move || {
987                     // Schedule the thread to run on the expected CPU set
988                     if let Some(cpuset) = cpuset.as_ref() {
989                         // SAFETY: FFI call with correct arguments
990                         let ret = unsafe {
991                             libc::sched_setaffinity(
992                                 0,
993                                 std::mem::size_of::<libc::cpu_set_t>(),
994                                 cpuset as *const libc::cpu_set_t,
995                             )
996                         };
997 
998                         if ret != 0 {
999                             error!(
1000                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
1001                                 vcpu_id,
1002                                 io::Error::last_os_error()
1003                             );
1004                             return;
1005                         }
1006                     }
1007 
1008                     // Apply seccomp filter for vcpu thread.
1009                     if !vcpu_seccomp_filter.is_empty() {
1010                         if let Err(e) =
1011                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
1012                         {
1013                             error!("Error applying seccomp filter: {:?}", e);
1014                             return;
1015                         }
1016                     }
1017                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // Register an async-signal-safe handler so that the vCPU
                    // threads can be interrupted out of KVM_RUN.
1019                     register_signal_handler(SIGRTMIN(), handle_signal)
1020                         .expect("Failed to register vcpu signal handler");
1021                     // Block until all CPUs are ready.
1022                     vcpu_thread_barrier.wait();
1023 
1024                     std::panic::catch_unwind(move || {
1025                         loop {
1026                             // If we are being told to pause, we park the thread
1027                             // until the pause boolean is toggled.
1028                             // The resume operation is responsible for toggling
1029                             // the boolean and unpark the thread.
1030                             // We enter a loop because park() could spuriously
1031                             // return. We will then park() again unless the
1032                             // pause boolean has been toggled.
1033 
1034                             // Need to use Ordering::SeqCst as we have multiple
1035                             // loads and stores to different atomics and we need
1036                             // to see them in a consistent order in all threads
1037 
1038                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
1039                                 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
1040                                 // completed by returning to KVM_RUN. From the kernel docs:
1041                                 //
1042                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
1043                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
1044                                 // operations are complete (and guest state is consistent) only after userspace
1045                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
1046                                 // incomplete operations and then check for pending signals.
1047                                 // The pending state of the operation is not preserved in state which is
1048                                 // visible to userspace, thus userspace should ensure that the operation is
1049                                 // completed before performing a live migration.  Userspace can re-enter the
1050                                 // guest with an unmasked signal pending or with the immediate_exit field set
1051                                 // to complete pending operations without allowing any further instructions
1052                                 // to be executed.
1053 
1054                                 #[cfg(feature = "kvm")]
1055                                 if matches!(hypervisor_type, HypervisorType::Kvm) {
1056                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
1057                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
1058                                         error!("Unexpected VM exit on \"immediate_exit\" run");
1059                                         break;
1060                                     }
1061                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
1062                                 }
1063 
1064                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1065 
1066                                 vcpu_paused.store(true, Ordering::SeqCst);
1067                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
1068                                     thread::park();
1069                                 }
1070                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
1071                             }
1072 
1073                             if vcpu_kick_signalled.load(Ordering::SeqCst) {
1074                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1075                                 #[cfg(target_arch = "x86_64")]
1076                                 match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
1077                                     Ok(()) => {},
1078                                     Err(e) => {
                                        error!("Error injecting NMI: {}", e);
1080                                         break;
1081                                     }
1082                                 }
1083                             }
1084 
1085                             // We've been told to terminate
1086                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1087                                 || vcpu_kill.load(Ordering::SeqCst)
1088                             {
1089                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1090                                 break;
1091                             }
1092 
1093                             #[cfg(feature = "tdx")]
1094                             let mut vcpu = vcpu.lock().unwrap();
1095                             #[cfg(not(feature = "tdx"))]
1096                             let vcpu = vcpu.lock().unwrap();
                            // A triple fault is reported as VmExit::Reset, which triggers a VM reset below.
1098                             match vcpu.run() {
1099                                 Ok(run) => match run {
1100                                     #[cfg(feature = "kvm")]
1101                                     VmExit::Debug => {
1102                                         info!("VmExit::Debug");
1103                                         #[cfg(feature = "guest_debug")]
1104                                         {
1105                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
1106                                             let raw_tid = get_raw_tid(vcpu_id as usize);
1107                                             vm_debug_evt.write(raw_tid as u64).unwrap();
1108                                         }
1109                                     }
1110                                     #[cfg(target_arch = "x86_64")]
1111                                     VmExit::IoapicEoi(vector) => {
1112                                         if let Some(interrupt_controller) =
1113                                             &interrupt_controller_clone
1114                                         {
1115                                             interrupt_controller
1116                                                 .lock()
1117                                                 .unwrap()
1118                                                 .end_of_interrupt(vector);
1119                                         }
1120                                     }
1121                                     VmExit::Ignore => {}
1122                                     VmExit::Hyperv => {}
1123                                     VmExit::Reset => {
1124                                         info!("VmExit::Reset");
1125                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1126                                         reset_evt.write(1).unwrap();
1127                                         break;
1128                                     }
1129                                     VmExit::Shutdown => {
1130                                         info!("VmExit::Shutdown");
1131                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1132                                         exit_evt.write(1).unwrap();
1133                                         break;
1134                                     }
1135                                     #[cfg(feature = "tdx")]
1136                                     VmExit::Tdx => {
1137                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1138                                             match vcpu.get_tdx_exit_details() {
1139                                                 Ok(details) => match details {
1140                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1141                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1142                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1143                                                     }
1144                                                 },
1145                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1146                                             }
1147                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1148                                         } else {
1149                                             // We should never reach this code as
1150                                             // this means the design from the code
1151                                             // is wrong.
1152                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1153                                         }
1154                                     }
1155                                 },
1156 
1157                                 Err(e) => {
1158                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1159                                     vcpu_run_interrupted.store(true, Ordering::SeqCst);
1160                                     exit_evt.write(1).unwrap();
1161                                     break;
1162                                 }
1163                             }
1164 
1165                             // We've been told to terminate
1166                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1167                                 || vcpu_kill.load(Ordering::SeqCst)
1168                             {
1169                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1170                                 break;
1171                             }
1172                         }
1173                     })
1174                     .or_else(|_| {
1175                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1176                         error!("vCPU thread panicked");
1177                         panic_exit_evt.write(1)
1178                     })
1179                     .ok();
1180                 })
1181                 .map_err(Error::VcpuSpawn)?,
1182         );
1183 
        // On hotplug, this function is called with no entry point. It is for
        // such hotplugged CPU additions that the inserting flag needs to be set.
1186         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1187         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1188 
1189         Ok(())
1190     }
1191 
    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
1193     fn activate_vcpus(
1194         &mut self,
1195         desired_vcpus: u8,
1196         inserting: bool,
1197         paused: Option<bool>,
1198     ) -> Result<()> {
1199         if desired_vcpus > self.config.max_vcpus {
1200             return Err(Error::DesiredVCpuCountExceedsMax);
1201         }
1202 
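        // One barrier slot for each vCPU thread about to be started, plus one
        // for this thread, which releases them all once spawning is complete.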
1203         let vcpu_thread_barrier = Arc::new(Barrier::new(
1204             (desired_vcpus - self.present_vcpus() + 1) as usize,
1205         ));
1206 
1207         if let Some(paused) = paused {
1208             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1209         }
1210 
1211         info!(
1212             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1213             desired_vcpus,
1214             self.vcpus.len(),
1215             self.present_vcpus(),
1216             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1217         );
1218 
1219         // This reuses any inactive vCPUs as well as any that were newly created
1220         for vcpu_id in self.present_vcpus()..desired_vcpus {
1221             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1222             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1223         }
1224 
1225         // Unblock all CPU threads.
1226         vcpu_thread_barrier.wait();
1227         Ok(())
1228     }
1229 
1230     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1231         // Mark vCPUs for removal, actual removal happens on ejection
1232         for cpu_id in desired_vcpus..self.present_vcpus() {
1233             self.vcpu_states[usize::from(cpu_id)].removing = true;
1234             self.vcpu_states[usize::from(cpu_id)]
1235                 .pending_removal
1236                 .store(true, Ordering::SeqCst);
1237         }
1238     }
1239 
1240     pub fn check_pending_removed_vcpu(&mut self) -> bool {
1241         for state in self.vcpu_states.iter() {
1242             if state.active() && state.pending_removal.load(Ordering::SeqCst) {
1243                 return true;
1244             }
1245         }
1246         false
1247     }
1248 
1249     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1250         info!("Removing vCPU: cpu_id = {}", cpu_id);
1251         let state = &mut self.vcpu_states[usize::from(cpu_id)];
1252         state.kill.store(true, Ordering::SeqCst);
1253         state.signal_thread();
1254         state.join_thread()?;
1255         state.handle = None;
1256 
        // Once the thread has exited, clear the "kill" flag so that the state can be reused
1258         state.kill.store(false, Ordering::SeqCst);
1259         state.pending_removal.store(false, Ordering::SeqCst);
1260 
1261         Ok(())
1262     }
1263 
1264     pub fn create_boot_vcpus(
1265         &mut self,
1266         snapshot: Option<Snapshot>,
1267     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1268         trace_scoped!("create_boot_vcpus");
1269 
1270         self.create_vcpus(self.boot_vcpus(), snapshot)
1271     }
1272 
1273     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1274     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1275         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1276     }
1277 
1278     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1279         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1280             .map_err(|e| {
1281                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1282             })?;
1283 
1284         Ok(())
1285     }
1286 
1287     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus == self.present_vcpus() {
1289             return Ok(false);
1290         }
1291 
1292         if !self.dynamic {
1293             return Ok(false);
1294         }
1295 
1296         if self.check_pending_removed_vcpu() {
1297             return Err(Error::VcpuPendingRemovedVcpu);
1298         }
1299 
1300         match desired_vcpus.cmp(&self.present_vcpus()) {
1301             cmp::Ordering::Greater => {
1302                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1303                 for vcpu in vcpus {
1304                     self.configure_vcpu(vcpu, None)?
1305                 }
1306                 self.activate_vcpus(desired_vcpus, true, None)?;
1307                 Ok(true)
1308             }
1309             cmp::Ordering::Less => {
1310                 self.mark_vcpus_for_removal(desired_vcpus);
1311                 Ok(true)
1312             }
1313             _ => Ok(false),
1314         }
1315     }
1316 
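    // Editor's sketch, not upstream code: how a VMM front end might drive
    // vCPU hotplug through resize(). `example_hotplug` is hypothetical.
    #[allow(dead_code)]
    fn example_hotplug(cpu_manager: &Arc<Mutex<CpuManager>>, desired_vcpus: u8) -> Result<()> {
        // resize() returns Ok(true) when vCPUs were added or marked for
        // removal, i.e. when the guest needs to be notified (e.g. via ACPI GED).
        if cpu_manager.lock().unwrap().resize(desired_vcpus)? {
            info!("vCPU count changed; notify the guest of the hotplug event");
        }
        Ok(())
    }
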
1317     pub fn shutdown(&mut self) -> Result<()> {
1318         // Tell the vCPUs to stop themselves next time they go through the loop
1319         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1320 
1321         // Toggle the vCPUs pause boolean
1322         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1323 
1324         // Unpark all the VCPU threads.
1325         for state in self.vcpu_states.iter() {
1326             state.unpark_thread();
1327         }
1328 
        // Signal the spawned vCPU threads. This interrupts the KVM_RUN ioctl(),
        // allowing the run loop to observe the kill flag set above.
1332         for state in self.vcpu_states.iter() {
1333             state.signal_thread();
1334         }
1335 
1336         // Wait for all the threads to finish. This removes the state from the vector.
1337         for mut state in self.vcpu_states.drain(..) {
1338             state.join_thread()?;
1339         }
1340 
1341         Ok(())
1342     }
1343 
1344     #[cfg(feature = "tdx")]
1345     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1346         for vcpu in &self.vcpus {
1347             vcpu.lock()
1348                 .unwrap()
1349                 .vcpu
1350                 .tdx_init(hob_address)
1351                 .map_err(Error::InitializeTdx)?;
1352         }
1353         Ok(())
1354     }
1355 
1356     pub fn boot_vcpus(&self) -> u8 {
1357         self.config.boot_vcpus
1358     }
1359 
1360     pub fn max_vcpus(&self) -> u8 {
1361         self.config.max_vcpus
1362     }
1363 
1364     #[cfg(target_arch = "x86_64")]
1365     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1366         assert!(!self.cpuid.is_empty());
1367         self.cpuid.clone()
1368     }
1369 
1370     fn present_vcpus(&self) -> u8 {
1371         self.vcpu_states
1372             .iter()
1373             .fold(0, |acc, state| acc + state.active() as u8)
1374     }
1375 
1376     #[cfg(target_arch = "aarch64")]
1377     pub fn get_mpidrs(&self) -> Vec<u64> {
1378         self.vcpus
1379             .iter()
1380             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1381             .collect()
1382     }
1383 
1384     #[cfg(target_arch = "aarch64")]
1385     pub fn get_saved_states(&self) -> Vec<CpuState> {
1386         self.vcpus
1387             .iter()
1388             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1389             .collect()
1390     }
1391 
1392     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1393         self.config
1394             .topology
1395             .clone()
1396             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1397     }
1398 
1399     pub fn create_madt(&self) -> Sdt {
1400         use crate::acpi;
        // This is also checked during command-line parsing.
1402         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1403 
1404         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1405         #[cfg(target_arch = "x86_64")]
1406         {
1407             madt.write(36, arch::layout::APIC_START.0);
1408 
1409             for cpu in 0..self.config.max_vcpus {
1410                 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());
1411 
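                // Boot vCPUs are advertised as enabled; every entry is also
                // marked online-capable so that the remaining vCPUs can be
                // hot-plugged later.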
1412                 let lapic = LocalX2Apic {
1413                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1414                     length: 16,
1415                     processor_id: cpu.into(),
1416                     apic_id: x2apic_id,
1417                     flags: if cpu < self.config.boot_vcpus {
1418                         1 << MADT_CPU_ENABLE_FLAG
1419                     } else {
1420                         0
1421                     } | (1 << MADT_CPU_ONLINE_CAPABLE_FLAG),
1422                     _reserved: 0,
1423                 };
1424                 madt.append(lapic);
1425             }
1426 
1427             madt.append(Ioapic {
1428                 r#type: acpi::ACPI_APIC_IO,
1429                 length: 12,
1430                 ioapic_id: 0,
1431                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1432                 gsi_base: 0,
1433                 ..Default::default()
1434             });
1435 
1436             madt.append(InterruptSourceOverride {
1437                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1438                 length: 10,
1439                 bus: 0,
1440                 source: 4,
1441                 gsi: 4,
1442                 flags: 0,
1443             });
1444         }
1445 
1446         #[cfg(target_arch = "aarch64")]
1447         {
1448             /* Notes:
1449              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1450              */
1451 
1452             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1453             for cpu in 0..self.config.boot_vcpus {
1454                 let vcpu = &self.vcpus[cpu as usize];
1455                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1456                 /* ARMv8 MPIDR format:
1457                      Bits [63:40] Must be zero
1458                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1459                      Bits [31:24] Must be zero
1460                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1461                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1462                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1463                 */
1464                 let mpidr_mask = 0xff_00ff_ffff;
1465                 let gicc = GicC {
1466                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1467                     length: 80,
1468                     reserved0: 0,
1469                     cpu_interface_number: cpu as u32,
1470                     uid: cpu as u32,
1471                     flags: 1,
1472                     parking_version: 0,
1473                     performance_interrupt: 0,
1474                     parked_address: 0,
1475                     base_address: 0,
1476                     gicv_base_address: 0,
1477                     gich_base_address: 0,
1478                     vgic_interrupt: 0,
1479                     gicr_base_address: 0,
1480                     mpidr: mpidr & mpidr_mask,
1481                     proc_power_effi_class: 0,
1482                     reserved1: 0,
1483                     spe_overflow_interrupt: 0,
1484                 };
1485 
1486                 madt.append(gicc);
1487             }
1488             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1489 
1490             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1491             let gicd = GicD {
1492                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1493                 length: 24,
1494                 reserved0: 0,
1495                 gic_id: 0,
1496                 base_address: vgic_config.dist_addr,
1497                 global_irq_base: 0,
1498                 version: 3,
1499                 reserved1: [0; 3],
1500             };
1501             madt.append(gicd);
1502 
1503             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1504             let gicr = GicR {
1505                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1506                 length: 16,
1507                 reserved: 0,
1508                 base_address: vgic_config.redists_addr,
1509                 range_length: vgic_config.redists_size as u32,
1510             };
1511             madt.append(gicr);
1512 
1513             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1514             let gicits = GicIts {
1515                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1516                 length: 20,
1517                 reserved0: 0,
1518                 translation_id: 0,
1519                 base_address: vgic_config.msi_addr,
1520                 reserved1: 0,
1521             };
1522             madt.append(gicits);
1523 
1524             madt.update_checksum();
1525         }
1526 
1527         madt
1528     }
1529 
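    /// Build the ACPI PPTT describing the vCPU topology. The hierarchy node
    /// flags used below follow the ACPI spec: bit 0 = physical package,
    /// bit 1 = ACPI processor ID valid, bit 2 = processor is a thread,
    /// bit 3 = node is a leaf; hence 0x2 for container nodes, 0xE for thread
    /// leaves and 0xA for core leaves.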
1530     #[cfg(target_arch = "aarch64")]
1531     pub fn create_pptt(&self) -> Sdt {
1532         let pptt_start = 0;
1533         let mut cpus = 0;
1534         let mut uid = 0;
1535         // If topology is not specified, the default setting is:
1536         // 1 package, multiple cores, 1 thread per core
1537         // This is also the behavior when PPTT is missing.
1538         let (threads_per_core, cores_per_package, packages) =
1539             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1540 
1541         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1542 
1543         for cluster_idx in 0..packages {
1544             if cpus < self.config.boot_vcpus as usize {
1545                 let cluster_offset = pptt.len() - pptt_start;
1546                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1547                     r#type: 0,
1548                     length: 20,
1549                     reserved: 0,
1550                     flags: 0x2,
1551                     parent: 0,
1552                     acpi_processor_id: cluster_idx as u32,
1553                     num_private_resources: 0,
1554                 };
1555                 pptt.append(cluster_hierarchy_node);
1556 
1557                 for core_idx in 0..cores_per_package {
1558                     let core_offset = pptt.len() - pptt_start;
1559 
1560                     if threads_per_core > 1 {
1561                         let core_hierarchy_node = ProcessorHierarchyNode {
1562                             r#type: 0,
1563                             length: 20,
1564                             reserved: 0,
1565                             flags: 0x2,
1566                             parent: cluster_offset as u32,
1567                             acpi_processor_id: core_idx as u32,
1568                             num_private_resources: 0,
1569                         };
1570                         pptt.append(core_hierarchy_node);
1571 
1572                         for _thread_idx in 0..threads_per_core {
1573                             let thread_hierarchy_node = ProcessorHierarchyNode {
1574                                 r#type: 0,
1575                                 length: 20,
1576                                 reserved: 0,
1577                                 flags: 0xE,
1578                                 parent: core_offset as u32,
1579                                 acpi_processor_id: uid as u32,
1580                                 num_private_resources: 0,
1581                             };
1582                             pptt.append(thread_hierarchy_node);
1583                             uid += 1;
1584                         }
1585                     } else {
1586                         let thread_hierarchy_node = ProcessorHierarchyNode {
1587                             r#type: 0,
1588                             length: 20,
1589                             reserved: 0,
1590                             flags: 0xA,
1591                             parent: cluster_offset as u32,
1592                             acpi_processor_id: uid as u32,
1593                             num_private_resources: 0,
1594                         };
1595                         pptt.append(thread_hierarchy_node);
1596                         uid += 1;
1597                     }
1598                 }
1599                 cpus += (cores_per_package * threads_per_core) as usize;
1600             }
1601         }
1602 
1603         pptt.update_checksum();
1604         pptt
1605     }
1606 
1607     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1608     fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters {
1609         self.vcpus[usize::from(cpu_id)]
1610             .lock()
1611             .unwrap()
1612             .vcpu
1613             .create_standard_regs()
1614     }
1615 
1616     #[cfg(feature = "guest_debug")]
1617     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1618         self.vcpus[usize::from(cpu_id)]
1619             .lock()
1620             .unwrap()
1621             .vcpu
1622             .get_regs()
1623             .map_err(Error::CpuDebug)
1624     }
1625 
1626     #[cfg(feature = "guest_debug")]
1627     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1628         self.vcpus[usize::from(cpu_id)]
1629             .lock()
1630             .unwrap()
1631             .vcpu
1632             .set_regs(regs)
1633             .map_err(Error::CpuDebug)
1634     }
1635 
1636     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1637     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1638         self.vcpus[usize::from(cpu_id)]
1639             .lock()
1640             .unwrap()
1641             .vcpu
1642             .get_sregs()
1643             .map_err(Error::CpuDebug)
1644     }
1645 
1646     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1647     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1648         self.vcpus[usize::from(cpu_id)]
1649             .lock()
1650             .unwrap()
1651             .vcpu
1652             .set_sregs(sregs)
1653             .map_err(Error::CpuDebug)
1654     }
1655 
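    /// On x86_64 the hypervisor walks the guest page tables itself, so the
    /// guest memory handle is unused here (with KVM this is typically backed
    /// by the KVM_TRANSLATE ioctl).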
1656     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1657     fn translate_gva(
1658         &self,
1659         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1660         cpu_id: u8,
1661         gva: u64,
1662     ) -> Result<u64> {
1663         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1664             .lock()
1665             .unwrap()
1666             .vcpu
1667             .translate_gva(gva, /* flags: unused */ 0)
1668             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1669         Ok(gpa)
1670     }
1671 
1672     ///
1673     /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
1674     /// it in the VMM by walking through the translation tables.
1675     ///
1676     /// Address translation is a big topic; here we only focus on the scenario
1677     /// that arises in the VMM while debugging the guest kernel. This
1678     /// `translate_gva` implementation is restricted to:
1679     /// - Exception Level 1
1680     /// - Translating the high address range only (kernel space)
1681     ///
1682     /// This implementation supports the following Armv8-A features related to
1683     /// address translation:
1684     /// - FEAT_LPA
1685     /// - FEAT_LVA
1686     /// - FEAT_LPA2
1687     ///
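    /// As a worked example of the arithmetic below: with a 4KB granule the
    /// per-level stride is 9 bits, and a T1SZ of 16 gives a 48-bit VA, so the
    /// walk starts at level 4 - (48 - 4) / 9 = 0 (integer division).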
1688     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1689     fn translate_gva(
1690         &self,
1691         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1692         cpu_id: u8,
1693         gva: u64,
1694     ) -> Result<u64> {
1695         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1696             .lock()
1697             .unwrap()
1698             .vcpu
1699             .get_sys_reg(regs::TCR_EL1)
1700             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1701         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1702             .lock()
1703             .unwrap()
1704             .vcpu
1705             .get_sys_reg(regs::TTBR1_EL1)
1706             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1707         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1708             .lock()
1709             .unwrap()
1710             .vcpu
1711             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1712             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1713 
1714         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1715         // or low (0x000xxx...).
1716         let high_range = extract_bits_64!(gva, 55, 1);
1717         if high_range == 0 {
1718             info!("VA (0x{:x}) range is not supported!", gva);
1719             return Ok(gva);
1720         }
1721 
1722         // High range size offset
1723         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1724         // Granule size
1725         let tg = extract_bits_64!(tcr_el1, 30, 2);
1726         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1727         let ds = extract_bits_64!(tcr_el1, 59, 1);
1728 
1729         if tsz == 0 {
1730             info!("VA translation is not ready!");
1731             return Ok(gva);
1732         }
1733 
1734         // VA size is determined by TCR_EL1.T1SZ
1735         let va_size = 64 - tsz;
1736         // Number of bits in VA consumed in each level of translation
1737         let stride = match tg {
1738             3 => 13, // 64KB granule size
1739             1 => 11, // 16KB granule size
1740             _ => 9,  // 4KB, default
1741         };
1742         // Starting level of walking
1743         let mut level = 4 - (va_size - 4) / stride;
1744 
1745         // Determine the PA/IPA size
1746         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1747         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1748         // The IPA size in TCR_EL1 and PA Range in ID_AA64MMFR0_EL1 should match.
1749         // To be safe, we use the minimum value if they are different.
1750         let pa_range = std::cmp::min(tcr_ips, pa_range);
1751         // PA size in bits
1752         let pa_size = match pa_range {
1753             0 => 32,
1754             1 => 36,
1755             2 => 40,
1756             3 => 42,
1757             4 => 44,
1758             5 => 48,
1759             6 => 52,
1760             _ => {
1761                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1762                     "PA range not supported {pa_range}"
1763                 ))))
1764             }
1765         };
1766 
1767         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1768         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1769         // If FEAT_LPA2 is present, the translation table descriptor holds
1770         // 50 bits of the table address of next level.
1771         // Otherwise, it is 48 bits.
1772         let descaddrmask = if ds == 1 {
1773             !0u64 >> (64 - 50) // mask with 50 least significant bits
1774         } else {
1775             !0u64 >> (64 - 48) // mask with 48 least significant bits
1776         };
1777         let descaddrmask = descaddrmask & !indexmask_grainsize;
1778 
1779         // Translation table base address
1780         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1781         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1782         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1783         if pa_size == 52 {
1784             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1785         }
1786 
1787         // Loop through tables of each level
1788         loop {
1789             // Table offset for current level
1790             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1791             descaddr |= table_offset;
1792             descaddr &= !7u64;
1793 
1794             let mut buf = [0; 8];
1795             guest_memory
1796                 .memory()
1797                 .read(&mut buf, GuestAddress(descaddr))
1798                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1799             let descriptor = u64::from_le_bytes(buf);
1800 
1801             descaddr = descriptor & descaddrmask;
1802         // In the case of FEAT_LPA, the next-level translation table address
1803         // bits [48:51] come from bits [12:15] of the current descriptor.
1804         // For FEAT_LPA2, the next-level translation table address
1805         // bits [50:51] come from bits [8:9] of the current descriptor, while
1806         // bits [48:49] come from bits [48:49] of the descriptor, which were
1807         // handled previously.
1808             if pa_size == 52 {
1809                 if ds == 1 {
1810                     // FEAT_LPA2
1811                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1812                 } else {
1813                     // FEAT_LPA
1814                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1815                 }
1816             }
1817 
1818             if (descriptor & 2) != 0 && (level < 3) {
1819                 // This is a table entry. Go down to next level.
1820                 level += 1;
1821                 indexmask = indexmask_grainsize;
1822                 continue;
1823             }
1824 
1825             break;
1826         }
1827 
1828         // We have reached either:
1829         // - a page entry at level 3 or
1830         // - a block entry at level 1 or 2
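        // e.g. with a 4KB granule (stride = 9): a level-3 entry maps a 4KB
        // page, a level-2 block maps 2MB and a level-1 block maps 1GB.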
1831         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1832         descaddr &= !(page_size - 1);
1833         descaddr |= gva & (page_size - 1);
1834 
1835         Ok(descaddr)
1836     }
1837 
1838     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1839         self.acpi_address = Some(acpi_address);
1840     }
1841 
1842     pub(crate) fn set_interrupt_controller(
1843         &mut self,
1844         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1845     ) {
1846         self.interrupt_controller = Some(interrupt_controller);
1847     }
1848 
1849     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1850         &self.vcpus_kill_signalled
1851     }
1852 
1853     #[cfg(feature = "igvm")]
1854     pub(crate) fn get_cpuid_leaf(
1855         &self,
1856         cpu_id: u8,
1857         eax: u32,
1858         ecx: u32,
1859         xfem: u64,
1860         xss: u64,
1861     ) -> Result<[u32; 4]> {
1862         let leaf_info = self.vcpus[usize::from(cpu_id)]
1863             .lock()
1864             .unwrap()
1865             .vcpu
1866             .get_cpuid_values(eax, ecx, xfem, xss)
1867             .unwrap();
1868         Ok(leaf_info)
1869     }
1870 
1871     #[cfg(feature = "sev_snp")]
1872     pub(crate) fn sev_snp_enabled(&self) -> bool {
1873         self.sev_snp_enabled
1874     }
1875 
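    /// Broadcast an NMI request: raise the kick flag, signal every vCPU
    /// thread so it breaks out of the running ioctl and can act on the flag,
    /// then clear the flag again.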
1876     pub(crate) fn nmi(&self) -> Result<()> {
1877         self.vcpus_kick_signalled.store(true, Ordering::SeqCst);
1878 
1879         for state in self.vcpu_states.iter() {
1880             state.signal_thread();
1881         }
1882 
1883         self.vcpus_kick_signalled.store(false, Ordering::SeqCst);
1884 
1885         Ok(())
1886     }
1887 }
1888 
1889 struct Cpu {
1890     cpu_id: u8,
1891     proximity_domain: u32,
1892     dynamic: bool,
1893     #[cfg(target_arch = "x86_64")]
1894     topology: Option<(u8, u8, u8)>,
1895 }
1896 
1897 #[cfg(target_arch = "x86_64")]
1898 const MADT_CPU_ENABLE_FLAG: usize = 0;
1899 
1900 #[cfg(target_arch = "x86_64")]
1901 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1902 
1903 impl Cpu {
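    /// Build the buffer returned by this CPU's ACPI _MAT object: a single
    /// Processor Local x2APIC structure with the enabled flag set.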
1904     #[cfg(target_arch = "x86_64")]
1905     fn generate_mat(&self) -> Vec<u8> {
1906         let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);
1907 
1908         let lapic = LocalX2Apic {
1909             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1910             length: 16,
1911             processor_id: self.cpu_id.into(),
1912             apic_id: x2apic_id,
1913             flags: 1 << MADT_CPU_ENABLE_FLAG,
1914             _reserved: 0,
1915         };
1916 
1917         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1918         // SAFETY: mat_data is large enough to hold lapic
1919         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
1920 
1921         mat_data
1922     }
1923 }
1924 
1925 impl Aml for Cpu {
1926     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1927         #[cfg(target_arch = "x86_64")]
1928         let mat_data: Vec<u8> = self.generate_mat();
1929         #[allow(clippy::if_same_then_else)]
1930         if self.dynamic {
1931             aml::Device::new(
1932                 format!("C{:03X}", self.cpu_id).as_str().into(),
1933                 vec![
1934                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1935                     &aml::Name::new("_UID".into(), &self.cpu_id),
1936                     // Currently, AArch64 cannot support the following fields.
1937                     /*
1938                     _STA return value:
1939                     Bit [0] – Set if the device is present.
1940                     Bit [1] – Set if the device is enabled and decoding its resources.
1941                     Bit [2] – Set if the device should be shown in the UI.
1942                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1943                     Bit [4] – Set if the battery is present.
1944                     Bits [31:5] – Reserved (must be cleared).
1945                     */
1946                     #[cfg(target_arch = "x86_64")]
1947                     &aml::Method::new(
1948                         "_STA".into(),
1949                         0,
1950                         false,
1951                         // Call into CSTA method which will interrogate device
1952                         vec![&aml::Return::new(&aml::MethodCall::new(
1953                             "CSTA".into(),
1954                             vec![&self.cpu_id],
1955                         ))],
1956                     ),
1957                     &aml::Method::new(
1958                         "_PXM".into(),
1959                         0,
1960                         false,
1961                         vec![&aml::Return::new(&self.proximity_domain)],
1962                     ),
1963                     // The Linux kernel expects every CPU device to have a _MAT entry
1964                     // containing the LAPIC for this processor with the enabled bit set
1965                     // even if it is disabled in the MADT (non-boot CPU)
1966                     #[cfg(target_arch = "x86_64")]
1967                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1968                     // Trigger CPU ejection
1969                     #[cfg(target_arch = "x86_64")]
1970                     &aml::Method::new(
1971                         "_EJ0".into(),
1972                         1,
1973                         false,
1974                         // Call into CEJ0 method which will actually eject device
1975                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1976                     ),
1977                 ],
1978             )
1979             .to_aml_bytes(sink);
1980         } else {
1981             aml::Device::new(
1982                 format!("C{:03X}", self.cpu_id).as_str().into(),
1983                 vec![
1984                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1985                     &aml::Name::new("_UID".into(), &self.cpu_id),
1986                     #[cfg(target_arch = "x86_64")]
1987                     &aml::Method::new(
1988                         "_STA".into(),
1989                         0,
1990                         false,
1991                         // Mark the CPU present; see the CSTA implementation
1992                         vec![&aml::Return::new(&0xfu8)],
1993                     ),
1994                     &aml::Method::new(
1995                         "_PXM".into(),
1996                         0,
1997                         false,
1998                         vec![&aml::Return::new(&self.proximity_domain)],
1999                     ),
2000                     // The Linux kernel expects every CPU device to have a _MAT entry
2001                     // containing the LAPIC for this processor with the enabled bit set
2002                     // even if it is disabled in the MADT (non-boot CPU)
2003                     #[cfg(target_arch = "x86_64")]
2004                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
2005                 ],
2006             )
2007             .to_aml_bytes(sink);
2008         }
2009     }
2010 }
2011 
2012 struct CpuNotify {
2013     cpu_id: u8,
2014 }
2015 
2016 impl Aml for CpuNotify {
2017     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2018         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
2019         aml::If::new(
2020             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
2021             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2022         )
2023         .to_aml_bytes(sink)
2024     }
2025 }
2026 
2027 struct CpuMethods {
2028     max_vcpus: u8,
2029     dynamic: bool,
2030 }
2031 
2032 impl Aml for CpuMethods {
2033     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2034         if self.dynamic {
2035             // CPU status method
2036             aml::Method::new(
2037                 "CSTA".into(),
2038                 1,
2039                 true,
2040                 vec![
2041                     // Take lock defined above
2042                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2043                     // Write CPU number (in first argument) to I/O port via field
2044                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2045                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2046                     // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2047                     &aml::If::new(
2048                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
2049                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2050                     ),
2051                     // Release lock
2052                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2053                     // Return 0 or 0xf
2054                     &aml::Return::new(&aml::Local(0)),
2055                 ],
2056             )
2057             .to_aml_bytes(sink);
2058 
2059             let mut cpu_notifies = Vec::new();
2060             for cpu_id in 0..self.max_vcpus {
2061                 cpu_notifies.push(CpuNotify { cpu_id });
2062             }
2063 
2064             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
2065             for cpu_id in 0..self.max_vcpus {
2066                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
2067             }
2068 
2069             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
2070 
2071             aml::Method::new(
2072                 "CEJ0".into(),
2073                 1,
2074                 true,
2075                 vec![
2076                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2077                     // Write CPU number (in first argument) to I/O port via field
2078                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2079                     // Set CEJ0 bit
2080                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
2081                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2082                 ],
2083             )
2084             .to_aml_bytes(sink);
2085 
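            // The CSCN ("scan") method emitted below corresponds roughly to
            // this ASL (an illustrative sketch, not the exact generated
            // output; <max_vcpus> stands for the constant baked in):
            //
            //   Method (CSCN, 0, Serialized) {
            //       Acquire (\_SB.PRES.CPLK, 0xFFFF)
            //       Local0 = Zero
            //       While (Local0 < <max_vcpus>) {
            //           \_SB.PRES.CSEL = Local0
            //           If (\_SB.PRES.CINS == One) {
            //               CTFY (Local0, One)       // Notify 1: device check
            //               \_SB.PRES.CINS = One     // reset the bit
            //           }
            //           If (\_SB.PRES.CRMV == One) {
            //               CTFY (Local0, 0x03)      // Notify 3: eject request
            //               \_SB.PRES.CRMV = One     // reset the bit
            //           }
            //           Local0 += One
            //       }
            //       Release (\_SB.PRES.CPLK)
            //   }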
2086             aml::Method::new(
2087                 "CSCN".into(),
2088                 0,
2089                 true,
2090                 vec![
2091                     // Take lock defined above
2092                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2093                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2094                     &aml::While::new(
2095                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
2096                         vec![
2097                             // Write CPU number (in first argument) to I/O port via field
2098                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
2099                             // Check if CINS bit is set
2100                             &aml::If::new(
2101                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
2102                                 // Notify device if it is
2103                                 vec![
2104                                     &aml::MethodCall::new(
2105                                         "CTFY".into(),
2106                                         vec![&aml::Local(0), &aml::ONE],
2107                                     ),
2108                                     // Reset CINS bit
2109                                     &aml::Store::new(
2110                                         &aml::Path::new("\\_SB_.PRES.CINS"),
2111                                         &aml::ONE,
2112                                     ),
2113                                 ],
2114                             ),
2115                             // Check if CRMV bit is set
2116                             &aml::If::new(
2117                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
2118                                 // Notify device if it is (with the eject constant 0x3)
2119                                 vec![
2120                                     &aml::MethodCall::new(
2121                                         "CTFY".into(),
2122                                         vec![&aml::Local(0), &3u8],
2123                                     ),
2124                                     // Reset CRMV bit
2125                                     &aml::Store::new(
2126                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
2127                                         &aml::ONE,
2128                                     ),
2129                                 ],
2130                             ),
2131                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2132                         ],
2133                     ),
2134                     // Release lock
2135                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2136                 ],
2137             )
2138             .to_aml_bytes(sink)
2139         } else {
2140             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2141         }
2142     }
2143 }
2144 
2145 impl Aml for CpuManager {
2146     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2147         #[cfg(target_arch = "x86_64")]
2148         if let Some(acpi_address) = self.acpi_address {
2149             // CPU hotplug controller
2150             aml::Device::new(
2151                 "_SB_.PRES".into(),
2152                 vec![
2153                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2154                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2155                     // Mutex to protect concurrent access, as we write to select a CPU and then read back its status
2156                     &aml::Mutex::new("CPLK".into(), 0),
2157                     &aml::Name::new(
2158                         "_CRS".into(),
2159                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2160                             aml::AddressSpaceCacheable::NotCacheable,
2161                             true,
2162                             acpi_address.0,
2163                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2164                             None,
2165                         )]),
2166                     ),
2167                     // OpRegion and Fields map MMIO range into individual field values
2168                     &aml::OpRegion::new(
2169                         "PRST".into(),
2170                         aml::OpRegionSpace::SystemMemory,
2171                         &(acpi_address.0 as usize),
2172                         &CPU_MANAGER_ACPI_SIZE,
2173                     ),
2174                     &aml::Field::new(
2175                         "PRST".into(),
2176                         aml::FieldAccessType::Byte,
2177                         aml::FieldLockRule::NoLock,
2178                         aml::FieldUpdateRule::WriteAsZeroes,
2179                         vec![
2180                             aml::FieldEntry::Reserved(32),
2181                             aml::FieldEntry::Named(*b"CPEN", 1),
2182                             aml::FieldEntry::Named(*b"CINS", 1),
2183                             aml::FieldEntry::Named(*b"CRMV", 1),
2184                             aml::FieldEntry::Named(*b"CEJ0", 1),
2185                             aml::FieldEntry::Reserved(4),
2186                             aml::FieldEntry::Named(*b"CCMD", 8),
2187                         ],
2188                     ),
2189                     &aml::Field::new(
2190                         "PRST".into(),
2191                         aml::FieldAccessType::DWord,
2192                         aml::FieldLockRule::NoLock,
2193                         aml::FieldUpdateRule::Preserve,
2194                         vec![
2195                             aml::FieldEntry::Named(*b"CSEL", 32),
2196                             aml::FieldEntry::Reserved(32),
2197                             aml::FieldEntry::Named(*b"CDAT", 32),
2198                         ],
2199                     ),
2200                 ],
2201             )
2202             .to_aml_bytes(sink);
2203         }
2204 
2205         // CPU devices
2206         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2207         let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2208         // Bundle methods together under a common object
2209         let methods = CpuMethods {
2210             max_vcpus: self.config.max_vcpus,
2211             dynamic: self.dynamic,
2212         };
2213         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];
2214 
2215         #[cfg(target_arch = "x86_64")]
2216         let topology = self.get_vcpu_topology();
2217         let mut cpu_devices = Vec::new();
2218         for cpu_id in 0..self.config.max_vcpus {
2219             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2220             let cpu_device = Cpu {
2221                 cpu_id,
2222                 proximity_domain,
2223                 dynamic: self.dynamic,
2224                 #[cfg(target_arch = "x86_64")]
2225                 topology,
2226             };
2227 
2228             cpu_devices.push(cpu_device);
2229         }
2230 
2231         for cpu_device in cpu_devices.iter() {
2232             cpu_data_inner.push(cpu_device);
2233         }
2234 
2235         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2236     }
2237 }
2238 
2239 impl Pausable for CpuManager {
2240     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2241         // Tell the vCPUs to pause themselves next time they exit
2242         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2243 
2244         // Send a signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2245         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2246         // above.
2247         for state in self.vcpu_states.iter() {
2248             state.signal_thread();
2249         }
2250 
2251         for vcpu in self.vcpus.iter() {
2252             let mut vcpu = vcpu.lock().unwrap();
2253             vcpu.pause()?;
2254             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2255             if !self.config.kvm_hyperv {
2256                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2257                     MigratableError::Pause(anyhow!(
2258                         "Could not notify guest it has been paused {:?}",
2259                         e
2260                     ))
2261                 })?;
2262             }
2263         }
2264 
2265         // Each vCPU thread will change its paused state before parking; wait here for
2266         // every activated vCPU to change its state, to ensure they have all parked.
2267         for state in self.vcpu_states.iter() {
2268             if state.active() {
2269                 while !state.paused.load(Ordering::SeqCst) {
2270                     // To avoid a priority inversion with the vCPU thread
2271                     thread::sleep(std::time::Duration::from_millis(1));
2272                 }
2273             }
2274         }
2275 
2276         Ok(())
2277     }
2278 
2279     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2280         for vcpu in self.vcpus.iter() {
2281             vcpu.lock().unwrap().resume()?;
2282         }
2283 
2284         // Toggle the vCPUs pause boolean
2285         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2286 
2287         // Unpark all the VCPU threads.
2288         // Once unparked, the next thing they will do is check the pause
2289         // boolean. Since it will be set to false, they will exit their pause loop
2290         // and re-enter the guest.
2291         for state in self.vcpu_states.iter() {
2292             state.paused.store(false, Ordering::SeqCst);
2293             state.unpark_thread();
2294         }
2295         Ok(())
2296     }
2297 }
2298 
2299 impl Snapshottable for CpuManager {
2300     fn id(&self) -> String {
2301         CPU_MANAGER_SNAPSHOT_ID.to_string()
2302     }
2303 
2304     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2305         let mut cpu_manager_snapshot = Snapshot::default();
2306 
2307         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2308         for vcpu in &self.vcpus {
2309             let mut vcpu = vcpu.lock().unwrap();
2310             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2311         }
2312 
2313         Ok(cpu_manager_snapshot)
2314     }
2315 }
2316 
2317 impl Transportable for CpuManager {}
2318 impl Migratable for CpuManager {}
2319 
2320 #[cfg(feature = "guest_debug")]
2321 impl Debuggable for CpuManager {
2322     #[cfg(feature = "kvm")]
2323     fn set_guest_debug(
2324         &self,
2325         cpu_id: usize,
2326         addrs: &[GuestAddress],
2327         singlestep: bool,
2328     ) -> std::result::Result<(), DebuggableError> {
2329         self.vcpus[cpu_id]
2330             .lock()
2331             .unwrap()
2332             .vcpu
2333             .set_guest_debug(addrs, singlestep)
2334             .map_err(DebuggableError::SetDebug)
2335     }
2336 
2337     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2338         Ok(())
2339     }
2340 
2341     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2342         Ok(())
2343     }
2344 
2345     #[cfg(target_arch = "x86_64")]
2346     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2347         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, R8-R15
2348         let gregs = self
2349             .get_regs(cpu_id as u8)
2350             .map_err(DebuggableError::ReadRegs)?;
2351         let regs = [
2352             gregs.get_rax(),
2353             gregs.get_rbx(),
2354             gregs.get_rcx(),
2355             gregs.get_rdx(),
2356             gregs.get_rsi(),
2357             gregs.get_rdi(),
2358             gregs.get_rbp(),
2359             gregs.get_rsp(),
2360             gregs.get_r8(),
2361             gregs.get_r9(),
2362             gregs.get_r10(),
2363             gregs.get_r11(),
2364             gregs.get_r12(),
2365             gregs.get_r13(),
2366             gregs.get_r14(),
2367             gregs.get_r15(),
2368         ];
2369 
2370         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2371         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2372         let eflags = gregs.get_rflags() as u32;
2373         let rip = gregs.get_rip();
2374 
2375         // Segment registers: CS, SS, DS, ES, FS, GS
2376         let sregs = self
2377             .get_sregs(cpu_id as u8)
2378             .map_err(DebuggableError::ReadRegs)?;
2379         let segments = X86SegmentRegs {
2380             cs: sregs.cs.selector as u32,
2381             ss: sregs.ss.selector as u32,
2382             ds: sregs.ds.selector as u32,
2383             es: sregs.es.selector as u32,
2384             fs: sregs.fs.selector as u32,
2385             gs: sregs.gs.selector as u32,
2386         };
2387 
2388         // TODO: Add other registers
2389 
2390         Ok(CoreRegs {
2391             regs,
2392             eflags,
2393             rip,
2394             segments,
2395             ..Default::default()
2396         })
2397     }
2398 
2399     #[cfg(target_arch = "aarch64")]
2400     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2401         let gregs = self
2402             .get_regs(cpu_id as u8)
2403             .map_err(DebuggableError::ReadRegs)?;
2404         Ok(CoreRegs {
2405             x: gregs.get_regs(),
2406             sp: gregs.get_sp(),
2407             pc: gregs.get_pc(),
2408             ..Default::default()
2409         })
2410     }
2411 
2412     #[cfg(target_arch = "x86_64")]
2413     fn write_regs(
2414         &self,
2415         cpu_id: usize,
2416         regs: &CoreRegs,
2417     ) -> std::result::Result<(), DebuggableError> {
2418         let orig_gregs = self
2419             .get_regs(cpu_id as u8)
2420             .map_err(DebuggableError::ReadRegs)?;
2421         let mut gregs = self.create_standard_regs(cpu_id as u8);
2422         gregs.set_rax(regs.regs[0]);
2423         gregs.set_rbx(regs.regs[1]);
2424         gregs.set_rcx(regs.regs[2]);
2425         gregs.set_rdx(regs.regs[3]);
2426         gregs.set_rsi(regs.regs[4]);
2427         gregs.set_rdi(regs.regs[5]);
2428         gregs.set_rbp(regs.regs[6]);
2429         gregs.set_rsp(regs.regs[7]);
2430         gregs.set_r8(regs.regs[8]);
2431         gregs.set_r9(regs.regs[9]);
2432         gregs.set_r10(regs.regs[10]);
2433         gregs.set_r11(regs.regs[11]);
2434         gregs.set_r12(regs.regs[12]);
2435         gregs.set_r13(regs.regs[13]);
2436         gregs.set_r14(regs.regs[14]);
2437         gregs.set_r15(regs.regs[15]);
2438         gregs.set_rip(regs.rip);
2439         // Update the lower 32 bits of rflags.
2440         gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));
2441 
2442         self.set_regs(cpu_id as u8, &gregs)
2443             .map_err(DebuggableError::WriteRegs)?;
2444 
2445         // Segment registers: CS, SS, DS, ES, FS, GS
2446         // Since GDB cares only about the selectors, we call get_sregs() first.
2447         let mut sregs = self
2448             .get_sregs(cpu_id as u8)
2449             .map_err(DebuggableError::ReadRegs)?;
2450         sregs.cs.selector = regs.segments.cs as u16;
2451         sregs.ss.selector = regs.segments.ss as u16;
2452         sregs.ds.selector = regs.segments.ds as u16;
2453         sregs.es.selector = regs.segments.es as u16;
2454         sregs.fs.selector = regs.segments.fs as u16;
2455         sregs.gs.selector = regs.segments.gs as u16;
2456 
2457         self.set_sregs(cpu_id as u8, &sregs)
2458             .map_err(DebuggableError::WriteRegs)?;
2459 
2460         // TODO: Add other registers
2461 
2462         Ok(())
2463     }
2464 
2465     #[cfg(target_arch = "aarch64")]
2466     fn write_regs(
2467         &self,
2468         cpu_id: usize,
2469         regs: &CoreRegs,
2470     ) -> std::result::Result<(), DebuggableError> {
2471         let mut gregs = self
2472             .get_regs(cpu_id as u8)
2473             .map_err(DebuggableError::ReadRegs)?;
2474 
2475         gregs.set_regs(regs.x);
2476         gregs.set_sp(regs.sp);
2477         gregs.set_pc(regs.pc);
2478 
2479         self.set_regs(cpu_id as u8, &gregs)
2480             .map_err(DebuggableError::WriteRegs)?;
2481 
2482         Ok(())
2483     }
2484 
2485     fn read_mem(
2486         &self,
2487         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2488         cpu_id: usize,
2489         vaddr: GuestAddress,
2490         len: usize,
2491     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2492         let mut buf = vec![0; len];
2493         let mut total_read = 0_u64;
2494 
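        // Translate and copy at most up to the next page boundary per
        // iteration: contiguous guest-virtual addresses may map to
        // non-contiguous guest-physical pages.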
2495         while total_read < len as u64 {
2496             let gaddr = vaddr.0 + total_read;
2497             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2498                 Ok(paddr) => paddr,
2499                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2500                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2501             };
2502             let psize = arch::PAGE_SIZE as u64;
2503             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2504             guest_memory
2505                 .memory()
2506                 .read(
2507                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2508                     GuestAddress(paddr),
2509                 )
2510                 .map_err(DebuggableError::ReadMem)?;
2511             total_read += read_len;
2512         }
2513         Ok(buf)
2514     }
2515 
2516     fn write_mem(
2517         &self,
2518         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2519         cpu_id: usize,
2520         vaddr: &GuestAddress,
2521         data: &[u8],
2522     ) -> std::result::Result<(), DebuggableError> {
2523         let mut total_written = 0_u64;
2524 
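        // As in read_mem() above, write at most one page worth of data per
        // iteration since the GVA range may span non-contiguous GPA pages.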
2525         while total_written < data.len() as u64 {
2526             let gaddr = vaddr.0 + total_written;
2527             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2528                 Ok(paddr) => paddr,
2529                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2530                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2531             };
2532             let psize = arch::PAGE_SIZE as u64;
2533             let write_len = std::cmp::min(
2534                 data.len() as u64 - total_written,
2535                 psize - (paddr & (psize - 1)),
2536             );
2537             guest_memory
2538                 .memory()
2539                 .write(
2540                     &data[total_written as usize..total_written as usize + write_len as usize],
2541                     GuestAddress(paddr),
2542                 )
2543                 .map_err(DebuggableError::WriteMem)?;
2544             total_written += write_len;
2545         }
2546         Ok(())
2547     }
2548 
2549     fn active_vcpus(&self) -> usize {
2550         self.present_vcpus() as usize
2551     }
2552 }
2553 
2554 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2555 impl Elf64Writable for CpuManager {}
2556 
2557 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2558 impl CpuElf64Writable for CpuManager {
2559     fn cpu_write_elf64_note(
2560         &mut self,
2561         dump_state: &DumpState,
2562     ) -> std::result::Result<(), GuestDebuggableError> {
2563         let mut coredump_file = dump_state.file.as_ref().unwrap();
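        // Each vCPU contributes one NT_PRSTATUS note laid out as:
        // Elf64_Nhdr | "CORE" name (4-byte aligned) | X86_64ElfPrStatus
        // descriptor, whose user-register area is filled in below.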
2564         for vcpu in &self.vcpus {
2565             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2566             let mut pos: usize = 0;
2567             let mut buf = vec![0; note_size as usize];
2568             let descsz = size_of::<X86_64ElfPrStatus>();
2569             let vcpu_id = vcpu.lock().unwrap().id;
2570 
2571             let note = Elf64_Nhdr {
2572                 n_namesz: COREDUMP_NAME_SIZE,
2573                 n_descsz: descsz as u32,
2574                 n_type: NT_PRSTATUS,
2575             };
2576 
2577             let bytes: &[u8] = note.as_slice();
2578             buf.splice(0.., bytes.to_vec());
2579             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2580             buf.resize(pos + 4, 0);
2581             buf.splice(pos.., "CORE".to_string().into_bytes());
2582 
2583             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2584             buf.resize(pos + 32 + 4, 0);
2585             let pid = vcpu_id as u64;
2586             let bytes: &[u8] = pid.as_slice();
2587             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2588 
2589             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2590 
2591             let orig_rax: u64 = 0;
2592             let gregs = self.vcpus[usize::from(vcpu_id)]
2593                 .lock()
2594                 .unwrap()
2595                 .vcpu
2596                 .get_regs()
2597                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2598 
2599             let regs1 = [
2600                 gregs.get_r15(),
2601                 gregs.get_r14(),
2602                 gregs.get_r13(),
2603                 gregs.get_r12(),
2604                 gregs.get_rbp(),
2605                 gregs.get_rbx(),
2606                 gregs.get_r11(),
2607                 gregs.get_r10(),
2608             ];
2609             let regs2 = [
2610                 gregs.get_r9(),
2611                 gregs.get_r8(),
2612                 gregs.get_rax(),
2613                 gregs.get_rcx(),
2614                 gregs.get_rdx(),
2615                 gregs.get_rsi(),
2616                 gregs.get_rdi(),
2617                 orig_rax,
2618             ];
2619 
2620             let sregs = self.vcpus[usize::from(vcpu_id)]
2621                 .lock()
2622                 .unwrap()
2623                 .vcpu
2624                 .get_sregs()
2625                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2626 
2627             debug!(
2628                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2629                 gregs.get_rip(),
2630                 gregs.get_rsp(),
2631                 sregs.gs.base,
2632                 sregs.cs.selector,
2633                 sregs.ss.selector,
2634                 sregs.ds.selector,
2635             );
2636 
2637             let regs = X86_64UserRegs {
2638                 regs1,
2639                 regs2,
2640                 rip: gregs.get_rip(),
2641                 cs: sregs.cs.selector as u64,
2642                 eflags: gregs.get_rflags(),
2643                 rsp: gregs.get_rsp(),
2644                 ss: sregs.ss.selector as u64,
2645                 fs_base: sregs.fs.base,
2646                 gs_base: sregs.gs.base,
2647                 ds: sregs.ds.selector as u64,
2648                 es: sregs.es.selector as u64,
2649                 fs: sregs.fs.selector as u64,
2650                 gs: sregs.gs.selector as u64,
2651             };
2652 
2654             let bytes: &[u8] = regs.as_slice();
2655             buf.resize(note_size as usize, 0);
2656             buf.splice(pos.., bytes.to_vec());
2657             buf.resize(note_size as usize, 0);
2658 
2659             coredump_file
2660                 .write(&buf)
2661                 .map_err(GuestDebuggableError::CoredumpFile)?;
2662         }
2663 
2664         Ok(())
2665     }
2666 
2667     fn cpu_write_vmm_note(
2668         &mut self,
2669         dump_state: &DumpState,
2670     ) -> std::result::Result<(), GuestDebuggableError> {
2671         let mut coredump_file = dump_state.file.as_ref().unwrap();
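        // Each vCPU also contributes one vendor note (name "QEMU", type 0)
        // whose descriptor is a full DumpCpusState: GPRs, rip/rflags,
        // segment and table registers, control registers and
        // MSR_KERNEL_GS_BASE.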
2672         for vcpu in &self.vcpus {
2673             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2674             let mut pos: usize = 0;
2675             let mut buf = vec![0; note_size as usize];
2676             let descsz = size_of::<DumpCpusState>();
2677             let vcpu_id = vcpu.lock().unwrap().id;
2678 
2679             let note = Elf64_Nhdr {
2680                 n_namesz: COREDUMP_NAME_SIZE,
2681                 n_descsz: descsz as u32,
2682                 n_type: 0,
2683             };
2684 
2685             let bytes: &[u8] = note.as_slice();
2686             buf.splice(0.., bytes.to_vec());
2687             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2688 
2689             buf.resize(pos + 4, 0);
2690             buf.splice(pos.., "QEMU".to_string().into_bytes());
2691 
2692             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2693 
2694             let gregs = self.vcpus[usize::from(vcpu_id)]
2695                 .lock()
2696                 .unwrap()
2697                 .vcpu
2698                 .get_regs()
2699                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2700 
2701             let regs1 = [
2702                 gregs.get_rax(),
2703                 gregs.get_rbx(),
2704                 gregs.get_rcx(),
2705                 gregs.get_rdx(),
2706                 gregs.get_rsi(),
2707                 gregs.get_rdi(),
2708                 gregs.get_rsp(),
2709                 gregs.get_rbp(),
2710             ];
2711 
2712             let regs2 = [
2713                 gregs.get_r8(),
2714                 gregs.get_r9(),
2715                 gregs.get_r10(),
2716                 gregs.get_r11(),
2717                 gregs.get_r12(),
2718                 gregs.get_r13(),
2719                 gregs.get_r14(),
2720                 gregs.get_r15(),
2721             ];
2722 
2723             let sregs = self.vcpus[usize::from(vcpu_id)]
2724                 .lock()
2725                 .unwrap()
2726                 .vcpu
2727                 .get_sregs()
2728                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2729 
2730             let mut msrs = vec![MsrEntry {
2731                 index: msr_index::MSR_KERNEL_GS_BASE,
2732                 ..Default::default()
2733             }];
2734 
2735             self.vcpus[vcpu_id as usize]
2736                 .lock()
2737                 .unwrap()
2738                 .vcpu
2739                 .get_msrs(&mut msrs)
2740                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2741             let kernel_gs_base = msrs[0].data;
2742 
2743             let cs = CpuSegment::new(sregs.cs);
2744             let ds = CpuSegment::new(sregs.ds);
2745             let es = CpuSegment::new(sregs.es);
2746             let fs = CpuSegment::new(sregs.fs);
2747             let gs = CpuSegment::new(sregs.gs);
2748             let ss = CpuSegment::new(sregs.ss);
2749             let ldt = CpuSegment::new(sregs.ldt);
2750             let tr = CpuSegment::new(sregs.tr);
2751             let gdt = CpuSegment::new_from_table(sregs.gdt);
2752             let idt = CpuSegment::new_from_table(sregs.idt);
2753             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2754             let regs = DumpCpusState {
2755                 version: 1,
2756                 size: size_of::<DumpCpusState>() as u32,
2757                 regs1,
2758                 regs2,
2759                 rip: gregs.get_rip(),
2760                 rflags: gregs.get_rflags(),
2761                 cs,
2762                 ds,
2763                 es,
2764                 fs,
2765                 gs,
2766                 ss,
2767                 ldt,
2768                 tr,
2769                 gdt,
2770                 idt,
2771                 cr,
2772                 kernel_gs_base,
2773             };
2774 
2775             let bytes: &[u8] = regs.as_slice();
2776             buf.resize(note_size as usize, 0);
2777             buf.splice(pos.., bytes.to_vec());
2778             buf.resize(note_size as usize, 0);
2779 
2780             coredump_file
2781                 .write_all(&buf)
2782                 .map_err(GuestDebuggableError::CoredumpFile)?;
2783         }
2784 
2785         Ok(())
2786     }
2787 }
2788 
2789 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2790 #[cfg(test)]
2791 mod tests {
2792     use arch::layout::{BOOT_STACK_POINTER, ZERO_PAGE_START};
2793     use arch::x86_64::interrupts::*;
2794     use arch::x86_64::regs::*;
2795     use hypervisor::arch::x86::{FpuState, LapicState};
2796     use hypervisor::StandardRegisters;
2797     use linux_loader::loader::bootparam::setup_header;
2798 
2799     #[test]
2800     fn test_setlint() {
2801         let hv = hypervisor::new().unwrap();
2802         let vm = hv.create_vm().expect("new VM fd creation failed");
2803         hv.check_required_extensions().unwrap();
2804         // Calling get_lapic will fail if there is no irqchip beforehand.
2805         vm.create_irq_chip().unwrap();
2806         let vcpu = vm.create_vcpu(0, None).unwrap();
2807         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2808 
2809         // Compute the value that is expected to represent LVT0 and LVT1.
2810         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2811         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2812         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2813         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2814 
2815         set_lint(&vcpu).unwrap();
2816 
2817         // Compute the value that represents LVT0 and LVT1 after set_lint.
2818         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2819         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2820         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2821         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2822         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2823     }
2824 
2825     #[test]
2826     fn test_setup_fpu() {
2827         let hv = hypervisor::new().unwrap();
2828         let vm = hv.create_vm().expect("new VM fd creation failed");
2829         let vcpu = vm.create_vcpu(0, None).unwrap();
2830         setup_fpu(&vcpu).unwrap();
2831 
2832         let expected_fpu: FpuState = FpuState {
2833             fcw: 0x37f,
2834             mxcsr: 0x1f80,
2835             ..Default::default()
2836         };
2837         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2838         // TODO: auto-generate kvm-related structures with PartialEq derived.
2839         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2840         // Setting the mxcsr register from FpuState inside setup_fpu has no effect:
2841         // see 'kvm_arch_vcpu_ioctl_set_fpu' in arch/x86/kvm/x86.c.
2842         // The mxcsr read back stays 0, so the assert below would fail. Decide whether
2843         // this check should be removed entirely.
2844         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2845     }
2846 
2847     #[test]
2848     fn test_setup_msrs() {
2849         use hypervisor::arch::x86::{msr_index, MsrEntry};
2850 
2851         let hv = hypervisor::new().unwrap();
2852         let vm = hv.create_vm().expect("new VM fd creation failed");
2853         let vcpu = vm.create_vcpu(0, None).unwrap();
2854         setup_msrs(&vcpu).unwrap();
2855 
2856         // This test will check against the last MSR entry configured (the tenth one).
2857         // See create_msr_entries for details.
2858         let mut msrs = vec![MsrEntry {
2859             index: msr_index::MSR_IA32_MISC_ENABLE,
2860             ..Default::default()
2861         }];
2862 
2863         // get_msrs returns the number of MSRs that it succeeded in reading. We only want
2864         // to read one in this test.
2865         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2866         assert_eq!(read_msrs, 1);
2867 
2868         // These are the official entries that were set up by setup_msrs. We need to assert
2869         // that the tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has
2870         // the data we expect.
2871         let entry_vec = vcpu.boot_msr_entries();
2872         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2873     }
2874 
2875     #[test]
2876     fn test_setup_regs_for_pvh() {
2877         let hv = hypervisor::new().unwrap();
2878         let vm = hv.create_vm().expect("new VM fd creation failed");
2879         let vcpu = vm.create_vcpu(0, None).unwrap();
2880 
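        // Per the PVH boot ABI, rbx holds the address of the hvm_start_info structure
        // at entry, and rflags is 0x2, i.e. only the architecturally reserved bit 1 set.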
2881         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2882         expected_regs.set_rflags(0x0000000000000002u64);
2883         expected_regs.set_rbx(arch::layout::PVH_INFO_START.0);
2884         expected_regs.set_rip(1);
2885 
2886         setup_regs(
2887             &vcpu,
2888             arch::EntryPoint {
2889                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2890                 setup_header: None,
2891             },
2892         )
2893         .unwrap();
2894 
2895         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2896         assert_eq!(actual_regs, expected_regs);
2897     }
2898 
2899     #[test]
2900     fn test_setup_regs_for_bzimage() {
2901         let hv = hypervisor::new().unwrap();
2902         let vm = hv.create_vm().expect("new VM fd creation failed");
2903         let vcpu = vm.create_vcpu(0, None).unwrap();
2904 
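        // Per the Linux 64-bit boot protocol, rsi holds the address of the boot_params
        // ("zero page") structure and rsp points at the boot stack.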
2905         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2906         expected_regs.set_rflags(0x0000000000000002u64);
2907         expected_regs.set_rip(1);
2908         expected_regs.set_rsp(BOOT_STACK_POINTER.0);
2909         expected_regs.set_rsi(ZERO_PAGE_START.0);
2910 
2911         setup_regs(
2912             &vcpu,
2913             arch::EntryPoint {
2914                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2915                 setup_header: Some(setup_header {
2916                     ..Default::default()
2917                 }),
2918             },
2919         )
2920         .unwrap();
2921 
2922         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2923         assert_eq!(actual_regs, expected_regs);
2924     }
2925 }
2926 
2927 #[cfg(target_arch = "aarch64")]
2928 #[cfg(test)]
2929 mod tests {
2930     use std::mem;
2931 
2932     use arch::aarch64::regs;
2933     use arch::layout;
2934     use hypervisor::kvm::aarch64::is_system_register;
2935     use hypervisor::kvm::kvm_bindings::{
2936         user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2937     };
2938     use hypervisor::{arm64_core_reg_id, offset_of};
2939 
2940     #[test]
2941     fn test_setup_regs() {
2942         let hv = hypervisor::new().unwrap();
2943         let vm = hv.create_vm().unwrap();
2944         let vcpu = vm.create_vcpu(0, None).unwrap();
2945 
2946         // Must fail when the vcpu is not yet initialized.
2947         vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap_err();
2948 
2949         let mut kvi = vcpu.create_vcpu_init();
2950         vm.get_preferred_target(&mut kvi).unwrap();
2951         vcpu.vcpu_init(&kvi).unwrap();
2952 
2953         vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap();
2954     }
2955 
2956     #[test]
2957     fn test_read_mpidr() {
2958         let hv = hypervisor::new().unwrap();
2959         let vm = hv.create_vm().unwrap();
2960         let vcpu = vm.create_vcpu(0, None).unwrap();
2961         let mut kvi = vcpu.create_vcpu_init();
2962         vm.get_preferred_target(&mut kvi).unwrap();
2963 
2964         // Must fail when the vcpu is not yet initialized.
2965         vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap_err();
2966 
2967         vcpu.vcpu_init(&kvi).unwrap();
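        // Bit 31 of MPIDR_EL1 is RES1, and vcpu 0 has all affinity fields at zero,
        // hence the expected 0x80000000.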
2968         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2969     }
2970 
2971     #[test]
2972     fn test_is_system_register() {
2973         let offset = offset_of!(user_pt_regs, pc);
2974         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2975         assert!(!is_system_register(regid));
2976         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2977         assert!(is_system_register(regid));
2978     }
2979 
2980     #[test]
2981     fn test_save_restore_core_regs() {
2982         let hv = hypervisor::new().unwrap();
2983         let vm = hv.create_vm().unwrap();
2984         let vcpu = vm.create_vcpu(0, None).unwrap();
2985         let mut kvi = vcpu.create_vcpu_init();
2986         vm.get_preferred_target(&mut kvi).unwrap();
2987 
2988         // Must fail when the vcpu is not yet initialized.
2989         assert_eq!(
2990             format!("{}", vcpu.get_regs().unwrap_err()),
2991             "Failed to get aarch64 core register: Exec format error (os error 8)"
2992         );
2993 
2994         let mut state = vcpu.create_standard_regs();
2995         assert_eq!(
2996             format!("{}", vcpu.set_regs(&state).unwrap_err()),
2997             "Failed to set aarch64 core register: Exec format error (os error 8)"
2998         );
2999 
3000         vcpu.vcpu_init(&kvi).unwrap();
3001         state = vcpu.get_regs().unwrap();
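        // 0x3C5 is PSR_MODE_EL1h (0x5) with the D, A, I and F exception bits masked
        // (0x3C0), i.e. the PSTATE value the vcpu is initialized with.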
3002         assert_eq!(state.get_pstate(), 0x3C5);
3003 
3004         vcpu.set_regs(&state).unwrap();
3005     }
3006 
3007     #[test]
3008     fn test_get_set_mpstate() {
3009         let hv = hypervisor::new().unwrap();
3010         let vm = hv.create_vm().unwrap();
3011         let vcpu = vm.create_vcpu(0, None).unwrap();
3012         let mut kvi = vcpu.create_vcpu_init();
3013         vm.get_preferred_target(&mut kvi).unwrap();
3014 
3015         let state = vcpu.get_mp_state().unwrap();
3016         vcpu.set_mp_state(state).unwrap();
3017     }
3018 }
3019