xref: /cloud-hypervisor/vmm/src/cpu.rs (revision 6f8bd27cf7629733582d930519e98d19e90afb16)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::CpusConfig;
15 #[cfg(feature = "guest_debug")]
16 use crate::coredump::{
17     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
18     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
19     NT_PRSTATUS,
20 };
21 #[cfg(feature = "guest_debug")]
22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
23 use crate::memory_manager::MemoryManager;
24 use crate::seccomp_filters::{get_seccomp_filter, Thread};
25 #[cfg(target_arch = "x86_64")]
26 use crate::vm::physical_bits;
27 use crate::GuestMemoryMmap;
28 use crate::CPU_MANAGER_SNAPSHOT_ID;
29 use acpi_tables::{aml, aml::Aml, sdt::Sdt};
30 use anyhow::anyhow;
31 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
32 use arch::aarch64::regs;
33 use arch::EntryPoint;
34 use arch::NumaNodes;
35 #[cfg(target_arch = "aarch64")]
36 use devices::gic::Gic;
37 use devices::interrupt_controller::InterruptController;
38 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
39 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
40 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
41 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
42 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
43 use hypervisor::aarch64::StandardRegisters;
44 #[cfg(feature = "guest_debug")]
45 use hypervisor::arch::x86::msr_index;
46 #[cfg(target_arch = "x86_64")]
47 use hypervisor::arch::x86::CpuIdEntry;
48 #[cfg(feature = "guest_debug")]
49 use hypervisor::arch::x86::MsrEntry;
50 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
51 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
52 #[cfg(target_arch = "aarch64")]
53 use hypervisor::kvm::kvm_bindings;
54 #[cfg(feature = "tdx")]
55 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
56 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
57 use libc::{c_void, siginfo_t};
58 #[cfg(feature = "guest_debug")]
59 use linux_loader::elf::Elf64_Nhdr;
60 use seccompiler::{apply_filter, SeccompAction};
61 use std::collections::BTreeMap;
62 #[cfg(feature = "guest_debug")]
63 use std::io::Write;
64 #[cfg(feature = "guest_debug")]
65 use std::mem::size_of;
66 use std::os::unix::thread::JoinHandleExt;
67 use std::sync::atomic::{AtomicBool, Ordering};
68 use std::sync::{Arc, Barrier, Mutex};
69 use std::{cmp, io, result, thread};
70 use thiserror::Error;
71 use tracer::trace_scoped;
72 use vm_device::BusDevice;
73 #[cfg(feature = "guest_debug")]
74 use vm_memory::ByteValued;
75 #[cfg(feature = "guest_debug")]
76 use vm_memory::{Bytes, GuestAddressSpace};
77 use vm_memory::{GuestAddress, GuestMemoryAtomic};
78 use vm_migration::{
79     Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
80     Transportable,
81 };
82 use vmm_sys_util::eventfd::EventFd;
83 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
84 
85 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
86 /// Extract the specified bits of a 64-bit integer.
87 /// For example, to extract 2 bits from offset 1 (zero-based) of `6u64`,
88 /// the following expression should return 3 (`0b11`):
89 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
90 ///
91 macro_rules! extract_bits_64 {
92     ($value: tt, $offset: tt, $length: tt) => {
93         ($value >> $offset) & (!0u64 >> (64 - $length))
94     };
95 }
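
// A minimal test sketch of the doc example above (illustrative only; compiled
// for test builds on aarch64 with the "guest_debug" feature):
#[cfg(all(test, target_arch = "aarch64", feature = "guest_debug"))]
mod extract_bits_64_example {
    #[test]
    fn extracts_bit_fields() {
        // Taking 2 bits starting at bit 1 of 0b0000_0110 yields 0b11.
        assert_eq!(extract_bits_64!(0b0000_0110u64, 1, 2), 0b11);
        // Aff1 of an MPIDR-style value lives in bits [15:8].
        assert_eq!(extract_bits_64!(0x0a01u64, 8, 8), 0x0a);
    }
}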
96 
97 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
98 
99 #[derive(Debug, Error)]
100 pub enum Error {
101     #[error("Error creating vCPU: {0}")]
102     VcpuCreate(#[source] anyhow::Error),
103 
104     #[error("Error running bCPU: {0}")]
105     VcpuRun(#[source] anyhow::Error),
106 
107     #[error("Error spawning vCPU thread: {0}")]
108     VcpuSpawn(#[source] io::Error),
109 
110     #[error("Error generating common CPUID: {0}")]
111     CommonCpuId(#[source] arch::Error),
112 
113     #[error("Error configuring vCPU: {0}")]
114     VcpuConfiguration(#[source] arch::Error),
115 
116     #[cfg(target_arch = "aarch64")]
117     #[error("Error fetching preferred target: {0}")]
118     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
119 
120     #[cfg(target_arch = "aarch64")]
121     #[error("Error initialising vCPU: {0}")]
122     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
123 
124     #[error("Failed to join on vCPU threads: {0:?}")]
125     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
126 
127     #[error("Error adding CpuManager to MMIO bus: {0}")]
128     BusError(#[source] vm_device::BusError),
129 
130     #[error("Requested vCPUs exceed maximum")]
131     DesiredVCpuCountExceedsMax,
132 
133     #[error("Cannot create seccomp filter: {0}")]
134     CreateSeccompFilter(#[source] seccompiler::Error),
135 
136     #[error("Cannot apply seccomp filter: {0}")]
137     ApplySeccompFilter(#[source] seccompiler::Error),
138 
139     #[error("Error starting vCPU after restore: {0}")]
140     StartRestoreVcpu(#[source] anyhow::Error),
141 
142     #[error("Unexpected VmExit")]
143     UnexpectedVmExit,
144 
145     #[error("Failed to allocate MMIO address for CpuManager")]
146     AllocateMmmioAddress,
147 
148     #[cfg(feature = "tdx")]
149     #[error("Error initializing TDX: {0}")]
150     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
151 
152     #[cfg(target_arch = "aarch64")]
153     #[error("Error initializing PMU: {0}")]
154     InitPmu(#[source] hypervisor::HypervisorCpuError),
155 
156     #[cfg(feature = "guest_debug")]
157     #[error("Error during CPU debug: {0}")]
158     CpuDebug(#[source] hypervisor::HypervisorCpuError),
159 
160     #[cfg(feature = "guest_debug")]
161     #[error("Error translating virtual address: {0}")]
162     TranslateVirtualAddress(#[source] anyhow::Error),
163 
164     #[cfg(target_arch = "x86_64")]
165     #[error("Error setting up AMX: {0}")]
166     AmxEnable(#[source] anyhow::Error),
167 }
168 pub type Result<T> = result::Result<T, Error>;
169 
170 #[cfg(target_arch = "x86_64")]
171 #[allow(dead_code)]
172 #[repr(packed)]
173 struct LocalApic {
174     pub r#type: u8,
175     pub length: u8,
176     pub processor_id: u8,
177     pub apic_id: u8,
178     pub flags: u32,
179 }
180 
181 #[allow(dead_code)]
182 #[repr(packed)]
183 #[derive(Default)]
184 struct Ioapic {
185     pub r#type: u8,
186     pub length: u8,
187     pub ioapic_id: u8,
188     _reserved: u8,
189     pub apic_address: u32,
190     pub gsi_base: u32,
191 }
192 
193 #[cfg(target_arch = "aarch64")]
194 #[allow(dead_code)]
195 #[repr(packed)]
196 struct GicC {
197     pub r#type: u8,
198     pub length: u8,
199     pub reserved0: u16,
200     pub cpu_interface_number: u32,
201     pub uid: u32,
202     pub flags: u32,
203     pub parking_version: u32,
204     pub performance_interrupt: u32,
205     pub parked_address: u64,
206     pub base_address: u64,
207     pub gicv_base_address: u64,
208     pub gich_base_address: u64,
209     pub vgic_interrupt: u32,
210     pub gicr_base_address: u64,
211     pub mpidr: u64,
212     pub proc_power_effi_class: u8,
213     pub reserved1: u8,
214     pub spe_overflow_interrupt: u16,
215 }
216 
217 #[cfg(target_arch = "aarch64")]
218 #[allow(dead_code)]
219 #[repr(packed)]
220 struct GicD {
221     pub r#type: u8,
222     pub length: u8,
223     pub reserved0: u16,
224     pub gic_id: u32,
225     pub base_address: u64,
226     pub global_irq_base: u32,
227     pub version: u8,
228     pub reserved1: [u8; 3],
229 }
230 
231 #[cfg(target_arch = "aarch64")]
232 #[allow(dead_code)]
233 #[repr(packed)]
234 struct GicR {
235     pub r#type: u8,
236     pub length: u8,
237     pub reserved: u16,
238     pub base_address: u64,
239     pub range_length: u32,
240 }
241 
242 #[cfg(target_arch = "aarch64")]
243 #[allow(dead_code)]
244 #[repr(packed)]
245 struct GicIts {
246     pub r#type: u8,
247     pub length: u8,
248     pub reserved0: u16,
249     pub translation_id: u32,
250     pub base_address: u64,
251     pub reserved1: u32,
252 }
253 
254 #[cfg(target_arch = "aarch64")]
255 #[allow(dead_code)]
256 #[repr(packed)]
257 struct ProcessorHierarchyNode {
258     pub r#type: u8,
259     pub length: u8,
260     pub reserved: u16,
261     pub flags: u32,
262     pub parent: u32,
263     pub acpi_processor_id: u32,
264     pub num_private_resources: u32,
265 }
266 
267 #[allow(dead_code)]
268 #[repr(packed)]
269 #[derive(Default)]
270 struct InterruptSourceOverride {
271     pub r#type: u8,
272     pub length: u8,
273     pub bus: u8,
274     pub source: u8,
275     pub gsi: u32,
276     pub flags: u16,
277 }
278 
279 #[cfg(feature = "guest_debug")]
280 macro_rules! round_up {
281     ($n:expr,$d:expr) => {
282         (($n + $d - 1) / $d) * $d
283     };
284 }
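
// A small sketch of the rounding behavior (illustrative only; compiled for
// test builds with the "guest_debug" feature): `round_up!(n, d)` yields the
// smallest multiple of `d` that is >= `n`.
#[cfg(all(test, feature = "guest_debug"))]
mod round_up_example {
    #[test]
    fn rounds_to_next_multiple() {
        assert_eq!(round_up!(9usize, 8usize), 16);
        assert_eq!(round_up!(16usize, 8usize), 16);
    }
}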
285 
286 /// A wrapper around creating and using a kvm-based VCPU.
287 pub struct Vcpu {
288     // The hypervisor abstracted CPU.
289     vcpu: Arc<dyn hypervisor::Vcpu>,
290     id: u8,
291     #[cfg(target_arch = "aarch64")]
292     mpidr: u64,
293     saved_state: Option<CpuState>,
294 }
295 
296 impl Vcpu {
297     /// Constructs a new VCPU for `vm`.
298     ///
299     /// # Arguments
300     ///
301     /// * `id` - Represents the CPU number between [0, max vcpus).
302     /// * `vm` - The virtual machine this vcpu will get attached to.
303     /// * `vm_ops` - Optional object for exit handling.
304     pub fn new(
305         id: u8,
306         vm: &Arc<dyn hypervisor::Vm>,
307         vm_ops: Option<Arc<dyn VmOps>>,
308     ) -> Result<Self> {
309         let vcpu = vm
310             .create_vcpu(id, vm_ops)
311             .map_err(|e| Error::VcpuCreate(e.into()))?;
312         // Initially the cpuid per vCPU is the one supported by this VM.
313         Ok(Vcpu {
314             vcpu,
315             id,
316             #[cfg(target_arch = "aarch64")]
317             mpidr: 0,
318             saved_state: None,
319         })
320     }
321 
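    // A hedged usage sketch, assuming a `vm: Arc<dyn hypervisor::Vm>` created
    // elsewhere (vCPU 0, no VmOps exit handler):
    //
    //     let vcpu = Vcpu::new(0, &vm, None)?;
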
322     /// Configures a vCPU. Should be called once per vCPU, right after creation.
323     ///
324     /// # Arguments
325     ///
326     /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
327     /// * `vm_memory` - Guest memory.
328     /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
329     pub fn configure(
330         &mut self,
331         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
332         kernel_entry_point: Option<EntryPoint>,
333         #[cfg(target_arch = "x86_64")] vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
334         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
335         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
336     ) -> Result<()> {
337         #[cfg(target_arch = "aarch64")]
338         {
339             self.init(vm)?;
340             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point)
341                 .map_err(Error::VcpuConfiguration)?;
342         }
343         info!("Configuring vCPU: cpu_id = {}", self.id);
344         #[cfg(target_arch = "x86_64")]
345         arch::configure_vcpu(
346             &self.vcpu,
347             self.id,
348             kernel_entry_point,
349             vm_memory,
350             cpuid,
351             kvm_hyperv,
352         )
353         .map_err(Error::VcpuConfiguration)?;
354 
355         Ok(())
356     }
357 
358     /// Gets the MPIDR register value.
359     #[cfg(target_arch = "aarch64")]
360     pub fn get_mpidr(&self) -> u64 {
361         self.mpidr
362     }
363 
364     /// Gets the saved vCPU state.
365     #[cfg(target_arch = "aarch64")]
366     pub fn get_saved_state(&self) -> Option<CpuState> {
367         self.saved_state.clone()
368     }
369 
370     /// Initializes an aarch64-specific vCPU for booting Linux.
371     #[cfg(target_arch = "aarch64")]
372     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
373         let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
374 
375         // This reads back the kernel's preferred target type.
376         vm.get_preferred_target(&mut kvi)
377             .map_err(Error::VcpuArmPreferredTarget)?;
378         // We already checked that the capability is supported.
379         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
380         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
381         // Non-boot cpus are powered off initially.
382         if self.id > 0 {
383             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
384         }
385         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
386     }
387 
388     /// Runs the VCPU until it exits, returning the reason.
389     ///
390     /// Note that the state of the VCPU and associated VM must be setup first for this to do
391     /// anything useful.
392     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
393         self.vcpu.run()
394     }
395 }
396 
397 const VCPU_SNAPSHOT_ID: &str = "vcpu";
398 impl Pausable for Vcpu {}
399 impl Snapshottable for Vcpu {
400     fn id(&self) -> String {
401         VCPU_SNAPSHOT_ID.to_string()
402     }
403 
404     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
405         let saved_state = self
406             .vcpu
407             .state()
408             .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?;
409 
410         let mut vcpu_snapshot = Snapshot::new(&format!("{:03}", self.id));
411         vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state(
412             VCPU_SNAPSHOT_ID,
413             &saved_state,
414         )?);
415 
416         self.saved_state = Some(saved_state);
417 
418         Ok(vcpu_snapshot)
419     }
420 
421     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
422         let saved_state: CpuState = snapshot.to_state(VCPU_SNAPSHOT_ID)?;
423 
424         self.vcpu
425             .set_state(&saved_state)
426             .map_err(|e| MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e)))?;
427 
428         self.saved_state = Some(saved_state);
429 
430         Ok(())
431     }
432 }
433 
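// A hedged snapshot round-trip sketch, assuming a paused `vcpu: Vcpu` obtained
// elsewhere:
//
//     let snap = vcpu.snapshot()?;   // captures CpuState via self.vcpu.state()
//     vcpu.restore(snap)?;           // re-applies it via self.vcpu.set_state()
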
434 pub struct CpuManager {
435     hypervisor_type: HypervisorType,
436     config: CpusConfig,
437     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
438     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
439     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
440     vm_memory: GuestMemoryAtomic<GuestMemoryMmap>,
441     #[cfg(target_arch = "x86_64")]
442     cpuid: Vec<CpuIdEntry>,
443     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
444     vm: Arc<dyn hypervisor::Vm>,
445     vcpus_kill_signalled: Arc<AtomicBool>,
446     vcpus_pause_signalled: Arc<AtomicBool>,
447     exit_evt: EventFd,
448     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
449     reset_evt: EventFd,
450     #[cfg(feature = "guest_debug")]
451     vm_debug_evt: EventFd,
452     vcpu_states: Vec<VcpuState>,
453     selected_cpu: u8,
454     vcpus: Vec<Arc<Mutex<Vcpu>>>,
455     seccomp_action: SeccompAction,
456     vm_ops: Arc<dyn VmOps>,
457     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
458     acpi_address: Option<GuestAddress>,
459     proximity_domain_per_cpu: BTreeMap<u8, u32>,
460     affinity: BTreeMap<u8, Vec<u8>>,
461     dynamic: bool,
462 }
463 
464 const CPU_ENABLE_FLAG: usize = 0;
465 const CPU_INSERTING_FLAG: usize = 1;
466 const CPU_REMOVING_FLAG: usize = 2;
467 const CPU_EJECT_FLAG: usize = 3;
468 
469 const CPU_STATUS_OFFSET: u64 = 4;
470 const CPU_SELECTION_OFFSET: u64 = 0;
471 
472 impl BusDevice for CpuManager {
473     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
474         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
475         data.fill(0);
476 
477         match offset {
478             CPU_SELECTION_OFFSET => {
479                 data[0] = self.selected_cpu;
480             }
481             CPU_STATUS_OFFSET => {
482                 if self.selected_cpu < self.max_vcpus() {
483                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
484                     if state.active() {
485                         data[0] |= 1 << CPU_ENABLE_FLAG;
486                     }
487                     if state.inserting {
488                         data[0] |= 1 << CPU_INSERTING_FLAG;
489                     }
490                     if state.removing {
491                         data[0] |= 1 << CPU_REMOVING_FLAG;
492                     }
493                 } else {
494                     warn!("Out of range vCPU id: {}", self.selected_cpu);
495                 }
496             }
497             _ => {
498                 warn!(
499                     "Unexpected offset for accessing CPU manager device: {:#}",
500                     offset
501                 );
502             }
503         }
504     }
505 
506     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
507         match offset {
508             CPU_SELECTION_OFFSET => {
509                 self.selected_cpu = data[0];
510             }
511             CPU_STATUS_OFFSET => {
512                 if self.selected_cpu < self.max_vcpus() {
513                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
514                     // The ACPI code writes back a 1 to acknowledge the insertion
515                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
516                         && state.inserting
517                     {
518                         state.inserting = false;
519                     }
520                     // Ditto for removal
521                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
522                         && state.removing
523                     {
524                         state.removing = false;
525                     }
526                     // Trigger removal of vCPU
527                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
528                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
529                             error!("Error removing vCPU: {:?}", e);
530                         }
531                     }
532                 } else {
533                     warn!("Out of range vCPU id: {}", self.selected_cpu);
534                 }
535             }
536             _ => {
537                 warn!(
538                     "Unexpected offset for accessing CPU manager device: {:#}",
539                     offset
540                 );
541             }
542         }
543         None
544     }
545 }
546 
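// A hedged sketch of the guest-visible register protocol implemented above:
// write a vCPU index to CPU_SELECTION_OFFSET, then read one status byte at
// CPU_STATUS_OFFSET (bit 0: enabled, bit 1: inserting, bit 2: removing).
// `manager` stands for an already-constructed CpuManager:
//
//     manager.write(0, CPU_SELECTION_OFFSET, &[2]);   // select vCPU 2
//     let mut status = [0u8; 1];
//     manager.read(0, CPU_STATUS_OFFSET, &mut status);
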
547 #[derive(Default)]
548 struct VcpuState {
549     inserting: bool,
550     removing: bool,
551     handle: Option<thread::JoinHandle<()>>,
552     kill: Arc<AtomicBool>,
553     vcpu_run_interrupted: Arc<AtomicBool>,
554 }
555 
556 impl VcpuState {
557     fn active(&self) -> bool {
558         self.handle.is_some()
559     }
560 
561     fn signal_thread(&self) {
562         if let Some(handle) = self.handle.as_ref() {
563             loop {
564                 // SAFETY: FFI call with correct arguments
565                 unsafe {
566                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
567                 }
568                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
569                     break;
570                 } else {
571                     // This is more effective than thread::yield_now() at
572                     // avoiding a priority inversion with the vCPU thread
573                     thread::sleep(std::time::Duration::from_millis(1));
574                 }
575             }
576         }
577     }
578 
579     fn join_thread(&mut self) -> Result<()> {
580         if let Some(handle) = self.handle.take() {
581             handle.join().map_err(Error::ThreadCleanup)?
582         }
583 
584         Ok(())
585     }
586 
587     fn unpark_thread(&self) {
588         if let Some(handle) = self.handle.as_ref() {
589             handle.thread().unpark()
590         }
591     }
592 }
593 
594 impl CpuManager {
595     #[allow(unused_variables)]
596     #[allow(clippy::too_many_arguments)]
597     pub fn new(
598         config: &CpusConfig,
599         memory_manager: &Arc<Mutex<MemoryManager>>,
600         vm: Arc<dyn hypervisor::Vm>,
601         exit_evt: EventFd,
602         reset_evt: EventFd,
603         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
604         hypervisor: Arc<dyn hypervisor::Hypervisor>,
605         seccomp_action: SeccompAction,
606         vm_ops: Arc<dyn VmOps>,
607         #[cfg(feature = "tdx")] tdx_enabled: bool,
608         numa_nodes: &NumaNodes,
609     ) -> Result<Arc<Mutex<CpuManager>>> {
610         let guest_memory = memory_manager.lock().unwrap().guest_memory();
611         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
612         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
613         let hypervisor_type = hypervisor.hypervisor_type();
614 
615         #[cfg(target_arch = "x86_64")]
616         let sgx_epc_sections = memory_manager
617             .lock()
618             .unwrap()
619             .sgx_epc_region()
620             .as_ref()
621             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
622         #[cfg(target_arch = "x86_64")]
623         let cpuid = {
624             let phys_bits = physical_bits(config.max_phys_bits);
625             arch::generate_common_cpuid(
626                 hypervisor,
627                 config
628                     .topology
629                     .clone()
630                     .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)),
631                 sgx_epc_sections,
632                 phys_bits,
633                 config.kvm_hyperv,
634                 #[cfg(feature = "tdx")]
635                 tdx_enabled,
636             )
637             .map_err(Error::CommonCpuId)?
638         };
639         #[cfg(target_arch = "x86_64")]
640         if config.features.amx {
641             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
642             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
643             const XFEATURE_XTILEDATA: usize = 18;
644             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
645 
646             // SAFETY: the syscall is only modifying kernel-internal
647             // data structures that the kernel is itself expected to safeguard.
648             let amx_tile = unsafe {
649                 libc::syscall(
650                     libc::SYS_arch_prctl,
651                     ARCH_REQ_XCOMP_GUEST_PERM,
652                     XFEATURE_XTILEDATA,
653                 )
654             };
655 
656             if amx_tile != 0 {
657                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
658             } else {
659                 let mask: usize = 0;
660                 // SAFETY: `mask` is only written through the pointer handed to the
661                 // syscall (hence no `mut` binding is needed) and isn't in use elsewhere.
662                 let result = unsafe {
663                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
664                 };
665                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
666                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
667                 }
668             }
669         }
670 
671         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
672             let mut cpu_list = Vec::new();
673             for (proximity_domain, numa_node) in numa_nodes.iter() {
674                 for cpu in numa_node.cpus.iter() {
675                     cpu_list.push((*cpu, *proximity_domain))
676                 }
677             }
678             cpu_list
679         }
680         .into_iter()
681         .collect();
682 
683         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
684             cpu_affinity
685                 .iter()
686                 .map(|a| (a.vcpu, a.host_cpus.clone()))
687                 .collect()
688         } else {
689             BTreeMap::new()
690         };
691 
692         #[cfg(feature = "tdx")]
693         let dynamic = !tdx_enabled;
694         #[cfg(not(feature = "tdx"))]
695         let dynamic = true;
696 
697         Ok(Arc::new(Mutex::new(CpuManager {
698             hypervisor_type,
699             config: config.clone(),
700             interrupt_controller: None,
701             vm_memory: guest_memory,
702             #[cfg(target_arch = "x86_64")]
703             cpuid,
704             vm,
705             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
706             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
707             vcpu_states,
708             exit_evt,
709             reset_evt,
710             #[cfg(feature = "guest_debug")]
711             vm_debug_evt,
712             selected_cpu: 0,
713             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
714             seccomp_action,
715             vm_ops,
716             acpi_address: None,
717             proximity_domain_per_cpu,
718             affinity,
719             dynamic,
720         })))
721     }
722 
723     fn create_vcpu(&mut self, cpu_id: u8) -> Result<Arc<Mutex<Vcpu>>> {
724         info!("Creating vCPU: cpu_id = {}", cpu_id);
725 
726         let vcpu = Arc::new(Mutex::new(Vcpu::new(
727             cpu_id,
728             &self.vm,
729             Some(self.vm_ops.clone()),
730         )?));
731 
732         // Adding vCPU to the CpuManager's vCPU list.
733         self.vcpus.push(vcpu.clone());
734 
735         Ok(vcpu)
736     }
737 
738     pub fn configure_vcpu(
739         &self,
740         vcpu: Arc<Mutex<Vcpu>>,
741         entry_point: Option<EntryPoint>,
742         snapshot: Option<Snapshot>,
743     ) -> Result<()> {
744         let mut vcpu = vcpu.lock().unwrap();
745 
746         if let Some(snapshot) = snapshot {
747             // AArch64 vCPUs must be initialized after creation.
748             #[cfg(target_arch = "aarch64")]
749             vcpu.init(&self.vm)?;
750 
751             vcpu.restore(snapshot).expect("Failed to restore vCPU");
752         } else {
753             #[cfg(target_arch = "x86_64")]
754             vcpu.configure(
755                 entry_point,
756                 &self.vm_memory,
757                 self.cpuid.clone(),
758                 self.config.kvm_hyperv,
759             )
760             .expect("Failed to configure vCPU");
761 
762             #[cfg(target_arch = "aarch64")]
763             vcpu.configure(&self.vm, entry_point)
764                 .expect("Failed to configure vCPU");
765         }
766 
767         Ok(())
768     }
769 
770     /// Only create new vCPUs if there aren't any inactive ones to reuse
771     fn create_vcpus(&mut self, desired_vcpus: u8) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
772         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
773         info!(
774             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
775             desired_vcpus,
776             self.config.max_vcpus,
777             self.vcpus.len(),
778             self.present_vcpus()
779         );
780 
781         if desired_vcpus > self.config.max_vcpus {
782             return Err(Error::DesiredVCpuCountExceedsMax);
783         }
784 
785         // Only create vCPUs in excess of all the allocated vCPUs.
786         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
787             vcpus.push(self.create_vcpu(cpu_id)?);
788         }
789 
790         Ok(vcpus)
791     }
792 
793     #[cfg(target_arch = "aarch64")]
794     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
795         for cpu in self.vcpus.iter() {
796             let cpu = cpu.lock().unwrap();
797             // Check if the PMU attribute is available; if not, log it and skip PMU init.
798             if cpu.vcpu.has_pmu_support() {
799                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
800             } else {
801                 debug!(
802                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
803                     cpu.id
804                 );
805                 return Ok(false);
806             }
807         }
808 
809         Ok(true)
810     }
811 
812     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
813         self.vcpus.clone()
814     }
815 
816     fn start_vcpu(
817         &mut self,
818         vcpu: Arc<Mutex<Vcpu>>,
819         vcpu_id: u8,
820         vcpu_thread_barrier: Arc<Barrier>,
821         inserting: bool,
822     ) -> Result<()> {
823         let reset_evt = self.reset_evt.try_clone().unwrap();
824         let exit_evt = self.exit_evt.try_clone().unwrap();
825         #[cfg(feature = "guest_debug")]
826         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
827         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
828         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
829         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
830 
831         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
832         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
833             .vcpu_run_interrupted
834             .clone();
835         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
836 
837         // Prepare the CPU set the current vCPU is expected to run on.
838         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
839             // SAFETY: all zeros is a valid pattern
840             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
841             // SAFETY: FFI call, trivially safe
842             unsafe { libc::CPU_ZERO(&mut cpuset) };
843             for host_cpu in host_cpus {
844                 // SAFETY: FFI call, trivially safe
845                 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) };
846             }
847             cpuset
848         });
849 
850         // Retrieve seccomp filter for vcpu thread
851         let vcpu_seccomp_filter =
852             get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type)
853                 .map_err(Error::CreateSeccompFilter)?;
854 
855         #[cfg(target_arch = "x86_64")]
856         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
857 
858         info!("Starting vCPU: cpu_id = {}", vcpu_id);
859 
860         let handle = Some(
861             thread::Builder::new()
862                 .name(format!("vcpu{}", vcpu_id))
863                 .spawn(move || {
864                     // Schedule the thread to run on the expected CPU set
865                     if let Some(cpuset) = cpuset.as_ref() {
866                         // SAFETY: FFI call with correct arguments
867                         let ret = unsafe {
868                             libc::sched_setaffinity(
869                                 0,
870                                 std::mem::size_of::<libc::cpu_set_t>(),
871                                 cpuset as *const libc::cpu_set_t,
872                             )
873                         };
874 
875                         if ret != 0 {
876                             error!(
877                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
878                                 vcpu_id,
879                                 io::Error::last_os_error()
880                             );
881                             return;
882                         }
883                     }
884 
885                     // Apply seccomp filter for vcpu thread.
886                     if !vcpu_seccomp_filter.is_empty() {
887                         if let Err(e) =
888                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
889                         {
890                             error!("Error applying seccomp filter: {:?}", e);
891                             return;
892                         }
893                     }
894                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
895                     // Register an async-signal-safe no-op handler; the signal merely interrupts KVM_RUN on the vCPU threads.
896                     register_signal_handler(SIGRTMIN(), handle_signal)
897                         .expect("Failed to register vcpu signal handler");
898                     // Block until all CPUs are ready.
899                     vcpu_thread_barrier.wait();
900 
901                     std::panic::catch_unwind(move || {
902                         loop {
903                             // If we are being told to pause, we park the thread
904                             // until the pause boolean is toggled.
905                             // The resume operation is responsible for toggling
906                             // the boolean and unpark the thread.
907                             // We enter a loop because park() could spuriously
908                             // return. We will then park() again unless the
909                             // pause boolean has been toggled.
910 
911                             // Need to use Ordering::SeqCst as we have multiple
912                             // loads and stores to different atomics and we need
913                             // to see them in a consistent order in all threads
914 
915                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
916                                 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
917                                 // completed by returning to KVM_RUN. From the kernel docs:
918                                 //
919                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
920                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
921                                 // operations are complete (and guest state is consistent) only after userspace
922                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
923                                 // incomplete operations and then check for pending signals.
924                                 // The pending state of the operation is not preserved in state which is
925                                 // visible to userspace, thus userspace should ensure that the operation is
926                                 // completed before performing a live migration.  Userspace can re-enter the
927                                 // guest with an unmasked signal pending or with the immediate_exit field set
928                                 // to complete pending operations without allowing any further instructions
929                                 // to be executed.
930 
931                                 #[cfg(feature = "kvm")]
932                                 {
933                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
934                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
935                                         error!("Unexpected VM exit on \"immediate_exit\" run");
936                                         break;
937                                     }
938                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
939                                 }
940 
941                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
942                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
943                                     thread::park();
944                                 }
945                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
946                             }
947 
948                             // We've been told to terminate
949                             if vcpu_kill_signalled.load(Ordering::SeqCst)
950                                 || vcpu_kill.load(Ordering::SeqCst)
951                             {
952                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
953                                 break;
954                             }
955 
956                             #[cfg(feature = "tdx")]
957                             let mut vcpu = vcpu.lock().unwrap();
958                             #[cfg(not(feature = "tdx"))]
959                             let vcpu = vcpu.lock().unwrap();
960                             // A triple fault surfaces as VmExit::Reset below, which triggers a VM reset
961                             match vcpu.run() {
962                                 Ok(run) => match run {
963                                     #[cfg(feature = "kvm")]
964                                     VmExit::Debug => {
965                                         info!("VmExit::Debug");
966                                         #[cfg(feature = "guest_debug")]
967                                         {
968                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
969                                             let raw_tid = get_raw_tid(vcpu_id as usize);
970                                             vm_debug_evt.write(raw_tid as u64).unwrap();
971                                         }
972                                     }
973                                     #[cfg(target_arch = "x86_64")]
974                                     VmExit::IoapicEoi(vector) => {
975                                         if let Some(interrupt_controller) =
976                                             &interrupt_controller_clone
977                                         {
978                                             interrupt_controller
979                                                 .lock()
980                                                 .unwrap()
981                                                 .end_of_interrupt(vector);
982                                         }
983                                     }
984                                     VmExit::Ignore => {}
985                                     VmExit::Hyperv => {}
986                                     VmExit::Reset => {
987                                         info!("VmExit::Reset");
988                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
989                                         reset_evt.write(1).unwrap();
990                                         break;
991                                     }
992                                     VmExit::Shutdown => {
993                                         info!("VmExit::Shutdown");
994                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
995                                         exit_evt.write(1).unwrap();
996                                         break;
997                                     }
998                                     #[cfg(feature = "tdx")]
999                                     VmExit::Tdx => {
1000                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1001                                             match vcpu.get_tdx_exit_details() {
1002                                                 Ok(details) => match details {
1003                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1004                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1005                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1006                                                     }
1007                                                 },
1008                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1009                                             }
1010                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1011                                         } else {
1012                                             // We should never reach this
1013                                             // point; getting here means the
1014                                             // design of this code is wrong.
1015                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1016                                         }
1017                                     }
1018                                     _ => {
1019                                         error!(
1020                                             "VCPU generated error: {:?}",
1021                                             Error::UnexpectedVmExit
1022                                         );
1023                                         break;
1024                                     }
1025                                 },
1026 
1027                                 Err(e) => {
1028                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1029                                     break;
1030                                 }
1031                             }
1032 
1033                             // We've been told to terminate
1034                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1035                                 || vcpu_kill.load(Ordering::SeqCst)
1036                             {
1037                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1038                                 break;
1039                             }
1040                         }
1041                     })
1042                     .or_else(|_| {
1043                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1044                         error!("vCPU thread panicked");
1045                         panic_exit_evt.write(1)
1046                     })
1047                     .ok();
1048                 })
1049                 .map_err(Error::VcpuSpawn)?,
1050         );
1051 
1052         // On hotplug, calls into this function carry no entry point. It is for
1053         // those hotplug CPU additions that we need to set the inserting flag.
1054         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1055         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1056 
1057         Ok(())
1058     }
1059 
1060     /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
1061     fn activate_vcpus(
1062         &mut self,
1063         desired_vcpus: u8,
1064         inserting: bool,
1065         paused: Option<bool>,
1066     ) -> Result<()> {
1067         if desired_vcpus > self.config.max_vcpus {
1068             return Err(Error::DesiredVCpuCountExceedsMax);
1069         }
1070 
1071         let vcpu_thread_barrier = Arc::new(Barrier::new(
1072             (desired_vcpus - self.present_vcpus() + 1) as usize,
1073         ));
1074 
1075         if let Some(paused) = paused {
1076             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1077         }
1078 
1079         info!(
1080             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1081             desired_vcpus,
1082             self.vcpus.len(),
1083             self.present_vcpus(),
1084             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1085         );
1086 
1087         // This reuses any inactive vCPUs as well as any that were newly created
1088         for vcpu_id in self.present_vcpus()..desired_vcpus {
1089             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1090             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1091         }
1092 
1093         // Unblock all CPU threads.
1094         vcpu_thread_barrier.wait();
1095         Ok(())
1096     }
1097 
1098     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1099         // Mark vCPUs for removal, actual removal happens on ejection
1100         for cpu_id in desired_vcpus..self.present_vcpus() {
1101             self.vcpu_states[usize::from(cpu_id)].removing = true;
1102         }
1103     }
1104 
1105     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1106         info!("Removing vCPU: cpu_id = {}", cpu_id);
1107         let mut state = &mut self.vcpu_states[usize::from(cpu_id)];
1108         state.kill.store(true, Ordering::SeqCst);
1109         state.signal_thread();
1110         state.join_thread()?;
1111         state.handle = None;
1112 
1113         // Once the thread has exited, clear the "kill" flag so that it can be reused
1114         state.kill.store(false, Ordering::SeqCst);
1115 
1116         Ok(())
1117     }
1118 
1119     pub fn create_boot_vcpus(&mut self) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1120         trace_scoped!("create_boot_vcpus");
1121 
1122         self.create_vcpus(self.boot_vcpus())
1123     }
1124 
1125     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1126     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1127         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1128     }
1129 
1130     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1131         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1132             .map_err(|e| {
1133                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1134             })?;
1135 
1136         Ok(())
1137     }
1138 
1139     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1140         if desired_vcpus == self.present_vcpus() {
1141             return Ok(false);
1142         }
1143 
1144         if !self.dynamic {
1145             return Ok(false);
1146         }
1147 
1148         match desired_vcpus.cmp(&self.present_vcpus()) {
1149             cmp::Ordering::Greater => {
1150                 let vcpus = self.create_vcpus(desired_vcpus)?;
1151                 for vcpu in vcpus {
1152                     self.configure_vcpu(vcpu, None, None)?
1153                 }
1154                 self.activate_vcpus(desired_vcpus, true, None)?;
1155                 Ok(true)
1156             }
1157             cmp::Ordering::Less => {
1158                 self.mark_vcpus_for_removal(desired_vcpus);
1159                 Ok(true)
1160             }
1161             _ => Ok(false),
1162         }
1163     }
1164 
1165     pub fn shutdown(&mut self) -> Result<()> {
1166         // Tell the vCPUs to stop themselves next time they go through the loop
1167         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1168 
1169         // Toggle the vCPUs pause boolean
1170         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1171 
1172         // Unpark all the VCPU threads.
1173         for state in self.vcpu_states.iter() {
1174             state.unpark_thread();
1175         }
1176 
1177         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1178         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1179         // above.
1180         for state in self.vcpu_states.iter() {
1181             state.signal_thread();
1182         }
1183 
1184         // Wait for all the threads to finish. This removes the state from the vector.
1185         for mut state in self.vcpu_states.drain(..) {
1186             state.join_thread()?;
1187         }
1188 
1189         Ok(())
1190     }
1191 
1192     #[cfg(feature = "tdx")]
1193     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1194         for vcpu in &self.vcpus {
1195             vcpu.lock()
1196                 .unwrap()
1197                 .vcpu
1198                 .tdx_init(hob_address)
1199                 .map_err(Error::InitializeTdx)?;
1200         }
1201         Ok(())
1202     }
1203 
1204     pub fn boot_vcpus(&self) -> u8 {
1205         self.config.boot_vcpus
1206     }
1207 
1208     pub fn max_vcpus(&self) -> u8 {
1209         self.config.max_vcpus
1210     }
1211 
1212     #[cfg(target_arch = "x86_64")]
1213     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1214         self.cpuid.clone()
1215     }
1216 
1217     fn present_vcpus(&self) -> u8 {
1218         self.vcpu_states
1219             .iter()
1220             .fold(0, |acc, state| acc + state.active() as u8)
1221     }
1222 
1223     #[cfg(target_arch = "aarch64")]
1224     pub fn get_mpidrs(&self) -> Vec<u64> {
1225         self.vcpus
1226             .iter()
1227             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1228             .collect()
1229     }
1230 
1231     #[cfg(target_arch = "aarch64")]
1232     pub fn get_saved_states(&self) -> Vec<CpuState> {
1233         self.vcpus
1234             .iter()
1235             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1236             .collect()
1237     }
1238 
1239     #[cfg(target_arch = "aarch64")]
1240     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1241         self.config
1242             .topology
1243             .clone()
1244             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1245     }
1246 
1247     pub fn create_madt(&self) -> Sdt {
1248         use crate::acpi;
1249         // This is also checked in the command-line parsing.
1250         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1251 
1252         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1253         #[cfg(target_arch = "x86_64")]
1254         {
1255             madt.write(36, arch::layout::APIC_START);
1256 
1257             for cpu in 0..self.config.max_vcpus {
1258                 let lapic = LocalApic {
1259                     r#type: acpi::ACPI_APIC_PROCESSOR,
1260                     length: 8,
1261                     processor_id: cpu,
1262                     apic_id: cpu,
1263                     flags: if cpu < self.config.boot_vcpus {
1264                         1 << MADT_CPU_ENABLE_FLAG
1265                     } else {
1266                         0
1267                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1268                 };
1269                 madt.append(lapic);
1270             }
1271 
1272             madt.append(Ioapic {
1273                 r#type: acpi::ACPI_APIC_IO,
1274                 length: 12,
1275                 ioapic_id: 0,
1276                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1277                 gsi_base: 0,
1278                 ..Default::default()
1279             });
1280 
1281             madt.append(InterruptSourceOverride {
1282                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1283                 length: 10,
1284                 bus: 0,
1285                 source: 4,
1286                 gsi: 4,
1287                 flags: 0,
1288             });
1289         }
1290 
1291         #[cfg(target_arch = "aarch64")]
1292         {
1293             /* Notes:
1294              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1295              */
1296 
1297             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1298             for cpu in 0..self.config.boot_vcpus {
1299                 let vcpu = &self.vcpus[cpu as usize];
1300                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1301                 /* ARMv8 MPIDR format:
1302                      Bits [63:40] Must be zero
1303                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1304                      Bits [31:24] Must be zero
1305                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1306                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1307                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1308                 */
1309                 let mpidr_mask = 0xff_00ff_ffff;
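                // e.g. an MPIDR of 0x8000_0001 (RES1 bit 31 set, Aff0 = 1)
                // masks down to 0x0000_0001: only Aff3..Aff0 survive.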
1310                 let gicc = GicC {
1311                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1312                     length: 80,
1313                     reserved0: 0,
1314                     cpu_interface_number: cpu as u32,
1315                     uid: cpu as u32,
1316                     flags: 1,
1317                     parking_version: 0,
1318                     performance_interrupt: 0,
1319                     parked_address: 0,
1320                     base_address: 0,
1321                     gicv_base_address: 0,
1322                     gich_base_address: 0,
1323                     vgic_interrupt: 0,
1324                     gicr_base_address: 0,
1325                     mpidr: mpidr & mpidr_mask,
1326                     proc_power_effi_class: 0,
1327                     reserved1: 0,
1328                     spe_overflow_interrupt: 0,
1329                 };
1330 
1331                 madt.append(gicc);
1332             }
1333             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1334 
1335             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1336             let gicd = GicD {
1337                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1338                 length: 24,
1339                 reserved0: 0,
1340                 gic_id: 0,
1341                 base_address: vgic_config.dist_addr,
1342                 global_irq_base: 0,
1343                 version: 3,
1344                 reserved1: [0; 3],
1345             };
1346             madt.append(gicd);
1347 
1348             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1349             let gicr = GicR {
1350                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1351                 length: 16,
1352                 reserved: 0,
1353                 base_address: vgic_config.redists_addr,
1354                 range_length: vgic_config.redists_size as u32,
1355             };
1356             madt.append(gicr);
1357 
1358             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1359             let gicits = GicIts {
1360                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1361                 length: 20,
1362                 reserved0: 0,
1363                 translation_id: 0,
1364                 base_address: vgic_config.msi_addr,
1365                 reserved1: 0,
1366             };
1367             madt.append(gicits);
1368 
1369             madt.update_checksum();
1370         }
1371 
1372         madt
1373     }
1374 
1375     #[cfg(target_arch = "aarch64")]
1376     pub fn create_pptt(&self) -> Sdt {
1377         let pptt_start = 0;
1378         let mut cpus = 0;
1379         let mut uid = 0;
1380         // If topology is not specified, the default setting is:
1381         // 1 package, multiple cores, 1 thread per core
1382         // This is also the behavior when PPTT is missing.
1383         let (threads_per_core, cores_per_package, packages) =
1384             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
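        // e.g. with max_vcpus = 4 and no explicit topology this yields
        // (1, 4, 1): one package of four single-threaded cores.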
1385 
1386         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1387 
1388         for cluster_idx in 0..packages {
1389             if cpus < self.config.boot_vcpus as usize {
1390                 let cluster_offset = pptt.len() - pptt_start;
1391                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1392                     r#type: 0,
1393                     length: 20,
1394                     reserved: 0,
1395                     flags: 0x2,
1396                     parent: 0,
1397                     acpi_processor_id: cluster_idx as u32,
1398                     num_private_resources: 0,
1399                 };
1400                 pptt.append(cluster_hierarchy_node);
1401 
1402                 for core_idx in 0..cores_per_package {
1403                     let core_offset = pptt.len() - pptt_start;
1404 
1405                     if threads_per_core > 1 {
1406                         let core_hierarchy_node = ProcessorHierarchyNode {
1407                             r#type: 0,
1408                             length: 20,
1409                             reserved: 0,
1410                             flags: 0x2, // ACPI Processor ID valid
1411                             parent: cluster_offset as u32,
1412                             acpi_processor_id: core_idx as u32,
1413                             num_private_resources: 0,
1414                         };
1415                         pptt.append(core_hierarchy_node);
1416 
1417                         for _thread_idx in 0..threads_per_core {
1418                             let thread_hierarchy_node = ProcessorHierarchyNode {
1419                                 r#type: 0,
1420                                 length: 20,
1421                                 reserved: 0,
1422                                 flags: 0xE, // ACPI Processor ID valid | processor is a thread | node is a leaf
1423                                 parent: core_offset as u32,
1424                                 acpi_processor_id: uid as u32,
1425                                 num_private_resources: 0,
1426                             };
1427                             pptt.append(thread_hierarchy_node);
1428                             uid += 1;
1429                         }
1430                     } else {
1431                         let thread_hierarchy_node = ProcessorHierarchyNode {
1432                             r#type: 0,
1433                             length: 20,
1434                             reserved: 0,
1435                             flags: 0xA, // ACPI Processor ID valid | node is a leaf
1436                             parent: cluster_offset as u32,
1437                             acpi_processor_id: uid as u32,
1438                             num_private_resources: 0,
1439                         };
1440                         pptt.append(thread_hierarchy_node);
1441                         uid += 1;
1442                     }
1443                 }
1444                 cpus += (cores_per_package * threads_per_core) as usize;
1445             }
1446         }
1447 
1448         pptt.update_checksum();
1449         pptt
1450     }
1451 
1452     #[cfg(feature = "guest_debug")]
1453     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1454         self.vcpus[usize::from(cpu_id)]
1455             .lock()
1456             .unwrap()
1457             .vcpu
1458             .get_regs()
1459             .map_err(Error::CpuDebug)
1460     }
1461 
1462     #[cfg(feature = "guest_debug")]
1463     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1464         self.vcpus[usize::from(cpu_id)]
1465             .lock()
1466             .unwrap()
1467             .vcpu
1468             .set_regs(regs)
1469             .map_err(Error::CpuDebug)
1470     }
1471 
1472     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1473     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1474         self.vcpus[usize::from(cpu_id)]
1475             .lock()
1476             .unwrap()
1477             .vcpu
1478             .get_sregs()
1479             .map_err(Error::CpuDebug)
1480     }
1481 
1482     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1483     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1484         self.vcpus[usize::from(cpu_id)]
1485             .lock()
1486             .unwrap()
1487             .vcpu
1488             .set_sregs(sregs)
1489             .map_err(Error::CpuDebug)
1490     }
1491 
1492     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1493     fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> {
1494         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1495             .lock()
1496             .unwrap()
1497             .vcpu
1498             .translate_gva(gva, /* flags: unused */ 0)
1499             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1500         Ok(gpa)
1501     }
1502 
1503     ///
1504     /// On AArch64, the `translate_gva` API is not provided by KVM, so we
1505     /// implement it in the VMM by walking the translation tables ourselves.
1506     ///
1507     /// Address translation is a big topic; here we only cover the scenario
1508     /// that arises when debugging a guest kernel from the VMM. This
1509     /// `translate_gva` implementation is restricted to:
1510     /// - Exception Level 1
1511     /// - the high address range only (kernel space)
1512     ///
1513     /// This implementation supports the following Armv8-A features related
1514     /// to address translation:
1515     /// - FEAT_LPA
1516     /// - FEAT_LVA
1517     /// - FEAT_LPA2
1518     ///
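    /// As an illustrative example: with a 4KB granule (9 index bits per
    /// level) and `T1SZ == 16` (i.e. a 48-bit VA), the walk below starts at
    /// level 0 and consumes VA bits [47:39], [38:30], [29:21] and [20:12]
    /// across levels 0 to 3.
    ///
    /// A hypothetical call, assuming a `CpuManager` instance named `cm`:
    ///
    /// ```ignore
    /// // Translate a kernel-space GVA on vCPU 0 into a GPA.
    /// let gpa = cm.translate_gva(0, 0xffff_8000_0000_0000)?;
    /// ```
    ///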
1519     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1520     fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> {
1521         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1522             .lock()
1523             .unwrap()
1524             .vcpu
1525             .get_sys_reg(regs::TCR_EL1)
1526             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1527         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1528             .lock()
1529             .unwrap()
1530             .vcpu
1531             .get_sys_reg(regs::TTBR1_EL1)
1532             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1533         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1534             .lock()
1535             .unwrap()
1536             .vcpu
1537             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1538             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1539 
1540         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1541         // or low (0x000xxx...).
1542         let high_range = extract_bits_64!(gva, 55, 1);
1543         if high_range == 0 {
1544             info!("VA (0x{:x}) range is not supported!", gva);
1545             return Ok(gva);
1546         }
1547 
1548         // High range size offset (TCR_EL1.T1SZ)
1549         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1550         // Granule size (TCR_EL1.TG1)
1551         let tg = extract_bits_64!(tcr_el1, 30, 2);
1552         // 48-bit (0) or 52-bit (1) output addressing for FEAT_LPA2 (TCR_EL1.DS)
1553         let ds = extract_bits_64!(tcr_el1, 59, 1);
1554 
1555         if tsz == 0 {
1556             info!("VA translation is not ready!");
1557             return Ok(gva);
1558         }
1559 
1560         // VA size is determined by TCR_EL1.T1SZ
1561         let va_size = 64 - tsz;
1562         // Number of bits in VA consumed in each level of translation
1563         let stride = match tg {
1564             3 => 13, // 64KB granule size
1565             1 => 11, // 16KB granule size
1566             _ => 9,  // 4KB, default
1567         };
1568         // Starting level of walking
1569         let mut level = 4 - (va_size - 4) / stride;
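        // A sketch of the arithmetic, assuming a 4KB granule and T1SZ == 16:
        // stride = 9 and va_size = 48, so level = 4 - (48 - 4) / 9 = 0,
        // i.e. a full four-level walk starting at level 0.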
1570 
1571         // Determine the PA or IPA size
1572         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1573         #[allow(clippy::identity_op)]
1574         let pa_range = extract_bits_64!(id_aa64mmfr0_el1, 0, 4);
1575         // The IPA size in TCR_EL1 and the PA Range in ID_AA64MMFR0_EL1 should match.
1576         // To be safe, we use the minimum value if they are different.
1577         let pa_range = std::cmp::min(tcr_ips, pa_range);
1578         // PA size in bits
1579         let pa_size = match pa_range {
1580             0 => 32,
1581             1 => 36,
1582             2 => 40,
1583             3 => 42,
1584             4 => 44,
1585             5 => 48,
1586             6 => 52,
1587             _ => {
1588                 return Err(Error::TranslateVirtualAddress(anyhow!(
1589                     "PA range not supported {}",
1590                     pa_range
1591                 )))
1592             }
1593         };
1594 
1595         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1596         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
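        // E.g. with a 4KB granule (stride 9), indexmask_grainsize is 0xfff:
        // 9 index bits plus 3 low bits, since each descriptor is 8 bytes.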
1597         // If FEAT_LPA2 is present, the translation table descriptor holds
1598         // 50 bits of the table address of the next level.
1599         // Otherwise, it is 48 bits.
1600         let descaddrmask = if ds == 1 {
1601             !0u64 >> (64 - 50) // mask with 50 least significant bits
1602         } else {
1603             !0u64 >> (64 - 48) // mask with 48 least significant bits
1604         };
1605         let descaddrmask = descaddrmask & !indexmask_grainsize;
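        // E.g. for a 4KB granule without FEAT_LPA2, this leaves a mask over
        // bits [47:12]: a 48-bit table address minus the granule-sized low bits.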
1606 
1607         // Translation table base address
1608         #[allow(clippy::identity_op)]
1609         let mut descaddr: u64 = extract_bits_64!(ttbr1_el1, 0, 48);
1610         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1611         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1612         if pa_size == 52 {
1613             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1614         }
1615 
1616         // Loop through tables of each level
1617         loop {
1618             // Table offset for current level
1619             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1620             descaddr |= table_offset;
1621             descaddr &= !7u64;
1622 
1623             let mut buf = [0; 8];
1624             self.vm_memory
1625                 .memory()
1626                 .read(&mut buf, GuestAddress(descaddr))
1627                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1628             let descriptor = u64::from_le_bytes(buf);
1629 
1630             descaddr = descriptor & descaddrmask;
1631             // In the case of FEAT_LPA, the next-level translation table address
1632             // bits [48:51] come from bits [12:15] of the current descriptor.
1633             // For FEAT_LPA2, the next-level translation table address
1634             // bits [50:51] come from bits [8:9] of the current descriptor, and
1635             // bits [48:49] come from bits [48:49] of the descriptor that was
1636             // handled previously.
1637             if pa_size == 52 {
1638                 if ds == 1 {
1639                     // FEAT_LPA2
1640                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1641                 } else {
1642                     // FEAT_LPA
1643                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1644                 }
1645             }
1646 
1647             if (descriptor & 2) != 0 && (level < 3) {
1648                 // This is a table entry. Go down to next level.
1649                 level += 1;
1650                 indexmask = indexmask_grainsize;
1651                 continue;
1652             }
1653 
1654             break;
1655         }
1656 
1657         // We have reached either:
1658         // - a page entry at level 3 or
1659         // - a block entry at level 1 or 2
1660         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1661         descaddr &= !(page_size - 1);
1662         descaddr |= gva & (page_size - 1);
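        // E.g. a level-3 entry with a 4KB granule gives page_size = 1 << 12,
        // so the low 12 bits of the GVA are carried over as the page offset.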
1663 
1664         Ok(descaddr)
1665     }
1666 
1667     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1668         self.acpi_address = Some(acpi_address);
1669     }
1670 
1671     pub(crate) fn set_interrupt_controller(
1672         &mut self,
1673         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1674     ) {
1675         self.interrupt_controller = Some(interrupt_controller);
1676     }
1677 }
1678 
1679 struct Cpu {
1680     cpu_id: u8,
1681     proximity_domain: u32,
1682     dynamic: bool,
1683 }
1684 
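// Bit position of the 'Enabled' flag in the MADT Local APIC flags field.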
1685 #[cfg(target_arch = "x86_64")]
1686 const MADT_CPU_ENABLE_FLAG: usize = 0;
1687 
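// Bit position of the 'Online Capable' flag in the MADT Local APIC flags field.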
1688 #[cfg(target_arch = "x86_64")]
1689 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1690 
1691 impl Cpu {
1692     #[cfg(target_arch = "x86_64")]
1693     fn generate_mat(&self) -> Vec<u8> {
1694         let lapic = LocalApic {
1695             r#type: 0,
1696             length: 8,
1697             processor_id: self.cpu_id,
1698             apic_id: self.cpu_id,
1699             flags: 1 << MADT_CPU_ENABLE_FLAG,
1700         };
1701 
1702         let mut mat_data: Vec<u8> = Vec::new();
1703         mat_data.resize(std::mem::size_of_val(&lapic), 0);
1704         // SAFETY: mat_data is large enough to hold lapic
1705         unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic };
1706 
1707         mat_data
1708     }
1709 }
1710 
1711 impl Aml for Cpu {
1712     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1713         #[cfg(target_arch = "x86_64")]
1714         let mat_data: Vec<u8> = self.generate_mat();
1715         #[allow(clippy::if_same_then_else)]
1716         if self.dynamic {
1717             aml::Device::new(
1718                 format!("C{:03}", self.cpu_id).as_str().into(),
1719                 vec![
1720                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1721                     &aml::Name::new("_UID".into(), &self.cpu_id),
1722                     // Currently, AArch64 does not support the following fields.
1723                     /*
1724                     _STA return value:
1725                     Bit [0] – Set if the device is present.
1726                     Bit [1] – Set if the device is enabled and decoding its resources.
1727                     Bit [2] – Set if the device should be shown in the UI.
1728                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1729                     Bit [4] – Set if the battery is present.
1730                     Bits [31:5] – Reserved (must be cleared).
1731                     */
1732                     #[cfg(target_arch = "x86_64")]
1733                     &aml::Method::new(
1734                         "_STA".into(),
1735                         0,
1736                         false,
1737                         // Call into CSTA method which will interrogate device
1738                         vec![&aml::Return::new(&aml::MethodCall::new(
1739                             "CSTA".into(),
1740                             vec![&self.cpu_id],
1741                         ))],
1742                     ),
1743                     &aml::Method::new(
1744                         "_PXM".into(),
1745                         0,
1746                         false,
1747                         vec![&aml::Return::new(&self.proximity_domain)],
1748                     ),
1749                     // The Linux kernel expects every CPU device to have a _MAT entry
1750                     // containing the LAPIC for this processor with the enabled bit set,
1751                     // even if it is disabled in the MADT (non-boot CPU).
1752                     #[cfg(target_arch = "x86_64")]
1753                     &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
1754                     // Trigger CPU ejection
1755                     #[cfg(target_arch = "x86_64")]
1756                     &aml::Method::new(
1757                         "_EJ0".into(),
1758                         1,
1759                         false,
1760                         // Call into CEJ0 method which will actually eject device
1761                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1762                     ),
1763                 ],
1764             )
1765             .append_aml_bytes(bytes);
1766         } else {
1767             aml::Device::new(
1768                 format!("C{:03}", self.cpu_id).as_str().into(),
1769                 vec![
1770                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1771                     &aml::Name::new("_UID".into(), &self.cpu_id),
1772                     #[cfg(target_arch = "x86_64")]
1773                     &aml::Method::new(
1774                         "_STA".into(),
1775                         0,
1776                         false,
1777                         // Mark the CPU as present; see the CSTA implementation
1778                         vec![&aml::Return::new(&0xfu8)],
1779                     ),
1780                     &aml::Method::new(
1781                         "_PXM".into(),
1782                         0,
1783                         false,
1784                         vec![&aml::Return::new(&self.proximity_domain)],
1785                     ),
1786                     // The Linux kernel expects every CPU device to have a _MAT entry
1787                     // containing the LAPIC for this processor with the enabled bit set,
1788                     // even if it is disabled in the MADT (non-boot CPU).
1789                     #[cfg(target_arch = "x86_64")]
1790                     &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
1791                 ],
1792             )
1793             .append_aml_bytes(bytes);
1794         }
1795     }
1796 }
1797 
1798 struct CpuNotify {
1799     cpu_id: u8,
1800 }
1801 
1802 impl Aml for CpuNotify {
1803     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1804         let object = aml::Path::new(&format!("C{:03}", self.cpu_id));
1805         aml::If::new(
1806             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
1807             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1808         )
1809         .append_aml_bytes(bytes)
1810     }
1811 }
1812 
1813 struct CpuMethods {
1814     max_vcpus: u8,
1815     dynamic: bool,
1816 }
1817 
1818 impl Aml for CpuMethods {
1819     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1820         if self.dynamic {
1821             // CPU status method
1822             aml::Method::new(
1823                 "CSTA".into(),
1824                 1,
1825                 true,
1826                 vec![
1827                     // Take lock defined above
1828                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1829                     // Write the CPU number (in the first argument) to the CSEL field of the MMIO region
1830                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1831                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1832                     // If the CPEN bit is set, set the local variable to 0xf (see _STA for the meaning)
1833                     &aml::If::new(
1834                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
1835                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
1836                     ),
1837                     // Release lock
1838                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1839                     // Return 0 or 0xf
1840                     &aml::Return::new(&aml::Local(0)),
1841                 ],
1842             )
1843             .append_aml_bytes(bytes);
1844 
1845             let mut cpu_notifies = Vec::new();
1846             for cpu_id in 0..self.max_vcpus {
1847                 cpu_notifies.push(CpuNotify { cpu_id });
1848             }
1849 
1850             let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
1851             for cpu_id in 0..self.max_vcpus {
1852                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
1853             }
1854 
1855             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).append_aml_bytes(bytes);
1856 
1857             aml::Method::new(
1858                 "CEJ0".into(),
1859                 1,
1860                 true,
1861                 vec![
1862                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1863                     // Write the CPU number (in the first argument) to the CSEL field of the MMIO region
1864                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1865                     // Set CEJ0 bit
1866                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
1867                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1868                 ],
1869             )
1870             .append_aml_bytes(bytes);
1871 
1872             aml::Method::new(
1873                 "CSCN".into(),
1874                 0,
1875                 true,
1876                 vec![
1877                     // Take lock defined above
1878                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1879                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1880                     &aml::While::new(
1881                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
1882                         vec![
1883                             // Write the CPU number (loop counter in Local0) to the CSEL field
1884                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
1885                             // Check if CINS bit is set
1886                             &aml::If::new(
1887                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
1888                                 // Notify device if it is
1889                                 vec![
1890                                     &aml::MethodCall::new(
1891                                         "CTFY".into(),
1892                                         vec![&aml::Local(0), &aml::ONE],
1893                                     ),
1894                                     // Reset the CINS bit (the device clears it when 1 is written)
1895                                     &aml::Store::new(
1896                                         &aml::Path::new("\\_SB_.PRES.CINS"),
1897                                         &aml::ONE,
1898                                     ),
1899                                 ],
1900                             ),
1901                             // Check if CRMV bit is set
1902                             &aml::If::new(
1903                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
1904                                 // Notify device if it is (with the eject constant 0x3)
1905                                 vec![
1906                                     &aml::MethodCall::new(
1907                                         "CTFY".into(),
1908                                         vec![&aml::Local(0), &3u8],
1909                                     ),
1910                                     // Reset the CRMV bit (the device clears it when 1 is written)
1911                                     &aml::Store::new(
1912                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
1913                                         &aml::ONE,
1914                                     ),
1915                                 ],
1916                             ),
1917                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
1918                         ],
1919                     ),
1920                     // Release lock
1921                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1922                 ],
1923             )
1924             .append_aml_bytes(bytes)
1925         } else {
1926             aml::Method::new("CSCN".into(), 0, true, vec![]).append_aml_bytes(bytes)
1927         }
1928     }
1929 }
1930 
1931 impl Aml for CpuManager {
1932     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1933         #[cfg(target_arch = "x86_64")]
1934         if let Some(acpi_address) = self.acpi_address {
1935             // CPU hotplug controller
1936             aml::Device::new(
1937                 "_SB_.PRES".into(),
1938                 vec![
1939                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
1940                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
1941                     // Mutex to protect concurrent access, as we write to select a CPU and then read back its status
1942                     &aml::Mutex::new("CPLK".into(), 0),
1943                     &aml::Name::new(
1944                         "_CRS".into(),
1945                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
1946                             aml::AddressSpaceCachable::NotCacheable,
1947                             true,
1948                             acpi_address.0,
1949                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
1950                         )]),
1951                     ),
1952                     // OpRegion and Fields map MMIO range into individual field values
1953                     &aml::OpRegion::new(
1954                         "PRST".into(),
1955                         aml::OpRegionSpace::SystemMemory,
1956                         acpi_address.0 as usize,
1957                         CPU_MANAGER_ACPI_SIZE,
1958                     ),
1959                     &aml::Field::new(
1960                         "PRST".into(),
1961                         aml::FieldAccessType::Byte,
1962                         aml::FieldUpdateRule::WriteAsZeroes,
1963                         vec![
1964                             aml::FieldEntry::Reserved(32),
1965                             aml::FieldEntry::Named(*b"CPEN", 1),
1966                             aml::FieldEntry::Named(*b"CINS", 1),
1967                             aml::FieldEntry::Named(*b"CRMV", 1),
1968                             aml::FieldEntry::Named(*b"CEJ0", 1),
1969                             aml::FieldEntry::Reserved(4),
1970                             aml::FieldEntry::Named(*b"CCMD", 8),
1971                         ],
1972                     ),
1973                     &aml::Field::new(
1974                         "PRST".into(),
1975                         aml::FieldAccessType::DWord,
1976                         aml::FieldUpdateRule::Preserve,
1977                         vec![
1978                             aml::FieldEntry::Named(*b"CSEL", 32),
1979                             aml::FieldEntry::Reserved(32),
1980                             aml::FieldEntry::Named(*b"CDAT", 32),
1981                         ],
1982                     ),
1983                 ],
1984             )
1985             .append_aml_bytes(bytes);
1986         }
1987 
1988         // CPU devices
1989         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
1990         let cid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05"));
1991         // Bundle methods together under a common object
1992         let methods = CpuMethods {
1993             max_vcpus: self.config.max_vcpus,
1994             dynamic: self.dynamic,
1995         };
1996         let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &cid, &methods];
1997 
1998         let mut cpu_devices = Vec::new();
1999         for cpu_id in 0..self.config.max_vcpus {
2000             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2001             let cpu_device = Cpu {
2002                 cpu_id,
2003                 proximity_domain,
2004                 dynamic: self.dynamic,
2005             };
2006 
2007             cpu_devices.push(cpu_device);
2008         }
2009 
2010         for cpu_device in cpu_devices.iter() {
2011             cpu_data_inner.push(cpu_device);
2012         }
2013 
2014         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).append_aml_bytes(bytes)
2015     }
2016 }
2017 
2018 impl Pausable for CpuManager {
2019     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2020         // Tell the vCPUs to pause themselves next time they exit
2021         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2022 
2023         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2024         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2025         // above.
2026         for state in self.vcpu_states.iter() {
2027             state.signal_thread();
2028         }
2029 
2030         for vcpu in self.vcpus.iter() {
2031             let mut vcpu = vcpu.lock().unwrap();
2032             vcpu.pause()?;
2033             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2034             if !self.config.kvm_hyperv {
2035                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2036                     MigratableError::Pause(anyhow!(
2037                         "Could not notify guest it has been paused {:?}",
2038                         e
2039                     ))
2040                 })?;
2041             }
2042         }
2043 
2044         Ok(())
2045     }
2046 
2047     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2048         for vcpu in self.vcpus.iter() {
2049             vcpu.lock().unwrap().resume()?;
2050         }
2051 
2052         // Toggle the vCPUs pause boolean
2053         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2054 
2055         // Unpark all the VCPU threads.
2056         // Once unparked, the first thing they will do is check the pause
2057         // boolean. Since it is now set to false, they will exit their pause
2058         // loop and go back to VMX root.
2059         for state in self.vcpu_states.iter() {
2060             state.unpark_thread();
2061         }
2062         Ok(())
2063     }
2064 }
2065 
2066 impl Snapshottable for CpuManager {
2067     fn id(&self) -> String {
2068         CPU_MANAGER_SNAPSHOT_ID.to_string()
2069     }
2070 
2071     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2072         let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID);
2073 
2074         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2075         for vcpu in &self.vcpus {
2076             let cpu_snapshot = vcpu.lock().unwrap().snapshot()?;
2077             cpu_manager_snapshot.add_snapshot(cpu_snapshot);
2078         }
2079 
2080         Ok(cpu_manager_snapshot)
2081     }
2082 
2083     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
2084         for (cpu_id, snapshot) in snapshot.snapshots.iter() {
2085             let cpu_id = cpu_id.parse::<usize>().unwrap();
2086             info!("Restoring VCPU {}", cpu_id);
2087             let vcpu = self.vcpus[cpu_id].clone();
2088             self.configure_vcpu(vcpu, None, Some(*snapshot.clone()))
2089                 .map_err(|e| {
2090                     MigratableError::Restore(anyhow!("Could not configure vCPU {:?}", e))
2091                 })?
2092         }
2093 
2094         Ok(())
2095     }
2096 }
2097 
2098 impl Transportable for CpuManager {}
2099 impl Migratable for CpuManager {}
2100 
2101 #[cfg(feature = "guest_debug")]
2102 impl Debuggable for CpuManager {
2103     #[cfg(feature = "kvm")]
2104     fn set_guest_debug(
2105         &self,
2106         cpu_id: usize,
2107         addrs: &[GuestAddress],
2108         singlestep: bool,
2109     ) -> std::result::Result<(), DebuggableError> {
2110         self.vcpus[cpu_id]
2111             .lock()
2112             .unwrap()
2113             .vcpu
2114             .set_guest_debug(addrs, singlestep)
2115             .map_err(DebuggableError::SetDebug)
2116     }
2117 
2118     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2119         Ok(())
2120     }
2121 
2122     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2123         Ok(())
2124     }
2125 
2126     #[cfg(target_arch = "x86_64")]
2127     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2128         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2129         let gregs = self
2130             .get_regs(cpu_id as u8)
2131             .map_err(DebuggableError::ReadRegs)?;
2132         let regs = [
2133             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
2134             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
2135         ];
2136 
2137         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2138         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2139         let eflags = gregs.rflags as u32;
2140         let rip = gregs.rip;
2141 
2142         // Segment registers: CS, SS, DS, ES, FS, GS
2143         let sregs = self
2144             .get_sregs(cpu_id as u8)
2145             .map_err(DebuggableError::ReadRegs)?;
2146         let segments = X86SegmentRegs {
2147             cs: sregs.cs.selector as u32,
2148             ss: sregs.ss.selector as u32,
2149             ds: sregs.ds.selector as u32,
2150             es: sregs.es.selector as u32,
2151             fs: sregs.fs.selector as u32,
2152             gs: sregs.gs.selector as u32,
2153         };
2154 
2155         // TODO: Add other registers
2156 
2157         Ok(CoreRegs {
2158             regs,
2159             eflags,
2160             rip,
2161             segments,
2162             ..Default::default()
2163         })
2164     }
2165 
2166     #[cfg(target_arch = "aarch64")]
2167     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2168         let gregs = self
2169             .get_regs(cpu_id as u8)
2170             .map_err(DebuggableError::ReadRegs)?;
2171         Ok(CoreRegs {
2172             x: gregs.regs.regs,
2173             sp: gregs.regs.sp,
2174             pc: gregs.regs.pc,
2175             ..Default::default()
2176         })
2177     }
2178 
2179     #[cfg(target_arch = "x86_64")]
2180     fn write_regs(
2181         &self,
2182         cpu_id: usize,
2183         regs: &CoreRegs,
2184     ) -> std::result::Result<(), DebuggableError> {
2185         let orig_gregs = self
2186             .get_regs(cpu_id as u8)
2187             .map_err(DebuggableError::ReadRegs)?;
2188         let gregs = StandardRegisters {
2189             rax: regs.regs[0],
2190             rbx: regs.regs[1],
2191             rcx: regs.regs[2],
2192             rdx: regs.regs[3],
2193             rsi: regs.regs[4],
2194             rdi: regs.regs[5],
2195             rbp: regs.regs[6],
2196             rsp: regs.regs[7],
2197             r8: regs.regs[8],
2198             r9: regs.regs[9],
2199             r10: regs.regs[10],
2200             r11: regs.regs[11],
2201             r12: regs.regs[12],
2202             r13: regs.regs[13],
2203             r14: regs.regs[14],
2204             r15: regs.regs[15],
2205             rip: regs.rip,
2206             // Update only the lower 32 bits of rflags.
2207             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2208         };
2209 
2210         self.set_regs(cpu_id as u8, &gregs)
2211             .map_err(DebuggableError::WriteRegs)?;
2212 
2213         // Segment registers: CS, SS, DS, ES, FS, GS
2214         // Since GDB cares only about the selectors, we call get_sregs() first.
2215         let mut sregs = self
2216             .get_sregs(cpu_id as u8)
2217             .map_err(DebuggableError::ReadRegs)?;
2218         sregs.cs.selector = regs.segments.cs as u16;
2219         sregs.ss.selector = regs.segments.ss as u16;
2220         sregs.ds.selector = regs.segments.ds as u16;
2221         sregs.es.selector = regs.segments.es as u16;
2222         sregs.fs.selector = regs.segments.fs as u16;
2223         sregs.gs.selector = regs.segments.gs as u16;
2224 
2225         self.set_sregs(cpu_id as u8, &sregs)
2226             .map_err(DebuggableError::WriteRegs)?;
2227 
2228         // TODO: Add other registers
2229 
2230         Ok(())
2231     }
2232 
2233     #[cfg(target_arch = "aarch64")]
2234     fn write_regs(
2235         &self,
2236         cpu_id: usize,
2237         regs: &CoreRegs,
2238     ) -> std::result::Result<(), DebuggableError> {
2239         let mut gregs = self
2240             .get_regs(cpu_id as u8)
2241             .map_err(DebuggableError::ReadRegs)?;
2242 
2243         gregs.regs.regs = regs.x;
2244         gregs.regs.sp = regs.sp;
2245         gregs.regs.pc = regs.pc;
2246 
2247         self.set_regs(cpu_id as u8, &gregs)
2248             .map_err(DebuggableError::WriteRegs)?;
2249 
2250         Ok(())
2251     }
2252 
2253     fn read_mem(
2254         &self,
2255         cpu_id: usize,
2256         vaddr: GuestAddress,
2257         len: usize,
2258     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2259         let mut buf = vec![0; len];
2260         let mut total_read = 0_u64;
2261 
2262         while total_read < len as u64 {
2263             let gaddr = vaddr.0 + total_read;
2264             let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
2265                 Ok(paddr) => paddr,
2266                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2267                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2268             };
2269             let psize = arch::PAGE_SIZE as u64;
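            // Clamp each chunk to the end of the current guest page, since
            // contiguous GVAs may map to non-contiguous GPAs.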
2270             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2271             self.vm_memory
2272                 .memory()
2273                 .read(
2274                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2275                     GuestAddress(paddr),
2276                 )
2277                 .map_err(DebuggableError::ReadMem)?;
2278             total_read += read_len;
2279         }
2280         Ok(buf)
2281     }
2282 
2283     fn write_mem(
2284         &self,
2285         cpu_id: usize,
2286         vaddr: &GuestAddress,
2287         data: &[u8],
2288     ) -> std::result::Result<(), DebuggableError> {
2289         let mut total_written = 0_u64;
2290 
2291         while total_written < data.len() as u64 {
2292             let gaddr = vaddr.0 + total_written;
2293             let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
2294                 Ok(paddr) => paddr,
2295                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2296                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2297             };
2298             let psize = arch::PAGE_SIZE as u64;
2299             let write_len = std::cmp::min(
2300                 data.len() as u64 - total_written,
2301                 psize - (paddr & (psize - 1)),
2302             );
2303             self.vm_memory
2304                 .memory()
2305                 .write(
2306                     &data[total_written as usize..total_written as usize + write_len as usize],
2307                     GuestAddress(paddr),
2308                 )
2309                 .map_err(DebuggableError::WriteMem)?;
2310             total_written += write_len;
2311         }
2312         Ok(())
2313     }
2314 
2315     fn active_vcpus(&self) -> usize {
2316         self.present_vcpus() as usize
2317     }
2318 }
2319 
2320 #[cfg(feature = "guest_debug")]
2321 impl Elf64Writable for CpuManager {}
2322 
2323 #[cfg(feature = "guest_debug")]
2324 impl CpuElf64Writable for CpuManager {
2325     fn cpu_write_elf64_note(
2326         &mut self,
2327         dump_state: &DumpState,
2328     ) -> std::result::Result<(), GuestDebuggableError> {
2329         let mut coredump_file = dump_state.file.as_ref().unwrap();
2330         for vcpu in &self.vcpus {
2331             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2332             let mut pos: usize = 0;
2333             let mut buf = vec![0; note_size as usize];
2334             let descsz = size_of::<X86_64ElfPrStatus>();
2335             let vcpu_id = vcpu.lock().unwrap().id;
2336 
2337             let note = Elf64_Nhdr {
2338                 n_namesz: COREDUMP_NAME_SIZE,
2339                 n_descsz: descsz as u32,
2340                 n_type: NT_PRSTATUS,
2341             };
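            // ELF note layout: an Elf64_Nhdr header, then the owner name
            // ("CORE") padded to a 4-byte boundary, then the descriptor
            // (here an X86_64ElfPrStatus).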
2342 
2343             let bytes: &[u8] = note.as_slice();
2344             buf.splice(0.., bytes.to_vec());
2345             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2346             buf.resize(pos + 4, 0);
2347             buf.splice(pos.., "CORE".to_string().into_bytes());
2348 
2349             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2350             buf.resize(pos + 32 + 4, 0);
2351             let pid = vcpu_id as u64;
2352             let bytes: &[u8] = pid.as_slice();
2353             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2354 
2355             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2356 
2357             let orig_rax: u64 = 0;
2358             let gregs = self.vcpus[usize::from(vcpu_id)]
2359                 .lock()
2360                 .unwrap()
2361                 .vcpu
2362                 .get_regs()
2363                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2364 
2365             let regs1 = [
2366                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2367                 gregs.r10,
2368             ];
2369             let regs2 = [
2370                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2371             ];
2372 
2373             let sregs = self.vcpus[usize::from(vcpu_id)]
2374                 .lock()
2375                 .unwrap()
2376                 .vcpu
2377                 .get_sregs()
2378                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2379 
2380             debug!(
2381                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2382                 gregs.rip,
2383                 gregs.rsp,
2384                 sregs.gs.base,
2385                 sregs.cs.selector,
2386                 sregs.ss.selector,
2387                 sregs.ds.selector,
2388             );
2389 
2390             let regs = X86_64UserRegs {
2391                 regs1,
2392                 regs2,
2393                 rip: gregs.rip,
2394                 cs: sregs.cs.selector as u64,
2395                 eflags: gregs.rflags,
2396                 rsp: gregs.rsp,
2397                 ss: sregs.ss.selector as u64,
2398                 fs_base: sregs.fs.base,
2399                 gs_base: sregs.gs.base,
2400                 ds: sregs.ds.selector as u64,
2401                 es: sregs.es.selector as u64,
2402                 fs: sregs.fs.selector as u64,
2403                 gs: sregs.gs.selector as u64,
2404             };
2405 
2407             let bytes: &[u8] = regs.as_slice();
2408             buf.resize(note_size as usize, 0);
2409             buf.splice(pos.., bytes.to_vec());
2410             buf.resize(note_size as usize, 0);
2411 
2412             coredump_file
2413                 .write(&buf)
2414                 .map_err(GuestDebuggableError::CoredumpFile)?;
2415         }
2416 
2417         Ok(())
2418     }
2419 
2420     fn cpu_write_vmm_note(
2421         &mut self,
2422         dump_state: &DumpState,
2423     ) -> std::result::Result<(), GuestDebuggableError> {
2424         let mut coredump_file = dump_state.file.as_ref().unwrap();
2425         for vcpu in &self.vcpus {
2426             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2427             let mut pos: usize = 0;
2428             let mut buf = vec![0; note_size as usize];
2429             let descsz = size_of::<DumpCpusState>();
2430             let vcpu_id = vcpu.lock().unwrap().id;
2431 
2432             let note = Elf64_Nhdr {
2433                 n_namesz: COREDUMP_NAME_SIZE,
2434                 n_descsz: descsz as u32,
2435                 n_type: 0,
2436             };
2437 
2438             let bytes: &[u8] = note.as_slice();
2439             buf.splice(0.., bytes.to_vec());
2440             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2441 
2442             buf.resize(pos + 4, 0);
2443             buf.splice(pos.., "QEMU".to_string().into_bytes());
2444 
2445             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2446 
2447             let gregs = self.vcpus[usize::from(vcpu_id)]
2448                 .lock()
2449                 .unwrap()
2450                 .vcpu
2451                 .get_regs()
2452                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2453 
2454             let regs1 = [
2455                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2456                 gregs.rbp,
2457             ];
2458 
2459             let regs2 = [
2460                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2461                 gregs.r15,
2462             ];
2463 
2464             let sregs = self.vcpus[usize::from(vcpu_id)]
2465                 .lock()
2466                 .unwrap()
2467                 .vcpu
2468                 .get_sregs()
2469                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2470 
2471             let mut msrs = vec![MsrEntry {
2472                 index: msr_index::MSR_KERNEL_GS_BASE,
2473                 ..Default::default()
2474             }];
2475 
2476             self.vcpus[vcpu_id as usize]
2477                 .lock()
2478                 .unwrap()
2479                 .vcpu
2480                 .get_msrs(&mut msrs)
2481                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2482             let kernel_gs_base = msrs[0].data;
2483 
2484             let cs = CpuSegment::new(sregs.cs);
2485             let ds = CpuSegment::new(sregs.ds);
2486             let es = CpuSegment::new(sregs.es);
2487             let fs = CpuSegment::new(sregs.fs);
2488             let gs = CpuSegment::new(sregs.gs);
2489             let ss = CpuSegment::new(sregs.ss);
2490             let ldt = CpuSegment::new(sregs.ldt);
2491             let tr = CpuSegment::new(sregs.tr);
2492             let gdt = CpuSegment::new_from_table(sregs.gdt);
2493             let idt = CpuSegment::new_from_table(sregs.idt);
2494             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2495             let regs = DumpCpusState {
2496                 version: 1,
2497                 size: size_of::<DumpCpusState>() as u32,
2498                 regs1,
2499                 regs2,
2500                 rip: gregs.rip,
2501                 rflags: gregs.rflags,
2502                 cs,
2503                 ds,
2504                 es,
2505                 fs,
2506                 gs,
2507                 ss,
2508                 ldt,
2509                 tr,
2510                 gdt,
2511                 idt,
2512                 cr,
2513                 kernel_gs_base,
2514             };
2515 
2516             let bytes: &[u8] = regs.as_slice();
2517             buf.resize(note_size as usize, 0);
2518             buf.splice(pos.., bytes.to_vec());
2519             buf.resize(note_size as usize, 0);
2520 
2521             coredump_file
2522                 .write(&buf)
2523                 .map_err(GuestDebuggableError::CoredumpFile)?;
2524         }
2525 
2526         Ok(())
2527     }
2528 }
2529 
2530 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2531 #[cfg(test)]
2532 mod tests {
2533     use arch::x86_64::interrupts::*;
2534     use arch::x86_64::regs::*;
2535     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2536 
2537     #[test]
2538     fn test_setlint() {
2539         let hv = hypervisor::new().unwrap();
2540         let vm = hv.create_vm().expect("new VM fd creation failed");
2541         assert!(hv.check_required_extensions().is_ok());
2542         // Calling get_lapic will fail if there is no irqchip created beforehand.
2543         assert!(vm.create_irq_chip().is_ok());
2544         let vcpu = vm.create_vcpu(0, None).unwrap();
2545         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2546 
2547         // Compute the value that is expected to represent LVT0 and LVT1.
2548         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2549         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2550         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2551         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2552 
2553         set_lint(&vcpu).unwrap();
2554 
2555         // Compute the value that represents LVT0 and LVT1 after set_lint.
2556         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2557         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2558         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2559         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2560         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2561     }
2562 
2563     #[test]
2564     fn test_setup_fpu() {
2565         let hv = hypervisor::new().unwrap();
2566         let vm = hv.create_vm().expect("new VM fd creation failed");
2567         let vcpu = vm.create_vcpu(0, None).unwrap();
2568         setup_fpu(&vcpu).unwrap();
2569 
2570         let expected_fpu: FpuState = FpuState {
2571             fcw: 0x37f,
2572             mxcsr: 0x1f80,
2573             ..Default::default()
2574         };
2575         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2576         // TODO: auto-generate kvm related structures with PartialEq on.
2577         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2578         // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
2579         // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
2580         // The mxcsr will stay 0 and the assert below would fail. We need to decide
2581         // whether to remove this assertion altogether.
2582         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2583     }
2584 
2585     #[test]
2586     fn test_setup_msrs() {
2587         use hypervisor::arch::x86::{msr_index, MsrEntry};
2588 
2589         let hv = hypervisor::new().unwrap();
2590         let vm = hv.create_vm().expect("new VM fd creation failed");
2591         let vcpu = vm.create_vcpu(0, None).unwrap();
2592         setup_msrs(&vcpu).unwrap();
2593 
2594         // This test will check against the last MSR entry configured (the tenth one).
2595         // See create_msr_entries for details.
2596         let mut msrs = vec![MsrEntry {
2597             index: msr_index::MSR_IA32_MISC_ENABLE,
2598             ..Default::default()
2599         }];
2600 
2601         // get_msrs returns the number of MSRs that it succeeded in reading. We only
2602         // want to read one in this test scenario.
2603         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2604         assert_eq!(read_msrs, 1);
2605 
2606         // Official entries that were set up when we called setup_msrs. We need to
2607         // assert that the tenth one (i.e. the one with index
2608         // msr_index::MSR_IA32_MISC_ENABLE) has the data we expect.
2609         let entry_vec = vcpu.boot_msr_entries();
2610         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2611     }
2612 
2613     #[test]
2614     fn test_setup_regs() {
2615         let hv = hypervisor::new().unwrap();
2616         let vm = hv.create_vm().expect("new VM fd creation failed");
2617         let vcpu = vm.create_vcpu(0, None).unwrap();
2618 
2619         let expected_regs: StandardRegisters = StandardRegisters {
2620             rflags: 0x0000000000000002u64,
2621             rbx: arch::layout::PVH_INFO_START.0,
2622             rip: 1,
2623             ..Default::default()
2624         };
2625 
2626         setup_regs(&vcpu, expected_regs.rip).unwrap();
2627 
2628         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2629         assert_eq!(actual_regs, expected_regs);
2630     }
2631 }
2632 
2633 #[cfg(target_arch = "aarch64")]
2634 #[cfg(test)]
2635 mod tests {
2636     use arch::{aarch64::regs, layout};
2637     use hypervisor::kvm::aarch64::is_system_register;
2638     use hypervisor::kvm::kvm_bindings::{
2639         kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2640         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2641     };
2642     use hypervisor::{arm64_core_reg_id, offset__of};
2643     use std::mem;
2644 
2645     #[test]
2646     fn test_setup_regs() {
2647         let hv = hypervisor::new().unwrap();
2648         let vm = hv.create_vm().unwrap();
2649         let vcpu = vm.create_vcpu(0, None).unwrap();
2650 
2651         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2652         // Must fail when vcpu is not initialized yet.
2653         assert!(res.is_err());
2654 
2655         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2656         vm.get_preferred_target(&mut kvi).unwrap();
2657         vcpu.vcpu_init(&kvi).unwrap();
2658 
2659         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2660     }
2661 
2662     #[test]
2663     fn test_read_mpidr() {
2664         let hv = hypervisor::new().unwrap();
2665         let vm = hv.create_vm().unwrap();
2666         let vcpu = vm.create_vcpu(0, None).unwrap();
2667         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2668         vm.get_preferred_target(&mut kvi).unwrap();
2669 
2670         // Must fail when vcpu is not initialized yet.
2671         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2672 
2673         vcpu.vcpu_init(&kvi).unwrap();
2674         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2675     }
2676 
2677     #[test]
2678     fn test_is_system_register() {
2679         let offset = offset__of!(user_pt_regs, pc);
2680         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2681         assert!(!is_system_register(regid));
2682         let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64;
2683         assert!(is_system_register(regid));
2684     }
2685 
2686     #[test]
2687     fn test_save_restore_core_regs() {
2688         let hv = hypervisor::new().unwrap();
2689         let vm = hv.create_vm().unwrap();
2690         let vcpu = vm.create_vcpu(0, None).unwrap();
2691         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2692         vm.get_preferred_target(&mut kvi).unwrap();
2693 
2694         // Must fail when vcpu is not initialized yet.
2695         let res = vcpu.get_regs();
2696         assert!(res.is_err());
2697         assert_eq!(
2698             format!("{}", res.unwrap_err()),
2699             "Failed to get core register: Exec format error (os error 8)"
2700         );
2701 
2702         let mut state = kvm_regs::default();
2703         let res = vcpu.set_regs(&state);
2704         assert!(res.is_err());
2705         assert_eq!(
2706             format!("{}", res.unwrap_err()),
2707             "Failed to set core register: Exec format error (os error 8)"
2708         );
2709 
2710         vcpu.vcpu_init(&kvi).unwrap();
2711         let res = vcpu.get_regs();
2712         assert!(res.is_ok());
2713         state = res.unwrap();
2714         assert_eq!(state.regs.pstate, 0x3C5);
2715 
2716         assert!(vcpu.set_regs(&state).is_ok());
2717     }
2718 
2719     #[test]
2720     fn test_get_set_mpstate() {
2721         let hv = hypervisor::new().unwrap();
2722         let vm = hv.create_vm().unwrap();
2723         let vcpu = vm.create_vcpu(0, None).unwrap();
2724         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2725         vm.get_preferred_target(&mut kvi).unwrap();
2726 
2727         let res = vcpu.get_mp_state();
2728         assert!(res.is_ok());
2729         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2730     }
2731 }
2732