xref: /cloud-hypervisor/vmm/src/cpu.rs (revision eea9bcea38e0c5649f444c829f3a4f9c22aa486c)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::CpusConfig;
15 #[cfg(feature = "guest_debug")]
16 use crate::coredump::{
17     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
18     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
19     NT_PRSTATUS,
20 };
21 use crate::device_manager::DeviceManager;
22 #[cfg(feature = "guest_debug")]
23 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
24 use crate::memory_manager::MemoryManager;
25 use crate::seccomp_filters::{get_seccomp_filter, Thread};
26 #[cfg(target_arch = "x86_64")]
27 use crate::vm::physical_bits;
28 use crate::GuestMemoryMmap;
29 use crate::CPU_MANAGER_SNAPSHOT_ID;
30 use acpi_tables::{aml, aml::Aml, sdt::Sdt};
31 use anyhow::anyhow;
32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
33 use arch::aarch64::regs;
34 use arch::EntryPoint;
35 use arch::NumaNodes;
36 #[cfg(target_arch = "aarch64")]
37 use devices::gic::Gic;
38 use devices::interrupt_controller::InterruptController;
39 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
40 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
41 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
42 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
43 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
44 use hypervisor::aarch64::StandardRegisters;
45 #[cfg(feature = "guest_debug")]
46 use hypervisor::arch::x86::msr_index;
47 #[cfg(target_arch = "x86_64")]
48 use hypervisor::arch::x86::CpuIdEntry;
49 #[cfg(feature = "guest_debug")]
50 use hypervisor::arch::x86::MsrEntry;
51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
52 use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
53 #[cfg(target_arch = "aarch64")]
54 use hypervisor::kvm::kvm_bindings;
55 #[cfg(feature = "tdx")]
56 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
57 use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
58 use libc::{c_void, siginfo_t};
59 #[cfg(feature = "guest_debug")]
60 use linux_loader::elf::Elf64_Nhdr;
61 use seccompiler::{apply_filter, SeccompAction};
62 use std::collections::BTreeMap;
63 #[cfg(feature = "guest_debug")]
64 use std::io::Write;
65 #[cfg(feature = "guest_debug")]
66 use std::mem::size_of;
67 use std::os::unix::thread::JoinHandleExt;
68 use std::sync::atomic::{AtomicBool, Ordering};
69 use std::sync::{Arc, Barrier, Mutex};
70 use std::{cmp, io, result, thread};
71 use thiserror::Error;
72 use tracer::trace_scoped;
73 use vm_device::BusDevice;
74 #[cfg(feature = "guest_debug")]
75 use vm_memory::ByteValued;
76 #[cfg(feature = "guest_debug")]
77 use vm_memory::{Bytes, GuestAddressSpace};
78 use vm_memory::{GuestAddress, GuestMemoryAtomic};
79 use vm_migration::{
80     Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
81     Transportable,
82 };
83 use vmm_sys_util::eventfd::EventFd;
84 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
85 
86 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
87 /// Extract the specified bits of a 64-bit integer.
88 /// For example, to extrace 2 bits from offset 1 (zero based) of `6u64`,
89 /// following expression should return 3 (`0b11`):
90 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
91 ///
92 macro_rules! extract_bits_64 {
93     ($value: tt, $offset: tt, $length: tt) => {
94         ($value >> $offset) & (!0u64 >> (64 - $length))
95     };
96 }
97 
98 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
99 
100 #[derive(Debug, Error)]
101 pub enum Error {
102     #[error("Error creating vCPU: {0}")]
103     VcpuCreate(#[source] anyhow::Error),
104 
105     #[error("Error running bCPU: {0}")]
106     VcpuRun(#[source] anyhow::Error),
107 
108     #[error("Error spawning vCPU thread: {0}")]
109     VcpuSpawn(#[source] io::Error),
110 
111     #[error("Error generating common CPUID: {0}")]
112     CommonCpuId(#[source] arch::Error),
113 
114     #[error("Error configuring vCPU: {0}")]
115     VcpuConfiguration(#[source] arch::Error),
116 
117     #[cfg(target_arch = "aarch64")]
118     #[error("Error fetching preferred target: {0}")]
119     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
120 
121     #[cfg(target_arch = "aarch64")]
122     #[error("Error initialising vCPU: {0}")]
123     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
124 
125     #[error("Failed to join on vCPU threads: {0:?}")]
126     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
127 
128     #[error("Error adding CpuManager to MMIO bus: {0}")]
129     BusError(#[source] vm_device::BusError),
130 
131     #[error("Requested vCPUs exceed maximum")]
132     DesiredVCpuCountExceedsMax,
133 
134     #[error("Cannot create seccomp filter: {0}")]
135     CreateSeccompFilter(#[source] seccompiler::Error),
136 
137     #[error("Cannot apply seccomp filter: {0}")]
138     ApplySeccompFilter(#[source] seccompiler::Error),
139 
140     #[error("Error starting vCPU after restore: {0}")]
141     StartRestoreVcpu(#[source] anyhow::Error),
142 
143     #[error("Unexpected VmExit")]
144     UnexpectedVmExit,
145 
146     #[error("Failed to allocate MMIO address for CpuManager")]
147     AllocateMmmioAddress,
148 
149     #[cfg(feature = "tdx")]
150     #[error("Error initializing TDX: {0}")]
151     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
152 
153     #[cfg(target_arch = "aarch64")]
154     #[error("Error initializing PMU: {0}")]
155     InitPmu(#[source] hypervisor::HypervisorCpuError),
156 
157     #[cfg(feature = "guest_debug")]
158     #[error("Error during CPU debug: {0}")]
159     CpuDebug(#[source] hypervisor::HypervisorCpuError),
160 
161     #[cfg(feature = "guest_debug")]
162     #[error("Error translating virtual address: {0}")]
163     TranslateVirtualAddress(#[source] anyhow::Error),
164 
165     #[cfg(target_arch = "x86_64")]
166     #[error("Error setting up AMX: {0}")]
167     AmxEnable(#[source] anyhow::Error),
168 }
169 pub type Result<T> = result::Result<T, Error>;
170 
171 #[cfg(target_arch = "x86_64")]
172 #[allow(dead_code)]
173 #[repr(packed)]
174 struct LocalApic {
175     pub r#type: u8,
176     pub length: u8,
177     pub processor_id: u8,
178     pub apic_id: u8,
179     pub flags: u32,
180 }
181 
182 #[allow(dead_code)]
183 #[repr(packed)]
184 #[derive(Default)]
185 struct Ioapic {
186     pub r#type: u8,
187     pub length: u8,
188     pub ioapic_id: u8,
189     _reserved: u8,
190     pub apic_address: u32,
191     pub gsi_base: u32,
192 }
193 
194 #[cfg(target_arch = "aarch64")]
195 #[allow(dead_code)]
196 #[repr(packed)]
197 struct GicC {
198     pub r#type: u8,
199     pub length: u8,
200     pub reserved0: u16,
201     pub cpu_interface_number: u32,
202     pub uid: u32,
203     pub flags: u32,
204     pub parking_version: u32,
205     pub performance_interrupt: u32,
206     pub parked_address: u64,
207     pub base_address: u64,
208     pub gicv_base_address: u64,
209     pub gich_base_address: u64,
210     pub vgic_interrupt: u32,
211     pub gicr_base_address: u64,
212     pub mpidr: u64,
213     pub proc_power_effi_class: u8,
214     pub reserved1: u8,
215     pub spe_overflow_interrupt: u16,
216 }
217 
218 #[cfg(target_arch = "aarch64")]
219 #[allow(dead_code)]
220 #[repr(packed)]
221 struct GicD {
222     pub r#type: u8,
223     pub length: u8,
224     pub reserved0: u16,
225     pub gic_id: u32,
226     pub base_address: u64,
227     pub global_irq_base: u32,
228     pub version: u8,
229     pub reserved1: [u8; 3],
230 }
231 
232 #[cfg(target_arch = "aarch64")]
233 #[allow(dead_code)]
234 #[repr(packed)]
235 struct GicR {
236     pub r#type: u8,
237     pub length: u8,
238     pub reserved: u16,
239     pub base_address: u64,
240     pub range_length: u32,
241 }
242 
243 #[cfg(target_arch = "aarch64")]
244 #[allow(dead_code)]
245 #[repr(packed)]
246 struct GicIts {
247     pub r#type: u8,
248     pub length: u8,
249     pub reserved0: u16,
250     pub translation_id: u32,
251     pub base_address: u64,
252     pub reserved1: u32,
253 }
254 
255 #[cfg(target_arch = "aarch64")]
256 #[allow(dead_code)]
257 #[repr(packed)]
258 struct ProcessorHierarchyNode {
259     pub r#type: u8,
260     pub length: u8,
261     pub reserved: u16,
262     pub flags: u32,
263     pub parent: u32,
264     pub acpi_processor_id: u32,
265     pub num_private_resources: u32,
266 }
267 
268 #[allow(dead_code)]
269 #[repr(packed)]
270 #[derive(Default)]
271 struct InterruptSourceOverride {
272     pub r#type: u8,
273     pub length: u8,
274     pub bus: u8,
275     pub source: u8,
276     pub gsi: u32,
277     pub flags: u16,
278 }
279 
280 #[cfg(feature = "guest_debug")]
281 macro_rules! round_up {
282     ($n:expr,$d:expr) => {
283         (($n / ($d + 1)) + 1) * $d
284     };
285 }
286 
287 /// A wrapper around creating and using a kvm-based VCPU.
288 pub struct Vcpu {
289     // The hypervisor abstracted CPU.
290     vcpu: Arc<dyn hypervisor::Vcpu>,
291     id: u8,
292     #[cfg(target_arch = "aarch64")]
293     mpidr: u64,
294     saved_state: Option<CpuState>,
295 }
296 
297 impl Vcpu {
298     /// Constructs a new VCPU for `vm`.
299     ///
300     /// # Arguments
301     ///
302     /// * `id` - Represents the CPU number between [0, max vcpus).
303     /// * `vm` - The virtual machine this vcpu will get attached to.
304     /// * `vm_ops` - Optional object for exit handling.
305     pub fn new(
306         id: u8,
307         vm: &Arc<dyn hypervisor::Vm>,
308         vm_ops: Option<Arc<dyn VmOps>>,
309     ) -> Result<Self> {
310         let vcpu = vm
311             .create_vcpu(id, vm_ops)
312             .map_err(|e| Error::VcpuCreate(e.into()))?;
313         // Initially the cpuid per vCPU is the one supported by this VM.
314         Ok(Vcpu {
315             vcpu,
316             id,
317             #[cfg(target_arch = "aarch64")]
318             mpidr: 0,
319             saved_state: None,
320         })
321     }
322 
323     /// Configures a vcpu and should be called once per vcpu when created.
324     ///
325     /// # Arguments
326     ///
327     /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
328     /// * `vm_memory` - Guest memory.
329     /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
330     pub fn configure(
331         &mut self,
332         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
333         kernel_entry_point: Option<EntryPoint>,
334         #[cfg(target_arch = "x86_64")] vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
335         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
336         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
337     ) -> Result<()> {
338         #[cfg(target_arch = "aarch64")]
339         {
340             self.init(vm)?;
341             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point)
342                 .map_err(Error::VcpuConfiguration)?;
343         }
344         info!("Configuring vCPU: cpu_id = {}", self.id);
345         #[cfg(target_arch = "x86_64")]
346         arch::configure_vcpu(
347             &self.vcpu,
348             self.id,
349             kernel_entry_point,
350             vm_memory,
351             cpuid,
352             kvm_hyperv,
353         )
354         .map_err(Error::VcpuConfiguration)?;
355 
356         Ok(())
357     }
358 
359     /// Gets the MPIDR register value.
360     #[cfg(target_arch = "aarch64")]
361     pub fn get_mpidr(&self) -> u64 {
362         self.mpidr
363     }
364 
365     /// Gets the saved vCPU state.
366     #[cfg(target_arch = "aarch64")]
367     pub fn get_saved_state(&self) -> Option<CpuState> {
368         self.saved_state.clone()
369     }
370 
371     /// Initializes an aarch64 specific vcpu for booting Linux.
372     #[cfg(target_arch = "aarch64")]
373     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
374         let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
375 
376         // This reads back the kernel's preferred target type.
377         vm.get_preferred_target(&mut kvi)
378             .map_err(Error::VcpuArmPreferredTarget)?;
379         // We already checked that the capability is supported.
380         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
381         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
382         // Non-boot cpus are powered off initially.
383         if self.id > 0 {
384             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
385         }
386         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
387     }
388 
389     /// Runs the VCPU until it exits, returning the reason.
390     ///
391     /// Note that the state of the VCPU and associated VM must be setup first for this to do
392     /// anything useful.
393     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
394         self.vcpu.run()
395     }
396 }
397 
398 const VCPU_SNAPSHOT_ID: &str = "vcpu";
399 impl Pausable for Vcpu {}
400 impl Snapshottable for Vcpu {
401     fn id(&self) -> String {
402         VCPU_SNAPSHOT_ID.to_string()
403     }
404 
405     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
406         let saved_state = self
407             .vcpu
408             .state()
409             .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?;
410 
411         let mut vcpu_snapshot = Snapshot::new(&format!("{:03}", self.id));
412         vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state(
413             VCPU_SNAPSHOT_ID,
414             &saved_state,
415         )?);
416 
417         self.saved_state = Some(saved_state);
418 
419         Ok(vcpu_snapshot)
420     }
421 
422     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
423         let saved_state: CpuState = snapshot.to_state(VCPU_SNAPSHOT_ID)?;
424 
425         self.vcpu
426             .set_state(&saved_state)
427             .map_err(|e| MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e)))?;
428 
429         self.saved_state = Some(saved_state);
430 
431         Ok(())
432     }
433 }
434 
435 pub struct CpuManager {
436     hypervisor_type: HypervisorType,
437     config: CpusConfig,
438     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
439     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
440     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
441     vm_memory: GuestMemoryAtomic<GuestMemoryMmap>,
442     #[cfg(target_arch = "x86_64")]
443     cpuid: Vec<CpuIdEntry>,
444     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
445     vm: Arc<dyn hypervisor::Vm>,
446     vcpus_kill_signalled: Arc<AtomicBool>,
447     vcpus_pause_signalled: Arc<AtomicBool>,
448     exit_evt: EventFd,
449     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
450     reset_evt: EventFd,
451     #[cfg(feature = "guest_debug")]
452     vm_debug_evt: EventFd,
453     vcpu_states: Vec<VcpuState>,
454     selected_cpu: u8,
455     vcpus: Vec<Arc<Mutex<Vcpu>>>,
456     seccomp_action: SeccompAction,
457     vm_ops: Arc<dyn VmOps>,
458     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
459     acpi_address: Option<GuestAddress>,
460     proximity_domain_per_cpu: BTreeMap<u8, u32>,
461     affinity: BTreeMap<u8, Vec<u8>>,
462     dynamic: bool,
463 }
464 
465 const CPU_ENABLE_FLAG: usize = 0;
466 const CPU_INSERTING_FLAG: usize = 1;
467 const CPU_REMOVING_FLAG: usize = 2;
468 const CPU_EJECT_FLAG: usize = 3;
469 
470 const CPU_STATUS_OFFSET: u64 = 4;
471 const CPU_SELECTION_OFFSET: u64 = 0;
472 
473 impl BusDevice for CpuManager {
474     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
475         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
476         data.fill(0);
477 
478         match offset {
479             CPU_SELECTION_OFFSET => {
480                 data[0] = self.selected_cpu;
481             }
482             CPU_STATUS_OFFSET => {
483                 if self.selected_cpu < self.max_vcpus() {
484                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
485                     if state.active() {
486                         data[0] |= 1 << CPU_ENABLE_FLAG;
487                     }
488                     if state.inserting {
489                         data[0] |= 1 << CPU_INSERTING_FLAG;
490                     }
491                     if state.removing {
492                         data[0] |= 1 << CPU_REMOVING_FLAG;
493                     }
494                 } else {
495                     warn!("Out of range vCPU id: {}", self.selected_cpu);
496                 }
497             }
498             _ => {
499                 warn!(
500                     "Unexpected offset for accessing CPU manager device: {:#}",
501                     offset
502                 );
503             }
504         }
505     }
506 
507     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
508         match offset {
509             CPU_SELECTION_OFFSET => {
510                 self.selected_cpu = data[0];
511             }
512             CPU_STATUS_OFFSET => {
513                 if self.selected_cpu < self.max_vcpus() {
514                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
515                     // The ACPI code writes back a 1 to acknowledge the insertion
516                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
517                         && state.inserting
518                     {
519                         state.inserting = false;
520                     }
521                     // Ditto for removal
522                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
523                         && state.removing
524                     {
525                         state.removing = false;
526                     }
527                     // Trigger removal of vCPU
528                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
529                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
530                             error!("Error removing vCPU: {:?}", e);
531                         }
532                     }
533                 } else {
534                     warn!("Out of range vCPU id: {}", self.selected_cpu);
535                 }
536             }
537             _ => {
538                 warn!(
539                     "Unexpected offset for accessing CPU manager device: {:#}",
540                     offset
541                 );
542             }
543         }
544         None
545     }
546 }
547 
548 #[derive(Default)]
549 struct VcpuState {
550     inserting: bool,
551     removing: bool,
552     handle: Option<thread::JoinHandle<()>>,
553     kill: Arc<AtomicBool>,
554     vcpu_run_interrupted: Arc<AtomicBool>,
555 }
556 
557 impl VcpuState {
558     fn active(&self) -> bool {
559         self.handle.is_some()
560     }
561 
562     fn signal_thread(&self) {
563         if let Some(handle) = self.handle.as_ref() {
564             loop {
565                 unsafe {
566                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
567                 }
568                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
569                     break;
570                 } else {
571                     // This is more effective than thread::yield_now() at
572                     // avoiding a priority inversion with the vCPU thread
573                     thread::sleep(std::time::Duration::from_millis(1));
574                 }
575             }
576         }
577     }
578 
579     fn join_thread(&mut self) -> Result<()> {
580         if let Some(handle) = self.handle.take() {
581             handle.join().map_err(Error::ThreadCleanup)?
582         }
583 
584         Ok(())
585     }
586 
587     fn unpark_thread(&self) {
588         if let Some(handle) = self.handle.as_ref() {
589             handle.thread().unpark()
590         }
591     }
592 }
593 
594 impl CpuManager {
595     #[allow(unused_variables)]
596     #[allow(clippy::too_many_arguments)]
597     pub fn new(
598         config: &CpusConfig,
599         device_manager: &Arc<Mutex<DeviceManager>>,
600         memory_manager: &Arc<Mutex<MemoryManager>>,
601         vm: Arc<dyn hypervisor::Vm>,
602         exit_evt: EventFd,
603         reset_evt: EventFd,
604         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
605         hypervisor: Arc<dyn hypervisor::Hypervisor>,
606         seccomp_action: SeccompAction,
607         vm_ops: Arc<dyn VmOps>,
608         #[cfg(feature = "tdx")] tdx_enabled: bool,
609         numa_nodes: &NumaNodes,
610     ) -> Result<Arc<Mutex<CpuManager>>> {
611         let guest_memory = memory_manager.lock().unwrap().guest_memory();
612         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
613         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
614         let hypervisor_type = hypervisor.hypervisor_type();
615 
616         #[cfg(target_arch = "x86_64")]
617         let sgx_epc_sections = memory_manager
618             .lock()
619             .unwrap()
620             .sgx_epc_region()
621             .as_ref()
622             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
623         #[cfg(target_arch = "x86_64")]
624         let cpuid = {
625             let phys_bits = physical_bits(config.max_phys_bits);
626             arch::generate_common_cpuid(
627                 hypervisor,
628                 config
629                     .topology
630                     .clone()
631                     .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)),
632                 sgx_epc_sections,
633                 phys_bits,
634                 config.kvm_hyperv,
635                 #[cfg(feature = "tdx")]
636                 tdx_enabled,
637             )
638             .map_err(Error::CommonCpuId)?
639         };
640         #[cfg(target_arch = "x86_64")]
641         if config.features.amx {
642             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
643             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
644             const XFEATURE_XTILEDATA: usize = 18;
645             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
646 
647             // This is safe as the syscall is only modifing kernel internal
648             // data structures that the kernel is itself expected to safeguard.
649             let amx_tile = unsafe {
650                 libc::syscall(
651                     libc::SYS_arch_prctl,
652                     ARCH_REQ_XCOMP_GUEST_PERM,
653                     XFEATURE_XTILEDATA,
654                 )
655             };
656 
657             if amx_tile != 0 {
658                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
659             } else {
660                 // This is safe as the mask being modified (not marked mutable as it is
661                 // modified in unsafe only which is permitted) isn't in use elsewhere.
662                 let mask: usize = 0;
663                 let result = unsafe {
664                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
665                 };
666                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
667                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
668                 }
669             }
670         }
671 
672         let device_manager = device_manager.lock().unwrap();
673 
674         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
675             let mut cpu_list = Vec::new();
676             for (proximity_domain, numa_node) in numa_nodes.iter() {
677                 for cpu in numa_node.cpus.iter() {
678                     cpu_list.push((*cpu, *proximity_domain))
679                 }
680             }
681             cpu_list
682         }
683         .into_iter()
684         .collect();
685 
686         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
687             cpu_affinity
688                 .iter()
689                 .map(|a| (a.vcpu, a.host_cpus.clone()))
690                 .collect()
691         } else {
692             BTreeMap::new()
693         };
694 
695         #[cfg(feature = "tdx")]
696         let dynamic = !tdx_enabled;
697         #[cfg(not(feature = "tdx"))]
698         let dynamic = true;
699 
700         let acpi_address = if dynamic {
701             Some(
702                 device_manager
703                     .allocator()
704                     .lock()
705                     .unwrap()
706                     .allocate_platform_mmio_addresses(None, CPU_MANAGER_ACPI_SIZE as u64, None)
707                     .ok_or(Error::AllocateMmmioAddress)?,
708             )
709         } else {
710             None
711         };
712 
713         let cpu_manager = Arc::new(Mutex::new(CpuManager {
714             hypervisor_type,
715             config: config.clone(),
716             interrupt_controller: device_manager.interrupt_controller().clone(),
717             vm_memory: guest_memory,
718             #[cfg(target_arch = "x86_64")]
719             cpuid,
720             vm,
721             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
722             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
723             vcpu_states,
724             exit_evt,
725             reset_evt,
726             #[cfg(feature = "guest_debug")]
727             vm_debug_evt,
728             selected_cpu: 0,
729             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
730             seccomp_action,
731             vm_ops,
732             acpi_address,
733             proximity_domain_per_cpu,
734             affinity,
735             dynamic,
736         }));
737 
738         if let Some(acpi_address) = acpi_address {
739             device_manager
740                 .mmio_bus()
741                 .insert(
742                     cpu_manager.clone(),
743                     acpi_address.0,
744                     CPU_MANAGER_ACPI_SIZE as u64,
745                 )
746                 .map_err(Error::BusError)?;
747         }
748 
749         Ok(cpu_manager)
750     }
751 
752     fn create_vcpu(
753         &mut self,
754         cpu_id: u8,
755         entry_point: Option<EntryPoint>,
756         snapshot: Option<Snapshot>,
757     ) -> Result<()> {
758         info!("Creating vCPU: cpu_id = {}", cpu_id);
759 
760         let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?;
761 
762         if let Some(snapshot) = snapshot {
763             // AArch64 vCPUs should be initialized after created.
764             #[cfg(target_arch = "aarch64")]
765             vcpu.init(&self.vm)?;
766 
767             vcpu.restore(snapshot).expect("Failed to restore vCPU");
768         } else {
769             #[cfg(target_arch = "x86_64")]
770             vcpu.configure(
771                 entry_point,
772                 &self.vm_memory,
773                 self.cpuid.clone(),
774                 self.config.kvm_hyperv,
775             )
776             .expect("Failed to configure vCPU");
777 
778             #[cfg(target_arch = "aarch64")]
779             vcpu.configure(&self.vm, entry_point)
780                 .expect("Failed to configure vCPU");
781         }
782 
783         // Adding vCPU to the CpuManager's vCPU list.
784         let vcpu = Arc::new(Mutex::new(vcpu));
785         self.vcpus.push(vcpu);
786 
787         Ok(())
788     }
789 
790     /// Only create new vCPUs if there aren't any inactive ones to reuse
791     fn create_vcpus(&mut self, desired_vcpus: u8, entry_point: Option<EntryPoint>) -> Result<()> {
792         info!(
793             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
794             desired_vcpus,
795             self.config.max_vcpus,
796             self.vcpus.len(),
797             self.present_vcpus()
798         );
799 
800         if desired_vcpus > self.config.max_vcpus {
801             return Err(Error::DesiredVCpuCountExceedsMax);
802         }
803 
804         // Only create vCPUs in excess of all the allocated vCPUs.
805         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
806             self.create_vcpu(cpu_id, entry_point, None)?;
807         }
808 
809         Ok(())
810     }
811 
812     #[cfg(target_arch = "aarch64")]
813     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
814         for cpu in self.vcpus.iter() {
815             let cpu = cpu.lock().unwrap();
816             // Check if PMU attr is available, if not, log the information.
817             if cpu.vcpu.has_pmu_support() {
818                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
819             } else {
820                 debug!(
821                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
822                     cpu.id
823                 );
824                 return Ok(false);
825             }
826         }
827 
828         Ok(true)
829     }
830 
831     fn start_vcpu(
832         &mut self,
833         vcpu: Arc<Mutex<Vcpu>>,
834         vcpu_id: u8,
835         vcpu_thread_barrier: Arc<Barrier>,
836         inserting: bool,
837     ) -> Result<()> {
838         let reset_evt = self.reset_evt.try_clone().unwrap();
839         let exit_evt = self.exit_evt.try_clone().unwrap();
840         #[cfg(feature = "guest_debug")]
841         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
842         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
843         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
844         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
845 
846         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
847         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
848             .vcpu_run_interrupted
849             .clone();
850         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
851 
852         // Prepare the CPU set the current vCPU is expected to run onto.
853         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
854             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
855             unsafe { libc::CPU_ZERO(&mut cpuset) };
856             for host_cpu in host_cpus {
857                 unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) };
858             }
859             cpuset
860         });
861 
862         // Retrieve seccomp filter for vcpu thread
863         let vcpu_seccomp_filter =
864             get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type)
865                 .map_err(Error::CreateSeccompFilter)?;
866 
867         #[cfg(target_arch = "x86_64")]
868         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
869 
870         info!("Starting vCPU: cpu_id = {}", vcpu_id);
871 
872         let handle = Some(
873             thread::Builder::new()
874                 .name(format!("vcpu{}", vcpu_id))
875                 .spawn(move || {
876                     // Schedule the thread to run on the expected CPU set
877                     if let Some(cpuset) = cpuset.as_ref() {
878                         let ret = unsafe {
879                             libc::sched_setaffinity(
880                                 0,
881                                 std::mem::size_of::<libc::cpu_set_t>(),
882                                 cpuset as *const libc::cpu_set_t,
883                             )
884                         };
885 
886                         if ret != 0 {
887                             error!(
888                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
889                                 vcpu_id,
890                                 io::Error::last_os_error()
891                             );
892                             return;
893                         }
894                     }
895 
896                     // Apply seccomp filter for vcpu thread.
897                     if !vcpu_seccomp_filter.is_empty() {
898                         if let Err(e) =
899                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
900                         {
901                             error!("Error applying seccomp filter: {:?}", e);
902                             return;
903                         }
904                     }
905                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
906                     // This uses an async signal safe handler to kill the vcpu handles.
907                     register_signal_handler(SIGRTMIN(), handle_signal)
908                         .expect("Failed to register vcpu signal handler");
909                     // Block until all CPUs are ready.
910                     vcpu_thread_barrier.wait();
911 
912                     std::panic::catch_unwind(move || {
913                         loop {
914                             // If we are being told to pause, we park the thread
915                             // until the pause boolean is toggled.
916                             // The resume operation is responsible for toggling
917                             // the boolean and unpark the thread.
918                             // We enter a loop because park() could spuriously
919                             // return. We will then park() again unless the
920                             // pause boolean has been toggled.
921 
922                             // Need to use Ordering::SeqCst as we have multiple
923                             // loads and stores to different atomics and we need
924                             // to see them in a consistent order in all threads
925 
926                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
927                                 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
928                                 // completed by returning to KVM_RUN. From the kernel docs:
929                                 //
930                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
931                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
932                                 // operations are complete (and guest state is consistent) only after userspace
933                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
934                                 // incomplete operations and then check for pending signals.
935                                 // The pending state of the operation is not preserved in state which is
936                                 // visible to userspace, thus userspace should ensure that the operation is
937                                 // completed before performing a live migration.  Userspace can re-enter the
938                                 // guest with an unmasked signal pending or with the immediate_exit field set
939                                 // to complete pending operations without allowing any further instructions
940                                 // to be executed.
941 
942                                 #[cfg(feature = "kvm")]
943                                 {
944                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
945                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
946                                         error!("Unexpected VM exit on \"immediate_exit\" run");
947                                         break;
948                                     }
949                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
950                                 }
951 
952                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
953                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
954                                     thread::park();
955                                 }
956                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
957                             }
958 
959                             // We've been told to terminate
960                             if vcpu_kill_signalled.load(Ordering::SeqCst)
961                                 || vcpu_kill.load(Ordering::SeqCst)
962                             {
963                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
964                                 break;
965                             }
966 
967                             #[cfg(feature = "tdx")]
968                             let mut vcpu = vcpu.lock().unwrap();
969                             #[cfg(not(feature = "tdx"))]
970                             let vcpu = vcpu.lock().unwrap();
971                             // vcpu.run() returns false on a triple-fault so trigger a reset
972                             match vcpu.run() {
973                                 Ok(run) => match run {
974                                     #[cfg(feature = "kvm")]
975                                     VmExit::Debug => {
976                                         info!("VmExit::Debug");
977                                         #[cfg(feature = "guest_debug")]
978                                         {
979                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
980                                             let raw_tid = get_raw_tid(vcpu_id as usize);
981                                             vm_debug_evt.write(raw_tid as u64).unwrap();
982                                         }
983                                     }
984                                     #[cfg(target_arch = "x86_64")]
985                                     VmExit::IoapicEoi(vector) => {
986                                         if let Some(interrupt_controller) =
987                                             &interrupt_controller_clone
988                                         {
989                                             interrupt_controller
990                                                 .lock()
991                                                 .unwrap()
992                                                 .end_of_interrupt(vector);
993                                         }
994                                     }
995                                     VmExit::Ignore => {}
996                                     VmExit::Hyperv => {}
997                                     VmExit::Reset => {
998                                         info!("VmExit::Reset");
999                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1000                                         reset_evt.write(1).unwrap();
1001                                         break;
1002                                     }
1003                                     VmExit::Shutdown => {
1004                                         info!("VmExit::Shutdown");
1005                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1006                                         exit_evt.write(1).unwrap();
1007                                         break;
1008                                     }
1009                                     #[cfg(feature = "tdx")]
1010                                     VmExit::Tdx => {
1011                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1012                                             match vcpu.get_tdx_exit_details() {
1013                                                 Ok(details) => match details {
1014                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1015                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1016                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1017                                                     }
1018                                                 },
1019                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1020                                             }
1021                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1022                                         } else {
1023                                             // We should never reach this code as
1024                                             // this means the design from the code
1025                                             // is wrong.
1026                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1027                                         }
1028                                     }
1029                                     _ => {
1030                                         error!(
1031                                             "VCPU generated error: {:?}",
1032                                             Error::UnexpectedVmExit
1033                                         );
1034                                         break;
1035                                     }
1036                                 },
1037 
1038                                 Err(e) => {
1039                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1040                                     break;
1041                                 }
1042                             }
1043 
1044                             // We've been told to terminate
1045                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1046                                 || vcpu_kill.load(Ordering::SeqCst)
1047                             {
1048                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1049                                 break;
1050                             }
1051                         }
1052                     })
1053                     .or_else(|_| {
1054                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1055                         error!("vCPU thread panicked");
1056                         panic_exit_evt.write(1)
1057                     })
1058                     .ok();
1059                 })
1060                 .map_err(Error::VcpuSpawn)?,
1061         );
1062 
1063         // On hot plug calls into this function entry_point is None. It is for
1064         // those hotplug CPU additions that we need to set the inserting flag.
1065         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1066         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1067 
1068         Ok(())
1069     }
1070 
1071     /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
1072     fn activate_vcpus(
1073         &mut self,
1074         desired_vcpus: u8,
1075         inserting: bool,
1076         paused: Option<bool>,
1077     ) -> Result<()> {
1078         if desired_vcpus > self.config.max_vcpus {
1079             return Err(Error::DesiredVCpuCountExceedsMax);
1080         }
1081 
1082         let vcpu_thread_barrier = Arc::new(Barrier::new(
1083             (desired_vcpus - self.present_vcpus() + 1) as usize,
1084         ));
1085 
1086         if let Some(paused) = paused {
1087             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1088         }
1089 
1090         info!(
1091             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1092             desired_vcpus,
1093             self.vcpus.len(),
1094             self.present_vcpus(),
1095             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1096         );
1097 
1098         // This reuses any inactive vCPUs as well as any that were newly created
1099         for vcpu_id in self.present_vcpus()..desired_vcpus {
1100             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1101             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1102         }
1103 
1104         // Unblock all CPU threads.
1105         vcpu_thread_barrier.wait();
1106         Ok(())
1107     }
1108 
1109     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1110         // Mark vCPUs for removal, actual removal happens on ejection
1111         for cpu_id in desired_vcpus..self.present_vcpus() {
1112             self.vcpu_states[usize::from(cpu_id)].removing = true;
1113         }
1114     }
1115 
1116     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1117         info!("Removing vCPU: cpu_id = {}", cpu_id);
1118         let mut state = &mut self.vcpu_states[usize::from(cpu_id)];
1119         state.kill.store(true, Ordering::SeqCst);
1120         state.signal_thread();
1121         state.join_thread()?;
1122         state.handle = None;
1123 
1124         // Once the thread has exited, clear the "kill" so that it can reused
1125         state.kill.store(false, Ordering::SeqCst);
1126 
1127         Ok(())
1128     }
1129 
1130     pub fn create_boot_vcpus(&mut self, entry_point: Option<EntryPoint>) -> Result<()> {
1131         trace_scoped!("create_boot_vcpus");
1132 
1133         self.create_vcpus(self.boot_vcpus(), entry_point)
1134     }
1135 
1136     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1137     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1138         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1139     }
1140 
1141     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1142         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1143             .map_err(|e| {
1144                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1145             })?;
1146 
1147         Ok(())
1148     }
1149 
1150     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1151         if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
1152             return Ok(false);
1153         }
1154 
1155         if !self.dynamic {
1156             return Ok(false);
1157         }
1158 
1159         match desired_vcpus.cmp(&self.present_vcpus()) {
1160             cmp::Ordering::Greater => {
1161                 self.create_vcpus(desired_vcpus, None)?;
1162                 self.activate_vcpus(desired_vcpus, true, None)?;
1163                 Ok(true)
1164             }
1165             cmp::Ordering::Less => {
1166                 self.mark_vcpus_for_removal(desired_vcpus);
1167                 Ok(true)
1168             }
1169             _ => Ok(false),
1170         }
1171     }
1172 
1173     pub fn shutdown(&mut self) -> Result<()> {
1174         // Tell the vCPUs to stop themselves next time they go through the loop
1175         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1176 
1177         // Toggle the vCPUs pause boolean
1178         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1179 
1180         // Unpark all the VCPU threads.
1181         for state in self.vcpu_states.iter() {
1182             state.unpark_thread();
1183         }
1184 
1185         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1186         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1187         // above.
1188         for state in self.vcpu_states.iter() {
1189             state.signal_thread();
1190         }
1191 
1192         // Wait for all the threads to finish. This removes the state from the vector.
1193         for mut state in self.vcpu_states.drain(..) {
1194             state.join_thread()?;
1195         }
1196 
1197         Ok(())
1198     }
1199 
1200     #[cfg(feature = "tdx")]
1201     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1202         for vcpu in &self.vcpus {
1203             vcpu.lock()
1204                 .unwrap()
1205                 .vcpu
1206                 .tdx_init(hob_address)
1207                 .map_err(Error::InitializeTdx)?;
1208         }
1209         Ok(())
1210     }
1211 
1212     pub fn boot_vcpus(&self) -> u8 {
1213         self.config.boot_vcpus
1214     }
1215 
1216     pub fn max_vcpus(&self) -> u8 {
1217         self.config.max_vcpus
1218     }
1219 
1220     #[cfg(target_arch = "x86_64")]
1221     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1222         self.cpuid.clone()
1223     }
1224 
1225     fn present_vcpus(&self) -> u8 {
1226         self.vcpu_states
1227             .iter()
1228             .fold(0, |acc, state| acc + state.active() as u8)
1229     }
1230 
1231     #[cfg(target_arch = "aarch64")]
1232     pub fn get_mpidrs(&self) -> Vec<u64> {
1233         self.vcpus
1234             .iter()
1235             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1236             .collect()
1237     }
1238 
1239     #[cfg(target_arch = "aarch64")]
1240     pub fn get_saved_states(&self) -> Vec<CpuState> {
1241         self.vcpus
1242             .iter()
1243             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1244             .collect()
1245     }
1246 
1247     #[cfg(target_arch = "aarch64")]
1248     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1249         self.config
1250             .topology
1251             .clone()
1252             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1253     }
1254 
1255     pub fn create_madt(&self) -> Sdt {
1256         use crate::acpi;
1257         // This is also checked in the commandline parsing.
1258         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1259 
1260         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1261         #[cfg(target_arch = "x86_64")]
1262         {
1263             madt.write(36, arch::layout::APIC_START);
1264 
1265             for cpu in 0..self.config.max_vcpus {
1266                 let lapic = LocalApic {
1267                     r#type: acpi::ACPI_APIC_PROCESSOR,
1268                     length: 8,
1269                     processor_id: cpu,
1270                     apic_id: cpu,
1271                     flags: if cpu < self.config.boot_vcpus {
1272                         1 << MADT_CPU_ENABLE_FLAG
1273                     } else {
1274                         0
1275                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1276                 };
1277                 madt.append(lapic);
1278             }
1279 
1280             madt.append(Ioapic {
1281                 r#type: acpi::ACPI_APIC_IO,
1282                 length: 12,
1283                 ioapic_id: 0,
1284                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1285                 gsi_base: 0,
1286                 ..Default::default()
1287             });
1288 
1289             madt.append(InterruptSourceOverride {
1290                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1291                 length: 10,
1292                 bus: 0,
1293                 source: 4,
1294                 gsi: 4,
1295                 flags: 0,
1296             });
1297         }
1298 
1299         #[cfg(target_arch = "aarch64")]
1300         {
1301             /* Notes:
1302              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1303              */
1304 
1305             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1306             for cpu in 0..self.config.boot_vcpus {
1307                 let vcpu = &self.vcpus[cpu as usize];
1308                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1309                 /* ARMv8 MPIDR format:
1310                      Bits [63:40] Must be zero
1311                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1312                      Bits [31:24] Must be zero
1313                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1314                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1315                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1316                 */
1317                 let mpidr_mask = 0xff_00ff_ffff;
1318                 let gicc = GicC {
1319                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1320                     length: 80,
1321                     reserved0: 0,
1322                     cpu_interface_number: cpu as u32,
1323                     uid: cpu as u32,
1324                     flags: 1,
1325                     parking_version: 0,
1326                     performance_interrupt: 0,
1327                     parked_address: 0,
1328                     base_address: 0,
1329                     gicv_base_address: 0,
1330                     gich_base_address: 0,
1331                     vgic_interrupt: 0,
1332                     gicr_base_address: 0,
1333                     mpidr: mpidr & mpidr_mask,
1334                     proc_power_effi_class: 0,
1335                     reserved1: 0,
1336                     spe_overflow_interrupt: 0,
1337                 };
1338 
1339                 madt.append(gicc);
1340             }
1341             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1342 
1343             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1344             let gicd = GicD {
1345                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1346                 length: 24,
1347                 reserved0: 0,
1348                 gic_id: 0,
1349                 base_address: vgic_config.dist_addr,
1350                 global_irq_base: 0,
1351                 version: 3,
1352                 reserved1: [0; 3],
1353             };
1354             madt.append(gicd);
1355 
1356             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1357             let gicr = GicR {
1358                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1359                 length: 16,
1360                 reserved: 0,
1361                 base_address: vgic_config.redists_addr,
1362                 range_length: vgic_config.redists_size as u32,
1363             };
1364             madt.append(gicr);
1365 
1366             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1367             let gicits = GicIts {
1368                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1369                 length: 20,
1370                 reserved0: 0,
1371                 translation_id: 0,
1372                 base_address: vgic_config.msi_addr,
1373                 reserved1: 0,
1374             };
1375             madt.append(gicits);
1376 
1377             madt.update_checksum();
1378         }
1379 
1380         madt
1381     }
1382 
1383     #[cfg(target_arch = "aarch64")]
1384     pub fn create_pptt(&self) -> Sdt {
1385         let pptt_start = 0;
1386         let mut cpus = 0;
1387         let mut uid = 0;
1388         // If topology is not specified, the default setting is:
1389         // 1 package, multiple cores, 1 thread per core
1390         // This is also the behavior when PPTT is missing.
1391         let (threads_per_core, cores_per_package, packages) =
1392             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1393 
1394         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1395 
1396         for cluster_idx in 0..packages {
1397             if cpus < self.config.boot_vcpus as usize {
1398                 let cluster_offset = pptt.len() - pptt_start;
1399                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1400                     r#type: 0,
1401                     length: 20,
1402                     reserved: 0,
1403                     flags: 0x2,
1404                     parent: 0,
1405                     acpi_processor_id: cluster_idx as u32,
1406                     num_private_resources: 0,
1407                 };
1408                 pptt.append(cluster_hierarchy_node);
1409 
1410                 for core_idx in 0..cores_per_package {
1411                     let core_offset = pptt.len() - pptt_start;
1412 
1413                     if threads_per_core > 1 {
1414                         let core_hierarchy_node = ProcessorHierarchyNode {
1415                             r#type: 0,
1416                             length: 20,
1417                             reserved: 0,
1418                             flags: 0x2,
1419                             parent: cluster_offset as u32,
1420                             acpi_processor_id: core_idx as u32,
1421                             num_private_resources: 0,
1422                         };
1423                         pptt.append(core_hierarchy_node);
1424 
1425                         for _thread_idx in 0..threads_per_core {
1426                             let thread_hierarchy_node = ProcessorHierarchyNode {
1427                                 r#type: 0,
1428                                 length: 20,
1429                                 reserved: 0,
1430                                 flags: 0xE,
1431                                 parent: core_offset as u32,
1432                                 acpi_processor_id: uid as u32,
1433                                 num_private_resources: 0,
1434                             };
1435                             pptt.append(thread_hierarchy_node);
1436                             uid += 1;
1437                         }
1438                     } else {
1439                         let thread_hierarchy_node = ProcessorHierarchyNode {
1440                             r#type: 0,
1441                             length: 20,
1442                             reserved: 0,
1443                             flags: 0xA,
1444                             parent: cluster_offset as u32,
1445                             acpi_processor_id: uid as u32,
1446                             num_private_resources: 0,
1447                         };
1448                         pptt.append(thread_hierarchy_node);
1449                         uid += 1;
1450                     }
1451                 }
1452                 cpus += (cores_per_package * threads_per_core) as usize;
1453             }
1454         }
1455 
1456         pptt.update_checksum();
1457         pptt
1458     }
1459 
1460     #[cfg(feature = "guest_debug")]
1461     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1462         self.vcpus[usize::from(cpu_id)]
1463             .lock()
1464             .unwrap()
1465             .vcpu
1466             .get_regs()
1467             .map_err(Error::CpuDebug)
1468     }
1469 
1470     #[cfg(feature = "guest_debug")]
1471     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1472         self.vcpus[usize::from(cpu_id)]
1473             .lock()
1474             .unwrap()
1475             .vcpu
1476             .set_regs(regs)
1477             .map_err(Error::CpuDebug)
1478     }
1479 
1480     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1481     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1482         self.vcpus[usize::from(cpu_id)]
1483             .lock()
1484             .unwrap()
1485             .vcpu
1486             .get_sregs()
1487             .map_err(Error::CpuDebug)
1488     }
1489 
1490     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1491     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1492         self.vcpus[usize::from(cpu_id)]
1493             .lock()
1494             .unwrap()
1495             .vcpu
1496             .set_sregs(sregs)
1497             .map_err(Error::CpuDebug)
1498     }
1499 
1500     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1501     fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> {
1502         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1503             .lock()
1504             .unwrap()
1505             .vcpu
1506             .translate_gva(gva, /* flags: unused */ 0)
1507             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1508         Ok(gpa)
1509     }
1510 
1511     ///
1512     /// On AArch64, `translate_gva` API is not provided by KVM. We implemented
1513     /// it in VMM by walking through translation tables.
1514     ///
1515     /// Address translation is big topic, here we only focus the scenario that
1516     /// happens in VMM while debugging kernel. This `translate_gva`
1517     /// implementation is restricted to:
1518     /// - Exception Level 1
1519     /// - Translate high address range only (kernel space)
1520     ///
1521     /// This implementation supports following Arm-v8a features related to
1522     /// address translation:
1523     /// - FEAT_LPA
1524     /// - FEAT_LVA
1525     /// - FEAT_LPA2
1526     ///
1527     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1528     fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> {
1529         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1530             .lock()
1531             .unwrap()
1532             .vcpu
1533             .get_sys_reg(regs::TCR_EL1)
1534             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1535         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1536             .lock()
1537             .unwrap()
1538             .vcpu
1539             .get_sys_reg(regs::TTBR1_EL1)
1540             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1541         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1542             .lock()
1543             .unwrap()
1544             .vcpu
1545             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1546             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1547 
1548         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1549         // or low (0x000xxx...).
1550         let high_range = extract_bits_64!(gva, 55, 1);
1551         if high_range == 0 {
1552             info!("VA (0x{:x}) range is not supported!", gva);
1553             return Ok(gva);
1554         }
1555 
1556         // High range size offset
1557         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1558         // Granule size
1559         let tg = extract_bits_64!(tcr_el1, 30, 2);
1560         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1561         let ds = extract_bits_64!(tcr_el1, 59, 1);
1562 
1563         if tsz == 0 {
1564             info!("VA translation is not ready!");
1565             return Ok(gva);
1566         }
1567 
1568         // VA size is determined by TCR_BL1.T1SZ
1569         let va_size = 64 - tsz;
1570         // Number of bits in VA consumed in each level of translation
1571         let stride = match tg {
1572             3 => 13, // 64KB granule size
1573             1 => 11, // 16KB granule size
1574             _ => 9,  // 4KB, default
1575         };
1576         // Starting level of walking
1577         let mut level = 4 - (va_size - 4) / stride;
1578 
1579         // PA or IPA size is determined
1580         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1581         #[allow(clippy::identity_op)]
1582         let pa_range = extract_bits_64!(id_aa64mmfr0_el1, 0, 4);
1583         // The IPA size in TCR_BL1 and PA Range in ID_AA64MMFR0_EL1 should match.
1584         // To be safe, we use the minimum value if they are different.
1585         let pa_range = std::cmp::min(tcr_ips, pa_range);
1586         // PA size in bits
1587         let pa_size = match pa_range {
1588             0 => 32,
1589             1 => 36,
1590             2 => 40,
1591             3 => 42,
1592             4 => 44,
1593             5 => 48,
1594             6 => 52,
1595             _ => {
1596                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1597                     "PA range not supported {}",
1598                     pa_range
1599                 ))))
1600             }
1601         };
1602 
1603         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1604         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1605         // If FEAT_LPA2 is present, the translation table descriptor holds
1606         // 50 bits of the table address of next level.
1607         // Otherwise, it is 48 bits.
1608         let descaddrmask = if ds == 1 {
1609             !0u64 >> (64 - 50) // mask with 50 least significant bits
1610         } else {
1611             !0u64 >> (64 - 48) // mask with 48 least significant bits
1612         };
1613         let descaddrmask = descaddrmask & !indexmask_grainsize;
1614 
1615         // Translation table base address
1616         #[allow(clippy::identity_op)]
1617         let mut descaddr: u64 = extract_bits_64!(ttbr1_el1, 0, 48);
1618         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1619         // addresss bits [48:51] comes from TTBR1_EL1 bits [2:5].
1620         if pa_size == 52 {
1621             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1622         }
1623 
1624         // Loop through tables of each level
1625         loop {
1626             // Table offset for current level
1627             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1628             descaddr |= table_offset;
1629             descaddr &= !7u64;
1630 
1631             let mut buf = [0; 8];
1632             self.vm_memory
1633                 .memory()
1634                 .read(&mut buf, GuestAddress(descaddr))
1635                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1636             let descriptor = u64::from_le_bytes(buf);
1637 
1638             descaddr = descriptor & descaddrmask;
1639             // In the case of FEAT_LPA, the next-level translation table address
1640             // bits [48:51] comes from bits [12:15] of the current descriptor.
1641             // For FEAT_LPA2, the next-level translation table address
1642             // bits [50:51] comes from bits [8:9] of the current descriptor,
1643             // bits [48:49] comes from bits [48:49] of the descriptor which was
1644             // handled previously.
1645             if pa_size == 52 {
1646                 if ds == 1 {
1647                     // FEAT_LPA2
1648                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1649                 } else {
1650                     // FEAT_LPA
1651                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1652                 }
1653             }
1654 
1655             if (descriptor & 2) != 0 && (level < 3) {
1656                 // This is a table entry. Go down to next level.
1657                 level += 1;
1658                 indexmask = indexmask_grainsize;
1659                 continue;
1660             }
1661 
1662             break;
1663         }
1664 
1665         // We have reached either:
1666         // - a page entry at level 3 or
1667         // - a block entry at level 1 or 2
1668         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1669         descaddr &= !(page_size - 1);
1670         descaddr |= gva & (page_size - 1);
1671 
1672         Ok(descaddr)
1673     }
1674 }
1675 
1676 struct Cpu {
1677     cpu_id: u8,
1678     proximity_domain: u32,
1679     dynamic: bool,
1680 }
1681 
1682 #[cfg(target_arch = "x86_64")]
1683 const MADT_CPU_ENABLE_FLAG: usize = 0;
1684 
1685 #[cfg(target_arch = "x86_64")]
1686 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1687 
1688 impl Cpu {
1689     #[cfg(target_arch = "x86_64")]
1690     fn generate_mat(&self) -> Vec<u8> {
1691         let lapic = LocalApic {
1692             r#type: 0,
1693             length: 8,
1694             processor_id: self.cpu_id,
1695             apic_id: self.cpu_id,
1696             flags: 1 << MADT_CPU_ENABLE_FLAG,
1697         };
1698 
1699         let mut mat_data: Vec<u8> = Vec::new();
1700         mat_data.resize(std::mem::size_of_val(&lapic), 0);
1701         unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic };
1702 
1703         mat_data
1704     }
1705 }
1706 
1707 impl Aml for Cpu {
1708     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1709         #[cfg(target_arch = "x86_64")]
1710         let mat_data: Vec<u8> = self.generate_mat();
1711         #[allow(clippy::if_same_then_else)]
1712         if self.dynamic {
1713             aml::Device::new(
1714                 format!("C{:03}", self.cpu_id).as_str().into(),
1715                 vec![
1716                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1717                     &aml::Name::new("_UID".into(), &self.cpu_id),
1718                     // Currently, AArch64 cannot support following fields.
1719                     /*
1720                     _STA return value:
1721                     Bit [0] – Set if the device is present.
1722                     Bit [1] – Set if the device is enabled and decoding its resources.
1723                     Bit [2] – Set if the device should be shown in the UI.
1724                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1725                     Bit [4] – Set if the battery is present.
1726                     Bits [31:5] – Reserved (must be cleared).
1727                     */
1728                     #[cfg(target_arch = "x86_64")]
1729                     &aml::Method::new(
1730                         "_STA".into(),
1731                         0,
1732                         false,
1733                         // Call into CSTA method which will interrogate device
1734                         vec![&aml::Return::new(&aml::MethodCall::new(
1735                             "CSTA".into(),
1736                             vec![&self.cpu_id],
1737                         ))],
1738                     ),
1739                     &aml::Method::new(
1740                         "_PXM".into(),
1741                         0,
1742                         false,
1743                         vec![&aml::Return::new(&self.proximity_domain)],
1744                     ),
1745                     // The Linux kernel expects every CPU device to have a _MAT entry
1746                     // containing the LAPIC for this processor with the enabled bit set
1747                     // even it if is disabled in the MADT (non-boot CPU)
1748                     #[cfg(target_arch = "x86_64")]
1749                     &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
1750                     // Trigger CPU ejection
1751                     #[cfg(target_arch = "x86_64")]
1752                     &aml::Method::new(
1753                         "_EJ0".into(),
1754                         1,
1755                         false,
1756                         // Call into CEJ0 method which will actually eject device
1757                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1758                     ),
1759                 ],
1760             )
1761             .append_aml_bytes(bytes);
1762         } else {
1763             aml::Device::new(
1764                 format!("C{:03}", self.cpu_id).as_str().into(),
1765                 vec![
1766                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1767                     &aml::Name::new("_UID".into(), &self.cpu_id),
1768                     #[cfg(target_arch = "x86_64")]
1769                     &aml::Method::new(
1770                         "_STA".into(),
1771                         0,
1772                         false,
1773                         // Mark CPU present see CSTA implementation
1774                         vec![&aml::Return::new(&0xfu8)],
1775                     ),
1776                     &aml::Method::new(
1777                         "_PXM".into(),
1778                         0,
1779                         false,
1780                         vec![&aml::Return::new(&self.proximity_domain)],
1781                     ),
1782                     // The Linux kernel expects every CPU device to have a _MAT entry
1783                     // containing the LAPIC for this processor with the enabled bit set
1784                     // even it if is disabled in the MADT (non-boot CPU)
1785                     #[cfg(target_arch = "x86_64")]
1786                     &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
1787                 ],
1788             )
1789             .append_aml_bytes(bytes);
1790         }
1791     }
1792 }
1793 
1794 struct CpuNotify {
1795     cpu_id: u8,
1796 }
1797 
1798 impl Aml for CpuNotify {
1799     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1800         let object = aml::Path::new(&format!("C{:03}", self.cpu_id));
1801         aml::If::new(
1802             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
1803             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1804         )
1805         .append_aml_bytes(bytes)
1806     }
1807 }
1808 
1809 struct CpuMethods {
1810     max_vcpus: u8,
1811     dynamic: bool,
1812 }
1813 
1814 impl Aml for CpuMethods {
1815     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1816         if self.dynamic {
1817             // CPU status method
1818             aml::Method::new(
1819                 "CSTA".into(),
1820                 1,
1821                 true,
1822                 vec![
1823                     // Take lock defined above
1824                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1825                     // Write CPU number (in first argument) to I/O port via field
1826                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1827                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1828                     // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
1829                     &aml::If::new(
1830                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
1831                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
1832                     ),
1833                     // Release lock
1834                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1835                     // Return 0 or 0xf
1836                     &aml::Return::new(&aml::Local(0)),
1837                 ],
1838             )
1839             .append_aml_bytes(bytes);
1840 
1841             let mut cpu_notifies = Vec::new();
1842             for cpu_id in 0..self.max_vcpus {
1843                 cpu_notifies.push(CpuNotify { cpu_id });
1844             }
1845 
1846             let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
1847             for cpu_id in 0..self.max_vcpus {
1848                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
1849             }
1850 
1851             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).append_aml_bytes(bytes);
1852 
1853             aml::Method::new(
1854                 "CEJ0".into(),
1855                 1,
1856                 true,
1857                 vec![
1858                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1859                     // Write CPU number (in first argument) to I/O port via field
1860                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1861                     // Set CEJ0 bit
1862                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
1863                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1864                 ],
1865             )
1866             .append_aml_bytes(bytes);
1867 
1868             aml::Method::new(
1869                 "CSCN".into(),
1870                 0,
1871                 true,
1872                 vec![
1873                     // Take lock defined above
1874                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1875                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1876                     &aml::While::new(
1877                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
1878                         vec![
1879                             // Write CPU number (in first argument) to I/O port via field
1880                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
1881                             // Check if CINS bit is set
1882                             &aml::If::new(
1883                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
1884                                 // Notify device if it is
1885                                 vec![
1886                                     &aml::MethodCall::new(
1887                                         "CTFY".into(),
1888                                         vec![&aml::Local(0), &aml::ONE],
1889                                     ),
1890                                     // Reset CINS bit
1891                                     &aml::Store::new(
1892                                         &aml::Path::new("\\_SB_.PRES.CINS"),
1893                                         &aml::ONE,
1894                                     ),
1895                                 ],
1896                             ),
1897                             // Check if CRMV bit is set
1898                             &aml::If::new(
1899                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
1900                                 // Notify device if it is (with the eject constant 0x3)
1901                                 vec![
1902                                     &aml::MethodCall::new(
1903                                         "CTFY".into(),
1904                                         vec![&aml::Local(0), &3u8],
1905                                     ),
1906                                     // Reset CRMV bit
1907                                     &aml::Store::new(
1908                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
1909                                         &aml::ONE,
1910                                     ),
1911                                 ],
1912                             ),
1913                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
1914                         ],
1915                     ),
1916                     // Release lock
1917                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1918                 ],
1919             )
1920             .append_aml_bytes(bytes)
1921         } else {
1922             aml::Method::new("CSCN".into(), 0, true, vec![]).append_aml_bytes(bytes)
1923         }
1924     }
1925 }
1926 
1927 impl Aml for CpuManager {
1928     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1929         #[cfg(target_arch = "x86_64")]
1930         if let Some(acpi_address) = self.acpi_address {
1931             // CPU hotplug controller
1932             aml::Device::new(
1933                 "_SB_.PRES".into(),
1934                 vec![
1935                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
1936                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
1937                     // Mutex to protect concurrent access as we write to choose CPU and then read back status
1938                     &aml::Mutex::new("CPLK".into(), 0),
1939                     &aml::Name::new(
1940                         "_CRS".into(),
1941                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
1942                             aml::AddressSpaceCachable::NotCacheable,
1943                             true,
1944                             acpi_address.0 as u64,
1945                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
1946                         )]),
1947                     ),
1948                     // OpRegion and Fields map MMIO range into individual field values
1949                     &aml::OpRegion::new(
1950                         "PRST".into(),
1951                         aml::OpRegionSpace::SystemMemory,
1952                         acpi_address.0 as usize,
1953                         CPU_MANAGER_ACPI_SIZE,
1954                     ),
1955                     &aml::Field::new(
1956                         "PRST".into(),
1957                         aml::FieldAccessType::Byte,
1958                         aml::FieldUpdateRule::WriteAsZeroes,
1959                         vec![
1960                             aml::FieldEntry::Reserved(32),
1961                             aml::FieldEntry::Named(*b"CPEN", 1),
1962                             aml::FieldEntry::Named(*b"CINS", 1),
1963                             aml::FieldEntry::Named(*b"CRMV", 1),
1964                             aml::FieldEntry::Named(*b"CEJ0", 1),
1965                             aml::FieldEntry::Reserved(4),
1966                             aml::FieldEntry::Named(*b"CCMD", 8),
1967                         ],
1968                     ),
1969                     &aml::Field::new(
1970                         "PRST".into(),
1971                         aml::FieldAccessType::DWord,
1972                         aml::FieldUpdateRule::Preserve,
1973                         vec![
1974                             aml::FieldEntry::Named(*b"CSEL", 32),
1975                             aml::FieldEntry::Reserved(32),
1976                             aml::FieldEntry::Named(*b"CDAT", 32),
1977                         ],
1978                     ),
1979                 ],
1980             )
1981             .append_aml_bytes(bytes);
1982         }
1983 
1984         // CPU devices
1985         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
1986         let uid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05"));
1987         // Bundle methods together under a common object
1988         let methods = CpuMethods {
1989             max_vcpus: self.config.max_vcpus,
1990             dynamic: self.dynamic,
1991         };
1992         let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &uid, &methods];
1993 
1994         let mut cpu_devices = Vec::new();
1995         for cpu_id in 0..self.config.max_vcpus {
1996             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
1997             let cpu_device = Cpu {
1998                 cpu_id,
1999                 proximity_domain,
2000                 dynamic: self.dynamic,
2001             };
2002 
2003             cpu_devices.push(cpu_device);
2004         }
2005 
2006         for cpu_device in cpu_devices.iter() {
2007             cpu_data_inner.push(cpu_device);
2008         }
2009 
2010         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).append_aml_bytes(bytes)
2011     }
2012 }
2013 
2014 impl Pausable for CpuManager {
2015     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2016         // Tell the vCPUs to pause themselves next time they exit
2017         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2018 
2019         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2020         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2021         // above.
2022         for state in self.vcpu_states.iter() {
2023             state.signal_thread();
2024         }
2025 
2026         for vcpu in self.vcpus.iter() {
2027             let mut vcpu = vcpu.lock().unwrap();
2028             vcpu.pause()?;
2029             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2030             if !self.config.kvm_hyperv {
2031                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2032                     MigratableError::Pause(anyhow!(
2033                         "Could not notify guest it has been paused {:?}",
2034                         e
2035                     ))
2036                 })?;
2037             }
2038         }
2039 
2040         Ok(())
2041     }
2042 
2043     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2044         for vcpu in self.vcpus.iter() {
2045             vcpu.lock().unwrap().resume()?;
2046         }
2047 
2048         // Toggle the vCPUs pause boolean
2049         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2050 
2051         // Unpark all the VCPU threads.
2052         // Once unparked, the next thing they will do is checking for the pause
2053         // boolean. Since it'll be set to false, they will exit their pause loop
2054         // and go back to vmx root.
2055         for state in self.vcpu_states.iter() {
2056             state.unpark_thread();
2057         }
2058         Ok(())
2059     }
2060 }
2061 
2062 impl Snapshottable for CpuManager {
2063     fn id(&self) -> String {
2064         CPU_MANAGER_SNAPSHOT_ID.to_string()
2065     }
2066 
2067     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2068         let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID);
2069 
2070         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2071         for vcpu in &self.vcpus {
2072             let cpu_snapshot = vcpu.lock().unwrap().snapshot()?;
2073             cpu_manager_snapshot.add_snapshot(cpu_snapshot);
2074         }
2075 
2076         Ok(cpu_manager_snapshot)
2077     }
2078 
2079     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
2080         for (cpu_id, snapshot) in snapshot.snapshots.iter() {
2081             info!("Restoring VCPU {}", cpu_id);
2082             self.create_vcpu(cpu_id.parse::<u8>().unwrap(), None, Some(*snapshot.clone()))
2083                 .map_err(|e| MigratableError::Restore(anyhow!("Could not create vCPU {:?}", e)))?;
2084         }
2085 
2086         Ok(())
2087     }
2088 }
2089 
2090 impl Transportable for CpuManager {}
2091 impl Migratable for CpuManager {}
2092 
2093 #[cfg(feature = "guest_debug")]
2094 impl Debuggable for CpuManager {
2095     #[cfg(feature = "kvm")]
2096     fn set_guest_debug(
2097         &self,
2098         cpu_id: usize,
2099         addrs: &[GuestAddress],
2100         singlestep: bool,
2101     ) -> std::result::Result<(), DebuggableError> {
2102         self.vcpus[cpu_id]
2103             .lock()
2104             .unwrap()
2105             .vcpu
2106             .set_guest_debug(addrs, singlestep)
2107             .map_err(DebuggableError::SetDebug)
2108     }
2109 
2110     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2111         Ok(())
2112     }
2113 
2114     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2115         Ok(())
2116     }
2117 
2118     #[cfg(target_arch = "x86_64")]
2119     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2120         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2121         let gregs = self
2122             .get_regs(cpu_id as u8)
2123             .map_err(DebuggableError::ReadRegs)?;
2124         let regs = [
2125             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
2126             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
2127         ];
2128 
2129         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2130         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2131         let eflags = gregs.rflags as u32;
2132         let rip = gregs.rip;
2133 
2134         // Segment registers: CS, SS, DS, ES, FS, GS
2135         let sregs = self
2136             .get_sregs(cpu_id as u8)
2137             .map_err(DebuggableError::ReadRegs)?;
2138         let segments = X86SegmentRegs {
2139             cs: sregs.cs.selector as u32,
2140             ss: sregs.ss.selector as u32,
2141             ds: sregs.ds.selector as u32,
2142             es: sregs.es.selector as u32,
2143             fs: sregs.fs.selector as u32,
2144             gs: sregs.gs.selector as u32,
2145         };
2146 
2147         // TODO: Add other registers
2148 
2149         Ok(CoreRegs {
2150             regs,
2151             eflags,
2152             rip,
2153             segments,
2154             ..Default::default()
2155         })
2156     }
2157 
2158     #[cfg(target_arch = "aarch64")]
2159     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2160         let gregs = self
2161             .get_regs(cpu_id as u8)
2162             .map_err(DebuggableError::ReadRegs)?;
2163         Ok(CoreRegs {
2164             x: gregs.regs.regs,
2165             sp: gregs.regs.sp,
2166             pc: gregs.regs.pc,
2167             ..Default::default()
2168         })
2169     }
2170 
2171     #[cfg(target_arch = "x86_64")]
2172     fn write_regs(
2173         &self,
2174         cpu_id: usize,
2175         regs: &CoreRegs,
2176     ) -> std::result::Result<(), DebuggableError> {
2177         let orig_gregs = self
2178             .get_regs(cpu_id as u8)
2179             .map_err(DebuggableError::ReadRegs)?;
2180         let gregs = StandardRegisters {
2181             rax: regs.regs[0],
2182             rbx: regs.regs[1],
2183             rcx: regs.regs[2],
2184             rdx: regs.regs[3],
2185             rsi: regs.regs[4],
2186             rdi: regs.regs[5],
2187             rbp: regs.regs[6],
2188             rsp: regs.regs[7],
2189             r8: regs.regs[8],
2190             r9: regs.regs[9],
2191             r10: regs.regs[10],
2192             r11: regs.regs[11],
2193             r12: regs.regs[12],
2194             r13: regs.regs[13],
2195             r14: regs.regs[14],
2196             r15: regs.regs[15],
2197             rip: regs.rip,
2198             // Update the lower 32-bit of rflags.
2199             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2200         };
2201 
2202         self.set_regs(cpu_id as u8, &gregs)
2203             .map_err(DebuggableError::WriteRegs)?;
2204 
2205         // Segment registers: CS, SS, DS, ES, FS, GS
2206         // Since GDB care only selectors, we call get_sregs() first.
2207         let mut sregs = self
2208             .get_sregs(cpu_id as u8)
2209             .map_err(DebuggableError::ReadRegs)?;
2210         sregs.cs.selector = regs.segments.cs as u16;
2211         sregs.ss.selector = regs.segments.ss as u16;
2212         sregs.ds.selector = regs.segments.ds as u16;
2213         sregs.es.selector = regs.segments.es as u16;
2214         sregs.fs.selector = regs.segments.fs as u16;
2215         sregs.gs.selector = regs.segments.gs as u16;
2216 
2217         self.set_sregs(cpu_id as u8, &sregs)
2218             .map_err(DebuggableError::WriteRegs)?;
2219 
2220         // TODO: Add other registers
2221 
2222         Ok(())
2223     }
2224 
2225     #[cfg(target_arch = "aarch64")]
2226     fn write_regs(
2227         &self,
2228         cpu_id: usize,
2229         regs: &CoreRegs,
2230     ) -> std::result::Result<(), DebuggableError> {
2231         let mut gregs = self
2232             .get_regs(cpu_id as u8)
2233             .map_err(DebuggableError::ReadRegs)?;
2234 
2235         gregs.regs.regs = regs.x;
2236         gregs.regs.sp = regs.sp;
2237         gregs.regs.pc = regs.pc;
2238 
2239         self.set_regs(cpu_id as u8, &gregs)
2240             .map_err(DebuggableError::WriteRegs)?;
2241 
2242         Ok(())
2243     }
2244 
2245     fn read_mem(
2246         &self,
2247         cpu_id: usize,
2248         vaddr: GuestAddress,
2249         len: usize,
2250     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2251         let mut buf = vec![0; len];
2252         let mut total_read = 0_u64;
2253 
2254         while total_read < len as u64 {
2255             let gaddr = vaddr.0 + total_read;
2256             let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
2257                 Ok(paddr) => paddr,
2258                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2259                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2260             };
2261             let psize = arch::PAGE_SIZE as u64;
2262             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2263             self.vm_memory
2264                 .memory()
2265                 .read(
2266                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2267                     GuestAddress(paddr),
2268                 )
2269                 .map_err(DebuggableError::ReadMem)?;
2270             total_read += read_len;
2271         }
2272         Ok(buf)
2273     }
2274 
2275     fn write_mem(
2276         &self,
2277         cpu_id: usize,
2278         vaddr: &GuestAddress,
2279         data: &[u8],
2280     ) -> std::result::Result<(), DebuggableError> {
2281         let mut total_written = 0_u64;
2282 
2283         while total_written < data.len() as u64 {
2284             let gaddr = vaddr.0 + total_written;
2285             let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
2286                 Ok(paddr) => paddr,
2287                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2288                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2289             };
2290             let psize = arch::PAGE_SIZE as u64;
2291             let write_len = std::cmp::min(
2292                 data.len() as u64 - total_written,
2293                 psize - (paddr & (psize - 1)),
2294             );
2295             self.vm_memory
2296                 .memory()
2297                 .write(
2298                     &data[total_written as usize..total_written as usize + write_len as usize],
2299                     GuestAddress(paddr),
2300                 )
2301                 .map_err(DebuggableError::WriteMem)?;
2302             total_written += write_len;
2303         }
2304         Ok(())
2305     }
2306 
2307     fn active_vcpus(&self) -> usize {
2308         self.present_vcpus() as usize
2309     }
2310 }
2311 
2312 #[cfg(feature = "guest_debug")]
2313 impl Elf64Writable for CpuManager {}
2314 
2315 #[cfg(feature = "guest_debug")]
2316 impl CpuElf64Writable for CpuManager {
2317     fn cpu_write_elf64_note(
2318         &mut self,
2319         dump_state: &DumpState,
2320     ) -> std::result::Result<(), GuestDebuggableError> {
2321         let mut coredump_file = dump_state.file.as_ref().unwrap();
2322         for vcpu in &self.vcpus {
2323             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2324             let mut pos: usize = 0;
2325             let mut buf = vec![0; note_size as usize];
2326             let descsz = size_of::<X86_64ElfPrStatus>();
2327             let vcpu_id = vcpu.lock().unwrap().id;
2328 
2329             let note = Elf64_Nhdr {
2330                 n_namesz: COREDUMP_NAME_SIZE,
2331                 n_descsz: descsz as u32,
2332                 n_type: NT_PRSTATUS,
2333             };
2334 
2335             let bytes: &[u8] = note.as_slice();
2336             buf.splice(0.., bytes.to_vec());
2337             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2338             buf.resize(pos + 4, 0);
2339             buf.splice(pos.., "CORE".to_string().into_bytes());
2340 
2341             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2342             buf.resize(pos + 32 + 4, 0);
2343             let pid = vcpu_id as u64;
2344             let bytes: &[u8] = pid.as_slice();
2345             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2346 
2347             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2348 
2349             let orig_rax: u64 = 0;
2350             let gregs = self.vcpus[usize::from(vcpu_id)]
2351                 .lock()
2352                 .unwrap()
2353                 .vcpu
2354                 .get_regs()
2355                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2356 
2357             let regs1 = [
2358                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2359                 gregs.r10,
2360             ];
2361             let regs2 = [
2362                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2363             ];
2364 
2365             let sregs = self.vcpus[usize::from(vcpu_id)]
2366                 .lock()
2367                 .unwrap()
2368                 .vcpu
2369                 .get_sregs()
2370                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2371 
2372             debug!(
2373                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2374                 gregs.rip,
2375                 gregs.rsp,
2376                 sregs.gs.base,
2377                 sregs.cs.selector,
2378                 sregs.ss.selector,
2379                 sregs.ds.selector,
2380             );
2381 
2382             let regs = X86_64UserRegs {
2383                 regs1,
2384                 regs2,
2385                 rip: gregs.rip,
2386                 cs: sregs.cs.selector as u64,
2387                 eflags: gregs.rflags,
2388                 rsp: gregs.rsp,
2389                 ss: sregs.ss.selector as u64,
2390                 fs_base: sregs.fs.base as u64,
2391                 gs_base: sregs.gs.base as u64,
2392                 ds: sregs.ds.selector as u64,
2393                 es: sregs.es.selector as u64,
2394                 fs: sregs.fs.selector as u64,
2395                 gs: sregs.gs.selector as u64,
2396             };
2397 
2398             // let bytes: &[u8] = unsafe { any_as_u8_slice(&regs) };
2399             let bytes: &[u8] = regs.as_slice();
2400             buf.resize(note_size as usize, 0);
2401             buf.splice(pos.., bytes.to_vec());
2402             buf.resize(note_size as usize, 0);
2403 
2404             coredump_file
2405                 .write(&buf)
2406                 .map_err(GuestDebuggableError::CoredumpFile)?;
2407         }
2408 
2409         Ok(())
2410     }
2411 
2412     fn cpu_write_vmm_note(
2413         &mut self,
2414         dump_state: &DumpState,
2415     ) -> std::result::Result<(), GuestDebuggableError> {
2416         let mut coredump_file = dump_state.file.as_ref().unwrap();
2417         for vcpu in &self.vcpus {
2418             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2419             let mut pos: usize = 0;
2420             let mut buf = vec![0; note_size as usize];
2421             let descsz = size_of::<DumpCpusState>();
2422             let vcpu_id = vcpu.lock().unwrap().id;
2423 
2424             let note = Elf64_Nhdr {
2425                 n_namesz: COREDUMP_NAME_SIZE,
2426                 n_descsz: descsz as u32,
2427                 n_type: 0,
2428             };
2429 
2430             let bytes: &[u8] = note.as_slice();
2431             buf.splice(0.., bytes.to_vec());
2432             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2433 
2434             buf.resize(pos + 4, 0);
2435             buf.splice(pos.., "QEMU".to_string().into_bytes());
2436 
2437             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2438 
2439             let gregs = self.vcpus[usize::from(vcpu_id)]
2440                 .lock()
2441                 .unwrap()
2442                 .vcpu
2443                 .get_regs()
2444                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2445 
2446             let regs1 = [
2447                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2448                 gregs.rbp,
2449             ];
2450 
2451             let regs2 = [
2452                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2453                 gregs.r15,
2454             ];
2455 
2456             let sregs = self.vcpus[usize::from(vcpu_id)]
2457                 .lock()
2458                 .unwrap()
2459                 .vcpu
2460                 .get_sregs()
2461                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2462 
2463             let mut msrs = vec![MsrEntry {
2464                 index: msr_index::MSR_KERNEL_GS_BASE,
2465                 ..Default::default()
2466             }];
2467 
2468             self.vcpus[vcpu_id as usize]
2469                 .lock()
2470                 .unwrap()
2471                 .vcpu
2472                 .get_msrs(&mut msrs)
2473                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2474             let kernel_gs_base = msrs[0].data;
2475 
2476             let cs = CpuSegment::new(sregs.cs);
2477             let ds = CpuSegment::new(sregs.ds);
2478             let es = CpuSegment::new(sregs.es);
2479             let fs = CpuSegment::new(sregs.fs);
2480             let gs = CpuSegment::new(sregs.gs);
2481             let ss = CpuSegment::new(sregs.ss);
2482             let ldt = CpuSegment::new(sregs.ldt);
2483             let tr = CpuSegment::new(sregs.tr);
2484             let gdt = CpuSegment::new_from_table(sregs.gdt);
2485             let idt = CpuSegment::new_from_table(sregs.idt);
2486             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2487             let regs = DumpCpusState {
2488                 version: 1,
2489                 size: size_of::<DumpCpusState>() as u32,
2490                 regs1,
2491                 regs2,
2492                 rip: gregs.rip,
2493                 rflags: gregs.rflags,
2494                 cs,
2495                 ds,
2496                 es,
2497                 fs,
2498                 gs,
2499                 ss,
2500                 ldt,
2501                 tr,
2502                 gdt,
2503                 idt,
2504                 cr,
2505                 kernel_gs_base,
2506             };
2507 
2508             let bytes: &[u8] = regs.as_slice();
2509             buf.resize(note_size as usize, 0);
2510             buf.splice(pos.., bytes.to_vec());
2511             buf.resize(note_size as usize, 0);
2512 
2513             coredump_file
2514                 .write(&buf)
2515                 .map_err(GuestDebuggableError::CoredumpFile)?;
2516         }
2517 
2518         Ok(())
2519     }
2520 }
2521 
2522 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2523 #[cfg(test)]
2524 mod tests {
2525     use arch::x86_64::interrupts::*;
2526     use arch::x86_64::regs::*;
2527     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2528 
2529     #[test]
2530     fn test_setlint() {
2531         let hv = hypervisor::new().unwrap();
2532         let vm = hv.create_vm().expect("new VM fd creation failed");
2533         assert!(hv.check_required_extensions().is_ok());
2534         // Calling get_lapic will fail if there is no irqchip before hand.
2535         assert!(vm.create_irq_chip().is_ok());
2536         let vcpu = vm.create_vcpu(0, None).unwrap();
2537         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2538 
2539         // Compute the value that is expected to represent LVT0 and LVT1.
2540         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2541         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2542         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2543         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2544 
2545         set_lint(&vcpu).unwrap();
2546 
2547         // Compute the value that represents LVT0 and LVT1 after set_lint.
2548         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2549         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2550         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2551         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2552         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2553     }
2554 
2555     #[test]
2556     fn test_setup_fpu() {
2557         let hv = hypervisor::new().unwrap();
2558         let vm = hv.create_vm().expect("new VM fd creation failed");
2559         let vcpu = vm.create_vcpu(0, None).unwrap();
2560         setup_fpu(&vcpu).unwrap();
2561 
2562         let expected_fpu: FpuState = FpuState {
2563             fcw: 0x37f,
2564             mxcsr: 0x1f80,
2565             ..Default::default()
2566         };
2567         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2568         // TODO: auto-generate kvm related structures with PartialEq on.
2569         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2570         // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
2571         // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
2572         // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
2573         // remove it at all.
2574         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2575     }
2576 
2577     #[test]
2578     fn test_setup_msrs() {
2579         use hypervisor::arch::x86::{msr_index, MsrEntry};
2580 
2581         let hv = hypervisor::new().unwrap();
2582         let vm = hv.create_vm().expect("new VM fd creation failed");
2583         let vcpu = vm.create_vcpu(0, None).unwrap();
2584         setup_msrs(&vcpu).unwrap();
2585 
2586         // This test will check against the last MSR entry configured (the tenth one).
2587         // See create_msr_entries for details.
2588         let mut msrs = vec![MsrEntry {
2589             index: msr_index::MSR_IA32_MISC_ENABLE,
2590             ..Default::default()
2591         }];
2592 
2593         // get_msrs returns the number of msrs that it succeed in reading. We only want to read 1
2594         // in this test case scenario.
2595         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2596         assert_eq!(read_msrs, 1);
2597 
2598         // Official entries that were setup when we did setup_msrs. We need to assert that the
2599         // tenth one (i.e the one with index msr_index::MSR_IA32_MISC_ENABLE has the data we
2600         // expect.
2601         let entry_vec = vcpu.boot_msr_entries();
2602         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2603     }
2604 
2605     #[test]
2606     fn test_setup_regs() {
2607         let hv = hypervisor::new().unwrap();
2608         let vm = hv.create_vm().expect("new VM fd creation failed");
2609         let vcpu = vm.create_vcpu(0, None).unwrap();
2610 
2611         let expected_regs: StandardRegisters = StandardRegisters {
2612             rflags: 0x0000000000000002u64,
2613             rbx: arch::layout::PVH_INFO_START.0,
2614             rip: 1,
2615             ..Default::default()
2616         };
2617 
2618         setup_regs(&vcpu, expected_regs.rip).unwrap();
2619 
2620         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2621         assert_eq!(actual_regs, expected_regs);
2622     }
2623 }
2624 
2625 #[cfg(target_arch = "aarch64")]
2626 #[cfg(test)]
2627 mod tests {
2628     use arch::{aarch64::regs, layout};
2629     use hypervisor::kvm::aarch64::is_system_register;
2630     use hypervisor::kvm::kvm_bindings::{
2631         kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2632         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2633     };
2634     use hypervisor::{arm64_core_reg_id, offset__of};
2635     use std::mem;
2636 
2637     #[test]
2638     fn test_setup_regs() {
2639         let hv = hypervisor::new().unwrap();
2640         let vm = hv.create_vm().unwrap();
2641         let vcpu = vm.create_vcpu(0, None).unwrap();
2642 
2643         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2644         // Must fail when vcpu is not initialized yet.
2645         assert!(res.is_err());
2646 
2647         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2648         vm.get_preferred_target(&mut kvi).unwrap();
2649         vcpu.vcpu_init(&kvi).unwrap();
2650 
2651         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2652     }
2653 
2654     #[test]
2655     fn test_read_mpidr() {
2656         let hv = hypervisor::new().unwrap();
2657         let vm = hv.create_vm().unwrap();
2658         let vcpu = vm.create_vcpu(0, None).unwrap();
2659         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2660         vm.get_preferred_target(&mut kvi).unwrap();
2661 
2662         // Must fail when vcpu is not initialized yet.
2663         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2664 
2665         vcpu.vcpu_init(&kvi).unwrap();
2666         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2667     }
2668 
2669     #[test]
2670     fn test_is_system_register() {
2671         let offset = offset__of!(user_pt_regs, pc);
2672         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2673         assert!(!is_system_register(regid));
2674         let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64;
2675         assert!(is_system_register(regid));
2676     }
2677 
2678     #[test]
2679     fn test_save_restore_core_regs() {
2680         let hv = hypervisor::new().unwrap();
2681         let vm = hv.create_vm().unwrap();
2682         let vcpu = vm.create_vcpu(0, None).unwrap();
2683         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2684         vm.get_preferred_target(&mut kvi).unwrap();
2685 
2686         // Must fail when vcpu is not initialized yet.
2687         let res = vcpu.get_regs();
2688         assert!(res.is_err());
2689         assert_eq!(
2690             format!("{}", res.unwrap_err()),
2691             "Failed to get core register: Exec format error (os error 8)"
2692         );
2693 
2694         let mut state = kvm_regs::default();
2695         let res = vcpu.set_regs(&state);
2696         assert!(res.is_err());
2697         assert_eq!(
2698             format!("{}", res.unwrap_err()),
2699             "Failed to set core register: Exec format error (os error 8)"
2700         );
2701 
2702         vcpu.vcpu_init(&kvi).unwrap();
2703         let res = vcpu.get_regs();
2704         assert!(res.is_ok());
2705         state = res.unwrap();
2706         assert_eq!(state.regs.pstate, 0x3C5);
2707 
2708         assert!(vcpu.set_regs(&state).is_ok());
2709     }
2710 
2711     #[test]
2712     fn test_get_set_mpstate() {
2713         let hv = hypervisor::new().unwrap();
2714         let vm = hv.create_vm().unwrap();
2715         let vcpu = vm.create_vcpu(0, None).unwrap();
2716         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2717         vm.get_preferred_target(&mut kvi).unwrap();
2718 
2719         let res = vcpu.get_mp_state();
2720         assert!(res.is_ok());
2721         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2722     }
2723 }
2724