// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use std::collections::BTreeMap;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::io::Write;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};

#[cfg(not(target_arch = "riscv64"))]
use acpi_tables::sdt::Sdt;
use acpi_tables::{aml, Aml};
use anyhow::anyhow;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::get_x2apic_id;
use arch::{EntryPoint, NumaNodes};
#[cfg(target_arch = "aarch64")]
use devices::gic::Gic;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use hypervisor::arch::aarch64::regs::{ID_AA64MMFR0_EL1, TCR_EL1, TTBR1_EL1};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::SpecialRegisters;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuVendor;
#[cfg(feature = "kvm")]
use hypervisor::HypervisorType;
#[cfg(feature = "guest_debug")]
use hypervisor::StandardRegisters;
use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::BusDevice;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use vm_memory::ByteValued;
#[cfg(feature = "guest_debug")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::{FromBytes, Immutable, IntoBytes};

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
#[cfg(feature = "guest_debug")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
#[cfg(target_arch = "x86_64")]
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::vm_config::CpusConfig;
use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID};

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero-based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
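/// Extract the least significant `$length` bits of a 64-bit integer.
/// For example, extracting 2 bits from `0b0110u64` should return 2 (`0b10`):
/// `extract_bits_64_without_offset!(0b0110u64, 2)`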
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}

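/// Size in bytes of the MMIO window through which the CpuManager device is
/// accessed: a vCPU selection byte at offset 0 and a status byte at offset 4
/// (see `CPU_SELECTION_OFFSET` and `CPU_STATUS_OFFSET` below).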
pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("vCPU removal still pending")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error setting vCPU processor features: {0}")]
    VcpuSetProcessorFeatures(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error finalising vCPU: {0}")]
    VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error setting GICR base address: {0}")]
    VcpuSetGicrBaseAddr(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),

    #[error("Maximum number of vCPUs exceeds host limit")]
    MaximumVcpusExceeded,

    #[cfg(feature = "sev_snp")]
    #[error("Failed to set SEV control register: {0}")]
    SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "x86_64")]
    #[error("Failed to inject NMI")]
    NmiError(#[source] hypervisor::HypervisorCpuError),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(C, packed)]
#[derive(IntoBytes, Immutable, FromBytes)]
struct LocalX2Apic {
    pub r#type: u8,
    pub length: u8,
    pub _reserved: u16,
    pub apic_id: u32,
    pub flags: u32,
    pub processor_id: u32,
}

#[allow(dead_code)]
#[repr(C, packed)]
#[derive(Default, IntoBytes, Immutable, FromBytes)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(C, packed)]
#[derive(IntoBytes, Immutable, FromBytes)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(C, packed)]
#[derive(IntoBytes, Immutable, FromBytes)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(C, packed)]
#[derive(IntoBytes, Immutable, FromBytes)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(C, packed)]
#[derive(IntoBytes, Immutable, FromBytes)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(C, packed)]
#[derive(IntoBytes, Immutable, FromBytes)]
struct ProcessorHierarchyNode {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub flags: u32,
    pub parent: u32,
    pub acpi_processor_id: u32,
    pub num_private_resources: u32,
}

#[allow(dead_code)]
#[repr(C, packed)]
#[derive(Default, IntoBytes, Immutable, FromBytes)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
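// Round `$n` up to a multiple of `$d`, e.g. `round_up!(7usize, 4)` yields 8.
// Note the unconventional division by `$d + 1`, which only matches a true
// round-up for sufficiently small `$n`.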
macro_rules! round_up {
    ($n:expr,$d:expr) => {
        (($n / ($d + 1)) + 1) * $d
    };
}

/// A wrapper around creating and using a hypervisor-based vCPU.
pub struct Vcpu {
    // The hypervisor abstracted CPU.
    vcpu: Arc<dyn hypervisor::Vcpu>,
    id: u8,
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    saved_state: Option<CpuState>,
    #[cfg(target_arch = "x86_64")]
    vendor: CpuVendor,
}

impl Vcpu {
    /// Constructs a new vCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `apic_id` - The APIC ID the vCPU is created with (on x86_64, the x2APIC ID).
    /// * `vm` - The virtual machine this vCPU will get attached to.
    /// * `vm_ops` - Optional object for exit handling.
    /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
    pub fn new(
        id: u8,
        apic_id: u8,
        vm: &Arc<dyn hypervisor::Vm>,
        vm_ops: Option<Arc<dyn VmOps>>,
        #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
    ) -> Result<Self> {
        let vcpu = vm
            .create_vcpu(apic_id, vm_ops)
            .map_err(|e| Error::VcpuCreate(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
            #[cfg(target_arch = "x86_64")]
            vendor: cpu_vendor,
        })
    }

    /// Configures a vCPU; should be called once per vCPU when it is created.
    ///
    /// # Arguments
    ///
    /// * `boot_setup` - Optional kernel entry point (with boot protocol) paired with the guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
        #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
        #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
                .map_err(Error::VcpuConfiguration)?;
        }
        #[cfg(target_arch = "riscv64")]
        arch::configure_vcpu(&self.vcpu, self.id, boot_setup).map_err(Error::VcpuConfiguration)?;
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(
            &self.vcpu,
            self.id,
            boot_setup,
            cpuid,
            kvm_hyperv,
            self.vendor,
            topology,
        )
        .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64-specific vCPU for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        use std::arch::is_aarch64_feature_detected;
        #[allow(clippy::nonminimal_bool)]
        let sve_supported =
            is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2");
        let mut kvi = self.vcpu.create_vcpu_init();

        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;

        self.vcpu
            .vcpu_set_processor_features(vm, &mut kvi, self.id)
            .map_err(Error::VcpuSetProcessorFeatures)?;

        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?;

        if sve_supported {
            let finalized_features = self.vcpu.vcpu_get_finalized_features();
            self.vcpu
                .vcpu_finalize(finalized_features)
                .map_err(Error::VcpuArmFinalize)?;
        }
        Ok(())
    }

    /// Runs the vCPU until it exits, returning the reason.
    ///
    /// Note that the state of the vCPU and associated VM must be set up first for this to do
    /// anything useful.
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }

    #[cfg(feature = "sev_snp")]
    pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
        self.vcpu
            .set_sev_control_register(vmsa_pfn)
            .map_err(Error::SetSevControlRegister)
    }

    ///
    /// Sets the vCPU's GIC redistributor base address.
    ///
    #[cfg(target_arch = "aarch64")]
    pub fn set_gic_redistributor_addr(
        &self,
        base_redist_addr: u64,
        redist_size: u64,
    ) -> Result<()> {
        let gicr_base = base_redist_addr + (arch::layout::GIC_V3_REDIST_SIZE * self.id as u64);
        assert!(gicr_base + arch::layout::GIC_V3_REDIST_SIZE <= base_redist_addr + redist_size);
        self.vcpu
            .set_gic_redistributor_addr(gicr_base)
            .map_err(Error::VcpuSetGicrBaseAddr)?;
        Ok(())
    }
}

impl Pausable for Vcpu {}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        self.id.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let saved_state = self
            .vcpu
            .state()
            .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;

        self.saved_state = Some(saved_state.clone());

        Ok(Snapshot::from_data(SnapshotData::new_from_state(
            &saved_state,
        )?))
    }
}

pub struct CpuManager {
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg(target_arch = "x86_64")]
    cpuid: Vec<CpuIdEntry>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    vcpus_kick_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    #[cfg(feature = "guest_debug")]
    vm_debug_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vm_ops: Arc<dyn VmOps>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: Option<GuestAddress>,
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
    affinity: BTreeMap<u8, Vec<usize>>,
    dynamic: bool,
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    #[cfg(feature = "sev_snp")]
    sev_snp_enabled: bool,
}

const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;

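// MMIO register interface backing the ACPI CPU hotplug device: the guest
// writes a vCPU index to CPU_SELECTION_OFFSET, then reads or writes the
// selected vCPU's flags (the CPU_*_FLAG bits above) at CPU_STATUS_OFFSET.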
549     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
550         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
551         data.fill(0);
552 
553         match offset {
554             CPU_SELECTION_OFFSET => {
555                 data[0] = self.selected_cpu;
556             }
557             CPU_STATUS_OFFSET => {
558                 if self.selected_cpu < self.max_vcpus() {
559                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
560                     if state.active() {
561                         data[0] |= 1 << CPU_ENABLE_FLAG;
562                     }
563                     if state.inserting {
564                         data[0] |= 1 << CPU_INSERTING_FLAG;
565                     }
566                     if state.removing {
567                         data[0] |= 1 << CPU_REMOVING_FLAG;
568                     }
569                 } else {
570                     warn!("Out of range vCPU id: {}", self.selected_cpu);
571                 }
572             }
573             _ => {
574                 warn!(
575                     "Unexpected offset for accessing CPU manager device: {:#}",
576                     offset
577                 );
578             }
579         }
580     }
581 
582     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
583         match offset {
584             CPU_SELECTION_OFFSET => {
585                 self.selected_cpu = data[0];
586             }
587             CPU_STATUS_OFFSET => {
588                 if self.selected_cpu < self.max_vcpus() {
589                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
590                     // The ACPI code writes back a 1 to acknowledge the insertion
591                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
592                         && state.inserting
593                     {
594                         state.inserting = false;
595                     }
596                     // Ditto for removal
597                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
598                         && state.removing
599                     {
600                         state.removing = false;
601                     }
602                     // Trigger removal of vCPU
603                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
604                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
605                             error!("Error removing vCPU: {:?}", e);
606                         }
607                     }
608                 } else {
609                     warn!("Out of range vCPU id: {}", self.selected_cpu);
610                 }
611             }
612             _ => {
613                 warn!(
614                     "Unexpected offset for accessing CPU manager device: {:#}",
615                     offset
616                 );
617             }
618         }
619         None
620     }
621 }

#[derive(Default)]
struct VcpuState {
    inserting: bool,
    removing: bool,
    pending_removal: Arc<AtomicBool>,
    handle: Option<thread::JoinHandle<()>>,
    kill: Arc<AtomicBool>,
    vcpu_run_interrupted: Arc<AtomicBool>,
    paused: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                // SAFETY: FFI call with correct arguments
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
            return Err(Error::MaximumVcpusExceeded);
        }

        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
        let hypervisor_type = hypervisor.hypervisor_type();
        #[cfg(target_arch = "x86_64")]
        let cpu_vendor = hypervisor.get_cpu_vendor();

        #[cfg(target_arch = "x86_64")]
        if config.features.amx {
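            // arch_prctl() operation codes and the XTILEDATA feature bit, as
            // defined by the Linux UAPI headers (asm/prctl.h).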
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // SAFETY: the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                let mask: usize = 0;
                // SAFETY: the mask being modified (not marked mutable as it is
                // modified in unsafe only which is permitted) isn't in use elsewhere.
                let result = unsafe {
                    libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
                };
                if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
                    return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
                }
            }
        }

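        // Build a vCPU id -> proximity domain map by flattening the per-node
        // CPU lists, e.g. NUMA node 0 containing vCPUs 0 and 1 yields the
        // entries (0, 0) and (1, 0).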
        let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
            let mut cpu_list = Vec::new();
            for (proximity_domain, numa_node) in numa_nodes.iter() {
                for cpu in numa_node.cpus.iter() {
                    cpu_list.push((*cpu, *proximity_domain))
                }
            }
            cpu_list
        }
        .into_iter()
        .collect();

        let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
            cpu_affinity
                .iter()
                .map(|a| (a.vcpu, a.host_cpus.clone()))
                .collect()
        } else {
            BTreeMap::new()
        };

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        Ok(Arc::new(Mutex::new(CpuManager {
            config: config.clone(),
            interrupt_controller: None,
            #[cfg(target_arch = "x86_64")]
            cpuid: Vec::new(),
            vm,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_kick_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
            seccomp_action,
            vm_ops,
            acpi_address: None,
            proximity_domain_per_cpu,
            affinity,
            dynamic,
            hypervisor: hypervisor.clone(),
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        })))
    }

    #[cfg(target_arch = "x86_64")]
    pub fn populate_cpuid(
        &mut self,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx: bool,
    ) -> Result<()> {
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());

        self.cpuid = {
            let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
            arch::generate_common_cpuid(
                hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections,
                    phys_bits,
                    kvm_hyperv: self.config.kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx,
                    amx: self.config.features.amx,
                },
            )
            .map_err(Error::CommonCpuId)?
        };

        Ok(())
    }

    fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        #[cfg(target_arch = "x86_64")]
        let topology = self.get_vcpu_topology();
        #[cfg(target_arch = "x86_64")]
        let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
        #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
        let x2apic_id = cpu_id as u32;

        let mut vcpu = Vcpu::new(
            cpu_id,
            x2apic_id as u8,
            &self.vm,
            Some(self.vm_ops.clone()),
            #[cfg(target_arch = "x86_64")]
            self.hypervisor.get_cpu_vendor(),
        )?;

        if let Some(snapshot) = snapshot {
            // AArch64 vCPUs must be initialized after creation.
            #[cfg(target_arch = "aarch64")]
            vcpu.init(&self.vm)?;

            let state: CpuState = snapshot.to_state().map_err(|e| {
                Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
            })?;
            vcpu.vcpu
                .set_state(&state)
                .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;

            vcpu.saved_state = Some(state);
        }

        let vcpu = Arc::new(Mutex::new(vcpu));

        // Adding vCPU to the CpuManager's vCPU list.
        self.vcpus.push(vcpu.clone());

        Ok(vcpu)
    }

    pub fn configure_vcpu(
        &self,
        vcpu: Arc<Mutex<Vcpu>>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
    ) -> Result<()> {
        let mut vcpu = vcpu.lock().unwrap();

        #[cfg(feature = "sev_snp")]
        if self.sev_snp_enabled {
            if let Some((kernel_entry_point, _)) = boot_setup {
                vcpu.set_sev_control_register(
                    kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
                )?;
            }

            // The traditional way of configuring a vCPU doesn't work for SEV-SNP
            // guests: all vCPU configuration for a SEV-SNP guest is provided via the VMSA.
            return Ok(());
        }

        #[cfg(target_arch = "x86_64")]
        assert!(!self.cpuid.is_empty());

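        // With no topology explicitly configured, fall back to a flat one:
        // 1 thread per core, one core per boot vCPU, and a single die per
        // package.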
        #[cfg(target_arch = "x86_64")]
        let topology = self.config.topology.clone().map_or_else(
            || Some((1, self.boot_vcpus(), 1)),
            |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
        );
        #[cfg(target_arch = "x86_64")]
        vcpu.configure(
            boot_setup,
            self.cpuid.clone(),
            self.config.kvm_hyperv,
            topology,
        )?;

        #[cfg(target_arch = "aarch64")]
        vcpu.configure(&self.vm, boot_setup)?;

        #[cfg(target_arch = "riscv64")]
        vcpu.configure(boot_setup)?;

        Ok(())
    }

    /// Only create new vCPUs if there aren't any inactive ones to reuse
    fn create_vcpus(
        &mut self,
        desired_vcpus: u8,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.config.max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        for cpu_id in self.vcpus.len() as u8..desired_vcpus {
            vcpus.push(self.create_vcpu(
                cpu_id,
                // TODO: The special format of the CPU id can be removed once
                // ready to break live upgrade.
                snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
            )?);
        }

        Ok(vcpus)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn init_pmu(&self, irq: u32) -> Result<bool> {
        for cpu in self.vcpus.iter() {
            let cpu = cpu.lock().unwrap();
            // Check if the PMU attribute is available; if not, log it and
            // report the lack of support to the caller.
            if cpu.vcpu.has_pmu_support() {
                cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
            } else {
                debug!(
                    "PMU attribute is not supported in vCPU{}, skip PMU init!",
                    cpu.id
                );
                return Ok(false);
            }
        }

        Ok(true)
    }

    pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
        self.vcpus.clone()
    }

    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_id: u8,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        #[cfg(feature = "kvm")]
        let hypervisor_type = self.hypervisor.hypervisor_type();
        #[cfg(feature = "guest_debug")]
        let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
        let panic_exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
        let vcpu_kick_signalled = self.vcpus_kick_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
            .vcpu_run_interrupted
            .clone();
        let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
        let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();

        // Prepare the CPU set the current vCPU is expected to run on.
        let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
            // SAFETY: all zeros is a valid pattern
            let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
            // SAFETY: FFI call, trivially safe
            unsafe { libc::CPU_ZERO(&mut cpuset) };
            for host_cpu in host_cpus {
                // SAFETY: FFI call, trivially safe
                unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
            }
            cpuset
        });

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter = get_seccomp_filter(
            &self.seccomp_action,
            Thread::Vcpu,
            self.hypervisor.hypervisor_type(),
        )
        .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        info!("Starting vCPU: cpu_id = {}", vcpu_id);

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{vcpu_id}"))
                .spawn(move || {
                    // Schedule the thread to run on the expected CPU set
                    if let Some(cpuset) = cpuset.as_ref() {
                        // SAFETY: FFI call with correct arguments
                        let ret = unsafe {
                            libc::sched_setaffinity(
                                0,
                                std::mem::size_of::<libc::cpu_set_t>(),
                                cpuset as *const libc::cpu_set_t,
                            )
                        };

                        if ret != 0 {
                            error!(
                                "Failed scheduling the vCPU {} on the expected CPU set: {}",
                                vcpu_id,
                                io::Error::last_os_error()
                            );
                            return;
                        }
                    }

                    // Apply seccomp filter for vcpu thread.
                    if !vcpu_seccomp_filter.is_empty() {
                        if let Err(e) =
                            apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                        {
                            error!("Error applying seccomp filter: {:?}", e);
                            return;
                        }
                    }
                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // Register an async-signal-safe (no-op) handler: SIGRTMIN is
                    // only sent to interrupt the vCPU threads out of KVM_RUN.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");
                    // Block until all CPUs are ready.
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unparking the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                // As a pause can be caused by PIO & MMIO exits, we need to ensure they are
                                // completed by returning to KVM_RUN. From the kernel docs:
                                //
                                // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                                // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                                // operations are complete (and guest state is consistent) only after userspace
                                // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
                                // incomplete operations and then check for pending signals.
                                // The pending state of the operation is not preserved in state which is
                                // visible to userspace, thus userspace should ensure that the operation is
                                // completed before performing a live migration.  Userspace can re-enter the
                                // guest with an unmasked signal pending or with the immediate_exit field set
                                // to complete pending operations without allowing any further instructions
                                // to be executed.

                                #[cfg(feature = "kvm")]
                                if matches!(hypervisor_type, HypervisorType::Kvm) {
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                    if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                        error!("Unexpected VM exit on \"immediate_exit\" run");
                                        break;
                                    }
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                                }

                                vcpu_run_interrupted.store(true, Ordering::SeqCst);

                                vcpu_paused.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            if vcpu_kick_signalled.load(Ordering::SeqCst) {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                #[cfg(target_arch = "x86_64")]
                                match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
                                    Ok(()) => {},
                                    Err(e) => {
                                        error!("Error injecting NMI: {}", e);
                                        break;
                                    }
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            #[cfg(feature = "tdx")]
                            let mut vcpu = vcpu.lock().unwrap();
                            #[cfg(not(feature = "tdx"))]
                            let vcpu = vcpu.lock().unwrap();
                            // vcpu.run() returns VmExit::Reset on a triple fault, which triggers a VM reset
                            match vcpu.run() {
                                Ok(run) => match run {
                                    #[cfg(feature = "kvm")]
                                    VmExit::Debug => {
                                        info!("VmExit::Debug");
                                        #[cfg(feature = "guest_debug")]
                                        {
                                            vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                            let raw_tid = get_raw_tid(vcpu_id as usize);
                                            vm_debug_evt.write(raw_tid as u64).unwrap();
                                        }
                                    }
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
                                        {
                                            interrupt_controller
                                                .lock()
                                                .unwrap()
                                                .end_of_interrupt(vector);
                                        }
                                    }
                                    VmExit::Ignore => {}
                                    VmExit::Hyperv => {}
                                    VmExit::Reset => {
                                        info!("VmExit::Reset");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        reset_evt.write(1).unwrap();
                                        break;
                                    }
                                    VmExit::Shutdown => {
                                        info!("VmExit::Shutdown");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                    #[cfg(feature = "tdx")]
                                    VmExit::Tdx => {
                                        if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
                                            match vcpu.get_tdx_exit_details() {
                                                Ok(details) => match details {
                                                    TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
                                                    TdxExitDetails::SetupEventNotifyInterrupt => {
                                                        warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
                                                    }
                                                },
                                                Err(e) => error!("Unexpected TDX VMCALL: {}", e),
                                            }
                                            vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
                                        } else {
                                            // We should never reach this code as
                                            // this means the design from the code
                                            // is wrong.
                                            unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
                                        }
                                    }
                                },

                                Err(e) => {
                                    error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                    vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                    exit_evt.write(1).unwrap();
                                    break;
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }
                        }
                    })
                    .or_else(|_| {
                        panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
                        error!("vCPU thread panicked");
                        panic_exit_evt.write(1)
                    })
                    .ok();
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // On hotplug, this function is called with no entry point (boot_setup
        // is None); it is for those hotplugged vCPU additions that the
        // inserting flag needs to be set.
        self.vcpu_states[usize::from(vcpu_id)].handle = handle;
        self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
    fn activate_vcpus(
        &mut self,
        desired_vcpus: u8,
        inserting: bool,
        paused: Option<bool>,
    ) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

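        // Size the barrier for every vCPU thread about to be started, plus the
        // current thread, which waits on it below to release them all at once.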
        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        if let Some(paused) = paused {
            self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
        }

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus(),
            self.vcpus_pause_signalled.load(Ordering::SeqCst)
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for vcpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
            self.vcpu_states[usize::from(cpu_id)]
                .pending_removal
                .store(true, Ordering::SeqCst);
        }
    }

    pub fn check_pending_removed_vcpu(&mut self) -> bool {
        for state in self.vcpu_states.iter() {
            if state.active() && state.pending_removal.load(Ordering::SeqCst) {
                return true;
            }
        }
        false
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);
        state.pending_removal.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(
        &mut self,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), snapshot)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
            .map_err(|e| {
                Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
            })?;

        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus == self.present_vcpus() {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        if self.check_pending_removed_vcpu() {
            return Err(Error::VcpuPendingRemovedVcpu);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                let vcpus = self.create_vcpus(desired_vcpus, None)?;
                for vcpu in vcpus {
                    self.configure_vcpu(vcpu, None)?
                }
                self.activate_vcpus(desired_vcpus, true, None)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the vCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..) {
            state.join_thread()?;
        }

        Ok(())
    }

    #[cfg(feature = "tdx")]
    pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
        for vcpu in &self.vcpus {
            vcpu.lock()
                .unwrap()
                .vcpu
                .tdx_init(hob_address)
                .map_err(Error::InitializeTdx)?;
        }
        Ok(())
    }

    pub fn boot_vcpus(&self) -> u8 {
        self.config.boot_vcpus
    }

    pub fn max_vcpus(&self) -> u8 {
        self.config.max_vcpus
    }

    #[cfg(target_arch = "x86_64")]
    pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
        assert!(!self.cpuid.is_empty());
        self.cpuid.clone()
    }

    fn present_vcpus(&self) -> u8 {
        self.vcpu_states
            .iter()
            .fold(0, |acc, state| acc + state.active() as u8)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidrs(&self) -> Vec<u64> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_mpidr())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_states(&self) -> Vec<CpuState> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
            .collect()
    }

    pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
        self.config
            .topology
            .clone()
            .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
    }

    #[cfg(not(target_arch = "riscv64"))]
    pub fn create_madt(&self) -> Sdt {
        use crate::acpi;
1429         // This is also checked in the command-line parsing.
1430         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1431 
1432         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1433         #[cfg(target_arch = "x86_64")]
1434         {
1435             madt.write(36, arch::layout::APIC_START.0);
1436 
1437             for cpu in 0..self.config.max_vcpus {
1438                 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());
1439 
1440                 let lapic = LocalX2Apic {
1441                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1442                     length: 16,
1443                     processor_id: cpu.into(),
1444                     apic_id: x2apic_id,
1445                     flags: if cpu < self.config.boot_vcpus {
1446                         1 << MADT_CPU_ENABLE_FLAG
1447                     } else {
1448                         0
1449                     } | (1 << MADT_CPU_ONLINE_CAPABLE_FLAG),
1450                     _reserved: 0,
1451                 };
1452                 madt.append(lapic);
1453             }
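            // For example, with boot_vcpus = 2 and max_vcpus = 4: CPUs 0-1 get
            // flags 0b11 (Enabled | Online Capable) and CPUs 2-3 get 0b10
            // (Online Capable only), so the guest can hotplug them later.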
1454 
1455             madt.append(Ioapic {
1456                 r#type: acpi::ACPI_APIC_IO,
1457                 length: 12,
1458                 ioapic_id: 0,
1459                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1460                 gsi_base: 0,
1461                 ..Default::default()
1462             });
1463 
1464             madt.append(InterruptSourceOverride {
1465                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1466                 length: 10,
1467                 bus: 0,
1468                 source: 4,
1469                 gsi: 4,
1470                 flags: 0,
1471             });
1472         }
1473 
1474         #[cfg(target_arch = "aarch64")]
1475         {
1476             /* Notes:
1477              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1478              */
1479 
1480             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1481             for cpu in 0..self.config.boot_vcpus {
1482                 let vcpu = &self.vcpus[cpu as usize];
1483                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1484                 /* ARMv8 MPIDR format:
1485                      Bits [63:40] Must be zero
1486                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1487                      Bits [31:24] Must be zero
1488                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1489                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1490                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1491                 */
1492                 let mpidr_mask = 0xff_00ff_ffff;
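                // e.g. a raw MPIDR of 0xF000_0080_FF00_0102 masks to
                // 0x0000_0080_0000_0102: the reserved ranges [63:40] and
                // [31:24] are cleared while Aff3..Aff0 are kept.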
1493                 let gicc = GicC {
1494                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1495                     length: 80,
1496                     reserved0: 0,
1497                     cpu_interface_number: cpu as u32,
1498                     uid: cpu as u32,
1499                     flags: 1,
1500                     parking_version: 0,
1501                     performance_interrupt: 0,
1502                     parked_address: 0,
1503                     base_address: 0,
1504                     gicv_base_address: 0,
1505                     gich_base_address: 0,
1506                     vgic_interrupt: 0,
1507                     gicr_base_address: 0,
1508                     mpidr: mpidr & mpidr_mask,
1509                     proc_power_effi_class: 0,
1510                     reserved1: 0,
1511                     spe_overflow_interrupt: 0,
1512                 };
1513 
1514                 madt.append(gicc);
1515             }
1516             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1517 
1518             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1519             let gicd = GicD {
1520                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1521                 length: 24,
1522                 reserved0: 0,
1523                 gic_id: 0,
1524                 base_address: vgic_config.dist_addr,
1525                 global_irq_base: 0,
1526                 version: 3,
1527                 reserved1: [0; 3],
1528             };
1529             madt.append(gicd);
1530 
1531             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1532             let gicr = GicR {
1533                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1534                 length: 16,
1535                 reserved: 0,
1536                 base_address: vgic_config.redists_addr,
1537                 range_length: vgic_config.redists_size as u32,
1538             };
1539             madt.append(gicr);
1540 
1541             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1542             let gicits = GicIts {
1543                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1544                 length: 20,
1545                 reserved0: 0,
1546                 translation_id: 0,
1547                 base_address: vgic_config.msi_addr,
1548                 reserved1: 0,
1549             };
1550             madt.append(gicits);
1551 
1552             madt.update_checksum();
1553         }
1554 
1555         madt
1556     }
1557 
1558     #[cfg(target_arch = "aarch64")]
1559     pub fn create_pptt(&self) -> Sdt {
1560         let pptt_start = 0;
1561         let mut cpus = 0;
1562         let mut uid = 0;
1563         // If topology is not specified, the default setting is:
1564         // 1 package, multiple cores, 1 thread per core
1565         // This is also the behavior when PPTT is missing.
1566         let (threads_per_core, cores_per_package, packages) =
1567             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1568 
1569         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1570 
1571         for cluster_idx in 0..packages {
1572             if cpus < self.config.boot_vcpus as usize {
1573                 let cluster_offset = pptt.len() - pptt_start;
1574                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1575                     r#type: 0,
1576                     length: 20,
1577                     reserved: 0,
1578                     flags: 0x2,
1579                     parent: 0,
1580                     acpi_processor_id: cluster_idx as u32,
1581                     num_private_resources: 0,
1582                 };
1583                 pptt.append(cluster_hierarchy_node);
1584 
1585                 for core_idx in 0..cores_per_package {
1586                     let core_offset = pptt.len() - pptt_start;
1587 
1588                     if threads_per_core > 1 {
1589                         let core_hierarchy_node = ProcessorHierarchyNode {
1590                             r#type: 0,
1591                             length: 20,
1592                             reserved: 0,
1593                             flags: 0x2,
1594                             parent: cluster_offset as u32,
1595                             acpi_processor_id: core_idx as u32,
1596                             num_private_resources: 0,
1597                         };
1598                         pptt.append(core_hierarchy_node);
1599 
1600                         for _thread_idx in 0..threads_per_core {
1601                             let thread_hierarchy_node = ProcessorHierarchyNode {
1602                                 r#type: 0,
1603                                 length: 20,
1604                                 reserved: 0,
1605                                 flags: 0xE,
1606                                 parent: core_offset as u32,
1607                                 acpi_processor_id: uid as u32,
1608                                 num_private_resources: 0,
1609                             };
1610                             pptt.append(thread_hierarchy_node);
1611                             uid += 1;
1612                         }
1613                     } else {
1614                         let thread_hierarchy_node = ProcessorHierarchyNode {
1615                             r#type: 0,
1616                             length: 20,
1617                             reserved: 0,
1618                             flags: 0xA,
1619                             parent: cluster_offset as u32,
1620                             acpi_processor_id: uid as u32,
1621                             num_private_resources: 0,
1622                         };
1623                         pptt.append(thread_hierarchy_node);
1624                         uid += 1;
1625                     }
1626                 }
1627                 cpus += (cores_per_package * threads_per_core) as usize;
1628             }
1629         }
1630 
1631         pptt.update_checksum();
1632         pptt
1633     }
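    // For example, a topology of (threads_per_core = 2, cores_per_package = 2,
    // packages = 1) yields one cluster node, two core nodes, and four leaf
    // thread nodes with uid 0..=3, which a Linux guest uses to derive its CPU
    // topology from the PPTT.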
1634 
1635     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1636     fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters {
1637         self.vcpus[usize::from(cpu_id)]
1638             .lock()
1639             .unwrap()
1640             .vcpu
1641             .create_standard_regs()
1642     }
1643 
1644     #[cfg(feature = "guest_debug")]
1645     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1646         self.vcpus[usize::from(cpu_id)]
1647             .lock()
1648             .unwrap()
1649             .vcpu
1650             .get_regs()
1651             .map_err(Error::CpuDebug)
1652     }
1653 
1654     #[cfg(feature = "guest_debug")]
1655     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1656         self.vcpus[usize::from(cpu_id)]
1657             .lock()
1658             .unwrap()
1659             .vcpu
1660             .set_regs(regs)
1661             .map_err(Error::CpuDebug)
1662     }
1663 
1664     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1665     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1666         self.vcpus[usize::from(cpu_id)]
1667             .lock()
1668             .unwrap()
1669             .vcpu
1670             .get_sregs()
1671             .map_err(Error::CpuDebug)
1672     }
1673 
1674     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1675     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1676         self.vcpus[usize::from(cpu_id)]
1677             .lock()
1678             .unwrap()
1679             .vcpu
1680             .set_sregs(sregs)
1681             .map_err(Error::CpuDebug)
1682     }
1683 
1684     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1685     fn translate_gva(
1686         &self,
1687         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1688         cpu_id: u8,
1689         gva: u64,
1690     ) -> Result<u64> {
1691         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1692             .lock()
1693             .unwrap()
1694             .vcpu
1695             .translate_gva(gva, /* flags: unused */ 0)
1696             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1697         Ok(gpa)
1698     }
1699 
1700     ///
1701     /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
1702     /// it in the VMM by walking through the translation tables.
1703     ///
1704     /// Address translation is a big topic; here we only focus on the scenario
1705     /// that arises in the VMM while debugging the kernel. This `translate_gva`
1706     /// implementation is restricted to:
1707     /// - Exception Level 1
1708     /// - Translating the high address range only (kernel space)
1709     ///
1710     /// This implementation supports the following Armv8-A features related to
1711     /// address translation:
1712     /// - FEAT_LPA
1713     /// - FEAT_LVA
1714     /// - FEAT_LPA2
1715     ///
1716     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1717     fn translate_gva(
1718         &self,
1719         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1720         cpu_id: u8,
1721         gva: u64,
1722     ) -> Result<u64> {
1723         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1724             .lock()
1725             .unwrap()
1726             .vcpu
1727             .get_sys_reg(TCR_EL1)
1728             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1729         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1730             .lock()
1731             .unwrap()
1732             .vcpu
1733             .get_sys_reg(TTBR1_EL1)
1734             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1735         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1736             .lock()
1737             .unwrap()
1738             .vcpu
1739             .get_sys_reg(ID_AA64MMFR0_EL1)
1740             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1741 
1742         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1743         // or low (0x000xxx...).
1744         let high_range = extract_bits_64!(gva, 55, 1);
1745         if high_range == 0 {
1746             info!("VA (0x{:x}) range is not supported!", gva);
1747             return Ok(gva);
1748         }
1749 
1750         // High range size offset
1751         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1752         // Granule size
1753         let tg = extract_bits_64!(tcr_el1, 30, 2);
1754         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1755         let ds = extract_bits_64!(tcr_el1, 59, 1);
1756 
1757         if tsz == 0 {
1758             info!("VA translation is not ready!");
1759             return Ok(gva);
1760         }
1761 
1762         // VA size is determined by TCR_EL1.T1SZ
1763         let va_size = 64 - tsz;
1764         // Number of bits in VA consumed in each level of translation
1765         let stride = match tg {
1766             3 => 13, // 64KB granule size
1767             1 => 11, // 16KB granule size
1768             _ => 9,  // 4KB, default
1769         };
1770         // Starting level of walking
1771         let mut level = 4 - (va_size - 4) / stride;
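        // e.g. with a 4KB granule (stride = 9) and tsz = 16 (48-bit VA):
        // level = 4 - (48 - 4) / 9 = 4 - 4 = 0, so the walk starts at level 0.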
1772 
1773         // Determine the PA/IPA size
1774         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1775         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1776         // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should match.
1777         // To be safe, we use the minimum value if they are different.
1778         let pa_range = std::cmp::min(tcr_ips, pa_range);
1779         // PA size in bits
1780         let pa_size = match pa_range {
1781             0 => 32,
1782             1 => 36,
1783             2 => 40,
1784             3 => 42,
1785             4 => 44,
1786             5 => 48,
1787             6 => 52,
1788             _ => {
1789                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1790                     "PA range not supported {pa_range}"
1791                 ))))
1792             }
1793         };
1794 
1795         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1796         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1797         // If FEAT_LPA2 is present, the translation table descriptor holds
1798         // 50 bits of the table address of next level.
1799         // Otherwise, it is 48 bits.
1800         let descaddrmask = if ds == 1 {
1801             !0u64 >> (64 - 50) // mask with 50 least significant bits
1802         } else {
1803             !0u64 >> (64 - 48) // mask with 48 least significant bits
1804         };
1805         let descaddrmask = descaddrmask & !indexmask_grainsize;
1806 
1807         // Translation table base address
1808         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1809         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1810         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1811         if pa_size == 52 {
1812             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1813         }
1814 
1815         // Loop through tables of each level
1816         loop {
1817             // Table offset for current level
1818             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1819             descaddr |= table_offset;
1820             descaddr &= !7u64;
1821 
1822             let mut buf = [0; 8];
1823             guest_memory
1824                 .memory()
1825                 .read(&mut buf, GuestAddress(descaddr))
1826                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1827             let descriptor = u64::from_le_bytes(buf);
1828 
1829             descaddr = descriptor & descaddrmask;
1830             // In the case of FEAT_LPA, the next-level translation table address
1831             // bits [48:51] come from bits [12:15] of the current descriptor.
1832             // For FEAT_LPA2, the next-level translation table address
1833             // bits [50:51] come from bits [8:9] of the current descriptor, and
1834             // bits [48:49] come from bits [48:49] of the descriptor that was
1835             // handled previously.
1836             if pa_size == 52 {
1837                 if ds == 1 {
1838                     // FEAT_LPA2
1839                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1840                 } else {
1841                     // FEAT_LPA
1842                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1843                 }
1844             }
1845 
1846             if (descriptor & 2) != 0 && (level < 3) {
1847                 // This is a table entry. Go down to next level.
1848                 level += 1;
1849                 indexmask = indexmask_grainsize;
1850                 continue;
1851             }
1852 
1853             break;
1854         }
1855 
1856         // We have reached either:
1857         // - a page entry at level 3 or
1858         // - a block entry at level 1 or 2
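        // e.g. with a 4KB granule (stride = 9): a level-3 page entry maps
        // 1 << (9 + 3) = 4KB, while a level-2 block entry maps
        // 1 << (18 + 3) = 2MB.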
1859         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1860         descaddr &= !(page_size - 1);
1861         descaddr |= gva & (page_size - 1);
1862 
1863         Ok(descaddr)
1864     }
1865 
1866     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1867         self.acpi_address = Some(acpi_address);
1868     }
1869 
1870     pub(crate) fn set_interrupt_controller(
1871         &mut self,
1872         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1873     ) {
1874         self.interrupt_controller = Some(interrupt_controller);
1875     }
1876 
1877     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1878         &self.vcpus_kill_signalled
1879     }
1880 
1881     #[cfg(feature = "igvm")]
1882     pub(crate) fn get_cpuid_leaf(
1883         &self,
1884         cpu_id: u8,
1885         eax: u32,
1886         ecx: u32,
1887         xfem: u64,
1888         xss: u64,
1889     ) -> Result<[u32; 4]> {
1890         let leaf_info = self.vcpus[usize::from(cpu_id)]
1891             .lock()
1892             .unwrap()
1893             .vcpu
1894             .get_cpuid_values(eax, ecx, xfem, xss)
1895             .unwrap();
1896         Ok(leaf_info)
1897     }
1898 
1899     #[cfg(feature = "sev_snp")]
1900     pub(crate) fn sev_snp_enabled(&self) -> bool {
1901         self.sev_snp_enabled
1902     }
1903 
1904     pub(crate) fn nmi(&self) -> Result<()> {
1905         self.vcpus_kick_signalled.store(true, Ordering::SeqCst);
1906 
1907         for state in self.vcpu_states.iter() {
1908             state.signal_thread();
1909         }
1910 
1911         self.vcpus_kick_signalled.store(false, Ordering::SeqCst);
1912 
1913         Ok(())
1914     }
1915 }
1916 
1917 struct Cpu {
1918     cpu_id: u8,
1919     proximity_domain: u32,
1920     dynamic: bool,
1921     #[cfg(target_arch = "x86_64")]
1922     topology: Option<(u8, u8, u8)>,
1923 }
1924 
1925 #[cfg(target_arch = "x86_64")]
1926 const MADT_CPU_ENABLE_FLAG: usize = 0;
1927 
1928 #[cfg(target_arch = "x86_64")]
1929 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1930 
1931 impl Cpu {
1932     #[cfg(target_arch = "x86_64")]
1933     fn generate_mat(&self) -> Vec<u8> {
1934         let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);
1935 
1936         let lapic = LocalX2Apic {
1937             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1938             length: 16,
1939             processor_id: self.cpu_id.into(),
1940             apic_id: x2apic_id,
1941             flags: 1 << MADT_CPU_ENABLE_FLAG,
1942             _reserved: 0,
1943         };
1944 
1945         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1946         // SAFETY: mat_data is large enough to hold lapic; write_unaligned imposes no alignment requirement on the u8 buffer
1947         unsafe { (mat_data.as_mut_ptr() as *mut LocalX2Apic).write_unaligned(lapic) };
1948 
1949         mat_data
1950     }
1951 }
1952 
1953 impl Aml for Cpu {
1954     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1955         #[cfg(target_arch = "x86_64")]
1956         let mat_data: Vec<u8> = self.generate_mat();
1957         #[allow(clippy::if_same_then_else)]
1958         if self.dynamic {
1959             aml::Device::new(
1960                 format!("C{:03X}", self.cpu_id).as_str().into(),
1961                 vec![
1962                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1963                     &aml::Name::new("_UID".into(), &self.cpu_id),
1964                     // Currently, AArch64 cannot support the following fields.
1965                     /*
1966                     _STA return value:
1967                     Bit [0] – Set if the device is present.
1968                     Bit [1] – Set if the device is enabled and decoding its resources.
1969                     Bit [2] – Set if the device should be shown in the UI.
1970                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1971                     Bit [4] – Set if the battery is present.
1972                     Bits [31:5] – Reserved (must be cleared).
1973                     */
1974                     #[cfg(target_arch = "x86_64")]
1975                     &aml::Method::new(
1976                         "_STA".into(),
1977                         0,
1978                         false,
1979                         // Call into the CSTA method, which will interrogate the device
1980                         vec![&aml::Return::new(&aml::MethodCall::new(
1981                             "CSTA".into(),
1982                             vec![&self.cpu_id],
1983                         ))],
1984                     ),
1985                     &aml::Method::new(
1986                         "_PXM".into(),
1987                         0,
1988                         false,
1989                         vec![&aml::Return::new(&self.proximity_domain)],
1990                     ),
1991                     // The Linux kernel expects every CPU device to have a _MAT entry
1992                     // containing the LAPIC for this processor with the enabled bit set
1993                     // even if it is disabled in the MADT (non-boot CPU)
1994                     #[cfg(target_arch = "x86_64")]
1995                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1996                     // Trigger CPU ejection
1997                     #[cfg(target_arch = "x86_64")]
1998                     &aml::Method::new(
1999                         "_EJ0".into(),
2000                         1,
2001                         false,
2002                         // Call into the CEJ0 method, which will actually eject the device
2003                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
2004                     ),
2005                 ],
2006             )
2007             .to_aml_bytes(sink);
2008         } else {
2009             aml::Device::new(
2010                 format!("C{:03X}", self.cpu_id).as_str().into(),
2011                 vec![
2012                     &aml::Name::new("_HID".into(), &"ACPI0007"),
2013                     &aml::Name::new("_UID".into(), &self.cpu_id),
2014                     #[cfg(target_arch = "x86_64")]
2015                     &aml::Method::new(
2016                         "_STA".into(),
2017                         0,
2018                         false,
2019                         // Mark the CPU present; see the CSTA implementation
2020                         vec![&aml::Return::new(&0xfu8)],
2021                     ),
2022                     &aml::Method::new(
2023                         "_PXM".into(),
2024                         0,
2025                         false,
2026                         vec![&aml::Return::new(&self.proximity_domain)],
2027                     ),
2028                     // The Linux kernel expects every CPU device to have a _MAT entry
2029                     // containing the LAPIC for this processor with the enabled bit set
2030                     // even if it is disabled in the MADT (non-boot CPU)
2031                     #[cfg(target_arch = "x86_64")]
2032                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
2033                 ],
2034             )
2035             .to_aml_bytes(sink);
2036         }
2037     }
2038 }
2039 
2040 struct CpuNotify {
2041     cpu_id: u8,
2042 }
2043 
2044 impl Aml for CpuNotify {
2045     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2046         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
2047         aml::If::new(
2048             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
2049             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2050         )
2051         .to_aml_bytes(sink)
2052     }
2053 }
2054 
2055 struct CpuMethods {
2056     max_vcpus: u8,
2057     dynamic: bool,
2058 }
2059 
2060 impl Aml for CpuMethods {
2061     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2062         if self.dynamic {
2063             // CPU status method
2064             aml::Method::new(
2065                 "CSTA".into(),
2066                 1,
2067                 true,
2068                 vec![
2069                     // Take lock defined above
2070                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2071                     // Write CPU number (in first argument) to I/O port via field
2072                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2073                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2074                     // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2075                     &aml::If::new(
2076                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
2077                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2078                     ),
2079                     // Release lock
2080                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2081                     // Return 0 or 0xf
2082                     &aml::Return::new(&aml::Local(0)),
2083                 ],
2084             )
2085             .to_aml_bytes(sink);
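            // Roughly the ASL the CSTA method above corresponds to (a sketch
            // for readability; the builder emits AML bytecode directly):
            //
            //     Method (CSTA, 1, Serialized)
            //     {
            //         Acquire (\_SB.PRES.CPLK, 0xFFFF)
            //         \_SB.PRES.CSEL = Arg0
            //         Local0 = Zero
            //         If (\_SB.PRES.CPEN == One)
            //         {
            //             Local0 = 0x0F
            //         }
            //         Release (\_SB.PRES.CPLK)
            //         Return (Local0)
            //     }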
2086 
2087             let mut cpu_notifies = Vec::new();
2088             for cpu_id in 0..self.max_vcpus {
2089                 cpu_notifies.push(CpuNotify { cpu_id });
2090             }
2091 
2092             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
2093             for cpu_id in 0..self.max_vcpus {
2094                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
2095             }
2096 
2097             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
2098 
2099             aml::Method::new(
2100                 "CEJ0".into(),
2101                 1,
2102                 true,
2103                 vec![
2104                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2105                     // Write CPU number (in first argument) to I/O port via field
2106                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2107                     // Set CEJ0 bit
2108                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
2109                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2110                 ],
2111             )
2112             .to_aml_bytes(sink);
2113 
2114             aml::Method::new(
2115                 "CSCN".into(),
2116                 0,
2117                 true,
2118                 vec![
2119                     // Take lock defined above
2120                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2121                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2122                     &aml::While::new(
2123                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
2124                         vec![
2125                             // Write CPU number (from Local0) to I/O port via field
2126                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
2127                             // Check if CINS bit is set
2128                             &aml::If::new(
2129                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
2130                                 // Notify device if it is
2131                                 vec![
2132                                     &aml::MethodCall::new(
2133                                         "CTFY".into(),
2134                                         vec![&aml::Local(0), &aml::ONE],
2135                                     ),
2136                                     // Reset CINS bit
2137                                     &aml::Store::new(
2138                                         &aml::Path::new("\\_SB_.PRES.CINS"),
2139                                         &aml::ONE,
2140                                     ),
2141                                 ],
2142                             ),
2143                             // Check if CRMV bit is set
2144                             &aml::If::new(
2145                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
2146                                 // Notify device if it is (with the eject constant 0x3)
2147                                 vec![
2148                                     &aml::MethodCall::new(
2149                                         "CTFY".into(),
2150                                         vec![&aml::Local(0), &3u8],
2151                                     ),
2152                                     // Reset CRMV bit
2153                                     &aml::Store::new(
2154                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
2155                                         &aml::ONE,
2156                                     ),
2157                                 ],
2158                             ),
2159                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2160                         ],
2161                     ),
2162                     // Release lock
2163                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2164                 ],
2165             )
2166             .to_aml_bytes(sink)
2167         } else {
2168             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2169         }
2170     }
2171 }
2172 
2173 impl Aml for CpuManager {
2174     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2175         #[cfg(target_arch = "x86_64")]
2176         if let Some(acpi_address) = self.acpi_address {
2177             // CPU hotplug controller
2178             aml::Device::new(
2179                 "_SB_.PRES".into(),
2180                 vec![
2181                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2182                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2183                     // Mutex to protect concurrent access as we write to choose CPU and then read back status
2184                     &aml::Mutex::new("CPLK".into(), 0),
2185                     &aml::Name::new(
2186                         "_CRS".into(),
2187                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2188                             aml::AddressSpaceCacheable::NotCacheable,
2189                             true,
2190                             acpi_address.0,
2191                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2192                             None,
2193                         )]),
2194                     ),
2195                     // OpRegion and Fields map MMIO range into individual field values
2196                     &aml::OpRegion::new(
2197                         "PRST".into(),
2198                         aml::OpRegionSpace::SystemMemory,
2199                         &(acpi_address.0 as usize),
2200                         &CPU_MANAGER_ACPI_SIZE,
2201                     ),
2202                     &aml::Field::new(
2203                         "PRST".into(),
2204                         aml::FieldAccessType::Byte,
2205                         aml::FieldLockRule::NoLock,
2206                         aml::FieldUpdateRule::WriteAsZeroes,
2207                         vec![
2208                             aml::FieldEntry::Reserved(32),
2209                             aml::FieldEntry::Named(*b"CPEN", 1),
2210                             aml::FieldEntry::Named(*b"CINS", 1),
2211                             aml::FieldEntry::Named(*b"CRMV", 1),
2212                             aml::FieldEntry::Named(*b"CEJ0", 1),
2213                             aml::FieldEntry::Reserved(4),
2214                             aml::FieldEntry::Named(*b"CCMD", 8),
2215                         ],
2216                     ),
2217                     &aml::Field::new(
2218                         "PRST".into(),
2219                         aml::FieldAccessType::DWord,
2220                         aml::FieldLockRule::NoLock,
2221                         aml::FieldUpdateRule::Preserve,
2222                         vec![
2223                             aml::FieldEntry::Named(*b"CSEL", 32),
2224                             aml::FieldEntry::Reserved(32),
2225                             aml::FieldEntry::Named(*b"CDAT", 32),
2226                         ],
2227                     ),
2228                 ],
2229             )
2230             .to_aml_bytes(sink);
2231         }
2232 
2233         // CPU devices
2234         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2235         let cid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2236         // Bundle methods together under a common object
2237         let methods = CpuMethods {
2238             max_vcpus: self.config.max_vcpus,
2239             dynamic: self.dynamic,
2240         };
2241         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &cid, &methods];
2242 
2243         #[cfg(target_arch = "x86_64")]
2244         let topology = self.get_vcpu_topology();
2245         let mut cpu_devices = Vec::new();
2246         for cpu_id in 0..self.config.max_vcpus {
2247             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2248             let cpu_device = Cpu {
2249                 cpu_id,
2250                 proximity_domain,
2251                 dynamic: self.dynamic,
2252                 #[cfg(target_arch = "x86_64")]
2253                 topology,
2254             };
2255 
2256             cpu_devices.push(cpu_device);
2257         }
2258 
2259         for cpu_device in cpu_devices.iter() {
2260             cpu_data_inner.push(cpu_device);
2261         }
2262 
2263         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2264     }
2265 }
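// The resulting ACPI namespace, roughly: a \_SB.PRES device exposing the
// hotplug controller MMIO region through the CPEN/CINS/CRMV/CEJ0/CSEL/CDAT
// fields, and a \_SB.CPUS container holding the CSTA/CTFY/CEJ0/CSCN methods
// plus one Cxxx Device per possible vCPU.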
2266 
2267 impl Pausable for CpuManager {
2268     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2269         // Tell the vCPUs to pause themselves next time they exit
2270         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2271 
2272         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2273         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2274         // above.
2275         for state in self.vcpu_states.iter() {
2276             state.signal_thread();
2277         }
2278 
2279         for vcpu in self.vcpus.iter() {
2280             let mut vcpu = vcpu.lock().unwrap();
2281             vcpu.pause()?;
2282             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2283             if !self.config.kvm_hyperv {
2284                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2285                     MigratableError::Pause(anyhow!(
2286                         "Could not notify guest it has been paused {:?}",
2287                         e
2288                     ))
2289                 })?;
2290             }
2291         }
2292 
2293         // The vCPU thread will change its paused state before parking; wait here for
2294         // each activated vCPU to change its state, to ensure they have all parked.
2295         for state in self.vcpu_states.iter() {
2296             if state.active() {
2297                 while !state.paused.load(Ordering::SeqCst) {
2298                     // To avoid a priority inversion with the vCPU thread
2299                     thread::sleep(std::time::Duration::from_millis(1));
2300                 }
2301             }
2302         }
2303 
2304         Ok(())
2305     }
2306 
2307     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2308         for vcpu in self.vcpus.iter() {
2309             vcpu.lock().unwrap().resume()?;
2310         }
2311 
2312         // Toggle the vCPUs pause boolean
2313         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2314 
2315         // Unpark all the vCPU threads.
2316         // Once unparked, the next thing they will do is check the pause
2317         // boolean. Since it'll be set to false, they will exit their pause loop
2318         // and resume running the guest.
2319         for state in self.vcpu_states.iter() {
2320             state.paused.store(false, Ordering::SeqCst);
2321             state.unpark_thread();
2322         }
2323         Ok(())
2324     }
2325 }
2326 
2327 impl Snapshottable for CpuManager {
2328     fn id(&self) -> String {
2329         CPU_MANAGER_SNAPSHOT_ID.to_string()
2330     }
2331 
2332     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2333         let mut cpu_manager_snapshot = Snapshot::default();
2334 
2335         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2336         for vcpu in &self.vcpus {
2337             let mut vcpu = vcpu.lock().unwrap();
2338             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2339         }
2340 
2341         Ok(cpu_manager_snapshot)
2342     }
2343 }
2344 
2345 impl Transportable for CpuManager {}
2346 impl Migratable for CpuManager {}
2347 
2348 #[cfg(feature = "guest_debug")]
2349 impl Debuggable for CpuManager {
2350     #[cfg(feature = "kvm")]
2351     fn set_guest_debug(
2352         &self,
2353         cpu_id: usize,
2354         addrs: &[GuestAddress],
2355         singlestep: bool,
2356     ) -> std::result::Result<(), DebuggableError> {
2357         self.vcpus[cpu_id]
2358             .lock()
2359             .unwrap()
2360             .vcpu
2361             .set_guest_debug(addrs, singlestep)
2362             .map_err(DebuggableError::SetDebug)
2363     }
2364 
2365     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2366         Ok(())
2367     }
2368 
2369     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2370         Ok(())
2371     }
2372 
2373     #[cfg(target_arch = "x86_64")]
2374     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2375         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, R8-R15
2376         let gregs = self
2377             .get_regs(cpu_id as u8)
2378             .map_err(DebuggableError::ReadRegs)?;
2379         let regs = [
2380             gregs.get_rax(),
2381             gregs.get_rbx(),
2382             gregs.get_rcx(),
2383             gregs.get_rdx(),
2384             gregs.get_rsi(),
2385             gregs.get_rdi(),
2386             gregs.get_rbp(),
2387             gregs.get_rsp(),
2388             gregs.get_r8(),
2389             gregs.get_r9(),
2390             gregs.get_r10(),
2391             gregs.get_r11(),
2392             gregs.get_r12(),
2393             gregs.get_r13(),
2394             gregs.get_r14(),
2395             gregs.get_r15(),
2396         ];
2397 
2398         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2399         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2400         let eflags = gregs.get_rflags() as u32;
2401         let rip = gregs.get_rip();
2402 
2403         // Segment registers: CS, SS, DS, ES, FS, GS
2404         let sregs = self
2405             .get_sregs(cpu_id as u8)
2406             .map_err(DebuggableError::ReadRegs)?;
2407         let segments = X86SegmentRegs {
2408             cs: sregs.cs.selector as u32,
2409             ss: sregs.ss.selector as u32,
2410             ds: sregs.ds.selector as u32,
2411             es: sregs.es.selector as u32,
2412             fs: sregs.fs.selector as u32,
2413             gs: sregs.gs.selector as u32,
2414         };
2415 
2416         // TODO: Add other registers
2417 
2418         Ok(CoreRegs {
2419             regs,
2420             eflags,
2421             rip,
2422             segments,
2423             ..Default::default()
2424         })
2425     }
2426 
2427     #[cfg(target_arch = "aarch64")]
2428     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2429         let gregs = self
2430             .get_regs(cpu_id as u8)
2431             .map_err(DebuggableError::ReadRegs)?;
2432         Ok(CoreRegs {
2433             x: gregs.get_regs(),
2434             sp: gregs.get_sp(),
2435             pc: gregs.get_pc(),
2436             ..Default::default()
2437         })
2438     }
2439 
2440     #[cfg(target_arch = "x86_64")]
2441     fn write_regs(
2442         &self,
2443         cpu_id: usize,
2444         regs: &CoreRegs,
2445     ) -> std::result::Result<(), DebuggableError> {
2446         let orig_gregs = self
2447             .get_regs(cpu_id as u8)
2448             .map_err(DebuggableError::ReadRegs)?;
2449         let mut gregs = self.create_standard_regs(cpu_id as u8);
2450         gregs.set_rax(regs.regs[0]);
2451         gregs.set_rbx(regs.regs[1]);
2452         gregs.set_rcx(regs.regs[2]);
2453         gregs.set_rdx(regs.regs[3]);
2454         gregs.set_rsi(regs.regs[4]);
2455         gregs.set_rdi(regs.regs[5]);
2456         gregs.set_rbp(regs.regs[6]);
2457         gregs.set_rsp(regs.regs[7]);
2458         gregs.set_r8(regs.regs[8]);
2459         gregs.set_r9(regs.regs[9]);
2460         gregs.set_r10(regs.regs[10]);
2461         gregs.set_r11(regs.regs[11]);
2462         gregs.set_r12(regs.regs[12]);
2463         gregs.set_r13(regs.regs[13]);
2464         gregs.set_r14(regs.regs[14]);
2465         gregs.set_r15(regs.regs[15]);
2466         gregs.set_rip(regs.rip);
2467         // Update the lower 32 bits of rflags.
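        // e.g. an original rflags of 0x0000_0001_0000_0202 combined with
        // eflags 0x0000_0046 yields 0x0000_0001_0000_0046.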
2468         gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));
2469 
2470         self.set_regs(cpu_id as u8, &gregs)
2471             .map_err(DebuggableError::WriteRegs)?;
2472 
2473         // Segment registers: CS, SS, DS, ES, FS, GS
2474         // Since GDB cares only about the selectors, we call get_sregs() first.
2475         let mut sregs = self
2476             .get_sregs(cpu_id as u8)
2477             .map_err(DebuggableError::ReadRegs)?;
2478         sregs.cs.selector = regs.segments.cs as u16;
2479         sregs.ss.selector = regs.segments.ss as u16;
2480         sregs.ds.selector = regs.segments.ds as u16;
2481         sregs.es.selector = regs.segments.es as u16;
2482         sregs.fs.selector = regs.segments.fs as u16;
2483         sregs.gs.selector = regs.segments.gs as u16;
2484 
2485         self.set_sregs(cpu_id as u8, &sregs)
2486             .map_err(DebuggableError::WriteRegs)?;
2487 
2488         // TODO: Add other registers
2489 
2490         Ok(())
2491     }
2492 
2493     #[cfg(target_arch = "aarch64")]
2494     fn write_regs(
2495         &self,
2496         cpu_id: usize,
2497         regs: &CoreRegs,
2498     ) -> std::result::Result<(), DebuggableError> {
2499         let mut gregs = self
2500             .get_regs(cpu_id as u8)
2501             .map_err(DebuggableError::ReadRegs)?;
2502 
2503         gregs.set_regs(regs.x);
2504         gregs.set_sp(regs.sp);
2505         gregs.set_pc(regs.pc);
2506 
2507         self.set_regs(cpu_id as u8, &gregs)
2508             .map_err(DebuggableError::WriteRegs)?;
2509 
2510         Ok(())
2511     }
2512 
2513     fn read_mem(
2514         &self,
2515         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2516         cpu_id: usize,
2517         vaddr: GuestAddress,
2518         len: usize,
2519     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2520         let mut buf = vec![0; len];
2521         let mut total_read = 0_u64;
2522 
2523         while total_read < len as u64 {
2524             let gaddr = vaddr.0 + total_read;
2525             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2526                 Ok(paddr) => paddr,
2527                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2528                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2529             };
2530             let psize = arch::PAGE_SIZE as u64;
2531             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
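            // e.g. reading 0x300 bytes starting at paddr 0x1F00 with 4KB pages:
            // the first iteration reads 0x100 bytes (up to the page boundary),
            // and the remaining 0x200 bytes come from the next page after
            // re-translating the GVA.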
2532             guest_memory
2533                 .memory()
2534                 .read(
2535                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2536                     GuestAddress(paddr),
2537                 )
2538                 .map_err(DebuggableError::ReadMem)?;
2539             total_read += read_len;
2540         }
2541         Ok(buf)
2542     }
2543 
2544     fn write_mem(
2545         &self,
2546         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2547         cpu_id: usize,
2548         vaddr: &GuestAddress,
2549         data: &[u8],
2550     ) -> std::result::Result<(), DebuggableError> {
2551         let mut total_written = 0_u64;
2552 
2553         while total_written < data.len() as u64 {
2554             let gaddr = vaddr.0 + total_written;
2555             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2556                 Ok(paddr) => paddr,
2557                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2558                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2559             };
2560             let psize = arch::PAGE_SIZE as u64;
2561             let write_len = std::cmp::min(
2562                 data.len() as u64 - total_written,
2563                 psize - (paddr & (psize - 1)),
2564             );
2565             guest_memory
2566                 .memory()
2567                 .write(
2568                     &data[total_written as usize..total_written as usize + write_len as usize],
2569                     GuestAddress(paddr),
2570                 )
2571                 .map_err(DebuggableError::WriteMem)?;
2572             total_written += write_len;
2573         }
2574         Ok(())
2575     }
2576 
2577     fn active_vcpus(&self) -> usize {
2578         self.present_vcpus() as usize
2579     }
2580 }
2581 
2582 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2583 impl Elf64Writable for CpuManager {}
2584 
2585 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2586 impl CpuElf64Writable for CpuManager {
2587     fn cpu_write_elf64_note(
2588         &mut self,
2589         dump_state: &DumpState,
2590     ) -> std::result::Result<(), GuestDebuggableError> {
2591         let mut coredump_file = dump_state.file.as_ref().unwrap();
2592         for vcpu in &self.vcpus {
2593             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2594             let mut pos: usize = 0;
2595             let mut buf = vec![0; note_size as usize];
2596             let descsz = size_of::<X86_64ElfPrStatus>();
2597             let vcpu_id = vcpu.lock().unwrap().id;
2598 
2599             let note = Elf64_Nhdr {
2600                 n_namesz: COREDUMP_NAME_SIZE,
2601                 n_descsz: descsz as u32,
2602                 n_type: NT_PRSTATUS,
2603             };
2604 
2605             let bytes: &[u8] = note.as_slice();
2606             buf.splice(0.., bytes.to_vec());
2607             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2608             buf.resize(pos + 4, 0);
2609             buf.splice(pos.., "CORE".to_string().into_bytes());
2610 
2611             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2612             buf.resize(pos + 32 + 4, 0);
2613             let pid = vcpu_id as u64;
2614             let bytes: &[u8] = pid.as_slice();
2615             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2616 
2617             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2618 
2619             let orig_rax: u64 = 0;
2620             let gregs = self.vcpus[usize::from(vcpu_id)]
2621                 .lock()
2622                 .unwrap()
2623                 .vcpu
2624                 .get_regs()
2625                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2626 
2627             let regs1 = [
2628                 gregs.get_r15(),
2629                 gregs.get_r14(),
2630                 gregs.get_r13(),
2631                 gregs.get_r12(),
2632                 gregs.get_rbp(),
2633                 gregs.get_rbx(),
2634                 gregs.get_r11(),
2635                 gregs.get_r10(),
2636             ];
2637             let regs2 = [
2638                 gregs.get_r9(),
2639                 gregs.get_r8(),
2640                 gregs.get_rax(),
2641                 gregs.get_rcx(),
2642                 gregs.get_rdx(),
2643                 gregs.get_rsi(),
2644                 gregs.get_rdi(),
2645                 orig_rax,
2646             ];
2647 
2648             let sregs = self.vcpus[usize::from(vcpu_id)]
2649                 .lock()
2650                 .unwrap()
2651                 .vcpu
2652                 .get_sregs()
2653                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2654 
2655             debug!(
2656                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2657                 gregs.get_rip(),
2658                 gregs.get_rsp(),
2659                 sregs.gs.base,
2660                 sregs.cs.selector,
2661                 sregs.ss.selector,
2662                 sregs.ds.selector,
2663             );
2664 
2665             let regs = X86_64UserRegs {
2666                 regs1,
2667                 regs2,
2668                 rip: gregs.get_rip(),
2669                 cs: sregs.cs.selector as u64,
2670                 eflags: gregs.get_rflags(),
2671                 rsp: gregs.get_rsp(),
2672                 ss: sregs.ss.selector as u64,
2673                 fs_base: sregs.fs.base,
2674                 gs_base: sregs.gs.base,
2675                 ds: sregs.ds.selector as u64,
2676                 es: sregs.es.selector as u64,
2677                 fs: sregs.fs.selector as u64,
2678                 gs: sregs.gs.selector as u64,
2679             };
2680 
2682             let bytes: &[u8] = regs.as_slice();
2683             buf.resize(note_size as usize, 0);
2684             buf.splice(pos.., bytes.to_vec());
2685             buf.resize(note_size as usize, 0);
2686 
2687             coredump_file
2688                 .write_all(&buf)
2689                 .map_err(GuestDebuggableError::CoredumpFile)?;
2690         }
2691 
2692         Ok(())
2693     }
2694 
2695     fn cpu_write_vmm_note(
2696         &mut self,
2697         dump_state: &DumpState,
2698     ) -> std::result::Result<(), GuestDebuggableError> {
2699         let mut coredump_file = dump_state.file.as_ref().unwrap();
2700         for vcpu in &self.vcpus {
2701             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2702             let mut pos: usize = 0;
2703             let mut buf = vec![0; note_size as usize];
2704             let descsz = size_of::<DumpCpusState>();
2705             let vcpu_id = vcpu.lock().unwrap().id;
2706 
2707             let note = Elf64_Nhdr {
2708                 n_namesz: COREDUMP_NAME_SIZE,
2709                 n_descsz: descsz as u32,
2710                 n_type: 0,
2711             };
2712 
2713             let bytes: &[u8] = note.as_slice();
2714             buf.splice(0.., bytes.to_vec());
2715             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2716 
2717             buf.resize(pos + 4, 0);
2718             buf.splice(pos.., "QEMU".to_string().into_bytes());
2719 
2720             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2721 
2722             let gregs = self.vcpus[usize::from(vcpu_id)]
2723                 .lock()
2724                 .unwrap()
2725                 .vcpu
2726                 .get_regs()
2727                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2728 
2729             let regs1 = [
2730                 gregs.get_rax(),
2731                 gregs.get_rbx(),
2732                 gregs.get_rcx(),
2733                 gregs.get_rdx(),
2734                 gregs.get_rsi(),
2735                 gregs.get_rdi(),
2736                 gregs.get_rsp(),
2737                 gregs.get_rbp(),
2738             ];
2739 
2740             let regs2 = [
2741                 gregs.get_r8(),
2742                 gregs.get_r9(),
2743                 gregs.get_r10(),
2744                 gregs.get_r11(),
2745                 gregs.get_r12(),
2746                 gregs.get_r13(),
2747                 gregs.get_r14(),
2748                 gregs.get_r15(),
2749             ];
2750 
2751             let sregs = self.vcpus[usize::from(vcpu_id)]
2752                 .lock()
2753                 .unwrap()
2754                 .vcpu
2755                 .get_sregs()
2756                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2757 
2758             let mut msrs = vec![MsrEntry {
2759                 index: msr_index::MSR_KERNEL_GS_BASE,
2760                 ..Default::default()
2761             }];
2762 
2763             self.vcpus[vcpu_id as usize]
2764                 .lock()
2765                 .unwrap()
2766                 .vcpu
2767                 .get_msrs(&mut msrs)
2768                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2769             let kernel_gs_base = msrs[0].data;
2770 
2771             let cs = CpuSegment::new(sregs.cs);
2772             let ds = CpuSegment::new(sregs.ds);
2773             let es = CpuSegment::new(sregs.es);
2774             let fs = CpuSegment::new(sregs.fs);
2775             let gs = CpuSegment::new(sregs.gs);
2776             let ss = CpuSegment::new(sregs.ss);
2777             let ldt = CpuSegment::new(sregs.ldt);
2778             let tr = CpuSegment::new(sregs.tr);
2779             let gdt = CpuSegment::new_from_table(sregs.gdt);
2780             let idt = CpuSegment::new_from_table(sregs.idt);
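                 // Control registers in QEMU's dump order: index 1 carries CR8, not the
                 // reserved CR1.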
2781             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2782             let regs = DumpCpusState {
2783                 version: 1,
2784                 size: size_of::<DumpCpusState>() as u32,
2785                 regs1,
2786                 regs2,
2787                 rip: gregs.get_rip(),
2788                 rflags: gregs.get_rflags(),
2789                 cs,
2790                 ds,
2791                 es,
2792                 fs,
2793                 gs,
2794                 ss,
2795                 ldt,
2796                 tr,
2797                 gdt,
2798                 idt,
2799                 cr,
2800                 kernel_gs_base,
2801             };
2802 
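                 // Serialize the state into the note body at `pos`, then pad the buffer
                 // back out to the aligned note size.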
2803             let bytes: &[u8] = regs.as_slice();
2804             buf.resize(note_size as usize, 0);
2805             buf.splice(pos.., bytes.to_vec());
2806             buf.resize(note_size as usize, 0);
2807 
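                 // As above, use write_all so a short write cannot truncate the note.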
2808             coredump_file
2809                 .write_all(&buf)
2810                 .map_err(GuestDebuggableError::CoredumpFile)?;
2811         }
2812 
2813         Ok(())
2814     }
2815 }
2816 
2817 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2818 #[cfg(test)]
2819 mod tests {
2820     use arch::layout::{BOOT_STACK_POINTER, ZERO_PAGE_START};
2821     use arch::x86_64::interrupts::*;
2822     use arch::x86_64::regs::*;
2823     use hypervisor::arch::x86::{FpuState, LapicState};
2824     use hypervisor::StandardRegisters;
2825     use linux_loader::loader::bootparam::setup_header;
2826 
2827     #[test]
2828     fn test_setlint() {
2829         let hv = hypervisor::new().unwrap();
2830         let vm = hv.create_vm().expect("new VM fd creation failed");
2831         hv.check_required_extensions().unwrap();
2832         // Calling get_lapic will fail if there is no irqchip beforehand.
2833         vm.create_irq_chip().unwrap();
2834         let vcpu = vm.create_vcpu(0, None).unwrap();
2835         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2836 
2837         // Compute the value that is expected to represent LVT0 and LVT1.
2838         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2839         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2840         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2841         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2842 
2843         set_lint(&vcpu).unwrap();
2844 
2845         // Compute the value that represents LVT0 and LVT1 after set_lint.
2846         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2847         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2848         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2849         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2850         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2851     }
2852 
2853     #[test]
2854     fn test_setup_fpu() {
2855         let hv = hypervisor::new().unwrap();
2856         let vm = hv.create_vm().expect("new VM fd creation failed");
2857         let vcpu = vm.create_vcpu(0, None).unwrap();
2858         setup_fpu(&vcpu).unwrap();
2859 
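         // 0x37f is the x87 FCW value after FNINIT and 0x1f80 the architectural
         // MXCSR reset value, i.e. the defaults setup_fpu is expected to program.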
2860         let expected_fpu: FpuState = FpuState {
2861             fcw: 0x37f,
2862             mxcsr: 0x1f80,
2863             ..Default::default()
2864         };
2865         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2866         // TODO: auto-generate KVM-related structures with PartialEq derived.
2867         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2868         // Setting the mxcsr register through FpuState in setup_fpu has no effect
2869         // (see 'kvm_arch_vcpu_ioctl_set_fpu' in arch/x86/kvm/x86.c): mxcsr stays 0
2870         // and the assert below would fail. It is disabled until we decide whether
2871         // the mxcsr check should be removed entirely.
2872         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2873     }
2874 
2875     #[test]
2876     fn test_setup_msrs() {
2877         use hypervisor::arch::x86::{msr_index, MsrEntry};
2878 
2879         let hv = hypervisor::new().unwrap();
2880         let vm = hv.create_vm().expect("new VM fd creation failed");
2881         let vcpu = vm.create_vcpu(0, None).unwrap();
2882         setup_msrs(&vcpu).unwrap();
2883 
2884         // This test will check against the last MSR entry configured (the tenth one).
2885         // See create_msr_entries for details.
2886         let mut msrs = vec![MsrEntry {
2887             index: msr_index::MSR_IA32_MISC_ENABLE,
2888             ..Default::default()
2889         }];
2890 
2891         // get_msrs returns the number of MSRs it succeeded in reading. We only want
2892         // to read one in this test.
2893         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2894         assert_eq!(read_msrs, 1);
2895 
2896         // Official entries that were set up by setup_msrs. Assert that the tenth one
2897         // (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
2898         // expect.
2899         let entry_vec = vcpu.boot_msr_entries();
2900         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2901     }
2902 
2903     #[test]
2904     fn test_setup_regs_for_pvh() {
2905         let hv = hypervisor::new().unwrap();
2906         let vm = hv.create_vm().expect("new VM fd creation failed");
2907         let vcpu = vm.create_vcpu(0, None).unwrap();
2908 
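         // Per the PVH boot ABI, RBX points at the hvm_start_info structure and
         // RFLAGS carries only its always-one bit (bit 1).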
2909         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2910         expected_regs.set_rflags(0x0000000000000002u64);
2911         expected_regs.set_rbx(arch::layout::PVH_INFO_START.0);
2912         expected_regs.set_rip(1);
2913 
2914         setup_regs(
2915             &vcpu,
2916             arch::EntryPoint {
2917                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2918                 setup_header: None,
2919             },
2920         )
2921         .unwrap();
2922 
2923         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2924         assert_eq!(actual_regs, expected_regs);
2925     }
2926 
2927     #[test]
2928     fn test_setup_regs_for_bzimage() {
2929         let hv = hypervisor::new().unwrap();
2930         let vm = hv.create_vm().expect("new VM fd creation failed");
2931         let vcpu = vm.create_vcpu(0, None).unwrap();
2932 
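         // The Linux 64-bit boot protocol expects RSI to point at the zero page
         // (boot_params) and RSP at a usable boot stack.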
2933         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2934         expected_regs.set_rflags(0x0000000000000002u64);
2935         expected_regs.set_rip(1);
2936         expected_regs.set_rsp(BOOT_STACK_POINTER.0);
2937         expected_regs.set_rsi(ZERO_PAGE_START.0);
2938 
2939         setup_regs(
2940             &vcpu,
2941             arch::EntryPoint {
2942                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2943                 setup_header: Some(setup_header {
2944                     ..Default::default()
2945                 }),
2946             },
2947         )
2948         .unwrap();
2949 
2950         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2951         assert_eq!(actual_regs, expected_regs);
2952     }
2953 }
2954 
2955 #[cfg(target_arch = "aarch64")]
2956 #[cfg(test)]
2957 mod tests {
2958     #[cfg(feature = "kvm")]
2959     use std::mem;
2960 
2961     use arch::layout;
2962     use hypervisor::arch::aarch64::regs::MPIDR_EL1;
2963     #[cfg(feature = "kvm")]
2964     use hypervisor::kvm::aarch64::is_system_register;
2965     #[cfg(feature = "kvm")]
2966     use hypervisor::kvm::kvm_bindings::{
2967         user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2968     };
2969     #[cfg(feature = "kvm")]
2970     use hypervisor::{arm64_core_reg_id, offset_of};
2971 
2972     #[test]
2973     fn test_setup_regs() {
2974         let hv = hypervisor::new().unwrap();
2975         let vm = hv.create_vm().unwrap();
2976         let vcpu = vm.create_vcpu(0, None).unwrap();
2977 
2978         // Must fail when vcpu is not initialized yet.
2979         vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap_err();
2980 
2981         let mut kvi = vcpu.create_vcpu_init();
2982         vm.get_preferred_target(&mut kvi).unwrap();
2983         vcpu.vcpu_init(&kvi).unwrap();
2984 
2985         vcpu.setup_regs(0, 0x0, layout::FDT_START.0).unwrap();
2986     }
2987 
2988     #[test]
2989     fn test_read_mpidr() {
2990         let hv = hypervisor::new().unwrap();
2991         let vm = hv.create_vm().unwrap();
2992         let vcpu = vm.create_vcpu(0, None).unwrap();
2993         let mut kvi = vcpu.create_vcpu_init();
2994         vm.get_preferred_target(&mut kvi).unwrap();
2995 
2996         // Must fail when vcpu is not initialized yet.
2997         vcpu.get_sys_reg(MPIDR_EL1).unwrap_err();
2998 
2999         vcpu.vcpu_init(&kvi).unwrap();
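         // Bit 31 of MPIDR_EL1 is RES1 and vcpu 0 has all-zero affinity fields,
         // hence the expected value 0x8000_0000.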
3000         assert_eq!(vcpu.get_sys_reg(MPIDR_EL1).unwrap(), 0x80000000);
3001     }
3002 
3003     #[cfg(feature = "kvm")]
3004     #[test]
3005     fn test_is_system_register() {
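         // Core registers are addressed by their offset into user_pt_regs; only
         // register ids carrying KVM_REG_ARM64_SYSREG are system registers.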
3006         let offset = offset_of!(user_pt_regs, pc);
3007         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
3008         assert!(!is_system_register(regid));
3009         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
3010         assert!(is_system_register(regid));
3011     }
3012 
3013     #[test]
3014     fn test_save_restore_core_regs() {
3015         let hv = hypervisor::new().unwrap();
3016         let vm = hv.create_vm().unwrap();
3017         let vcpu = vm.create_vcpu(0, None).unwrap();
3018         let mut kvi = vcpu.create_vcpu_init();
3019         vm.get_preferred_target(&mut kvi).unwrap();
3020 
3021         // Must fail while the vcpu is not yet initialized.
3022         assert_eq!(
3023             format!("{}", vcpu.get_regs().unwrap_err()),
3024             "Failed to get aarch64 core register: Exec format error (os error 8)"
3025         );
3026 
3027         let mut state = vcpu.create_standard_regs();
3028         assert_eq!(
3029             format!("{}", vcpu.set_regs(&state).unwrap_err()),
3030             "Failed to set aarch64 core register: Exec format error (os error 8)"
3031         );
3032 
3033         vcpu.vcpu_init(&kvi).unwrap();
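         // KVM resets PSTATE to EL1h (0b0101) with all four DAIF exception bits
         // masked, which encodes as 0x3C5.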
3034         state = vcpu.get_regs().unwrap();
3035         assert_eq!(state.get_pstate(), 0x3C5);
3036 
3037         vcpu.set_regs(&state).unwrap();
3038     }
3039 
3040     #[test]
3041     fn test_get_set_mpstate() {
3042         let hv = hypervisor::new().unwrap();
3043         let vm = hv.create_vm().unwrap();
3044         let vcpu = vm.create_vcpu(0, None).unwrap();
3045         let mut kvi = vcpu.create_vcpu_init();
3046         vm.get_preferred_target(&mut kvi).unwrap();
3047 
3048         let state = vcpu.get_mp_state().unwrap();
3049         vcpu.set_mp_state(state).unwrap();
3050     }
3051 }
3052