xref: /cloud-hypervisor/vmm/src/cpu.rs (revision d90fa96bb70492dfa8cf7419120dab5051e768ed)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::CpusConfig;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use crate::coredump::{
17     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
18     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
19     NT_PRSTATUS,
20 };
21 #[cfg(feature = "guest_debug")]
22 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
23 #[cfg(target_arch = "x86_64")]
24 use crate::memory_manager::MemoryManager;
25 use crate::seccomp_filters::{get_seccomp_filter, Thread};
26 #[cfg(target_arch = "x86_64")]
27 use crate::vm::physical_bits;
28 use crate::GuestMemoryMmap;
29 use crate::CPU_MANAGER_SNAPSHOT_ID;
30 use acpi_tables::{aml, sdt::Sdt, Aml};
31 use anyhow::anyhow;
32 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
33 use arch::aarch64::regs;
34 #[cfg(target_arch = "x86_64")]
35 use arch::x86_64::get_x2apic_id;
36 use arch::EntryPoint;
37 use arch::NumaNodes;
38 #[cfg(target_arch = "aarch64")]
39 use devices::gic::Gic;
40 use devices::interrupt_controller::InterruptController;
41 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
42 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
44 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
46 use hypervisor::arch::x86::msr_index;
47 #[cfg(target_arch = "x86_64")]
48 use hypervisor::arch::x86::CpuIdEntry;
49 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
50 use hypervisor::arch::x86::MsrEntry;
51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
52 use hypervisor::arch::x86::SpecialRegisters;
53 #[cfg(target_arch = "aarch64")]
54 use hypervisor::kvm::kvm_bindings;
55 #[cfg(all(target_arch = "aarch64", feature = "kvm"))]
56 use hypervisor::kvm::kvm_ioctls::Cap;
57 #[cfg(feature = "tdx")]
58 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
59 #[cfg(target_arch = "x86_64")]
60 use hypervisor::CpuVendor;
61 #[cfg(feature = "kvm")]
62 use hypervisor::HypervisorType;
63 #[cfg(feature = "guest_debug")]
64 use hypervisor::StandardRegisters;
65 use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
66 use libc::{c_void, siginfo_t};
67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
68 use linux_loader::elf::Elf64_Nhdr;
69 use seccompiler::{apply_filter, SeccompAction};
70 use std::collections::BTreeMap;
71 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
72 use std::io::Write;
73 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
74 use std::mem::size_of;
75 use std::os::unix::thread::JoinHandleExt;
76 use std::sync::atomic::{AtomicBool, Ordering};
77 use std::sync::{Arc, Barrier, Mutex};
78 use std::{cmp, io, result, thread};
79 use thiserror::Error;
80 use tracer::trace_scoped;
81 use vm_device::BusDevice;
82 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
83 use vm_memory::ByteValued;
84 #[cfg(feature = "guest_debug")]
85 use vm_memory::{Bytes, GuestAddressSpace};
86 use vm_memory::{GuestAddress, GuestMemoryAtomic};
87 use vm_migration::{
88     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
89     Transportable,
90 };
91 use vmm_sys_util::eventfd::EventFd;
92 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
93 use zerocopy::AsBytes;
94 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
95 /// Extract the specified bits of a 64-bit integer.
96 /// For example, to extract 2 bits from offset 1 (zero-based) of `6u64`,
97 /// the following expression should return 3 (`0b11`):
98 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
99 ///
100 macro_rules! extract_bits_64 {
101     ($value: tt, $offset: tt, $length: tt) => {
102         ($value >> $offset) & (!0u64 >> (64 - $length))
103     };
104 }
105 
106 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
107 macro_rules! extract_bits_64_without_offset {
108     ($value: tt, $length: tt) => {
109         $value & (!0u64 >> (64 - $length))
110     };
111 }
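// A couple of minimal, illustrative evaluations of the helpers above:
//
//     assert_eq!(extract_bits_64!(0b0000_0110u64, 1, 2), 0b11); // shift, then mask
//     assert_eq!(extract_bits_64_without_offset!(0b0000_0110u64, 2), 0b10); // mask only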
112 
113 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
114 
115 #[derive(Debug, Error)]
116 pub enum Error {
117     #[error("Error creating vCPU: {0}")]
118     VcpuCreate(#[source] anyhow::Error),
119 
120     #[error("Error running vCPU: {0}")]
121     VcpuRun(#[source] anyhow::Error),
122 
123     #[error("Error spawning vCPU thread: {0}")]
124     VcpuSpawn(#[source] io::Error),
125 
126     #[error("Error generating common CPUID: {0}")]
127     CommonCpuId(#[source] arch::Error),
128 
129     #[error("Error configuring vCPU: {0}")]
130     VcpuConfiguration(#[source] arch::Error),
131 
132     #[error("vCPU removal still pending")]
133     VcpuPendingRemovedVcpu,
134 
135     #[cfg(target_arch = "aarch64")]
136     #[error("Error fetching preferred target: {0}")]
137     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
138 
139     #[cfg(target_arch = "aarch64")]
140     #[error("Error initialising vCPU: {0}")]
141     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
142 
143     #[cfg(target_arch = "aarch64")]
144     #[error("Error finalising vCPU: {0}")]
145     VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError),
146 
147     #[error("Failed to join on vCPU threads: {0:?}")]
148     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
149 
150     #[error("Error adding CpuManager to MMIO bus: {0}")]
151     BusError(#[source] vm_device::BusError),
152 
153     #[error("Requested vCPUs exceed maximum")]
154     DesiredVCpuCountExceedsMax,
155 
156     #[error("Cannot create seccomp filter: {0}")]
157     CreateSeccompFilter(#[source] seccompiler::Error),
158 
159     #[error("Cannot apply seccomp filter: {0}")]
160     ApplySeccompFilter(#[source] seccompiler::Error),
161 
162     #[error("Error starting vCPU after restore: {0}")]
163     StartRestoreVcpu(#[source] anyhow::Error),
164 
165     #[error("Unexpected VmExit")]
166     UnexpectedVmExit,
167 
168     #[error("Failed to allocate MMIO address for CpuManager")]
169     AllocateMmmioAddress,
170 
171     #[cfg(feature = "tdx")]
172     #[error("Error initializing TDX: {0}")]
173     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
174 
175     #[cfg(target_arch = "aarch64")]
176     #[error("Error initializing PMU: {0}")]
177     InitPmu(#[source] hypervisor::HypervisorCpuError),
178 
179     #[cfg(feature = "guest_debug")]
180     #[error("Error during CPU debug: {0}")]
181     CpuDebug(#[source] hypervisor::HypervisorCpuError),
182 
183     #[cfg(feature = "guest_debug")]
184     #[error("Error translating virtual address: {0}")]
185     TranslateVirtualAddress(#[source] anyhow::Error),
186 
187     #[cfg(target_arch = "x86_64")]
188     #[error("Error setting up AMX: {0}")]
189     AmxEnable(#[source] anyhow::Error),
190 
191     #[error("Maximum number of vCPUs exceeds host limit")]
192     MaximumVcpusExceeded,
193 
194     #[cfg(feature = "sev_snp")]
195     #[error("Failed to set sev control register: {0}")]
196     SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),
197 
198     #[cfg(target_arch = "x86_64")]
199     #[error("Failed to inject NMI")]
200     NmiError(hypervisor::HypervisorCpuError),
201 }
202 pub type Result<T> = result::Result<T, Error>;
203 
204 #[cfg(target_arch = "x86_64")]
205 #[allow(dead_code)]
206 #[repr(packed)]
207 #[derive(AsBytes)]
208 struct LocalX2Apic {
209     pub r#type: u8,
210     pub length: u8,
211     pub _reserved: u16,
212     pub apic_id: u32,
213     pub flags: u32,
214     pub processor_id: u32,
215 }
216 
217 #[allow(dead_code)]
218 #[repr(packed)]
219 #[derive(Default, AsBytes)]
220 struct Ioapic {
221     pub r#type: u8,
222     pub length: u8,
223     pub ioapic_id: u8,
224     _reserved: u8,
225     pub apic_address: u32,
226     pub gsi_base: u32,
227 }
228 
229 #[cfg(target_arch = "aarch64")]
230 #[allow(dead_code)]
231 #[repr(packed)]
232 #[derive(AsBytes)]
233 struct GicC {
234     pub r#type: u8,
235     pub length: u8,
236     pub reserved0: u16,
237     pub cpu_interface_number: u32,
238     pub uid: u32,
239     pub flags: u32,
240     pub parking_version: u32,
241     pub performance_interrupt: u32,
242     pub parked_address: u64,
243     pub base_address: u64,
244     pub gicv_base_address: u64,
245     pub gich_base_address: u64,
246     pub vgic_interrupt: u32,
247     pub gicr_base_address: u64,
248     pub mpidr: u64,
249     pub proc_power_effi_class: u8,
250     pub reserved1: u8,
251     pub spe_overflow_interrupt: u16,
252 }
253 
254 #[cfg(target_arch = "aarch64")]
255 #[allow(dead_code)]
256 #[repr(packed)]
257 #[derive(AsBytes)]
258 struct GicD {
259     pub r#type: u8,
260     pub length: u8,
261     pub reserved0: u16,
262     pub gic_id: u32,
263     pub base_address: u64,
264     pub global_irq_base: u32,
265     pub version: u8,
266     pub reserved1: [u8; 3],
267 }
268 
269 #[cfg(target_arch = "aarch64")]
270 #[allow(dead_code)]
271 #[repr(packed)]
272 #[derive(AsBytes)]
273 struct GicR {
274     pub r#type: u8,
275     pub length: u8,
276     pub reserved: u16,
277     pub base_address: u64,
278     pub range_length: u32,
279 }
280 
281 #[cfg(target_arch = "aarch64")]
282 #[allow(dead_code)]
283 #[repr(packed)]
284 #[derive(AsBytes)]
285 struct GicIts {
286     pub r#type: u8,
287     pub length: u8,
288     pub reserved0: u16,
289     pub translation_id: u32,
290     pub base_address: u64,
291     pub reserved1: u32,
292 }
293 
294 #[cfg(target_arch = "aarch64")]
295 #[allow(dead_code)]
296 #[repr(packed)]
297 #[derive(AsBytes)]
298 struct ProcessorHierarchyNode {
299     pub r#type: u8,
300     pub length: u8,
301     pub reserved: u16,
302     pub flags: u32,
303     pub parent: u32,
304     pub acpi_processor_id: u32,
305     pub num_private_resources: u32,
306 }
307 
308 #[allow(dead_code)]
309 #[repr(packed)]
310 #[derive(Default, AsBytes)]
311 struct InterruptSourceOverride {
312     pub r#type: u8,
313     pub length: u8,
314     pub bus: u8,
315     pub source: u8,
316     pub gsi: u32,
317     pub flags: u16,
318 }
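// The #[repr(packed)] + AsBytes structs above mirror the raw ACPI MADT entry
// layouts; create_madt() below appends them to the table byte-for-byte.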
319 
320 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
321 macro_rules! round_up {
322     ($n:expr,$d:expr) => {
323         (($n / ($d + 1)) + 1) * $d
324     };
325 }
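// Worked example of the arithmetic above (illustrative): with $d = 8,
// round_up!(8, 8) evaluates to ((8 / 9) + 1) * 8 = 8, while
// round_up!(9, 8) evaluates to ((9 / 9) + 1) * 8 = 16.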
326 
327 /// A wrapper around creating and using a kvm-based VCPU.
328 pub struct Vcpu {
329     // The hypervisor abstracted CPU.
330     vcpu: Arc<dyn hypervisor::Vcpu>,
331     id: u8,
332     #[cfg(target_arch = "aarch64")]
333     mpidr: u64,
334     saved_state: Option<CpuState>,
335     #[cfg(target_arch = "x86_64")]
336     vendor: CpuVendor,
337 }
338 
339 impl Vcpu {
340     /// Constructs a new VCPU for `vm`.
341     ///
342     /// # Arguments
343     ///
344     /// * `id` - Represents the CPU number between [0, max vcpus).
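    /// * `apic_id` - The APIC ID used to create the vCPU; on x86_64 it may differ from `id` when a CPU topology is configured.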
345     /// * `vm` - The virtual machine this vcpu will get attached to.
346     /// * `vm_ops` - Optional object for exit handling.
347     /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
348     pub fn new(
349         id: u8,
350         apic_id: u8,
351         vm: &Arc<dyn hypervisor::Vm>,
352         vm_ops: Option<Arc<dyn VmOps>>,
353         #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
354     ) -> Result<Self> {
355         let vcpu = vm
356             .create_vcpu(apic_id, vm_ops)
357             .map_err(|e| Error::VcpuCreate(e.into()))?;
358         // Initially the cpuid per vCPU is the one supported by this VM.
359         Ok(Vcpu {
360             vcpu,
361             id,
362             #[cfg(target_arch = "aarch64")]
363             mpidr: 0,
364             saved_state: None,
365             #[cfg(target_arch = "x86_64")]
366             vendor: cpu_vendor,
367         })
368     }
369 
370     /// Configures a vCPU. Should be called once per vCPU, right after creation.
371     ///
372     /// # Arguments
373     ///
374     /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
375     /// * `guest_memory` - Guest memory.
376     /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
377     pub fn configure(
378         &mut self,
379         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
380         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
381         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
382         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
383         #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
384     ) -> Result<()> {
385         #[cfg(target_arch = "aarch64")]
386         {
387             self.init(vm)?;
388             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
389                 .map_err(Error::VcpuConfiguration)?;
390         }
391         info!("Configuring vCPU: cpu_id = {}", self.id);
392         #[cfg(target_arch = "x86_64")]
393         arch::configure_vcpu(
394             &self.vcpu,
395             self.id,
396             boot_setup,
397             cpuid,
398             kvm_hyperv,
399             self.vendor,
400             topology,
401         )
402         .map_err(Error::VcpuConfiguration)?;
403 
404         Ok(())
405     }
406 
407     /// Gets the MPIDR register value.
408     #[cfg(target_arch = "aarch64")]
409     pub fn get_mpidr(&self) -> u64 {
410         self.mpidr
411     }
412 
413     /// Gets the saved vCPU state.
414     #[cfg(target_arch = "aarch64")]
415     pub fn get_saved_state(&self) -> Option<CpuState> {
416         self.saved_state.clone()
417     }
418 
419     /// Initializes an aarch64-specific vCPU for booting Linux.
420     #[cfg(target_arch = "aarch64")]
421     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
422         use std::arch::is_aarch64_feature_detected;
423         let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
424         #[allow(clippy::nonminimal_bool)]
425         let sve_supported =
426             is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2");
427         // This reads back the kernel's preferred target type.
428         vm.get_preferred_target(&mut kvi)
429             .map_err(Error::VcpuArmPreferredTarget)?;
430         // We already checked that the capability is supported.
431         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
432         if vm
433             .as_any()
434             .downcast_ref::<hypervisor::kvm::KvmVm>()
435             .unwrap()
436             .check_extension(Cap::ArmPmuV3)
437         {
438             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
439         }
440 
441         if sve_supported
442             && vm
443                 .as_any()
444                 .downcast_ref::<hypervisor::kvm::KvmVm>()
445                 .unwrap()
446                 .check_extension(Cap::ArmSve)
447         {
448             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_SVE;
449         }
450 
451         // Non-boot cpus are powered off initially.
452         if self.id > 0 {
453             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
454         }
455         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?;
456         if sve_supported {
457             self.vcpu
458                 .vcpu_finalize(kvm_bindings::KVM_ARM_VCPU_SVE as i32)
459                 .map_err(Error::VcpuArmFinalize)?;
460         }
461         Ok(())
462     }
463 
464     /// Runs the VCPU until it exits, returning the reason.
465     ///
466     /// Note that the state of the VCPU and associated VM must be set up first for this to do
467     /// anything useful.
468     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
469         self.vcpu.run()
470     }
471 
472     #[cfg(feature = "sev_snp")]
473     pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
474         self.vcpu
475             .set_sev_control_register(vmsa_pfn)
476             .map_err(Error::SetSevControlRegister)
477     }
478 }
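
// A minimal sketch of the create/configure/run cycle that CpuManager drives
// below (x86_64 signatures; hypothetical caller with `vm`, `vm_ops`, `cpuid`,
// `boot_setup` and `topology` in scope, error handling elided):
//
//     let mut vcpu = Vcpu::new(0, 0, &vm, Some(vm_ops), cpu_vendor)?;
//     vcpu.configure(boot_setup, cpuid, kvm_hyperv, topology)?;
//     loop {
//         match vcpu.run()? {
//             VmExit::Reset | VmExit::Shutdown => break,
//             _ => {}
//         }
//     }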
479 
480 impl Pausable for Vcpu {}
481 impl Snapshottable for Vcpu {
482     fn id(&self) -> String {
483         self.id.to_string()
484     }
485 
486     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
487         let saved_state = self
488             .vcpu
489             .state()
490             .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;
491 
492         self.saved_state = Some(saved_state.clone());
493 
494         Ok(Snapshot::from_data(SnapshotData::new_from_state(
495             &saved_state,
496         )?))
497     }
498 }
499 
500 pub struct CpuManager {
501     config: CpusConfig,
502     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
503     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
504     #[cfg(target_arch = "x86_64")]
505     cpuid: Vec<CpuIdEntry>,
506     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
507     vm: Arc<dyn hypervisor::Vm>,
508     vcpus_kill_signalled: Arc<AtomicBool>,
509     vcpus_pause_signalled: Arc<AtomicBool>,
510     vcpus_kick_signalled: Arc<AtomicBool>,
511     exit_evt: EventFd,
512     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
513     reset_evt: EventFd,
514     #[cfg(feature = "guest_debug")]
515     vm_debug_evt: EventFd,
516     vcpu_states: Vec<VcpuState>,
517     selected_cpu: u8,
518     vcpus: Vec<Arc<Mutex<Vcpu>>>,
519     seccomp_action: SeccompAction,
520     vm_ops: Arc<dyn VmOps>,
521     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
522     acpi_address: Option<GuestAddress>,
523     proximity_domain_per_cpu: BTreeMap<u8, u32>,
524     affinity: BTreeMap<u8, Vec<usize>>,
525     dynamic: bool,
526     hypervisor: Arc<dyn hypervisor::Hypervisor>,
527     #[cfg(feature = "sev_snp")]
528     sev_snp_enabled: bool,
529 }
530 
531 const CPU_ENABLE_FLAG: usize = 0;
532 const CPU_INSERTING_FLAG: usize = 1;
533 const CPU_REMOVING_FLAG: usize = 2;
534 const CPU_EJECT_FLAG: usize = 3;
535 
536 const CPU_STATUS_OFFSET: u64 = 4;
537 const CPU_SELECTION_OFFSET: u64 = 0;
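// Together these constants define the guest-visible layout of the CpuManager
// MMIO region (CPU_MANAGER_ACPI_SIZE bytes): byte 0 selects a vCPU id and
// byte 4 is a status/command bitfield built from the CPU_*_FLAG bits above,
// as implemented by the BusDevice read()/write() handlers below.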
538 
539 impl BusDevice for CpuManager {
540     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
541         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
542         data.fill(0);
543 
544         match offset {
545             CPU_SELECTION_OFFSET => {
546                 data[0] = self.selected_cpu;
547             }
548             CPU_STATUS_OFFSET => {
549                 if self.selected_cpu < self.max_vcpus() {
550                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
551                     if state.active() {
552                         data[0] |= 1 << CPU_ENABLE_FLAG;
553                     }
554                     if state.inserting {
555                         data[0] |= 1 << CPU_INSERTING_FLAG;
556                     }
557                     if state.removing {
558                         data[0] |= 1 << CPU_REMOVING_FLAG;
559                     }
560                 } else {
561                     warn!("Out of range vCPU id: {}", self.selected_cpu);
562                 }
563             }
564             _ => {
565                 warn!(
566                     "Unexpected offset for accessing CPU manager device: {:#}",
567                     offset
568                 );
569             }
570         }
571     }
572 
573     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
574         match offset {
575             CPU_SELECTION_OFFSET => {
576                 self.selected_cpu = data[0];
577             }
578             CPU_STATUS_OFFSET => {
579                 if self.selected_cpu < self.max_vcpus() {
580                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
581                     // The ACPI code writes back a 1 to acknowledge the insertion
582                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
583                         && state.inserting
584                     {
585                         state.inserting = false;
586                     }
587                     // Ditto for removal
588                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
589                         && state.removing
590                     {
591                         state.removing = false;
592                     }
593                     // Trigger removal of vCPU
594                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
595                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
596                             error!("Error removing vCPU: {:?}", e);
597                         }
598                     }
599                 } else {
600                     warn!("Out of range vCPU id: {}", self.selected_cpu);
601                 }
602             }
603             _ => {
604                 warn!(
605                     "Unexpected offset for accessing CPU manager device: {:#}",
606                     offset
607                 );
608             }
609         }
610         None
611     }
612 }
613 
614 #[derive(Default)]
615 struct VcpuState {
616     inserting: bool,
617     removing: bool,
618     pending_removal: Arc<AtomicBool>,
619     handle: Option<thread::JoinHandle<()>>,
620     kill: Arc<AtomicBool>,
621     vcpu_run_interrupted: Arc<AtomicBool>,
622     paused: Arc<AtomicBool>,
623 }
624 
625 impl VcpuState {
626     fn active(&self) -> bool {
627         self.handle.is_some()
628     }
629 
630     fn signal_thread(&self) {
631         if let Some(handle) = self.handle.as_ref() {
632             loop {
633                 // SAFETY: FFI call with correct arguments
634                 unsafe {
635                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
636                 }
637                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
638                     break;
639                 } else {
640                     // This is more effective than thread::yield_now() at
641                     // avoiding a priority inversion with the vCPU thread
642                     thread::sleep(std::time::Duration::from_millis(1));
643                 }
644             }
645         }
646     }
647 
648     fn join_thread(&mut self) -> Result<()> {
649         if let Some(handle) = self.handle.take() {
650             handle.join().map_err(Error::ThreadCleanup)?
651         }
652 
653         Ok(())
654     }
655 
656     fn unpark_thread(&self) {
657         if let Some(handle) = self.handle.as_ref() {
658             handle.thread().unpark()
659         }
660     }
661 }
662 
663 impl CpuManager {
664     #[allow(unused_variables)]
665     #[allow(clippy::too_many_arguments)]
666     pub fn new(
667         config: &CpusConfig,
668         vm: Arc<dyn hypervisor::Vm>,
669         exit_evt: EventFd,
670         reset_evt: EventFd,
671         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
672         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
673         seccomp_action: SeccompAction,
674         vm_ops: Arc<dyn VmOps>,
675         #[cfg(feature = "tdx")] tdx_enabled: bool,
676         numa_nodes: &NumaNodes,
677         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
678     ) -> Result<Arc<Mutex<CpuManager>>> {
679         if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
680             return Err(Error::MaximumVcpusExceeded);
681         }
682 
683         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
684         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
685         let hypervisor_type = hypervisor.hypervisor_type();
686         #[cfg(target_arch = "x86_64")]
687         let cpu_vendor = hypervisor.get_cpu_vendor();
688 
689         #[cfg(target_arch = "x86_64")]
690         if config.features.amx {
691             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
692             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
693             const XFEATURE_XTILEDATA: usize = 18;
694             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
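
            // These are the Linux arch_prctl(2) XSTATE permission operations:
            // ARCH_REQ_XCOMP_GUEST_PERM asks the kernel to grant the guest
            // permission for the AMX XTILEDATA state, and
            // ARCH_GET_XCOMP_GUEST_PERM reads the granted mask back so the
            // request can be verified below.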
695 
696             // SAFETY: the syscall is only modifying kernel internal
697             // data structures that the kernel is itself expected to safeguard.
698             let amx_tile = unsafe {
699                 libc::syscall(
700                     libc::SYS_arch_prctl,
701                     ARCH_REQ_XCOMP_GUEST_PERM,
702                     XFEATURE_XTILEDATA,
703                 )
704             };
705 
706             if amx_tile != 0 {
707                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
708             } else {
709                 let mask: usize = 0;
710                 // SAFETY: the mask being modified (not marked mutable as it is
711                 // modified in unsafe only which is permitted) isn't in use elsewhere.
712                 let result = unsafe {
713                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
714                 };
715                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
716                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
717                 }
718             }
719         }
720 
721         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
722             let mut cpu_list = Vec::new();
723             for (proximity_domain, numa_node) in numa_nodes.iter() {
724                 for cpu in numa_node.cpus.iter() {
725                     cpu_list.push((*cpu, *proximity_domain))
726                 }
727             }
728             cpu_list
729         }
730         .into_iter()
731         .collect();
732 
733         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
734             cpu_affinity
735                 .iter()
736                 .map(|a| (a.vcpu, a.host_cpus.clone()))
737                 .collect()
738         } else {
739             BTreeMap::new()
740         };
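        // For example (illustrative CLI syntax), `--cpus affinity=[0@[1,2],1@[3]]`
        // yields affinity = {0 => [1, 2], 1 => [3]}, i.e. vCPU0 pinned to host
        // CPUs 1-2 and vCPU1 to host CPU 3 via sched_setaffinity() in start_vcpu().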
741 
742         #[cfg(feature = "tdx")]
743         let dynamic = !tdx_enabled;
744         #[cfg(not(feature = "tdx"))]
745         let dynamic = true;
746 
747         Ok(Arc::new(Mutex::new(CpuManager {
748             config: config.clone(),
749             interrupt_controller: None,
750             #[cfg(target_arch = "x86_64")]
751             cpuid: Vec::new(),
752             vm,
753             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
754             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
755             vcpus_kick_signalled: Arc::new(AtomicBool::new(false)),
756             vcpu_states,
757             exit_evt,
758             reset_evt,
759             #[cfg(feature = "guest_debug")]
760             vm_debug_evt,
761             selected_cpu: 0,
762             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
763             seccomp_action,
764             vm_ops,
765             acpi_address: None,
766             proximity_domain_per_cpu,
767             affinity,
768             dynamic,
769             hypervisor: hypervisor.clone(),
770             #[cfg(feature = "sev_snp")]
771             sev_snp_enabled,
772         })))
773     }
774 
775     #[cfg(target_arch = "x86_64")]
776     pub fn populate_cpuid(
777         &mut self,
778         memory_manager: &Arc<Mutex<MemoryManager>>,
779         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
780         #[cfg(feature = "tdx")] tdx: bool,
781     ) -> Result<()> {
782         let sgx_epc_sections = memory_manager
783             .lock()
784             .unwrap()
785             .sgx_epc_region()
786             .as_ref()
787             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
788 
789         self.cpuid = {
790             let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
791             arch::generate_common_cpuid(
792                 hypervisor,
793                 &arch::CpuidConfig {
794                     sgx_epc_sections,
795                     phys_bits,
796                     kvm_hyperv: self.config.kvm_hyperv,
797                     #[cfg(feature = "tdx")]
798                     tdx,
799                     amx: self.config.features.amx,
800                 },
801             )
802             .map_err(Error::CommonCpuId)?
803         };
804 
805         Ok(())
806     }
807 
808     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
809         info!("Creating vCPU: cpu_id = {}", cpu_id);
810 
811         #[cfg(target_arch = "x86_64")]
812         let topology = self.get_vcpu_topology();
813         #[cfg(target_arch = "x86_64")]
814         let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
815         #[cfg(target_arch = "aarch64")]
816         let x2apic_id = cpu_id as u32;
817 
818         let mut vcpu = Vcpu::new(
819             cpu_id,
820             x2apic_id as u8,
821             &self.vm,
822             Some(self.vm_ops.clone()),
823             #[cfg(target_arch = "x86_64")]
824             self.hypervisor.get_cpu_vendor(),
825         )?;
826 
827         if let Some(snapshot) = snapshot {
828             // AArch64 vCPUs should be initialized after being created.
829             #[cfg(target_arch = "aarch64")]
830             vcpu.init(&self.vm)?;
831 
832             let state: CpuState = snapshot.to_state().map_err(|e| {
833                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
834             })?;
835             vcpu.vcpu
836                 .set_state(&state)
837                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
838 
839             vcpu.saved_state = Some(state);
840         }
841 
842         let vcpu = Arc::new(Mutex::new(vcpu));
843 
844         // Adding vCPU to the CpuManager's vCPU list.
845         self.vcpus.push(vcpu.clone());
846 
847         Ok(vcpu)
848     }
849 
850     pub fn configure_vcpu(
851         &self,
852         vcpu: Arc<Mutex<Vcpu>>,
853         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
854     ) -> Result<()> {
855         let mut vcpu = vcpu.lock().unwrap();
856 
857         #[cfg(feature = "sev_snp")]
858         if self.sev_snp_enabled {
859             if let Some((kernel_entry_point, _)) = boot_setup {
860                 vcpu.set_sev_control_register(
861                     kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
862                 )?;
863             }
864 
865             // The traditional way of configuring a vCPU doesn't work for SEV-SNP guests.
866             // All the vCPU configuration for a SEV-SNP guest is provided via the VMSA.
867             return Ok(());
868         }
869 
870         #[cfg(target_arch = "x86_64")]
871         assert!(!self.cpuid.is_empty());
872 
873         #[cfg(target_arch = "x86_64")]
874         let topology = self.config.topology.clone().map_or_else(
875             || Some((1, self.boot_vcpus(), 1)),
876             |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
877         );
878         #[cfg(target_arch = "x86_64")]
879         vcpu.configure(
880             boot_setup,
881             self.cpuid.clone(),
882             self.config.kvm_hyperv,
883             topology,
884         )?;
885 
886         #[cfg(target_arch = "aarch64")]
887         vcpu.configure(&self.vm, boot_setup)?;
888 
889         Ok(())
890     }
891 
892     /// Only create new vCPUs if there aren't any inactive ones to reuse
893     fn create_vcpus(
894         &mut self,
895         desired_vcpus: u8,
896         snapshot: Option<Snapshot>,
897     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
898         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
899         info!(
900             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
901             desired_vcpus,
902             self.config.max_vcpus,
903             self.vcpus.len(),
904             self.present_vcpus()
905         );
906 
907         if desired_vcpus > self.config.max_vcpus {
908             return Err(Error::DesiredVCpuCountExceedsMax);
909         }
910 
911         // Only create vCPUs in excess of all the allocated vCPUs.
912         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
913             vcpus.push(self.create_vcpu(
914                 cpu_id,
915                 // TODO: The special format of the CPU id can be removed once
916                 // ready to break live upgrade.
917                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
918             )?);
919         }
920 
921         Ok(vcpus)
922     }
923 
924     #[cfg(target_arch = "aarch64")]
925     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
926         for cpu in self.vcpus.iter() {
927             let cpu = cpu.lock().unwrap();
928             // Check if the PMU attribute is available; if not, log it and return early.
929             if cpu.vcpu.has_pmu_support() {
930                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
931             } else {
932                 debug!(
933                     "PMU attribute is not supported in vCPU{}, skipping PMU init!",
934                     cpu.id
935                 );
936                 return Ok(false);
937             }
938         }
939 
940         Ok(true)
941     }
942 
943     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
944         self.vcpus.clone()
945     }
946 
947     fn start_vcpu(
948         &mut self,
949         vcpu: Arc<Mutex<Vcpu>>,
950         vcpu_id: u8,
951         vcpu_thread_barrier: Arc<Barrier>,
952         inserting: bool,
953     ) -> Result<()> {
954         let reset_evt = self.reset_evt.try_clone().unwrap();
955         let exit_evt = self.exit_evt.try_clone().unwrap();
956         #[cfg(feature = "kvm")]
957         let hypervisor_type = self.hypervisor.hypervisor_type();
958         #[cfg(feature = "guest_debug")]
959         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
960         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
961         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
962         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
963         let vcpu_kick_signalled = self.vcpus_kick_signalled.clone();
964 
965         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
966         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
967             .vcpu_run_interrupted
968             .clone();
969         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
970         let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();
971 
972         // Prepare the CPU set the current vCPU is expected to run on.
973         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
974             // SAFETY: all zeros is a valid pattern
975             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
976             // SAFETY: FFI call, trivially safe
977             unsafe { libc::CPU_ZERO(&mut cpuset) };
978             for host_cpu in host_cpus {
979                 // SAFETY: FFI call, trivially safe
980                 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
981             }
982             cpuset
983         });
984 
985         // Retrieve seccomp filter for vcpu thread
986         let vcpu_seccomp_filter = get_seccomp_filter(
987             &self.seccomp_action,
988             Thread::Vcpu,
989             self.hypervisor.hypervisor_type(),
990         )
991         .map_err(Error::CreateSeccompFilter)?;
992 
993         #[cfg(target_arch = "x86_64")]
994         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
995 
996         info!("Starting vCPU: cpu_id = {}", vcpu_id);
997 
998         let handle = Some(
999             thread::Builder::new()
1000                 .name(format!("vcpu{vcpu_id}"))
1001                 .spawn(move || {
1002                     // Schedule the thread to run on the expected CPU set
1003                     if let Some(cpuset) = cpuset.as_ref() {
1004                         // SAFETY: FFI call with correct arguments
1005                         let ret = unsafe {
1006                             libc::sched_setaffinity(
1007                                 0,
1008                                 std::mem::size_of::<libc::cpu_set_t>(),
1009                                 cpuset as *const libc::cpu_set_t,
1010                             )
1011                         };
1012 
1013                         if ret != 0 {
1014                             error!(
1015                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
1016                                 vcpu_id,
1017                                 io::Error::last_os_error()
1018                             );
1019                             return;
1020                         }
1021                     }
1022 
1023                     // Apply seccomp filter for vcpu thread.
1024                     if !vcpu_seccomp_filter.is_empty() {
1025                         if let Err(e) =
1026                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
1027                         {
1028                             error!("Error applying seccomp filter: {:?}", e);
1029                             return;
1030                         }
1031                     }
1032                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
1033                     // This registers an async-signal-safe handler so SIGRTMIN can interrupt a running vCPU thread.
1034                     register_signal_handler(SIGRTMIN(), handle_signal)
1035                         .expect("Failed to register vcpu signal handler");
1036                     // Block until all CPUs are ready.
1037                     vcpu_thread_barrier.wait();
1038 
1039                     std::panic::catch_unwind(move || {
1040                         loop {
1041                             // If we are being told to pause, we park the thread
1042                             // until the pause boolean is toggled.
1043                             // The resume operation is responsible for toggling
1044                             // the boolean and unparking the thread.
1045                             // We enter a loop because park() could spuriously
1046                             // return. We will then park() again unless the
1047                             // pause boolean has been toggled.
1048 
1049                             // Need to use Ordering::SeqCst as we have multiple
1050                             // loads and stores to different atomics and we need
1051                             // to see them in a consistent order in all threads
1052 
1053                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
1054                                 // As a pause can be caused by PIO & MMIO exits, we need to ensure they are
1055                                 // completed by returning to KVM_RUN. From the kernel docs:
1056                                 //
1057                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
1058                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
1059                                 // operations are complete (and guest state is consistent) only after userspace
1060                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
1061                                 // incomplete operations and then check for pending signals.
1062                                 // The pending state of the operation is not preserved in state which is
1063                                 // visible to userspace, thus userspace should ensure that the operation is
1064                                 // completed before performing a live migration.  Userspace can re-enter the
1065                                 // guest with an unmasked signal pending or with the immediate_exit field set
1066                                 // to complete pending operations without allowing any further instructions
1067                                 // to be executed.
1068 
1069                                 #[cfg(feature = "kvm")]
1070                                 if matches!(hypervisor_type, HypervisorType::Kvm) {
1071                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
1072                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
1073                                         error!("Unexpected VM exit on \"immediate_exit\" run");
1074                                         break;
1075                                     }
1076                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
1077                                 }
1078 
1079                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1080 
1081                                 vcpu_paused.store(true, Ordering::SeqCst);
1082                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
1083                                     thread::park();
1084                                 }
1085                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
1086                             }
1087 
1088                             if vcpu_kick_signalled.load(Ordering::SeqCst) {
1089                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1090                                 #[cfg(target_arch = "x86_64")]
1091                                 match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
1092                                     Ok(()) => {},
1093                                     Err(e) => {
1094                                         error!("Error when injecting NMI: {}", e);
1095                                         break;
1096                                     }
1097                                 }
1098                             }
1099 
1100                             // We've been told to terminate
1101                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1102                                 || vcpu_kill.load(Ordering::SeqCst)
1103                             {
1104                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1105                                 break;
1106                             }
1107 
1108                             #[cfg(feature = "tdx")]
1109                             let mut vcpu = vcpu.lock().unwrap();
1110                             #[cfg(not(feature = "tdx"))]
1111                             let vcpu = vcpu.lock().unwrap();
1112                             // vcpu.run() returns VmExit::Reset on a triple fault, which triggers a VM reset below
1113                             match vcpu.run() {
1114                                 Ok(run) => match run {
1115                                     #[cfg(feature = "kvm")]
1116                                     VmExit::Debug => {
1117                                         info!("VmExit::Debug");
1118                                         #[cfg(feature = "guest_debug")]
1119                                         {
1120                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
1121                                             let raw_tid = get_raw_tid(vcpu_id as usize);
1122                                             vm_debug_evt.write(raw_tid as u64).unwrap();
1123                                         }
1124                                     }
1125                                     #[cfg(target_arch = "x86_64")]
1126                                     VmExit::IoapicEoi(vector) => {
1127                                         if let Some(interrupt_controller) =
1128                                             &interrupt_controller_clone
1129                                         {
1130                                             interrupt_controller
1131                                                 .lock()
1132                                                 .unwrap()
1133                                                 .end_of_interrupt(vector);
1134                                         }
1135                                     }
1136                                     VmExit::Ignore => {}
1137                                     VmExit::Hyperv => {}
1138                                     VmExit::Reset => {
1139                                         info!("VmExit::Reset");
1140                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1141                                         reset_evt.write(1).unwrap();
1142                                         break;
1143                                     }
1144                                     VmExit::Shutdown => {
1145                                         info!("VmExit::Shutdown");
1146                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1147                                         exit_evt.write(1).unwrap();
1148                                         break;
1149                                     }
1150                                     #[cfg(feature = "tdx")]
1151                                     VmExit::Tdx => {
1152                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1153                                             match vcpu.get_tdx_exit_details() {
1154                                                 Ok(details) => match details {
1155                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1156                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1157                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1158                                                     }
1159                                                 },
1160                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1161                                             }
1162                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1163                                         } else {
1164                                             // We should never reach this code as
1165                                             // this means the design from the code
1166                                             // is wrong.
1167                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1168                                         }
1169                                     }
1170                                 },
1171 
1172                                 Err(e) => {
1173                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1174                                     vcpu_run_interrupted.store(true, Ordering::SeqCst);
1175                                     exit_evt.write(1).unwrap();
1176                                     break;
1177                                 }
1178                             }
1179 
1180                             // We've been told to terminate
1181                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1182                                 || vcpu_kill.load(Ordering::SeqCst)
1183                             {
1184                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1185                                 break;
1186                             }
1187                         }
1188                     })
1189                     .or_else(|_| {
1190                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1191                         error!("vCPU thread panicked");
1192                         panic_exit_evt.write(1)
1193                     })
1194                     .ok();
1195                 })
1196                 .map_err(Error::VcpuSpawn)?,
1197         );
1198 
1199         // Hotplugged vCPUs are started through this same function. It is for
1200         // those hotplug CPU additions that we need to set the inserting flag.
1201         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1202         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1203 
1204         Ok(())
1205     }
1206 
1207     /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
1208     fn activate_vcpus(
1209         &mut self,
1210         desired_vcpus: u8,
1211         inserting: bool,
1212         paused: Option<bool>,
1213     ) -> Result<()> {
1214         if desired_vcpus > self.config.max_vcpus {
1215             return Err(Error::DesiredVCpuCountExceedsMax);
1216         }
1217 
1218         let vcpu_thread_barrier = Arc::new(Barrier::new(
1219             (desired_vcpus - self.present_vcpus() + 1) as usize,
1220         ));
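        // The +1 accounts for the current thread: each newly spawned vCPU
        // thread blocks on this barrier, and the final wait() at the end of
        // this function releases them all at once.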
1221 
1222         if let Some(paused) = paused {
1223             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1224         }
1225 
1226         info!(
1227             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1228             desired_vcpus,
1229             self.vcpus.len(),
1230             self.present_vcpus(),
1231             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1232         );
1233 
1234         // This reuses any inactive vCPUs as well as any that were newly created
1235         for vcpu_id in self.present_vcpus()..desired_vcpus {
1236             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1237             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1238         }
1239 
1240         // Unblock all CPU threads.
1241         vcpu_thread_barrier.wait();
1242         Ok(())
1243     }
1244 
1245     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1246         // Mark vCPUs for removal, actual removal happens on ejection
1247         for cpu_id in desired_vcpus..self.present_vcpus() {
1248             self.vcpu_states[usize::from(cpu_id)].removing = true;
1249             self.vcpu_states[usize::from(cpu_id)]
1250                 .pending_removal
1251                 .store(true, Ordering::SeqCst);
1252         }
1253     }
1254 
1255     pub fn check_pending_removed_vcpu(&mut self) -> bool {
1256         for state in self.vcpu_states.iter() {
1257             if state.active() && state.pending_removal.load(Ordering::SeqCst) {
1258                 return true;
1259             }
1260         }
1261         false
1262     }
1263 
1264     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1265         info!("Removing vCPU: cpu_id = {}", cpu_id);
1266         let state = &mut self.vcpu_states[usize::from(cpu_id)];
1267         state.kill.store(true, Ordering::SeqCst);
1268         state.signal_thread();
1269         state.join_thread()?;
1270         state.handle = None;
1271 
1272         // Once the thread has exited, clear the "kill" so that it can be reused
1273         state.kill.store(false, Ordering::SeqCst);
1274         state.pending_removal.store(false, Ordering::SeqCst);
1275 
1276         Ok(())
1277     }
1278 
1279     pub fn create_boot_vcpus(
1280         &mut self,
1281         snapshot: Option<Snapshot>,
1282     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1283         trace_scoped!("create_boot_vcpus");
1284 
1285         self.create_vcpus(self.boot_vcpus(), snapshot)
1286     }
1287 
1288     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1289     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1290         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1291     }
1292 
1293     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1294         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1295             .map_err(|e| {
1296                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1297             })?;
1298 
1299         Ok(())
1300     }
1301 
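    // Hotplug flow sketch: resize() below creates, configures and starts any
    // extra vCPU threads with `inserting` set (or marks excess vCPUs as
    // `removing`); the guest's ACPI code then observes and acknowledges those
    // bits through the BusDevice handlers above, and an eject write finally
    // lands in remove_vcpu().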
1302     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1303         if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
1304             return Ok(false);
1305         }
1306 
1307         if !self.dynamic {
1308             return Ok(false);
1309         }
1310 
1311         if self.check_pending_removed_vcpu() {
1312             return Err(Error::VcpuPendingRemovedVcpu);
1313         }
1314 
1315         match desired_vcpus.cmp(&self.present_vcpus()) {
1316             cmp::Ordering::Greater => {
1317                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1318                 for vcpu in vcpus {
1319                     self.configure_vcpu(vcpu, None)?
1320                 }
1321                 self.activate_vcpus(desired_vcpus, true, None)?;
1322                 Ok(true)
1323             }
1324             cmp::Ordering::Less => {
1325                 self.mark_vcpus_for_removal(desired_vcpus);
1326                 Ok(true)
1327             }
1328             _ => Ok(false),
1329         }
1330     }
1331 
1332     pub fn shutdown(&mut self) -> Result<()> {
1333         // Tell the vCPUs to stop themselves next time they go through the loop
1334         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1335 
1336         // Toggle the vCPUs pause boolean
1337         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1338 
1339         // Unpark all the VCPU threads.
1340         for state in self.vcpu_states.iter() {
1341             state.unpark_thread();
1342         }
1343 
1344         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1345         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1346         // above.
1347         for state in self.vcpu_states.iter() {
1348             state.signal_thread();
1349         }
1350 
1351         // Wait for all the threads to finish. This removes the state from the vector.
1352         for mut state in self.vcpu_states.drain(..) {
1353             state.join_thread()?;
1354         }
1355 
1356         Ok(())
1357     }
1358 
1359     #[cfg(feature = "tdx")]
1360     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1361         for vcpu in &self.vcpus {
1362             vcpu.lock()
1363                 .unwrap()
1364                 .vcpu
1365                 .tdx_init(hob_address)
1366                 .map_err(Error::InitializeTdx)?;
1367         }
1368         Ok(())
1369     }
1370 
1371     pub fn boot_vcpus(&self) -> u8 {
1372         self.config.boot_vcpus
1373     }
1374 
1375     pub fn max_vcpus(&self) -> u8 {
1376         self.config.max_vcpus
1377     }
1378 
1379     #[cfg(target_arch = "x86_64")]
1380     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1381         assert!(!self.cpuid.is_empty());
1382         self.cpuid.clone()
1383     }
1384 
1385     fn present_vcpus(&self) -> u8 {
1386         self.vcpu_states
1387             .iter()
1388             .fold(0, |acc, state| acc + state.active() as u8)
1389     }
1390 
1391     #[cfg(target_arch = "aarch64")]
1392     pub fn get_mpidrs(&self) -> Vec<u64> {
1393         self.vcpus
1394             .iter()
1395             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1396             .collect()
1397     }
1398 
1399     #[cfg(target_arch = "aarch64")]
1400     pub fn get_saved_states(&self) -> Vec<CpuState> {
1401         self.vcpus
1402             .iter()
1403             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1404             .collect()
1405     }
1406 
1407     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1408         self.config
1409             .topology
1410             .clone()
1411             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1412     }
1413 
1414     pub fn create_madt(&self) -> Sdt {
1415         use crate::acpi;
1416         // This is also checked in the command-line parsing.
1417         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1418 
1419         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1420         #[cfg(target_arch = "x86_64")]
1421         {
1422             madt.write(36, arch::layout::APIC_START.0);
1423 
1424             for cpu in 0..self.config.max_vcpus {
1425                 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());
1426 
1427                 let lapic = LocalX2Apic {
1428                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1429                     length: 16,
1430                     processor_id: cpu.into(),
1431                     apic_id: x2apic_id,
1432                     flags: if cpu < self.config.boot_vcpus {
1433                         1 << MADT_CPU_ENABLE_FLAG
1434                     } else {
1435                         0
1436                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1437                     _reserved: 0,
1438                 };
1439                 madt.append(lapic);
1440             }
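            // Worked example: with boot_vcpus == 2 and max_vcpus == 4, the loop
            // above emits flags 0b11 (Enabled | Online Capable) for CPUs 0-1 and
            // 0b10 (Online Capable only) for CPUs 2-3, letting the guest bring
            // the latter online after hotplug.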
1441 
1442             madt.append(Ioapic {
1443                 r#type: acpi::ACPI_APIC_IO,
1444                 length: 12,
1445                 ioapic_id: 0,
1446                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1447                 gsi_base: 0,
1448                 ..Default::default()
1449             });
1450 
1451             madt.append(InterruptSourceOverride {
1452                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1453                 length: 10,
1454                 bus: 0,
1455                 source: 4,
1456                 gsi: 4,
1457                 flags: 0,
1458             });
1459         }
1460 
1461         #[cfg(target_arch = "aarch64")]
1462         {
1463             /* Notes:
1464              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1465              */
1466 
1467             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1468             for cpu in 0..self.config.boot_vcpus {
1469                 let vcpu = &self.vcpus[cpu as usize];
1470                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1471                 /* ARMv8 MPIDR format:
1472                      Bits [63:40] Must be zero
1473                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1474                      Bits [31:24] Must be zero
1475                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1476                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1477                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1478                 */
1479                 let mpidr_mask = 0xff_00ff_ffff;
1480                 let gicc = GicC {
1481                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1482                     length: 80,
1483                     reserved0: 0,
1484                     cpu_interface_number: cpu as u32,
1485                     uid: cpu as u32,
1486                     flags: 1,
1487                     parking_version: 0,
1488                     performance_interrupt: 0,
1489                     parked_address: 0,
1490                     base_address: 0,
1491                     gicv_base_address: 0,
1492                     gich_base_address: 0,
1493                     vgic_interrupt: 0,
1494                     gicr_base_address: 0,
1495                     mpidr: mpidr & mpidr_mask,
1496                     proc_power_effi_class: 0,
1497                     reserved1: 0,
1498                     spe_overflow_interrupt: 0,
1499                 };
1500 
1501                 madt.append(gicc);
1502             }
1503             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1504 
1505             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1506             let gicd = GicD {
1507                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1508                 length: 24,
1509                 reserved0: 0,
1510                 gic_id: 0,
1511                 base_address: vgic_config.dist_addr,
1512                 global_irq_base: 0,
1513                 version: 3,
1514                 reserved1: [0; 3],
1515             };
1516             madt.append(gicd);
1517 
1518             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1519             let gicr = GicR {
1520                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1521                 length: 16,
1522                 reserved: 0,
1523                 base_address: vgic_config.redists_addr,
1524                 range_length: vgic_config.redists_size as u32,
1525             };
1526             madt.append(gicr);
1527 
1528             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1529             let gicits = GicIts {
1530                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1531                 length: 20,
1532                 reserved0: 0,
1533                 translation_id: 0,
1534                 base_address: vgic_config.msi_addr,
1535                 reserved1: 0,
1536             };
1537             madt.append(gicits);
1538 
1539             madt.update_checksum();
1540         }
1541 
1542         madt
1543     }
1544 
1545     #[cfg(target_arch = "aarch64")]
1546     pub fn create_pptt(&self) -> Sdt {
1547         let pptt_start = 0;
1548         let mut cpus = 0;
1549         let mut uid = 0;
1550         // If topology is not specified, the default setting is:
1551         // 1 package, multiple cores, 1 thread per core
1552         // This is also the behavior when PPTT is missing.
1553         let (threads_per_core, cores_per_package, packages) =
1554             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1555 
1556         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1557 
1558         for cluster_idx in 0..packages {
1559             if cpus < self.config.boot_vcpus as usize {
1560                 let cluster_offset = pptt.len() - pptt_start;
1561                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1562                     r#type: 0,
1563                     length: 20,
1564                     reserved: 0,
1565                     flags: 0x2,
1566                     parent: 0,
1567                     acpi_processor_id: cluster_idx as u32,
1568                     num_private_resources: 0,
1569                 };
1570                 pptt.append(cluster_hierarchy_node);
1571 
1572                 for core_idx in 0..cores_per_package {
1573                     let core_offset = pptt.len() - pptt_start;
1574 
1575                     if threads_per_core > 1 {
1576                         let core_hierarchy_node = ProcessorHierarchyNode {
1577                             r#type: 0,
1578                             length: 20,
1579                             reserved: 0,
1580                             flags: 0x2,
1581                             parent: cluster_offset as u32,
1582                             acpi_processor_id: core_idx as u32,
1583                             num_private_resources: 0,
1584                         };
1585                         pptt.append(core_hierarchy_node);
1586 
1587                         for _thread_idx in 0..threads_per_core {
1588                             let thread_hierarchy_node = ProcessorHierarchyNode {
1589                                 r#type: 0,
1590                                 length: 20,
1591                                 reserved: 0,
1592                                 flags: 0xE,
1593                                 parent: core_offset as u32,
1594                                 acpi_processor_id: uid as u32,
1595                                 num_private_resources: 0,
1596                             };
1597                             pptt.append(thread_hierarchy_node);
1598                             uid += 1;
1599                         }
1600                     } else {
1601                         let thread_hierarchy_node = ProcessorHierarchyNode {
1602                             r#type: 0,
1603                             length: 20,
1604                             reserved: 0,
1605                             flags: 0xA,
1606                             parent: cluster_offset as u32,
1607                             acpi_processor_id: uid as u32,
1608                             num_private_resources: 0,
1609                         };
1610                         pptt.append(thread_hierarchy_node);
1611                         uid += 1;
1612                     }
1613                 }
1614                 cpus += (cores_per_package * threads_per_core) as usize;
1615             }
1616         }
1617 
1618         pptt.update_checksum();
1619         pptt
1620     }
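
    // Worked example (flag values per the ACPI PPTT definition): for topology
    // (threads_per_core, cores_per_package, packages) == (2, 2, 1) with 4 boot
    // vCPUs, the builder above emits one cluster node (flags 0x2, "ACPI
    // Processor ID valid"), two core nodes (0x2), and four thread leaves
    // (flags 0xE, i.e. ID valid | thread | leaf) with uid 0..=3.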
1621 
1622     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1623     fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters {
1624         self.vcpus[usize::from(cpu_id)]
1625             .lock()
1626             .unwrap()
1627             .vcpu
1628             .create_standard_regs()
1629     }
1630 
1631     #[cfg(feature = "guest_debug")]
1632     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1633         self.vcpus[usize::from(cpu_id)]
1634             .lock()
1635             .unwrap()
1636             .vcpu
1637             .get_regs()
1638             .map_err(Error::CpuDebug)
1639     }
1640 
1641     #[cfg(feature = "guest_debug")]
1642     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1643         self.vcpus[usize::from(cpu_id)]
1644             .lock()
1645             .unwrap()
1646             .vcpu
1647             .set_regs(regs)
1648             .map_err(Error::CpuDebug)
1649     }
1650 
1651     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1652     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1653         self.vcpus[usize::from(cpu_id)]
1654             .lock()
1655             .unwrap()
1656             .vcpu
1657             .get_sregs()
1658             .map_err(Error::CpuDebug)
1659     }
1660 
1661     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1662     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1663         self.vcpus[usize::from(cpu_id)]
1664             .lock()
1665             .unwrap()
1666             .vcpu
1667             .set_sregs(sregs)
1668             .map_err(Error::CpuDebug)
1669     }
1670 
1671     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1672     fn translate_gva(
1673         &self,
1674         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1675         cpu_id: u8,
1676         gva: u64,
1677     ) -> Result<u64> {
1678         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1679             .lock()
1680             .unwrap()
1681             .vcpu
1682             .translate_gva(gva, /* flags: unused */ 0)
1683             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1684         Ok(gpa)
1685     }
1686 
1687     ///
1688     /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
1689     /// it in the VMM by walking through the translation tables.
1690     ///
1691     /// Address translation is a big topic; here we only focus on the scenario
1692     /// that arises in the VMM while debugging the kernel. This `translate_gva`
1693     /// implementation is restricted to:
1694     /// - Exception Level 1
1695     /// - Translating the high address range only (kernel space)
1696     ///
1697     /// This implementation supports the following Armv8-A features related to
1698     /// address translation:
1699     /// - FEAT_LPA
1700     /// - FEAT_LVA
1701     /// - FEAT_LPA2
1702     ///
1703     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1704     fn translate_gva(
1705         &self,
1706         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1707         cpu_id: u8,
1708         gva: u64,
1709     ) -> Result<u64> {
1710         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1711             .lock()
1712             .unwrap()
1713             .vcpu
1714             .get_sys_reg(regs::TCR_EL1)
1715             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1716         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1717             .lock()
1718             .unwrap()
1719             .vcpu
1720             .get_sys_reg(regs::TTBR1_EL1)
1721             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1722         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1723             .lock()
1724             .unwrap()
1725             .vcpu
1726             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1727             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1728 
1729         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1730         // or low (0x000xxx...).
1731         let high_range = extract_bits_64!(gva, 55, 1);
1732         if high_range == 0 {
1733             info!("VA (0x{:x}) range is not supported!", gva);
1734             return Ok(gva);
1735         }
1736 
1737         // High range size offset
1738         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1739         // Granule size
1740         let tg = extract_bits_64!(tcr_el1, 30, 2);
1741         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1742         let ds = extract_bits_64!(tcr_el1, 59, 1);
1743 
1744         if tsz == 0 {
1745             info!("VA translation is not ready!");
1746             return Ok(gva);
1747         }
1748 
1749         // VA size is determined by TCR_EL1.T1SZ
1750         let va_size = 64 - tsz;
1751         // Number of bits in VA consumed in each level of translation
1752         let stride = match tg {
1753             3 => 13, // 64KB granule size
1754             1 => 11, // 16KB granule size
1755             _ => 9,  // 4KB, default
1756         };
1757         // Starting level of walking
1758         let mut level = 4 - (va_size - 4) / stride;
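        // Worked example: with a 4KB granule (stride == 9) and T1SZ == 16,
        // va_size == 48 and the walk starts at level 4 - (48 - 4) / 9 == 0,
        // i.e. a classic 4-level walk consuming 9 bits of VA per level.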
1759 
1760         // Determine the PA or IPA size
1761         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1762         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1763         // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should match.
1764         // To be safe, we use the minimum value if they are different.
1765         let pa_range = std::cmp::min(tcr_ips, pa_range);
1766         // PA size in bits
1767         let pa_size = match pa_range {
1768             0 => 32,
1769             1 => 36,
1770             2 => 40,
1771             3 => 42,
1772             4 => 44,
1773             5 => 48,
1774             6 => 52,
1775             _ => {
1776                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1777                     "PA range not supported {pa_range}"
1778                 ))))
1779             }
1780         };
1781 
1782         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1783         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1784         // If FEAT_LPA2 is present, the translation table descriptor holds
1785         // 50 bits of the table address of the next level.
1786         // Otherwise, it is 48 bits.
1787         let descaddrmask = if ds == 1 {
1788             !0u64 >> (64 - 50) // mask with 50 least significant bits
1789         } else {
1790             !0u64 >> (64 - 48) // mask with 48 least significant bits
1791         };
1792         let descaddrmask = descaddrmask & !indexmask_grainsize;
1793 
1794         // Translation table base address
1795         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1796         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1797         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1798         if pa_size == 52 {
1799             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1800         }
1801 
1802         // Loop through tables of each level
1803         loop {
1804             // Table offset for current level
1805             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1806             descaddr |= table_offset;
1807             descaddr &= !7u64;
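            // Note that shifting by stride * (4 - level) rather than by the
            // architectural index position leaves the index pre-multiplied by
            // the 8-byte descriptor size; clearing the low 3 bits then aligns
            // descaddr to a descriptor boundary.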
1808 
1809             let mut buf = [0; 8];
1810             guest_memory
1811                 .memory()
1812                 .read(&mut buf, GuestAddress(descaddr))
1813                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1814             let descriptor = u64::from_le_bytes(buf);
1815 
1816             descaddr = descriptor & descaddrmask;
1817         // In the case of FEAT_LPA, the next-level translation table address
1818         // bits [48:51] come from bits [12:15] of the current descriptor.
1819         // For FEAT_LPA2, the next-level translation table address
1820         // bits [50:51] come from bits [8:9] of the current descriptor, while
1821         // bits [48:49] come from bits [48:49] of the descriptor handled
1822         // previously.
1823             if pa_size == 52 {
1824                 if ds == 1 {
1825                     // FEAT_LPA2
1826                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1827                 } else {
1828                     // FEAT_LPA
1829                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1830                 }
1831             }
1832 
1833             if (descriptor & 2) != 0 && (level < 3) {
1834                 // This is a table entry. Go down to next level.
1835                 level += 1;
1836                 indexmask = indexmask_grainsize;
1837                 continue;
1838             }
1839 
1840             break;
1841         }
1842 
1843         // We have reached either:
1844         // - a page entry at level 3 or
1845         // - a block entry at level 1 or 2
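        // e.g. with a 4KB granule: level 3 yields 1 << 12 = 4 KiB pages,
        // level 2 yields 1 << 21 = 2 MiB blocks.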
1846         let page_size = 1u64 << ((stride * (4 - level)) + 3);
1847         descaddr &= !(page_size - 1);
1848         descaddr |= gva & (page_size - 1);
1849 
1850         Ok(descaddr)
1851     }
1852 
1853     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1854         self.acpi_address = Some(acpi_address);
1855     }
1856 
1857     pub(crate) fn set_interrupt_controller(
1858         &mut self,
1859         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1860     ) {
1861         self.interrupt_controller = Some(interrupt_controller);
1862     }
1863 
1864     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1865         &self.vcpus_kill_signalled
1866     }
1867 
1868     #[cfg(feature = "igvm")]
1869     pub(crate) fn get_cpuid_leaf(
1870         &self,
1871         cpu_id: u8,
1872         eax: u32,
1873         ecx: u32,
1874         xfem: u64,
1875         xss: u64,
1876     ) -> Result<[u32; 4]> {
1877         let leaf_info = self.vcpus[usize::from(cpu_id)]
1878             .lock()
1879             .unwrap()
1880             .vcpu
1881             .get_cpuid_values(eax, ecx, xfem, xss)
1882             .unwrap();
1883         Ok(leaf_info)
1884     }
1885 
1886     #[cfg(feature = "sev_snp")]
1887     pub(crate) fn sev_snp_enabled(&self) -> bool {
1888         self.sev_snp_enabled
1889     }
1890 
1891     pub(crate) fn nmi(&self) -> Result<()> {
1892         self.vcpus_kick_signalled.store(true, Ordering::SeqCst);
1893 
1894         for state in self.vcpu_states.iter() {
1895             state.signal_thread();
1896         }
1897 
1898         self.vcpus_kick_signalled.store(false, Ordering::SeqCst);
1899 
1900         Ok(())
1901     }
1902 }
1903 
1904 struct Cpu {
1905     cpu_id: u8,
1906     proximity_domain: u32,
1907     dynamic: bool,
1908     #[cfg(target_arch = "x86_64")]
1909     topology: Option<(u8, u8, u8)>,
1910 }
1911 
1912 #[cfg(target_arch = "x86_64")]
1913 const MADT_CPU_ENABLE_FLAG: usize = 0;
1914 
1915 #[cfg(target_arch = "x86_64")]
1916 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
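// Per the ACPI specification, these constants are bit positions in the MADT
// Local APIC/x2APIC "Flags" field: bit 0 is "Enabled" and bit 1 is
// "Online Capable".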
1917 
1918 impl Cpu {
1919     #[cfg(target_arch = "x86_64")]
1920     fn generate_mat(&self) -> Vec<u8> {
1921         let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);
1922 
1923         let lapic = LocalX2Apic {
1924             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1925             length: 16,
1926             processor_id: self.cpu_id.into(),
1927             apic_id: x2apic_id,
1928             flags: 1 << MADT_CPU_ENABLE_FLAG,
1929             _reserved: 0,
1930         };
1931 
1932         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1933         // SAFETY: mat_data is large enough to hold lapic
1934         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
1935 
1936         mat_data
1937     }
1938 }
1939 
1940 impl Aml for Cpu {
1941     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1942         #[cfg(target_arch = "x86_64")]
1943         let mat_data: Vec<u8> = self.generate_mat();
1944         #[allow(clippy::if_same_then_else)]
1945         if self.dynamic {
1946             aml::Device::new(
1947                 format!("C{:03X}", self.cpu_id).as_str().into(),
1948                 vec![
1949                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1950                     &aml::Name::new("_UID".into(), &self.cpu_id),
1951                     // Currently, AArch64 cannot support the following fields.
1952                     /*
1953                     _STA return value:
1954                     Bit [0] – Set if the device is present.
1955                     Bit [1] – Set if the device is enabled and decoding its resources.
1956                     Bit [2] – Set if the device should be shown in the UI.
1957                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1958                     Bit [4] – Set if the battery is present.
1959                     Bits [31:5] – Reserved (must be cleared).
1960                     */
1961                     #[cfg(target_arch = "x86_64")]
1962                     &aml::Method::new(
1963                         "_STA".into(),
1964                         0,
1965                         false,
1966                         // Call into CSTA method which will interrogate device
1967                         vec![&aml::Return::new(&aml::MethodCall::new(
1968                             "CSTA".into(),
1969                             vec![&self.cpu_id],
1970                         ))],
1971                     ),
1972                     &aml::Method::new(
1973                         "_PXM".into(),
1974                         0,
1975                         false,
1976                         vec![&aml::Return::new(&self.proximity_domain)],
1977                     ),
1978                     // The Linux kernel expects every CPU device to have a _MAT entry
1979                     // containing the LAPIC for this processor with the enabled bit set
1980                     // even if it is disabled in the MADT (non-boot CPU)
1981                     #[cfg(target_arch = "x86_64")]
1982                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1983                     // Trigger CPU ejection
1984                     #[cfg(target_arch = "x86_64")]
1985                     &aml::Method::new(
1986                         "_EJ0".into(),
1987                         1,
1988                         false,
1989                         // Call into CEJ0 method which will actually eject device
1990                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1991                     ),
1992                 ],
1993             )
1994             .to_aml_bytes(sink);
1995         } else {
1996             aml::Device::new(
1997                 format!("C{:03X}", self.cpu_id).as_str().into(),
1998                 vec![
1999                     &aml::Name::new("_HID".into(), &"ACPI0007"),
2000                     &aml::Name::new("_UID".into(), &self.cpu_id),
2001                     #[cfg(target_arch = "x86_64")]
2002                     &aml::Method::new(
2003                         "_STA".into(),
2004                         0,
2005                         false,
2006                         // Mark the CPU as present; see the CSTA implementation
2007                         vec![&aml::Return::new(&0xfu8)],
2008                     ),
2009                     &aml::Method::new(
2010                         "_PXM".into(),
2011                         0,
2012                         false,
2013                         vec![&aml::Return::new(&self.proximity_domain)],
2014                     ),
2015                     // The Linux kernel expects every CPU device to have a _MAT entry
2016                     // containing the LAPIC for this processor with the enabled bit set
2017                     // even if it is disabled in the MADT (non-boot CPU)
2018                     #[cfg(target_arch = "x86_64")]
2019                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
2020                 ],
2021             )
2022             .to_aml_bytes(sink);
2023         }
2024     }
2025 }
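
// For illustration only (approximate, hand-decompiled, not part of the
// original source): the AML emitted above for a dynamic CPU with id 1 on
// x86_64 corresponds roughly to the following ASL:
//
//     Device (C001) {
//         Name (_HID, "ACPI0007")
//         Name (_UID, One)
//         Method (_STA) { Return (CSTA (One)) }
//         Method (_PXM) { Return (<proximity domain>) }
//         Name (_MAT, Buffer () { /* Local x2APIC structure */ })
//         Method (_EJ0, 1) { CEJ0 (One) }
//     }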
2026 
2027 struct CpuNotify {
2028     cpu_id: u8,
2029 }
2030 
2031 impl Aml for CpuNotify {
2032     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2033         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
2034         aml::If::new(
2035             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
2036             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2037         )
2038         .to_aml_bytes(sink)
2039     }
2040 }
2041 
2042 struct CpuMethods {
2043     max_vcpus: u8,
2044     dynamic: bool,
2045 }
2046 
2047 impl Aml for CpuMethods {
2048     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2049         if self.dynamic {
2050             // CPU status method
2051             aml::Method::new(
2052                 "CSTA".into(),
2053                 1,
2054                 true,
2055                 vec![
2056                     // Take lock defined above
2057                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2058                     // Write CPU number (in first argument) to I/O port via field
2059                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2060                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2061                     // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for the meaning)
2062                     &aml::If::new(
2063                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
2064                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2065                     ),
2066                     // Release lock
2067                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2068                     // Return 0 or 0xf
2069                     &aml::Return::new(&aml::Local(0)),
2070                 ],
2071             )
2072             .to_aml_bytes(sink);
2073 
2074             let mut cpu_notifies = Vec::new();
2075             for cpu_id in 0..self.max_vcpus {
2076                 cpu_notifies.push(CpuNotify { cpu_id });
2077             }
2078 
2079             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
2080             for cpu_id in 0..self.max_vcpus {
2081                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
2082             }
2083 
2084             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
2085 
2086             aml::Method::new(
2087                 "CEJ0".into(),
2088                 1,
2089                 true,
2090                 vec![
2091                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2092                     // Write CPU number (in first argument) to I/O port via field
2093                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2094                     // Set CEJ0 bit
2095                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
2096                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2097                 ],
2098             )
2099             .to_aml_bytes(sink);
2100 
2101             aml::Method::new(
2102                 "CSCN".into(),
2103                 0,
2104                 true,
2105                 vec![
2106                     // Take lock defined above
2107                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2108                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2109                     &aml::While::new(
2110                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
2111                         vec![
2112                             // Write CPU number (in first argument) to I/O port via field
2113                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
2114                             // Check if CINS bit is set
2115                             &aml::If::new(
2116                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
2117                                 // Notify device if it is
2118                                 vec![
2119                                     &aml::MethodCall::new(
2120                                         "CTFY".into(),
2121                                         vec![&aml::Local(0), &aml::ONE],
2122                                     ),
2123                                     // Reset CINS bit
2124                                     &aml::Store::new(
2125                                         &aml::Path::new("\\_SB_.PRES.CINS"),
2126                                         &aml::ONE,
2127                                     ),
2128                                 ],
2129                             ),
2130                             // Check if CRMV bit is set
2131                             &aml::If::new(
2132                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
2133                                 // Notify device if it is (with the eject constant 0x3)
2134                                 vec![
2135                                     &aml::MethodCall::new(
2136                                         "CTFY".into(),
2137                                         vec![&aml::Local(0), &3u8],
2138                                     ),
2139                                     // Reset CRMV bit
2140                                     &aml::Store::new(
2141                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
2142                                         &aml::ONE,
2143                                     ),
2144                                 ],
2145                             ),
2146                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2147                         ],
2148                     ),
2149                     // Release lock
2150                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2151                 ],
2152             )
2153             .to_aml_bytes(sink)
2154         } else {
2155             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2156         }
2157     }
2158 }
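
// Flow sketch derived from the methods above: on hotplug the VMM marks the
// new vCPU as inserting and raises an ACPI notification; the guest then runs
// CSCN, which walks every possible CPU number through CSEL, tests the
// CINS/CRMV bits for each, calls CTFY to Notify() the matching C### device
// (1 = device check, 3 = eject request) and acknowledges by writing the bit
// back.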
2159 
2160 impl Aml for CpuManager {
2161     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2162         #[cfg(target_arch = "x86_64")]
2163         if let Some(acpi_address) = self.acpi_address {
2164             // CPU hotplug controller
2165             aml::Device::new(
2166                 "_SB_.PRES".into(),
2167                 vec![
2168                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2169                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2170                     // Mutex to protect concurrent access as we write to choose CPU and then read back status
2171                     &aml::Mutex::new("CPLK".into(), 0),
2172                     &aml::Name::new(
2173                         "_CRS".into(),
2174                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2175                             aml::AddressSpaceCacheable::NotCacheable,
2176                             true,
2177                             acpi_address.0,
2178                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2179                             None,
2180                         )]),
2181                     ),
2182                     // OpRegion and Fields map MMIO range into individual field values
2183                     &aml::OpRegion::new(
2184                         "PRST".into(),
2185                         aml::OpRegionSpace::SystemMemory,
2186                         &(acpi_address.0 as usize),
2187                         &CPU_MANAGER_ACPI_SIZE,
2188                     ),
2189                     &aml::Field::new(
2190                         "PRST".into(),
2191                         aml::FieldAccessType::Byte,
2192                         aml::FieldLockRule::NoLock,
2193                         aml::FieldUpdateRule::WriteAsZeroes,
2194                         vec![
2195                             aml::FieldEntry::Reserved(32),
2196                             aml::FieldEntry::Named(*b"CPEN", 1),
2197                             aml::FieldEntry::Named(*b"CINS", 1),
2198                             aml::FieldEntry::Named(*b"CRMV", 1),
2199                             aml::FieldEntry::Named(*b"CEJ0", 1),
2200                             aml::FieldEntry::Reserved(4),
2201                             aml::FieldEntry::Named(*b"CCMD", 8),
2202                         ],
2203                     ),
2204                     &aml::Field::new(
2205                         "PRST".into(),
2206                         aml::FieldAccessType::DWord,
2207                         aml::FieldLockRule::NoLock,
2208                         aml::FieldUpdateRule::Preserve,
2209                         vec![
2210                             aml::FieldEntry::Named(*b"CSEL", 32),
2211                             aml::FieldEntry::Reserved(32),
2212                             aml::FieldEntry::Named(*b"CDAT", 32),
2213                         ],
2214                     ),
2215                 ],
2216             )
2217             .to_aml_bytes(sink);
2218         }
2219 
2220         // CPU devices
2221         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2222         let cid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2223         // Bundle methods together under a common object
2224         let methods = CpuMethods {
2225             max_vcpus: self.config.max_vcpus,
2226             dynamic: self.dynamic,
2227         };
2228         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &cid, &methods];
2229 
2230         #[cfg(target_arch = "x86_64")]
2231         let topology = self.get_vcpu_topology();
2232         let mut cpu_devices = Vec::new();
2233         for cpu_id in 0..self.config.max_vcpus {
2234             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2235             let cpu_device = Cpu {
2236                 cpu_id,
2237                 proximity_domain,
2238                 dynamic: self.dynamic,
2239                 #[cfg(target_arch = "x86_64")]
2240                 topology,
2241             };
2242 
2243             cpu_devices.push(cpu_device);
2244         }
2245 
2246         for cpu_device in cpu_devices.iter() {
2247             cpu_data_inner.push(cpu_device);
2248         }
2249 
2250         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2251     }
2252 }
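
// Resulting ACPI namespace, for orientation (not part of the original
// source): \_SB_.PRES is the MMIO-backed hotplug controller (the CPLK mutex
// plus the CPEN/CINS/CRMV/CEJ0/CCMD and CSEL/CDAT fields over the PRST
// OpRegion), while \_SB_.CPUS bundles the CSTA/CTFY/CEJ0/CSCN methods with
// one C### device per possible vCPU.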
2253 
2254 impl Pausable for CpuManager {
2255     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2256         // Tell the vCPUs to pause themselves next time they exit
2257         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2258 
2259         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2260         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2261         // above.
2262         for state in self.vcpu_states.iter() {
2263             state.signal_thread();
2264         }
2265 
2266         for vcpu in self.vcpus.iter() {
2267             let mut vcpu = vcpu.lock().unwrap();
2268             vcpu.pause()?;
2269             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2270             if !self.config.kvm_hyperv {
2271                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2272                     MigratableError::Pause(anyhow!(
2273                         "Could not notify guest it has been paused {:?}",
2274                         e
2275                     ))
2276                 })?;
2277             }
2278         }
2279 
2280         // The vCPU thread will change its paused state before parking; wait here
2281         // for each activated vCPU to change its state, to ensure they have all parked.
2282         for state in self.vcpu_states.iter() {
2283             if state.active() {
2284                 while !state.paused.load(Ordering::SeqCst) {
2285                     // To avoid a priority inversion with the vCPU thread
2286                     thread::sleep(std::time::Duration::from_millis(1));
2287                 }
2288             }
2289         }
2290 
2291         Ok(())
2292     }
2293 
2294     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2295         for vcpu in self.vcpus.iter() {
2296             vcpu.lock().unwrap().resume()?;
2297         }
2298 
2299         // Clear the vCPUs pause boolean
2300         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2301 
2302         // Unpark all the vCPU threads.
2303         // Once unparked, the next thing they will do is check the pause
2304         // boolean. Since it will be set to false, they will exit their pause loop
2305         // and go back to running the guest.
2306         for state in self.vcpu_states.iter() {
2307             state.paused.store(false, Ordering::SeqCst);
2308             state.unpark_thread();
2309         }
2310         Ok(())
2311     }
2312 }
2313 
2314 impl Snapshottable for CpuManager {
2315     fn id(&self) -> String {
2316         CPU_MANAGER_SNAPSHOT_ID.to_string()
2317     }
2318 
2319     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2320         let mut cpu_manager_snapshot = Snapshot::default();
2321 
2322         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2323         for vcpu in &self.vcpus {
2324             let mut vcpu = vcpu.lock().unwrap();
2325             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2326         }
2327 
2328         Ok(cpu_manager_snapshot)
2329     }
2330 }
2331 
2332 impl Transportable for CpuManager {}
2333 impl Migratable for CpuManager {}
2334 
2335 #[cfg(feature = "guest_debug")]
2336 impl Debuggable for CpuManager {
2337     #[cfg(feature = "kvm")]
2338     fn set_guest_debug(
2339         &self,
2340         cpu_id: usize,
2341         addrs: &[GuestAddress],
2342         singlestep: bool,
2343     ) -> std::result::Result<(), DebuggableError> {
2344         self.vcpus[cpu_id]
2345             .lock()
2346             .unwrap()
2347             .vcpu
2348             .set_guest_debug(addrs, singlestep)
2349             .map_err(DebuggableError::SetDebug)
2350     }
2351 
2352     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2353         Ok(())
2354     }
2355 
2356     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2357         Ok(())
2358     }
2359 
2360     #[cfg(target_arch = "x86_64")]
2361     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2362         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2363         let gregs = self
2364             .get_regs(cpu_id as u8)
2365             .map_err(DebuggableError::ReadRegs)?;
2366         let regs = [
2367             gregs.get_rax(),
2368             gregs.get_rbx(),
2369             gregs.get_rcx(),
2370             gregs.get_rdx(),
2371             gregs.get_rsi(),
2372             gregs.get_rdi(),
2373             gregs.get_rbp(),
2374             gregs.get_rsp(),
2375             gregs.get_r8(),
2376             gregs.get_r9(),
2377             gregs.get_r10(),
2378             gregs.get_r11(),
2379             gregs.get_r12(),
2380             gregs.get_r13(),
2381             gregs.get_r14(),
2382             gregs.get_r15(),
2383         ];
2384 
2385         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2386         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2387         let eflags = gregs.get_rflags() as u32;
2388         let rip = gregs.get_rip();
2389 
2390         // Segment registers: CS, SS, DS, ES, FS, GS
2391         let sregs = self
2392             .get_sregs(cpu_id as u8)
2393             .map_err(DebuggableError::ReadRegs)?;
2394         let segments = X86SegmentRegs {
2395             cs: sregs.cs.selector as u32,
2396             ss: sregs.ss.selector as u32,
2397             ds: sregs.ds.selector as u32,
2398             es: sregs.es.selector as u32,
2399             fs: sregs.fs.selector as u32,
2400             gs: sregs.gs.selector as u32,
2401         };
2402 
2403         // TODO: Add other registers
2404 
2405         Ok(CoreRegs {
2406             regs,
2407             eflags,
2408             rip,
2409             segments,
2410             ..Default::default()
2411         })
2412     }
2413 
2414     #[cfg(target_arch = "aarch64")]
2415     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2416         let gregs = self
2417             .get_regs(cpu_id as u8)
2418             .map_err(DebuggableError::ReadRegs)?;
2419         Ok(CoreRegs {
2420             x: gregs.get_regs(),
2421             sp: gregs.get_sp(),
2422             pc: gregs.get_pc(),
2423             ..Default::default()
2424         })
2425     }
2426 
2427     #[cfg(target_arch = "x86_64")]
2428     fn write_regs(
2429         &self,
2430         cpu_id: usize,
2431         regs: &CoreRegs,
2432     ) -> std::result::Result<(), DebuggableError> {
2433         let orig_gregs = self
2434             .get_regs(cpu_id as u8)
2435             .map_err(DebuggableError::ReadRegs)?;
2436         let mut gregs = self.create_standard_regs(cpu_id as u8);
2437         gregs.set_rax(regs.regs[0]);
2438         gregs.set_rbx(regs.regs[1]);
2439         gregs.set_rcx(regs.regs[2]);
2440         gregs.set_rdx(regs.regs[3]);
2441         gregs.set_rsi(regs.regs[4]);
2442         gregs.set_rdi(regs.regs[5]);
2443         gregs.set_rbp(regs.regs[6]);
2444         gregs.set_rsp(regs.regs[7]);
2445         gregs.set_r8(regs.regs[8]);
2446         gregs.set_r9(regs.regs[9]);
2447         gregs.set_r10(regs.regs[10]);
2448         gregs.set_r11(regs.regs[11]);
2449         gregs.set_r12(regs.regs[12]);
2450         gregs.set_r13(regs.regs[13]);
2451         gregs.set_r14(regs.regs[14]);
2452         gregs.set_r15(regs.regs[15]);
2453         gregs.set_rip(regs.rip);
2454         // Update the lower 32 bits of rflags.
2455         gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));
2456 
2457         self.set_regs(cpu_id as u8, &gregs)
2458             .map_err(DebuggableError::WriteRegs)?;
2459 
2460         // Segment registers: CS, SS, DS, ES, FS, GS
2461         // Since GDB cares only about the selectors, we call get_sregs() first.
2462         let mut sregs = self
2463             .get_sregs(cpu_id as u8)
2464             .map_err(DebuggableError::ReadRegs)?;
2465         sregs.cs.selector = regs.segments.cs as u16;
2466         sregs.ss.selector = regs.segments.ss as u16;
2467         sregs.ds.selector = regs.segments.ds as u16;
2468         sregs.es.selector = regs.segments.es as u16;
2469         sregs.fs.selector = regs.segments.fs as u16;
2470         sregs.gs.selector = regs.segments.gs as u16;
2471 
2472         self.set_sregs(cpu_id as u8, &sregs)
2473             .map_err(DebuggableError::WriteRegs)?;
2474 
2475         // TODO: Add other registers
2476 
2477         Ok(())
2478     }
2479 
2480     #[cfg(target_arch = "aarch64")]
2481     fn write_regs(
2482         &self,
2483         cpu_id: usize,
2484         regs: &CoreRegs,
2485     ) -> std::result::Result<(), DebuggableError> {
2486         let mut gregs = self
2487             .get_regs(cpu_id as u8)
2488             .map_err(DebuggableError::ReadRegs)?;
2489 
2490         gregs.set_regs(regs.x);
2491         gregs.set_sp(regs.sp);
2492         gregs.set_pc(regs.pc);
2493 
2494         self.set_regs(cpu_id as u8, &gregs)
2495             .map_err(DebuggableError::WriteRegs)?;
2496 
2497         Ok(())
2498     }
2499 
2500     fn read_mem(
2501         &self,
2502         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2503         cpu_id: usize,
2504         vaddr: GuestAddress,
2505         len: usize,
2506     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2507         let mut buf = vec![0; len];
2508         let mut total_read = 0_u64;
2509 
2510         while total_read < len as u64 {
2511             let gaddr = vaddr.0 + total_read;
2512             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2513                 Ok(paddr) => paddr,
2514                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2515                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2516             };
2517             let psize = arch::PAGE_SIZE as u64;
2518             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2519             guest_memory
2520                 .memory()
2521                 .read(
2522                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2523                     GuestAddress(paddr),
2524                 )
2525                 .map_err(DebuggableError::ReadMem)?;
2526             total_read += read_len;
2527         }
2528         Ok(buf)
2529     }
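
    // Worked example (assuming a 4 KiB arch::PAGE_SIZE): a 6000-byte read whose
    // first translation yields paddr 0x1F00 proceeds in chunks of 256 bytes (up
    // to the page end), then 4096, then the remaining 1648 bytes, re-translating
    // the GVA at every page boundary.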
2530 
2531     fn write_mem(
2532         &self,
2533         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2534         cpu_id: usize,
2535         vaddr: &GuestAddress,
2536         data: &[u8],
2537     ) -> std::result::Result<(), DebuggableError> {
2538         let mut total_written = 0_u64;
2539 
2540         while total_written < data.len() as u64 {
2541             let gaddr = vaddr.0 + total_written;
2542             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2543                 Ok(paddr) => paddr,
2544                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2545                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2546             };
2547             let psize = arch::PAGE_SIZE as u64;
2548             let write_len = std::cmp::min(
2549                 data.len() as u64 - total_written,
2550                 psize - (paddr & (psize - 1)),
2551             );
2552             guest_memory
2553                 .memory()
2554                 .write(
2555                     &data[total_written as usize..total_written as usize + write_len as usize],
2556                     GuestAddress(paddr),
2557                 )
2558                 .map_err(DebuggableError::WriteMem)?;
2559             total_written += write_len;
2560         }
2561         Ok(())
2562     }
2563 
2564     fn active_vcpus(&self) -> usize {
2565         self.present_vcpus() as usize
2566     }
2567 }
2568 
2569 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2570 impl Elf64Writable for CpuManager {}
2571 
2572 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2573 impl CpuElf64Writable for CpuManager {
2574     fn cpu_write_elf64_note(
2575         &mut self,
2576         dump_state: &DumpState,
2577     ) -> std::result::Result<(), GuestDebuggableError> {
2578         let mut coredump_file = dump_state.file.as_ref().unwrap();
2579         for vcpu in &self.vcpus {
2580             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2581             let mut pos: usize = 0;
2582             let mut buf = vec![0; note_size as usize];
2583             let descsz = size_of::<X86_64ElfPrStatus>();
2584             let vcpu_id = vcpu.lock().unwrap().id;
2585 
2586             let note = Elf64_Nhdr {
2587                 n_namesz: COREDUMP_NAME_SIZE,
2588                 n_descsz: descsz as u32,
2589                 n_type: NT_PRSTATUS,
2590             };
2591 
2592             let bytes: &[u8] = note.as_slice();
2593             buf.splice(0.., bytes.to_vec());
2594             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2595             buf.resize(pos + 4, 0);
2596             buf.splice(pos.., "CORE".to_string().into_bytes());
2597 
2598             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2599             buf.resize(pos + 32 + 4, 0);
2600             let pid = vcpu_id as u64;
2601             let bytes: &[u8] = pid.as_slice();
2602             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2603 
2604             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2605 
2606             let orig_rax: u64 = 0;
2607             let gregs = self.vcpus[usize::from(vcpu_id)]
2608                 .lock()
2609                 .unwrap()
2610                 .vcpu
2611                 .get_regs()
2612                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2613 
2614             let regs1 = [
2615                 gregs.get_r15(),
2616                 gregs.get_r14(),
2617                 gregs.get_r13(),
2618                 gregs.get_r12(),
2619                 gregs.get_rbp(),
2620                 gregs.get_rbx(),
2621                 gregs.get_r11(),
2622                 gregs.get_r10(),
2623             ];
2624             let regs2 = [
2625                 gregs.get_r9(),
2626                 gregs.get_r8(),
2627                 gregs.get_rax(),
2628                 gregs.get_rcx(),
2629                 gregs.get_rdx(),
2630                 gregs.get_rsi(),
2631                 gregs.get_rdi(),
2632                 orig_rax,
2633             ];
2634 
2635             let sregs = self.vcpus[usize::from(vcpu_id)]
2636                 .lock()
2637                 .unwrap()
2638                 .vcpu
2639                 .get_sregs()
2640                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2641 
2642             debug!(
2643                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2644                 gregs.get_rip(),
2645                 gregs.get_rsp(),
2646                 sregs.gs.base,
2647                 sregs.cs.selector,
2648                 sregs.ss.selector,
2649                 sregs.ds.selector,
2650             );
2651 
2652             let regs = X86_64UserRegs {
2653                 regs1,
2654                 regs2,
2655                 rip: gregs.get_rip(),
2656                 cs: sregs.cs.selector as u64,
2657                 eflags: gregs.get_rflags(),
2658                 rsp: gregs.get_rsp(),
2659                 ss: sregs.ss.selector as u64,
2660                 fs_base: sregs.fs.base,
2661                 gs_base: sregs.gs.base,
2662                 ds: sregs.ds.selector as u64,
2663                 es: sregs.es.selector as u64,
2664                 fs: sregs.fs.selector as u64,
2665                 gs: sregs.gs.selector as u64,
2666             };
2667 
2669             let bytes: &[u8] = regs.as_slice();
2670             buf.resize(note_size as usize, 0);
2671             buf.splice(pos.., bytes.to_vec());
2672             buf.resize(note_size as usize, 0);
2673 
2674             coredump_file
2675                 .write(&buf)
2676                 .map_err(GuestDebuggableError::CoredumpFile)?;
2677         }
2678 
2679         Ok(())
2680     }
2681 
2682     fn cpu_write_vmm_note(
2683         &mut self,
2684         dump_state: &DumpState,
2685     ) -> std::result::Result<(), GuestDebuggableError> {
2686         let mut coredump_file = dump_state.file.as_ref().unwrap();
2687         for vcpu in &self.vcpus {
2688             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2689             let mut pos: usize = 0;
2690             let mut buf = vec![0; note_size as usize];
2691             let descsz = size_of::<DumpCpusState>();
2692             let vcpu_id = vcpu.lock().unwrap().id;
2693 
2694             let note = Elf64_Nhdr {
2695                 n_namesz: COREDUMP_NAME_SIZE,
2696                 n_descsz: descsz as u32,
2697                 n_type: 0,
2698             };
2699 
2700             let bytes: &[u8] = note.as_slice();
2701             buf.splice(0.., bytes.to_vec());
2702             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2703 
2704             buf.resize(pos + 4, 0);
2705             buf.splice(pos.., "QEMU".to_string().into_bytes());
2706 
2707             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2708 
2709             let gregs = self.vcpus[usize::from(vcpu_id)]
2710                 .lock()
2711                 .unwrap()
2712                 .vcpu
2713                 .get_regs()
2714                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2715 
2716             let regs1 = [
2717                 gregs.get_rax(),
2718                 gregs.get_rbx(),
2719                 gregs.get_rcx(),
2720                 gregs.get_rdx(),
2721                 gregs.get_rsi(),
2722                 gregs.get_rdi(),
2723                 gregs.get_rsp(),
2724                 gregs.get_rbp(),
2725             ];
2726 
2727             let regs2 = [
2728                 gregs.get_r8(),
2729                 gregs.get_r9(),
2730                 gregs.get_r10(),
2731                 gregs.get_r11(),
2732                 gregs.get_r12(),
2733                 gregs.get_r13(),
2734                 gregs.get_r14(),
2735                 gregs.get_r15(),
2736             ];
2737 
2738             let sregs = self.vcpus[usize::from(vcpu_id)]
2739                 .lock()
2740                 .unwrap()
2741                 .vcpu
2742                 .get_sregs()
2743                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2744 
2745             let mut msrs = vec![MsrEntry {
2746                 index: msr_index::MSR_KERNEL_GS_BASE,
2747                 ..Default::default()
2748             }];
2749 
2750             self.vcpus[vcpu_id as usize]
2751                 .lock()
2752                 .unwrap()
2753                 .vcpu
2754                 .get_msrs(&mut msrs)
2755                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2756             let kernel_gs_base = msrs[0].data;
2757 
2758             let cs = CpuSegment::new(sregs.cs);
2759             let ds = CpuSegment::new(sregs.ds);
2760             let es = CpuSegment::new(sregs.es);
2761             let fs = CpuSegment::new(sregs.fs);
2762             let gs = CpuSegment::new(sregs.gs);
2763             let ss = CpuSegment::new(sregs.ss);
2764             let ldt = CpuSegment::new(sregs.ldt);
2765             let tr = CpuSegment::new(sregs.tr);
2766             let gdt = CpuSegment::new_from_table(sregs.gdt);
2767             let idt = CpuSegment::new_from_table(sregs.idt);
2768             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2769             let regs = DumpCpusState {
2770                 version: 1,
2771                 size: size_of::<DumpCpusState>() as u32,
2772                 regs1,
2773                 regs2,
2774                 rip: gregs.get_rip(),
2775                 rflags: gregs.get_rflags(),
2776                 cs,
2777                 ds,
2778                 es,
2779                 fs,
2780                 gs,
2781                 ss,
2782                 ldt,
2783                 tr,
2784                 gdt,
2785                 idt,
2786                 cr,
2787                 kernel_gs_base,
2788             };
2789 
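            // Serialize the CPU state into the descriptor area, then pad the
            // buffer back out to the full note size.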
2790             let bytes: &[u8] = regs.as_slice();
2791             buf.resize(note_size as usize, 0);
2792             buf.splice(pos.., bytes.to_vec());
2793             buf.resize(note_size as usize, 0);
2794 
2795             coredump_file
2796                 .write_all(&buf)
2797                 .map_err(GuestDebuggableError::CoredumpFile)?;
2798         }
2799 
2800         Ok(())
2801     }
2802 }
2803 
2804 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2805 #[cfg(test)]
2806 mod tests {
2807     use arch::layout::BOOT_STACK_POINTER;
2808     use arch::layout::ZERO_PAGE_START;
2809     use arch::x86_64::interrupts::*;
2810     use arch::x86_64::regs::*;
2811     use hypervisor::arch::x86::{FpuState, LapicState};
2812     use hypervisor::StandardRegisters;
2813     use linux_loader::loader::bootparam::setup_header;
2814 
2815     #[test]
2816     fn test_setlint() {
2817         let hv = hypervisor::new().unwrap();
2818         let vm = hv.create_vm().expect("new VM fd creation failed");
2819         assert!(hv.check_required_extensions().is_ok());
2820         // Calling get_lapic will fail if no irqchip has been created beforehand.
2821         assert!(vm.create_irq_chip().is_ok());
2822         let vcpu = vm.create_vcpu(0, None).unwrap();
2823         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2824 
2825         // Compute the value that is expected to represent LVT0 and LVT1.
2826         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2827         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2828         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2829         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2830 
2831         set_lint(&vcpu).unwrap();
2832 
2833         // Compute the value that represents LVT0 and LVT1 after set_lint.
2834         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2835         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2836         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2837         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2838         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2839     }
2840 
2841     #[test]
2842     fn test_setup_fpu() {
2843         let hv = hypervisor::new().unwrap();
2844         let vm = hv.create_vm().expect("new VM fd creation failed");
2845         let vcpu = vm.create_vcpu(0, None).unwrap();
2846         setup_fpu(&vcpu).unwrap();
2847 
2848         let expected_fpu: FpuState = FpuState {
2849             fcw: 0x37f,
2850             mxcsr: 0x1f80,
2851             ..Default::default()
2852         };
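        // 0x37f and 0x1f80 are the architectural reset defaults for the x87
        // control word and MXCSR, respectively.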
2853         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2854         // TODO: auto-generate kvm-related structures with PartialEq derived.
2855         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2856         // Setting the mxcsr register from FpuState inside setup_fpu has no effect:
2857         // see 'kvm_arch_vcpu_ioctl_set_fpu' in arch/x86/kvm/x86.c. The mxcsr read
2858         // back stays 0, so the assert below would fail. TODO: decide whether this
2859         // check should be removed entirely.
2860         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2861     }
2862 
2863     #[test]
2864     fn test_setup_msrs() {
2865         use hypervisor::arch::x86::{msr_index, MsrEntry};
2866 
2867         let hv = hypervisor::new().unwrap();
2868         let vm = hv.create_vm().expect("new VM fd creation failed");
2869         let vcpu = vm.create_vcpu(0, None).unwrap();
2870         setup_msrs(&vcpu).unwrap();
2871 
2872         // This test will check against the last MSR entry configured (the tenth one).
2873         // See create_msr_entries for details.
2874         let mut msrs = vec![MsrEntry {
2875             index: msr_index::MSR_IA32_MISC_ENABLE,
2876             ..Default::default()
2877         }];
2878 
2879         // get_msrs returns the number of MSRs it succeeded in reading. We only want
2880         // to read one in this test.
2881         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2882         assert_eq!(read_msrs, 1);
2883 
2884         // Official entries that were set up when setup_msrs ran. We need to assert
2885         // that the tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE)
2886         // has the data we expect.
2887         let entry_vec = vcpu.boot_msr_entries();
2888         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2889     }
2890 
2891     #[test]
2892     fn test_setup_regs_for_pvh() {
2893         let hv = hypervisor::new().unwrap();
2894         let vm = hv.create_vm().expect("new VM fd creation failed");
2895         let vcpu = vm.create_vcpu(0, None).unwrap();
2896 
2897         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2898         expected_regs.set_rflags(0x0000000000000002u64);
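        // Bit 1 of RFLAGS is reserved and always reads back as set, hence the
        // expected value of 0x2.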
2899         expected_regs.set_rbx(arch::layout::PVH_INFO_START.0);
2900         expected_regs.set_rip(1);
2901 
2902         setup_regs(
2903             &vcpu,
2904             arch::EntryPoint {
2905                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2906                 setup_header: None,
2907             },
2908         )
2909         .unwrap();
2910 
2911         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2912         assert_eq!(actual_regs, expected_regs);
2913     }
2914 
2915     #[test]
2916     fn test_setup_regs_for_bzimage() {
2917         let hv = hypervisor::new().unwrap();
2918         let vm = hv.create_vm().expect("new VM fd creation failed");
2919         let vcpu = vm.create_vcpu(0, None).unwrap();
2920 
2921         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2922         expected_regs.set_rflags(0x0000000000000002u64);
2923         expected_regs.set_rip(1);
2924         expected_regs.set_rsp(BOOT_STACK_POINTER.0);
2925         expected_regs.set_rsi(ZERO_PAGE_START.0);
2926 
2927         setup_regs(
2928             &vcpu,
2929             arch::EntryPoint {
2930                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2931                 setup_header: Some(setup_header {
2932                     ..Default::default()
2933                 }),
2934             },
2935         )
2936         .unwrap();
2937 
2938         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2939         assert_eq!(actual_regs, expected_regs);
2940     }
2941 }
2942 
2943 #[cfg(target_arch = "aarch64")]
2944 #[cfg(test)]
2945 mod tests {
2946     use arch::{aarch64::regs, layout};
2947     use hypervisor::kvm::aarch64::is_system_register;
2948     use hypervisor::kvm::kvm_bindings::{
2949         kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE,
2950         KVM_REG_SIZE_U64,
2951     };
2952     use hypervisor::{arm64_core_reg_id, offset_of};
2953     use std::mem;
2954 
2955     #[test]
2956     fn test_setup_regs() {
2957         let hv = hypervisor::new().unwrap();
2958         let vm = hv.create_vm().unwrap();
2959         let vcpu = vm.create_vcpu(0, None).unwrap();
2960 
2961         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2962         // Must fail while the vcpu is not yet initialized.
2963         assert!(res.is_err());
2964 
2965         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2966         vm.get_preferred_target(&mut kvi).unwrap();
2967         vcpu.vcpu_init(&kvi).unwrap();
2968 
2969         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2970     }
2971 
2972     #[test]
2973     fn test_read_mpidr() {
2974         let hv = hypervisor::new().unwrap();
2975         let vm = hv.create_vm().unwrap();
2976         let vcpu = vm.create_vcpu(0, None).unwrap();
2977         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2978         vm.get_preferred_target(&mut kvi).unwrap();
2979 
2980         // Must fail while the vcpu is not yet initialized.
2981         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2982 
2983         vcpu.vcpu_init(&kvi).unwrap();
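        // Bit 31 of MPIDR_EL1 is RES1, so vcpu 0 (all affinity fields zero)
        // reads back as 0x80000000.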
2984         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2985     }
2986 
2987     #[test]
2988     fn test_is_system_register() {
2989         let offset = offset_of!(user_pt_regs, pc);
2990         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2991         assert!(!is_system_register(regid));
2992         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2993         assert!(is_system_register(regid));
2994     }
2995 
2996     #[test]
2997     fn test_save_restore_core_regs() {
2998         let hv = hypervisor::new().unwrap();
2999         let vm = hv.create_vm().unwrap();
3000         let vcpu = vm.create_vcpu(0, None).unwrap();
3001         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
3002         vm.get_preferred_target(&mut kvi).unwrap();
3003 
3004         // Must fail with ENOEXEC while the vcpu has not been initialized via KVM_ARM_VCPU_INIT.
3005         let res = vcpu.get_regs();
3006         assert!(res.is_err());
3007         assert_eq!(
3008             format!("{}", res.unwrap_err()),
3009             "Failed to get core register: Exec format error (os error 8)"
3010         );
3011 
3012         let mut state = vcpu.create_standard_regs();
3013         let res = vcpu.set_regs(&state);
3014         assert!(res.is_err());
3015         assert_eq!(
3016             format!("{}", res.unwrap_err()),
3017             "Failed to set core register: Exec format error (os error 8)"
3018         );
3019 
3020         vcpu.vcpu_init(&kvi).unwrap();
3021         let res = vcpu.get_regs();
3022         assert!(res.is_ok());
3023         state = res.unwrap();
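        // 0x3C5 encodes the reset PSTATE: EL1h (0b0101) with the D, A, I and
        // F exception masks set.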
3024         assert_eq!(state.get_pstate(), 0x3C5);
3025 
3026         assert!(vcpu.set_regs(&state).is_ok());
3027     }
3028 
3029     #[test]
3030     fn test_get_set_mpstate() {
3031         let hv = hypervisor::new().unwrap();
3032         let vm = hv.create_vm().unwrap();
3033         let vcpu = vm.create_vcpu(0, None).unwrap();
3034         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
3035         vm.get_preferred_target(&mut kvi).unwrap();
3036 
3037         let res = vcpu.get_mp_state();
3038         assert!(res.is_ok());
3039         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
3040     }
3041 }
3042