// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use std::collections::BTreeMap;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::io::Write;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};

use acpi_tables::sdt::Sdt;
use acpi_tables::{aml, Aml};
use anyhow::anyhow;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use arch::aarch64::regs;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::get_x2apic_id;
use arch::{EntryPoint, NumaNodes};
#[cfg(target_arch = "aarch64")]
use devices::gic::Gic;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use hypervisor::arch::x86::SpecialRegisters;
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(all(target_arch = "aarch64", feature = "kvm"))]
use hypervisor::kvm::kvm_ioctls::Cap;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuVendor;
#[cfg(feature = "kvm")]
use hypervisor::HypervisorType;
#[cfg(feature = "guest_debug")]
use hypervisor::StandardRegisters;
use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::BusDevice;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use vm_memory::ByteValued;
#[cfg(feature = "guest_debug")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
use zerocopy::AsBytes;

use crate::config::CpusConfig;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
#[cfg(feature = "guest_debug")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
#[cfg(target_arch = "x86_64")]
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID};

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
/// Extract the specified bits of a 64-bit integer.
/// For example, to extract 2 bits from offset 1 (zero-based) of `6u64`,
/// the following expression should return 3 (`0b11`):
/// `extract_bits_64!(0b0000_0110u64, 1, 2)`
///
macro_rules! extract_bits_64 {
    ($value: tt, $offset: tt, $length: tt) => {
        ($value >> $offset) & (!0u64 >> (64 - $length))
    };
}

#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
macro_rules! extract_bits_64_without_offset {
    ($value: tt, $length: tt) => {
        $value & (!0u64 >> (64 - $length))
    };
}
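
#[cfg(all(target_arch = "aarch64", feature = "guest_debug", test))]
mod bit_extraction_examples {
    // Illustrative checks of the two macros above (a sketch, not part of the
    // upstream test suite): extract two bits at offset 1 of 0b0110, then the
    // low two bits without an offset.
    #[test]
    fn extract_bits_examples() {
        assert_eq!(extract_bits_64!(0b0000_0110u64, 1, 2), 0b11);
        assert_eq!(extract_bits_64_without_offset!(0b0000_0110u64, 2), 0b10);
    }
}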

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[error("A vCPU removal is still pending")]
    VcpuPendingRemovedVcpu,

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error finalising vCPU: {0}")]
    VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(feature = "guest_debug")]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] anyhow::Error),

    #[cfg(target_arch = "x86_64")]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),

    #[error("Maximum number of vCPUs exceeds host limit")]
    MaximumVcpusExceeded,

    #[cfg(feature = "sev_snp")]
    #[error("Failed to set sev control register: {0}")]
    SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "x86_64")]
    #[error("Failed to inject NMI")]
    NmiError(hypervisor::HypervisorCpuError),
}

pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct LocalX2Apic {
    pub r#type: u8,
    pub length: u8,
    pub _reserved: u16,
    pub apic_id: u32,
    pub flags: u32,
    pub processor_id: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default, AsBytes)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
#[derive(AsBytes)]
struct ProcessorHierarchyNode {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub flags: u32,
    pub parent: u32,
    pub acpi_processor_id: u32,
    pub num_private_resources: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default, AsBytes)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

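// For example, round_up!(9usize, 4) evaluates to 12 and round_up!(8usize, 4)
// stays at 8: `$n` is rounded up to the next multiple of `$d`.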
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
macro_rules! round_up {
    ($n:expr,$d:expr) => {
        (($n + $d - 1) / $d) * $d
    };
}

/// A wrapper around creating and using a kvm-based VCPU.
pub struct Vcpu {
    // The hypervisor abstracted CPU.
    vcpu: Arc<dyn hypervisor::Vcpu>,
    id: u8,
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    saved_state: Option<CpuState>,
    #[cfg(target_arch = "x86_64")]
    vendor: CpuVendor,
}

impl Vcpu {
    /// Constructs a new VCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `apic_id` - The APIC ID with which the vCPU is created on the
    ///   hypervisor (on x86_64, the x2APIC ID derived from the topology).
    /// * `vm` - The virtual machine this vCPU will get attached to.
    /// * `vm_ops` - Optional object for exit handling.
    /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
    pub fn new(
        id: u8,
        apic_id: u8,
        vm: &Arc<dyn hypervisor::Vm>,
        vm_ops: Option<Arc<dyn VmOps>>,
        #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
    ) -> Result<Self> {
        let vcpu = vm
            .create_vcpu(apic_id, vm_ops)
            .map_err(|e| Error::VcpuCreate(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
            #[cfg(target_arch = "x86_64")]
            vendor: cpu_vendor,
        })
    }

    /// Configures a vCPU; this should be called once per vCPU, right after creation.
    ///
    /// # Arguments
    ///
    /// * `boot_setup` - Optional kernel entry point (with the boot protocol
    ///   used) together with the guest memory it lives in.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
        #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
        #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
                .map_err(Error::VcpuConfiguration)?;
        }
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(
            &self.vcpu,
            self.id,
            boot_setup,
            cpuid,
            kvm_hyperv,
            self.vendor,
            topology,
        )
        .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64-specific vCPU for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        use std::arch::is_aarch64_feature_detected;
        let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
        #[allow(clippy::nonminimal_bool)]
        let sve_supported =
            is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2");
        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;
        // We already checked that the capability is supported.
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
        if vm
            .as_any()
            .downcast_ref::<hypervisor::kvm::KvmVm>()
            .unwrap()
            .check_extension(Cap::ArmPmuV3)
        {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
        }

        if sve_supported
            && vm
                .as_any()
                .downcast_ref::<hypervisor::kvm::KvmVm>()
                .unwrap()
                .check_extension(Cap::ArmSve)
        {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_SVE;
        }

        // Non-boot cpus are powered off initially.
        if self.id > 0 {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
        }
        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?;
        if sve_supported {
            self.vcpu
                .vcpu_finalize(kvm_bindings::KVM_ARM_VCPU_SVE as i32)
                .map_err(Error::VcpuArmFinalize)?;
        }
        Ok(())
    }

    /// Runs the VCPU until it exits, returning the reason.
    ///
    /// Note that the state of the VCPU and associated VM must be set up first for this to do
    /// anything useful.
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }

    #[cfg(feature = "sev_snp")]
    pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
        self.vcpu
            .set_sev_control_register(vmsa_pfn)
            .map_err(Error::SetSevControlRegister)
    }
}

impl Pausable for Vcpu {}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        self.id.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let saved_state = self
            .vcpu
            .state()
            .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;

        self.saved_state = Some(saved_state.clone());

        Ok(Snapshot::from_data(SnapshotData::new_from_state(
            &saved_state,
        )?))
    }
}

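// A rough sketch of the vCPU lifecycle driven by the CpuManager below
// (illustrative only; `cpu_manager` is assumed to come from CpuManager::new()
// and `boot_setup` from the boot loader):
//
//     let vcpus = cpu_manager.lock().unwrap().create_boot_vcpus(None)?;
//     for vcpu in vcpus {
//         cpu_manager.lock().unwrap().configure_vcpu(vcpu, boot_setup)?;
//     }
//     cpu_manager.lock().unwrap().start_boot_vcpus(false)?;
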
pub struct CpuManager {
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg(target_arch = "x86_64")]
    cpuid: Vec<CpuIdEntry>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    vcpus_kick_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    #[cfg(feature = "guest_debug")]
    vm_debug_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vm_ops: Arc<dyn VmOps>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: Option<GuestAddress>,
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
    affinity: BTreeMap<u8, Vec<usize>>,
    dynamic: bool,
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    #[cfg(feature = "sev_snp")]
    sev_snp_enabled: bool,
}

const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;
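
// A short summary of the CPU hotplug register block implemented by the
// BusDevice impl below (within the CPU_MANAGER_ACPI_SIZE = 0xc region):
//   - byte at CPU_SELECTION_OFFSET (0x0): read/write, selects the vCPU that
//     the status byte refers to;
//   - byte at CPU_STATUS_OFFSET (0x4): bit 0 reports "enabled", bit 1
//     "inserting", bit 2 "removing"; the guest writes bits 1/2 back to
//     acknowledge, and sets bit 3 (CPU_EJECT_FLAG) to actually remove a vCPU.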

impl BusDevice for CpuManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
        data.fill(0);

        match offset {
            CPU_SELECTION_OFFSET => {
                data[0] = self.selected_cpu;
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &self.vcpu_states[usize::from(self.selected_cpu)];
                    if state.active() {
                        data[0] |= 1 << CPU_ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << CPU_INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << CPU_REMOVING_FLAG;
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            CPU_SELECTION_OFFSET => {
                self.selected_cpu = data[0];
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
                        && state.inserting
                    {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
                        && state.removing
                    {
                        state.removing = false;
                    }
                    // Trigger removal of vCPU
                    if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
                        if let Err(e) = self.remove_vcpu(self.selected_cpu) {
                            error!("Error removing vCPU: {:?}", e);
                        }
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
        None
    }
}

#[derive(Default)]
struct VcpuState {
    inserting: bool,
    removing: bool,
    pending_removal: Arc<AtomicBool>,
    handle: Option<thread::JoinHandle<()>>,
    kill: Arc<AtomicBool>,
    vcpu_run_interrupted: Arc<AtomicBool>,
    paused: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                // SAFETY: FFI call with correct arguments
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
            return Err(Error::MaximumVcpusExceeded);
        }

        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
        let hypervisor_type = hypervisor.hypervisor_type();
        #[cfg(target_arch = "x86_64")]
        let cpu_vendor = hypervisor.get_cpu_vendor();

        #[cfg(target_arch = "x86_64")]
        if config.features.amx {
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // SAFETY: the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                let mask: usize = 0;
                // SAFETY: the mask is only modified from within this unsafe
                // block (hence not marked mutable) and is not in use elsewhere.
                let result = unsafe {
                    libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
                };
                if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
                    return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
                }
            }
        }

        let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
            let mut cpu_list = Vec::new();
            for (proximity_domain, numa_node) in numa_nodes.iter() {
                for cpu in numa_node.cpus.iter() {
                    cpu_list.push((*cpu, *proximity_domain))
                }
            }
            cpu_list
        }
        .into_iter()
        .collect();

        let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
            cpu_affinity
                .iter()
                .map(|a| (a.vcpu, a.host_cpus.clone()))
                .collect()
        } else {
            BTreeMap::new()
        };

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        Ok(Arc::new(Mutex::new(CpuManager {
            config: config.clone(),
            interrupt_controller: None,
            #[cfg(target_arch = "x86_64")]
            cpuid: Vec::new(),
            vm,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_kick_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
            seccomp_action,
            vm_ops,
            acpi_address: None,
            proximity_domain_per_cpu,
            affinity,
            dynamic,
            hypervisor: hypervisor.clone(),
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        })))
    }

    #[cfg(target_arch = "x86_64")]
    pub fn populate_cpuid(
        &mut self,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx: bool,
    ) -> Result<()> {
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());

        self.cpuid = {
            let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
            arch::generate_common_cpuid(
                hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections,
                    phys_bits,
                    kvm_hyperv: self.config.kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx,
                    amx: self.config.features.amx,
                },
            )
            .map_err(Error::CommonCpuId)?
        };

        Ok(())
    }

    fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        #[cfg(target_arch = "x86_64")]
        let topology = self.get_vcpu_topology();
        #[cfg(target_arch = "x86_64")]
        let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
        #[cfg(target_arch = "aarch64")]
        let x2apic_id = cpu_id as u32;

        let mut vcpu = Vcpu::new(
            cpu_id,
            x2apic_id as u8,
            &self.vm,
            Some(self.vm_ops.clone()),
            #[cfg(target_arch = "x86_64")]
            self.hypervisor.get_cpu_vendor(),
        )?;

        if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized right after creation.
            #[cfg(target_arch = "aarch64")]
            vcpu.init(&self.vm)?;

            let state: CpuState = snapshot.to_state().map_err(|e| {
                Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
            })?;
            vcpu.vcpu
                .set_state(&state)
                .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;

            vcpu.saved_state = Some(state);
        }

        let vcpu = Arc::new(Mutex::new(vcpu));

        // Adding vCPU to the CpuManager's vCPU list.
        self.vcpus.push(vcpu.clone());

        Ok(vcpu)
    }

    pub fn configure_vcpu(
        &self,
        vcpu: Arc<Mutex<Vcpu>>,
        boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
    ) -> Result<()> {
        let mut vcpu = vcpu.lock().unwrap();

        #[cfg(feature = "sev_snp")]
        if self.sev_snp_enabled {
            if let Some((kernel_entry_point, _)) = boot_setup {
                vcpu.set_sev_control_register(
                    kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
                )?;
            }

            // The traditional way of configuring a vCPU doesn't work for
            // SEV-SNP guests: all vCPU configuration is provided via the VMSA.
            return Ok(());
        }

        #[cfg(target_arch = "x86_64")]
        assert!(!self.cpuid.is_empty());

        #[cfg(target_arch = "x86_64")]
        let topology = self.config.topology.clone().map_or_else(
            || Some((1, self.boot_vcpus(), 1)),
            |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
        );
        #[cfg(target_arch = "x86_64")]
        vcpu.configure(
            boot_setup,
            self.cpuid.clone(),
            self.config.kvm_hyperv,
            topology,
        )?;

        #[cfg(target_arch = "aarch64")]
        vcpu.configure(&self.vm, boot_setup)?;

        Ok(())
    }

    /// Only create new vCPUs if there aren't any inactive ones to reuse
    fn create_vcpus(
        &mut self,
        desired_vcpus: u8,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.config.max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        for cpu_id in self.vcpus.len() as u8..desired_vcpus {
            vcpus.push(self.create_vcpu(
                cpu_id,
                // TODO: The special format of the CPU id can be removed once
                // ready to break live upgrade.
                snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
            )?);
        }

        Ok(vcpus)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn init_pmu(&self, irq: u32) -> Result<bool> {
        for cpu in self.vcpus.iter() {
            let cpu = cpu.lock().unwrap();
            // Check if the PMU attribute is available; if not, log it and skip PMU init.
            if cpu.vcpu.has_pmu_support() {
                cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
            } else {
                debug!(
                    "PMU attribute is not supported in vCPU{}, skipping PMU init!",
                    cpu.id
                );
                return Ok(false);
            }
        }

        Ok(true)
    }

    pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
        self.vcpus.clone()
    }

    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_id: u8,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        #[cfg(feature = "kvm")]
        let hypervisor_type = self.hypervisor.hypervisor_type();
        #[cfg(feature = "guest_debug")]
        let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
        let panic_exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
        let vcpu_kick_signalled = self.vcpus_kick_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
            .vcpu_run_interrupted
            .clone();
        let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
        let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();

        // Prepare the CPU set the current vCPU is expected to run on.
        let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
            // SAFETY: all zeros is a valid pattern
            let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
            // SAFETY: FFI call, trivially safe
            unsafe { libc::CPU_ZERO(&mut cpuset) };
            for host_cpu in host_cpus {
                // SAFETY: FFI call, trivially safe
                unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
            }
            cpuset
        });

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter = get_seccomp_filter(
            &self.seccomp_action,
            Thread::Vcpu,
            self.hypervisor.hypervisor_type(),
        )
        .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        info!("Starting vCPU: cpu_id = {}", vcpu_id);

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{vcpu_id}"))
                .spawn(move || {
                    // Schedule the thread to run on the expected CPU set
                    if let Some(cpuset) = cpuset.as_ref() {
                        // SAFETY: FFI call with correct arguments
                        let ret = unsafe {
                            libc::sched_setaffinity(
                                0,
                                std::mem::size_of::<libc::cpu_set_t>(),
                                cpuset as *const libc::cpu_set_t,
                            )
                        };

                        if ret != 0 {
                            error!(
                                "Failed scheduling the vCPU {} on the expected CPU set: {}",
                                vcpu_id,
                                io::Error::last_os_error()
                            );
                            return;
                        }
                    }

                    // Apply seccomp filter for vcpu thread.
                    if !vcpu_seccomp_filter.is_empty() {
                        if let Err(e) =
                            apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                        {
                            error!("Error applying seccomp filter: {:?}", e);
                            return;
                        }
                    }
                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // This registers an async-signal-safe (empty) handler so SIGRTMIN can
                    // interrupt the vCPU threads out of KVM_RUN.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");
                    // Block until all CPUs are ready.
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unparking the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                // As a pause can be caused by PIO & MMIO exits, we need to ensure
                                // they are completed by returning to KVM_RUN. From the kernel docs:
                                //
                                // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                                // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                                // operations are complete (and guest state is consistent) only after userspace
                                // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
                                // incomplete operations and then check for pending signals.
                                // The pending state of the operation is not preserved in state which is
                                // visible to userspace, thus userspace should ensure that the operation is
                                // completed before performing a live migration.  Userspace can re-enter the
                                // guest with an unmasked signal pending or with the immediate_exit field set
                                // to complete pending operations without allowing any further instructions
                                // to be executed.

                                #[cfg(feature = "kvm")]
                                if matches!(hypervisor_type, HypervisorType::Kvm) {
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                    if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                        error!("Unexpected VM exit on \"immediate_exit\" run");
                                        break;
                                    }
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                                }

                                vcpu_run_interrupted.store(true, Ordering::SeqCst);

                                vcpu_paused.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            if vcpu_kick_signalled.load(Ordering::SeqCst) {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                #[cfg(target_arch = "x86_64")]
                                match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
                                    Ok(()) => {},
                                    Err(e) => {
                                        error!("Error injecting NMI: {}", e);
                                        break;
                                    }
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            #[cfg(feature = "tdx")]
                            let mut vcpu = vcpu.lock().unwrap();
                            #[cfg(not(feature = "tdx"))]
                            let vcpu = vcpu.lock().unwrap();
                            // vcpu.run() reports a triple fault as VmExit::Reset, which triggers a reset below
                            match vcpu.run() {
                                Ok(run) => match run {
                                    #[cfg(feature = "kvm")]
                                    VmExit::Debug => {
                                        info!("VmExit::Debug");
                                        #[cfg(feature = "guest_debug")]
                                        {
                                            vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                            let raw_tid = get_raw_tid(vcpu_id as usize);
                                            vm_debug_evt.write(raw_tid as u64).unwrap();
                                        }
                                    }
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
                                        {
                                            interrupt_controller
                                                .lock()
                                                .unwrap()
                                                .end_of_interrupt(vector);
                                        }
                                    }
                                    VmExit::Ignore => {}
                                    VmExit::Hyperv => {}
                                    VmExit::Reset => {
                                        info!("VmExit::Reset");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        reset_evt.write(1).unwrap();
                                        break;
                                    }
                                    VmExit::Shutdown => {
                                        info!("VmExit::Shutdown");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                    #[cfg(feature = "tdx")]
                                    VmExit::Tdx => {
                                        if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
                                            match vcpu.get_tdx_exit_details() {
                                                Ok(details) => match details {
                                                    TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
                                                    TdxExitDetails::SetupEventNotifyInterrupt => {
                                                        warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
                                                    }
                                                },
                                                Err(e) => error!("Unexpected TDX VMCALL: {}", e),
                                            }
                                            vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
                                        } else {
                                            // We should never reach this point; getting here
                                            // means the code's design is wrong.
                                            unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
                                        }
                                    }
                                },

                                Err(e) => {
                                    error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                    vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                    exit_evt.write(1).unwrap();
                                    break;
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }
                        }
                    })
                    .or_else(|_| {
                        panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
                        error!("vCPU thread panicked");
                        panic_exit_evt.write(1)
                    })
                    .ok();
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // On hotplug, calls into this function have no entry point. It is for
        // those hotplugged CPU additions that we need to set the inserting flag.
        self.vcpu_states[usize::from(vcpu_id)].handle = handle;
        self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
    fn activate_vcpus(
        &mut self,
        desired_vcpus: u8,
        inserting: bool,
        paused: Option<bool>,
    ) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

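        // The barrier is shared by this thread and every vCPU thread spawned
        // below, hence the `+ 1`: activate_vcpus() itself waits on it at the
        // end so that all the new vCPU threads are released at once.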
        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        if let Some(paused) = paused {
            self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
        }

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus(),
            self.vcpus_pause_signalled.load(Ordering::SeqCst)
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for vcpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
            self.vcpu_states[usize::from(cpu_id)]
                .pending_removal
                .store(true, Ordering::SeqCst);
        }
    }

    pub fn check_pending_removed_vcpu(&mut self) -> bool {
        for state in self.vcpu_states.iter() {
            if state.active() && state.pending_removal.load(Ordering::SeqCst) {
                return true;
            }
        }
        false
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" flag so that the state can be reused
        state.kill.store(false, Ordering::SeqCst);
        state.pending_removal.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(
        &mut self,
        snapshot: Option<Snapshot>,
    ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
        trace_scoped!("create_boot_vcpus");

        self.create_vcpus(self.boot_vcpus(), snapshot)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
            .map_err(|e| {
                Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
            })?;

        Ok(())
    }

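    /// Resize the number of active vCPUs. Growing the count creates,
    /// configures and activates new vCPUs (hotplug), while shrinking only
    /// marks the excess vCPUs for removal; the actual removal happens when
    /// the guest ejects them via ACPI. Returns `Ok(true)` if a change was
    /// initiated.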
    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus == self.present_vcpus() {
            return Ok(false);
        }
1307 
1308         if !self.dynamic {
1309             return Ok(false);
1310         }
1311 
1312         if self.check_pending_removed_vcpu() {
1313             return Err(Error::VcpuPendingRemovedVcpu);
1314         }
1315 
1316         match desired_vcpus.cmp(&self.present_vcpus()) {
1317             cmp::Ordering::Greater => {
1318                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1319                 for vcpu in vcpus {
1320                     self.configure_vcpu(vcpu, None)?
1321                 }
1322                 self.activate_vcpus(desired_vcpus, true, None)?;
1323                 Ok(true)
1324             }
1325             cmp::Ordering::Less => {
1326                 self.mark_vcpus_for_removal(desired_vcpus);
1327                 Ok(true)
1328             }
1329             _ => Ok(false),
1330         }
1331     }
1332 
1333     pub fn shutdown(&mut self) -> Result<()> {
1334         // Tell the vCPUs to stop themselves next time they go through the loop
1335         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1336 
1337         // Toggle the vCPUs pause boolean
1338         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1339 
1340         // Unpark all the VCPU threads.
1341         for state in self.vcpu_states.iter() {
1342             state.unpark_thread();
1343         }
1344 
1345         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1346         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1347         // above.
1348         for state in self.vcpu_states.iter() {
1349             state.signal_thread();
1350         }
1351 
1352         // Wait for all the threads to finish. This removes the state from the vector.
1353         for mut state in self.vcpu_states.drain(..) {
1354             state.join_thread()?;
1355         }
1356 
1357         Ok(())
1358     }
1359 
1360     #[cfg(feature = "tdx")]
1361     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1362         for vcpu in &self.vcpus {
1363             vcpu.lock()
1364                 .unwrap()
1365                 .vcpu
1366                 .tdx_init(hob_address)
1367                 .map_err(Error::InitializeTdx)?;
1368         }
1369         Ok(())
1370     }
1371 
1372     pub fn boot_vcpus(&self) -> u8 {
1373         self.config.boot_vcpus
1374     }
1375 
1376     pub fn max_vcpus(&self) -> u8 {
1377         self.config.max_vcpus
1378     }
1379 
1380     #[cfg(target_arch = "x86_64")]
1381     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1382         assert!(!self.cpuid.is_empty());
1383         self.cpuid.clone()
1384     }
1385 
1386     fn present_vcpus(&self) -> u8 {
1387         self.vcpu_states
1388             .iter()
1389             .fold(0, |acc, state| acc + state.active() as u8)
1390     }
1391 
1392     #[cfg(target_arch = "aarch64")]
1393     pub fn get_mpidrs(&self) -> Vec<u64> {
1394         self.vcpus
1395             .iter()
1396             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1397             .collect()
1398     }
1399 
1400     #[cfg(target_arch = "aarch64")]
1401     pub fn get_saved_states(&self) -> Vec<CpuState> {
1402         self.vcpus
1403             .iter()
1404             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1405             .collect()
1406     }
1407 
1408     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1409         self.config
1410             .topology
1411             .clone()
1412             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1413     }
1414 
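    /// Build the ACPI MADT ("APIC" signature) describing the interrupt
    /// controller topology: Local x2APIC, I/O APIC and interrupt source
    /// override entries on x86_64; GICC/GICD/GICR/GIC ITS structures on
    /// aarch64.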
1415     pub fn create_madt(&self) -> Sdt {
1416         use crate::acpi;
1417         // This is also checked in the command-line parsing.
1418         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1419 
1420         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1421         #[cfg(target_arch = "x86_64")]
1422         {
1423             madt.write(36, arch::layout::APIC_START.0);
1424 
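            // Every entry carries the "online capable" flag; boot vCPUs are
            // additionally marked "enabled", while the remaining ones can be
            // brought up later through hotplug.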
1425             for cpu in 0..self.config.max_vcpus {
1426                 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());
1427 
1428                 let lapic = LocalX2Apic {
1429                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1430                     length: 16,
1431                     processor_id: cpu.into(),
1432                     apic_id: x2apic_id,
1433                     flags: if cpu < self.config.boot_vcpus {
1434                         1 << MADT_CPU_ENABLE_FLAG
1435                     } else {
1436                         0
1437                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1438                     _reserved: 0,
1439                 };
1440                 madt.append(lapic);
1441             }
1442 
1443             madt.append(Ioapic {
1444                 r#type: acpi::ACPI_APIC_IO,
1445                 length: 12,
1446                 ioapic_id: 0,
1447                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1448                 gsi_base: 0,
1449                 ..Default::default()
1450             });
1451 
1452             madt.append(InterruptSourceOverride {
1453                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1454                 length: 10,
1455                 bus: 0,
1456                 source: 4,
1457                 gsi: 4,
1458                 flags: 0,
1459             });
1460         }
1461 
1462         #[cfg(target_arch = "aarch64")]
1463         {
1464             /* Notes:
1465              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1466              */
1467 
1468             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1469             for cpu in 0..self.config.boot_vcpus {
1470                 let vcpu = &self.vcpus[cpu as usize];
1471                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1472                 /* ARMv8 MPIDR format:
1473                      Bits [63:40] Must be zero
1474                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1475                      Bits [31:24] Must be zero
1476                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1477                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1478                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1479                 */
1480                 let mpidr_mask = 0xff_00ff_ffff;
1481                 let gicc = GicC {
1482                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1483                     length: 80,
1484                     reserved0: 0,
1485                     cpu_interface_number: cpu as u32,
1486                     uid: cpu as u32,
1487                     flags: 1,
1488                     parking_version: 0,
1489                     performance_interrupt: 0,
1490                     parked_address: 0,
1491                     base_address: 0,
1492                     gicv_base_address: 0,
1493                     gich_base_address: 0,
1494                     vgic_interrupt: 0,
1495                     gicr_base_address: 0,
1496                     mpidr: mpidr & mpidr_mask,
1497                     proc_power_effi_class: 0,
1498                     reserved1: 0,
1499                     spe_overflow_interrupt: 0,
1500                 };
1501 
1502                 madt.append(gicc);
1503             }
1504             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1505 
1506             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1507             let gicd = GicD {
1508                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1509                 length: 24,
1510                 reserved0: 0,
1511                 gic_id: 0,
1512                 base_address: vgic_config.dist_addr,
1513                 global_irq_base: 0,
1514                 version: 3,
1515                 reserved1: [0; 3],
1516             };
1517             madt.append(gicd);
1518 
1519             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1520             let gicr = GicR {
1521                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1522                 length: 16,
1523                 reserved: 0,
1524                 base_address: vgic_config.redists_addr,
1525                 range_length: vgic_config.redists_size as u32,
1526             };
1527             madt.append(gicr);
1528 
1529             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1530             let gicits = GicIts {
1531                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1532                 length: 20,
1533                 reserved0: 0,
1534                 translation_id: 0,
1535                 base_address: vgic_config.msi_addr,
1536                 reserved1: 0,
1537             };
1538             madt.append(gicits);
1539 
1540             madt.update_checksum();
1541         }
1542 
1543         madt
1544     }
1545 
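    /// Build the ACPI PPTT table exposing the package/core/thread topology to
    /// the guest. Hierarchy nodes are emitted only while boot vCPUs remain to
    /// be described.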
1546     #[cfg(target_arch = "aarch64")]
1547     pub fn create_pptt(&self) -> Sdt {
1548         let pptt_start = 0;
1549         let mut cpus = 0;
1550         let mut uid = 0;
1551         // If topology is not specified, the default setting is:
1552         // 1 package, multiple cores, 1 thread per core
1553         // This is also the behavior when PPTT is missing.
1554         let (threads_per_core, cores_per_package, packages) =
1555             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1556 
1557         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
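        // Processor Hierarchy Node flags used below (see the PPTT structure
        // in the ACPI spec): bit 0 = physical package, bit 1 = ACPI processor
        // ID valid, bit 2 = processor is a thread, bit 3 = node is a leaf.
        // Hence 0x2 for clusters/cores, 0xA for single-thread leaf nodes and
        // 0xE for leaf threads.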
1558 
1559         for cluster_idx in 0..packages {
1560             if cpus < self.config.boot_vcpus as usize {
1561                 let cluster_offset = pptt.len() - pptt_start;
1562                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1563                     r#type: 0,
1564                     length: 20,
1565                     reserved: 0,
1566                     flags: 0x2,
1567                     parent: 0,
1568                     acpi_processor_id: cluster_idx as u32,
1569                     num_private_resources: 0,
1570                 };
1571                 pptt.append(cluster_hierarchy_node);
1572 
1573                 for core_idx in 0..cores_per_package {
1574                     let core_offset = pptt.len() - pptt_start;
1575 
1576                     if threads_per_core > 1 {
1577                         let core_hierarchy_node = ProcessorHierarchyNode {
1578                             r#type: 0,
1579                             length: 20,
1580                             reserved: 0,
1581                             flags: 0x2,
1582                             parent: cluster_offset as u32,
1583                             acpi_processor_id: core_idx as u32,
1584                             num_private_resources: 0,
1585                         };
1586                         pptt.append(core_hierarchy_node);
1587 
1588                         for _thread_idx in 0..threads_per_core {
1589                             let thread_hierarchy_node = ProcessorHierarchyNode {
1590                                 r#type: 0,
1591                                 length: 20,
1592                                 reserved: 0,
1593                                 flags: 0xE,
1594                                 parent: core_offset as u32,
1595                                 acpi_processor_id: uid as u32,
1596                                 num_private_resources: 0,
1597                             };
1598                             pptt.append(thread_hierarchy_node);
1599                             uid += 1;
1600                         }
1601                     } else {
1602                         let thread_hierarchy_node = ProcessorHierarchyNode {
1603                             r#type: 0,
1604                             length: 20,
1605                             reserved: 0,
1606                             flags: 0xA,
1607                             parent: cluster_offset as u32,
1608                             acpi_processor_id: uid as u32,
1609                             num_private_resources: 0,
1610                         };
1611                         pptt.append(thread_hierarchy_node);
1612                         uid += 1;
1613                     }
1614                 }
1615                 cpus += (cores_per_package * threads_per_core) as usize;
1616             }
1617         }
1618 
1619         pptt.update_checksum();
1620         pptt
1621     }
1622 
1623     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1624     fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters {
1625         self.vcpus[usize::from(cpu_id)]
1626             .lock()
1627             .unwrap()
1628             .vcpu
1629             .create_standard_regs()
1630     }
1631 
1632     #[cfg(feature = "guest_debug")]
1633     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1634         self.vcpus[usize::from(cpu_id)]
1635             .lock()
1636             .unwrap()
1637             .vcpu
1638             .get_regs()
1639             .map_err(Error::CpuDebug)
1640     }
1641 
1642     #[cfg(feature = "guest_debug")]
1643     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1644         self.vcpus[usize::from(cpu_id)]
1645             .lock()
1646             .unwrap()
1647             .vcpu
1648             .set_regs(regs)
1649             .map_err(Error::CpuDebug)
1650     }
1651 
1652     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1653     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1654         self.vcpus[usize::from(cpu_id)]
1655             .lock()
1656             .unwrap()
1657             .vcpu
1658             .get_sregs()
1659             .map_err(Error::CpuDebug)
1660     }
1661 
1662     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1663     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1664         self.vcpus[usize::from(cpu_id)]
1665             .lock()
1666             .unwrap()
1667             .vcpu
1668             .set_sregs(sregs)
1669             .map_err(Error::CpuDebug)
1670     }
1671 
1672     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1673     fn translate_gva(
1674         &self,
1675         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1676         cpu_id: u8,
1677         gva: u64,
1678     ) -> Result<u64> {
1679         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1680             .lock()
1681             .unwrap()
1682             .vcpu
1683             .translate_gva(gva, /* flags: unused */ 0)
1684             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1685         Ok(gpa)
1686     }
1687 
1688     ///
1689     /// On AArch64, the `translate_gva` API is not provided by KVM. We implement
1690     /// it in the VMM by walking through the translation tables.
1691     ///
1692     /// Address translation is a big topic; here we only focus on the scenario
1693     /// that happens in the VMM while debugging the kernel. This `translate_gva`
1694     /// implementation is restricted to:
1695     /// - Exception Level 1
1696     /// - Translating the high address range only (kernel space)
1697     ///
1698     /// This implementation supports the following Armv8-A features related to
1699     /// address translation:
1700     /// - FEAT_LPA
1701     /// - FEAT_LVA
1702     /// - FEAT_LPA2
1703     ///
1704     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1705     fn translate_gva(
1706         &self,
1707         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1708         cpu_id: u8,
1709         gva: u64,
1710     ) -> Result<u64> {
1711         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1712             .lock()
1713             .unwrap()
1714             .vcpu
1715             .get_sys_reg(regs::TCR_EL1)
1716             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1717         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1718             .lock()
1719             .unwrap()
1720             .vcpu
1721             .get_sys_reg(regs::TTBR1_EL1)
1722             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1723         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1724             .lock()
1725             .unwrap()
1726             .vcpu
1727             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1728             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1729 
1730         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1731         // or low (0x000xxx...).
1732         let high_range = extract_bits_64!(gva, 55, 1);
1733         if high_range == 0 {
1734             info!("VA (0x{:x}) range is not supported!", gva);
1735             return Ok(gva);
1736         }
1737 
1738         // High range size offset
1739         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1740         // Granule size
1741         let tg = extract_bits_64!(tcr_el1, 30, 2);
1742         // Indicates 48-bit (0) or 52-bit (1) addressing for FEAT_LPA2
1743         let ds = extract_bits_64!(tcr_el1, 59, 1);
1744 
1745         if tsz == 0 {
1746             info!("VA translation is not ready!");
1747             return Ok(gva);
1748         }
1749 
1750         // VA size is determined by TCR_EL1.T1SZ
1751         let va_size = 64 - tsz;
1752         // Number of bits in VA consumed in each level of translation
1753         let stride = match tg {
1754             3 => 13, // 64KB granule size
1755             1 => 11, // 16KB granule size
1756             _ => 9,  // 4KB, default
1757         };
1758         // Starting level of walking
1759         let mut level = 4 - (va_size - 4) / stride;
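        // A sketch with concrete numbers: for a 4KB granule (stride = 9) and
        // T1SZ = 16 (va_size = 48), the walk starts at level
        // 4 - (48 - 4) / 9 = 0; for T1SZ = 28 (va_size = 36) it starts at
        // level 4 - (36 - 4) / 9 = 1 (integer division).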
1760 
1761         // PA or IPA size is determined
1762         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1763         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1764         // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should match.
1765         // To be safe, we use the minimum value if they differ.
1766         let pa_range = std::cmp::min(tcr_ips, pa_range);
1767         // PA size in bits
1768         let pa_size = match pa_range {
1769             0 => 32,
1770             1 => 36,
1771             2 => 40,
1772             3 => 42,
1773             4 => 44,
1774             5 => 48,
1775             6 => 52,
1776             _ => {
1777                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1778                     "PA range not supported {pa_range}"
1779                 ))))
1780             }
1781         };
1782 
1783         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1784         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1785         // If FEAT_LPA2 is present, the translation table descriptor holds
1786         // 50 bits of the table address of the next level.
1787         // Otherwise, it holds 48 bits.
1788         let descaddrmask = if ds == 1 {
1789             !0u64 >> (64 - 50) // mask with 50 least significant bits
1790         } else {
1791             !0u64 >> (64 - 48) // mask with 48 least significant bits
1792         };
1793         let descaddrmask = descaddrmask & !indexmask_grainsize;
1794 
1795         // Translation table base address
1796         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1797         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1798         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1799         if pa_size == 52 {
1800             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1801         }
1802 
1803         // Loop through tables of each level
1804         loop {
1805             // Table offset for current level
1806             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1807             descaddr |= table_offset;
1808             descaddr &= !7u64;
1809 
1810             let mut buf = [0; 8];
1811             guest_memory
1812                 .memory()
1813                 .read(&mut buf, GuestAddress(descaddr))
1814                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1815             let descriptor = u64::from_le_bytes(buf);
1816 
1817             descaddr = descriptor & descaddrmask;
1818             // In the case of FEAT_LPA, the next-level translation table address
1819             // bits [48:51] come from bits [12:15] of the current descriptor.
1820             // For FEAT_LPA2, the next-level translation table address
1821             // bits [50:51] come from bits [8:9] of the current descriptor, while
1822             // bits [48:49] come from bits [48:49] of the descriptor, which was
1823             // handled previously.
1824             if pa_size == 52 {
1825                 if ds == 1 {
1826                     // FEAT_LPA2
1827                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1828                 } else {
1829                     // FEAT_LPA
1830                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1831                 }
1832             }
1833 
1834             if (descriptor & 2) != 0 && (level < 3) {
1835                 // This is a table entry. Go down to next level.
1836                 level += 1;
1837                 indexmask = indexmask_grainsize;
1838                 continue;
1839             }
1840 
1841             break;
1842         }
1843 
1844         // We have reached either:
1845         // - a page entry at level 3 or
1846         // - a block entry at level 1 or 2
1847         let page_size = 1u64 << ((stride * (4 - level)) + 3);
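        // For example, with a 4KB granule (stride = 9): a level-3 page entry
        // yields 1 << 12 = 4KB, a level-2 block entry 1 << 21 = 2MB.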
1848         descaddr &= !(page_size - 1);
1849         descaddr |= gva & (page_size - 1);
1850 
1851         Ok(descaddr)
1852     }
1853 
1854     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1855         self.acpi_address = Some(acpi_address);
1856     }
1857 
1858     pub(crate) fn set_interrupt_controller(
1859         &mut self,
1860         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1861     ) {
1862         self.interrupt_controller = Some(interrupt_controller);
1863     }
1864 
1865     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1866         &self.vcpus_kill_signalled
1867     }
1868 
1869     #[cfg(feature = "igvm")]
1870     pub(crate) fn get_cpuid_leaf(
1871         &self,
1872         cpu_id: u8,
1873         eax: u32,
1874         ecx: u32,
1875         xfem: u64,
1876         xss: u64,
1877     ) -> Result<[u32; 4]> {
1878         let leaf_info = self.vcpus[usize::from(cpu_id)]
1879             .lock()
1880             .unwrap()
1881             .vcpu
1882             .get_cpuid_values(eax, ecx, xfem, xss)
1883             .unwrap();
1884         Ok(leaf_info)
1885     }
1886 
1887     #[cfg(feature = "sev_snp")]
1888     pub(crate) fn sev_snp_enabled(&self) -> bool {
1889         self.sev_snp_enabled
1890     }
1891 
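    /// Inject an NMI by raising the "kick" flag and signalling every vCPU
    /// thread, so the interrupted vCPU loops can observe the flag and deliver
    /// the NMI before the flag is cleared again.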
1892     pub(crate) fn nmi(&self) -> Result<()> {
1893         self.vcpus_kick_signalled.store(true, Ordering::SeqCst);
1894 
1895         for state in self.vcpu_states.iter() {
1896             state.signal_thread();
1897         }
1898 
1899         self.vcpus_kick_signalled.store(false, Ordering::SeqCst);
1900 
1901         Ok(())
1902     }
1903 }
1904 
1905 struct Cpu {
1906     cpu_id: u8,
1907     proximity_domain: u32,
1908     dynamic: bool,
1909     #[cfg(target_arch = "x86_64")]
1910     topology: Option<(u8, u8, u8)>,
1911 }
1912 
1913 #[cfg(target_arch = "x86_64")]
1914 const MADT_CPU_ENABLE_FLAG: usize = 0;
1915 
1916 #[cfg(target_arch = "x86_64")]
1917 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1918 
1919 impl Cpu {
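    /// Build the _MAT (Multiple APIC Table Entry) payload for this vCPU: a
    /// single Local x2APIC structure serialized to raw bytes for the `_MAT`
    /// AML object below.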
1920     #[cfg(target_arch = "x86_64")]
1921     fn generate_mat(&self) -> Vec<u8> {
1922         let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);
1923 
1924         let lapic = LocalX2Apic {
1925             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1926             length: 16,
1927             processor_id: self.cpu_id.into(),
1928             apic_id: x2apic_id,
1929             flags: 1 << MADT_CPU_ENABLE_FLAG,
1930             _reserved: 0,
1931         };
1932 
1933         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1934         // SAFETY: mat_data is large enough to hold lapic
1935         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
1936 
1937         mat_data
1938     }
1939 }
1940 
1941 impl Aml for Cpu {
1942     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1943         #[cfg(target_arch = "x86_64")]
1944         let mat_data: Vec<u8> = self.generate_mat();
1945         #[allow(clippy::if_same_then_else)]
1946         if self.dynamic {
1947             aml::Device::new(
1948                 format!("C{:03X}", self.cpu_id).as_str().into(),
1949                 vec![
1950                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1951                     &aml::Name::new("_UID".into(), &self.cpu_id),
1952                     // Currently, AArch64 cannot support the following fields.
1953                     /*
1954                     _STA return value:
1955                     Bit [0] – Set if the device is present.
1956                     Bit [1] – Set if the device is enabled and decoding its resources.
1957                     Bit [2] – Set if the device should be shown in the UI.
1958                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1959                     Bit [4] – Set if the battery is present.
1960                     Bits [31:5] – Reserved (must be cleared).
1961                     */
1962                     #[cfg(target_arch = "x86_64")]
1963                     &aml::Method::new(
1964                         "_STA".into(),
1965                         0,
1966                         false,
1967                         // Call into CSTA method which will interrogate device
1968                         vec![&aml::Return::new(&aml::MethodCall::new(
1969                             "CSTA".into(),
1970                             vec![&self.cpu_id],
1971                         ))],
1972                     ),
1973                     &aml::Method::new(
1974                         "_PXM".into(),
1975                         0,
1976                         false,
1977                         vec![&aml::Return::new(&self.proximity_domain)],
1978                     ),
1979                     // The Linux kernel expects every CPU device to have a _MAT entry
1980                     // containing the LAPIC for this processor with the enabled bit set,
1981                     // even if it is disabled in the MADT (non-boot CPU)
1982                     #[cfg(target_arch = "x86_64")]
1983                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1984                     // Trigger CPU ejection
1985                     #[cfg(target_arch = "x86_64")]
1986                     &aml::Method::new(
1987                         "_EJ0".into(),
1988                         1,
1989                         false,
1990                         // Call into CEJ0 method which will actually eject device
1991                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1992                     ),
1993                 ],
1994             )
1995             .to_aml_bytes(sink);
1996         } else {
1997             aml::Device::new(
1998                 format!("C{:03X}", self.cpu_id).as_str().into(),
1999                 vec![
2000                     &aml::Name::new("_HID".into(), &"ACPI0007"),
2001                     &aml::Name::new("_UID".into(), &self.cpu_id),
2002                     #[cfg(target_arch = "x86_64")]
2003                     &aml::Method::new(
2004                         "_STA".into(),
2005                         0,
2006                         false,
2007                         // Mark the CPU as present; see the CSTA implementation
2008                         vec![&aml::Return::new(&0xfu8)],
2009                     ),
2010                     &aml::Method::new(
2011                         "_PXM".into(),
2012                         0,
2013                         false,
2014                         vec![&aml::Return::new(&self.proximity_domain)],
2015                     ),
2016                     // The Linux kernel expects every CPU device to have a _MAT entry
2017                     // containing the LAPIC for this processor with the enabled bit set,
2018                     // even if it is disabled in the MADT (non-boot CPU)
2019                     #[cfg(target_arch = "x86_64")]
2020                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
2021                 ],
2022             )
2023             .to_aml_bytes(sink);
2024         }
2025     }
2026 }
2027 
2028 struct CpuNotify {
2029     cpu_id: u8,
2030 }
2031 
2032 impl Aml for CpuNotify {
2033     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2034         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
2035         aml::If::new(
2036             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
2037             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2038         )
2039         .to_aml_bytes(sink)
2040     }
2041 }
2042 
2043 struct CpuMethods {
2044     max_vcpus: u8,
2045     dynamic: bool,
2046 }
2047 
2048 impl Aml for CpuMethods {
2049     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2050         if self.dynamic {
2051             // CPU status method
2052             aml::Method::new(
2053                 "CSTA".into(),
2054                 1,
2055                 true,
2056                 vec![
2057                     // Take lock defined above
2058                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2059                     // Write CPU number (in first argument) to I/O port via field
2060                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2061                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2062                     // If the CPEN bit is set, set the local variable to 0xf (see _STA for the meaning)
2063                     &aml::If::new(
2064                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
2065                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2066                     ),
2067                     // Release lock
2068                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2069                     // Return 0 or 0xf
2070                     &aml::Return::new(&aml::Local(0)),
2071                 ],
2072             )
2073             .to_aml_bytes(sink);
2074 
2075             let mut cpu_notifies = Vec::new();
2076             for cpu_id in 0..self.max_vcpus {
2077                 cpu_notifies.push(CpuNotify { cpu_id });
2078             }
2079 
2080             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
2081             for cpu_id in 0..self.max_vcpus {
2082                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
2083             }
2084 
2085             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
2086 
2087             aml::Method::new(
2088                 "CEJ0".into(),
2089                 1,
2090                 true,
2091                 vec![
2092                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2093                     // Write CPU number (in first argument) to I/O port via field
2094                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2095                     // Set CEJ0 bit
2096                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
2097                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2098                 ],
2099             )
2100             .to_aml_bytes(sink);
2101 
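            // CSCN: the scan method invoked when the platform signals a CPU
            // hotplug event. For each possible CPU it selects the CPU through
            // CSEL, notifies the matching device object of insertion (CINS)
            // or removal (CRMV) via CTFY, and acknowledges the event by
            // writing the bit back.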
2102             aml::Method::new(
2103                 "CSCN".into(),
2104                 0,
2105                 true,
2106                 vec![
2107                     // Take lock defined above
2108                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2109                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2110                     &aml::While::new(
2111                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
2112                         vec![
2113                             // Write CPU number (in first argument) to I/O port via field
2114                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
2115                             // Check if CINS bit is set
2116                             &aml::If::new(
2117                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
2118                                 // Notify device if it is
2119                                 vec![
2120                                     &aml::MethodCall::new(
2121                                         "CTFY".into(),
2122                                         vec![&aml::Local(0), &aml::ONE],
2123                                     ),
2124                                     // Reset CINS bit
2125                                     &aml::Store::new(
2126                                         &aml::Path::new("\\_SB_.PRES.CINS"),
2127                                         &aml::ONE,
2128                                     ),
2129                                 ],
2130                             ),
2131                             // Check if CRMV bit is set
2132                             &aml::If::new(
2133                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
2134                                 // Notify device if it is (with the eject constant 0x3)
2135                                 vec![
2136                                     &aml::MethodCall::new(
2137                                         "CTFY".into(),
2138                                         vec![&aml::Local(0), &3u8],
2139                                     ),
2140                                     // Reset CRMV bit
2141                                     &aml::Store::new(
2142                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
2143                                         &aml::ONE,
2144                                     ),
2145                                 ],
2146                             ),
2147                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2148                         ],
2149                     ),
2150                     // Release lock
2151                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2152                 ],
2153             )
2154             .to_aml_bytes(sink)
2155         } else {
2156             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2157         }
2158     }
2159 }
2160 
2161 impl Aml for CpuManager {
2162     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2163         #[cfg(target_arch = "x86_64")]
2164         if let Some(acpi_address) = self.acpi_address {
2165             // CPU hotplug controller
2166             aml::Device::new(
2167                 "_SB_.PRES".into(),
2168                 vec![
2169                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2170                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2171                     // Mutex to protect concurrent access, as we write to choose a CPU and then read back its status
2172                     &aml::Mutex::new("CPLK".into(), 0),
2173                     &aml::Name::new(
2174                         "_CRS".into(),
2175                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2176                             aml::AddressSpaceCacheable::NotCacheable,
2177                             true,
2178                             acpi_address.0,
2179                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2180                             None,
2181                         )]),
2182                     ),
2183                     // OpRegion and Fields map MMIO range into individual field values
2184                     &aml::OpRegion::new(
2185                         "PRST".into(),
2186                         aml::OpRegionSpace::SystemMemory,
2187                         &(acpi_address.0 as usize),
2188                         &CPU_MANAGER_ACPI_SIZE,
2189                     ),
2190                     &aml::Field::new(
2191                         "PRST".into(),
2192                         aml::FieldAccessType::Byte,
2193                         aml::FieldLockRule::NoLock,
2194                         aml::FieldUpdateRule::WriteAsZeroes,
2195                         vec![
2196                             aml::FieldEntry::Reserved(32),
2197                             aml::FieldEntry::Named(*b"CPEN", 1),
2198                             aml::FieldEntry::Named(*b"CINS", 1),
2199                             aml::FieldEntry::Named(*b"CRMV", 1),
2200                             aml::FieldEntry::Named(*b"CEJ0", 1),
2201                             aml::FieldEntry::Reserved(4),
2202                             aml::FieldEntry::Named(*b"CCMD", 8),
2203                         ],
2204                     ),
2205                     &aml::Field::new(
2206                         "PRST".into(),
2207                         aml::FieldAccessType::DWord,
2208                         aml::FieldLockRule::NoLock,
2209                         aml::FieldUpdateRule::Preserve,
2210                         vec![
2211                             aml::FieldEntry::Named(*b"CSEL", 32),
2212                             aml::FieldEntry::Reserved(32),
2213                             aml::FieldEntry::Named(*b"CDAT", 32),
2214                         ],
2215                     ),
2216                 ],
2217             )
2218             .to_aml_bytes(sink);
2219         }
2220 
2221         // CPU devices
2222         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2223         let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2224         // Bundle methods together under a common object
2225         let methods = CpuMethods {
2226             max_vcpus: self.config.max_vcpus,
2227             dynamic: self.dynamic,
2228         };
2229         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];
2230 
2231         #[cfg(target_arch = "x86_64")]
2232         let topology = self.get_vcpu_topology();
2233         let mut cpu_devices = Vec::new();
2234         for cpu_id in 0..self.config.max_vcpus {
2235             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2236             let cpu_device = Cpu {
2237                 cpu_id,
2238                 proximity_domain,
2239                 dynamic: self.dynamic,
2240                 #[cfg(target_arch = "x86_64")]
2241                 topology,
2242             };
2243 
2244             cpu_devices.push(cpu_device);
2245         }
2246 
2247         for cpu_device in cpu_devices.iter() {
2248             cpu_data_inner.push(cpu_device);
2249         }
2250 
2251         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2252     }
2253 }
2254 
2255 impl Pausable for CpuManager {
2256     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2257         // Tell the vCPUs to pause themselves next time they exit
2258         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2259 
2260         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2261         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2262         // above.
2263         for state in self.vcpu_states.iter() {
2264             state.signal_thread();
2265         }
2266 
2267         for vcpu in self.vcpus.iter() {
2268             let mut vcpu = vcpu.lock().unwrap();
2269             vcpu.pause()?;
2270             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2271             if !self.config.kvm_hyperv {
2272                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2273                     MigratableError::Pause(anyhow!(
2274                         "Could not notify guest it has been paused {:?}",
2275                         e
2276                     ))
2277                 })?;
2278             }
2279         }
2280 
2281         // The vCPU thread will change its paused state before parking; wait here for
2282         // each activated vCPU to change its state, ensuring it has parked.
2283         for state in self.vcpu_states.iter() {
2284             if state.active() {
2285                 while !state.paused.load(Ordering::SeqCst) {
2286                     // To avoid a priority inversion with the vCPU thread
2287                     thread::sleep(std::time::Duration::from_millis(1));
2288                 }
2289             }
2290         }
2291 
2292         Ok(())
2293     }
2294 
2295     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2296         for vcpu in self.vcpus.iter() {
2297             vcpu.lock().unwrap().resume()?;
2298         }
2299 
2300         // Clear the vCPUs pause boolean
2301         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2302 
2303         // Unpark all the vCPU threads.
2304         // Once unparked, the next thing they will do is check the pause
2305         // boolean. Since it'll be set to false, they will exit their pause loop
2306         // and go back to vmx root.
2307         for state in self.vcpu_states.iter() {
2308             state.paused.store(false, Ordering::SeqCst);
2309             state.unpark_thread();
2310         }
2311         Ok(())
2312     }
2313 }
2314 
2315 impl Snapshottable for CpuManager {
2316     fn id(&self) -> String {
2317         CPU_MANAGER_SNAPSHOT_ID.to_string()
2318     }
2319 
2320     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2321         let mut cpu_manager_snapshot = Snapshot::default();
2322 
2323         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2324         for vcpu in &self.vcpus {
2325             let mut vcpu = vcpu.lock().unwrap();
2326             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2327         }
2328 
2329         Ok(cpu_manager_snapshot)
2330     }
2331 }
2332 
2333 impl Transportable for CpuManager {}
2334 impl Migratable for CpuManager {}
2335 
2336 #[cfg(feature = "guest_debug")]
2337 impl Debuggable for CpuManager {
2338     #[cfg(feature = "kvm")]
2339     fn set_guest_debug(
2340         &self,
2341         cpu_id: usize,
2342         addrs: &[GuestAddress],
2343         singlestep: bool,
2344     ) -> std::result::Result<(), DebuggableError> {
2345         self.vcpus[cpu_id]
2346             .lock()
2347             .unwrap()
2348             .vcpu
2349             .set_guest_debug(addrs, singlestep)
2350             .map_err(DebuggableError::SetDebug)
2351     }
2352 
2353     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2354         Ok(())
2355     }
2356 
2357     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2358         Ok(())
2359     }
2360 
2361     #[cfg(target_arch = "x86_64")]
2362     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2363         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
2364         let gregs = self
2365             .get_regs(cpu_id as u8)
2366             .map_err(DebuggableError::ReadRegs)?;
2367         let regs = [
2368             gregs.get_rax(),
2369             gregs.get_rbx(),
2370             gregs.get_rcx(),
2371             gregs.get_rdx(),
2372             gregs.get_rsi(),
2373             gregs.get_rdi(),
2374             gregs.get_rbp(),
2375             gregs.get_rsp(),
2376             gregs.get_r8(),
2377             gregs.get_r9(),
2378             gregs.get_r10(),
2379             gregs.get_r11(),
2380             gregs.get_r12(),
2381             gregs.get_r13(),
2382             gregs.get_r14(),
2383             gregs.get_r15(),
2384         ];
2385 
2386         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2387         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2388         let eflags = gregs.get_rflags() as u32;
2389         let rip = gregs.get_rip();
2390 
2391         // Segment registers: CS, SS, DS, ES, FS, GS
2392         let sregs = self
2393             .get_sregs(cpu_id as u8)
2394             .map_err(DebuggableError::ReadRegs)?;
2395         let segments = X86SegmentRegs {
2396             cs: sregs.cs.selector as u32,
2397             ss: sregs.ss.selector as u32,
2398             ds: sregs.ds.selector as u32,
2399             es: sregs.es.selector as u32,
2400             fs: sregs.fs.selector as u32,
2401             gs: sregs.gs.selector as u32,
2402         };
2403 
2404         // TODO: Add other registers
2405 
2406         Ok(CoreRegs {
2407             regs,
2408             eflags,
2409             rip,
2410             segments,
2411             ..Default::default()
2412         })
2413     }
2414 
2415     #[cfg(target_arch = "aarch64")]
2416     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2417         let gregs = self
2418             .get_regs(cpu_id as u8)
2419             .map_err(DebuggableError::ReadRegs)?;
2420         Ok(CoreRegs {
2421             x: gregs.get_regs(),
2422             sp: gregs.get_sp(),
2423             pc: gregs.get_pc(),
2424             ..Default::default()
2425         })
2426     }
2427 
2428     #[cfg(target_arch = "x86_64")]
2429     fn write_regs(
2430         &self,
2431         cpu_id: usize,
2432         regs: &CoreRegs,
2433     ) -> std::result::Result<(), DebuggableError> {
2434         let orig_gregs = self
2435             .get_regs(cpu_id as u8)
2436             .map_err(DebuggableError::ReadRegs)?;
2437         let mut gregs = self.create_standard_regs(cpu_id as u8);
2438         gregs.set_rax(regs.regs[0]);
2439         gregs.set_rbx(regs.regs[1]);
2440         gregs.set_rcx(regs.regs[2]);
2441         gregs.set_rdx(regs.regs[3]);
2442         gregs.set_rsi(regs.regs[4]);
2443         gregs.set_rdi(regs.regs[5]);
2444         gregs.set_rbp(regs.regs[6]);
2445         gregs.set_rsp(regs.regs[7]);
2446         gregs.set_r8(regs.regs[8]);
2447         gregs.set_r9(regs.regs[9]);
2448         gregs.set_r10(regs.regs[10]);
2449         gregs.set_r11(regs.regs[11]);
2450         gregs.set_r12(regs.regs[12]);
2451         gregs.set_r13(regs.regs[13]);
2452         gregs.set_r14(regs.regs[14]);
2453         gregs.set_r15(regs.regs[15]);
2454         gregs.set_rip(regs.rip);
2455         // Update the lower 32 bits of rflags.
2456         gregs.set_rflags((orig_gregs.get_rflags() & !(u32::MAX as u64)) | (regs.eflags as u64));
2457 
2458         self.set_regs(cpu_id as u8, &gregs)
2459             .map_err(DebuggableError::WriteRegs)?;
2460 
2461         // Segment registers: CS, SS, DS, ES, FS, GS
2462         // Since GDB cares only about selectors, we call get_sregs() first.
2463         let mut sregs = self
2464             .get_sregs(cpu_id as u8)
2465             .map_err(DebuggableError::ReadRegs)?;
2466         sregs.cs.selector = regs.segments.cs as u16;
2467         sregs.ss.selector = regs.segments.ss as u16;
2468         sregs.ds.selector = regs.segments.ds as u16;
2469         sregs.es.selector = regs.segments.es as u16;
2470         sregs.fs.selector = regs.segments.fs as u16;
2471         sregs.gs.selector = regs.segments.gs as u16;
2472 
2473         self.set_sregs(cpu_id as u8, &sregs)
2474             .map_err(DebuggableError::WriteRegs)?;
2475 
2476         // TODO: Add other registers
2477 
2478         Ok(())
2479     }
2480 
2481     #[cfg(target_arch = "aarch64")]
2482     fn write_regs(
2483         &self,
2484         cpu_id: usize,
2485         regs: &CoreRegs,
2486     ) -> std::result::Result<(), DebuggableError> {
2487         let mut gregs = self
2488             .get_regs(cpu_id as u8)
2489             .map_err(DebuggableError::ReadRegs)?;
2490 
2491         gregs.set_regs(regs.x);
2492         gregs.set_sp(regs.sp);
2493         gregs.set_pc(regs.pc);
2494 
2495         self.set_regs(cpu_id as u8, &gregs)
2496             .map_err(DebuggableError::WriteRegs)?;
2497 
2498         Ok(())
2499     }
2500 
2501     fn read_mem(
2502         &self,
2503         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2504         cpu_id: usize,
2505         vaddr: GuestAddress,
2506         len: usize,
2507     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2508         let mut buf = vec![0; len];
2509         let mut total_read = 0_u64;
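        // Read page-by-page: each chunk is capped to what is left of the
        // current guest page, since contiguous GVAs may map to
        // non-contiguous GPAs.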
2510 
2511         while total_read < len as u64 {
2512             let gaddr = vaddr.0 + total_read;
2513             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2514                 Ok(paddr) => paddr,
2515                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2516                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2517             };
2518             let psize = arch::PAGE_SIZE as u64;
2519             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2520             guest_memory
2521                 .memory()
2522                 .read(
2523                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2524                     GuestAddress(paddr),
2525                 )
2526                 .map_err(DebuggableError::ReadMem)?;
2527             total_read += read_len;
2528         }
2529         Ok(buf)
2530     }
2531 
2532     fn write_mem(
2533         &self,
2534         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2535         cpu_id: usize,
2536         vaddr: &GuestAddress,
2537         data: &[u8],
2538     ) -> std::result::Result<(), DebuggableError> {
2539         let mut total_written = 0_u64;
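        // Mirror read_mem: write page-by-page so each chunk stays within a
        // single guest page mapping.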
2540 
2541         while total_written < data.len() as u64 {
2542             let gaddr = vaddr.0 + total_written;
2543             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2544                 Ok(paddr) => paddr,
2545                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2546                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2547             };
2548             let psize = arch::PAGE_SIZE as u64;
2549             let write_len = std::cmp::min(
2550                 data.len() as u64 - total_written,
2551                 psize - (paddr & (psize - 1)),
2552             );
2553             guest_memory
2554                 .memory()
2555                 .write(
2556                     &data[total_written as usize..total_written as usize + write_len as usize],
2557                     GuestAddress(paddr),
2558                 )
2559                 .map_err(DebuggableError::WriteMem)?;
2560             total_written += write_len;
2561         }
2562         Ok(())
2563     }
2564 
2565     fn active_vcpus(&self) -> usize {
2566         self.present_vcpus() as usize
2567     }
2568 }
2569 
2570 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2571 impl Elf64Writable for CpuManager {}
2572 
2573 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2574 impl CpuElf64Writable for CpuManager {
2575     fn cpu_write_elf64_note(
2576         &mut self,
2577         dump_state: &DumpState,
2578     ) -> std::result::Result<(), GuestDebuggableError> {
2579         let mut coredump_file = dump_state.file.as_ref().unwrap();
2580         for vcpu in &self.vcpus {
2581             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2582             let mut pos: usize = 0;
2583             let mut buf = vec![0; note_size as usize];
2584             let descsz = size_of::<X86_64ElfPrStatus>();
2585             let vcpu_id = vcpu.lock().unwrap().id;
2586 
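            // Note layout: an Elf64_Nhdr header, the owner name ("CORE"),
            // then the PRSTATUS descriptor; each part is padded to 4-byte
            // alignment.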
2587             let note = Elf64_Nhdr {
2588                 n_namesz: COREDUMP_NAME_SIZE,
2589                 n_descsz: descsz as u32,
2590                 n_type: NT_PRSTATUS,
2591             };
2592 
2593             let bytes: &[u8] = note.as_slice();
2594             buf.splice(0.., bytes.to_vec());
2595             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2596             buf.resize(pos + 4, 0);
2597             buf.splice(pos.., "CORE".to_string().into_bytes());
2598 
2599             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2600             buf.resize(pos + 32 + 4, 0);
2601             let pid = vcpu_id as u64;
2602             let bytes: &[u8] = pid.as_slice();
2603             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2604 
2605             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2606 
2607             let orig_rax: u64 = 0;
2608             let gregs = self.vcpus[usize::from(vcpu_id)]
2609                 .lock()
2610                 .unwrap()
2611                 .vcpu
2612                 .get_regs()
2613                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2614 
2615             let regs1 = [
2616                 gregs.get_r15(),
2617                 gregs.get_r14(),
2618                 gregs.get_r13(),
2619                 gregs.get_r12(),
2620                 gregs.get_rbp(),
2621                 gregs.get_rbx(),
2622                 gregs.get_r11(),
2623                 gregs.get_r10(),
2624             ];
2625             let regs2 = [
2626                 gregs.get_r9(),
2627                 gregs.get_r8(),
2628                 gregs.get_rax(),
2629                 gregs.get_rcx(),
2630                 gregs.get_rdx(),
2631                 gregs.get_rsi(),
2632                 gregs.get_rdi(),
2633                 orig_rax,
2634             ];
2635 
2636             let sregs = self.vcpus[usize::from(vcpu_id)]
2637                 .lock()
2638                 .unwrap()
2639                 .vcpu
2640                 .get_sregs()
2641                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2642 
2643             debug!(
2644                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2645                 gregs.get_rip(),
2646                 gregs.get_rsp(),
2647                 sregs.gs.base,
2648                 sregs.cs.selector,
2649                 sregs.ss.selector,
2650                 sregs.ds.selector,
2651             );
2652 
2653             let regs = X86_64UserRegs {
2654                 regs1,
2655                 regs2,
2656                 rip: gregs.get_rip(),
2657                 cs: sregs.cs.selector as u64,
2658                 eflags: gregs.get_rflags(),
2659                 rsp: gregs.get_rsp(),
2660                 ss: sregs.ss.selector as u64,
2661                 fs_base: sregs.fs.base,
2662                 gs_base: sregs.gs.base,
2663                 ds: sregs.ds.selector as u64,
2664                 es: sregs.es.selector as u64,
2665                 fs: sregs.fs.selector as u64,
2666                 gs: sregs.gs.selector as u64,
2667             };
2668 
2670             let bytes: &[u8] = regs.as_slice();
2671             buf.resize(note_size as usize, 0);
2672             buf.splice(pos.., bytes.to_vec());
2673             buf.resize(note_size as usize, 0);
2674 
2675             coredump_file
2676                 .write(&buf)
2677                 .map_err(GuestDebuggableError::CoredumpFile)?;
2678         }
2679 
2680         Ok(())
2681     }
2682 
2683     fn cpu_write_vmm_note(
2684         &mut self,
2685         dump_state: &DumpState,
2686     ) -> std::result::Result<(), GuestDebuggableError> {
2687         let mut coredump_file = dump_state.file.as_ref().unwrap();
2688         for vcpu in &self.vcpus {
2689             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2690             let mut pos: usize = 0;
2691             let mut buf = vec![0; note_size as usize];
2692             let descsz = size_of::<DumpCpusState>();
2693             let vcpu_id = vcpu.lock().unwrap().id;
2694 
2695             let note = Elf64_Nhdr {
2696                 n_namesz: COREDUMP_NAME_SIZE,
2697                 n_descsz: descsz as u32,
2698                 n_type: 0,
2699             };
2700 
2701             let bytes: &[u8] = note.as_slice();
2702             buf.splice(0.., bytes.to_vec());
2703             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2704 
2705             buf.resize(pos + 4, 0);
2706             buf.splice(pos.., "QEMU".to_string().into_bytes());
2707 
2708             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2709 
2710             let gregs = self.vcpus[usize::from(vcpu_id)]
2711                 .lock()
2712                 .unwrap()
2713                 .vcpu
2714                 .get_regs()
2715                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2716 
2717             let regs1 = [
2718                 gregs.get_rax(),
2719                 gregs.get_rbx(),
2720                 gregs.get_rcx(),
2721                 gregs.get_rdx(),
2722                 gregs.get_rsi(),
2723                 gregs.get_rdi(),
2724                 gregs.get_rsp(),
2725                 gregs.get_rbp(),
2726             ];
2727 
2728             let regs2 = [
2729                 gregs.get_r8(),
2730                 gregs.get_r9(),
2731                 gregs.get_r10(),
2732                 gregs.get_r11(),
2733                 gregs.get_r12(),
2734                 gregs.get_r13(),
2735                 gregs.get_r14(),
2736                 gregs.get_r15(),
2737             ];
2738 
2739             let sregs = self.vcpus[usize::from(vcpu_id)]
2740                 .lock()
2741                 .unwrap()
2742                 .vcpu
2743                 .get_sregs()
2744                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2745 
2746             let mut msrs = vec![MsrEntry {
2747                 index: msr_index::MSR_KERNEL_GS_BASE,
2748                 ..Default::default()
2749             }];
2750 
2751             self.vcpus[vcpu_id as usize]
2752                 .lock()
2753                 .unwrap()
2754                 .vcpu
2755                 .get_msrs(&mut msrs)
2756                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2757             let kernel_gs_base = msrs[0].data;
2758 
2759             let cs = CpuSegment::new(sregs.cs);
2760             let ds = CpuSegment::new(sregs.ds);
2761             let es = CpuSegment::new(sregs.es);
2762             let fs = CpuSegment::new(sregs.fs);
2763             let gs = CpuSegment::new(sregs.gs);
2764             let ss = CpuSegment::new(sregs.ss);
2765             let ldt = CpuSegment::new(sregs.ldt);
2766             let tr = CpuSegment::new(sregs.tr);
2767             let gdt = CpuSegment::new_from_table(sregs.gdt);
2768             let idt = CpuSegment::new_from_table(sregs.idt);
2769             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2770             let regs = DumpCpusState {
2771                 version: 1,
2772                 size: size_of::<DumpCpusState>() as u32,
2773                 regs1,
2774                 regs2,
2775                 rip: gregs.get_rip(),
2776                 rflags: gregs.get_rflags(),
2777                 cs,
2778                 ds,
2779                 es,
2780                 fs,
2781                 gs,
2782                 ss,
2783                 ldt,
2784                 tr,
2785                 gdt,
2786                 idt,
2787                 cr,
2788                 kernel_gs_base,
2789             };
2790 
2791             let bytes: &[u8] = regs.as_slice();
            // Grow to note_size first so the name padding is zeroed and `pos`
            // is within range for the splice, write the register descriptor,
            // then pad the tail back out to note_size.
2792             buf.resize(note_size as usize, 0);
2793             buf.splice(pos.., bytes.to_vec());
2794             buf.resize(note_size as usize, 0);
2795 
            // Use write_all so a short write cannot silently truncate the note.
2796             coredump_file
2797                 .write_all(&buf)
2798                 .map_err(GuestDebuggableError::CoredumpFile)?;
2799         }
2800 
2801         Ok(())
2802     }
2803 }
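
// A minimal sketch (not part of the VMM proper) of the note-size arithmetic
// that cpu_write_vmm_note above relies on: an ELF note is an Elf64_Nhdr
// header followed by the owner name and the descriptor, each padded to a
// 4-byte boundary. The helper name is hypothetical and purely illustrative.
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
#[allow(dead_code)]
fn elf64_note_size_sketch(namesz: usize, descsz: usize) -> usize {
    // Round a length up to the next multiple of four.
    let align4 = |len: usize| (len + 3) & !3;
    size_of::<Elf64_Nhdr>() + align4(namesz) + align4(descsz)
}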
2804 
2805 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2806 #[cfg(test)]
2807 mod tests {
2808     use arch::layout::{BOOT_STACK_POINTER, ZERO_PAGE_START};
2809     use arch::x86_64::interrupts::*;
2810     use arch::x86_64::regs::*;
2811     use hypervisor::arch::x86::{FpuState, LapicState};
2812     use hypervisor::StandardRegisters;
2813     use linux_loader::loader::bootparam::setup_header;
2814 
2815     #[test]
2816     fn test_setlint() {
2817         let hv = hypervisor::new().unwrap();
2818         let vm = hv.create_vm().expect("new VM fd creation failed");
2819         assert!(hv.check_required_extensions().is_ok());
2820         // Calling get_lapic will fail if no irqchip was created beforehand.
2821         assert!(vm.create_irq_chip().is_ok());
2822         let vcpu = vm.create_vcpu(0, None).unwrap();
2823         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2824 
2825         // Compute the values that LVT0 and LVT1 are expected to hold after set_lint.
2826         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2827         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2828         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2829         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2830 
2831         set_lint(&vcpu).unwrap();
2832 
2833         // Read back the values of LVT0 and LVT1 after set_lint.
2834         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2835         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2836         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2837         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2838         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2839     }
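
    // For reference, a small sketch of the LVT update that set_lint is built
    // on: set_apic_delivery_mode is assumed to place the delivery mode in
    // bits 8..=10 of an LVT entry. Illustrative only; the authoritative
    // definition lives in arch::x86_64::interrupts.
    #[allow(dead_code)]
    fn set_apic_delivery_mode_sketch(reg: u32, mode: u32) -> u32 {
        (reg & !0x700) | ((mode & 0x7) << 8)
    }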
2840 
2841     #[test]
2842     fn test_setup_fpu() {
2843         let hv = hypervisor::new().unwrap();
2844         let vm = hv.create_vm().expect("new VM fd creation failed");
2845         let vcpu = vm.create_vcpu(0, None).unwrap();
2846         setup_fpu(&vcpu).unwrap();
2847 
2848         let expected_fpu: FpuState = FpuState {
2849             fcw: 0x37f,    // x87 control-word reset default
2850             mxcsr: 0x1f80, // MXCSR reset default
2851             ..Default::default()
2852         };
2853         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2854         // TODO: auto-generate KVM-related structures with PartialEq derived.
2855         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2856         // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything:
2857         // see 'kvm_arch_vcpu_ioctl_set_fpu' in arch/x86/kvm/x86.c. The mxcsr stays 0, so the
2858         // assert below would fail. We should decide whether to remove it altogether.
2859         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2861     }
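
    // Companion sketch of the reset defaults asserted above (illustrative):
    // FCW 0x37f masks all x87 exceptions and selects double-extended
    // precision; MXCSR 0x1f80 masks all SSE exceptions.
    #[test]
    fn test_fpu_reset_defaults_sketch() {
        let fcw: u16 = 0x7f | (0b11 << 8); // exception masks + PC = 0b11
        assert_eq!(fcw, 0x37f);
        let mxcsr: u32 = 0b111111 << 7; // IM, DM, ZM, OM, UM, PM masks
        assert_eq!(mxcsr, 0x1f80);
    }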
2862 
2863     #[test]
2864     fn test_setup_msrs() {
2865         use hypervisor::arch::x86::{msr_index, MsrEntry};
2866 
2867         let hv = hypervisor::new().unwrap();
2868         let vm = hv.create_vm().expect("new VM fd creation failed");
2869         let vcpu = vm.create_vcpu(0, None).unwrap();
2870         setup_msrs(&vcpu).unwrap();
2871 
2872         // This test will check against the last MSR entry configured (the tenth one).
2873         // See create_msr_entries for details.
2874         let mut msrs = vec![MsrEntry {
2875             index: msr_index::MSR_IA32_MISC_ENABLE,
2876             ..Default::default()
2877         }];
2878 
2879         // get_msrs returns the number of MSRs it succeeded in reading. We only want to read
2880         // one in this test.
2881         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2882         assert_eq!(read_msrs, 1);
2883 
2884         // These entries were set up by setup_msrs. Assert that the tenth one (i.e. the
2885         // one with index msr_index::MSR_IA32_MISC_ENABLE) holds the data we expect.
2887         let entry_vec = vcpu.boot_msr_entries();
2888         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2889     }
2890 
2891     #[test]
2892     fn test_setup_regs_for_pvh() {
2893         let hv = hypervisor::new().unwrap();
2894         let vm = hv.create_vm().expect("new VM fd creation failed");
2895         let vcpu = vm.create_vcpu(0, None).unwrap();
2896 
2897         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2898         expected_regs.set_rflags(0x0000000000000002u64); // bit 1 of RFLAGS is reserved and reads as 1
2899         expected_regs.set_rbx(arch::layout::PVH_INFO_START.0);
2900         expected_regs.set_rip(1);
2901 
2902         setup_regs(
2903             &vcpu,
2904             arch::EntryPoint {
2905                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2906                 setup_header: None,
2907             },
2908         )
2909         .unwrap();
2910 
2911         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2912         assert_eq!(actual_regs, expected_regs);
2913     }
2914 
2915     #[test]
2916     fn test_setup_regs_for_bzimage() {
2917         let hv = hypervisor::new().unwrap();
2918         let vm = hv.create_vm().expect("new VM fd creation failed");
2919         let vcpu = vm.create_vcpu(0, None).unwrap();
2920 
2921         let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
2922         expected_regs.set_rflags(0x0000000000000002u64);
2923         expected_regs.set_rip(1);
2924         expected_regs.set_rsp(BOOT_STACK_POINTER.0);
2925         expected_regs.set_rsi(ZERO_PAGE_START.0);
2926 
2927         setup_regs(
2928             &vcpu,
2929             arch::EntryPoint {
2930                 entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
2931                 setup_header: Some(setup_header {
2932                     ..Default::default()
2933                 }),
2934             },
2935         )
2936         .unwrap();
2937 
2938         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2939         assert_eq!(actual_regs, expected_regs);
2940     }
2941 }
2942 
2943 #[cfg(target_arch = "aarch64")]
2944 #[cfg(test)]
2945 mod tests {
2946     use std::mem;
2947 
2948     use arch::aarch64::regs;
2949     use arch::layout;
2950     use hypervisor::kvm::aarch64::is_system_register;
2951     use hypervisor::kvm::kvm_bindings::{
2952         kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE,
2953         KVM_REG_SIZE_U64,
2954     };
2955     use hypervisor::{arm64_core_reg_id, offset_of};
2956 
2957     #[test]
2958     fn test_setup_regs() {
2959         let hv = hypervisor::new().unwrap();
2960         let vm = hv.create_vm().unwrap();
2961         let vcpu = vm.create_vcpu(0, None).unwrap();
2962 
2963         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2964         // Must fail when vcpu is not initialized yet.
2965         assert!(res.is_err());
2966 
2967         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2968         vm.get_preferred_target(&mut kvi).unwrap();
2969         vcpu.vcpu_init(&kvi).unwrap();
2970 
2971         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2972     }
2973 
2974     #[test]
2975     fn test_read_mpidr() {
2976         let hv = hypervisor::new().unwrap();
2977         let vm = hv.create_vm().unwrap();
2978         let vcpu = vm.create_vcpu(0, None).unwrap();
2979         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2980         vm.get_preferred_target(&mut kvi).unwrap();
2981 
2982         // Must fail when vcpu is not initialized yet.
2983         assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());
2984 
2985         vcpu.vcpu_init(&kvi).unwrap();
        // Bit 31 of MPIDR_EL1 is RES1 and vCPU 0 has all affinity fields at
        // zero, hence the expected value 0x80000000.
2986         assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
2987     }
2988 
2989     #[test]
2990     fn test_is_system_register() {
2991         let offset = offset_of!(user_pt_regs, pc);
2992         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2993         assert!(!is_system_register(regid));
2994         let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
2995         assert!(is_system_register(regid));
2996     }
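
    // Companion sketch of the register-id composition assumed above: core
    // registers combine KVM_REG_ARM64, a size field and KVM_REG_ARM_CORE with
    // the user_pt_regs offset expressed in 32-bit words. Illustrative only.
    #[test]
    fn test_core_reg_id_composition_sketch() {
        let offset = offset_of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert_eq!(
            regid,
            KVM_REG_ARM64 | KVM_REG_SIZE_U64 | (KVM_REG_ARM_CORE as u64) | (offset / 4) as u64
        );
    }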
2997 
2998     #[test]
2999     fn test_save_restore_core_regs() {
3000         let hv = hypervisor::new().unwrap();
3001         let vm = hv.create_vm().unwrap();
3002         let vcpu = vm.create_vcpu(0, None).unwrap();
3003         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
3004         vm.get_preferred_target(&mut kvi).unwrap();
3005 
3006         // Must fail when vcpu is not initialized yet.
3007         let res = vcpu.get_regs();
3008         assert!(res.is_err());
3009         assert_eq!(
3010             format!("{}", res.unwrap_err()),
3011             "Failed to get core register: Exec format error (os error 8)"
3012         );
3013 
3014         let mut state = vcpu.create_standard_regs();
3015         let res = vcpu.set_regs(&state);
3016         assert!(res.is_err());
3017         assert_eq!(
3018             format!("{}", res.unwrap_err()),
3019             "Failed to set core register: Exec format error (os error 8)"
3020         );
3021 
3022         vcpu.vcpu_init(&kvi).unwrap();
3023         let res = vcpu.get_regs();
3024         assert!(res.is_ok());
3025         state = res.unwrap();
        // A freshly initialized vCPU runs in EL1h with D, A, I and F masked
        // (see the encoding sketch below).
3026         assert_eq!(state.get_pstate(), 0x3C5);
3027 
3028         assert!(vcpu.set_regs(&state).is_ok());
3029     }
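
    // Companion sketch (illustrative, not an upstream test): 0x3C5 is the
    // reset PSTATE assuming the usual AArch64 encoding, where M[3:0] = 0b0101
    // selects EL1h and the D, A, I and F mask bits occupy bits 9..=6.
    #[test]
    fn test_reset_pstate_encoding_sketch() {
        let el1h: u64 = 0b0101; // PSTATE.M = EL1h
        let daif_masked: u64 = 0b1111 << 6; // D, A, I and F all masked
        assert_eq!(el1h | daif_masked, 0x3C5);
    }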
3030 
3031     #[test]
3032     fn test_get_set_mpstate() {
3033         let hv = hypervisor::new().unwrap();
3034         let vm = hv.create_vm().unwrap();
3035         let vcpu = vm.create_vcpu(0, None).unwrap();
3036         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
3037         vm.get_preferred_target(&mut kvi).unwrap();
3038 
3039         let res = vcpu.get_mp_state();
3040         assert!(res.is_ok());
3041         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
3042     }
3043 }
3044