xref: /cloud-hypervisor/vmm/src/cpu.rs (revision 19d36c765fdf00be749d95b3e61028bc302d6d73)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use std::collections::BTreeMap;
15 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
16 use std::io::Write;
17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
18 use std::mem::size_of;
19 use std::os::unix::thread::JoinHandleExt;
20 use std::sync::atomic::{AtomicBool, Ordering};
21 use std::sync::{Arc, Barrier, Mutex};
22 use std::{cmp, io, result, thread};
23 
24 use acpi_tables::sdt::Sdt;
25 use acpi_tables::{aml, Aml};
26 use anyhow::anyhow;
27 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
28 use arch::aarch64::regs;
29 #[cfg(target_arch = "x86_64")]
30 use arch::x86_64::get_x2apic_id;
31 use arch::{EntryPoint, NumaNodes};
32 #[cfg(target_arch = "aarch64")]
33 use devices::gic::Gic;
34 use devices::interrupt_controller::InterruptController;
35 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
36 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
37 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
38 use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs as CoreRegs};
39 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
40 use hypervisor::arch::x86::msr_index;
41 #[cfg(target_arch = "x86_64")]
42 use hypervisor::arch::x86::CpuIdEntry;
43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
44 use hypervisor::arch::x86::MsrEntry;
45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
46 use hypervisor::arch::x86::SpecialRegisters;
47 #[cfg(target_arch = "aarch64")]
48 use hypervisor::kvm::kvm_bindings;
49 #[cfg(all(target_arch = "aarch64", feature = "kvm"))]
50 use hypervisor::kvm::kvm_ioctls::Cap;
51 #[cfg(feature = "tdx")]
52 use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
53 #[cfg(target_arch = "x86_64")]
54 use hypervisor::CpuVendor;
55 #[cfg(feature = "kvm")]
56 use hypervisor::HypervisorType;
57 #[cfg(feature = "guest_debug")]
58 use hypervisor::StandardRegisters;
59 use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
60 use libc::{c_void, siginfo_t};
61 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
62 use linux_loader::elf::Elf64_Nhdr;
63 use seccompiler::{apply_filter, SeccompAction};
64 use thiserror::Error;
65 use tracer::trace_scoped;
66 use vm_device::BusDevice;
67 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
68 use vm_memory::ByteValued;
69 #[cfg(feature = "guest_debug")]
70 use vm_memory::{Bytes, GuestAddressSpace};
71 use vm_memory::{GuestAddress, GuestMemoryAtomic};
72 use vm_migration::{
73     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable,
74     Transportable,
75 };
76 use vmm_sys_util::eventfd::EventFd;
77 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
78 use zerocopy::AsBytes;
79 
80 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
81 use crate::coredump::{
82     CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
83     GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
84     NT_PRSTATUS,
85 };
86 #[cfg(feature = "guest_debug")]
87 use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
88 #[cfg(target_arch = "x86_64")]
89 use crate::memory_manager::MemoryManager;
90 use crate::seccomp_filters::{get_seccomp_filter, Thread};
91 #[cfg(target_arch = "x86_64")]
92 use crate::vm::physical_bits;
93 use crate::vm_config::CpusConfig;
94 use crate::{GuestMemoryMmap, CPU_MANAGER_SNAPSHOT_ID};
95 
96 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
97 /// Extract the specified bits of a 64-bit integer.
98 /// For example, to extract 2 bits from offset 1 (zero based) of `6u64`,
99 /// the following expression should return 3 (`0b11`):
100 /// `extract_bits_64!(0b0000_0110u64, 1, 2)`
101 ///
102 macro_rules! extract_bits_64 {
103     ($value: tt, $offset: tt, $length: tt) => {
104         ($value >> $offset) & (!0u64 >> (64 - $length))
105     };
106 }
107 
108 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
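/// Extract the lowest `$length` bits of a 64-bit integer, i.e. like
/// `extract_bits_64!` with an offset of 0. For example,
/// `extract_bits_64_without_offset!(0b0110u64, 2)` returns 2 (`0b10`).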
109 macro_rules! extract_bits_64_without_offset {
110     ($value: tt, $length: tt) => {
111         $value & (!0u64 >> (64 - $length))
112     };
113 }
114 
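// Size of the MMIO window through which the guest's ACPI code reaches the
// CpuManager device (see the BusDevice implementation further down).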
115 pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
116 
117 #[derive(Debug, Error)]
118 pub enum Error {
119     #[error("Error creating vCPU: {0}")]
120     VcpuCreate(#[source] anyhow::Error),
121 
122     #[error("Error running bCPU: {0}")]
123     VcpuRun(#[source] anyhow::Error),
124 
125     #[error("Error spawning vCPU thread: {0}")]
126     VcpuSpawn(#[source] io::Error),
127 
128     #[error("Error generating common CPUID: {0}")]
129     CommonCpuId(#[source] arch::Error),
130 
131     #[error("Error configuring vCPU: {0}")]
132     VcpuConfiguration(#[source] arch::Error),
133 
134     #[error("Still pending removed vcpu")]
135     VcpuPendingRemovedVcpu,
136 
137     #[cfg(target_arch = "aarch64")]
138     #[error("Error fetching preferred target: {0}")]
139     VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),
140 
141     #[cfg(target_arch = "aarch64")]
142     #[error("Error initialising vCPU: {0}")]
143     VcpuArmInit(#[source] hypervisor::HypervisorCpuError),
144 
145     #[cfg(target_arch = "aarch64")]
146     #[error("Error finalising vCPU: {0}")]
147     VcpuArmFinalize(#[source] hypervisor::HypervisorCpuError),
148 
149     #[error("Failed to join on vCPU threads: {0:?}")]
150     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
151 
152     #[error("Error adding CpuManager to MMIO bus: {0}")]
153     BusError(#[source] vm_device::BusError),
154 
155     #[error("Requested vCPUs exceed maximum")]
156     DesiredVCpuCountExceedsMax,
157 
158     #[error("Cannot create seccomp filter: {0}")]
159     CreateSeccompFilter(#[source] seccompiler::Error),
160 
161     #[error("Cannot apply seccomp filter: {0}")]
162     ApplySeccompFilter(#[source] seccompiler::Error),
163 
164     #[error("Error starting vCPU after restore: {0}")]
165     StartRestoreVcpu(#[source] anyhow::Error),
166 
167     #[error("Unexpected VmExit")]
168     UnexpectedVmExit,
169 
170     #[error("Failed to allocate MMIO address for CpuManager")]
171     AllocateMmmioAddress,
172 
173     #[cfg(feature = "tdx")]
174     #[error("Error initializing TDX: {0}")]
175     InitializeTdx(#[source] hypervisor::HypervisorCpuError),
176 
177     #[cfg(target_arch = "aarch64")]
178     #[error("Error initializing PMU: {0}")]
179     InitPmu(#[source] hypervisor::HypervisorCpuError),
180 
181     #[cfg(feature = "guest_debug")]
182     #[error("Error during CPU debug: {0}")]
183     CpuDebug(#[source] hypervisor::HypervisorCpuError),
184 
185     #[cfg(feature = "guest_debug")]
186     #[error("Error translating virtual address: {0}")]
187     TranslateVirtualAddress(#[source] anyhow::Error),
188 
189     #[cfg(target_arch = "x86_64")]
190     #[error("Error setting up AMX: {0}")]
191     AmxEnable(#[source] anyhow::Error),
192 
193     #[error("Maximum number of vCPUs exceeds host limit")]
194     MaximumVcpusExceeded,
195 
196     #[cfg(feature = "sev_snp")]
197     #[error("Failed to set sev control register: {0}")]
198     SetSevControlRegister(#[source] hypervisor::HypervisorCpuError),
199 
200     #[cfg(target_arch = "x86_64")]
201     #[error("Failed to inject NMI")]
202     NmiError(hypervisor::HypervisorCpuError),
203 }
204 pub type Result<T> = result::Result<T, Error>;
205 
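// MADT Processor Local x2APIC structure (type 9); one entry per vCPU is
// appended by create_madt() below.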
206 #[cfg(target_arch = "x86_64")]
207 #[allow(dead_code)]
208 #[repr(packed)]
209 #[derive(AsBytes)]
210 struct LocalX2Apic {
211     pub r#type: u8,
212     pub length: u8,
213     pub _reserved: u16,
214     pub apic_id: u32,
215     pub flags: u32,
216     pub processor_id: u32,
217 }
218 
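// MADT I/O APIC structure (type 1).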
219 #[allow(dead_code)]
220 #[repr(packed)]
221 #[derive(Default, AsBytes)]
222 struct Ioapic {
223     pub r#type: u8,
224     pub length: u8,
225     pub ioapic_id: u8,
226     _reserved: u8,
227     pub apic_address: u32,
228     pub gsi_base: u32,
229 }
230 
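// MADT GIC CPU Interface (GICC) structure (type 0xB); see section 5.2.12.14
// of the ACPI specification.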
231 #[cfg(target_arch = "aarch64")]
232 #[allow(dead_code)]
233 #[repr(packed)]
234 #[derive(AsBytes)]
235 struct GicC {
236     pub r#type: u8,
237     pub length: u8,
238     pub reserved0: u16,
239     pub cpu_interface_number: u32,
240     pub uid: u32,
241     pub flags: u32,
242     pub parking_version: u32,
243     pub performance_interrupt: u32,
244     pub parked_address: u64,
245     pub base_address: u64,
246     pub gicv_base_address: u64,
247     pub gich_base_address: u64,
248     pub vgic_interrupt: u32,
249     pub gicr_base_address: u64,
250     pub mpidr: u64,
251     pub proc_power_effi_class: u8,
252     pub reserved1: u8,
253     pub spe_overflow_interrupt: u16,
254 }
255 
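// MADT GIC Distributor (GICD) structure (type 0xC).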
256 #[cfg(target_arch = "aarch64")]
257 #[allow(dead_code)]
258 #[repr(packed)]
259 #[derive(AsBytes)]
260 struct GicD {
261     pub r#type: u8,
262     pub length: u8,
263     pub reserved0: u16,
264     pub gic_id: u32,
265     pub base_address: u64,
266     pub global_irq_base: u32,
267     pub version: u8,
268     pub reserved1: [u8; 3],
269 }
270 
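// MADT GIC Redistributor (GICR) structure (type 0xE).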
271 #[cfg(target_arch = "aarch64")]
272 #[allow(dead_code)]
273 #[repr(packed)]
274 #[derive(AsBytes)]
275 struct GicR {
276     pub r#type: u8,
277     pub length: u8,
278     pub reserved: u16,
279     pub base_address: u64,
280     pub range_length: u32,
281 }
282 
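// MADT GIC Interrupt Translation Service (ITS) structure (type 0xF).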
283 #[cfg(target_arch = "aarch64")]
284 #[allow(dead_code)]
285 #[repr(packed)]
286 #[derive(AsBytes)]
287 struct GicIts {
288     pub r#type: u8,
289     pub length: u8,
290     pub reserved0: u16,
291     pub translation_id: u32,
292     pub base_address: u64,
293     pub reserved1: u32,
294 }
295 
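// Processor hierarchy node structure (type 0), used for the PPTT
// (processor topology) table rather than the MADT.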
296 #[cfg(target_arch = "aarch64")]
297 #[allow(dead_code)]
298 #[repr(packed)]
299 #[derive(AsBytes)]
300 struct ProcessorHierarchyNode {
301     pub r#type: u8,
302     pub length: u8,
303     pub reserved: u16,
304     pub flags: u32,
305     pub parent: u32,
306     pub acpi_processor_id: u32,
307     pub num_private_resources: u32,
308 }
309 
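// MADT Interrupt Source Override structure (type 2).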
310 #[allow(dead_code)]
311 #[repr(packed)]
312 #[derive(Default, AsBytes)]
313 struct InterruptSourceOverride {
314     pub r#type: u8,
315     pub length: u8,
316     pub bus: u8,
317     pub source: u8,
318     pub gsi: u32,
319     pub flags: u16,
320 }
321 
322 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
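// Rounds `$n` up to the next multiple of `$d`; used below when sizing ELF
// note records for guest core dumps, e.g. `round_up!(10, 4)` yields 12.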
323 macro_rules! round_up {
324     ($n:expr,$d:expr) => {
325         (($n + $d - 1) / $d) * $d
326     };
327 }
328 
329 /// A wrapper around creating and using a hypervisor-backed vCPU.
330 pub struct Vcpu {
331     // The hypervisor abstracted CPU.
332     vcpu: Arc<dyn hypervisor::Vcpu>,
333     id: u8,
334     #[cfg(target_arch = "aarch64")]
335     mpidr: u64,
336     saved_state: Option<CpuState>,
337     #[cfg(target_arch = "x86_64")]
338     vendor: CpuVendor,
339 }
340 
341 impl Vcpu {
342     /// Constructs a new VCPU for `vm`.
343     ///
344     /// # Arguments
345     ///
346     /// * `id` - Represents the CPU number between [0, max vcpus).
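    /// * `apic_id` - APIC ID used to create the vCPU (on x86_64 this is the x2APIC ID).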
347     /// * `vm` - The virtual machine this vcpu will get attached to.
348     /// * `vm_ops` - Optional object for exit handling.
349     /// * `cpu_vendor` - CPU vendor as reported by __cpuid(0x0)
350     pub fn new(
351         id: u8,
352         apic_id: u8,
353         vm: &Arc<dyn hypervisor::Vm>,
354         vm_ops: Option<Arc<dyn VmOps>>,
355         #[cfg(target_arch = "x86_64")] cpu_vendor: CpuVendor,
356     ) -> Result<Self> {
357         let vcpu = vm
358             .create_vcpu(apic_id, vm_ops)
359             .map_err(|e| Error::VcpuCreate(e.into()))?;
360         // Initially the cpuid per vCPU is the one supported by this VM.
361         Ok(Vcpu {
362             vcpu,
363             id,
364             #[cfg(target_arch = "aarch64")]
365             mpidr: 0,
366             saved_state: None,
367             #[cfg(target_arch = "x86_64")]
368             vendor: cpu_vendor,
369         })
370     }
371 
372     /// Configures a vCPU. This should be called once per vCPU, after it is created.
373     ///
374     /// # Arguments
375     ///
376     /// * `boot_setup` - Optional kernel entry point (with the boot protocol to
377     ///   use) and the guest memory in which it resides.
378     /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
379     pub fn configure(
380         &mut self,
381         #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
382         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
383         #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
384         #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
385         #[cfg(target_arch = "x86_64")] topology: Option<(u8, u8, u8)>,
386     ) -> Result<()> {
387         #[cfg(target_arch = "aarch64")]
388         {
389             self.init(vm)?;
390             self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, boot_setup)
391                 .map_err(Error::VcpuConfiguration)?;
392         }
393         info!("Configuring vCPU: cpu_id = {}", self.id);
394         #[cfg(target_arch = "x86_64")]
395         arch::configure_vcpu(
396             &self.vcpu,
397             self.id,
398             boot_setup,
399             cpuid,
400             kvm_hyperv,
401             self.vendor,
402             topology,
403         )
404         .map_err(Error::VcpuConfiguration)?;
405 
406         Ok(())
407     }
408 
409     /// Gets the MPIDR register value.
410     #[cfg(target_arch = "aarch64")]
411     pub fn get_mpidr(&self) -> u64 {
412         self.mpidr
413     }
414 
415     /// Gets the saved vCPU state.
416     #[cfg(target_arch = "aarch64")]
417     pub fn get_saved_state(&self) -> Option<CpuState> {
418         self.saved_state.clone()
419     }
420 
421     /// Initializes an aarch64 specific vcpu for booting Linux.
422     #[cfg(target_arch = "aarch64")]
423     pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
424         use std::arch::is_aarch64_feature_detected;
425         let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();
426         #[allow(clippy::nonminimal_bool)]
427         let sve_supported =
428             is_aarch64_feature_detected!("sve") || is_aarch64_feature_detected!("sve2");
429         // This reads back the kernel's preferred target type.
430         vm.get_preferred_target(&mut kvi)
431             .map_err(Error::VcpuArmPreferredTarget)?;
432         // We already checked that the capability is supported.
433         kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
434         if vm
435             .as_any()
436             .downcast_ref::<hypervisor::kvm::KvmVm>()
437             .unwrap()
438             .check_extension(Cap::ArmPmuV3)
439         {
440             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
441         }
442 
443         if sve_supported
444             && vm
445                 .as_any()
446                 .downcast_ref::<hypervisor::kvm::KvmVm>()
447                 .unwrap()
448                 .check_extension(Cap::ArmSve)
449         {
450             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_SVE;
451         }
452 
453         // Non-boot cpus are powered off initially.
454         if self.id > 0 {
455             kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
456         }
457         self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)?;
458         if sve_supported {
459             self.vcpu
460                 .vcpu_finalize(kvm_bindings::KVM_ARM_VCPU_SVE as i32)
461                 .map_err(Error::VcpuArmFinalize)?;
462         }
463         Ok(())
464     }
465 
466     /// Runs the VCPU until it exits, returning the reason.
467     ///
468     /// Note that the state of the VCPU and associated VM must be setup first for this to do
469     /// anything useful.
470     pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
471         self.vcpu.run()
472     }
473 
474     #[cfg(feature = "sev_snp")]
475     pub fn set_sev_control_register(&self, vmsa_pfn: u64) -> Result<()> {
476         self.vcpu
477             .set_sev_control_register(vmsa_pfn)
478             .map_err(Error::SetSevControlRegister)
479     }
480 }
481 
482 impl Pausable for Vcpu {}
483 impl Snapshottable for Vcpu {
484     fn id(&self) -> String {
485         self.id.to_string()
486     }
487 
488     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
489         let saved_state = self
490             .vcpu
491             .state()
492             .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get vCPU state {:?}", e)))?;
493 
494         self.saved_state = Some(saved_state.clone());
495 
496         Ok(Snapshot::from_data(SnapshotData::new_from_state(
497             &saved_state,
498         )?))
499     }
500 }
501 
502 pub struct CpuManager {
503     config: CpusConfig,
504     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
505     interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
506     #[cfg(target_arch = "x86_64")]
507     cpuid: Vec<CpuIdEntry>,
508     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
509     vm: Arc<dyn hypervisor::Vm>,
510     vcpus_kill_signalled: Arc<AtomicBool>,
511     vcpus_pause_signalled: Arc<AtomicBool>,
512     vcpus_kick_signalled: Arc<AtomicBool>,
513     exit_evt: EventFd,
514     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
515     reset_evt: EventFd,
516     #[cfg(feature = "guest_debug")]
517     vm_debug_evt: EventFd,
518     vcpu_states: Vec<VcpuState>,
519     selected_cpu: u8,
520     vcpus: Vec<Arc<Mutex<Vcpu>>>,
521     seccomp_action: SeccompAction,
522     vm_ops: Arc<dyn VmOps>,
523     #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
524     acpi_address: Option<GuestAddress>,
525     proximity_domain_per_cpu: BTreeMap<u8, u32>,
526     affinity: BTreeMap<u8, Vec<usize>>,
527     dynamic: bool,
528     hypervisor: Arc<dyn hypervisor::Hypervisor>,
529     #[cfg(feature = "sev_snp")]
530     sev_snp_enabled: bool,
531 }
532 
533 const CPU_ENABLE_FLAG: usize = 0;
534 const CPU_INSERTING_FLAG: usize = 1;
535 const CPU_REMOVING_FLAG: usize = 2;
536 const CPU_EJECT_FLAG: usize = 3;
537 
538 const CPU_STATUS_OFFSET: u64 = 4;
539 const CPU_SELECTION_OFFSET: u64 = 0;
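// Together with the flag bits above, these offsets define the guest-visible
// register layout served by the BusDevice implementation below: byte 0
// selects a vCPU and byte 4 holds that vCPU's status/control bits.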
540 
541 impl BusDevice for CpuManager {
542     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
543         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
544         data.fill(0);
545 
546         match offset {
547             CPU_SELECTION_OFFSET => {
548                 data[0] = self.selected_cpu;
549             }
550             CPU_STATUS_OFFSET => {
551                 if self.selected_cpu < self.max_vcpus() {
552                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
553                     if state.active() {
554                         data[0] |= 1 << CPU_ENABLE_FLAG;
555                     }
556                     if state.inserting {
557                         data[0] |= 1 << CPU_INSERTING_FLAG;
558                     }
559                     if state.removing {
560                         data[0] |= 1 << CPU_REMOVING_FLAG;
561                     }
562                 } else {
563                     warn!("Out of range vCPU id: {}", self.selected_cpu);
564                 }
565             }
566             _ => {
567                 warn!(
568                     "Unexpected offset for accessing CPU manager device: {:#}",
569                     offset
570                 );
571             }
572         }
573     }
574 
575     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
576         match offset {
577             CPU_SELECTION_OFFSET => {
578                 self.selected_cpu = data[0];
579             }
580             CPU_STATUS_OFFSET => {
581                 if self.selected_cpu < self.max_vcpus() {
582                     let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
583                     // The ACPI code writes back a 1 to acknowledge the insertion
584                     if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
585                         && state.inserting
586                     {
587                         state.inserting = false;
588                     }
589                     // Ditto for removal
590                     if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
591                         && state.removing
592                     {
593                         state.removing = false;
594                     }
595                     // Trigger removal of vCPU
596                     if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
597                         if let Err(e) = self.remove_vcpu(self.selected_cpu) {
598                             error!("Error removing vCPU: {:?}", e);
599                         }
600                     }
601                 } else {
602                     warn!("Out of range vCPU id: {}", self.selected_cpu);
603                 }
604             }
605             _ => {
606                 warn!(
607                     "Unexpected offset for accessing CPU manager device: {:#}",
608                     offset
609                 );
610             }
611         }
612         None
613     }
614 }
615 
616 #[derive(Default)]
617 struct VcpuState {
618     inserting: bool,
619     removing: bool,
620     pending_removal: Arc<AtomicBool>,
621     handle: Option<thread::JoinHandle<()>>,
622     kill: Arc<AtomicBool>,
623     vcpu_run_interrupted: Arc<AtomicBool>,
624     paused: Arc<AtomicBool>,
625 }
626 
627 impl VcpuState {
628     fn active(&self) -> bool {
629         self.handle.is_some()
630     }
631 
632     fn signal_thread(&self) {
633         if let Some(handle) = self.handle.as_ref() {
634             loop {
635                 // SAFETY: FFI call with correct arguments
636                 unsafe {
637                     libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
638                 }
639                 if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
640                     break;
641                 } else {
642                     // This is more effective than thread::yield_now() at
643                     // avoiding a priority inversion with the vCPU thread
644                     thread::sleep(std::time::Duration::from_millis(1));
645                 }
646             }
647         }
648     }
649 
650     fn join_thread(&mut self) -> Result<()> {
651         if let Some(handle) = self.handle.take() {
652             handle.join().map_err(Error::ThreadCleanup)?
653         }
654 
655         Ok(())
656     }
657 
658     fn unpark_thread(&self) {
659         if let Some(handle) = self.handle.as_ref() {
660             handle.thread().unpark()
661         }
662     }
663 }
664 
665 impl CpuManager {
666     #[allow(unused_variables)]
667     #[allow(clippy::too_many_arguments)]
668     pub fn new(
669         config: &CpusConfig,
670         vm: Arc<dyn hypervisor::Vm>,
671         exit_evt: EventFd,
672         reset_evt: EventFd,
673         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
674         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
675         seccomp_action: SeccompAction,
676         vm_ops: Arc<dyn VmOps>,
677         #[cfg(feature = "tdx")] tdx_enabled: bool,
678         numa_nodes: &NumaNodes,
679         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
680     ) -> Result<Arc<Mutex<CpuManager>>> {
681         if u32::from(config.max_vcpus) > hypervisor.get_max_vcpus() {
682             return Err(Error::MaximumVcpusExceeded);
683         }
684 
685         let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
686         vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
687         let hypervisor_type = hypervisor.hypervisor_type();
688         #[cfg(target_arch = "x86_64")]
689         let cpu_vendor = hypervisor.get_cpu_vendor();
690 
691         #[cfg(target_arch = "x86_64")]
692         if config.features.amx {
693             const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
694             const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
695             const XFEATURE_XTILEDATA: usize = 18;
696             const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;
697 
698             // SAFETY: the syscall is only modifying kernel internal
699             // data structures that the kernel is itself expected to safeguard.
700             let amx_tile = unsafe {
701                 libc::syscall(
702                     libc::SYS_arch_prctl,
703                     ARCH_REQ_XCOMP_GUEST_PERM,
704                     XFEATURE_XTILEDATA,
705                 )
706             };
707 
708             if amx_tile != 0 {
709                 return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
710             } else {
711                 let mask: usize = 0;
712                 // SAFETY: the mask being modified (not marked mutable, as it is
713                 // only modified within the unsafe block, which is permitted) isn't in use elsewhere.
714                 let result = unsafe {
715                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
716                 };
717                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
718                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
719                 }
720             }
721         }
722 
723         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
724             let mut cpu_list = Vec::new();
725             for (proximity_domain, numa_node) in numa_nodes.iter() {
726                 for cpu in numa_node.cpus.iter() {
727                     cpu_list.push((*cpu, *proximity_domain))
728                 }
729             }
730             cpu_list
731         }
732         .into_iter()
733         .collect();
734 
735         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
736             cpu_affinity
737                 .iter()
738                 .map(|a| (a.vcpu, a.host_cpus.clone()))
739                 .collect()
740         } else {
741             BTreeMap::new()
742         };
743 
744         #[cfg(feature = "tdx")]
745         let dynamic = !tdx_enabled;
746         #[cfg(not(feature = "tdx"))]
747         let dynamic = true;
748 
749         Ok(Arc::new(Mutex::new(CpuManager {
750             config: config.clone(),
751             interrupt_controller: None,
752             #[cfg(target_arch = "x86_64")]
753             cpuid: Vec::new(),
754             vm,
755             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
756             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
757             vcpus_kick_signalled: Arc::new(AtomicBool::new(false)),
758             vcpu_states,
759             exit_evt,
760             reset_evt,
761             #[cfg(feature = "guest_debug")]
762             vm_debug_evt,
763             selected_cpu: 0,
764             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
765             seccomp_action,
766             vm_ops,
767             acpi_address: None,
768             proximity_domain_per_cpu,
769             affinity,
770             dynamic,
771             hypervisor: hypervisor.clone(),
772             #[cfg(feature = "sev_snp")]
773             sev_snp_enabled,
774         })))
775     }
776 
777     #[cfg(target_arch = "x86_64")]
778     pub fn populate_cpuid(
779         &mut self,
780         memory_manager: &Arc<Mutex<MemoryManager>>,
781         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
782         #[cfg(feature = "tdx")] tdx: bool,
783     ) -> Result<()> {
784         let sgx_epc_sections = memory_manager
785             .lock()
786             .unwrap()
787             .sgx_epc_region()
788             .as_ref()
789             .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
790 
791         self.cpuid = {
792             let phys_bits = physical_bits(hypervisor, self.config.max_phys_bits);
793             arch::generate_common_cpuid(
794                 hypervisor,
795                 &arch::CpuidConfig {
796                     sgx_epc_sections,
797                     phys_bits,
798                     kvm_hyperv: self.config.kvm_hyperv,
799                     #[cfg(feature = "tdx")]
800                     tdx,
801                     amx: self.config.features.amx,
802                 },
803             )
804             .map_err(Error::CommonCpuId)?
805         };
806 
807         Ok(())
808     }
809 
810     fn create_vcpu(&mut self, cpu_id: u8, snapshot: Option<Snapshot>) -> Result<Arc<Mutex<Vcpu>>> {
811         info!("Creating vCPU: cpu_id = {}", cpu_id);
812 
813         #[cfg(target_arch = "x86_64")]
814         let topology = self.get_vcpu_topology();
815         #[cfg(target_arch = "x86_64")]
816         let x2apic_id = arch::x86_64::get_x2apic_id(cpu_id as u32, topology);
817         #[cfg(target_arch = "aarch64")]
818         let x2apic_id = cpu_id as u32;
819 
820         let mut vcpu = Vcpu::new(
821             cpu_id,
822             x2apic_id as u8,
823             &self.vm,
824             Some(self.vm_ops.clone()),
825             #[cfg(target_arch = "x86_64")]
826             self.hypervisor.get_cpu_vendor(),
827         )?;
828 
829         if let Some(snapshot) = snapshot {
830             // AArch64 vCPUs must be initialized after creation.
831             #[cfg(target_arch = "aarch64")]
832             vcpu.init(&self.vm)?;
833 
834             let state: CpuState = snapshot.to_state().map_err(|e| {
835                 Error::VcpuCreate(anyhow!("Could not get vCPU state from snapshot {:?}", e))
836             })?;
837             vcpu.vcpu
838                 .set_state(&state)
839                 .map_err(|e| Error::VcpuCreate(anyhow!("Could not set the vCPU state {:?}", e)))?;
840 
841             vcpu.saved_state = Some(state);
842         }
843 
844         let vcpu = Arc::new(Mutex::new(vcpu));
845 
846         // Adding vCPU to the CpuManager's vCPU list.
847         self.vcpus.push(vcpu.clone());
848 
849         Ok(vcpu)
850     }
851 
852     pub fn configure_vcpu(
853         &self,
854         vcpu: Arc<Mutex<Vcpu>>,
855         boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
856     ) -> Result<()> {
857         let mut vcpu = vcpu.lock().unwrap();
858 
859         #[cfg(feature = "sev_snp")]
860         if self.sev_snp_enabled {
861             if let Some((kernel_entry_point, _)) = boot_setup {
862                 vcpu.set_sev_control_register(
863                     kernel_entry_point.entry_addr.0 / crate::igvm::HV_PAGE_SIZE,
864                 )?;
865             }
866 
867             // Traditional way to configure vcpu doesn't work for SEV-SNP guests.
868             // All the vCPU configuration for SEV-SNP guest is provided via VMSA.
869             return Ok(());
870         }
871 
872         #[cfg(target_arch = "x86_64")]
873         assert!(!self.cpuid.is_empty());
874 
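        // Default topology when none is configured: 1 thread per core,
        // `boot_vcpus` cores per die, and 1 die per package.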
875         #[cfg(target_arch = "x86_64")]
876         let topology = self.config.topology.clone().map_or_else(
877             || Some((1, self.boot_vcpus(), 1)),
878             |t| Some((t.threads_per_core, t.cores_per_die, t.dies_per_package)),
879         );
880         #[cfg(target_arch = "x86_64")]
881         vcpu.configure(
882             boot_setup,
883             self.cpuid.clone(),
884             self.config.kvm_hyperv,
885             topology,
886         )?;
887 
888         #[cfg(target_arch = "aarch64")]
889         vcpu.configure(&self.vm, boot_setup)?;
890 
891         Ok(())
892     }
893 
894     /// Only create new vCPUs if there aren't any inactive ones to reuse
895     fn create_vcpus(
896         &mut self,
897         desired_vcpus: u8,
898         snapshot: Option<Snapshot>,
899     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
900         let mut vcpus: Vec<Arc<Mutex<Vcpu>>> = vec![];
901         info!(
902             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
903             desired_vcpus,
904             self.config.max_vcpus,
905             self.vcpus.len(),
906             self.present_vcpus()
907         );
908 
909         if desired_vcpus > self.config.max_vcpus {
910             return Err(Error::DesiredVCpuCountExceedsMax);
911         }
912 
913         // Only create vCPUs in excess of all the allocated vCPUs.
914         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
915             vcpus.push(self.create_vcpu(
916                 cpu_id,
917                 // TODO: The special format of the CPU id can be removed once
918                 // ready to break live upgrade.
919                 snapshot_from_id(snapshot.as_ref(), cpu_id.to_string().as_str()),
920             )?);
921         }
922 
923         Ok(vcpus)
924     }
925 
926     #[cfg(target_arch = "aarch64")]
927     pub fn init_pmu(&self, irq: u32) -> Result<bool> {
928         for cpu in self.vcpus.iter() {
929             let cpu = cpu.lock().unwrap();
930             // Check whether the PMU attribute is available; if not, log it and return.
931             if cpu.vcpu.has_pmu_support() {
932                 cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
933             } else {
934                 debug!(
935                     "PMU attribute is not supported in vCPU{}, skip PMU init!",
936                     cpu.id
937                 );
938                 return Ok(false);
939             }
940         }
941 
942         Ok(true)
943     }
944 
945     pub fn vcpus(&self) -> Vec<Arc<Mutex<Vcpu>>> {
946         self.vcpus.clone()
947     }
948 
949     fn start_vcpu(
950         &mut self,
951         vcpu: Arc<Mutex<Vcpu>>,
952         vcpu_id: u8,
953         vcpu_thread_barrier: Arc<Barrier>,
954         inserting: bool,
955     ) -> Result<()> {
956         let reset_evt = self.reset_evt.try_clone().unwrap();
957         let exit_evt = self.exit_evt.try_clone().unwrap();
958         #[cfg(feature = "kvm")]
959         let hypervisor_type = self.hypervisor.hypervisor_type();
960         #[cfg(feature = "guest_debug")]
961         let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
962         let panic_exit_evt = self.exit_evt.try_clone().unwrap();
963         let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
964         let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();
965         let vcpu_kick_signalled = self.vcpus_kick_signalled.clone();
966 
967         let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
968         let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
969             .vcpu_run_interrupted
970             .clone();
971         let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();
972         let vcpu_paused = self.vcpu_states[usize::from(vcpu_id)].paused.clone();
973 
974         // Prepare the CPU set the current vCPU is expected to run on.
975         let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
976             // SAFETY: all zeros is a valid pattern
977             let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
978             // SAFETY: FFI call, trivially safe
979             unsafe { libc::CPU_ZERO(&mut cpuset) };
980             for host_cpu in host_cpus {
981                 // SAFETY: FFI call, trivially safe
982                 unsafe { libc::CPU_SET(*host_cpu, &mut cpuset) };
983             }
984             cpuset
985         });
986 
987         // Retrieve seccomp filter for vcpu thread
988         let vcpu_seccomp_filter = get_seccomp_filter(
989             &self.seccomp_action,
990             Thread::Vcpu,
991             self.hypervisor.hypervisor_type(),
992         )
993         .map_err(Error::CreateSeccompFilter)?;
994 
995         #[cfg(target_arch = "x86_64")]
996         let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();
997 
998         info!("Starting vCPU: cpu_id = {}", vcpu_id);
999 
1000         let handle = Some(
1001             thread::Builder::new()
1002                 .name(format!("vcpu{vcpu_id}"))
1003                 .spawn(move || {
1004                     // Schedule the thread to run on the expected CPU set
1005                     if let Some(cpuset) = cpuset.as_ref() {
1006                         // SAFETY: FFI call with correct arguments
1007                         let ret = unsafe {
1008                             libc::sched_setaffinity(
1009                                 0,
1010                                 std::mem::size_of::<libc::cpu_set_t>(),
1011                                 cpuset as *const libc::cpu_set_t,
1012                             )
1013                         };
1014 
1015                         if ret != 0 {
1016                             error!(
1017                                 "Failed scheduling the vCPU {} on the expected CPU set: {}",
1018                                 vcpu_id,
1019                                 io::Error::last_os_error()
1020                             );
1021                             return;
1022                         }
1023                     }
1024 
1025                     // Apply seccomp filter for vcpu thread.
1026                     if !vcpu_seccomp_filter.is_empty() {
1027                         if let Err(e) =
1028                             apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
1029                         {
1030                             error!("Error applying seccomp filter: {:?}", e);
1031                             return;
1032                         }
1033                     }
1034                     extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
1035                     // This registers a no-op async-signal-safe handler; the signal is only used to interrupt the vCPU thread out of KVM_RUN.
1036                     register_signal_handler(SIGRTMIN(), handle_signal)
1037                         .expect("Failed to register vcpu signal handler");
1038                     // Block until all CPUs are ready.
1039                     vcpu_thread_barrier.wait();
1040 
1041                     std::panic::catch_unwind(move || {
1042                         loop {
1043                             // If we are being told to pause, we park the thread
1044                             // until the pause boolean is toggled.
1045                             // The resume operation is responsible for toggling
1046                             // the boolean and unpark the thread.
1047                             // We enter a loop because park() could spuriously
1048                             // return. We will then park() again unless the
1049                             // pause boolean has been toggled.
1050 
1051                             // Need to use Ordering::SeqCst as we have multiple
1052                             // loads and stores to different atomics and we need
1053                             // to see them in a consistent order in all threads
1054 
1055                             if vcpu_pause_signalled.load(Ordering::SeqCst) {
1056                                 // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
1057                                 // completed by returning to KVM_RUN. From the kernel docs:
1058                                 //
1059                                 // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
1060                                 // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
1061                                 // operations are complete (and guest state is consistent) only after userspace
1062                                 // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
1063                                 // incomplete operations and then check for pending signals.
1064                                 // The pending state of the operation is not preserved in state which is
1065                                 // visible to userspace, thus userspace should ensure that the operation is
1066                                 // completed before performing a live migration.  Userspace can re-enter the
1067                                 // guest with an unmasked signal pending or with the immediate_exit field set
1068                                 // to complete pending operations without allowing any further instructions
1069                                 // to be executed.
1070 
1071                                 #[cfg(feature = "kvm")]
1072                                 if matches!(hypervisor_type, HypervisorType::Kvm) {
1073                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
1074                                     if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
1075                                         error!("Unexpected VM exit on \"immediate_exit\" run");
1076                                         break;
1077                                     }
1078                                     vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
1079                                 }
1080 
1081                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1082 
1083                                 vcpu_paused.store(true, Ordering::SeqCst);
1084                                 while vcpu_pause_signalled.load(Ordering::SeqCst) {
1085                                     thread::park();
1086                                 }
1087                                 vcpu_run_interrupted.store(false, Ordering::SeqCst);
1088                             }
1089 
1090                             if vcpu_kick_signalled.load(Ordering::SeqCst) {
1091                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1092                                 #[cfg(target_arch = "x86_64")]
1093                                 match vcpu.lock().as_ref().unwrap().vcpu.nmi() {
1094                                     Ok(()) => {},
1095                                     Err(e) => {
1096                                         error!("Error when inject nmi {}", e);
1097                                         break;
1098                                     }
1099                                 }
1100                             }
1101 
1102                             // We've been told to terminate
1103                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1104                                 || vcpu_kill.load(Ordering::SeqCst)
1105                             {
1106                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1107                                 break;
1108                             }
1109 
1110                             #[cfg(feature = "tdx")]
1111                             let mut vcpu = vcpu.lock().unwrap();
1112                             #[cfg(not(feature = "tdx"))]
1113                             let vcpu = vcpu.lock().unwrap();
1114                             // vcpu.run() reports VmExit::Reset on a triple-fault, so trigger a reset
1115                             match vcpu.run() {
1116                                 Ok(run) => match run {
1117                                     #[cfg(feature = "kvm")]
1118                                     VmExit::Debug => {
1119                                         info!("VmExit::Debug");
1120                                         #[cfg(feature = "guest_debug")]
1121                                         {
1122                                             vcpu_pause_signalled.store(true, Ordering::SeqCst);
1123                                             let raw_tid = get_raw_tid(vcpu_id as usize);
1124                                             vm_debug_evt.write(raw_tid as u64).unwrap();
1125                                         }
1126                                     }
1127                                     #[cfg(target_arch = "x86_64")]
1128                                     VmExit::IoapicEoi(vector) => {
1129                                         if let Some(interrupt_controller) =
1130                                             &interrupt_controller_clone
1131                                         {
1132                                             interrupt_controller
1133                                                 .lock()
1134                                                 .unwrap()
1135                                                 .end_of_interrupt(vector);
1136                                         }
1137                                     }
1138                                     VmExit::Ignore => {}
1139                                     VmExit::Hyperv => {}
1140                                     VmExit::Reset => {
1141                                         info!("VmExit::Reset");
1142                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1143                                         reset_evt.write(1).unwrap();
1144                                         break;
1145                                     }
1146                                     VmExit::Shutdown => {
1147                                         info!("VmExit::Shutdown");
1148                                         vcpu_run_interrupted.store(true, Ordering::SeqCst);
1149                                         exit_evt.write(1).unwrap();
1150                                         break;
1151                                     }
1152                                     #[cfg(feature = "tdx")]
1153                                     VmExit::Tdx => {
1154                                         if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
1155                                             match vcpu.get_tdx_exit_details() {
1156                                                 Ok(details) => match details {
1157                                                     TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
1158                                                     TdxExitDetails::SetupEventNotifyInterrupt => {
1159                                                         warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
1160                                                     }
1161                                                 },
1162                                                 Err(e) => error!("Unexpected TDX VMCALL: {}", e),
1163                                             }
1164                                             vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
1165                                         } else {
1166                                             // We should never reach this code as
1167                                             // this means the design from the code
1168                                             // is wrong.
1169                                             unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
1170                                         }
1171                                     }
1172                                 },
1173 
1174                                 Err(e) => {
1175                                     error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
1176                                     vcpu_run_interrupted.store(true, Ordering::SeqCst);
1177                                     exit_evt.write(1).unwrap();
1178                                     break;
1179                                 }
1180                             }
1181 
1182                             // We've been told to terminate
1183                             if vcpu_kill_signalled.load(Ordering::SeqCst)
1184                                 || vcpu_kill.load(Ordering::SeqCst)
1185                             {
1186                                 vcpu_run_interrupted.store(true, Ordering::SeqCst);
1187                                 break;
1188                             }
1189                         }
1190                     })
1191                     .or_else(|_| {
1192                         panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
1193                         error!("vCPU thread panicked");
1194                         panic_exit_evt.write(1)
1195                     })
1196                     .ok();
1197                 })
1198                 .map_err(Error::VcpuSpawn)?,
1199         );
1200 
1201         // When this function is called for a CPU hotplug, entry_point is None. It
1202         // is for those hotplug CPU additions that we need to set the inserting flag.
1203         self.vcpu_states[usize::from(vcpu_id)].handle = handle;
1204         self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;
1205 
1206         Ok(())
1207     }
1208 
1209     /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
1210     fn activate_vcpus(
1211         &mut self,
1212         desired_vcpus: u8,
1213         inserting: bool,
1214         paused: Option<bool>,
1215     ) -> Result<()> {
1216         if desired_vcpus > self.config.max_vcpus {
1217             return Err(Error::DesiredVCpuCountExceedsMax);
1218         }
1219 
1220         let vcpu_thread_barrier = Arc::new(Barrier::new(
1221             (desired_vcpus - self.present_vcpus() + 1) as usize,
1222         ));
1223 
1224         if let Some(paused) = paused {
1225             self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
1226         }
1227 
1228         info!(
1229             "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
1230             desired_vcpus,
1231             self.vcpus.len(),
1232             self.present_vcpus(),
1233             self.vcpus_pause_signalled.load(Ordering::SeqCst)
1234         );
1235 
1236         // This reuses any inactive vCPUs as well as any that were newly created
1237         for vcpu_id in self.present_vcpus()..desired_vcpus {
1238             let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
1239             self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
1240         }
1241 
1242         // Unblock all CPU threads.
1243         vcpu_thread_barrier.wait();
1244         Ok(())
1245     }
1246 
1247     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
1248         // Mark vCPUs for removal, actual removal happens on ejection
1249         for cpu_id in desired_vcpus..self.present_vcpus() {
1250             self.vcpu_states[usize::from(cpu_id)].removing = true;
1251             self.vcpu_states[usize::from(cpu_id)]
1252                 .pending_removal
1253                 .store(true, Ordering::SeqCst);
1254         }
1255     }
1256 
1257     pub fn check_pending_removed_vcpu(&mut self) -> bool {
1258         for state in self.vcpu_states.iter() {
1259             if state.active() && state.pending_removal.load(Ordering::SeqCst) {
1260                 return true;
1261             }
1262         }
1263         false
1264     }
1265 
1266     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
1267         info!("Removing vCPU: cpu_id = {}", cpu_id);
1268         let state = &mut self.vcpu_states[usize::from(cpu_id)];
1269         state.kill.store(true, Ordering::SeqCst);
1270         state.signal_thread();
1271         state.join_thread()?;
1272         state.handle = None;
1273 
1274         // Once the thread has exited, clear the "kill" so that it can be reused
1275         state.kill.store(false, Ordering::SeqCst);
1276         state.pending_removal.store(false, Ordering::SeqCst);
1277 
1278         Ok(())
1279     }
1280 
1281     pub fn create_boot_vcpus(
1282         &mut self,
1283         snapshot: Option<Snapshot>,
1284     ) -> Result<Vec<Arc<Mutex<Vcpu>>>> {
1285         trace_scoped!("create_boot_vcpus");
1286 
1287         self.create_vcpus(self.boot_vcpus(), snapshot)
1288     }
1289 
1290     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
1291     pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
1292         self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
1293     }
1294 
1295     pub fn start_restored_vcpus(&mut self) -> Result<()> {
1296         self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
1297             .map_err(|e| {
1298                 Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
1299             })?;
1300 
1301         Ok(())
1302     }
1303 
1304     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
1305         if desired_vcpus == self.present_vcpus() {
1306             return Ok(false);
1307         }
1308 
1309         if !self.dynamic {
1310             return Ok(false);
1311         }
1312 
1313         if self.check_pending_removed_vcpu() {
1314             return Err(Error::VcpuPendingRemovedVcpu);
1315         }
1316 
1317         match desired_vcpus.cmp(&self.present_vcpus()) {
1318             cmp::Ordering::Greater => {
1319                 let vcpus = self.create_vcpus(desired_vcpus, None)?;
1320                 for vcpu in vcpus {
1321                     self.configure_vcpu(vcpu, None)?
1322                 }
1323                 self.activate_vcpus(desired_vcpus, true, None)?;
1324                 Ok(true)
1325             }
1326             cmp::Ordering::Less => {
1327                 self.mark_vcpus_for_removal(desired_vcpus);
1328                 Ok(true)
1329             }
1330             _ => Ok(false),
1331         }
1332     }
1333 
1334     pub fn shutdown(&mut self) -> Result<()> {
1335         // Tell the vCPUs to stop themselves next time they go through the loop
1336         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1337 
1338         // Toggle the vCPUs pause boolean
1339         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1340 
1341         // Unpark all the VCPU threads.
1342         for state in self.vcpu_states.iter() {
1343             state.unpark_thread();
1344         }
1345 
1346         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1347         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1348         // above.
1349         for state in self.vcpu_states.iter() {
1350             state.signal_thread();
1351         }
1352 
1353         // Wait for all the threads to finish. This removes the state from the vector.
1354         for mut state in self.vcpu_states.drain(..) {
1355             state.join_thread()?;
1356         }
1357 
1358         Ok(())
1359     }
1360 
1361     #[cfg(feature = "tdx")]
1362     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1363         for vcpu in &self.vcpus {
1364             vcpu.lock()
1365                 .unwrap()
1366                 .vcpu
1367                 .tdx_init(hob_address)
1368                 .map_err(Error::InitializeTdx)?;
1369         }
1370         Ok(())
1371     }
1372 
1373     pub fn boot_vcpus(&self) -> u8 {
1374         self.config.boot_vcpus
1375     }
1376 
1377     pub fn max_vcpus(&self) -> u8 {
1378         self.config.max_vcpus
1379     }
1380 
1381     #[cfg(target_arch = "x86_64")]
1382     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1383         assert!(!self.cpuid.is_empty());
1384         self.cpuid.clone()
1385     }
1386 
1387     fn present_vcpus(&self) -> u8 {
1388         self.vcpu_states
1389             .iter()
1390             .fold(0, |acc, state| acc + state.active() as u8)
1391     }
1392 
1393     #[cfg(target_arch = "aarch64")]
1394     pub fn get_mpidrs(&self) -> Vec<u64> {
1395         self.vcpus
1396             .iter()
1397             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1398             .collect()
1399     }
1400 
1401     #[cfg(target_arch = "aarch64")]
1402     pub fn get_saved_states(&self) -> Vec<CpuState> {
1403         self.vcpus
1404             .iter()
1405             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1406             .collect()
1407     }
1408 
1409     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1410         self.config
1411             .topology
1412             .clone()
1413             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1414     }
1415 
1416     pub fn create_madt(&self) -> Sdt {
1417         use crate::acpi;
1418         // This is also checked in the command-line parsing.
1419         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1420 
1421         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1422         #[cfg(target_arch = "x86_64")]
1423         {
1424             madt.write(36, arch::layout::APIC_START.0);
1425 
1426             for cpu in 0..self.config.max_vcpus {
1427                 let x2apic_id = get_x2apic_id(cpu.into(), self.get_vcpu_topology());
1428 
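                // Each vCPU gets an X2APIC MADT entry: boot vCPUs are marked
                // "Enabled", and every entry is marked "Online Capable" so the
                // remaining vCPUs can be hot-plugged later.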
1429                 let lapic = LocalX2Apic {
1430                     r#type: acpi::ACPI_X2APIC_PROCESSOR,
1431                     length: 16,
1432                     processor_id: cpu.into(),
1433                     apic_id: x2apic_id,
1434                     flags: (if cpu < self.config.boot_vcpus {
1435                         1 << MADT_CPU_ENABLE_FLAG
1436                     } else {
1437                         0
1438                     }) | (1 << MADT_CPU_ONLINE_CAPABLE_FLAG),
1439                     _reserved: 0,
1440                 };
1441                 madt.append(lapic);
1442             }
1443 
1444             madt.append(Ioapic {
1445                 r#type: acpi::ACPI_APIC_IO,
1446                 length: 12,
1447                 ioapic_id: 0,
1448                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1449                 gsi_base: 0,
1450                 ..Default::default()
1451             });
1452 
1453             madt.append(InterruptSourceOverride {
1454                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1455                 length: 10,
1456                 bus: 0,
1457                 source: 4,
1458                 gsi: 4,
1459                 flags: 0,
1460             });
1461         }
1462 
1463         #[cfg(target_arch = "aarch64")]
1464         {
1465             /* Note:
1466              * The Local Interrupt Controller Address at byte offset 36 of the MADT is ignored.
1467              */
1468 
1469             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1470             for cpu in 0..self.config.boot_vcpus {
1471                 let vcpu = &self.vcpus[cpu as usize];
1472                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1473                 /* ARMv8 MPIDR format:
1474                      Bits [63:40] Must be zero
1475                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1476                      Bits [31:24] Must be zero
1477                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1478                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1479                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1480                 */
1481                 let mpidr_mask = 0xff_00ff_ffff;
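                // Illustrative example: a raw MPIDR of 0x0000_0001_8100_0203
                // masked with 0xff_00ff_ffff yields 0x0000_0001_0000_0203
                // (Aff3..Aff0 preserved, the non-affinity bits [31:24] cleared).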
1482                 let gicc = GicC {
1483                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1484                     length: 80,
1485                     reserved0: 0,
1486                     cpu_interface_number: cpu as u32,
1487                     uid: cpu as u32,
1488                     flags: 1,
1489                     parking_version: 0,
1490                     performance_interrupt: 0,
1491                     parked_address: 0,
1492                     base_address: 0,
1493                     gicv_base_address: 0,
1494                     gich_base_address: 0,
1495                     vgic_interrupt: 0,
1496                     gicr_base_address: 0,
1497                     mpidr: mpidr & mpidr_mask,
1498                     proc_power_effi_class: 0,
1499                     reserved1: 0,
1500                     spe_overflow_interrupt: 0,
1501                 };
1502 
1503                 madt.append(gicc);
1504             }
1505             let vgic_config = Gic::create_default_config(self.config.boot_vcpus.into());
1506 
1507             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1508             let gicd = GicD {
1509                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1510                 length: 24,
1511                 reserved0: 0,
1512                 gic_id: 0,
1513                 base_address: vgic_config.dist_addr,
1514                 global_irq_base: 0,
1515                 version: 3,
1516                 reserved1: [0; 3],
1517             };
1518             madt.append(gicd);
1519 
1520             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1521             let gicr = GicR {
1522                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1523                 length: 16,
1524                 reserved: 0,
1525                 base_address: vgic_config.redists_addr,
1526                 range_length: vgic_config.redists_size as u32,
1527             };
1528             madt.append(gicr);
1529 
1530             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1531             let gicits = GicIts {
1532                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1533                 length: 20,
1534                 reserved0: 0,
1535                 translation_id: 0,
1536                 base_address: vgic_config.msi_addr,
1537                 reserved1: 0,
1538             };
1539             madt.append(gicits);
1540 
1541             madt.update_checksum();
1542         }
1543 
1544         madt
1545     }
1546 
1547     #[cfg(target_arch = "aarch64")]
1548     pub fn create_pptt(&self) -> Sdt {
1549         let pptt_start = 0;
1550         let mut cpus = 0;
1551         let mut uid = 0;
1552         // If topology is not specified, the default setting is:
1553         // 1 package, multiple cores, 1 thread per core
1554         // This is also the behavior when PPTT is missing.
1555         let (threads_per_core, cores_per_package, packages) =
1556             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1557 
1558         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1559 
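        // Processor Hierarchy Node flags used below (ACPI spec, PPTT):
        //   bit 0: physical package        bit 1: ACPI processor ID valid
        //   bit 2: processor is a thread   bit 3: node is a leaf
        // Hence 0x2 for the package/core container nodes, 0xE for thread
        // leaves, and 0xA for core leaves when SMT is not configured.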
1560         for cluster_idx in 0..packages {
1561             if cpus < self.config.boot_vcpus as usize {
1562                 let cluster_offset = pptt.len() - pptt_start;
1563                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1564                     r#type: 0,
1565                     length: 20,
1566                     reserved: 0,
1567                     flags: 0x2,
1568                     parent: 0,
1569                     acpi_processor_id: cluster_idx as u32,
1570                     num_private_resources: 0,
1571                 };
1572                 pptt.append(cluster_hierarchy_node);
1573 
1574                 for core_idx in 0..cores_per_package {
1575                     let core_offset = pptt.len() - pptt_start;
1576 
1577                     if threads_per_core > 1 {
1578                         let core_hierarchy_node = ProcessorHierarchyNode {
1579                             r#type: 0,
1580                             length: 20,
1581                             reserved: 0,
1582                             flags: 0x2,
1583                             parent: cluster_offset as u32,
1584                             acpi_processor_id: core_idx as u32,
1585                             num_private_resources: 0,
1586                         };
1587                         pptt.append(core_hierarchy_node);
1588 
1589                         for _thread_idx in 0..threads_per_core {
1590                             let thread_hierarchy_node = ProcessorHierarchyNode {
1591                                 r#type: 0,
1592                                 length: 20,
1593                                 reserved: 0,
1594                                 flags: 0xE,
1595                                 parent: core_offset as u32,
1596                                 acpi_processor_id: uid as u32,
1597                                 num_private_resources: 0,
1598                             };
1599                             pptt.append(thread_hierarchy_node);
1600                             uid += 1;
1601                         }
1602                     } else {
1603                         let thread_hierarchy_node = ProcessorHierarchyNode {
1604                             r#type: 0,
1605                             length: 20,
1606                             reserved: 0,
1607                             flags: 0xA,
1608                             parent: cluster_offset as u32,
1609                             acpi_processor_id: uid as u32,
1610                             num_private_resources: 0,
1611                         };
1612                         pptt.append(thread_hierarchy_node);
1613                         uid += 1;
1614                     }
1615                 }
1616                 cpus += (cores_per_package * threads_per_core) as usize;
1617             }
1618         }
1619 
1620         pptt.update_checksum();
1621         pptt
1622     }
1623 
1624     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1625     fn create_standard_regs(&self, cpu_id: u8) -> StandardRegisters {
1626         self.vcpus[usize::from(cpu_id)]
1627             .lock()
1628             .unwrap()
1629             .vcpu
1630             .create_standard_regs()
1631     }
1632 
1633     #[cfg(feature = "guest_debug")]
1634     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1635         self.vcpus[usize::from(cpu_id)]
1636             .lock()
1637             .unwrap()
1638             .vcpu
1639             .get_regs()
1640             .map_err(Error::CpuDebug)
1641     }
1642 
1643     #[cfg(feature = "guest_debug")]
1644     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1645         self.vcpus[usize::from(cpu_id)]
1646             .lock()
1647             .unwrap()
1648             .vcpu
1649             .set_regs(regs)
1650             .map_err(Error::CpuDebug)
1651     }
1652 
1653     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1654     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1655         self.vcpus[usize::from(cpu_id)]
1656             .lock()
1657             .unwrap()
1658             .vcpu
1659             .get_sregs()
1660             .map_err(Error::CpuDebug)
1661     }
1662 
1663     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1664     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1665         self.vcpus[usize::from(cpu_id)]
1666             .lock()
1667             .unwrap()
1668             .vcpu
1669             .set_sregs(sregs)
1670             .map_err(Error::CpuDebug)
1671     }
1672 
1673     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1674     fn translate_gva(
1675         &self,
1676         _guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1677         cpu_id: u8,
1678         gva: u64,
1679     ) -> Result<u64> {
1680         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1681             .lock()
1682             .unwrap()
1683             .vcpu
1684             .translate_gva(gva, /* flags: unused */ 0)
1685             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1686         Ok(gpa)
1687     }
1688 
1689     ///
1690     /// On AArch64, the `translate_gva` API is not provided by KVM. We
1691     /// implement it in the VMM by walking the translation tables.
1692     ///
1693     /// Address translation is a big topic; here we only focus on the
1694     /// scenario that occurs in the VMM while debugging the kernel. This
1695     /// `translate_gva` implementation is restricted to:
1696     /// - Exception Level 1
1697     /// - Translating the high address range only (kernel space)
1698     ///
1699     /// This implementation supports the following Armv8-A features related
1700     /// to address translation:
1701     /// - FEAT_LPA
1702     /// - FEAT_LVA
1703     /// - FEAT_LPA2
1704     ///
1705     #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
1706     fn translate_gva(
1707         &self,
1708         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
1709         cpu_id: u8,
1710         gva: u64,
1711     ) -> Result<u64> {
1712         let tcr_el1: u64 = self.vcpus[usize::from(cpu_id)]
1713             .lock()
1714             .unwrap()
1715             .vcpu
1716             .get_sys_reg(regs::TCR_EL1)
1717             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1718         let ttbr1_el1: u64 = self.vcpus[usize::from(cpu_id)]
1719             .lock()
1720             .unwrap()
1721             .vcpu
1722             .get_sys_reg(regs::TTBR1_EL1)
1723             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1724         let id_aa64mmfr0_el1: u64 = self.vcpus[usize::from(cpu_id)]
1725             .lock()
1726             .unwrap()
1727             .vcpu
1728             .get_sys_reg(regs::ID_AA64MMFR0_EL1)
1729             .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1730 
1731         // Bit 55 of the VA determines the range, high (0xFFFxxx...)
1732         // or low (0x000xxx...).
1733         let high_range = extract_bits_64!(gva, 55, 1);
1734         if high_range == 0 {
1735             info!("VA (0x{:x}) range is not supported!", gva);
1736             return Ok(gva);
1737         }
1738 
1739         // High range size offset
1740         let tsz = extract_bits_64!(tcr_el1, 16, 6);
1741         // Granule size
1742         let tg = extract_bits_64!(tcr_el1, 30, 2);
1743         // Indication of 48-bits (0) or 52-bits (1) for FEAT_LPA2
1744         let ds = extract_bits_64!(tcr_el1, 59, 1);
1745 
1746         if tsz == 0 {
1747             info!("VA translation is not ready!");
1748             return Ok(gva);
1749         }
1750 
1751         // VA size is determined by TCR_EL1.T1SZ
1752         let va_size = 64 - tsz;
1753         // Number of bits in VA consumed in each level of translation
1754         let stride = match tg {
1755             3 => 13, // 64KB granule size
1756             1 => 11, // 16KB granule size
1757             _ => 9,  // 4KB, default
1758         };
1759         // Starting level of walking
1760         let mut level = 4 - (va_size - 4) / stride;
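        // Illustrative walk-through: with a 4KB granule (stride 9) and
        // T1SZ = 16, va_size = 48 and level = 4 - (48 - 4) / 9 = 0, i.e. a
        // four-level walk (0..=3). With T1SZ = 25 (39-bit VAs), level = 1.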
1761 
1762         // Determine the PA or IPA size
1763         let tcr_ips = extract_bits_64!(tcr_el1, 32, 3);
1764         let pa_range = extract_bits_64_without_offset!(id_aa64mmfr0_el1, 4);
1765         // The IPA size in TCR_EL1 and the PA range in ID_AA64MMFR0_EL1 should
1766         // match. To be safe, we use the minimum value if they differ.
1767         let pa_range = std::cmp::min(tcr_ips, pa_range);
1768         // PA size in bits
1769         let pa_size = match pa_range {
1770             0 => 32,
1771             1 => 36,
1772             2 => 40,
1773             3 => 42,
1774             4 => 44,
1775             5 => 48,
1776             6 => 52,
1777             _ => {
1778                 return Err(Error::TranslateVirtualAddress(anyhow!(format!(
1779                     "PA range not supported {pa_range}"
1780                 ))))
1781             }
1782         };
1783 
1784         let indexmask_grainsize = (!0u64) >> (64 - (stride + 3));
1785         let mut indexmask = (!0u64) >> (64 - (va_size - (stride * (4 - level))));
1786         // If FEAT_LPA2 is present, the translation table descriptor holds
1787         // 50 bits of the table address of the next level.
1788         // Otherwise, it is 48 bits.
1789         let descaddrmask = if ds == 1 {
1790             !0u64 >> (64 - 50) // mask with 50 least significant bits
1791         } else {
1792             !0u64 >> (64 - 48) // mask with 48 least significant bits
1793         };
1794         let descaddrmask = descaddrmask & !indexmask_grainsize;
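        // Illustrative values for the common 4KB-granule case (stride 9):
        // indexmask_grainsize = 0xfff, and without FEAT_LPA2 descaddrmask
        // ends up as 0x0000_ffff_ffff_f000, i.e. bits [47:12] of a descriptor.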
1795 
1796         // Translation table base address
1797         let mut descaddr: u64 = extract_bits_64_without_offset!(ttbr1_el1, 48);
1798         // In the case of FEAT_LPA and FEAT_LPA2, the initial translation table
1799         // address bits [48:51] come from TTBR1_EL1 bits [2:5].
1800         if pa_size == 52 {
1801             descaddr |= extract_bits_64!(ttbr1_el1, 2, 4) << 48;
1802         }
1803 
1804         // Loop through tables of each level
1805         loop {
1806             // Table offset for current level
1807             let table_offset: u64 = (gva >> (stride * (4 - level))) & indexmask;
1808             descaddr |= table_offset;
1809             descaddr &= !7u64;
1810 
1811             let mut buf = [0; 8];
1812             guest_memory
1813                 .memory()
1814                 .read(&mut buf, GuestAddress(descaddr))
1815                 .map_err(|e| Error::TranslateVirtualAddress(e.into()))?;
1816             let descriptor = u64::from_le_bytes(buf);
1817 
1818             descaddr = descriptor & descaddrmask;
1819             // In the case of FEAT_LPA, the next-level translation table address
1820             // bits [48:51] come from bits [12:15] of the current descriptor.
1821             // For FEAT_LPA2, the next-level translation table address
1822             // bits [50:51] come from bits [8:9] of the current descriptor, while
1823             // bits [48:49] come from bits [48:49] of the descriptor and are
1824             // already covered by the 50-bit address mask applied above.
1825             if pa_size == 52 {
1826                 if ds == 1 {
1827                     // FEAT_LPA2
1828                     descaddr |= extract_bits_64!(descriptor, 8, 2) << 50;
1829                 } else {
1830                     // FEAT_LPA
1831                     descaddr |= extract_bits_64!(descriptor, 12, 4) << 48;
1832                 }
1833             }
1834 
1835             if (descriptor & 2) != 0 && (level < 3) {
1836                 // This is a table entry. Go down to next level.
1837                 level += 1;
1838                 indexmask = indexmask_grainsize;
1839                 continue;
1840             }
1841 
1842             break;
1843         }
1844 
1845         // We have reached either:
1846         // - a page entry at level 3 or
1847         // - a block entry at level 1 or 2
1848         let page_size = 1u64 << ((stride * (4 - level)) + 3);
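        // For a 4KB granule this yields 4KB pages at level 3 (1 << 12) and
        // 2MB blocks at level 2 (1 << 21).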
1849         descaddr &= !(page_size - 1);
1850         descaddr |= gva & (page_size - 1);
1851 
1852         Ok(descaddr)
1853     }
1854 
1855     pub(crate) fn set_acpi_address(&mut self, acpi_address: GuestAddress) {
1856         self.acpi_address = Some(acpi_address);
1857     }
1858 
1859     pub(crate) fn set_interrupt_controller(
1860         &mut self,
1861         interrupt_controller: Arc<Mutex<dyn InterruptController>>,
1862     ) {
1863         self.interrupt_controller = Some(interrupt_controller);
1864     }
1865 
1866     pub(crate) fn vcpus_kill_signalled(&self) -> &Arc<AtomicBool> {
1867         &self.vcpus_kill_signalled
1868     }
1869 
1870     #[cfg(feature = "igvm")]
1871     pub(crate) fn get_cpuid_leaf(
1872         &self,
1873         cpu_id: u8,
1874         eax: u32,
1875         ecx: u32,
1876         xfem: u64,
1877         xss: u64,
1878     ) -> Result<[u32; 4]> {
1879         let leaf_info = self.vcpus[usize::from(cpu_id)]
1880             .lock()
1881             .unwrap()
1882             .vcpu
1883             .get_cpuid_values(eax, ecx, xfem, xss)
1884             .unwrap();
1885         Ok(leaf_info)
1886     }
1887 
1888     #[cfg(feature = "sev_snp")]
1889     pub(crate) fn sev_snp_enabled(&self) -> bool {
1890         self.sev_snp_enabled
1891     }
1892 
1893     pub(crate) fn nmi(&self) -> Result<()> {
1894         self.vcpus_kick_signalled.store(true, Ordering::SeqCst);
1895 
1896         for state in self.vcpu_states.iter() {
1897             state.signal_thread();
1898         }
1899 
1900         self.vcpus_kick_signalled.store(false, Ordering::SeqCst);
1901 
1902         Ok(())
1903     }
1904 }
1905 
1906 struct Cpu {
1907     cpu_id: u8,
1908     proximity_domain: u32,
1909     dynamic: bool,
1910     #[cfg(target_arch = "x86_64")]
1911     topology: Option<(u8, u8, u8)>,
1912 }
1913 
1914 #[cfg(target_arch = "x86_64")]
1915 const MADT_CPU_ENABLE_FLAG: usize = 0;
1916 
1917 #[cfg(target_arch = "x86_64")]
1918 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1919 
1920 impl Cpu {
1921     #[cfg(target_arch = "x86_64")]
1922     fn generate_mat(&self) -> Vec<u8> {
1923         let x2apic_id = arch::x86_64::get_x2apic_id(self.cpu_id.into(), self.topology);
1924 
1925         let lapic = LocalX2Apic {
1926             r#type: crate::acpi::ACPI_X2APIC_PROCESSOR,
1927             length: 16,
1928             processor_id: self.cpu_id.into(),
1929             apic_id: x2apic_id,
1930             flags: 1 << MADT_CPU_ENABLE_FLAG,
1931             _reserved: 0,
1932         };
1933 
1934         let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
1935         // SAFETY: mat_data is large enough to hold lapic
1936         unsafe { *(mat_data.as_mut_ptr() as *mut LocalX2Apic) = lapic };
1937 
1938         mat_data
1939     }
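    // Note (illustrative, not part of the original source): since the crate
    // already imports zerocopy::AsBytes, an equivalent safe construction would
    // be possible if LocalX2Apic derived AsBytes:
    //
    //     let mat_data = lapic.as_bytes().to_vec();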
1940 }
1941 
1942 impl Aml for Cpu {
1943     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
1944         #[cfg(target_arch = "x86_64")]
1945         let mat_data: Vec<u8> = self.generate_mat();
1946         #[allow(clippy::if_same_then_else)]
1947         if self.dynamic {
1948             aml::Device::new(
1949                 format!("C{:03X}", self.cpu_id).as_str().into(),
1950                 vec![
1951                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1952                     &aml::Name::new("_UID".into(), &self.cpu_id),
1953                     // Currently, AArch64 does not support the following fields.
1954                     /*
1955                     _STA return value:
1956                     Bit [0] – Set if the device is present.
1957                     Bit [1] – Set if the device is enabled and decoding its resources.
1958                     Bit [2] – Set if the device should be shown in the UI.
1959                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1960                     Bit [4] – Set if the battery is present.
1961                     Bits [31:5] – Reserved (must be cleared).
1962                     */
1963                     #[cfg(target_arch = "x86_64")]
1964                     &aml::Method::new(
1965                         "_STA".into(),
1966                         0,
1967                         false,
1968                         // Call into CSTA method which will interrogate device
1969                         vec![&aml::Return::new(&aml::MethodCall::new(
1970                             "CSTA".into(),
1971                             vec![&self.cpu_id],
1972                         ))],
1973                     ),
1974                     &aml::Method::new(
1975                         "_PXM".into(),
1976                         0,
1977                         false,
1978                         vec![&aml::Return::new(&self.proximity_domain)],
1979                     ),
1980                     // The Linux kernel expects every CPU device to have a _MAT entry
1981                     // containing the LAPIC for this processor with the enabled bit set,
1982                     // even if it is disabled in the MADT (non-boot CPU).
1983                     #[cfg(target_arch = "x86_64")]
1984                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
1985                     // Trigger CPU ejection
1986                     #[cfg(target_arch = "x86_64")]
1987                     &aml::Method::new(
1988                         "_EJ0".into(),
1989                         1,
1990                         false,
1991                         // Call into CEJ0 method which will actually eject device
1992                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1993                     ),
1994                 ],
1995             )
1996             .to_aml_bytes(sink);
1997         } else {
1998             aml::Device::new(
1999                 format!("C{:03X}", self.cpu_id).as_str().into(),
2000                 vec![
2001                     &aml::Name::new("_HID".into(), &"ACPI0007"),
2002                     &aml::Name::new("_UID".into(), &self.cpu_id),
2003                     #[cfg(target_arch = "x86_64")]
2004                     &aml::Method::new(
2005                         "_STA".into(),
2006                         0,
2007                         false,
2008                         // Mark the CPU present; see the CSTA implementation
2009                         vec![&aml::Return::new(&0xfu8)],
2010                     ),
2011                     &aml::Method::new(
2012                         "_PXM".into(),
2013                         0,
2014                         false,
2015                         vec![&aml::Return::new(&self.proximity_domain)],
2016                     ),
2017                     // The Linux kernel expects every CPU device to have a _MAT entry
2018                     // containing the LAPIC for this processor with the enabled bit set,
2019                     // even if it is disabled in the MADT (non-boot CPU).
2020                     #[cfg(target_arch = "x86_64")]
2021                     &aml::Name::new("_MAT".into(), &aml::BufferData::new(mat_data)),
2022                 ],
2023             )
2024             .to_aml_bytes(sink);
2025         }
2026     }
2027 }
2028 
2029 struct CpuNotify {
2030     cpu_id: u8,
2031 }
2032 
2033 impl Aml for CpuNotify {
2034     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2035         let object = aml::Path::new(&format!("C{:03X}", self.cpu_id));
2036         aml::If::new(
2037             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
2038             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2039         )
2040         .to_aml_bytes(sink)
2041     }
2042 }
2043 
2044 struct CpuMethods {
2045     max_vcpus: u8,
2046     dynamic: bool,
2047 }
2048 
2049 impl Aml for CpuMethods {
2050     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2051         if self.dynamic {
2052             // CPU status method
2053             aml::Method::new(
2054                 "CSTA".into(),
2055                 1,
2056                 true,
2057                 vec![
2058                     // Take lock defined above
2059                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2060                     // Write CPU number (in first argument) to I/O port via field
2061                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2062                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2063                     // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for the meaning)
2064                     &aml::If::new(
2065                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
2066                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2067                     ),
2068                     // Release lock
2069                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2070                     // Return 0 or 0xf
2071                     &aml::Return::new(&aml::Local(0)),
2072                 ],
2073             )
2074             .to_aml_bytes(sink);
2075 
2076             let mut cpu_notifies = Vec::new();
2077             for cpu_id in 0..self.max_vcpus {
2078                 cpu_notifies.push(CpuNotify { cpu_id });
2079             }
2080 
2081             let mut cpu_notifies_refs: Vec<&dyn Aml> = Vec::new();
2082             for cpu_id in 0..self.max_vcpus {
2083                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
2084             }
2085 
2086             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(sink);
2087 
2088             aml::Method::new(
2089                 "CEJ0".into(),
2090                 1,
2091                 true,
2092                 vec![
2093                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2094                     // Write CPU number (in first argument) to I/O port via field
2095                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
2096                     // Set CEJ0 bit
2097                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
2098                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2099                 ],
2100             )
2101             .to_aml_bytes(sink);
2102 
2103             aml::Method::new(
2104                 "CSCN".into(),
2105                 0,
2106                 true,
2107                 vec![
2108                     // Take lock defined above
2109                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
2110                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
2111                     &aml::While::new(
2112                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
2113                         vec![
2114                             // Write CPU number (loop counter in Local0) to I/O port via field
2115                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
2116                             // Check if CINS bit is set
2117                             &aml::If::new(
2118                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
2119                                 // Notify device if it is
2120                                 vec![
2121                                     &aml::MethodCall::new(
2122                                         "CTFY".into(),
2123                                         vec![&aml::Local(0), &aml::ONE],
2124                                     ),
2125                                     // Reset CINS bit
2126                                     &aml::Store::new(
2127                                         &aml::Path::new("\\_SB_.PRES.CINS"),
2128                                         &aml::ONE,
2129                                     ),
2130                                 ],
2131                             ),
2132                             // Check if CRMV bit is set
2133                             &aml::If::new(
2134                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
2135                                 // Notify device if it is (with the eject constant 0x3)
2136                                 vec![
2137                                     &aml::MethodCall::new(
2138                                         "CTFY".into(),
2139                                         vec![&aml::Local(0), &3u8],
2140                                     ),
2141                                     // Reset CRMV bit
2142                                     &aml::Store::new(
2143                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
2144                                         &aml::ONE,
2145                                     ),
2146                                 ],
2147                             ),
2148                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2149                         ],
2150                     ),
2151                     // Release lock
2152                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
2153                 ],
2154             )
2155             .to_aml_bytes(sink)
2156         } else {
2157             aml::Method::new("CSCN".into(), 0, true, vec![]).to_aml_bytes(sink)
2158         }
2159     }
2160 }
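// For reference, the dynamic CSCN method generated above corresponds roughly
// to the following ASL (a hand-decompiled sketch, not part of the source):
//
//     Method (CSCN, 0, Serialized) {
//         Acquire (\_SB.PRES.CPLK, 0xFFFF)
//         Local0 = Zero
//         While (Local0 < MAX_VCPUS) {
//             \_SB.PRES.CSEL = Local0
//             If (\_SB.PRES.CINS == One) {
//                 CTFY (Local0, One)
//                 \_SB.PRES.CINS = One   // write-1-to-clear
//             }
//             If (\_SB.PRES.CRMV == One) {
//                 CTFY (Local0, 0x03)    // 0x03 = device eject notification
//                 \_SB.PRES.CRMV = One   // write-1-to-clear
//             }
//             Local0 += One
//         }
//         Release (\_SB.PRES.CPLK)
//     }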
2161 
2162 impl Aml for CpuManager {
2163     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2164         #[cfg(target_arch = "x86_64")]
2165         if let Some(acpi_address) = self.acpi_address {
2166             // CPU hotplug controller
2167             aml::Device::new(
2168                 "_SB_.PRES".into(),
2169                 vec![
2170                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2171                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
2172                     // Mutex to protect concurrent access, as we write to select the CPU and then read back its status
2173                     &aml::Mutex::new("CPLK".into(), 0),
2174                     &aml::Name::new(
2175                         "_CRS".into(),
2176                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2177                             aml::AddressSpaceCacheable::NotCacheable,
2178                             true,
2179                             acpi_address.0,
2180                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
2181                             None,
2182                         )]),
2183                     ),
2184                     // OpRegion and Fields map MMIO range into individual field values
2185                     &aml::OpRegion::new(
2186                         "PRST".into(),
2187                         aml::OpRegionSpace::SystemMemory,
2188                         &(acpi_address.0 as usize),
2189                         &CPU_MANAGER_ACPI_SIZE,
2190                     ),
2191                     &aml::Field::new(
2192                         "PRST".into(),
2193                         aml::FieldAccessType::Byte,
2194                         aml::FieldLockRule::NoLock,
2195                         aml::FieldUpdateRule::WriteAsZeroes,
2196                         vec![
2197                             aml::FieldEntry::Reserved(32),
2198                             aml::FieldEntry::Named(*b"CPEN", 1),
2199                             aml::FieldEntry::Named(*b"CINS", 1),
2200                             aml::FieldEntry::Named(*b"CRMV", 1),
2201                             aml::FieldEntry::Named(*b"CEJ0", 1),
2202                             aml::FieldEntry::Reserved(4),
2203                             aml::FieldEntry::Named(*b"CCMD", 8),
2204                         ],
2205                     ),
2206                     &aml::Field::new(
2207                         "PRST".into(),
2208                         aml::FieldAccessType::DWord,
2209                         aml::FieldLockRule::NoLock,
2210                         aml::FieldUpdateRule::Preserve,
2211                         vec![
2212                             aml::FieldEntry::Named(*b"CSEL", 32),
2213                             aml::FieldEntry::Reserved(32),
2214                             aml::FieldEntry::Named(*b"CDAT", 32),
2215                         ],
2216                     ),
2217                 ],
2218             )
2219             .to_aml_bytes(sink);
2220         }
2221 
2222         // CPU devices
2223         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
2224         let uid = aml::Name::new("_CID".into(), &aml::EISAName::new("PNP0A05"));
2225         // Bundle methods together under a common object
2226         let methods = CpuMethods {
2227             max_vcpus: self.config.max_vcpus,
2228             dynamic: self.dynamic,
2229         };
2230         let mut cpu_data_inner: Vec<&dyn Aml> = vec![&hid, &uid, &methods];
2231 
2232         #[cfg(target_arch = "x86_64")]
2233         let topology = self.get_vcpu_topology();
2234         let mut cpu_devices = Vec::new();
2235         for cpu_id in 0..self.config.max_vcpus {
2236             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
2237             let cpu_device = Cpu {
2238                 cpu_id,
2239                 proximity_domain,
2240                 dynamic: self.dynamic,
2241                 #[cfg(target_arch = "x86_64")]
2242                 topology,
2243             };
2244 
2245             cpu_devices.push(cpu_device);
2246         }
2247 
2248         for cpu_device in cpu_devices.iter() {
2249             cpu_data_inner.push(cpu_device);
2250         }
2251 
2252         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(sink)
2253     }
2254 }
2255 
2256 impl Pausable for CpuManager {
2257     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2258         // Tell the vCPUs to pause themselves next time they exit
2259         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
2260 
2261         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
2262         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
2263         // above.
2264         for state in self.vcpu_states.iter() {
2265             state.signal_thread();
2266         }
2267 
2268         for vcpu in self.vcpus.iter() {
2269             let mut vcpu = vcpu.lock().unwrap();
2270             vcpu.pause()?;
2271             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2272             if !self.config.kvm_hyperv {
2273                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
2274                     MigratableError::Pause(anyhow!(
2275                         "Could not notify guest it has been paused {:?}",
2276                         e
2277                     ))
2278                 })?;
2279             }
2280         }
2281 
2282         // The vCPU thread will change its paused state before parking, so wait
2283         // here for each activated vCPU to change its state, ensuring it has parked.
2284         for state in self.vcpu_states.iter() {
2285             if state.active() {
2286                 while !state.paused.load(Ordering::SeqCst) {
2287                     // To avoid a priority inversion with the vCPU thread
2288                     thread::sleep(std::time::Duration::from_millis(1));
2289                 }
2290             }
2291         }
2292 
2293         Ok(())
2294     }
2295 
2296     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2297         for vcpu in self.vcpus.iter() {
2298             vcpu.lock().unwrap().resume()?;
2299         }
2300 
2301         // Toggle the vCPUs pause boolean
2302         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
2303 
2304         // Unpark all the vCPU threads.
2305         // Once unparked, the first thing they will do is check the pause
2306         // boolean. Since it is set to false, they will exit their pause loop
2307         // and resume running the guest.
2308         for state in self.vcpu_states.iter() {
2309             state.paused.store(false, Ordering::SeqCst);
2310             state.unpark_thread();
2311         }
2312         Ok(())
2313     }
2314 }
2315 
2316 impl Snapshottable for CpuManager {
2317     fn id(&self) -> String {
2318         CPU_MANAGER_SNAPSHOT_ID.to_string()
2319     }
2320 
2321     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2322         let mut cpu_manager_snapshot = Snapshot::default();
2323 
2324         // The CpuManager snapshot is a collection of all vCPUs snapshots.
2325         for vcpu in &self.vcpus {
2326             let mut vcpu = vcpu.lock().unwrap();
2327             cpu_manager_snapshot.add_snapshot(vcpu.id(), vcpu.snapshot()?);
2328         }
2329 
2330         Ok(cpu_manager_snapshot)
2331     }
2332 }
2333 
2334 impl Transportable for CpuManager {}
2335 impl Migratable for CpuManager {}
2336 
2337 #[cfg(feature = "guest_debug")]
2338 impl Debuggable for CpuManager {
2339     #[cfg(feature = "kvm")]
2340     fn set_guest_debug(
2341         &self,
2342         cpu_id: usize,
2343         addrs: &[GuestAddress],
2344         singlestep: bool,
2345     ) -> std::result::Result<(), DebuggableError> {
2346         self.vcpus[cpu_id]
2347             .lock()
2348             .unwrap()
2349             .vcpu
2350             .set_guest_debug(addrs, singlestep)
2351             .map_err(DebuggableError::SetDebug)
2352     }
2353 
2354     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2355         Ok(())
2356     }
2357 
2358     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2359         Ok(())
2360     }
2361 
2362     #[cfg(target_arch = "x86_64")]
2363     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2364         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, R8-R15
2365         let gregs = self
2366             .get_regs(cpu_id as u8)
2367             .map_err(DebuggableError::ReadRegs)?;
2368         let regs = [
2369             gregs.get_rax(),
2370             gregs.get_rbx(),
2371             gregs.get_rcx(),
2372             gregs.get_rdx(),
2373             gregs.get_rsi(),
2374             gregs.get_rdi(),
2375             gregs.get_rbp(),
2376             gregs.get_rsp(),
2377             gregs.get_r8(),
2378             gregs.get_r9(),
2379             gregs.get_r10(),
2380             gregs.get_r11(),
2381             gregs.get_r12(),
2382             gregs.get_r13(),
2383             gregs.get_r14(),
2384             gregs.get_r15(),
2385         ];
2386 
2387         // GDB exposes 32-bit eflags instead of 64-bit rflags.
2388         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
2389         let eflags = gregs.get_rflags() as u32;
2390         let rip = gregs.get_rip();
2391 
2392         // Segment registers: CS, SS, DS, ES, FS, GS
2393         let sregs = self
2394             .get_sregs(cpu_id as u8)
2395             .map_err(DebuggableError::ReadRegs)?;
2396         let segments = X86SegmentRegs {
2397             cs: sregs.cs.selector as u32,
2398             ss: sregs.ss.selector as u32,
2399             ds: sregs.ds.selector as u32,
2400             es: sregs.es.selector as u32,
2401             fs: sregs.fs.selector as u32,
2402             gs: sregs.gs.selector as u32,
2403         };
2404 
2405         // TODO: Add other registers
2406 
2407         Ok(CoreRegs {
2408             regs,
2409             eflags,
2410             rip,
2411             segments,
2412             ..Default::default()
2413         })
2414     }
2415 
2416     #[cfg(target_arch = "aarch64")]
2417     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2418         let gregs = self
2419             .get_regs(cpu_id as u8)
2420             .map_err(DebuggableError::ReadRegs)?;
2421         Ok(CoreRegs {
2422             x: gregs.get_regs(),
2423             sp: gregs.get_sp(),
2424             pc: gregs.get_pc(),
2425             ..Default::default()
2426         })
2427     }
2428 
2429     #[cfg(target_arch = "x86_64")]
2430     fn write_regs(
2431         &self,
2432         cpu_id: usize,
2433         regs: &CoreRegs,
2434     ) -> std::result::Result<(), DebuggableError> {
2435         let orig_gregs = self
2436             .get_regs(cpu_id as u8)
2437             .map_err(DebuggableError::ReadRegs)?;
2438         let mut gregs = self.create_standard_regs(cpu_id as u8);
2439         gregs.set_rax(regs.regs[0]);
2440         gregs.set_rbx(regs.regs[1]);
2441         gregs.set_rcx(regs.regs[2]);
2442         gregs.set_rdx(regs.regs[3]);
2443         gregs.set_rsi(regs.regs[4]);
2444         gregs.set_rdi(regs.regs[5]);
2445         gregs.set_rbp(regs.regs[6]);
2446         gregs.set_rsp(regs.regs[7]);
2447         gregs.set_r8(regs.regs[8]);
2448         gregs.set_r9(regs.regs[9]);
2449         gregs.set_r10(regs.regs[10]);
2450         gregs.set_r11(regs.regs[11]);
2451         gregs.set_r12(regs.regs[12]);
2452         gregs.set_r13(regs.regs[13]);
2453         gregs.set_r14(regs.regs[14]);
2454         gregs.set_r15(regs.regs[15]);
2455         gregs.set_rip(regs.rip);
2456         // Update the lower 32-bit of rflags.
2457         // Update the lower 32 bits of rflags.
2458 
2459         self.set_regs(cpu_id as u8, &gregs)
2460             .map_err(DebuggableError::WriteRegs)?;
2461 
2462         // Segment registers: CS, SS, DS, ES, FS, GS
2463         // Since GDB cares only about selectors, we call get_sregs() first.
2464         let mut sregs = self
2465             .get_sregs(cpu_id as u8)
2466             .map_err(DebuggableError::ReadRegs)?;
2467         sregs.cs.selector = regs.segments.cs as u16;
2468         sregs.ss.selector = regs.segments.ss as u16;
2469         sregs.ds.selector = regs.segments.ds as u16;
2470         sregs.es.selector = regs.segments.es as u16;
2471         sregs.fs.selector = regs.segments.fs as u16;
2472         sregs.gs.selector = regs.segments.gs as u16;
2473 
2474         self.set_sregs(cpu_id as u8, &sregs)
2475             .map_err(DebuggableError::WriteRegs)?;
2476 
2477         // TODO: Add other registers
2478 
2479         Ok(())
2480     }
2481 
2482     #[cfg(target_arch = "aarch64")]
2483     fn write_regs(
2484         &self,
2485         cpu_id: usize,
2486         regs: &CoreRegs,
2487     ) -> std::result::Result<(), DebuggableError> {
2488         let mut gregs = self
2489             .get_regs(cpu_id as u8)
2490             .map_err(DebuggableError::ReadRegs)?;
2491 
2492         gregs.set_regs(regs.x);
2493         gregs.set_sp(regs.sp);
2494         gregs.set_pc(regs.pc);
2495 
2496         self.set_regs(cpu_id as u8, &gregs)
2497             .map_err(DebuggableError::WriteRegs)?;
2498 
2499         Ok(())
2500     }
2501 
2502     fn read_mem(
2503         &self,
2504         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2505         cpu_id: usize,
2506         vaddr: GuestAddress,
2507         len: usize,
2508     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2509         let mut buf = vec![0; len];
2510         let mut total_read = 0_u64;
2511 
2512         while total_read < len as u64 {
2513             let gaddr = vaddr.0 + total_read;
2514             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2515                 Ok(paddr) => paddr,
2516                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2517                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2518             };
2519             let psize = arch::PAGE_SIZE as u64;
2520             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
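            // Illustrative example: with a 4KB page size, a 16-byte read
            // starting at paddr 0x10ff8 is split into an 8-byte chunk
            // (min(16, 4096 - 0xff8)) and an 8-byte chunk on the next page,
            // each translated separately.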
2521             guest_memory
2522                 .memory()
2523                 .read(
2524                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2525                     GuestAddress(paddr),
2526                 )
2527                 .map_err(DebuggableError::ReadMem)?;
2528             total_read += read_len;
2529         }
2530         Ok(buf)
2531     }
2532 
2533     fn write_mem(
2534         &self,
2535         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2536         cpu_id: usize,
2537         vaddr: &GuestAddress,
2538         data: &[u8],
2539     ) -> std::result::Result<(), DebuggableError> {
2540         let mut total_written = 0_u64;
2541 
2542         while total_written < data.len() as u64 {
2543             let gaddr = vaddr.0 + total_written;
2544             let paddr = match self.translate_gva(guest_memory, cpu_id as u8, gaddr) {
2545                 Ok(paddr) => paddr,
2546                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2547                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2548             };
2549             let psize = arch::PAGE_SIZE as u64;
2550             let write_len = std::cmp::min(
2551                 data.len() as u64 - total_written,
2552                 psize - (paddr & (psize - 1)),
2553             );
2554             guest_memory
2555                 .memory()
2556                 .write(
2557                     &data[total_written as usize..total_written as usize + write_len as usize],
2558                     GuestAddress(paddr),
2559                 )
2560                 .map_err(DebuggableError::WriteMem)?;
2561             total_written += write_len;
2562         }
2563         Ok(())
2564     }
2565 
2566     fn active_vcpus(&self) -> usize {
2567         self.present_vcpus() as usize
2568     }
2569 }
2570 
2571 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2572 impl Elf64Writable for CpuManager {}
2573 
2574 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2575 impl CpuElf64Writable for CpuManager {
2576     fn cpu_write_elf64_note(
2577         &mut self,
2578         dump_state: &DumpState,
2579     ) -> std::result::Result<(), GuestDebuggableError> {
2580         let mut coredump_file = dump_state.file.as_ref().unwrap();
2581         for vcpu in &self.vcpus {
2582             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2583             let mut pos: usize = 0;
2584             let mut buf = vec![0; note_size as usize];
2585             let descsz = size_of::<X86_64ElfPrStatus>();
2586             let vcpu_id = vcpu.lock().unwrap().id;
2587 
2588             let note = Elf64_Nhdr {
2589                 n_namesz: COREDUMP_NAME_SIZE,
2590                 n_descsz: descsz as u32,
2591                 n_type: NT_PRSTATUS,
2592             };
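            // Layout of the note assembled below (sketch): an Elf64_Nhdr
            // header, then the name bytes ("CORE") padded to a 4-byte
            // boundary, then the desc payload, an X86_64ElfPrStatus whose
            // pr_pid field sits at byte offset 32 and whose saved user
            // registers sit at its tail.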
2593 
2594             let bytes: &[u8] = note.as_slice();
2595             buf.splice(0.., bytes.to_vec());
2596             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2597             buf.resize(pos + 4, 0);
2598             buf.splice(pos.., "CORE".to_string().into_bytes());
2599 
2600             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2601             buf.resize(pos + 32 + 4, 0);
2602             let pid = vcpu_id as u64;
2603             let bytes: &[u8] = pid.as_slice();
2604             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2605 
2606             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2607 
2608             let orig_rax: u64 = 0;
2609             let gregs = self.vcpus[usize::from(vcpu_id)]
2610                 .lock()
2611                 .unwrap()
2612                 .vcpu
2613                 .get_regs()
2614                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2615 
2616             let regs1 = [
2617                 gregs.get_r15(),
2618                 gregs.get_r14(),
2619                 gregs.get_r13(),
2620                 gregs.get_r12(),
2621                 gregs.get_rbp(),
2622                 gregs.get_rbx(),
2623                 gregs.get_r11(),
2624                 gregs.get_r10(),
2625             ];
2626             let regs2 = [
2627                 gregs.get_r9(),
2628                 gregs.get_r8(),
2629                 gregs.get_rax(),
2630                 gregs.get_rcx(),
2631                 gregs.get_rdx(),
2632                 gregs.get_rsi(),
2633                 gregs.get_rdi(),
2634                 orig_rax,
2635             ];
2636 
2637             let sregs = self.vcpus[usize::from(vcpu_id)]
2638                 .lock()
2639                 .unwrap()
2640                 .vcpu
2641                 .get_sregs()
2642                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2643 
2644             debug!(
2645                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2646                 gregs.get_rip(),
2647                 gregs.get_rsp(),
2648                 sregs.gs.base,
2649                 sregs.cs.selector,
2650                 sregs.ss.selector,
2651                 sregs.ds.selector,
2652             );
2653 
2654             let regs = X86_64UserRegs {
2655                 regs1,
2656                 regs2,
2657                 rip: gregs.get_rip(),
2658                 cs: sregs.cs.selector as u64,
2659                 eflags: gregs.get_rflags(),
2660                 rsp: gregs.get_rsp(),
2661                 ss: sregs.ss.selector as u64,
2662                 fs_base: sregs.fs.base,
2663                 gs_base: sregs.gs.base,
2664                 ds: sregs.ds.selector as u64,
2665                 es: sregs.es.selector as u64,
2666                 fs: sregs.fs.selector as u64,
2667                 gs: sregs.gs.selector as u64,
2668             };
2669 
2671             let bytes: &[u8] = regs.as_slice();
2672             buf.resize(note_size as usize, 0);
2673             buf.splice(pos.., bytes.to_vec());
2674             buf.resize(note_size as usize, 0);
2675 
2676             coredump_file
2677                 .write(&buf)
2678                 .map_err(GuestDebuggableError::CoredumpFile)?;
2679         }
2680 
2681         Ok(())
2682     }
2683 
2684     fn cpu_write_vmm_note(
2685         &mut self,
2686         dump_state: &DumpState,
2687     ) -> std::result::Result<(), GuestDebuggableError> {
2688         let mut coredump_file = dump_state.file.as_ref().unwrap();
2689         for vcpu in &self.vcpus {
2690             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2691             let mut pos: usize = 0;
2692             let mut buf = vec![0; note_size as usize];
2693             let descsz = size_of::<DumpCpusState>();
2694             let vcpu_id = vcpu.lock().unwrap().id;
2695 
2696             let note = Elf64_Nhdr {
2697                 n_namesz: COREDUMP_NAME_SIZE,
2698                 n_descsz: descsz as u32,
2699                 n_type: 0,
2700             };
2701 
2702             let bytes: &[u8] = note.as_slice();
2703             buf.splice(0.., bytes.to_vec());
2704             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2705 
2706             buf.resize(pos + 4, 0);
2707             buf.splice(pos.., "QEMU".to_string().into_bytes());
2708 
2709             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2710 
2711             let gregs = self.vcpus[usize::from(vcpu_id)]
2712                 .lock()
2713                 .unwrap()
2714                 .vcpu
2715                 .get_regs()
2716                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2717 
2718             let regs1 = [
2719                 gregs.get_rax(),
2720                 gregs.get_rbx(),
2721                 gregs.get_rcx(),
2722                 gregs.get_rdx(),
2723                 gregs.get_rsi(),
2724                 gregs.get_rdi(),
2725                 gregs.get_rsp(),
2726                 gregs.get_rbp(),
2727             ];
2728 
2729             let regs2 = [
2730                 gregs.get_r8(),
2731                 gregs.get_r9(),
2732                 gregs.get_r10(),
2733                 gregs.get_r11(),
2734                 gregs.get_r12(),
2735                 gregs.get_r13(),
2736                 gregs.get_r14(),
2737                 gregs.get_r15(),
2738             ];
2739 
2740             let sregs = self.vcpus[usize::from(vcpu_id)]
2741                 .lock()
2742                 .unwrap()
2743                 .vcpu
2744                 .get_sregs()
2745                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2746 
2747             let mut msrs = vec![MsrEntry {
2748                 index: msr_index::MSR_KERNEL_GS_BASE,
2749                 ..Default::default()
2750             }];
2751 
2752             self.vcpus[vcpu_id as usize]
2753                 .lock()
2754                 .unwrap()
2755                 .vcpu
2756                 .get_msrs(&mut msrs)
2757                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2758             let kernel_gs_base = msrs[0].data;
2759 
2760             let cs = CpuSegment::new(sregs.cs);
2761             let ds = CpuSegment::new(sregs.ds);
2762             let es = CpuSegment::new(sregs.es);
2763             let fs = CpuSegment::new(sregs.fs);
2764             let gs = CpuSegment::new(sregs.gs);
2765             let ss = CpuSegment::new(sregs.ss);
2766             let ldt = CpuSegment::new(sregs.ldt);
2767             let tr = CpuSegment::new(sregs.tr);
2768             let gdt = CpuSegment::new_from_table(sregs.gdt);
2769             let idt = CpuSegment::new_from_table(sregs.idt);
            let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
            let regs = DumpCpusState {
                version: 1,
                size: size_of::<DumpCpusState>() as u32,
                regs1,
                regs2,
                rip: gregs.get_rip(),
                rflags: gregs.get_rflags(),
                cs,
                ds,
                es,
                fs,
                gs,
                ss,
                ldt,
                tr,
                gdt,
                idt,
                cr,
                kernel_gs_base,
            };

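            // Serialize the descriptor after the name, then pad the note back
            // out to its full, 4-byte aligned size before writing it.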
            let bytes: &[u8] = regs.as_slice();
            buf.resize(note_size as usize, 0);
            buf.splice(pos.., bytes.to_vec());
            buf.resize(note_size as usize, 0);

            coredump_file
                .write_all(&buf)
                .map_err(GuestDebuggableError::CoredumpFile)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::layout::{BOOT_STACK_POINTER, ZERO_PAGE_START};
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::arch::x86::{FpuState, LapicState};
    use hypervisor::StandardRegisters;
    use linux_loader::loader::bootparam::setup_header;

    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail unless an irqchip was created beforehand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the values that are expected to represent LVT0 and LVT1.
        let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
        let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

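        // set_lint must program LVT0 for ExtINT and LVT1 for NMI delivery
        // while leaving the other bits of each register untouched.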
        set_lint(&vcpu).unwrap();

        // Compute the values that represent LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
        let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

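        // 0x37f is the x87 control word after reset (all exceptions masked)
        // and 0x1f80 is the architectural MXCSR reset value.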
        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate KVM-related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr would stay 0 and the assert below would fail, so we need
        // to decide whether to drop the check altogether.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::{msr_index, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test checks against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = vec![MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }];

        // get_msrs returns the number of MSRs that it succeeded in reading.
        // We only want to read one in this test.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were set up when we called setup_msrs. We need
        // to assert that the tenth one (i.e. the one with index
        // msr_index::MSR_IA32_MISC_ENABLE) has the data we expect.
        let entry_vec = vcpu.boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    #[test]
    fn test_setup_regs_for_pvh() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

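        // For the PVH boot protocol, rbx must hold the address of the
        // start_info structure and rflags must have only its reserved bit
        // (bit 1) set on entry.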
        let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
        expected_regs.set_rflags(0x0000000000000002u64);
        expected_regs.set_rbx(arch::layout::PVH_INFO_START.0);
        expected_regs.set_rip(1);

        setup_regs(
            &vcpu,
            arch::EntryPoint {
                entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
                setup_header: None,
            },
        )
        .unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }

    #[test]
    fn test_setup_regs_for_bzimage() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

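        // For the Linux 64-bit boot protocol, rsi must point to the zero page
        // (the boot_params structure) and rsp to a usable boot stack.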
        let mut expected_regs: StandardRegisters = vcpu.create_standard_regs();
        expected_regs.set_rflags(0x0000000000000002u64);
        expected_regs.set_rip(1);
        expected_regs.set_rsp(BOOT_STACK_POINTER.0);
        expected_regs.set_rsi(ZERO_PAGE_START.0);

        setup_regs(
            &vcpu,
            arch::EntryPoint {
                entry_addr: vm_memory::GuestAddress(expected_regs.get_rip()),
                setup_header: Some(setup_header {
                    ..Default::default()
                }),
            },
        )
        .unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use std::mem;

    use arch::aarch64::regs;
    use arch::layout;
    use hypervisor::kvm::aarch64::is_system_register;
    use hypervisor::kvm::kvm_bindings::{
        kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM_CORE,
        KVM_REG_SIZE_U64,
    };
    use hypervisor::{arm64_core_reg_id, offset_of};

    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
        // Must fail when the vCPU has not been initialized yet.
        assert!(res.is_err());

        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();
        vcpu.vcpu_init(&kvi).unwrap();

        assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
    }

    #[test]
    fn test_read_mpidr() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when the vCPU has not been initialized yet.
        assert!(vcpu.get_sys_reg(regs::MPIDR_EL1).is_err());

        vcpu.vcpu_init(&kvi).unwrap();
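        // Bit 31 of MPIDR_EL1 is RES1 and all affinity fields are zero for
        // vCPU 0, hence the expected value of 0x80000000.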
        assert_eq!(vcpu.get_sys_reg(regs::MPIDR_EL1).unwrap(), 0x80000000);
    }

    #[test]
    fn test_is_system_register() {
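        // A core register id encodes an offset into user_pt_regs, while a
        // system register id carries the KVM_REG_ARM64_SYSREG flag.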
        let offset = offset_of!(user_pt_regs, pc);
        let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
        assert!(!is_system_register(regid));
        let regid = KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM64_SYSREG as u64;
        assert!(is_system_register(regid));
    }

    #[test]
    fn test_save_restore_core_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        // Must fail when the vCPU has not been initialized yet.
        let res = vcpu.get_regs();
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to get aarch64 core register: Exec format error (os error 8)"
        );

        let mut state = vcpu.create_standard_regs();
        let res = vcpu.set_regs(&state);
        assert!(res.is_err());
        assert_eq!(
            format!("{}", res.unwrap_err()),
            "Failed to set aarch64 core register: Exec format error (os error 8)"
        );

        vcpu.vcpu_init(&kvi).unwrap();
        let res = vcpu.get_regs();
        assert!(res.is_ok());
        state = res.unwrap();
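        // The reset PSTATE selects EL1h and masks the D, A, I and F exception
        // bits, which together encode to 0x3C5.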
        assert_eq!(state.get_pstate(), 0x3C5);

        assert!(vcpu.set_regs(&state).is_ok());
    }

    #[test]
    fn test_get_set_mpstate() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
        vm.get_preferred_target(&mut kvi).unwrap();

        let res = vcpu.get_mp_state();
        assert!(res.is_ok());
        assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
    }
}