// /cloud-hypervisor/vmm/src/cpu.rs (revision b440cb7d2330770cd415b63544a371d4caa2db3a)
// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
#[cfg(feature = "guest_debug")]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
use crate::device_manager::DeviceManager;
#[cfg(feature = "gdb")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
use acpi_tables::{aml, aml::Aml, sdt::Sdt};
use anyhow::anyhow;
use arch::EntryPoint;
use arch::NumaNodes;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs};
#[cfg(feature = "guest_debug")]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(feature = "guest_debug")]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
use hypervisor::{CpuState, HypervisorCpuError, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(feature = "guest_debug")]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use std::collections::BTreeMap;
#[cfg(feature = "guest_debug")]
use std::io::Write;
#[cfg(feature = "guest_debug")]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use thiserror::Error;
use vm_device::BusDevice;
#[cfg(feature = "guest_debug")]
use vm_memory::ByteValued;
#[cfg(feature = "gdb")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

    #[error("Error running vCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

    #[error("Failed to allocate MMIO address for CpuManager")]
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] hypervisor::HypervisorCpuError),

    #[cfg(all(feature = "amx", target_arch = "x86_64"))]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
struct LocalApic {
    pub r#type: u8,
    pub length: u8,
    pub processor_id: u8,
    pub apic_id: u8,
    pub flags: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct ProcessorHierarchyNode {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub flags: u32,
    pub parent: u32,
    pub acpi_processor_id: u32,
    pub num_private_resources: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

#[cfg(feature = "guest_debug")]
// Round `$n` up to the nearest multiple of `$d`.
macro_rules! round_up {
    ($n:expr,$d:expr) => {
        ((($n) + (($d) - 1)) / ($d)) * ($d)
    };
}
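
// A minimal sanity check for `round_up!` (illustrative sketch added here; it
// is not part of the upstream file): the macro is expected to return the
// smallest multiple of `$d` that is greater than or equal to `$n`.
#[cfg(all(test, feature = "guest_debug"))]
mod round_up_sketch_tests {
    #[test]
    fn rounds_up_to_next_multiple() {
        assert_eq!(round_up!(0usize, 4usize), 0);
        assert_eq!(round_up!(5usize, 4usize), 8);
        assert_eq!(round_up!(8usize, 4usize), 8);
        assert_eq!(round_up!(13usize, 4usize), 16);
    }
}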

/// A wrapper around creating and using a kvm-based VCPU.
pub struct Vcpu {
    // The hypervisor abstracted CPU.
    vcpu: Arc<dyn hypervisor::Vcpu>,
    id: u8,
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    saved_state: Option<CpuState>,
}

impl Vcpu {
    /// Constructs a new VCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `vm` - The virtual machine this vcpu will get attached to.
    /// * `vm_ops` - Optional object for exit handling.
    pub fn new(
        id: u8,
        vm: &Arc<dyn hypervisor::Vm>,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> Result<Self> {
        let vcpu = vm
            .create_vcpu(id, vm_ops)
            .map_err(|e| Error::VcpuCreate(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
        })
    }

    /// Configures a vCPU. This should be called once per vCPU, right after creation.
    ///
    /// # Arguments
    ///
    /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
    /// * `vm_memory` - Guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        kernel_entry_point: Option<EntryPoint>,
        #[cfg(target_arch = "x86_64")] vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point)
                .map_err(Error::VcpuConfiguration)?;
        }
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(
            &self.vcpu,
            self.id,
            kernel_entry_point,
            vm_memory,
            cpuid,
            kvm_hyperv,
        )
        .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64 specific vcpu for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();

        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;
        // We already checked that the capability is supported.
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
        // Non-boot cpus are powered off initially.
        if self.id > 0 {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
        }
        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
    }

    /// Runs the VCPU until it exits, returning the reason.
    ///
    /// Note that the state of the VCPU and associated VM must be setup first for this to do
    /// anything useful.
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }
}

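// Typical lifecycle for a vCPU on x86_64 (illustrative sketch added for
// clarity; the names come from this file, error handling elided):
//
//   let mut vcpu = Vcpu::new(0, &vm, Some(vm_ops))?;
//   vcpu.configure(Some(entry_point), &vm_memory, cpuid, /* kvm_hyperv */ false)?;
//   loop {
//       match vcpu.run()? { /* handle the VmExit variants */ }
//   }
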
const VCPU_SNAPSHOT_ID: &str = "vcpu";
impl Pausable for Vcpu {}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        VCPU_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let saved_state = self
            .vcpu
            .state()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?;

        let mut vcpu_snapshot = Snapshot::new(&format!("{:03}", self.id));
        vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state(
            VCPU_SNAPSHOT_ID,
            &saved_state,
        )?);

        self.saved_state = Some(saved_state);

        Ok(vcpu_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        let saved_state: CpuState = snapshot.to_state(VCPU_SNAPSHOT_ID)?;

        self.vcpu
            .set_state(&saved_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e)))?;

        self.saved_state = Some(saved_state);

        Ok(())
    }
}

pub struct CpuManager {
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    cpuid: Vec<CpuIdEntry>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    #[cfg(feature = "gdb")]
    vm_debug_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vm_ops: Arc<dyn VmOps>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: Option<GuestAddress>,
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
    affinity: BTreeMap<u8, Vec<u8>>,
    dynamic: bool,
}

const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;

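// Register layout of the CPU manager ACPI device (summary comment added for
// clarity; it restates the `BusDevice` handlers below):
//
//   CPU_SELECTION_OFFSET (0): read/write byte selecting the vCPU id that the
//       status register refers to.
//   CPU_STATUS_OFFSET (4): reads return a bitmap of CPU_ENABLE_FLAG,
//       CPU_INSERTING_FLAG and CPU_REMOVING_FLAG for the selected vCPU;
//       writes acknowledge insertion/removal, and setting CPU_EJECT_FLAG
//       triggers removal of the selected vCPU.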
impl BusDevice for CpuManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
        data.fill(0);

        match offset {
            CPU_SELECTION_OFFSET => {
                data[0] = self.selected_cpu;
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &self.vcpu_states[usize::from(self.selected_cpu)];
                    if state.active() {
                        data[0] |= 1 << CPU_ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << CPU_INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << CPU_REMOVING_FLAG;
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            CPU_SELECTION_OFFSET => {
                self.selected_cpu = data[0];
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
                        && state.inserting
                    {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
                        && state.removing
                    {
                        state.removing = false;
                    }
                    // Trigger removal of vCPU
                    if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
                        if let Err(e) = self.remove_vcpu(self.selected_cpu) {
                            error!("Error removing vCPU: {:?}", e);
                        }
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
        None
    }
}

#[derive(Default)]
struct VcpuState {
    inserting: bool,
    removing: bool,
    handle: Option<thread::JoinHandle<()>>,
    kill: Arc<AtomicBool>,
    vcpu_run_interrupted: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

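// Teardown sketch (summary comment added for clarity): a vCPU thread is
// stopped by setting its `kill` flag, calling signal_thread() so SIGRTMIN
// interrupts the KVM_RUN ioctl, and then join_thread(); see remove_vcpu()
// and shutdown() below.
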
impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        device_manager: &Arc<Mutex<DeviceManager>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        let guest_memory = memory_manager.lock().unwrap().guest_memory();
        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);

        #[cfg(target_arch = "x86_64")]
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
        #[cfg(target_arch = "x86_64")]
        let cpuid = {
            let phys_bits = physical_bits(config.max_phys_bits);
            arch::generate_common_cpuid(
                hypervisor,
                config
                    .topology
                    .clone()
                    .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)),
                sgx_epc_sections,
                phys_bits,
                config.kvm_hyperv,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CommonCpuId)?
        };
        #[cfg(all(feature = "amx", target_arch = "x86_64"))]
        if config.features.amx {
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // This is safe as the syscall only modifies kernel-internal
            // data structures that the kernel itself is expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
638                 // modified in unsafe only which is permitted) isn't in use elsewhere.
639                 let mask: usize = 0;
640                 let result = unsafe {
641                     libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
642                 };
643                 if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
644                     return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
645                 }
646             }
647         }
648 
649         let device_manager = device_manager.lock().unwrap();
650 
651         let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
652             let mut cpu_list = Vec::new();
653             for (proximity_domain, numa_node) in numa_nodes.iter() {
654                 for cpu in numa_node.cpus.iter() {
655                     cpu_list.push((*cpu, *proximity_domain))
656                 }
657             }
658             cpu_list
659         }
660         .into_iter()
661         .collect();
662 
663         let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
664             cpu_affinity
665                 .iter()
666                 .map(|a| (a.vcpu, a.host_cpus.clone()))
667                 .collect()
668         } else {
669             BTreeMap::new()
670         };
671 
672         #[cfg(feature = "tdx")]
673         let dynamic = !tdx_enabled;
674         #[cfg(not(feature = "tdx"))]
675         let dynamic = true;
676 
677         let acpi_address = if dynamic {
678             Some(
679                 device_manager
680                     .allocator()
681                     .lock()
682                     .unwrap()
683                     .allocate_platform_mmio_addresses(None, CPU_MANAGER_ACPI_SIZE as u64, None)
684                     .ok_or(Error::AllocateMmmioAddress)?,
685             )
686         } else {
687             None
688         };
689 
690         let cpu_manager = Arc::new(Mutex::new(CpuManager {
691             config: config.clone(),
692             interrupt_controller: device_manager.interrupt_controller().clone(),
693             vm_memory: guest_memory,
694             #[cfg(target_arch = "x86_64")]
695             cpuid,
696             vm,
697             vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
698             vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
699             vcpu_states,
700             exit_evt,
701             reset_evt,
702             #[cfg(feature = "gdb")]
703             vm_debug_evt,
704             selected_cpu: 0,
705             vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
706             seccomp_action,
707             vm_ops,
708             acpi_address,
709             proximity_domain_per_cpu,
710             affinity,
711             dynamic,
712         }));
713 
714         if let Some(acpi_address) = acpi_address {
715             device_manager
716                 .mmio_bus()
717                 .insert(
718                     cpu_manager.clone(),
719                     acpi_address.0,
720                     CPU_MANAGER_ACPI_SIZE as u64,
721                 )
722                 .map_err(Error::BusError)?;
723         }
724 
725         Ok(cpu_manager)
726     }
727 
728     fn create_vcpu(
729         &mut self,
730         cpu_id: u8,
731         entry_point: Option<EntryPoint>,
732         snapshot: Option<Snapshot>,
733     ) -> Result<()> {
734         info!("Creating vCPU: cpu_id = {}", cpu_id);
735 
736         let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?;
737 
738         if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after being created.
            #[cfg(target_arch = "aarch64")]
            vcpu.init(&self.vm)?;

            vcpu.restore(snapshot).expect("Failed to restore vCPU");
        } else {
            #[cfg(target_arch = "x86_64")]
            vcpu.configure(
                entry_point,
                &self.vm_memory,
                self.cpuid.clone(),
                self.config.kvm_hyperv,
            )
            .expect("Failed to configure vCPU");

            #[cfg(target_arch = "aarch64")]
            vcpu.configure(&self.vm, entry_point)
                .expect("Failed to configure vCPU");
        }

        // Adding vCPU to the CpuManager's vCPU list.
        let vcpu = Arc::new(Mutex::new(vcpu));
        self.vcpus.push(vcpu);

        Ok(())
    }

    /// Only create new vCPUs if there aren't any inactive ones to reuse
    fn create_vcpus(&mut self, desired_vcpus: u8, entry_point: Option<EntryPoint>) -> Result<()> {
        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.config.max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        for cpu_id in self.vcpus.len() as u8..desired_vcpus {
            self.create_vcpu(cpu_id, entry_point, None)?;
        }

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    pub fn init_pmu(&self, irq: u32) -> Result<bool> {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };

        for cpu in self.vcpus.iter() {
            let tmp = irq;
            let cpu_attr_irq = kvm_bindings::kvm_device_attr {
                group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
                attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
                addr: &tmp as *const u32 as u64,
                flags: 0,
            };

            // Check if the PMU attribute is available; if not, log it and skip PMU init.
            if cpu.lock().unwrap().vcpu.has_vcpu_attr(&cpu_attr).is_ok() {
                // Set irq for PMU
                cpu.lock()
                    .unwrap()
                    .vcpu
                    .set_vcpu_attr(&cpu_attr_irq)
                    .map_err(Error::InitPmu)?;

                // Init PMU
                cpu.lock()
                    .unwrap()
                    .vcpu
                    .set_vcpu_attr(&cpu_attr)
                    .map_err(Error::InitPmu)?;
            } else {
                debug!(
                    "PMU attribute is not supported in vCPU{}, skipping PMU init!",
                    cpu.lock().unwrap().id
                );
                return Ok(false);
            }
        }

        Ok(true)
    }

    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_id: u8,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        #[cfg(feature = "gdb")]
        let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
        let panic_exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
            .vcpu_run_interrupted
            .clone();
        let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();

        // Prepare the CPU set the current vCPU is expected to run on.
        let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
            let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
            unsafe { libc::CPU_ZERO(&mut cpuset) };
            for host_cpu in host_cpus {
                unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) };
            }
            cpuset
        });

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter = get_seccomp_filter(&self.seccomp_action, Thread::Vcpu)
            .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        info!("Starting vCPU: cpu_id = {}", vcpu_id);

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{}", vcpu_id))
                .spawn(move || {
                    // Schedule the thread to run on the expected CPU set
                    if let Some(cpuset) = cpuset.as_ref() {
                        let ret = unsafe {
                            libc::sched_setaffinity(
                                0,
                                std::mem::size_of::<libc::cpu_set_t>(),
                                cpuset as *const libc::cpu_set_t,
                            )
                        };

                        if ret != 0 {
                            error!(
                                "Failed scheduling the vCPU {} on the expected CPU set: {}",
                                vcpu_id,
                                io::Error::last_os_error()
                            );
                            return;
                        }
                    }

                    // Apply seccomp filter for vcpu thread.
                    if !vcpu_seccomp_filter.is_empty() {
                        if let Err(e) =
                            apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                        {
                            error!("Error applying seccomp filter: {:?}", e);
                            return;
                        }
                    }
                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // Register an async-signal-safe (empty) handler so SIGRTMIN interrupts
                    // KVM_RUN without killing the vCPU thread.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");
                    // Block until all CPUs are ready.
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unparking the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads.

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                // As a pause can be caused by PIO & MMIO exits, we need to ensure they are
                                // completed by returning to KVM_RUN. From the kernel docs:
                                //
                                // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                                // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                                // operations are complete (and guest state is consistent) only after userspace
                                // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
                                // incomplete operations and then check for pending signals.
                                // The pending state of the operation is not preserved in state which is
                                // visible to userspace, thus userspace should ensure that the operation is
                                // completed before performing a live migration.  Userspace can re-enter the
                                // guest with an unmasked signal pending or with the immediate_exit field set
                                // to complete pending operations without allowing any further instructions
                                // to be executed.

                                #[cfg(feature = "kvm")]
                                {
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                    if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                        error!("Unexpected VM exit on \"immediate_exit\" run");
                                        break;
                                    }
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                                }

                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            #[cfg(feature = "tdx")]
                            let mut vcpu = vcpu.lock().unwrap();
                            #[cfg(not(feature = "tdx"))]
                            let vcpu = vcpu.lock().unwrap();
                            // On a triple fault, vcpu.run() returns VmExit::Reset, which triggers a reset below.
                            match vcpu.run() {
                                Ok(run) => match run {
                                    #[cfg(all(target_arch = "x86_64", feature = "kvm"))]
                                    VmExit::Debug => {
                                        info!("VmExit::Debug");
                                        #[cfg(feature = "gdb")]
                                        {
                                            vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                            let raw_tid = get_raw_tid(vcpu_id as usize);
                                            vm_debug_evt.write(raw_tid as u64).unwrap();
                                        }
                                    }
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
                                        {
                                            interrupt_controller
                                                .lock()
                                                .unwrap()
                                                .end_of_interrupt(vector);
                                        }
                                    }
                                    VmExit::Ignore => {}
                                    VmExit::Hyperv => {}
                                    VmExit::Reset => {
                                        info!("VmExit::Reset");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        reset_evt.write(1).unwrap();
                                        break;
                                    }
                                    VmExit::Shutdown => {
                                        info!("VmExit::Shutdown");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                    #[cfg(feature = "tdx")]
                                    VmExit::Tdx => {
                                        if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
                                            match vcpu.get_tdx_exit_details() {
                                                Ok(details) => match details {
                                                    TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
                                                    TdxExitDetails::SetupEventNotifyInterrupt => {
                                                        warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
                                                    }
                                                },
                                                Err(e) => error!("Unexpected TDX VMCALL: {}", e),
                                            }
                                            vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
                                        } else {
                                            // We should never reach this code as
                                            // this means the design from the code
                                            // is wrong.
                                            unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
                                        }
                                    }
                                    _ => {
                                        error!(
                                            "VCPU generated error: {:?}",
                                            Error::UnexpectedVmExit
                                        );
                                        break;
                                    }
                                },

                                Err(e) => {
                                    error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                    break;
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }
                        }
                    })
                    .or_else(|_| {
                        panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
                        error!("vCPU thread panicked");
                        panic_exit_evt.write(1)
                    })
                    .ok();
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // When this function is called for CPU hotplug, entry_point is None. It
        // is for those hotplugged CPU additions that the inserting flag must be set.
        self.vcpu_states[usize::from(vcpu_id)].handle = handle;
        self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
    fn activate_vcpus(&mut self, desired_vcpus: u8, inserting: bool) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for vcpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal; actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
        }
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let mut state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" flag so that it can be reused
        state.kill.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(&mut self, entry_point: Option<EntryPoint>) -> Result<()> {
        self.create_vcpus(self.boot_vcpus(), entry_point)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false)
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        let vcpu_numbers = self.vcpus.len() as u8;
        let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_numbers + 1) as usize));
        // Restore the vCPUs in "paused" state.
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        for vcpu_id in 0..vcpu_numbers {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);

            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), false)
                .map_err(|e| {
                    Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
                })?;
        }
        // Unblock all restored CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus.cmp(&self.present_vcpus()) == cmp::Ordering::Equal {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                self.create_vcpus(desired_vcpus, None)?;
                self.activate_vcpus(desired_vcpus, true)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }

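    // Illustrative use of resize() for vCPU hotplug (sketch added for clarity,
    // not part of the upstream file); callers hold the CpuManager mutex:
    //
    //   let changed = cpu_manager.lock().unwrap().resize(desired_vcpus)?;
    //   if changed {
    //       // Notify the guest (e.g. via the ACPI GED device) so it onlines
    //       // newly inserted vCPUs or ejects the ones marked for removal.
    //   }
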
1172     pub fn shutdown(&mut self) -> Result<()> {
1173         // Tell the vCPUs to stop themselves next time they go through the loop
1174         self.vcpus_kill_signalled.store(true, Ordering::SeqCst);
1175 
1176         // Toggle the vCPUs pause boolean
1177         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1178 
1179         // Unpark all the VCPU threads.
1180         for state in self.vcpu_states.iter() {
1181             state.unpark_thread();
1182         }
1183 
1184         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1185         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1186         // above.
1187         for state in self.vcpu_states.iter() {
1188             state.signal_thread();
1189         }
1190 
1191         // Wait for all the threads to finish. This removes the state from the vector.
1192         for mut state in self.vcpu_states.drain(..) {
1193             state.join_thread()?;
1194         }
1195 
1196         Ok(())
1197     }
1198 
1199     #[cfg(feature = "tdx")]
1200     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
1201         for vcpu in &self.vcpus {
1202             vcpu.lock()
1203                 .unwrap()
1204                 .vcpu
1205                 .tdx_init(hob_address)
1206                 .map_err(Error::InitializeTdx)?;
1207         }
1208         Ok(())
1209     }
1210 
1211     pub fn boot_vcpus(&self) -> u8 {
1212         self.config.boot_vcpus
1213     }
1214 
1215     pub fn max_vcpus(&self) -> u8 {
1216         self.config.max_vcpus
1217     }
1218 
1219     #[cfg(target_arch = "x86_64")]
1220     pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
1221         self.cpuid.clone()
1222     }
1223 
1224     fn present_vcpus(&self) -> u8 {
1225         self.vcpu_states
1226             .iter()
1227             .fold(0, |acc, state| acc + state.active() as u8)
1228     }
1229 
1230     #[cfg(target_arch = "aarch64")]
1231     pub fn get_mpidrs(&self) -> Vec<u64> {
1232         self.vcpus
1233             .iter()
1234             .map(|cpu| cpu.lock().unwrap().get_mpidr())
1235             .collect()
1236     }
1237 
1238     #[cfg(target_arch = "aarch64")]
1239     pub fn get_saved_states(&self) -> Vec<CpuState> {
1240         self.vcpus
1241             .iter()
1242             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
1243             .collect()
1244     }
1245 
1246     #[cfg(target_arch = "aarch64")]
1247     pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
1248         self.config
1249             .topology
1250             .clone()
1251             .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
1252     }
1253 
1254     pub fn create_madt(&self) -> Sdt {
1255         use crate::acpi;
1256         // This is also checked in the commandline parsing.
1257         assert!(self.config.boot_vcpus <= self.config.max_vcpus);
1258 
1259         let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
1260         #[cfg(target_arch = "x86_64")]
1261         {
1262             madt.write(36, arch::layout::APIC_START);
1263 
1264             for cpu in 0..self.config.max_vcpus {
1265                 let lapic = LocalApic {
1266                     r#type: acpi::ACPI_APIC_PROCESSOR,
1267                     length: 8,
1268                     processor_id: cpu,
1269                     apic_id: cpu,
1270                     flags: if cpu < self.config.boot_vcpus {
1271                         1 << MADT_CPU_ENABLE_FLAG
1272                     } else {
1273                         0
1274                     } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
1275                 };
1276                 madt.append(lapic);
1277             }
1278 
1279             madt.append(Ioapic {
1280                 r#type: acpi::ACPI_APIC_IO,
1281                 length: 12,
1282                 ioapic_id: 0,
1283                 apic_address: arch::layout::IOAPIC_START.0 as u32,
1284                 gsi_base: 0,
1285                 ..Default::default()
1286             });
1287 
1288             madt.append(InterruptSourceOverride {
1289                 r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
1290                 length: 10,
1291                 bus: 0,
1292                 source: 4,
1293                 gsi: 4,
1294                 flags: 0,
1295             });
1296         }
1297 
1298         #[cfg(target_arch = "aarch64")]
1299         {
1300             use vm_memory::Address;
1301             /* Notes:
1302              * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
1303              */
1304 
1305             // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
1306             for cpu in 0..self.config.boot_vcpus {
1307                 let vcpu = &self.vcpus[cpu as usize];
1308                 let mpidr = vcpu.lock().unwrap().get_mpidr();
1309                 /* ARMv8 MPIDR format:
1310                      Bits [63:40] Must be zero
1311                      Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
1312                      Bits [31:24] Must be zero
1313                      Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
1314                      Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
1315                      Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
1316                 */
1317                 let mpidr_mask = 0xff_00ff_ffff;
1318                 let gicc = GicC {
1319                     r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
1320                     length: 80,
1321                     reserved0: 0,
1322                     cpu_interface_number: cpu as u32,
1323                     uid: cpu as u32,
1324                     flags: 1,
1325                     parking_version: 0,
1326                     performance_interrupt: 0,
1327                     parked_address: 0,
1328                     base_address: 0,
1329                     gicv_base_address: 0,
1330                     gich_base_address: 0,
1331                     vgic_interrupt: 0,
1332                     gicr_base_address: 0,
1333                     mpidr: mpidr & mpidr_mask,
1334                     proc_power_effi_class: 0,
1335                     reserved1: 0,
1336                     spe_overflow_interrupt: 0,
1337                 };
1338 
1339                 madt.append(gicc);
1340             }
1341 
1342             // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
1343             let gicd = GicD {
1344                 r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
1345                 length: 24,
1346                 reserved0: 0,
1347                 gic_id: 0,
1348                 base_address: arch::layout::MAPPED_IO_START.raw_value() - 0x0001_0000,
1349                 global_irq_base: 0,
1350                 version: 3,
1351                 reserved1: [0; 3],
1352             };
1353             madt.append(gicd);
1354 
1355             // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
1356             let gicr_size: u32 = 0x0001_0000 * 2 * (self.config.boot_vcpus as u32);
1357             let gicr_base: u64 =
1358                 arch::layout::MAPPED_IO_START.raw_value() - 0x0001_0000 - gicr_size as u64;
1359             let gicr = GicR {
1360                 r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
1361                 length: 16,
1362                 reserved: 0,
1363                 base_address: gicr_base,
1364                 range_length: gicr_size,
1365             };
1366             madt.append(gicr);
1367 
1368             // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
1369             let gicits = GicIts {
1370                 r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
1371                 length: 20,
1372                 reserved0: 0,
1373                 translation_id: 0,
1374                 base_address: gicr_base - 2 * 0x0001_0000,
1375                 reserved1: 0,
1376             };
1377             madt.append(gicits);
1378 
1379             madt.update_checksum();
1380         }
1381 
1382         madt
1383     }
1384 
1385     #[cfg(target_arch = "aarch64")]
1386     pub fn create_pptt(&self) -> Sdt {
1387         let pptt_start = 0;
1388         let mut cpus = 0;
1389         let mut uid = 0;
1390         // If topology is not specified, the default setting is:
1391         // 1 package, multiple cores, 1 thread per core
1392         // This is also the behavior when PPTT is missing.
1393         let (threads_per_core, cores_per_package, packages) =
1394             self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));
1395 
1396         let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);
1397 
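        // ProcessorHierarchyNode flags used below, per the ACPI PPTT
        // "Processor Structure Flags": bit 0 = physical package, bit 1 = ACPI
        // processor ID valid, bit 2 = processor is a thread, bit 3 = node is
        // a leaf. Hence 0x2 for clusters and non-leaf cores, 0xA for leaf
        // cores and 0xE for leaf threads.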
1398         for cluster_idx in 0..packages {
1399             if cpus < self.config.boot_vcpus as usize {
1400                 let cluster_offset = pptt.len() - pptt_start;
1401                 let cluster_hierarchy_node = ProcessorHierarchyNode {
1402                     r#type: 0,
1403                     length: 20,
1404                     reserved: 0,
1405                     flags: 0x2,
1406                     parent: 0,
1407                     acpi_processor_id: cluster_idx as u32,
1408                     num_private_resources: 0,
1409                 };
1410                 pptt.append(cluster_hierarchy_node);
1411 
1412                 for core_idx in 0..cores_per_package {
1413                     let core_offset = pptt.len() - pptt_start;
1414 
1415                     if threads_per_core > 1 {
1416                         let core_hierarchy_node = ProcessorHierarchyNode {
1417                             r#type: 0,
1418                             length: 20,
1419                             reserved: 0,
1420                             flags: 0x2,
1421                             parent: cluster_offset as u32,
1422                             acpi_processor_id: core_idx as u32,
1423                             num_private_resources: 0,
1424                         };
1425                         pptt.append(core_hierarchy_node);
1426 
1427                         for _thread_idx in 0..threads_per_core {
1428                             let thread_hierarchy_node = ProcessorHierarchyNode {
1429                                 r#type: 0,
1430                                 length: 20,
1431                                 reserved: 0,
1432                                 flags: 0xE,
1433                                 parent: core_offset as u32,
1434                                 acpi_processor_id: uid as u32,
1435                                 num_private_resources: 0,
1436                             };
1437                             pptt.append(thread_hierarchy_node);
1438                             uid += 1;
1439                         }
1440                     } else {
1441                         let thread_hierarchy_node = ProcessorHierarchyNode {
1442                             r#type: 0,
1443                             length: 20,
1444                             reserved: 0,
1445                             flags: 0xA,
1446                             parent: cluster_offset as u32,
1447                             acpi_processor_id: uid as u32,
1448                             num_private_resources: 0,
1449                         };
1450                         pptt.append(thread_hierarchy_node);
1451                         uid += 1;
1452                     }
1453                 }
1454                 cpus += (cores_per_package * threads_per_core) as usize;
1455             }
1456         }
1457 
1458         pptt.update_checksum();
1459         pptt
1460     }
1461 
1462     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1463     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1464         self.vcpus[usize::from(cpu_id)]
1465             .lock()
1466             .unwrap()
1467             .vcpu
1468             .get_regs()
1469             .map_err(Error::CpuDebug)
1470     }
1471 
1472     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1473     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1474         self.vcpus[usize::from(cpu_id)]
1475             .lock()
1476             .unwrap()
1477             .vcpu
1478             .set_regs(regs)
1479             .map_err(Error::CpuDebug)
1480     }
1481 
1482     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1483     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1484         self.vcpus[usize::from(cpu_id)]
1485             .lock()
1486             .unwrap()
1487             .vcpu
1488             .get_sregs()
1489             .map_err(Error::CpuDebug)
1490     }
1491 
1492     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1493     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1494         self.vcpus[usize::from(cpu_id)]
1495             .lock()
1496             .unwrap()
1497             .vcpu
1498             .set_sregs(sregs)
1499             .map_err(Error::CpuDebug)
1500     }
1501 
1502     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1503     fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> {
1504         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1505             .lock()
1506             .unwrap()
1507             .vcpu
1508             .translate_gva(gva, /* flags: unused */ 0)
1509             .map_err(Error::TranslateVirtualAddress)?;
1510         Ok(gpa)
1511     }
1512 
1513     pub fn vcpus_paused(&self) -> bool {
1514         self.vcpus_pause_signalled.load(Ordering::SeqCst)
1515     }
1516 }
1517 
1518 struct Cpu {
1519     cpu_id: u8,
1520     proximity_domain: u32,
1521     dynamic: bool,
1522 }
1523 
1524 #[cfg(target_arch = "x86_64")]
1525 const MADT_CPU_ENABLE_FLAG: usize = 0;
1526 
1527 #[cfg(target_arch = "x86_64")]
1528 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1529 
1530 impl Cpu {
1531     #[cfg(target_arch = "x86_64")]
1532     fn generate_mat(&self) -> Vec<u8> {
1533         let lapic = LocalApic {
1534             r#type: 0,
1535             length: 8,
1536             processor_id: self.cpu_id,
1537             apic_id: self.cpu_id,
1538             flags: 1 << MADT_CPU_ENABLE_FLAG,
1539         };
1540 
1541         let mut mat_data: Vec<u8> = Vec::new();
1542         mat_data.resize(std::mem::size_of_val(&lapic), 0);
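        // SAFETY: mat_data was resized to size_of_val(&lapic) bytes, and
        // LocalApic is a packed, plain-data struct, so this raw write is in
        // bounds and has no alignment requirement.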
1543         unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic };
1544 
1545         mat_data
1546     }
1547 }
1548 
1549 impl Aml for Cpu {
1550     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1551         #[cfg(target_arch = "x86_64")]
1552         let mat_data: Vec<u8> = self.generate_mat();
1553         #[allow(clippy::if_same_then_else)]
1554         if self.dynamic {
1555             aml::Device::new(
1556                 format!("C{:03}", self.cpu_id).as_str().into(),
1557                 vec![
1558                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1559                     &aml::Name::new("_UID".into(), &self.cpu_id),
1560                     // Currently, AArch64 does not support the following fields.
1561                     /*
1562                     _STA return value:
1563                     Bit [0] – Set if the device is present.
1564                     Bit [1] – Set if the device is enabled and decoding its resources.
1565                     Bit [2] – Set if the device should be shown in the UI.
1566                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1567                     Bit [4] – Set if the battery is present.
1568                     Bits [31:5] – Reserved (must be cleared).
1569                     */
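                    // e.g. a _STA value of 0xF means present, enabled, shown
                    // in the UI and functioning, which is what CSTA reports
                    // for an enabled CPU.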
1570                     #[cfg(target_arch = "x86_64")]
1571                     &aml::Method::new(
1572                         "_STA".into(),
1573                         0,
1574                         false,
1575                         // Call into CSTA method which will interrogate device
1576                         vec![&aml::Return::new(&aml::MethodCall::new(
1577                             "CSTA".into(),
1578                             vec![&self.cpu_id],
1579                         ))],
1580                     ),
1581                     &aml::Method::new(
1582                         "_PXM".into(),
1583                         0,
1584                         false,
1585                         vec![&aml::Return::new(&self.proximity_domain)],
1586                     ),
1587                     // The Linux kernel expects every CPU device to have a _MAT entry
1588                     // containing the LAPIC for this processor with the enabled bit set
1589                     // even if it is disabled in the MADT (non-boot CPU)
1590                     #[cfg(target_arch = "x86_64")]
1591                     &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
1592                     // Trigger CPU ejection
1593                     #[cfg(target_arch = "x86_64")]
1594                     &aml::Method::new(
1595                         "_EJ0".into(),
1596                         1,
1597                         false,
1598                         // Call into CEJ0 method which will actually eject device
1599                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1600                     ),
1601                 ],
1602             )
1603             .append_aml_bytes(bytes);
1604         } else {
1605             aml::Device::new(
1606                 format!("C{:03}", self.cpu_id).as_str().into(),
1607                 vec![
1608                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1609                     &aml::Name::new("_UID".into(), &self.cpu_id),
1610                     #[cfg(target_arch = "x86_64")]
1611                     &aml::Method::new(
1612                         "_STA".into(),
1613                         0,
1614                         false,
1615                         // Mark the CPU as present; see the CSTA implementation.
1616                         vec![&aml::Return::new(&0xfu8)],
1617                     ),
1618                     &aml::Method::new(
1619                         "_PXM".into(),
1620                         0,
1621                         false,
1622                         vec![&aml::Return::new(&self.proximity_domain)],
1623                     ),
1624                     // The Linux kernel expects every CPU device to have a _MAT entry
1625                     // containing the LAPIC for this processor with the enabled bit set
1626                     // even if it is disabled in the MADT (non-boot CPU)
1627                     #[cfg(target_arch = "x86_64")]
1628                     &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
1629                 ],
1630             )
1631             .append_aml_bytes(bytes);
1632         }
1633     }
1634 }
1635 
1636 struct CpuNotify {
1637     cpu_id: u8,
1638 }
1639 
1640 impl Aml for CpuNotify {
1641     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
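        // Emits the ASL equivalent of:
        //   If (Arg0 == <cpu_id>) { Notify (C<cpu_id>, Arg1) }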
1642         let object = aml::Path::new(&format!("C{:03}", self.cpu_id));
1643         aml::If::new(
1644             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
1645             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1646         )
1647         .append_aml_bytes(bytes)
1648     }
1649 }
1650 
1651 struct CpuMethods {
1652     max_vcpus: u8,
1653     dynamic: bool,
1654 }
1655 
1656 impl Aml for CpuMethods {
1657     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1658         if self.dynamic {
1659             // CPU status method
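            // Illustrative ASL equivalent of the method built below:
            //   Method (CSTA, 1, Serialized) {
            //       Acquire (\_SB.PRES.CPLK, 0xFFFF)
            //       \_SB.PRES.CSEL = Arg0
            //       Local0 = Zero
            //       If (\_SB.PRES.CPEN == One) { Local0 = 0xF }
            //       Release (\_SB.PRES.CPLK)
            //       Return (Local0)
            //   }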
1660             aml::Method::new(
1661                 "CSTA".into(),
1662                 1,
1663                 true,
1664                 vec![
1665                     // Take lock defined above
1666                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1667                     // Write CPU number (in first argument) to I/O port via field
1668                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1669                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1670                     // Check if the CPEN bit is set; if so, set the local variable to 0xf (see _STA for the meaning)
1671                     &aml::If::new(
1672                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
1673                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
1674                     ),
1675                     // Release lock
1676                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1677                     // Return 0 or 0xf
1678                     &aml::Return::new(&aml::Local(0)),
1679                 ],
1680             )
1681             .append_aml_bytes(bytes);
1682 
1683             let mut cpu_notifies = Vec::new();
1684             for cpu_id in 0..self.max_vcpus {
1685                 cpu_notifies.push(CpuNotify { cpu_id });
1686             }
1687 
1688             let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
1689             for cpu_id in 0..self.max_vcpus {
1690                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
1691             }
1692 
1693             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).append_aml_bytes(bytes);
1694 
1695             aml::Method::new(
1696                 "CEJ0".into(),
1697                 1,
1698                 true,
1699                 vec![
1700                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1701                     // Write CPU number (in first argument) to I/O port via field
1702                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1703                     // Set CEJ0 bit
1704                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
1705                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1706                 ],
1707             )
1708             .append_aml_bytes(bytes);
1709 
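            // CSCN: CPU scan method. It walks every possible CPU, notifying
            // the ones the hotplug controller has flagged as inserted or
            // removed.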
1710             aml::Method::new(
1711                 "CSCN".into(),
1712                 0,
1713                 true,
1714                 vec![
1715                     // Take lock defined above
1716                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1717                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1718                     &aml::While::new(
1719                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
1720                         vec![
1721                             // Write CPU number (in first argument) to I/O port via field
1722                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
1723                             // Check if CINS bit is set
1724                             &aml::If::new(
1725                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
1726                                 // Notify device if it is
1727                                 vec![
1728                                     &aml::MethodCall::new(
1729                                         "CTFY".into(),
1730                                         vec![&aml::Local(0), &aml::ONE],
1731                                     ),
1732                                     // Reset CINS bit (the device handles this write as write-one-to-clear)
1733                                     &aml::Store::new(
1734                                         &aml::Path::new("\\_SB_.PRES.CINS"),
1735                                         &aml::ONE,
1736                                     ),
1737                                 ],
1738                             ),
1739                             // Check if CRMV bit is set
1740                             &aml::If::new(
1741                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
1742                                 // Notify device if it is (with the eject constant 0x3)
1743                                 vec![
1744                                     &aml::MethodCall::new(
1745                                         "CTFY".into(),
1746                                         vec![&aml::Local(0), &3u8],
1747                                     ),
1748                                     // Reset CRMV bit (also write-one-to-clear)
1749                                     &aml::Store::new(
1750                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
1751                                         &aml::ONE,
1752                                     ),
1753                                 ],
1754                             ),
1755                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
1756                         ],
1757                     ),
1758                     // Release lock
1759                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1760                 ],
1761             )
1762             .append_aml_bytes(bytes)
1763         } else {
1764             aml::Method::new("CSCN".into(), 0, true, vec![]).append_aml_bytes(bytes)
1765         }
1766     }
1767 }
1768 
1769 impl Aml for CpuManager {
1770     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1771         #[cfg(target_arch = "x86_64")]
1772         if let Some(acpi_address) = self.acpi_address {
1773             // CPU hotplug controller
1774             aml::Device::new(
1775                 "_SB_.PRES".into(),
1776                 vec![
1777                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
1778                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
1779                     // Mutex to protect concurrent access, as we write to select a CPU and then read back its status
1780                     &aml::Mutex::new("CPLK".into(), 0),
1781                     &aml::Name::new(
1782                         "_CRS".into(),
1783                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
1784                             aml::AddressSpaceCachable::NotCacheable,
1785                             true,
1786                             acpi_address.0 as u64,
1787                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
1788                         )]),
1789                     ),
1790                     // OpRegion and Fields map MMIO range into individual field values
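                    // Resulting register map within the 0xc-byte region,
                    // derived from the field definitions below:
                    //   0x0-0x3  CSEL (DWORD) - selected CPU index
                    //   0x4      flags        - bit 0 CPEN, bit 1 CINS,
                    //                           bit 2 CRMV, bit 3 CEJ0
                    //   0x5      CCMD (BYTE)  - command register
                    //   0x8-0xb  CDAT (DWORD) - command data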
1791                     &aml::OpRegion::new(
1792                         "PRST".into(),
1793                         aml::OpRegionSpace::SystemMemory,
1794                         acpi_address.0 as usize,
1795                         CPU_MANAGER_ACPI_SIZE,
1796                     ),
1797                     &aml::Field::new(
1798                         "PRST".into(),
1799                         aml::FieldAccessType::Byte,
1800                         aml::FieldUpdateRule::WriteAsZeroes,
1801                         vec![
1802                             aml::FieldEntry::Reserved(32),
1803                             aml::FieldEntry::Named(*b"CPEN", 1),
1804                             aml::FieldEntry::Named(*b"CINS", 1),
1805                             aml::FieldEntry::Named(*b"CRMV", 1),
1806                             aml::FieldEntry::Named(*b"CEJ0", 1),
1807                             aml::FieldEntry::Reserved(4),
1808                             aml::FieldEntry::Named(*b"CCMD", 8),
1809                         ],
1810                     ),
1811                     &aml::Field::new(
1812                         "PRST".into(),
1813                         aml::FieldAccessType::DWord,
1814                         aml::FieldUpdateRule::Preserve,
1815                         vec![
1816                             aml::FieldEntry::Named(*b"CSEL", 32),
1817                             aml::FieldEntry::Reserved(32),
1818                             aml::FieldEntry::Named(*b"CDAT", 32),
1819                         ],
1820                     ),
1821                 ],
1822             )
1823             .append_aml_bytes(bytes);
1824         }
1825 
1826         // CPU devices
1827         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
1828         let uid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05"));
1829         // Bundle methods together under a common object
1830         let methods = CpuMethods {
1831             max_vcpus: self.config.max_vcpus,
1832             dynamic: self.dynamic,
1833         };
1834         let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &uid, &methods];
1835 
1836         let mut cpu_devices = Vec::new();
1837         for cpu_id in 0..self.config.max_vcpus {
1838             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
1839             let cpu_device = Cpu {
1840                 cpu_id,
1841                 proximity_domain,
1842                 dynamic: self.dynamic,
1843             };
1844 
1845             cpu_devices.push(cpu_device);
1846         }
1847 
1848         for cpu_device in cpu_devices.iter() {
1849             cpu_data_inner.push(cpu_device);
1850         }
1851 
1852         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).append_aml_bytes(bytes)
1853     }
1854 }
1855 
1856 impl Pausable for CpuManager {
1857     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
1858         // Tell the vCPUs to pause themselves next time they exit
1859         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
1860 
1861         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1862         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1863         // above.
1864         for state in self.vcpu_states.iter() {
1865             state.signal_thread();
1866         }
1867 
1868         for vcpu in self.vcpus.iter() {
1869             let mut vcpu = vcpu.lock().unwrap();
1870             vcpu.pause()?;
1871             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
1872             if !self.config.kvm_hyperv {
1873                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
1874                     MigratableError::Pause(anyhow!(
1875                         "Could not notify guest it has been paused {:?}",
1876                         e
1877                     ))
1878                 })?;
1879             }
1880         }
1881 
1882         Ok(())
1883     }
1884 
1885     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
1886         for vcpu in self.vcpus.iter() {
1887             vcpu.lock().unwrap().resume()?;
1888         }
1889 
1890         // Clear the vCPU pause boolean
1891         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1892 
1893         // Unpark all the vCPU threads.
1894         // Once unparked, the first thing they will do is check the pause
1895         // boolean. Since it is now false, they will exit their pause loop
1896         // and resume running the guest.
1897         for state in self.vcpu_states.iter() {
1898             state.unpark_thread();
1899         }
1900         Ok(())
1901     }
1902 }
1903 
1904 impl Snapshottable for CpuManager {
1905     fn id(&self) -> String {
1906         CPU_MANAGER_SNAPSHOT_ID.to_string()
1907     }
1908 
1909     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1910         let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID);
1911 
1912         // The CpuManager snapshot is a collection of all vCPUs snapshots.
1913         for vcpu in &self.vcpus {
1914             let cpu_snapshot = vcpu.lock().unwrap().snapshot()?;
1915             cpu_manager_snapshot.add_snapshot(cpu_snapshot);
1916         }
1917 
1918         Ok(cpu_manager_snapshot)
1919     }
1920 
1921     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
1922         for (cpu_id, snapshot) in snapshot.snapshots.iter() {
1923             info!("Restoring VCPU {}", cpu_id);
1924             self.create_vcpu(cpu_id.parse::<u8>().unwrap(), None, Some(*snapshot.clone()))
1925                 .map_err(|e| MigratableError::Restore(anyhow!("Could not create vCPU {:?}", e)))?;
1926         }
1927 
1928         Ok(())
1929     }
1930 }
1931 
1932 impl Transportable for CpuManager {}
1933 impl Migratable for CpuManager {}
1934 
1935 #[cfg(feature = "gdb")]
1936 impl Debuggable for CpuManager {
1937     #[cfg(feature = "kvm")]
1938     fn set_guest_debug(
1939         &self,
1940         cpu_id: usize,
1941         addrs: &[GuestAddress],
1942         singlestep: bool,
1943     ) -> std::result::Result<(), DebuggableError> {
1944         self.vcpus[cpu_id]
1945             .lock()
1946             .unwrap()
1947             .vcpu
1948             .set_guest_debug(addrs, singlestep)
1949             .map_err(DebuggableError::SetDebug)
1950     }
1951 
1952     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
1953         Ok(())
1954     }
1955 
1956     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
1957         Ok(())
1958     }
1959 
1960     #[cfg(target_arch = "x86_64")]
1961     fn read_regs(&self, cpu_id: usize) -> std::result::Result<X86_64CoreRegs, DebuggableError> {
1962         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
1963         let gregs = self
1964             .get_regs(cpu_id as u8)
1965             .map_err(DebuggableError::ReadRegs)?;
1966         let regs = [
1967             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
1968             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
1969         ];
1970 
1971         // GDB exposes 32-bit eflags instead of 64-bit rflags.
1972         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
1973         let eflags = gregs.rflags as u32;
1974         let rip = gregs.rip;
1975 
1976         // Segment registers: CS, SS, DS, ES, FS, GS
1977         let sregs = self
1978             .get_sregs(cpu_id as u8)
1979             .map_err(DebuggableError::ReadRegs)?;
1980         let segments = X86SegmentRegs {
1981             cs: sregs.cs.selector as u32,
1982             ss: sregs.ss.selector as u32,
1983             ds: sregs.ds.selector as u32,
1984             es: sregs.es.selector as u32,
1985             fs: sregs.fs.selector as u32,
1986             gs: sregs.gs.selector as u32,
1987         };
1988 
1989         // TODO: Add other registers
1990 
1991         Ok(X86_64CoreRegs {
1992             regs,
1993             eflags,
1994             rip,
1995             segments,
1996             ..Default::default()
1997         })
1998     }
1999 
2000     #[cfg(target_arch = "x86_64")]
2001     fn write_regs(
2002         &self,
2003         cpu_id: usize,
2004         regs: &X86_64CoreRegs,
2005     ) -> std::result::Result<(), DebuggableError> {
2006         let orig_gregs = self
2007             .get_regs(cpu_id as u8)
2008             .map_err(DebuggableError::ReadRegs)?;
2009         let gregs = StandardRegisters {
2010             rax: regs.regs[0],
2011             rbx: regs.regs[1],
2012             rcx: regs.regs[2],
2013             rdx: regs.regs[3],
2014             rsi: regs.regs[4],
2015             rdi: regs.regs[5],
2016             rbp: regs.regs[6],
2017             rsp: regs.regs[7],
2018             r8: regs.regs[8],
2019             r9: regs.regs[9],
2020             r10: regs.regs[10],
2021             r11: regs.regs[11],
2022             r12: regs.regs[12],
2023             r13: regs.regs[13],
2024             r14: regs.regs[14],
2025             r15: regs.regs[15],
2026             rip: regs.rip,
2027             // Update the lower 32 bits of rflags.
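            // i.e. keep the upper 32 bits of the original rflags and
            // substitute the 32-bit eflags supplied by GDB.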
2028             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2029         };
2030 
2031         self.set_regs(cpu_id as u8, &gregs)
2032             .map_err(DebuggableError::WriteRegs)?;
2033 
2034         // Segment registers: CS, SS, DS, ES, FS, GS
2035         // Since GDB cares only about the selectors, we call get_sregs() first.
2036         let mut sregs = self
2037             .get_sregs(cpu_id as u8)
2038             .map_err(DebuggableError::ReadRegs)?;
2039         sregs.cs.selector = regs.segments.cs as u16;
2040         sregs.ss.selector = regs.segments.ss as u16;
2041         sregs.ds.selector = regs.segments.ds as u16;
2042         sregs.es.selector = regs.segments.es as u16;
2043         sregs.fs.selector = regs.segments.fs as u16;
2044         sregs.gs.selector = regs.segments.gs as u16;
2045 
2046         self.set_sregs(cpu_id as u8, &sregs)
2047             .map_err(DebuggableError::WriteRegs)?;
2048 
2049         // TODO: Add other registers
2050 
2051         Ok(())
2052     }
2053 
2054     #[cfg(target_arch = "x86_64")]
2055     fn read_mem(
2056         &self,
2057         cpu_id: usize,
2058         vaddr: GuestAddress,
2059         len: usize,
2060     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2061         let mut buf = vec![0; len];
2062         let mut total_read = 0_u64;
2063 
2064         while total_read < len as u64 {
2065             let gaddr = vaddr.0 + total_read;
2066             let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
2067                 Ok(paddr) => paddr,
2068                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2069                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2070             };
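            // Clamp each read at the next page boundary: contiguous GVAs may
            // map to discontiguous GPAs, so the translation must be redone
            // for every page.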
2071             let psize = arch::PAGE_SIZE as u64;
2072             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2073             self.vm_memory
2074                 .memory()
2075                 .read(
2076                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2077                     GuestAddress(paddr),
2078                 )
2079                 .map_err(DebuggableError::ReadMem)?;
2080             total_read += read_len;
2081         }
2082         Ok(buf)
2083     }
2084 
2085     #[cfg(target_arch = "x86_64")]
2086     fn write_mem(
2087         &self,
2088         cpu_id: usize,
2089         vaddr: &GuestAddress,
2090         data: &[u8],
2091     ) -> std::result::Result<(), DebuggableError> {
2092         let mut total_written = 0_u64;
2093 
2094         while total_written < data.len() as u64 {
2095             let gaddr = vaddr.0 + total_written;
2096             let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
2097                 Ok(paddr) => paddr,
2098                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2099                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2100             };
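            // As in read_mem, clamp each write at the next page boundary so
            // the GVA->GPA translation is redone for every page.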
2101             let psize = arch::PAGE_SIZE as u64;
2102             let write_len = std::cmp::min(
2103                 data.len() as u64 - total_written,
2104                 psize - (paddr & (psize - 1)),
2105             );
2106             self.vm_memory
2107                 .memory()
2108                 .write(
2109                     &data[total_written as usize..total_written as usize + write_len as usize],
2110                     GuestAddress(paddr),
2111                 )
2112                 .map_err(DebuggableError::WriteMem)?;
2113             total_written += write_len;
2114         }
2115         Ok(())
2116     }
2117 
2118     fn active_vcpus(&self) -> usize {
2119         self.present_vcpus() as usize
2120     }
2121 }
2122 
2123 #[cfg(feature = "guest_debug")]
2124 impl Elf64Writable for CpuManager {}
2125 
2126 #[cfg(feature = "guest_debug")]
2127 impl CpuElf64Writable for CpuManager {
2128     fn cpu_write_elf64_note(
2129         &mut self,
2130         dump_state: &DumpState,
2131     ) -> std::result::Result<(), GuestDebuggableError> {
2132         let mut coredump_file = dump_state.file.as_ref().unwrap();
2133         for vcpu in &self.vcpus {
2134             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2135             let mut pos: usize = 0;
2136             let mut buf = vec![0; note_size as usize];
2137             let descsz = size_of::<X86_64ElfPrStatus>();
2138             let vcpu_id = vcpu.lock().unwrap().id;
2139 
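            // ELF note layout: an Elf64_Nhdr header, then the owner name
            // ("CORE") padded to a 4-byte boundary, then the descriptor (an
            // X86_64ElfPrStatus), also 4-byte aligned.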
2140             let note = Elf64_Nhdr {
2141                 n_namesz: COREDUMP_NAME_SIZE,
2142                 n_descsz: descsz as u32,
2143                 n_type: NT_PRSTATUS,
2144             };
2145 
2146             let bytes: &[u8] = note.as_slice();
2147             buf.splice(0.., bytes.to_vec());
2148             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2149             buf.resize(pos + 4, 0);
2150             buf.splice(pos.., "CORE".to_string().into_bytes());
2151 
2152             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2153             buf.resize(pos + 32 + 4, 0);
2154             let pid = vcpu_id as u64;
2155             let bytes: &[u8] = pid.as_slice();
2156             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2157 
2158             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2159 
2160             let orig_rax: u64 = 0;
2161             let gregs = self.vcpus[usize::from(vcpu_id)]
2162                 .lock()
2163                 .unwrap()
2164                 .vcpu
2165                 .get_regs()
2166                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2167 
2168             let regs1 = [
2169                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2170                 gregs.r10,
2171             ];
2172             let regs2 = [
2173                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2174             ];
2175 
2176             let sregs = self.vcpus[usize::from(vcpu_id)]
2177                 .lock()
2178                 .unwrap()
2179                 .vcpu
2180                 .get_sregs()
2181                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2182 
2183             debug!(
2184                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2185                 gregs.rip,
2186                 gregs.rsp,
2187                 sregs.gs.base,
2188                 sregs.cs.selector,
2189                 sregs.ss.selector,
2190                 sregs.ds.selector,
2191             );
2192 
2193             let regs = X86_64UserRegs {
2194                 regs1,
2195                 regs2,
2196                 rip: gregs.rip,
2197                 cs: sregs.cs.selector as u64,
2198                 eflags: gregs.rflags,
2199                 rsp: gregs.rsp,
2200                 ss: sregs.ss.selector as u64,
2201                 fs_base: sregs.fs.base as u64,
2202                 gs_base: sregs.gs.base as u64,
2203                 ds: sregs.ds.selector as u64,
2204                 es: sregs.es.selector as u64,
2205                 fs: sregs.fs.selector as u64,
2206                 gs: sregs.gs.selector as u64,
2207             };
2208 
2210             let bytes: &[u8] = regs.as_slice();
2211             buf.resize(note_size as usize, 0);
2212             buf.splice(pos.., bytes.to_vec());
2213             buf.resize(note_size as usize, 0);
2214 
2215             coredump_file
2216                 .write_all(&buf)
2217                 .map_err(GuestDebuggableError::CoredumpFile)?;
2218         }
2219 
2220         Ok(())
2221     }
2222 
2223     fn cpu_write_vmm_note(
2224         &mut self,
2225         dump_state: &DumpState,
2226     ) -> std::result::Result<(), GuestDebuggableError> {
2227         let mut coredump_file = dump_state.file.as_ref().unwrap();
2228         for vcpu in &self.vcpus {
2229             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2230             let mut pos: usize = 0;
2231             let mut buf = vec![0; note_size as usize];
2232             let descsz = size_of::<DumpCpusState>();
2233             let vcpu_id = vcpu.lock().unwrap().id;
2234 
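            // Same ELF note layout as above, but with owner name "QEMU" and
            // note type 0, mirroring the VMM-state note format emitted by
            // QEMU so existing crash tooling can parse it.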
2235             let note = Elf64_Nhdr {
2236                 n_namesz: COREDUMP_NAME_SIZE,
2237                 n_descsz: descsz as u32,
2238                 n_type: 0,
2239             };
2240 
2241             let bytes: &[u8] = note.as_slice();
2242             buf.splice(0.., bytes.to_vec());
2243             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2244 
2245             buf.resize(pos + 4, 0);
2246             buf.splice(pos.., "QEMU".to_string().into_bytes());
2247 
2248             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2249 
2250             let gregs = self.vcpus[usize::from(vcpu_id)]
2251                 .lock()
2252                 .unwrap()
2253                 .vcpu
2254                 .get_regs()
2255                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2256 
2257             let regs1 = [
2258                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2259                 gregs.rbp,
2260             ];
2261 
2262             let regs2 = [
2263                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2264                 gregs.r15,
2265             ];
2266 
2267             let sregs = self.vcpus[usize::from(vcpu_id)]
2268                 .lock()
2269                 .unwrap()
2270                 .vcpu
2271                 .get_sregs()
2272                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2273 
2274             let mut msrs = vec![MsrEntry {
2275                 index: msr_index::MSR_KERNEL_GS_BASE,
2276                 ..Default::default()
2277             }];
2278 
2279             self.vcpus[vcpu_id as usize]
2280                 .lock()
2281                 .unwrap()
2282                 .vcpu
2283                 .get_msrs(&mut msrs)
2284                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2285             let kernel_gs_base = msrs[0].data;
2286 
2287             let cs = CpuSegment::new(sregs.cs);
2288             let ds = CpuSegment::new(sregs.ds);
2289             let es = CpuSegment::new(sregs.es);
2290             let fs = CpuSegment::new(sregs.fs);
2291             let gs = CpuSegment::new(sregs.gs);
2292             let ss = CpuSegment::new(sregs.ss);
2293             let ldt = CpuSegment::new(sregs.ldt);
2294             let tr = CpuSegment::new(sregs.tr);
2295             let gdt = CpuSegment::new_from_table(sregs.gdt);
2296             let idt = CpuSegment::new_from_table(sregs.idt);
2297             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2298             let regs = DumpCpusState {
2299                 version: 1,
2300                 size: size_of::<DumpCpusState>() as u32,
2301                 regs1,
2302                 regs2,
2303                 rip: gregs.rip,
2304                 rflags: gregs.rflags,
2305                 cs,
2306                 ds,
2307                 es,
2308                 fs,
2309                 gs,
2310                 ss,
2311                 ldt,
2312                 tr,
2313                 gdt,
2314                 idt,
2315                 cr,
2316                 kernel_gs_base,
2317             };
2318 
2319             let bytes: &[u8] = regs.as_slice();
2320             buf.resize(note_size as usize, 0);
2321             buf.splice(pos.., bytes.to_vec());
2322             buf.resize(note_size as usize, 0);
2323 
2324             coredump_file
2325                 .write_all(&buf)
2326                 .map_err(GuestDebuggableError::CoredumpFile)?;
2327         }
2328 
2329         Ok(())
2330     }
2331 }
2332 
2333 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2334 #[cfg(test)]
2335 mod tests {
2336     use arch::x86_64::interrupts::*;
2337     use arch::x86_64::regs::*;
2338     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2339 
2340     #[test]
2341     fn test_setlint() {
2342         let hv = hypervisor::new().unwrap();
2343         let vm = hv.create_vm().expect("new VM fd creation failed");
2344         assert!(hv.check_required_extensions().is_ok());
2345         // Calling get_lapic will fail if there is no irqchip beforehand.
2346         assert!(vm.create_irq_chip().is_ok());
2347         let vcpu = vm.create_vcpu(0, None).unwrap();
2348         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2349 
2350         // Compute the value that is expected to represent LVT0 and LVT1.
2351         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2352         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2353         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2354         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2355 
2356         set_lint(&vcpu).unwrap();
2357 
2358         // Compute the value that represents LVT0 and LVT1 after set_lint.
2359         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2360         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2361         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2362         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2363         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2364     }
2365 
2366     #[test]
2367     fn test_setup_fpu() {
2368         let hv = hypervisor::new().unwrap();
2369         let vm = hv.create_vm().expect("new VM fd creation failed");
2370         let vcpu = vm.create_vcpu(0, None).unwrap();
2371         setup_fpu(&vcpu).unwrap();
2372 
2373         let expected_fpu: FpuState = FpuState {
2374             fcw: 0x37f,
2375             mxcsr: 0x1f80,
2376             ..Default::default()
2377         };
2378         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2379         // TODO: auto-generate kvm related structures with PartialEq on.
2380         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2381         // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
2382         // See 'kvm_arch_vcpu_ioctl_set_fpu' in arch/x86/kvm/x86.c.
2383         // The mxcsr will stay 0, so the assert below would fail; it stays commented out
2384         // until we decide whether to remove it entirely.
2385         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2386     }
2387 
2388     #[test]
2389     fn test_setup_msrs() {
2390         use hypervisor::arch::x86::{msr_index, MsrEntry};
2391 
2392         let hv = hypervisor::new().unwrap();
2393         let vm = hv.create_vm().expect("new VM fd creation failed");
2394         let vcpu = vm.create_vcpu(0, None).unwrap();
2395         setup_msrs(&vcpu).unwrap();
2396 
2397         // This test will check against the last MSR entry configured (the tenth one).
2398         // See create_msr_entries for details.
2399         let mut msrs = vec![MsrEntry {
2400             index: msr_index::MSR_IA32_MISC_ENABLE,
2401             ..Default::default()
2402         }];
2403 
2404         // get_msrs returns the number of MSRs that it succeeded in reading. We only want to
2405         // read one in this test case.
2406         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2407         assert_eq!(read_msrs, 1);
2408 
2409         // Official entries that were set up when we did setup_msrs. We need to assert that the
2410         // tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE) has the data we
2411         // expect.
2412         let entry_vec = vcpu.boot_msr_entries();
2413         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2414     }
2415 
2416     #[test]
2417     fn test_setup_regs() {
2418         let hv = hypervisor::new().unwrap();
2419         let vm = hv.create_vm().expect("new VM fd creation failed");
2420         let vcpu = vm.create_vcpu(0, None).unwrap();
2421 
2422         let expected_regs: StandardRegisters = StandardRegisters {
2423             rflags: 0x0000000000000002u64,
2424             rbx: arch::layout::PVH_INFO_START.0,
2425             rip: 1,
2426             ..Default::default()
2427         };
2428 
2429         setup_regs(&vcpu, expected_regs.rip).unwrap();
2430 
2431         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2432         assert_eq!(actual_regs, expected_regs);
2433     }
2434 }
2435 
2436 #[cfg(target_arch = "aarch64")]
2437 #[cfg(test)]
2438 mod tests {
2439     use arch::layout;
2440     use hypervisor::kvm::aarch64::{is_system_register, MPIDR_EL1};
2441     use hypervisor::kvm::kvm_bindings::{
2442         kvm_one_reg, kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2443         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2444     };
2445     use hypervisor::{arm64_core_reg_id, offset__of};
2446     use std::mem;
2447 
2448     #[test]
2449     fn test_setup_regs() {
2450         let hv = hypervisor::new().unwrap();
2451         let vm = hv.create_vm().unwrap();
2452         let vcpu = vm.create_vcpu(0, None).unwrap();
2453 
2454         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2455         // Must fail when vcpu is not initialized yet.
2456         assert!(res.is_err());
2457 
2458         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2459         vm.get_preferred_target(&mut kvi).unwrap();
2460         vcpu.vcpu_init(&kvi).unwrap();
2461 
2462         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2463     }
2464 
2465     #[test]
2466     fn test_read_mpidr() {
2467         let hv = hypervisor::new().unwrap();
2468         let vm = hv.create_vm().unwrap();
2469         let vcpu = vm.create_vcpu(0, None).unwrap();
2470         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2471         vm.get_preferred_target(&mut kvi).unwrap();
2472 
2473         // Must fail when vcpu is not initialized yet.
2474         assert!(vcpu.read_mpidr().is_err());
2475 
2476         vcpu.vcpu_init(&kvi).unwrap();
2477         assert_eq!(vcpu.read_mpidr().unwrap(), 0x80000000);
2478     }
2479 
2480     #[test]
2481     fn test_is_system_register() {
2482         let offset = offset__of!(user_pt_regs, pc);
2483         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2484         assert!(!is_system_register(regid));
2485         let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64;
2486         assert!(is_system_register(regid));
2487     }
2488 
2489     #[test]
2490     fn test_save_restore_core_regs() {
2491         let hv = hypervisor::new().unwrap();
2492         let vm = hv.create_vm().unwrap();
2493         let vcpu = vm.create_vcpu(0, None).unwrap();
2494         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2495         vm.get_preferred_target(&mut kvi).unwrap();
2496 
2497         // Must fail when vcpu is not initialized yet.
2498         let res = vcpu.get_regs();
2499         assert!(res.is_err());
2500         assert_eq!(
2501             format!("{}", res.unwrap_err()),
2502             "Failed to get core register: Exec format error (os error 8)"
2503         );
2504 
2505         let mut state = kvm_regs::default();
2506         let res = vcpu.set_regs(&state);
2507         assert!(res.is_err());
2508         assert_eq!(
2509             format!("{}", res.unwrap_err()),
2510             "Failed to set core register: Exec format error (os error 8)"
2511         );
2512 
2513         vcpu.vcpu_init(&kvi).unwrap();
2514         let res = vcpu.get_regs();
2515         assert!(res.is_ok());
2516         state = res.unwrap();
2517         assert_eq!(state.regs.pstate, 0x3C5);
2518 
2519         assert!(vcpu.set_regs(&state).is_ok());
2520         let off = offset__of!(user_pt_regs, pstate);
2521         let pstate = vcpu
2522             .get_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
2523             .expect("Failed to call kvm get one reg");
2524         assert_eq!(state.regs.pstate, pstate);
2525     }
2526 
2527     #[test]
2528     fn test_save_restore_system_regs() {
2529         let hv = hypervisor::new().unwrap();
2530         let vm = hv.create_vm().unwrap();
2531         let vcpu = vm.create_vcpu(0, None).unwrap();
2532         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2533         vm.get_preferred_target(&mut kvi).unwrap();
2534 
2535         // Must fail when vcpu is not initialized yet.
2536         let mut state: Vec<kvm_one_reg> = Vec::new();
2537         let res = vcpu.get_sys_regs();
2538         assert!(res.is_err());
2539         assert_eq!(
2540             format!("{}", res.as_ref().unwrap_err()),
2541             "Failed to retrieve list of registers: Exec format error (os error 8)"
2542         );
2543 
2544         state.push(kvm_one_reg {
2545             id: MPIDR_EL1,
2546             addr: 0x00,
2547         });
2548         let res = vcpu.set_sys_regs(&state);
2549         assert!(res.is_err());
2550         assert_eq!(
2551             format!("{}", res.unwrap_err()),
2552             "Failed to set system register: Exec format error (os error 8)"
2553         );
2554 
2555         vcpu.vcpu_init(&kvi).unwrap();
2556         let res = vcpu.get_sys_regs();
2557         assert!(res.is_ok());
2558         state = res.unwrap();
2559 
2560         let initial_mpidr: u64 = vcpu.read_mpidr().expect("Failed to read mpidr");
2561         assert!(state.contains(&kvm_one_reg {
2562             id: MPIDR_EL1,
2563             addr: initial_mpidr
2564         }));
2565 
2566         assert!(vcpu.set_sys_regs(&state).is_ok());
2567         let mpidr: u64 = vcpu.read_mpidr().expect("Failed to read mpidr");
2568         assert_eq!(initial_mpidr, mpidr);
2569     }
2570 
2571     #[test]
2572     fn test_get_set_mpstate() {
2573         let hv = hypervisor::new().unwrap();
2574         let vm = hv.create_vm().unwrap();
2575         let vcpu = vm.create_vcpu(0, None).unwrap();
2576         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2577         vm.get_preferred_target(&mut kvi).unwrap();
2578 
2579         let res = vcpu.get_mp_state();
2580         assert!(res.is_ok());
2581         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2582     }
2583 }
2584