// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
#[cfg(feature = "guest_debug")]
use crate::coredump::{
    CpuElf64Writable, CpuSegment, CpuState as DumpCpusState, DumpState, Elf64Writable,
    GuestDebuggableError, NoteDescType, X86_64ElfPrStatus, X86_64UserRegs, COREDUMP_NAME_SIZE,
    NT_PRSTATUS,
};
use crate::device_manager::DeviceManager;
#[cfg(feature = "gdb")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
use acpi_tables::{aml, aml::Aml, sdt::Sdt};
use anyhow::anyhow;
use arch::EntryPoint;
use arch::NumaNodes;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs};
#[cfg(feature = "guest_debug")]
use hypervisor::arch::x86::msr_index;
#[cfg(target_arch = "x86_64")]
use hypervisor::arch::x86::CpuIdEntry;
#[cfg(feature = "guest_debug")]
use hypervisor::arch::x86::MsrEntry;
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
use hypervisor::arch::x86::{SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(feature = "tdx")]
use hypervisor::kvm::{TdxExitDetails, TdxExitStatus};
use hypervisor::{CpuState, HypervisorCpuError, HypervisorType, VmExit, VmOps};
use libc::{c_void, siginfo_t};
#[cfg(feature = "guest_debug")]
use linux_loader::elf::Elf64_Nhdr;
use seccompiler::{apply_filter, SeccompAction};
use std::collections::BTreeMap;
#[cfg(feature = "guest_debug")]
use std::io::Write;
#[cfg(feature = "guest_debug")]
use std::mem::size_of;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use thiserror::Error;
use vm_device::BusDevice;
#[cfg(feature = "guest_debug")]
use vm_memory::ByteValued;
#[cfg(feature = "gdb")]
use vm_memory::{Bytes, GuestAddressSpace};
use vm_memory::{GuestAddress, GuestMemoryAtomic};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};

pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug, Error)]
pub enum Error {
    #[error("Error creating vCPU: {0}")]
    VcpuCreate(#[source] anyhow::Error),

84     #[error("Error running bCPU: {0}")]
    VcpuRun(#[source] anyhow::Error),

    #[error("Error spawning vCPU thread: {0}")]
    VcpuSpawn(#[source] io::Error),

    #[error("Error generating common CPUID: {0}")]
    CommonCpuId(#[source] arch::Error),

    #[error("Error configuring vCPU: {0}")]
    VcpuConfiguration(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Error fetching preferred target: {0}")]
    VcpuArmPreferredTarget(#[source] hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initialising vCPU: {0}")]
    VcpuArmInit(#[source] hypervisor::HypervisorCpuError),

    #[error("Failed to join on vCPU threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Error adding CpuManager to MMIO bus: {0}")]
    BusError(#[source] vm_device::BusError),

    #[error("Requested vCPUs exceed maximum")]
    DesiredVCpuCountExceedsMax,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Error starting vCPU after restore: {0}")]
    StartRestoreVcpu(#[source] anyhow::Error),

    #[error("Unexpected VmExit")]
    UnexpectedVmExit,

125     #[error("Failed to allocate MMIO address for CpuManager")]
126     AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    #[error("Error initializing TDX: {0}")]
    InitializeTdx(#[source] hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    #[error("Error initializing PMU: {0}")]
    InitPmu(#[source] hypervisor::HypervisorCpuError),

    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    #[error("Error during CPU debug: {0}")]
    CpuDebug(#[source] hypervisor::HypervisorCpuError),

    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    #[error("Error translating virtual address: {0}")]
    TranslateVirtualAddress(#[source] hypervisor::HypervisorCpuError),

    #[cfg(all(feature = "amx", target_arch = "x86_64"))]
    #[error("Error setting up AMX: {0}")]
    AmxEnable(#[source] anyhow::Error),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
struct LocalApic {
    pub r#type: u8,
    pub length: u8,
    pub processor_id: u8,
    pub apic_id: u8,
    pub flags: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct ProcessorHierarchyNode {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub flags: u32,
    pub parent: u32,
    pub acpi_processor_id: u32,
    pub num_private_resources: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

#[cfg(feature = "guest_debug")]
macro_rules! round_up {
    ($n:expr,$d:expr) => {
        // Round `$n` up to the next multiple of `$d`.
        (($n + $d - 1) / $d) * $d
    };
}
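
// A minimal sanity check for the rounding macro above. This test module is an
// illustrative addition, not part of the original file.
#[cfg(all(test, feature = "guest_debug"))]
mod round_up_tests {
    #[test]
    fn rounds_up_to_next_multiple() {
        // Values between multiples round up; exact multiples are unchanged.
        assert_eq!(round_up!(9usize, 8usize), 16);
        assert_eq!(round_up!(16usize, 8usize), 16);
        assert_eq!(round_up!(1usize, 4usize), 4);
    }
}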

/// A wrapper around creating and using a kvm-based VCPU.
pub struct Vcpu {
    // The hypervisor abstracted CPU.
    vcpu: Arc<dyn hypervisor::Vcpu>,
    id: u8,
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    saved_state: Option<CpuState>,
}

impl Vcpu {
    /// Constructs a new VCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `vm` - The virtual machine this vcpu will get attached to.
    /// * `vm_ops` - Optional object for exit handling.
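    ///
    /// A usage sketch (illustrative only; assumes an existing `vm` handle and
    /// no custom exit-handling object):
    ///
    /// ```ignore
    /// let vcpu = Vcpu::new(0, &vm, None)?;
    /// ```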
    pub fn new(
        id: u8,
        vm: &Arc<dyn hypervisor::Vm>,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> Result<Self> {
        let vcpu = vm
            .create_vcpu(id, vm_ops)
            .map_err(|e| Error::VcpuCreate(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
        })
    }

    /// Configures a vCPU. Should be called once per vCPU, right after creation.
    ///
    /// # Arguments
    ///
    /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
    /// * `vm_memory` - Guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        kernel_entry_point: Option<EntryPoint>,
        #[cfg(target_arch = "x86_64")] vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        #[cfg(target_arch = "x86_64")] cpuid: Vec<CpuIdEntry>,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point)
                .map_err(Error::VcpuConfiguration)?;
        }
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(
            &self.vcpu,
            self.id,
            kernel_entry_point,
            vm_memory,
            cpuid,
            kvm_hyperv,
        )
        .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64 specific vcpu for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();

        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;
        // We already checked that the capability is supported.
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
        // Non-boot cpus are powered off initially.
        if self.id > 0 {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
        }
        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
    }

    /// Runs the VCPU until it exits, returning the reason.
    ///
    /// Note that the state of the VCPU and associated VM must be set up first for this to do
    /// anything useful.
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }
}

const VCPU_SNAPSHOT_ID: &str = "vcpu";
impl Pausable for Vcpu {}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        VCPU_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let saved_state = self
            .vcpu
            .state()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e)))?;

        let mut vcpu_snapshot = Snapshot::new(&format!("{:03}", self.id));
        vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state(
            VCPU_SNAPSHOT_ID,
            &saved_state,
        )?);

        self.saved_state = Some(saved_state);

        Ok(vcpu_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        let saved_state: CpuState = snapshot.to_state(VCPU_SNAPSHOT_ID)?;

        self.vcpu
            .set_state(&saved_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e)))?;

        self.saved_state = Some(saved_state);

        Ok(())
    }
}

pub struct CpuManager {
    hypervisor_type: HypervisorType,
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    cpuid: Vec<CpuIdEntry>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    #[cfg(feature = "gdb")]
    vm_debug_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vm_ops: Arc<dyn VmOps>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: Option<GuestAddress>,
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
    affinity: BTreeMap<u8, Vec<u8>>,
    dynamic: bool,
}

const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;

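// Register layout of the MMIO window exposed to the guest (the region is
// CPU_MANAGER_ACPI_SIZE bytes wide); a descriptive note added for clarity:
//   offset 0 (CPU_SELECTION_OFFSET): read/write byte selecting which vCPU
//     the status register below refers to;
//   offset 4 (CPU_STATUS_OFFSET): per-vCPU bitfield using the CPU_*_FLAG
//     bits above. Reads report enabled/inserting/removing; writes ack the
//     inserting/removing flags or request an eject.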
impl BusDevice for CpuManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
        data.fill(0);

        match offset {
            CPU_SELECTION_OFFSET => {
                data[0] = self.selected_cpu;
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &self.vcpu_states[usize::from(self.selected_cpu)];
                    if state.active() {
                        data[0] |= 1 << CPU_ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << CPU_INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << CPU_REMOVING_FLAG;
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            CPU_SELECTION_OFFSET => {
                self.selected_cpu = data[0];
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
                        && state.inserting
                    {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
                        && state.removing
                    {
                        state.removing = false;
                    }
                    // Trigger removal of vCPU
                    if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
                        if let Err(e) = self.remove_vcpu(self.selected_cpu) {
                            error!("Error removing vCPU: {:?}", e);
                        }
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#}",
                    offset
                );
            }
        }
        None
    }
}

#[derive(Default)]
struct VcpuState {
    inserting: bool,
    removing: bool,
    handle: Option<thread::JoinHandle<()>>,
    kill: Arc<AtomicBool>,
    vcpu_run_interrupted: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        device_manager: &Arc<Mutex<DeviceManager>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vm_ops: Arc<dyn VmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        let guest_memory = memory_manager.lock().unwrap().guest_memory();
        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);
        let hypervisor_type = hypervisor.hypervisor_type();

        #[cfg(target_arch = "x86_64")]
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
        #[cfg(target_arch = "x86_64")]
        let cpuid = {
            let phys_bits = physical_bits(config.max_phys_bits);
            arch::generate_common_cpuid(
                hypervisor,
                config
                    .topology
                    .clone()
                    .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)),
                sgx_epc_sections,
                phys_bits,
                config.kvm_hyperv,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CommonCpuId)?
        };
        #[cfg(all(feature = "amx", target_arch = "x86_64"))]
        if config.features.amx {
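            // These constants mirror the Linux arch_prctl(2) ABI
            // (arch/x86/include/uapi/asm/prctl.h) and the XSTATE feature bit
            // assigned to AMX tile data.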
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // This is safe as the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                // This is safe as the mask being modified (not marked mutable since it
                // is only modified inside the unsafe block, which is permitted) isn't in use elsewhere.
                let mask: usize = 0;
                let result = unsafe {
                    libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
                };
                if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
                    return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
                }
            }
        }

        let device_manager = device_manager.lock().unwrap();

        let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
            let mut cpu_list = Vec::new();
            for (proximity_domain, numa_node) in numa_nodes.iter() {
                for cpu in numa_node.cpus.iter() {
                    cpu_list.push((*cpu, *proximity_domain))
                }
            }
            cpu_list
        }
        .into_iter()
        .collect();

        let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
            cpu_affinity
                .iter()
                .map(|a| (a.vcpu, a.host_cpus.clone()))
                .collect()
        } else {
            BTreeMap::new()
        };

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        let acpi_address = if dynamic {
            Some(
                device_manager
                    .allocator()
                    .lock()
                    .unwrap()
                    .allocate_platform_mmio_addresses(None, CPU_MANAGER_ACPI_SIZE as u64, None)
                    .ok_or(Error::AllocateMmioAddress)?,
            )
        } else {
            None
        };

        let cpu_manager = Arc::new(Mutex::new(CpuManager {
            hypervisor_type,
            config: config.clone(),
            interrupt_controller: device_manager.interrupt_controller().clone(),
            vm_memory: guest_memory,
            #[cfg(target_arch = "x86_64")]
            cpuid,
            vm,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            exit_evt,
            reset_evt,
            #[cfg(feature = "gdb")]
            vm_debug_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
            seccomp_action,
            vm_ops,
            acpi_address,
            proximity_domain_per_cpu,
            affinity,
            dynamic,
        }));

        if let Some(acpi_address) = acpi_address {
            device_manager
                .mmio_bus()
                .insert(
                    cpu_manager.clone(),
                    acpi_address.0,
                    CPU_MANAGER_ACPI_SIZE as u64,
                )
                .map_err(Error::BusError)?;
        }

        Ok(cpu_manager)
    }

    fn create_vcpu(
        &mut self,
        cpu_id: u8,
        entry_point: Option<EntryPoint>,
        snapshot: Option<Snapshot>,
    ) -> Result<()> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vm_ops.clone()))?;

        if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after being created.
            #[cfg(target_arch = "aarch64")]
            vcpu.init(&self.vm)?;

            vcpu.restore(snapshot).expect("Failed to restore vCPU");
        } else {
            #[cfg(target_arch = "x86_64")]
            vcpu.configure(
                entry_point,
                &self.vm_memory,
                self.cpuid.clone(),
                self.config.kvm_hyperv,
            )
            .expect("Failed to configure vCPU");

            #[cfg(target_arch = "aarch64")]
            vcpu.configure(&self.vm, entry_point)
                .expect("Failed to configure vCPU");
        }

        // Adding vCPU to the CpuManager's vCPU list.
        let vcpu = Arc::new(Mutex::new(vcpu));
        self.vcpus.push(vcpu);

        Ok(())
    }

    /// Only create new vCPUs if there aren't any inactive ones to reuse
    fn create_vcpus(&mut self, desired_vcpus: u8, entry_point: Option<EntryPoint>) -> Result<()> {
        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.config.max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        for cpu_id in self.vcpus.len() as u8..desired_vcpus {
            self.create_vcpu(cpu_id, entry_point, None)?;
        }

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    pub fn init_pmu(&self, irq: u32) -> Result<bool> {
        for cpu in self.vcpus.iter() {
            let cpu = cpu.lock().unwrap();
            // Check if PMU attr is available, if not, log the information.
            if cpu.vcpu.has_pmu_support() {
                cpu.vcpu.init_pmu(irq).map_err(Error::InitPmu)?;
            } else {
                debug!(
                    "PMU attribute is not supported in vCPU{}, skip PMU init!",
                    cpu.id
                );
                return Ok(false);
            }
        }

        Ok(true)
    }

    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_id: u8,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        #[cfg(feature = "gdb")]
        let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
        let panic_exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
            .vcpu_run_interrupted
            .clone();
        let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();

        // Prepare the CPU set the current vCPU is expected to run on.
        let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
            let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
            unsafe { libc::CPU_ZERO(&mut cpuset) };
            for host_cpu in host_cpus {
                unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) };
            }
            cpuset
        });
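
        // For example, an affinity entry mapping vCPU 0 to host CPUs 4 and 5
        // (values illustrative) produces a cpu_set_t with bits 4 and 5 set,
        // which sched_setaffinity() below applies to the "vcpu0" thread.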

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter =
            get_seccomp_filter(&self.seccomp_action, Thread::Vcpu, self.hypervisor_type)
                .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        info!("Starting vCPU: cpu_id = {}", vcpu_id);

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{}", vcpu_id))
                .spawn(move || {
                    // Schedule the thread to run on the expected CPU set
                    if let Some(cpuset) = cpuset.as_ref() {
                        let ret = unsafe {
                            libc::sched_setaffinity(
                                0,
                                std::mem::size_of::<libc::cpu_set_t>(),
                                cpuset as *const libc::cpu_set_t,
                            )
                        };

                        if ret != 0 {
                            error!(
                                "Failed scheduling the vCPU {} on the expected CPU set: {}",
                                vcpu_id,
                                io::Error::last_os_error()
                            );
                            return;
                        }
                    }

                    // Apply seccomp filter for vcpu thread.
                    if !vcpu_seccomp_filter.is_empty() {
                        if let Err(e) =
                            apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                        {
                            error!("Error applying seccomp filter: {:?}", e);
                            return;
                        }
                    }
                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // This empty, async-signal-safe handler lets SIGRTMIN interrupt the vCPU threads.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");
                    // Block until all CPUs are ready.
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unpark the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
                                // completed by returning to KVM_RUN. From the kernel docs:
                                //
                                // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                                // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                                // operations are complete (and guest state is consistent) only after userspace
                                // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
                                // incomplete operations and then check for pending signals.
                                // The pending state of the operation is not preserved in state which is
                                // visible to userspace, thus userspace should ensure that the operation is
                                // completed before performing a live migration.  Userspace can re-enter the
                                // guest with an unmasked signal pending or with the immediate_exit field set
                                // to complete pending operations without allowing any further instructions
                                // to be executed.

                                #[cfg(feature = "kvm")]
                                {
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                    if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                        error!("Unexpected VM exit on \"immediate_exit\" run");
                                        break;
                                    }
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                                }

                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            #[cfg(feature = "tdx")]
                            let mut vcpu = vcpu.lock().unwrap();
                            #[cfg(not(feature = "tdx"))]
                            let vcpu = vcpu.lock().unwrap();
                            // vcpu.run() returns VmExit::Reset on a triple-fault, triggering a VM reset below
                            match vcpu.run() {
                                Ok(run) => match run {
                                    #[cfg(all(target_arch = "x86_64", feature = "kvm"))]
                                    VmExit::Debug => {
                                        info!("VmExit::Debug");
                                        #[cfg(feature = "gdb")]
                                        {
                                            vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                            let raw_tid = get_raw_tid(vcpu_id as usize);
                                            vm_debug_evt.write(raw_tid as u64).unwrap();
                                        }
                                    }
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
                                        {
                                            interrupt_controller
                                                .lock()
                                                .unwrap()
                                                .end_of_interrupt(vector);
                                        }
                                    }
                                    VmExit::Ignore => {}
                                    VmExit::Hyperv => {}
                                    VmExit::Reset => {
                                        info!("VmExit::Reset");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        reset_evt.write(1).unwrap();
                                        break;
                                    }
                                    VmExit::Shutdown => {
                                        info!("VmExit::Shutdown");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                    #[cfg(feature = "tdx")]
                                    VmExit::Tdx => {
                                        if let Some(vcpu) = Arc::get_mut(&mut vcpu.vcpu) {
                                            match vcpu.get_tdx_exit_details() {
                                                Ok(details) => match details {
                                                    TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
                                                    TdxExitDetails::SetupEventNotifyInterrupt => {
                                                        warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
                                                    }
                                                },
                                                Err(e) => error!("Unexpected TDX VMCALL: {}", e),
                                            }
                                            vcpu.set_tdx_status(TdxExitStatus::InvalidOperand);
                                        } else {
                                            // We should never reach this point: getting
                                            // here would mean the design of this code
                                            // is wrong.
                                            unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
                                        }
                                    }
                                    _ => {
                                        error!(
                                            "VCPU generated error: {:?}",
                                            Error::UnexpectedVmExit
                                        );
                                        break;
                                    }
                                },

                                Err(e) => {
                                    error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                    break;
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }
                        }
                    })
                    .or_else(|_| {
                        panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
                        error!("vCPU thread panicked");
                        panic_exit_evt.write(1)
                    })
                    .ok();
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // On hotplug, calls into this function pass an entry_point of None. It is
        // for those hotplugged CPU additions that we need to set the inserting flag.
        self.vcpu_states[usize::from(vcpu_id)].handle = handle;
        self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
    fn activate_vcpus(
        &mut self,
        desired_vcpus: u8,
        inserting: bool,
        paused: Option<bool>,
    ) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));
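        // Note: the "+ 1" sizes the barrier for this thread too; after
        // spawning the vCPU threads it joins the final wait() below so that
        // all vCPUs are released at once.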

        if let Some(paused) = paused {
            self.vcpus_pause_signalled.store(paused, Ordering::SeqCst);
        }

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}, paused = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus(),
            self.vcpus_pause_signalled.load(Ordering::SeqCst)
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for vcpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal, actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
        }
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" so that it can be reused
        state.kill.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(&mut self, entry_point: Option<EntryPoint>) -> Result<()> {
        self.create_vcpus(self.boot_vcpus(), entry_point)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self, paused: bool) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false, Some(paused))
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.vcpus.len() as u8, false, Some(true))
            .map_err(|e| {
                Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
            })?;

        Ok(())
    }

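    /// Resize to the desired number of vCPUs (hotplug or hot-unplug).
    ///
    /// A usage sketch (illustrative; assumes a dynamic CpuManager on a
    /// running VM):
    ///
    /// ```ignore
    /// // Grow to 4 vCPUs; Ok(true) means a resize actually happened.
    /// let resized = cpu_manager.resize(4)?;
    /// ```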
    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus == self.present_vcpus() {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                self.create_vcpus(desired_vcpus, None)?;
                self.activate_vcpus(desired_vcpus, true, None)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..) {
            state.join_thread()?;
        }

        Ok(())
    }

    #[cfg(feature = "tdx")]
    pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
        for vcpu in &self.vcpus {
            vcpu.lock()
                .unwrap()
                .vcpu
                .tdx_init(hob_address)
                .map_err(Error::InitializeTdx)?;
        }
        Ok(())
    }

    pub fn boot_vcpus(&self) -> u8 {
        self.config.boot_vcpus
    }

    pub fn max_vcpus(&self) -> u8 {
        self.config.max_vcpus
    }

    #[cfg(target_arch = "x86_64")]
    pub fn common_cpuid(&self) -> Vec<CpuIdEntry> {
        self.cpuid.clone()
    }

    fn present_vcpus(&self) -> u8 {
        self.vcpu_states
            .iter()
            .fold(0, |acc, state| acc + state.active() as u8)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidrs(&self) -> Vec<u64> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_mpidr())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_states(&self) -> Vec<CpuState> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
        self.config
            .topology
            .clone()
            .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
    }

    pub fn create_madt(&self) -> Sdt {
        use crate::acpi;
        // This is also checked in the commandline parsing.
        assert!(self.config.boot_vcpus <= self.config.max_vcpus);

        let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
        #[cfg(target_arch = "x86_64")]
        {
            madt.write(36, arch::layout::APIC_START);

            for cpu in 0..self.config.max_vcpus {
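                // Boot vCPUs are advertised as Enabled; the rest are left
                // disabled but marked Online Capable so the guest can bring
                // them up on hotplug.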
                let lapic = LocalApic {
                    r#type: acpi::ACPI_APIC_PROCESSOR,
                    length: 8,
                    processor_id: cpu,
                    apic_id: cpu,
                    flags: if cpu < self.config.boot_vcpus {
                        1 << MADT_CPU_ENABLE_FLAG
                    } else {
                        0
                    } | 1 << MADT_CPU_ONLINE_CAPABLE_FLAG,
                };
                madt.append(lapic);
            }

            madt.append(Ioapic {
                r#type: acpi::ACPI_APIC_IO,
                length: 12,
                ioapic_id: 0,
                apic_address: arch::layout::IOAPIC_START.0 as u32,
                gsi_base: 0,
                ..Default::default()
            });

            madt.append(InterruptSourceOverride {
                r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
                length: 10,
                bus: 0,
                source: 4,
                gsi: 4,
                flags: 0,
            });
        }

        #[cfg(target_arch = "aarch64")]
        {
            use vm_memory::Address;
            /* Notes:
             * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
             */

            // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
            for cpu in 0..self.config.boot_vcpus {
                let vcpu = &self.vcpus[cpu as usize];
                let mpidr = vcpu.lock().unwrap().get_mpidr();
                /* ARMv8 MPIDR format:
                     Bits [63:40] Must be zero
                     Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
                     Bits [31:24] Must be zero
                     Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
                     Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
                     Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
                */
                let mpidr_mask = 0xff_00ff_ffff;
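                // E.g. a reported MPIDR_EL1 of 0x8000_0001 (RES1 bit 31 set,
                // Aff0 = 1) masks down to 0x1 for the MADT entry.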
                let gicc = GicC {
                    r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
                    length: 80,
                    reserved0: 0,
                    cpu_interface_number: cpu as u32,
                    uid: cpu as u32,
                    flags: 1,
                    parking_version: 0,
                    performance_interrupt: 0,
                    parked_address: 0,
                    base_address: 0,
                    gicv_base_address: 0,
                    gich_base_address: 0,
                    vgic_interrupt: 0,
                    gicr_base_address: 0,
                    mpidr: mpidr & mpidr_mask,
                    proc_power_effi_class: 0,
                    reserved1: 0,
                    spe_overflow_interrupt: 0,
                };

                madt.append(gicc);
            }

            // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
            let gicd = GicD {
                r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
                length: 24,
                reserved0: 0,
                gic_id: 0,
                base_address: arch::layout::MAPPED_IO_START.raw_value() - 0x0001_0000,
                global_irq_base: 0,
                version: 3,
                reserved1: [0; 3],
            };
            madt.append(gicd);

            // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
            let gicr_size: u32 = 0x0001_0000 * 2 * (self.config.boot_vcpus as u32);
            let gicr_base: u64 =
                arch::layout::MAPPED_IO_START.raw_value() - 0x0001_0000 - gicr_size as u64;
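            // E.g. with 4 boot vCPUs this reserves 4 * 2 * 64 KiB = 0x8_0000
            // bytes of redistributor space directly below the distributor.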
            let gicr = GicR {
                r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
                length: 16,
                reserved: 0,
                base_address: gicr_base,
                range_length: gicr_size,
            };
            madt.append(gicr);

            // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
            let gicits = GicIts {
                r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
                length: 20,
                reserved0: 0,
                translation_id: 0,
                base_address: gicr_base - 2 * 0x0001_0000,
                reserved1: 0,
            };
            madt.append(gicits);

            madt.update_checksum();
        }

        madt
    }

    #[cfg(target_arch = "aarch64")]
    pub fn create_pptt(&self) -> Sdt {
        let pptt_start = 0;
        let mut cpus = 0;
        let mut uid = 0;
        // If topology is not specified, the default setting is:
        // 1 package, multiple cores, 1 thread per core
        // This is also the behavior when PPTT is missing.
        let (threads_per_core, cores_per_package, packages) =
            self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));

        let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);

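        // Processor Hierarchy Node flags used below (ACPI PPTT): bit 0 =
        // physical package, bit 1 = ACPI processor ID valid, bit 2 =
        // processor is a thread, bit 3 = node is a leaf. Hence 0x2 for
        // packages and non-leaf cores, 0xA for leaf cores, 0xE for threads.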
        for cluster_idx in 0..packages {
            if cpus < self.config.boot_vcpus as usize {
                let cluster_offset = pptt.len() - pptt_start;
                let cluster_hierarchy_node = ProcessorHierarchyNode {
                    r#type: 0,
                    length: 20,
                    reserved: 0,
                    flags: 0x2,
                    parent: 0,
                    acpi_processor_id: cluster_idx as u32,
                    num_private_resources: 0,
                };
                pptt.append(cluster_hierarchy_node);

                for core_idx in 0..cores_per_package {
                    let core_offset = pptt.len() - pptt_start;

                    if threads_per_core > 1 {
                        let core_hierarchy_node = ProcessorHierarchyNode {
                            r#type: 0,
                            length: 20,
                            reserved: 0,
                            flags: 0x2,
                            parent: cluster_offset as u32,
                            acpi_processor_id: core_idx as u32,
                            num_private_resources: 0,
                        };
                        pptt.append(core_hierarchy_node);

                        for _thread_idx in 0..threads_per_core {
                            let thread_hierarchy_node = ProcessorHierarchyNode {
                                r#type: 0,
                                length: 20,
                                reserved: 0,
                                flags: 0xE,
                                parent: core_offset as u32,
                                acpi_processor_id: uid as u32,
                                num_private_resources: 0,
                            };
                            pptt.append(thread_hierarchy_node);
                            uid += 1;
                        }
                    } else {
                        let thread_hierarchy_node = ProcessorHierarchyNode {
                            r#type: 0,
                            length: 20,
                            reserved: 0,
                            flags: 0xA,
                            parent: cluster_offset as u32,
                            acpi_processor_id: uid as u32,
                            num_private_resources: 0,
                        };
                        pptt.append(thread_hierarchy_node);
                        uid += 1;
                    }
                }
                cpus += (cores_per_package * threads_per_core) as usize;
            }
        }

        pptt.update_checksum();
        pptt
    }

    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_regs()
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .set_regs(regs)
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .get_sregs()
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
        self.vcpus[usize::from(cpu_id)]
            .lock()
            .unwrap()
            .vcpu
            .set_sregs(sregs)
            .map_err(Error::CpuDebug)
    }

    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1481     fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> {
1482         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1483             .lock()
1484             .unwrap()
1485             .vcpu
1486             .translate_gva(gva, /* flags: unused */ 0)
1487             .map_err(Error::TranslateVirtualAddress)?;
1488         Ok(gpa)
1489     }
1490 }
1491 
1492 struct Cpu {
1493     cpu_id: u8,
1494     proximity_domain: u32,
1495     dynamic: bool,
1496 }
1497 
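     // Bit positions of the flags field in the MADT Local APIC structure:
     // bit 0 = Enabled, bit 1 = Online Capable (ACPI 6.3 and newer).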
1498 #[cfg(target_arch = "x86_64")]
1499 const MADT_CPU_ENABLE_FLAG: usize = 0;
1500 
1501 #[cfg(target_arch = "x86_64")]
1502 const MADT_CPU_ONLINE_CAPABLE_FLAG: usize = 1;
1503 
1504 impl Cpu {
1505     #[cfg(target_arch = "x86_64")]
1506     fn generate_mat(&self) -> Vec<u8> {
1507         let lapic = LocalApic {
1508             r#type: 0,
1509             length: 8,
1510             processor_id: self.cpu_id,
1511             apic_id: self.cpu_id,
1512             flags: 1 << MADT_CPU_ENABLE_FLAG,
1513         };
1514 
1515         let mut mat_data: Vec<u8> = Vec::new();
1516         mat_data.resize(std::mem::size_of_val(&lapic), 0);
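             // SAFETY: mat_data was just resized to size_of_val(&lapic) bytes and the
             // LocalApic table structure is assumed to be packed (alignment 1), so the
             // raw write below stays in bounds and is suitably aligned.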
1517         unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic };
1518 
1519         mat_data
1520     }
1521 }
1522 
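     // A sketch (not authoritative) of the ASL that the dynamic branch below
     // emits for cpu_id == 0 on x86_64:
     //
     //   Device (C000) {
     //       Name (_HID, "ACPI0007")
     //       Name (_UID, Zero)
     //       Method (_STA) { Return (CSTA (Zero)) }
     //       Method (_PXM) { Return (<proximity_domain>) }
     //       Name (_MAT, Buffer () { /* Local APIC structure, enabled bit set */ })
     //       Method (_EJ0, 1) { CEJ0 (Zero) }
     //   }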
1523 impl Aml for Cpu {
1524     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1525         #[cfg(target_arch = "x86_64")]
1526         let mat_data: Vec<u8> = self.generate_mat();
1527         #[allow(clippy::if_same_then_else)]
1528         if self.dynamic {
1529             aml::Device::new(
1530                 format!("C{:03}", self.cpu_id).as_str().into(),
1531                 vec![
1532                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1533                     &aml::Name::new("_UID".into(), &self.cpu_id),
1534                     // Currently, AArch64 does not support the following fields.
1535                     /*
1536                     _STA return value:
1537                     Bit [0] – Set if the device is present.
1538                     Bit [1] – Set if the device is enabled and decoding its resources.
1539                     Bit [2] – Set if the device should be shown in the UI.
1540                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1541                     Bit [4] – Set if the battery is present.
1542                     Bits [31:5] – Reserved (must be cleared).
1543                     */
1544                     #[cfg(target_arch = "x86_64")]
1545                     &aml::Method::new(
1546                         "_STA".into(),
1547                         0,
1548                         false,
1549                         // Call into the CSTA method, which will interrogate the device
1550                         vec![&aml::Return::new(&aml::MethodCall::new(
1551                             "CSTA".into(),
1552                             vec![&self.cpu_id],
1553                         ))],
1554                     ),
1555                     &aml::Method::new(
1556                         "_PXM".into(),
1557                         0,
1558                         false,
1559                         vec![&aml::Return::new(&self.proximity_domain)],
1560                     ),
1561                     // The Linux kernel expects every CPU device to have a _MAT entry
1562                     // containing the LAPIC for this processor with the enabled bit set
1563                     // even if it is disabled in the MADT (non-boot CPU)
1564                     #[cfg(target_arch = "x86_64")]
1565                     &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
1566                     // Trigger CPU ejection
1567                     #[cfg(target_arch = "x86_64")]
1568                     &aml::Method::new(
1569                         "_EJ0".into(),
1570                         1,
1571                         false,
1572                         // Call into the CEJ0 method, which will actually eject the device
1573                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1574                     ),
1575                 ],
1576             )
1577             .append_aml_bytes(bytes);
1578         } else {
1579             aml::Device::new(
1580                 format!("C{:03}", self.cpu_id).as_str().into(),
1581                 vec![
1582                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1583                     &aml::Name::new("_UID".into(), &self.cpu_id),
1584                     #[cfg(target_arch = "x86_64")]
1585                     &aml::Method::new(
1586                         "_STA".into(),
1587                         0,
1588                         false,
1589                         // Mark the CPU as present; see the CSTA implementation
1590                         vec![&aml::Return::new(&0xfu8)],
1591                     ),
1592                     &aml::Method::new(
1593                         "_PXM".into(),
1594                         0,
1595                         false,
1596                         vec![&aml::Return::new(&self.proximity_domain)],
1597                     ),
1598                     // The Linux kernel expects every CPU device to have a _MAT entry
1599                     // containing the LAPIC for this processor with the enabled bit set
1600                     // even if it is disabled in the MADT (non-boot CPU)
1601                     #[cfg(target_arch = "x86_64")]
1602                     &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
1603                 ],
1604             )
1605             .append_aml_bytes(bytes);
1606         }
1607     }
1608 }
1609 
1610 struct CpuNotify {
1611     cpu_id: u8,
1612 }
1613 
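     // Emits a single guarded notification:
     //   If (Arg0 == cpu_id) { Notify (Cxxx, Arg1) }
     // The CTFY method below aggregates one of these blocks per possible vCPU.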
1614 impl Aml for CpuNotify {
1615     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1616         let object = aml::Path::new(&format!("C{:03}", self.cpu_id));
1617         aml::If::new(
1618             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
1619             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1620         )
1621         .append_aml_bytes(bytes)
1622     }
1623 }
1624 
1625 struct CpuMethods {
1626     max_vcpus: u8,
1627     dynamic: bool,
1628 }
1629 
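     // CSTA, CEJ0 and CSCN drive the hotplug controller declared in
     // CpuManager's Aml implementation: the guest selects a CPU by writing its
     // index to CSEL and then reads or writes the per-CPU status bits
     // (CPEN/CINS/CRMV/CEJ0). CSCN is the scan method, typically invoked from
     // the platform event handler when vCPUs are added or removed.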
1630 impl Aml for CpuMethods {
1631     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1632         if self.dynamic {
1633             // CPU status method
1634             aml::Method::new(
1635                 "CSTA".into(),
1636                 1,
1637                 true,
1638                 vec![
1639                     // Take lock defined above
1640                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1641                     // Write CPU number (in first argument) to the MMIO region via the CSEL field
1642                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1643                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1644                     // If the CPEN bit is set, set the local variable to 0xf (see _STA for the meaning)
1645                     &aml::If::new(
1646                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
1647                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
1648                     ),
1649                     // Release lock
1650                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1651                     // Return 0 or 0xf
1652                     &aml::Return::new(&aml::Local(0)),
1653                 ],
1654             )
1655             .append_aml_bytes(bytes);
1656 
1657             let mut cpu_notifies = Vec::new();
1658             for cpu_id in 0..self.max_vcpus {
1659                 cpu_notifies.push(CpuNotify { cpu_id });
1660             }
1661 
1662             let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
1663             for cpu_id in 0..self.max_vcpus {
1664                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
1665             }
1666 
1667             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).append_aml_bytes(bytes);
1668 
1669             aml::Method::new(
1670                 "CEJ0".into(),
1671                 1,
1672                 true,
1673                 vec![
1674                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1675                     // Write CPU number (in first argument) to the MMIO region via the CSEL field
1676                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1677                     // Set CEJ0 bit
1678                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
1679                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1680                 ],
1681             )
1682             .append_aml_bytes(bytes);
1683 
1684             aml::Method::new(
1685                 "CSCN".into(),
1686                 0,
1687                 true,
1688                 vec![
1689                     // Take lock defined above
1690                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1691                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1692                     &aml::While::new(
1693                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
1694                         vec![
1695                             // Write CPU number (from Local0) to the MMIO region via the CSEL field
1696                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
1697                             // Check if CINS bit is set
1698                             &aml::If::new(
1699                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
1700                                 // Notify device if it is
1701                                 vec![
1702                                     &aml::MethodCall::new(
1703                                         "CTFY".into(),
1704                                         vec![&aml::Local(0), &aml::ONE],
1705                                     ),
1706                                     // Reset CINS bit (writing one clears it)
1707                                     &aml::Store::new(
1708                                         &aml::Path::new("\\_SB_.PRES.CINS"),
1709                                         &aml::ONE,
1710                                     ),
1711                                 ],
1712                             ),
1713                             // Check if CRMV bit is set
1714                             &aml::If::new(
1715                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
1716                                 // Notify device if it is (with the eject constant 0x3)
1717                                 vec![
1718                                     &aml::MethodCall::new(
1719                                         "CTFY".into(),
1720                                         vec![&aml::Local(0), &3u8],
1721                                     ),
1722                                     // Reset CRMV bit (writing one clears it)
1723                                     &aml::Store::new(
1724                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
1725                                         &aml::ONE,
1726                                     ),
1727                                 ],
1728                             ),
1729                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
1730                         ],
1731                     ),
1732                     // Release lock
1733                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1734                 ],
1735             )
1736             .append_aml_bytes(bytes)
1737         } else {
1738             aml::Method::new("CSCN".into(), 0, true, vec![]).append_aml_bytes(bytes)
1739         }
1740     }
1741 }
1742 
1743 impl Aml for CpuManager {
1744     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1745         #[cfg(target_arch = "x86_64")]
1746         if let Some(acpi_address) = self.acpi_address {
1747             // CPU hotplug controller
1748             aml::Device::new(
1749                 "_SB_.PRES".into(),
1750                 vec![
1751                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
1752                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
1753                     // Mutex to protect concurrent access, as we write to select a CPU and then read back its status
1754                     &aml::Mutex::new("CPLK".into(), 0),
1755                     &aml::Name::new(
1756                         "_CRS".into(),
1757                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
1758                             aml::AddressSpaceCachable::NotCacheable,
1759                             true,
1760                             acpi_address.0 as u64,
1761                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
1762                         )]),
1763                     ),
1764                     // OpRegion and Fields map MMIO range into individual field values
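                         // Resulting register map of the 12-byte region:
                         //   bytes 0-3:  CSEL (DWord access, selected CPU index)
                         //   byte  4:    bit 0 CPEN, bit 1 CINS, bit 2 CRMV, bit 3 CEJ0
                         //   byte  5:    CCMD
                         //   bytes 8-11: CDAT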
1765                     &aml::OpRegion::new(
1766                         "PRST".into(),
1767                         aml::OpRegionSpace::SystemMemory,
1768                         acpi_address.0 as usize,
1769                         CPU_MANAGER_ACPI_SIZE,
1770                     ),
1771                     &aml::Field::new(
1772                         "PRST".into(),
1773                         aml::FieldAccessType::Byte,
1774                         aml::FieldUpdateRule::WriteAsZeroes,
1775                         vec![
1776                             aml::FieldEntry::Reserved(32),
1777                             aml::FieldEntry::Named(*b"CPEN", 1),
1778                             aml::FieldEntry::Named(*b"CINS", 1),
1779                             aml::FieldEntry::Named(*b"CRMV", 1),
1780                             aml::FieldEntry::Named(*b"CEJ0", 1),
1781                             aml::FieldEntry::Reserved(4),
1782                             aml::FieldEntry::Named(*b"CCMD", 8),
1783                         ],
1784                     ),
1785                     &aml::Field::new(
1786                         "PRST".into(),
1787                         aml::FieldAccessType::DWord,
1788                         aml::FieldUpdateRule::Preserve,
1789                         vec![
1790                             aml::FieldEntry::Named(*b"CSEL", 32),
1791                             aml::FieldEntry::Reserved(32),
1792                             aml::FieldEntry::Named(*b"CDAT", 32),
1793                         ],
1794                     ),
1795                 ],
1796             )
1797             .append_aml_bytes(bytes);
1798         }
1799 
1800         // CPU devices
1801         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
1802         let cid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05"));
1803         // Bundle methods together under a common object
1804         let methods = CpuMethods {
1805             max_vcpus: self.config.max_vcpus,
1806             dynamic: self.dynamic,
1807         };
1808         let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &cid, &methods];
1809 
1810         let mut cpu_devices = Vec::new();
1811         for cpu_id in 0..self.config.max_vcpus {
1812             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
1813             let cpu_device = Cpu {
1814                 cpu_id,
1815                 proximity_domain,
1816                 dynamic: self.dynamic,
1817             };
1818 
1819             cpu_devices.push(cpu_device);
1820         }
1821 
1822         for cpu_device in cpu_devices.iter() {
1823             cpu_data_inner.push(cpu_device);
1824         }
1825 
1826         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).append_aml_bytes(bytes)
1827     }
1828 }
1829 
1830 impl Pausable for CpuManager {
1831     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
1832         // Tell the vCPUs to pause themselves next time they exit
1833         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
1834 
1835         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1836         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1837         // above.
1838         for state in self.vcpu_states.iter() {
1839             state.signal_thread();
1840         }
1841 
1842         for vcpu in self.vcpus.iter() {
1843             let mut vcpu = vcpu.lock().unwrap();
1844             vcpu.pause()?;
1845             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
1846             if !self.config.kvm_hyperv {
1847                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
1848                     MigratableError::Pause(anyhow!(
1849                         "Could not notify guest it has been paused {:?}",
1850                         e
1851                     ))
1852                 })?;
1853             }
1854         }
1855 
1856         Ok(())
1857     }
1858 
1859     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
1860         for vcpu in self.vcpus.iter() {
1861             vcpu.lock().unwrap().resume()?;
1862         }
1863 
1864         // Toggle the vCPUs pause boolean
1865         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1866 
1867         // Unpark all the VCPU threads.
1868         // Once unparked, the next thing they will do is check the pause
1869         // boolean. Since it is now false, they will exit their pause loop
1870         // and re-enter the guest (VMX non-root operation).
1871         for state in self.vcpu_states.iter() {
1872             state.unpark_thread();
1873         }
1874         Ok(())
1875     }
1876 }
1877 
1878 impl Snapshottable for CpuManager {
1879     fn id(&self) -> String {
1880         CPU_MANAGER_SNAPSHOT_ID.to_string()
1881     }
1882 
1883     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1884         let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID);
1885 
1886         // The CpuManager snapshot is a collection of all vCPUs snapshots.
1887         for vcpu in &self.vcpus {
1888             let cpu_snapshot = vcpu.lock().unwrap().snapshot()?;
1889             cpu_manager_snapshot.add_snapshot(cpu_snapshot);
1890         }
1891 
1892         Ok(cpu_manager_snapshot)
1893     }
1894 
1895     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
1896         for (cpu_id, snapshot) in snapshot.snapshots.iter() {
1897             info!("Restoring VCPU {}", cpu_id);
1898             self.create_vcpu(cpu_id.parse::<u8>().unwrap(), None, Some(*snapshot.clone()))
1899                 .map_err(|e| MigratableError::Restore(anyhow!("Could not create vCPU {:?}", e)))?;
1900         }
1901 
1902         Ok(())
1903     }
1904 }
1905 
1906 impl Transportable for CpuManager {}
1907 impl Migratable for CpuManager {}
1908 
1909 #[cfg(feature = "gdb")]
1910 impl Debuggable for CpuManager {
1911     #[cfg(feature = "kvm")]
1912     fn set_guest_debug(
1913         &self,
1914         cpu_id: usize,
1915         addrs: &[GuestAddress],
1916         singlestep: bool,
1917     ) -> std::result::Result<(), DebuggableError> {
1918         self.vcpus[cpu_id]
1919             .lock()
1920             .unwrap()
1921             .vcpu
1922             .set_guest_debug(addrs, singlestep)
1923             .map_err(DebuggableError::SetDebug)
1924     }
1925 
1926     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
1927         Ok(())
1928     }
1929 
1930     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
1931         Ok(())
1932     }
1933 
1934     #[cfg(target_arch = "x86_64")]
1935     fn read_regs(&self, cpu_id: usize) -> std::result::Result<X86_64CoreRegs, DebuggableError> {
1936         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
1937         let gregs = self
1938             .get_regs(cpu_id as u8)
1939             .map_err(DebuggableError::ReadRegs)?;
1940         let regs = [
1941             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
1942             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
1943         ];
1944 
1945         // GDB exposes 32-bit eflags instead of 64-bit rflags.
1946         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
1947         let eflags = gregs.rflags as u32;
1948         let rip = gregs.rip;
1949 
1950         // Segment registers: CS, SS, DS, ES, FS, GS
1951         let sregs = self
1952             .get_sregs(cpu_id as u8)
1953             .map_err(DebuggableError::ReadRegs)?;
1954         let segments = X86SegmentRegs {
1955             cs: sregs.cs.selector as u32,
1956             ss: sregs.ss.selector as u32,
1957             ds: sregs.ds.selector as u32,
1958             es: sregs.es.selector as u32,
1959             fs: sregs.fs.selector as u32,
1960             gs: sregs.gs.selector as u32,
1961         };
1962 
1963         // TODO: Add other registers
1964 
1965         Ok(X86_64CoreRegs {
1966             regs,
1967             eflags,
1968             rip,
1969             segments,
1970             ..Default::default()
1971         })
1972     }
1973 
1974     #[cfg(target_arch = "x86_64")]
1975     fn write_regs(
1976         &self,
1977         cpu_id: usize,
1978         regs: &X86_64CoreRegs,
1979     ) -> std::result::Result<(), DebuggableError> {
1980         let orig_gregs = self
1981             .get_regs(cpu_id as u8)
1982             .map_err(DebuggableError::ReadRegs)?;
1983         let gregs = StandardRegisters {
1984             rax: regs.regs[0],
1985             rbx: regs.regs[1],
1986             rcx: regs.regs[2],
1987             rdx: regs.regs[3],
1988             rsi: regs.regs[4],
1989             rdi: regs.regs[5],
1990             rbp: regs.regs[6],
1991             rsp: regs.regs[7],
1992             r8: regs.regs[8],
1993             r9: regs.regs[9],
1994             r10: regs.regs[10],
1995             r11: regs.regs[11],
1996             r12: regs.regs[12],
1997             r13: regs.regs[13],
1998             r14: regs.regs[14],
1999             r15: regs.regs[15],
2000             rip: regs.rip,
2001             // Update the lower 32 bits of rflags.
2002             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2003         };
2004 
2005         self.set_regs(cpu_id as u8, &gregs)
2006             .map_err(DebuggableError::WriteRegs)?;
2007 
2008         // Segment registers: CS, SS, DS, ES, FS, GS
2009         // Since GDB cares only about the selectors, we call get_sregs() first.
2010         let mut sregs = self
2011             .get_sregs(cpu_id as u8)
2012             .map_err(DebuggableError::ReadRegs)?;
2013         sregs.cs.selector = regs.segments.cs as u16;
2014         sregs.ss.selector = regs.segments.ss as u16;
2015         sregs.ds.selector = regs.segments.ds as u16;
2016         sregs.es.selector = regs.segments.es as u16;
2017         sregs.fs.selector = regs.segments.fs as u16;
2018         sregs.gs.selector = regs.segments.gs as u16;
2019 
2020         self.set_sregs(cpu_id as u8, &sregs)
2021             .map_err(DebuggableError::WriteRegs)?;
2022 
2023         // TODO: Add other registers
2024 
2025         Ok(())
2026     }
2027 
2028     #[cfg(target_arch = "x86_64")]
2029     fn read_mem(
2030         &self,
2031         cpu_id: usize,
2032         vaddr: GuestAddress,
2033         len: usize,
2034     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2035         let mut buf = vec![0; len];
2036         let mut total_read = 0_u64;
2037 
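             // Translate and copy at most one page per iteration: contiguous guest
             // virtual addresses may map to non-contiguous physical pages.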
2038         while total_read < len as u64 {
2039             let gaddr = vaddr.0 + total_read;
2040             let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
2041                 Ok(paddr) => paddr,
2042                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2043                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2044             };
2045             let psize = arch::PAGE_SIZE as u64;
2046             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2047             self.vm_memory
2048                 .memory()
2049                 .read(
2050                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2051                     GuestAddress(paddr),
2052                 )
2053                 .map_err(DebuggableError::ReadMem)?;
2054             total_read += read_len;
2055         }
2056         Ok(buf)
2057     }
2058 
2059     #[cfg(target_arch = "x86_64")]
2060     fn write_mem(
2061         &self,
2062         cpu_id: usize,
2063         vaddr: &GuestAddress,
2064         data: &[u8],
2065     ) -> std::result::Result<(), DebuggableError> {
2066         let mut total_written = 0_u64;
2067 
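             // Same page-at-a-time walk as in read_mem above.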
2068         while total_written < data.len() as u64 {
2069             let gaddr = vaddr.0 + total_written;
2070             let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
2071                 Ok(paddr) => paddr,
2072                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2073                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2074             };
2075             let psize = arch::PAGE_SIZE as u64;
2076             let write_len = std::cmp::min(
2077                 data.len() as u64 - total_written,
2078                 psize - (paddr & (psize - 1)),
2079             );
2080             self.vm_memory
2081                 .memory()
2082                 .write(
2083                     &data[total_written as usize..total_written as usize + write_len as usize],
2084                     GuestAddress(paddr),
2085                 )
2086                 .map_err(DebuggableError::WriteMem)?;
2087             total_written += write_len;
2088         }
2089         Ok(())
2090     }
2091 
2092     fn active_vcpus(&self) -> usize {
2093         self.present_vcpus() as usize
2094     }
2095 }
2096 
2097 #[cfg(feature = "guest_debug")]
2098 impl Elf64Writable for CpuManager {}
2099 
2100 #[cfg(feature = "guest_debug")]
2101 impl CpuElf64Writable for CpuManager {
2102     fn cpu_write_elf64_note(
2103         &mut self,
2104         dump_state: &DumpState,
2105     ) -> std::result::Result<(), GuestDebuggableError> {
2106         let mut coredump_file = dump_state.file.as_ref().unwrap();
2107         for vcpu in &self.vcpus {
2108             let note_size = self.get_note_size(NoteDescType::Elf, 1);
2109             let mut pos: usize = 0;
2110             let mut buf = vec![0; note_size as usize];
2111             let descsz = size_of::<X86_64ElfPrStatus>();
2112             let vcpu_id = vcpu.lock().unwrap().id;
2113 
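                 // ELF note layout: an Elf64_Nhdr, then the 4-byte-padded name
                 // ("CORE"), then the descriptor (an X86_64ElfPrStatus here).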
2114             let note = Elf64_Nhdr {
2115                 n_namesz: COREDUMP_NAME_SIZE,
2116                 n_descsz: descsz as u32,
2117                 n_type: NT_PRSTATUS,
2118             };
2119 
2120             let bytes: &[u8] = note.as_slice();
2121             buf.splice(0.., bytes.to_vec());
2122             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2123             buf.resize(pos + 4, 0);
2124             buf.splice(pos.., "CORE".to_string().into_bytes());
2125 
2126             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2127             buf.resize(pos + 32 + 4, 0);
2128             let pid = vcpu_id as u64;
2129             let bytes: &[u8] = pid.as_slice();
2130             buf.splice(pos + 32.., bytes.to_vec()); /* pr_pid */
2131 
2132             pos += descsz - size_of::<X86_64UserRegs>() - size_of::<u64>();
2133 
2134             let orig_rax: u64 = 0;
2135             let gregs = self.vcpus[usize::from(vcpu_id)]
2136                 .lock()
2137                 .unwrap()
2138                 .vcpu
2139                 .get_regs()
2140                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2141 
2142             let regs1 = [
2143                 gregs.r15, gregs.r14, gregs.r13, gregs.r12, gregs.rbp, gregs.rbx, gregs.r11,
2144                 gregs.r10,
2145             ];
2146             let regs2 = [
2147                 gregs.r9, gregs.r8, gregs.rax, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, orig_rax,
2148             ];
2149 
2150             let sregs = self.vcpus[usize::from(vcpu_id)]
2151                 .lock()
2152                 .unwrap()
2153                 .vcpu
2154                 .get_sregs()
2155                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2156 
2157             debug!(
2158                 "rip 0x{:x} rsp 0x{:x} gs 0x{:x} cs 0x{:x} ss 0x{:x} ds 0x{:x}",
2159                 gregs.rip,
2160                 gregs.rsp,
2161                 sregs.gs.base,
2162                 sregs.cs.selector,
2163                 sregs.ss.selector,
2164                 sregs.ds.selector,
2165             );
2166 
2167             let regs = X86_64UserRegs {
2168                 regs1,
2169                 regs2,
2170                 rip: gregs.rip,
2171                 cs: sregs.cs.selector as u64,
2172                 eflags: gregs.rflags,
2173                 rsp: gregs.rsp,
2174                 ss: sregs.ss.selector as u64,
2175                 fs_base: sregs.fs.base as u64,
2176                 gs_base: sregs.gs.base as u64,
2177                 ds: sregs.ds.selector as u64,
2178                 es: sregs.es.selector as u64,
2179                 fs: sregs.fs.selector as u64,
2180                 gs: sregs.gs.selector as u64,
2181             };
2182 
2184             let bytes: &[u8] = regs.as_slice();
2185             buf.resize(note_size as usize, 0);
2186             buf.splice(pos.., bytes.to_vec());
2187             buf.resize(note_size as usize, 0);
2188 
2189             coredump_file
2190                 .write_all(&buf)
2191                 .map_err(GuestDebuggableError::CoredumpFile)?;
2192         }
2193 
2194         Ok(())
2195     }
2196 
2197     fn cpu_write_vmm_note(
2198         &mut self,
2199         dump_state: &DumpState,
2200     ) -> std::result::Result<(), GuestDebuggableError> {
2201         let mut coredump_file = dump_state.file.as_ref().unwrap();
2202         for vcpu in &self.vcpus {
2203             let note_size = self.get_note_size(NoteDescType::Vmm, 1);
2204             let mut pos: usize = 0;
2205             let mut buf = vec![0; note_size as usize];
2206             let descsz = size_of::<DumpCpusState>();
2207             let vcpu_id = vcpu.lock().unwrap().id;
2208 
2209             let note = Elf64_Nhdr {
2210                 n_namesz: COREDUMP_NAME_SIZE,
2211                 n_descsz: descsz as u32,
2212                 n_type: 0,
2213             };
2214 
2215             let bytes: &[u8] = note.as_slice();
2216             buf.splice(0.., bytes.to_vec());
2217             pos += round_up!(size_of::<Elf64_Nhdr>(), 4);
2218 
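                 // The note is named "QEMU" so that tools which understand QEMU's
                 // guest core dump CPU notes can parse it; DumpCpusState is assumed
                 // to mirror that note's descriptor layout.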
2219             buf.resize(pos + 4, 0);
2220             buf.splice(pos.., "QEMU".to_string().into_bytes());
2221 
2222             pos += round_up!(COREDUMP_NAME_SIZE as usize, 4);
2223 
2224             let gregs = self.vcpus[usize::from(vcpu_id)]
2225                 .lock()
2226                 .unwrap()
2227                 .vcpu
2228                 .get_regs()
2229                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get regs failed")))?;
2230 
2231             let regs1 = [
2232                 gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rsp,
2233                 gregs.rbp,
2234             ];
2235 
2236             let regs2 = [
2237                 gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14,
2238                 gregs.r15,
2239             ];
2240 
2241             let sregs = self.vcpus[usize::from(vcpu_id)]
2242                 .lock()
2243                 .unwrap()
2244                 .vcpu
2245                 .get_sregs()
2246                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get sregs failed")))?;
2247 
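                 // KERNEL_GS_BASE lives in an MSR rather than in the special
                 // registers, so it has to be fetched separately via get_msrs().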
2248             let mut msrs = vec![MsrEntry {
2249                 index: msr_index::MSR_KERNEL_GS_BASE,
2250                 ..Default::default()
2251             }];
2252 
2253             self.vcpus[vcpu_id as usize]
2254                 .lock()
2255                 .unwrap()
2256                 .vcpu
2257                 .get_msrs(&mut msrs)
2258                 .map_err(|_e| GuestDebuggableError::Coredump(anyhow!("get msr failed")))?;
2259             let kernel_gs_base = msrs[0].data;
2260 
2261             let cs = CpuSegment::new(sregs.cs);
2262             let ds = CpuSegment::new(sregs.ds);
2263             let es = CpuSegment::new(sregs.es);
2264             let fs = CpuSegment::new(sregs.fs);
2265             let gs = CpuSegment::new(sregs.gs);
2266             let ss = CpuSegment::new(sregs.ss);
2267             let ldt = CpuSegment::new(sregs.ldt);
2268             let tr = CpuSegment::new(sregs.tr);
2269             let gdt = CpuSegment::new_from_table(sregs.gdt);
2270             let idt = CpuSegment::new_from_table(sregs.idt);
2271             let cr = [sregs.cr0, sregs.cr8, sregs.cr2, sregs.cr3, sregs.cr4];
2272             let regs = DumpCpusState {
2273                 version: 1,
2274                 size: size_of::<DumpCpusState>() as u32,
2275                 regs1,
2276                 regs2,
2277                 rip: gregs.rip,
2278                 rflags: gregs.rflags,
2279                 cs,
2280                 ds,
2281                 es,
2282                 fs,
2283                 gs,
2284                 ss,
2285                 ldt,
2286                 tr,
2287                 gdt,
2288                 idt,
2289                 cr,
2290                 kernel_gs_base,
2291             };
2292 
2293             let bytes: &[u8] = regs.as_slice();
2294             buf.resize(note_size as usize, 0);
2295             buf.splice(pos.., bytes.to_vec());
2296             buf.resize(note_size as usize, 0);
2297 
2298             coredump_file
2299                 .write_all(&buf)
2300                 .map_err(GuestDebuggableError::CoredumpFile)?;
2301         }
2302 
2303         Ok(())
2304     }
2305 }
2306 
2307 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2308 #[cfg(test)]
2309 mod tests {
2310     use arch::x86_64::interrupts::*;
2311     use arch::x86_64::regs::*;
2312     use hypervisor::arch::x86::{FpuState, LapicState, StandardRegisters};
2313 
2314     #[test]
2315     fn test_setlint() {
2316         let hv = hypervisor::new().unwrap();
2317         let vm = hv.create_vm().expect("new VM fd creation failed");
2318         assert!(hv.check_required_extensions().is_ok());
2319         // Calling get_lapic will fail if there is no irqchip created beforehand.
2320         assert!(vm.create_irq_chip().is_ok());
2321         let vcpu = vm.create_vcpu(0, None).unwrap();
2322         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2323 
2324         // Compute the value that is expected to represent LVT0 and LVT1.
2325         let lint0 = klapic_before.get_klapic_reg(APIC_LVT0);
2326         let lint1 = klapic_before.get_klapic_reg(APIC_LVT1);
2327         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2328         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2329 
2330         set_lint(&vcpu).unwrap();
2331 
2332         // Compute the value that represents LVT0 and LVT1 after set_lint.
2333         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2334         let lint0_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT0);
2335         let lint1_mode_actual = klapic_actual.get_klapic_reg(APIC_LVT1);
2336         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2337         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2338     }
2339 
2340     #[test]
2341     fn test_setup_fpu() {
2342         let hv = hypervisor::new().unwrap();
2343         let vm = hv.create_vm().expect("new VM fd creation failed");
2344         let vcpu = vm.create_vcpu(0, None).unwrap();
2345         setup_fpu(&vcpu).unwrap();
2346 
2347         let expected_fpu: FpuState = FpuState {
2348             fcw: 0x37f,
2349             mxcsr: 0x1f80,
2350             ..Default::default()
2351         };
2352         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2353         // TODO: auto-generate kvm related structures with PartialEq on.
2354         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2355         // Setting the mxcsr register from FpuState inside setup_fpu has no effect:
2356         // see 'kvm_arch_vcpu_ioctl_set_fpu' in arch/x86/kvm/x86.c.
2357         // The mxcsr will stay 0 and the assert below would fail, so we need to
2358         // decide whether to remove it altogether.
2359         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2360     }
2361 
2362     #[test]
2363     fn test_setup_msrs() {
2364         use hypervisor::arch::x86::{msr_index, MsrEntry};
2365 
2366         let hv = hypervisor::new().unwrap();
2367         let vm = hv.create_vm().expect("new VM fd creation failed");
2368         let vcpu = vm.create_vcpu(0, None).unwrap();
2369         setup_msrs(&vcpu).unwrap();
2370 
2371         // This test will check against the last MSR entry configured (the tenth one).
2372         // See create_msr_entries for details.
2373         let mut msrs = vec![MsrEntry {
2374             index: msr_index::MSR_IA32_MISC_ENABLE,
2375             ..Default::default()
2376         }];
2377 
2378         // get_msrs returns the number of MSRs it succeeded in reading. We only
2379         // want to read one in this test case.
2380         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2381         assert_eq!(read_msrs, 1);
2382 
2383         // Official entries that were set up when we called setup_msrs. We need to
2384         // assert that the tenth one (i.e. the one with index
2385         // msr_index::MSR_IA32_MISC_ENABLE) has the data we expect.
2386         let entry_vec = vcpu.boot_msr_entries();
2387         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2388     }
2389 
2390     #[test]
2391     fn test_setup_regs() {
2392         let hv = hypervisor::new().unwrap();
2393         let vm = hv.create_vm().expect("new VM fd creation failed");
2394         let vcpu = vm.create_vcpu(0, None).unwrap();
2395 
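             // With the PVH boot protocol, rbx holds the guest-physical address of
             // the start_info structure, which is what setup_regs is expected to set.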
2396         let expected_regs: StandardRegisters = StandardRegisters {
2397             rflags: 0x0000000000000002u64,
2398             rbx: arch::layout::PVH_INFO_START.0,
2399             rip: 1,
2400             ..Default::default()
2401         };
2402 
2403         setup_regs(&vcpu, expected_regs.rip).unwrap();
2404 
2405         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2406         assert_eq!(actual_regs, expected_regs);
2407     }
2408 }
2409 
2410 #[cfg(target_arch = "aarch64")]
2411 #[cfg(test)]
2412 mod tests {
2413     use arch::layout;
2414     use hypervisor::kvm::aarch64::{is_system_register, MPIDR_EL1};
2415     use hypervisor::kvm::kvm_bindings::{
2416         kvm_one_reg, kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2417         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2418     };
2419     use hypervisor::{arm64_core_reg_id, offset__of};
2420     use std::mem;
2421 
2422     #[test]
2423     fn test_setup_regs() {
2424         let hv = hypervisor::new().unwrap();
2425         let vm = hv.create_vm().unwrap();
2426         let vcpu = vm.create_vcpu(0, None).unwrap();
2427 
2428         let res = vcpu.setup_regs(0, 0x0, layout::FDT_START.0);
2429         // Must fail when the vCPU is not yet initialized.
2430         assert!(res.is_err());
2431 
2432         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2433         vm.get_preferred_target(&mut kvi).unwrap();
2434         vcpu.vcpu_init(&kvi).unwrap();
2435 
2436         assert!(vcpu.setup_regs(0, 0x0, layout::FDT_START.0).is_ok());
2437     }
2438 
2439     #[test]
2440     fn test_read_mpidr() {
2441         let hv = hypervisor::new().unwrap();
2442         let vm = hv.create_vm().unwrap();
2443         let vcpu = vm.create_vcpu(0, None).unwrap();
2444         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2445         vm.get_preferred_target(&mut kvi).unwrap();
2446 
2447         // Must fail when the vCPU is not yet initialized.
2448         assert!(vcpu.read_mpidr().is_err());
2449 
2450         vcpu.vcpu_init(&kvi).unwrap();
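             // Bit 31 of MPIDR_EL1 is RES1 on AArch64, so vCPU 0 (Aff0 == 0) reads
             // back as 0x8000_0000.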
2451         assert_eq!(vcpu.read_mpidr().unwrap(), 0x80000000);
2452     }
2453 
2454     #[test]
2455     fn test_is_system_register() {
2456         let offset = offset__of!(user_pt_regs, pc);
2457         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2458         assert!(!is_system_register(regid));
2459         let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64;
2460         assert!(is_system_register(regid));
2461     }
2462 
2463     #[test]
2464     fn test_save_restore_core_regs() {
2465         let hv = hypervisor::new().unwrap();
2466         let vm = hv.create_vm().unwrap();
2467         let vcpu = vm.create_vcpu(0, None).unwrap();
2468         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2469         vm.get_preferred_target(&mut kvi).unwrap();
2470 
2471         // Must fail when the vCPU is not yet initialized.
2472         let res = vcpu.get_regs();
2473         assert!(res.is_err());
2474         assert_eq!(
2475             format!("{}", res.unwrap_err()),
2476             "Failed to get core register: Exec format error (os error 8)"
2477         );
2478 
2479         let mut state = kvm_regs::default();
2480         let res = vcpu.set_regs(&state);
2481         assert!(res.is_err());
2482         assert_eq!(
2483             format!("{}", res.unwrap_err()),
2484             "Failed to set core register: Exec format error (os error 8)"
2485         );
2486 
2487         vcpu.vcpu_init(&kvi).unwrap();
2488         let res = vcpu.get_regs();
2489         assert!(res.is_ok());
2490         state = res.unwrap();
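             // 0x3C5 is PSTATE for EL1h with the D, A, I and F exception masks set,
             // i.e. the reset value KVM applies at vCPU init.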
2491         assert_eq!(state.regs.pstate, 0x3C5);
2492 
2493         assert!(vcpu.set_regs(&state).is_ok());
2494         let off = offset__of!(user_pt_regs, pstate);
2495         let pstate = vcpu
2496             .get_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
2497             .expect("Failed to call kvm get one reg");
2498         assert_eq!(state.regs.pstate, pstate);
2499     }
2500 
2501     #[test]
2502     fn test_save_restore_system_regs() {
2503         let hv = hypervisor::new().unwrap();
2504         let vm = hv.create_vm().unwrap();
2505         let vcpu = vm.create_vcpu(0, None).unwrap();
2506         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2507         vm.get_preferred_target(&mut kvi).unwrap();
2508 
2509         // Must fail when the vCPU is not yet initialized.
2510         let mut state: Vec<kvm_one_reg> = Vec::new();
2511         let res = vcpu.get_sys_regs();
2512         assert!(res.is_err());
2513         assert_eq!(
2514             format!("{}", res.as_ref().unwrap_err()),
2515             "Failed to retrieve list of registers: Exec format error (os error 8)"
2516         );
2517 
2518         state.push(kvm_one_reg {
2519             id: MPIDR_EL1,
2520             addr: 0x00,
2521         });
2522         let res = vcpu.set_sys_regs(&state);
2523         assert!(res.is_err());
2524         assert_eq!(
2525             format!("{}", res.unwrap_err()),
2526             "Failed to set system register: Exec format error (os error 8)"
2527         );
2528 
2529         vcpu.vcpu_init(&kvi).unwrap();
2530         let res = vcpu.get_sys_regs();
2531         assert!(res.is_ok());
2532         state = res.unwrap();
2533 
2534         let initial_mpidr: u64 = vcpu.read_mpidr().expect("Failed to read mpidr");
2535         assert!(state.contains(&kvm_one_reg {
2536             id: MPIDR_EL1,
2537             addr: initial_mpidr
2538         }));
2539 
2540         assert!(vcpu.set_sys_regs(&state).is_ok());
2541         let mpidr: u64 = vcpu.read_mpidr().expect("Failed to read mpidr");
2542         assert_eq!(initial_mpidr, mpidr);
2543     }
2544 
2545     #[test]
2546     fn test_get_set_mpstate() {
2547         let hv = hypervisor::new().unwrap();
2548         let vm = hv.create_vm().unwrap();
2549         let vcpu = vm.create_vcpu(0, None).unwrap();
2550         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2551         vm.get_preferred_target(&mut kvi).unwrap();
2552 
2553         let res = vcpu.get_mp_state();
2554         assert!(res.is_ok());
2555         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2556     }
2557 }
2558