xref: /cloud-hypervisor/vmm/src/cpu.rs (revision 7d7bfb2034001d4cb15df2ddc56d2d350c8da30f)
// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::CpusConfig;
use crate::device_manager::DeviceManager;
#[cfg(feature = "gdb")]
use crate::gdb::{get_raw_tid, Debuggable, DebuggableError};
use crate::memory_manager::MemoryManager;
use crate::seccomp_filters::{get_seccomp_filter, Thread};
#[cfg(target_arch = "x86_64")]
use crate::vm::physical_bits;
use crate::GuestMemoryMmap;
use crate::CPU_MANAGER_SNAPSHOT_ID;
use acpi_tables::{aml, aml::Aml, sdt::Sdt};
use anyhow::anyhow;
use arch::EntryPoint;
use arch::NumaNodes;
use devices::interrupt_controller::InterruptController;
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
use gdbstub_arch::x86::reg::{X86SegmentRegs, X86_64CoreRegs};
#[cfg(target_arch = "aarch64")]
use hypervisor::kvm::kvm_bindings;
#[cfg(all(target_arch = "x86_64", feature = "gdb"))]
use hypervisor::x86_64::{SpecialRegisters, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use hypervisor::CpuId;
use hypervisor::{vm::VmmOps, CpuState, HypervisorCpuError, VmExit};
#[cfg(feature = "tdx")]
use hypervisor::{TdxExitDetails, TdxExitStatus};
use libc::{c_void, siginfo_t};
use seccompiler::{apply_filter, SeccompAction};
use std::collections::BTreeMap;
use std::os::unix::thread::JoinHandleExt;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{cmp, io, result, thread};
use vm_device::BusDevice;
use vm_memory::GuestAddress;
use vm_memory::GuestMemoryAtomic;
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;

#[derive(Debug)]
pub enum Error {
    /// Cannot create the vCPU.
    VcpuCreate(anyhow::Error),

    /// Cannot run the vCPUs.
    VcpuRun(anyhow::Error),

    /// Cannot spawn a new vCPU thread.
    VcpuSpawn(io::Error),

    /// Cannot generate common CPUID
    CommonCpuId(arch::Error),

    /// Error configuring vCPU
    VcpuConfiguration(arch::Error),

    #[cfg(target_arch = "aarch64")]
    /// Error fetching preferred target
    VcpuArmPreferredTarget(hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    /// Error doing vCPU init on Arm.
    VcpuArmInit(hypervisor::HypervisorCpuError),

    /// Failed to join on vCPU threads
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    /// Cannot add legacy device to Bus.
    BusError(vm_device::BusError),

    /// Asking for more vCPUs than we can have
    DesiredVCpuCountExceedsMax,

    /// Cannot create seccomp filter
    CreateSeccompFilter(seccompiler::Error),

    /// Cannot apply seccomp filter
    ApplySeccompFilter(seccompiler::Error),

    /// Error starting vCPU after restore
    StartRestoreVcpu(anyhow::Error),

    /// Error because an unexpected VmExit type was received.
    UnexpectedVmExit,

    /// Failed to allocate MMIO address
    AllocateMmioAddress,

    #[cfg(feature = "tdx")]
    InitializeTdx(hypervisor::HypervisorCpuError),

    #[cfg(target_arch = "aarch64")]
    InitPmu(hypervisor::HypervisorCpuError),

    /// Failed scheduling the thread on the expected CPU set.
    ScheduleCpuSet,

    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    /// Error on debug-related CPU ops.
    CpuDebug(hypervisor::HypervisorCpuError),

    #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
    /// Failed to translate guest virtual address.
    TranslateVirtualAddress(hypervisor::HypervisorCpuError),

    #[cfg(all(feature = "amx", target_arch = "x86_64"))]
    /// Failed to set up AMX.
    AmxEnable(anyhow::Error),
}
pub type Result<T> = result::Result<T, Error>;

#[cfg(target_arch = "x86_64")]
#[allow(dead_code)]
#[repr(packed)]
struct LocalApic {
    pub r#type: u8,
    pub length: u8,
    pub processor_id: u8,
    pub apic_id: u8,
    pub flags: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}

#[cfg(target_arch = "aarch64")]
#[allow(dead_code)]
#[repr(packed)]
struct ProcessorHierarchyNode {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub flags: u32,
    pub parent: u32,
    pub acpi_processor_id: u32,
    pub num_private_resources: u32,
}

#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}

/// A wrapper around creating and using a hypervisor-based vCPU.
pub struct Vcpu {
    // The hypervisor-abstracted vCPU.
    vcpu: Arc<dyn hypervisor::Vcpu>,
    id: u8,
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    saved_state: Option<CpuState>,
}

impl Vcpu {
    /// Constructs a new vCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `vm` - The virtual machine this vCPU will get attached to.
    /// * `vmmops` - Optional object for exit handling.
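    ///
    /// # Example
    ///
    /// A minimal sketch (not compiled as a doc-test), assuming an existing
    /// `vm: Arc<dyn hypervisor::Vm>`:
    ///
    /// ```ignore
    /// // Create the boot vCPU (id 0) without a VmmOps exit handler.
    /// let vcpu = Vcpu::new(0, &vm, None)?;
    /// ```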
    pub fn new(
        id: u8,
        vm: &Arc<dyn hypervisor::Vm>,
        vmmops: Option<Arc<dyn VmmOps>>,
    ) -> Result<Self> {
        let vcpu = vm
            .create_vcpu(id, vmmops)
            .map_err(|e| Error::VcpuCreate(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
        })
    }

    /// Configures a vCPU. Should be called once per vCPU, when it is created.
    ///
    /// # Arguments
    ///
    /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
    /// * `vm_memory` - Guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        kernel_entry_point: Option<EntryPoint>,
        #[cfg(target_arch = "x86_64")] vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        #[cfg(target_arch = "x86_64")] cpuid: CpuId,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point)
                .map_err(Error::VcpuConfiguration)?;
        }
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(
            &self.vcpu,
            self.id,
            kernel_entry_point,
            vm_memory,
            cpuid,
            kvm_hyperv,
        )
        .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64-specific vCPU for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();

        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;
        // We already checked that the capability is supported.
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PMU_V3;
        // Non-boot CPUs are powered off initially.
        if self.id > 0 {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
        }
        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
    }

    /// Runs the vCPU until it exits, returning the exit reason.
    ///
    /// Note that the state of the vCPU and associated VM must be set up first for this to do
    /// anything useful.
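    ///
    /// A minimal sketch of driving this (the real loop lives in `start_vcpu`
    /// below), assuming `vcpu: &Vcpu`:
    ///
    /// ```ignore
    /// match vcpu.run() {
    ///     Ok(VmExit::Reset) => { /* forward to the reset EventFd */ }
    ///     Ok(VmExit::Shutdown) => { /* forward to the exit EventFd */ }
    ///     Ok(_) => { /* e.g. VmExit::Ignore: just re-enter the guest */ }
    ///     Err(e) => { /* report Error::VcpuRun and stop the thread */ }
    /// }
    /// ```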
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }
}

const VCPU_SNAPSHOT_ID: &str = "vcpu";
impl Pausable for Vcpu {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        self.saved_state =
            Some(self.vcpu.state().map_err(|e| {
                MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e))
            })?);

        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        if let Some(vcpu_state) = &self.saved_state {
            self.vcpu.set_state(vcpu_state).map_err(|e| {
                MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e))
            })?;
        }

        Ok(())
    }
}
impl Snapshottable for Vcpu {
    fn id(&self) -> String {
        VCPU_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut vcpu_snapshot = Snapshot::new(&format!("{}", self.id));
        vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state(
            VCPU_SNAPSHOT_ID,
            &self.saved_state,
        )?);

        Ok(vcpu_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        self.saved_state = Some(snapshot.to_state(VCPU_SNAPSHOT_ID)?);
        Ok(())
    }
}
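
// A sketch of the pause/snapshot/restore/resume round-trip the two impls
// above enable (hypothetical driver code, not part of this file):
//
//     vcpu.pause()?;                   // pull state from the hypervisor into saved_state
//     let snapshot = vcpu.snapshot()?; // serialize saved_state into a data section
//     // ... transport the snapshot ...
//     vcpu.restore(snapshot)?;         // repopulate saved_state
//     vcpu.resume()?;                  // push saved_state back to the hypervisor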

pub struct CpuManager {
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    cpuid: CpuId,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    vcpus_kill_signalled: Arc<AtomicBool>,
    vcpus_pause_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    #[cfg(feature = "gdb")]
    vm_debug_evt: EventFd,
    vcpu_states: Vec<VcpuState>,
    selected_cpu: u8,
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vmmops: Arc<dyn VmmOps>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: Option<GuestAddress>,
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
    affinity: BTreeMap<u8, Vec<u8>>,
    dynamic: bool,
}

const CPU_ENABLE_FLAG: usize = 0;
const CPU_INSERTING_FLAG: usize = 1;
const CPU_REMOVING_FLAG: usize = 2;
const CPU_EJECT_FLAG: usize = 3;

const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;
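
// Layout of the CPU hotplug register block the guest's ACPI code talks to,
// within the CPU_MANAGER_ACPI_SIZE (0xc byte) MMIO region, as implemented by
// the BusDevice impl below:
//
//   offset 0 (CPU_SELECTION_OFFSET): read/write the index of the selected vCPU
//   offset 4 (CPU_STATUS_OFFSET):    bit 0 enable, bit 1 inserting,
//                                    bit 2 removing, bit 3 eject; writing 1 to
//                                    bits 1/2 acknowledges the event, writing
//                                    1 to bit 3 ejects the selected vCPU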

impl BusDevice for CpuManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
        data.fill(0);

        match offset {
            CPU_SELECTION_OFFSET => {
                data[0] = self.selected_cpu;
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &self.vcpu_states[usize::from(self.selected_cpu)];
                    if state.active() {
                        data[0] |= 1 << CPU_ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << CPU_INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << CPU_REMOVING_FLAG;
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#x}",
                    offset
                );
            }
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            CPU_SELECTION_OFFSET => {
                self.selected_cpu = data[0];
            }
            CPU_STATUS_OFFSET => {
                if self.selected_cpu < self.max_vcpus() {
                    let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
                        && state.inserting
                    {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG)
                        && state.removing
                    {
                        state.removing = false;
                    }
                    // Trigger removal of vCPU
                    if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
                        if let Err(e) = self.remove_vcpu(self.selected_cpu) {
                            error!("Error removing vCPU: {:?}", e);
                        }
                    }
                } else {
                    warn!("Out of range vCPU id: {}", self.selected_cpu);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing CPU manager device: {:#x}",
                    offset
                );
            }
        }
        None
    }
}

#[derive(Default)]
struct VcpuState {
    inserting: bool,
    removing: bool,
    handle: Option<thread::JoinHandle<()>>,
    kill: Arc<AtomicBool>,
    vcpu_run_interrupted: Arc<AtomicBool>,
}

impl VcpuState {
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}
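
// A sketch of the teardown sequence built from the helpers above (this is
// what remove_vcpu()/shutdown() below actually do): raise the kill flag,
// signal the thread so any blocking KVM_RUN ioctl is interrupted, then join.
//
//     state.kill.store(true, Ordering::SeqCst);
//     state.signal_thread();
//     state.join_thread()?;
//     state.handle = None;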

impl CpuManager {
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        device_manager: &Arc<Mutex<DeviceManager>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vmmops: Arc<dyn VmmOps>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        numa_nodes: &NumaNodes,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        let guest_memory = memory_manager.lock().unwrap().guest_memory();
        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);

        #[cfg(target_arch = "x86_64")]
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
        #[cfg(target_arch = "x86_64")]
        let cpuid = {
            let phys_bits = physical_bits(config.max_phys_bits);
            arch::generate_common_cpuid(
                hypervisor,
                config
                    .topology
                    .clone()
                    .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)),
                sgx_epc_sections,
                phys_bits,
                config.kvm_hyperv,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CommonCpuId)?
        };
        #[cfg(all(feature = "amx", target_arch = "x86_64"))]
        if config.features.amx {
            const ARCH_GET_XCOMP_GUEST_PERM: usize = 0x1024;
            const ARCH_REQ_XCOMP_GUEST_PERM: usize = 0x1025;
            const XFEATURE_XTILEDATA: usize = 18;
            const XFEATURE_XTILEDATA_MASK: usize = 1 << XFEATURE_XTILEDATA;

            // This is safe as the syscall is only modifying kernel internal
            // data structures that the kernel is itself expected to safeguard.
            let amx_tile = unsafe {
                libc::syscall(
                    libc::SYS_arch_prctl,
                    ARCH_REQ_XCOMP_GUEST_PERM,
                    XFEATURE_XTILEDATA,
                )
            };

            if amx_tile != 0 {
                return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
            } else {
                // This is safe as the mask is only written through the raw
                // pointer handed to the kernel, and isn't in use elsewhere.
                let mask: usize = 0;
                let result = unsafe {
                    libc::syscall(libc::SYS_arch_prctl, ARCH_GET_XCOMP_GUEST_PERM, &mask)
                };
                if result != 0 || (mask & XFEATURE_XTILEDATA_MASK) != XFEATURE_XTILEDATA_MASK {
                    return Err(Error::AmxEnable(anyhow!("Guest AMX usage not supported")));
                }
            }
        }

        let device_manager = device_manager.lock().unwrap();

        let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
            let mut cpu_list = Vec::new();
            for (proximity_domain, numa_node) in numa_nodes.iter() {
                for cpu in numa_node.cpus.iter() {
                    cpu_list.push((*cpu, *proximity_domain))
                }
            }
            cpu_list
        }
        .into_iter()
        .collect();

        let affinity = if let Some(cpu_affinity) = config.affinity.as_ref() {
            cpu_affinity
                .iter()
                .map(|a| (a.vcpu, a.host_cpus.clone()))
                .collect()
        } else {
            BTreeMap::new()
        };

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        let acpi_address = if dynamic {
            Some(
                device_manager
                    .allocator()
                    .lock()
                    .unwrap()
                    .allocate_platform_mmio_addresses(None, CPU_MANAGER_ACPI_SIZE as u64, None)
                    .ok_or(Error::AllocateMmioAddress)?,
            )
        } else {
            None
        };

        let cpu_manager = Arc::new(Mutex::new(CpuManager {
            config: config.clone(),
            interrupt_controller: device_manager.interrupt_controller().clone(),
            vm_memory: guest_memory,
            #[cfg(target_arch = "x86_64")]
            cpuid,
            vm,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            exit_evt,
            reset_evt,
            #[cfg(feature = "gdb")]
            vm_debug_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
            seccomp_action,
            vmmops,
            acpi_address,
            proximity_domain_per_cpu,
            affinity,
            dynamic,
        }));

        if let Some(acpi_address) = acpi_address {
            device_manager
                .mmio_bus()
                .insert(
                    cpu_manager.clone(),
                    acpi_address.0,
                    CPU_MANAGER_ACPI_SIZE as u64,
                )
                .map_err(Error::BusError)?;
        }

        Ok(cpu_manager)
    }

    fn create_vcpu(
        &mut self,
        cpu_id: u8,
        entry_point: Option<EntryPoint>,
        snapshot: Option<Snapshot>,
    ) -> Result<()> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        let mut vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vmmops.clone()))?;

        if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after creation.
            #[cfg(target_arch = "aarch64")]
            vcpu.init(&self.vm)?;

            vcpu.restore(snapshot).expect("Failed to restore vCPU");
        } else {
            #[cfg(target_arch = "x86_64")]
            vcpu.configure(
                entry_point,
                &self.vm_memory,
                self.cpuid.clone(),
                self.config.kvm_hyperv,
            )
            .expect("Failed to configure vCPU");

            #[cfg(target_arch = "aarch64")]
            vcpu.configure(&self.vm, entry_point)
                .expect("Failed to configure vCPU");
        }

        // Adding vCPU to the CpuManager's vCPU list.
        let vcpu = Arc::new(Mutex::new(vcpu));
        self.vcpus.push(vcpu);

        Ok(())
    }

    /// Only create new vCPUs if there aren't any inactive ones to reuse
    fn create_vcpus(&mut self, desired_vcpus: u8, entry_point: Option<EntryPoint>) -> Result<()> {
        info!(
            "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.config.max_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Only create vCPUs in excess of all the allocated vCPUs.
        for cpu_id in self.vcpus.len() as u8..desired_vcpus {
            self.create_vcpu(cpu_id, entry_point, None)?;
        }

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    pub fn init_pmu(&self, irq: u32) -> Result<bool> {
        let cpu_attr = kvm_bindings::kvm_device_attr {
            group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
            attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
            addr: 0x0,
            flags: 0,
        };

        for cpu in self.vcpus.iter() {
            let tmp = irq;
            let cpu_attr_irq = kvm_bindings::kvm_device_attr {
                group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
                attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
                addr: &tmp as *const u32 as u64,
                flags: 0,
            };

            // Check if the PMU attribute is available; if not, log it and skip PMU init.
            if cpu.lock().unwrap().vcpu.has_vcpu_attr(&cpu_attr).is_ok() {
                // Set irq for PMU
                cpu.lock()
                    .unwrap()
                    .vcpu
                    .set_vcpu_attr(&cpu_attr_irq)
                    .map_err(Error::InitPmu)?;

                // Init PMU
                cpu.lock()
                    .unwrap()
                    .vcpu
                    .set_vcpu_attr(&cpu_attr)
                    .map_err(Error::InitPmu)?;
            } else {
                debug!(
                    "PMU attribute is not supported in vCPU{}, skip PMU init!",
                    cpu.lock().unwrap().id
                );
                return Ok(false);
            }
        }

        Ok(true)
    }

    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_id: u8,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        #[cfg(feature = "gdb")]
        let vm_debug_evt = self.vm_debug_evt.try_clone().unwrap();
        let panic_exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(vcpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(vcpu_id)]
            .vcpu_run_interrupted
            .clone();
        let panic_vcpu_run_interrupted = vcpu_run_interrupted.clone();

        // Prepare the CPU set the current vCPU is expected to run onto.
        let cpuset = self.affinity.get(&vcpu_id).map(|host_cpus| {
            let mut cpuset: libc::cpu_set_t = unsafe { std::mem::zeroed() };
            unsafe { libc::CPU_ZERO(&mut cpuset) };
            for host_cpu in host_cpus {
                unsafe { libc::CPU_SET(*host_cpu as usize, &mut cpuset) };
            }
            cpuset
        });

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter = get_seccomp_filter(&self.seccomp_action, Thread::Vcpu)
            .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        info!("Starting vCPU: cpu_id = {}", vcpu_id);

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{}", vcpu_id))
                .spawn(move || {
                    // Schedule the thread to run on the expected CPU set
                    if let Some(cpuset) = cpuset.as_ref() {
                        let ret = unsafe {
                            libc::sched_setaffinity(
                                0,
                                std::mem::size_of::<libc::cpu_set_t>(),
                                cpuset as *const libc::cpu_set_t,
                            )
                        };

                        if ret != 0 {
                            error!(
                                "Failed scheduling the vCPU {} on the expected CPU set: {}",
                                vcpu_id,
                                io::Error::last_os_error()
                            );
                            return;
                        }
                    }

                    // Apply seccomp filter for vcpu thread.
                    if !vcpu_seccomp_filter.is_empty() {
                        if let Err(e) =
                            apply_filter(&vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                        {
                            error!("Error applying seccomp filter: {:?}", e);
                            return;
                        }
                    }
                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // This uses an async-signal-safe handler to kick the vCPU threads out of KVM_RUN.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");
                    // Block until all CPUs are ready.
                    vcpu_thread_barrier.wait();

                    std::panic::catch_unwind(move || {
                        loop {
                            // If we are being told to pause, we park the thread
                            // until the pause boolean is toggled.
                            // The resume operation is responsible for toggling
                            // the boolean and unparking the thread.
                            // We enter a loop because park() could spuriously
                            // return. We will then park() again unless the
                            // pause boolean has been toggled.

                            // Need to use Ordering::SeqCst as we have multiple
                            // loads and stores to different atomics and we need
                            // to see them in a consistent order in all threads

                            if vcpu_pause_signalled.load(Ordering::SeqCst) {
                                // As a pause can be caused by PIO & MMIO exits then we need to ensure they are
                                // completed by returning to KVM_RUN. From the kernel docs:
                                //
                                // For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_PAPR, KVM_EXIT_XEN,
                                // KVM_EXIT_EPR, KVM_EXIT_X86_RDMSR and KVM_EXIT_X86_WRMSR the corresponding
                                // operations are complete (and guest state is consistent) only after userspace
                                // has re-entered the kernel with KVM_RUN.  The kernel side will first finish
                                // incomplete operations and then check for pending signals.
                                // The pending state of the operation is not preserved in state which is
                                // visible to userspace, thus userspace should ensure that the operation is
                                // completed before performing a live migration.  Userspace can re-enter the
                                // guest with an unmasked signal pending or with the immediate_exit field set
                                // to complete pending operations without allowing any further instructions
                                // to be executed.

                                #[cfg(feature = "kvm")]
                                {
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(true);
                                    if !matches!(vcpu.lock().unwrap().run(), Ok(VmExit::Ignore)) {
                                        error!("Unexpected VM exit on \"immediate_exit\" run");
                                        break;
                                    }
                                    vcpu.lock().as_ref().unwrap().vcpu.set_immediate_exit(false);
                                }

                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                    thread::park();
                                }
                                vcpu_run_interrupted.store(false, Ordering::SeqCst);
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }

                            #[cfg(feature = "tdx")]
                            let mut vcpu = vcpu.lock().unwrap();
                            #[cfg(not(feature = "tdx"))]
                            let vcpu = vcpu.lock().unwrap();
                            // A triple fault surfaces as VmExit::Reset, which triggers a VM reset below
                            match vcpu.run() {
                                Ok(run) => match run {
                                    #[cfg(all(target_arch = "x86_64", feature = "kvm"))]
                                    VmExit::Debug => {
                                        info!("VmExit::Debug");
                                        #[cfg(feature = "gdb")]
                                        {
                                            vcpu_pause_signalled.store(true, Ordering::SeqCst);
                                            let raw_tid = get_raw_tid(vcpu_id as usize);
                                            vm_debug_evt.write(raw_tid as u64).unwrap();
                                        }
                                    }
                                    #[cfg(target_arch = "x86_64")]
                                    VmExit::IoapicEoi(vector) => {
                                        if let Some(interrupt_controller) =
                                            &interrupt_controller_clone
                                        {
                                            interrupt_controller
                                                .lock()
                                                .unwrap()
                                                .end_of_interrupt(vector);
                                        }
                                    }
                                    VmExit::Ignore => {}
                                    VmExit::Hyperv => {}
                                    VmExit::Reset => {
                                        info!("VmExit::Reset");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        reset_evt.write(1).unwrap();
                                        break;
                                    }
                                    VmExit::Shutdown => {
                                        info!("VmExit::Shutdown");
                                        vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                        exit_evt.write(1).unwrap();
                                        break;
                                    }
                                    #[cfg(feature = "tdx")]
                                    VmExit::Tdx => {
                                        if let Some(vcpu_fd) = Arc::get_mut(&mut vcpu.vcpu) {
                                            match vcpu_fd.get_tdx_exit_details() {
                                                Ok(details) => match details {
                                                    TdxExitDetails::GetQuote => warn!("TDG_VP_VMCALL_GET_QUOTE not supported"),
                                                    TdxExitDetails::SetupEventNotifyInterrupt => {
                                                        warn!("TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT not supported")
                                                    }
                                                },
                                                Err(e) => error!("Unexpected TDX VMCALL: {}", e),
                                            }
                                            vcpu_fd.set_tdx_status(TdxExitStatus::InvalidOperand);
                                        } else {
                                            // We should never reach this code as
                                            // this means the design from the code
                                            // is wrong.
                                            unreachable!("Couldn't get a mutable reference from Arc<dyn Vcpu> as there are multiple instances");
                                        }
                                    }
                                    _ => {
                                        error!(
                                            "VCPU generated error: {:?}",
                                            Error::UnexpectedVmExit
                                        );
                                        break;
                                    }
                                },

                                Err(e) => {
                                    error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                    break;
                                }
                            }

                            // We've been told to terminate
                            if vcpu_kill_signalled.load(Ordering::SeqCst)
                                || vcpu_kill.load(Ordering::SeqCst)
                            {
                                vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                break;
                            }
                        }
                    })
                    .or_else(|_| {
                        panic_vcpu_run_interrupted.store(true, Ordering::SeqCst);
                        error!("vCPU thread panicked");
                        panic_exit_evt.write(1)
                    })
                    .ok();
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // On hotplug calls into this function, entry_point is None upstream. It is
        // for those hotplug CPU additions that we need to set the inserting flag.
        self.vcpu_states[usize::from(vcpu_id)].handle = handle;
        self.vcpu_states[usize::from(vcpu_id)].inserting = inserting;

        Ok(())
    }

    /// Start up as many vCPU threads as needed to reach `desired_vcpus`
    fn activate_vcpus(&mut self, desired_vcpus: u8, inserting: bool) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for vcpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
        // Mark vCPUs for removal; actual removal happens on ejection
        for cpu_id in desired_vcpus..self.present_vcpus() {
            self.vcpu_states[usize::from(cpu_id)].removing = true;
        }
    }

    fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
        info!("Removing vCPU: cpu_id = {}", cpu_id);
        let state = &mut self.vcpu_states[usize::from(cpu_id)];
        state.kill.store(true, Ordering::SeqCst);
        state.signal_thread();
        state.join_thread()?;
        state.handle = None;

        // Once the thread has exited, clear the "kill" flag so that it can be reused
        state.kill.store(false, Ordering::SeqCst);

        Ok(())
    }

    pub fn create_boot_vcpus(&mut self, entry_point: Option<EntryPoint>) -> Result<()> {
        self.create_vcpus(self.boot_vcpus(), entry_point)
    }

    // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
    pub fn start_boot_vcpus(&mut self) -> Result<()> {
        self.activate_vcpus(self.boot_vcpus(), false)
    }

    pub fn start_restored_vcpus(&mut self) -> Result<()> {
        let vcpu_numbers = self.vcpus.len() as u8;
        let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_numbers + 1) as usize));
        // Restore the vCPUs in "paused" state.
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        for vcpu_id in 0..vcpu_numbers {
            let vcpu = Arc::clone(&self.vcpus[vcpu_id as usize]);

            self.start_vcpu(vcpu, vcpu_id, vcpu_thread_barrier.clone(), false)
                .map_err(|e| {
                    Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
                })?;
        }
        // Unblock all restored CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }

    pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
        if desired_vcpus == self.present_vcpus() {
            return Ok(false);
        }

        if !self.dynamic {
            return Ok(false);
        }

        match desired_vcpus.cmp(&self.present_vcpus()) {
            cmp::Ordering::Greater => {
                self.create_vcpus(desired_vcpus, None)?;
                self.activate_vcpus(desired_vcpus, true)?;
                Ok(true)
            }
            cmp::Ordering::Less => {
                self.mark_vcpus_for_removal(desired_vcpus);
                Ok(true)
            }
            _ => Ok(false),
        }
    }
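
    // A hedged usage sketch for resize() (hypothetical caller code living in
    // the VM layer, not part of this file): growing creates and activates vCPU
    // threads immediately, while shrinking only marks vCPUs as removing and
    // relies on the guest's ACPI eject write to complete the removal.
    //
    //     if cpu_manager.lock().unwrap().resize(desired_vcpus)? {
    //         // e.g. notify the guest so it re-evaluates the CPU ACPI devices
    //     }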

    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..) {
            state.join_thread()?;
        }

        Ok(())
    }

    #[cfg(feature = "tdx")]
    pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
        for vcpu in &self.vcpus {
            vcpu.lock()
                .unwrap()
                .vcpu
                .tdx_init(hob_address)
                .map_err(Error::InitializeTdx)?;
        }
        Ok(())
    }

    pub fn boot_vcpus(&self) -> u8 {
        self.config.boot_vcpus
    }

    pub fn max_vcpus(&self) -> u8 {
        self.config.max_vcpus
    }

    #[cfg(target_arch = "x86_64")]
    pub fn common_cpuid(&self) -> CpuId {
        self.cpuid.clone()
    }

    fn present_vcpus(&self) -> u8 {
        self.vcpu_states
            .iter()
            .fold(0, |acc, state| acc + state.active() as u8)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidrs(&self) -> Vec<u64> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_mpidr())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_states(&self) -> Vec<CpuState> {
        self.vcpus
            .iter()
            .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
            .collect()
    }

    #[cfg(target_arch = "aarch64")]
    pub fn get_vcpu_topology(&self) -> Option<(u8, u8, u8)> {
        self.config
            .topology
            .clone()
            .map(|t| (t.threads_per_core, t.cores_per_die, t.packages))
    }

    pub fn create_madt(&self) -> Sdt {
        use crate::acpi;
        // This is also checked in the commandline parsing.
        assert!(self.config.boot_vcpus <= self.config.max_vcpus);

        let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
        #[cfg(target_arch = "x86_64")]
        {
            madt.write(36, arch::layout::APIC_START);

            for cpu in 0..self.config.max_vcpus {
                let lapic = LocalApic {
                    r#type: acpi::ACPI_APIC_PROCESSOR,
                    length: 8,
                    processor_id: cpu,
                    apic_id: cpu,
                    flags: if cpu < self.config.boot_vcpus {
                        1 << MADT_CPU_ENABLE_FLAG
                    } else {
                        0
                    },
                };
                madt.append(lapic);
            }

            madt.append(Ioapic {
                r#type: acpi::ACPI_APIC_IO,
                length: 12,
                ioapic_id: 0,
                apic_address: arch::layout::IOAPIC_START.0 as u32,
                gsi_base: 0,
                ..Default::default()
            });

            madt.append(InterruptSourceOverride {
                r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
                length: 10,
                bus: 0,
                source: 4,
                gsi: 4,
                flags: 0,
            });
        }

        #[cfg(target_arch = "aarch64")]
        {
            use vm_memory::Address;
            /* Notes:
             * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
             */

            // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
            for cpu in 0..self.config.boot_vcpus {
                let vcpu = &self.vcpus[cpu as usize];
                let mpidr = vcpu.lock().unwrap().get_mpidr();
                /* ARMv8 MPIDR format:
                     Bits [63:40] Must be zero
                     Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
                     Bits [31:24] Must be zero
                     Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
                     Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
                     Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
                */
                let mpidr_mask = 0xff_00ff_ffff;
                let gicc = GicC {
                    r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
                    length: 80,
                    reserved0: 0,
                    cpu_interface_number: cpu as u32,
                    uid: cpu as u32,
                    flags: 1,
                    parking_version: 0,
                    performance_interrupt: 0,
                    parked_address: 0,
                    base_address: 0,
                    gicv_base_address: 0,
                    gich_base_address: 0,
                    vgic_interrupt: 0,
                    gicr_base_address: 0,
                    mpidr: mpidr & mpidr_mask,
                    proc_power_effi_class: 0,
                    reserved1: 0,
                    spe_overflow_interrupt: 0,
                };

                madt.append(gicc);
            }

            // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
            let gicd = GicD {
                r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
                length: 24,
                reserved0: 0,
                gic_id: 0,
                base_address: arch::layout::MAPPED_IO_START.raw_value() - 0x0001_0000,
                global_irq_base: 0,
                version: 3,
                reserved1: [0; 3],
            };
            madt.append(gicd);

            // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
            let gicr_size: u32 = 0x0001_0000 * 2 * (self.config.boot_vcpus as u32);
            let gicr_base: u64 =
                arch::layout::MAPPED_IO_START.raw_value() - 0x0001_0000 - gicr_size as u64;
            let gicr = GicR {
                r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
                length: 16,
                reserved: 0,
                base_address: gicr_base,
                range_length: gicr_size,
            };
            madt.append(gicr);

            // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
            let gicits = GicIts {
                r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
                length: 20,
                reserved0: 0,
                translation_id: 0,
                base_address: gicr_base - 2 * 0x0001_0000,
                reserved1: 0,
            };
            madt.append(gicits);

            madt.update_checksum();
        }

        madt
    }

    #[cfg(target_arch = "aarch64")]
    pub fn create_pptt(&self) -> Sdt {
        let pptt_start = 0;
        let mut cpus = 0;
        let mut uid = 0;
        // If topology is not specified, the default setting is:
        // 1 package, multiple cores, 1 thread per core.
        // This is also the behavior when PPTT is missing.
        let (threads_per_core, cores_per_package, packages) =
            self.get_vcpu_topology().unwrap_or((1, self.max_vcpus(), 1));

        let mut pptt = Sdt::new(*b"PPTT", 36, 2, *b"CLOUDH", *b"CHPPTT  ", 1);

        for cluster_idx in 0..packages {
            if cpus < self.config.boot_vcpus as usize {
                let cluster_offset = pptt.len() - pptt_start;
                let cluster_hierarchy_node = ProcessorHierarchyNode {
                    r#type: 0,
                    length: 20,
                    reserved: 0,
                    flags: 0x2,
                    parent: 0,
                    acpi_processor_id: cluster_idx as u32,
                    num_private_resources: 0,
                };
                pptt.append(cluster_hierarchy_node);

                for core_idx in 0..cores_per_package {
                    let core_offset = pptt.len() - pptt_start;

                    if threads_per_core > 1 {
                        let core_hierarchy_node = ProcessorHierarchyNode {
                            r#type: 0,
                            length: 20,
                            reserved: 0,
                            flags: 0x2,
                            parent: cluster_offset as u32,
                            acpi_processor_id: core_idx as u32,
                            num_private_resources: 0,
                        };
                        pptt.append(core_hierarchy_node);

                        for _thread_idx in 0..threads_per_core {
                            let thread_hierarchy_node = ProcessorHierarchyNode {
                                r#type: 0,
                                length: 20,
                                reserved: 0,
                                flags: 0xE,
                                parent: core_offset as u32,
                                acpi_processor_id: uid as u32,
                                num_private_resources: 0,
                            };
                            pptt.append(thread_hierarchy_node);
                            uid += 1;
                        }
                    } else {
                        let thread_hierarchy_node = ProcessorHierarchyNode {
                            r#type: 0,
                            length: 20,
                            reserved: 0,
                            flags: 0xA,
                            parent: cluster_offset as u32,
                            acpi_processor_id: uid as u32,
                            num_private_resources: 0,
                        };
                        pptt.append(thread_hierarchy_node);
                        uid += 1;
                    }
                }
                cpus += (cores_per_package * threads_per_core) as usize;
            }
        }

        pptt.update_checksum();
        pptt
    }
1440 
1441     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1442     fn get_regs(&self, cpu_id: u8) -> Result<StandardRegisters> {
1443         self.vcpus[usize::from(cpu_id)]
1444             .lock()
1445             .unwrap()
1446             .vcpu
1447             .get_regs()
1448             .map_err(Error::CpuDebug)
1449     }
1450 
1451     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1452     fn set_regs(&self, cpu_id: u8, regs: &StandardRegisters) -> Result<()> {
1453         self.vcpus[usize::from(cpu_id)]
1454             .lock()
1455             .unwrap()
1456             .vcpu
1457             .set_regs(regs)
1458             .map_err(Error::CpuDebug)
1459     }
1460 
1461     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1462     fn get_sregs(&self, cpu_id: u8) -> Result<SpecialRegisters> {
1463         self.vcpus[usize::from(cpu_id)]
1464             .lock()
1465             .unwrap()
1466             .vcpu
1467             .get_sregs()
1468             .map_err(Error::CpuDebug)
1469     }
1470 
1471     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1472     fn set_sregs(&self, cpu_id: u8, sregs: &SpecialRegisters) -> Result<()> {
1473         self.vcpus[usize::from(cpu_id)]
1474             .lock()
1475             .unwrap()
1476             .vcpu
1477             .set_sregs(sregs)
1478             .map_err(Error::CpuDebug)
1479     }
1480 
1481     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
1482     fn translate_gva(&self, cpu_id: u8, gva: u64) -> Result<u64> {
1483         let (gpa, _) = self.vcpus[usize::from(cpu_id)]
1484             .lock()
1485             .unwrap()
1486             .vcpu
1487             .translate_gva(gva, /* flags: unused */ 0)
1488             .map_err(Error::TranslateVirtualAddress)?;
1489         Ok(gpa)
1490     }
1491 
1492     pub fn vcpus_paused(&self) -> bool {
1493         self.vcpus_pause_signalled.load(Ordering::SeqCst)
1494     }
1495 }
1496 
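     // ACPI description of a single vCPU, used to generate the per-CPU AML
     // device objects (C000, C001, ...).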
1497 struct Cpu {
1498     cpu_id: u8,
1499     proximity_domain: u32,
1500     dynamic: bool,
1501 }
1502 
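     // Bit position of the "Enabled" flag in the MADT Local APIC flags field.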
1503 #[cfg(target_arch = "x86_64")]
1504 const MADT_CPU_ENABLE_FLAG: usize = 0;
1505 
1506 impl Cpu {
1507     #[cfg(target_arch = "x86_64")]
1508     fn generate_mat(&self) -> Vec<u8> {
1509         let lapic = LocalApic {
1510             r#type: 0,
1511             length: 8,
1512             processor_id: self.cpu_id,
1513             apic_id: self.cpu_id,
1514             flags: 1 << MADT_CPU_ENABLE_FLAG,
1515         };
1516 
1517         let mut mat_data: Vec<u8> = Vec::new();
1518         mat_data.resize(std::mem::size_of_val(&lapic), 0);
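             // Safety: mat_data was just resized to size_of_val(&lapic) bytes, so
             // writing a single LocalApic through the raw pointer stays in bounds.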
1519         unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic };
1520 
1521         mat_data
1522     }
1523 }
1524 
1525 impl Aml for Cpu {
1526     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1527         #[cfg(target_arch = "x86_64")]
1528         let mat_data: Vec<u8> = self.generate_mat();
1529         #[allow(clippy::if_same_then_else)]
1530         if self.dynamic {
1531             aml::Device::new(
1532                 format!("C{:03}", self.cpu_id).as_str().into(),
1533                 vec![
1534                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1535                     &aml::Name::new("_UID".into(), &self.cpu_id),
1536                     // Currently, AArch64 does not support the following fields.
1537                     /*
1538                     _STA return value:
1539                     Bit [0] – Set if the device is present.
1540                     Bit [1] – Set if the device is enabled and decoding its resources.
1541                     Bit [2] – Set if the device should be shown in the UI.
1542                     Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
1543                     Bit [4] – Set if the battery is present.
1544                     Bits [31:5] – Reserved (must be cleared).
1545                     */
1546                     #[cfg(target_arch = "x86_64")]
1547                     &aml::Method::new(
1548                         "_STA".into(),
1549                         0,
1550                         false,
1551                         // Call into the CSTA method, which will interrogate the device
1552                         vec![&aml::Return::new(&aml::MethodCall::new(
1553                             "CSTA".into(),
1554                             vec![&self.cpu_id],
1555                         ))],
1556                     ),
1557                     &aml::Method::new(
1558                         "_PXM".into(),
1559                         0,
1560                         false,
1561                         vec![&aml::Return::new(&self.proximity_domain)],
1562                     ),
1563                     // The Linux kernel expects every CPU device to have a _MAT entry
1564                     // containing the LAPIC for this processor with the enabled bit set
1565                     // even if it is disabled in the MADT (non-boot CPU)
1566                     #[cfg(target_arch = "x86_64")]
1567                     &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
1568                     // Trigger CPU ejection
1569                     #[cfg(target_arch = "x86_64")]
1570                     &aml::Method::new(
1571                         "_EJ0".into(),
1572                         1,
1573                         false,
1574                         // Call into the CEJ0 method, which will actually eject the device
1575                         vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
1576                     ),
1577                 ],
1578             )
1579             .append_aml_bytes(bytes);
1580         } else {
1581             aml::Device::new(
1582                 format!("C{:03}", self.cpu_id).as_str().into(),
1583                 vec![
1584                     &aml::Name::new("_HID".into(), &"ACPI0007"),
1585                     &aml::Name::new("_UID".into(), &self.cpu_id),
1586                     #[cfg(target_arch = "x86_64")]
1587                     &aml::Method::new(
1588                         "_STA".into(),
1589                         0,
1590                         false,
1591                         // Mark the CPU as present; see the CSTA implementation
1592                         vec![&aml::Return::new(&0xfu8)],
1593                     ),
1594                     &aml::Method::new(
1595                         "_PXM".into(),
1596                         0,
1597                         false,
1598                         vec![&aml::Return::new(&self.proximity_domain)],
1599                     ),
1600                     // The Linux kernel expects every CPU device to have a _MAT entry
1601                     // containing the LAPIC for this processor with the enabled bit set
1602                     // even if it is disabled in the MADT (non-boot CPU)
1603                     #[cfg(target_arch = "x86_64")]
1604                     &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
1605                 ],
1606             )
1607             .append_aml_bytes(bytes);
1608         }
1609     }
1610 }
1611 
1612 struct CpuNotify {
1613     cpu_id: u8,
1614 }
1615 
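     // Emits an AML fragment of the form: if the CPU id passed in Arg0 matches,
     // forward the notification value in Arg1 to the corresponding CPU device
     // object. CTFY concatenates one of these per vCPU to form a dispatch table.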
1616 impl Aml for CpuNotify {
1617     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1618         let object = aml::Path::new(&format!("C{:03}", self.cpu_id));
1619         aml::If::new(
1620             &aml::Equal::new(&aml::Arg(0), &self.cpu_id),
1621             vec![&aml::Notify::new(&object, &aml::Arg(1))],
1622         )
1623         .append_aml_bytes(bytes)
1624     }
1625 }
1626 
1627 struct CpuMethods {
1628     max_vcpus: u8,
1629     dynamic: bool,
1630 }
1631 
1632 impl Aml for CpuMethods {
1633     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1634         if self.dynamic {
1635             // CPU status method
1636             aml::Method::new(
1637                 "CSTA".into(),
1638                 1,
1639                 true,
1640                 vec![
1641                     // Take lock defined above
1642                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1643                     // Write CPU number (in first argument) to I/O port via field
1644                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1645                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1646                     // If the CPEN bit is set, make the local variable 0xf (see _STA for the meaning of the bits)
1647                     &aml::If::new(
1648                         &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
1649                         vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
1650                     ),
1651                     // Release lock
1652                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1653                     // Return 0 or 0xf
1654                     &aml::Return::new(&aml::Local(0)),
1655                 ],
1656             )
1657             .append_aml_bytes(bytes);
1658 
1659             let mut cpu_notifies = Vec::new();
1660             for cpu_id in 0..self.max_vcpus {
1661                 cpu_notifies.push(CpuNotify { cpu_id });
1662             }
1663 
1664             let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
1665             for cpu_id in 0..self.max_vcpus {
1666                 cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
1667             }
1668 
1669             aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).append_aml_bytes(bytes);
1670 
1671             aml::Method::new(
1672                 "CEJ0".into(),
1673                 1,
1674                 true,
1675                 vec![
1676                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1677                     // Write CPU number (in first argument) to I/O port via field
1678                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
1679                     // Set CEJ0 bit
1680                     &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
1681                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1682                 ],
1683             )
1684             .append_aml_bytes(bytes);
1685 
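                 // CSCN: scan every possible vCPU and notify the device objects of
                 // those with a pending insertion or removal event.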
1686             aml::Method::new(
1687                 "CSCN".into(),
1688                 0,
1689                 true,
1690                 vec![
1691                     // Take lock defined above
1692                     &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
1693                     &aml::Store::new(&aml::Local(0), &aml::ZERO),
1694                     &aml::While::new(
1695                         &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
1696                         vec![
1697                             // Write CPU number (from Local0) to I/O port via field
1698                             &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
1699                             // Check if CINS bit is set
1700                             &aml::If::new(
1701                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
1702                                 // Notify device if it is
1703                                 vec![
1704                                     &aml::MethodCall::new(
1705                                         "CTFY".into(),
1706                                         vec![&aml::Local(0), &aml::ONE],
1707                                     ),
1708                                     // Reset CINS bit (the device clears it when 1 is written)
1709                                     &aml::Store::new(
1710                                         &aml::Path::new("\\_SB_.PRES.CINS"),
1711                                         &aml::ONE,
1712                                     ),
1713                                 ],
1714                             ),
1715                             // Check if CRMV bit is set
1716                             &aml::If::new(
1717                                 &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
1718                                 // Notify device if it is (with the eject constant 0x3)
1719                                 vec![
1720                                     &aml::MethodCall::new(
1721                                         "CTFY".into(),
1722                                         vec![&aml::Local(0), &3u8],
1723                                     ),
1724                                     // Reset CRMV bit (the device clears it when 1 is written)
1725                                     &aml::Store::new(
1726                                         &aml::Path::new("\\_SB_.PRES.CRMV"),
1727                                         &aml::ONE,
1728                                     ),
1729                                 ],
1730                             ),
1731                             &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
1732                         ],
1733                     ),
1734                     // Release lock
1735                     &aml::Release::new("\\_SB_.PRES.CPLK".into()),
1736                 ],
1737             )
1738             .append_aml_bytes(bytes)
1739         } else {
1740             aml::Method::new("CSCN".into(), 0, true, vec![]).append_aml_bytes(bytes)
1741         }
1742     }
1743 }
1744 
1745 impl Aml for CpuManager {
1746     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
1747         #[cfg(target_arch = "x86_64")]
1748         if let Some(acpi_address) = self.acpi_address {
1749             // CPU hotplug controller
1750             aml::Device::new(
1751                 "_SB_.PRES".into(),
1752                 vec![
1753                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
1754                     &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
1755                     // Mutex to protect concurrent access, since we write to select a CPU and then read back its status
1756                     &aml::Mutex::new("CPLK".into(), 0),
1757                     &aml::Name::new(
1758                         "_CRS".into(),
1759                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
1760                             aml::AddressSpaceCachable::NotCacheable,
1761                             true,
1762                             acpi_address.0 as u64,
1763                             acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
1764                         )]),
1765                     ),
1766                     // OpRegion and Fields map MMIO range into individual field values
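                         // The two Field definitions that follow are overlapping views of the
                         // same 12-byte OpRegion: byte-granular status/control bits at offset 4,
                         // and DWord-wide CSEL/CDAT registers at offsets 0 and 8.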
1767                     &aml::OpRegion::new(
1768                         "PRST".into(),
1769                         aml::OpRegionSpace::SystemMemory,
1770                         acpi_address.0 as usize,
1771                         CPU_MANAGER_ACPI_SIZE,
1772                     ),
1773                     &aml::Field::new(
1774                         "PRST".into(),
1775                         aml::FieldAccessType::Byte,
1776                         aml::FieldUpdateRule::WriteAsZeroes,
1777                         vec![
1778                             aml::FieldEntry::Reserved(32),
1779                             aml::FieldEntry::Named(*b"CPEN", 1),
1780                             aml::FieldEntry::Named(*b"CINS", 1),
1781                             aml::FieldEntry::Named(*b"CRMV", 1),
1782                             aml::FieldEntry::Named(*b"CEJ0", 1),
1783                             aml::FieldEntry::Reserved(4),
1784                             aml::FieldEntry::Named(*b"CCMD", 8),
1785                         ],
1786                     ),
1787                     &aml::Field::new(
1788                         "PRST".into(),
1789                         aml::FieldAccessType::DWord,
1790                         aml::FieldUpdateRule::Preserve,
1791                         vec![
1792                             aml::FieldEntry::Named(*b"CSEL", 32),
1793                             aml::FieldEntry::Reserved(32),
1794                             aml::FieldEntry::Named(*b"CDAT", 32),
1795                         ],
1796                     ),
1797                 ],
1798             )
1799             .append_aml_bytes(bytes);
1800         }
1801 
1802         // CPU devices
1803         let hid = aml::Name::new("_HID".into(), &"ACPI0010");
1804         let cid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05"));
1805         // Bundle methods together under a common object
1806         let methods = CpuMethods {
1807             max_vcpus: self.config.max_vcpus,
1808             dynamic: self.dynamic,
1809         };
1810         let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &cid, &methods];
1811 
1812         let mut cpu_devices = Vec::new();
1813         for cpu_id in 0..self.config.max_vcpus {
1814             let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
1815             let cpu_device = Cpu {
1816                 cpu_id,
1817                 proximity_domain,
1818                 dynamic: self.dynamic,
1819             };
1820 
1821             cpu_devices.push(cpu_device);
1822         }
1823 
1824         for cpu_device in cpu_devices.iter() {
1825             cpu_data_inner.push(cpu_device);
1826         }
1827 
1828         aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).append_aml_bytes(bytes)
1829     }
1830 }
1831 
1832 impl Pausable for CpuManager {
1833     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
1834         // Tell the vCPUs to pause themselves next time they exit
1835         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
1836 
1837         // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
1838         // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
1839         // above.
1840         for state in self.vcpu_states.iter() {
1841             state.signal_thread();
1842         }
1843 
1844         for vcpu in self.vcpus.iter() {
1845             let mut vcpu = vcpu.lock().unwrap();
1846             vcpu.pause()?;
1847             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
1848             if !self.config.kvm_hyperv {
1849                 vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
1850                     MigratableError::Pause(anyhow!(
1851                         "Could not notify guest it has been paused {:?}",
1852                         e
1853                     ))
1854                 })?;
1855             }
1856         }
1857 
1858         Ok(())
1859     }
1860 
1861     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
1862         for vcpu in self.vcpus.iter() {
1863             vcpu.lock().unwrap().resume()?;
1864         }
1865 
1866         // Toggle the vCPUs pause boolean
1867         self.vcpus_pause_signalled.store(false, Ordering::SeqCst);
1868 
1869         // Unpark all the VCPU threads.
1870         // Once unparked, the first thing they will do is check the pause
1871         // boolean. Since it is now false, they will exit their pause loop
1872         // and go back to VMX root.
1873         for state in self.vcpu_states.iter() {
1874             state.unpark_thread();
1875         }
1876         Ok(())
1877     }
1878 }
1879 
1880 impl Snapshottable for CpuManager {
1881     fn id(&self) -> String {
1882         CPU_MANAGER_SNAPSHOT_ID.to_string()
1883     }
1884 
1885     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
1886         let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID);
1887 
1888         // The CpuManager snapshot is a collection of all vCPUs snapshots.
1889         for vcpu in &self.vcpus {
1890             let cpu_snapshot = vcpu.lock().unwrap().snapshot()?;
1891             cpu_manager_snapshot.add_snapshot(cpu_snapshot);
1892         }
1893 
1894         Ok(cpu_manager_snapshot)
1895     }
1896 
1897     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
1898         for (cpu_id, snapshot) in snapshot.snapshots.iter() {
1899             info!("Restoring VCPU {}", cpu_id);
1900             self.create_vcpu(cpu_id.parse::<u8>().unwrap(), None, Some(*snapshot.clone()))
1901                 .map_err(|e| MigratableError::Restore(anyhow!("Could not create vCPU {:?}", e)))?;
1902         }
1903 
1904         Ok(())
1905     }
1906 }
1907 
1908 impl Transportable for CpuManager {}
1909 impl Migratable for CpuManager {}
1910 
1911 #[cfg(feature = "gdb")]
1912 impl Debuggable for CpuManager {
1913     #[cfg(feature = "kvm")]
1914     fn set_guest_debug(
1915         &self,
1916         cpu_id: usize,
1917         addrs: &[GuestAddress],
1918         singlestep: bool,
1919     ) -> std::result::Result<(), DebuggableError> {
1920         self.vcpus[cpu_id]
1921             .lock()
1922             .unwrap()
1923             .vcpu
1924             .set_guest_debug(addrs, singlestep)
1925             .map_err(DebuggableError::SetDebug)
1926     }
1927 
1928     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
1929         Ok(())
1930     }
1931 
1932     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
1933         Ok(())
1934     }
1935 
1936     #[cfg(target_arch = "x86_64")]
1937     fn read_regs(&self, cpu_id: usize) -> std::result::Result<X86_64CoreRegs, DebuggableError> {
1938         // General registers: RAX, RBX, RCX, RDX, RSI, RDI, RBP, RSP, r8-r15
1939         let gregs = self
1940             .get_regs(cpu_id as u8)
1941             .map_err(DebuggableError::ReadRegs)?;
1942         let regs = [
1943             gregs.rax, gregs.rbx, gregs.rcx, gregs.rdx, gregs.rsi, gregs.rdi, gregs.rbp, gregs.rsp,
1944             gregs.r8, gregs.r9, gregs.r10, gregs.r11, gregs.r12, gregs.r13, gregs.r14, gregs.r15,
1945         ];
1946 
1947         // GDB exposes 32-bit eflags instead of 64-bit rflags.
1948         // https://github.com/bminor/binutils-gdb/blob/master/gdb/features/i386/64bit-core.xml
1949         let eflags = gregs.rflags as u32;
1950         let rip = gregs.rip;
1951 
1952         // Segment registers: CS, SS, DS, ES, FS, GS
1953         let sregs = self
1954             .get_sregs(cpu_id as u8)
1955             .map_err(DebuggableError::ReadRegs)?;
1956         let segments = X86SegmentRegs {
1957             cs: sregs.cs.selector as u32,
1958             ss: sregs.ss.selector as u32,
1959             ds: sregs.ds.selector as u32,
1960             es: sregs.es.selector as u32,
1961             fs: sregs.fs.selector as u32,
1962             gs: sregs.gs.selector as u32,
1963         };
1964 
1965         // TODO: Add other registers
1966 
1967         Ok(X86_64CoreRegs {
1968             regs,
1969             eflags,
1970             rip,
1971             segments,
1972             ..Default::default()
1973         })
1974     }
1975 
1976     #[cfg(target_arch = "x86_64")]
1977     fn write_regs(
1978         &self,
1979         cpu_id: usize,
1980         regs: &X86_64CoreRegs,
1981     ) -> std::result::Result<(), DebuggableError> {
1982         let orig_gregs = self
1983             .get_regs(cpu_id as u8)
1984             .map_err(DebuggableError::ReadRegs)?;
1985         let gregs = StandardRegisters {
1986             rax: regs.regs[0],
1987             rbx: regs.regs[1],
1988             rcx: regs.regs[2],
1989             rdx: regs.regs[3],
1990             rsi: regs.regs[4],
1991             rdi: regs.regs[5],
1992             rbp: regs.regs[6],
1993             rsp: regs.regs[7],
1994             r8: regs.regs[8],
1995             r9: regs.regs[9],
1996             r10: regs.regs[10],
1997             r11: regs.regs[11],
1998             r12: regs.regs[12],
1999             r13: regs.regs[13],
2000             r14: regs.regs[14],
2001             r15: regs.regs[15],
2002             rip: regs.rip,
2003             // Update the lower 32-bit of rflags.
2004             rflags: (orig_gregs.rflags & !(u32::MAX as u64)) | (regs.eflags as u64),
2005         };
2006 
2007         self.set_regs(cpu_id as u8, &gregs)
2008             .map_err(DebuggableError::WriteRegs)?;
2009 
2010         // Segment registers: CS, SS, DS, ES, FS, GS
2011         // Since GDB cares only about the selectors, we call get_sregs() first.
2012         let mut sregs = self
2013             .get_sregs(cpu_id as u8)
2014             .map_err(DebuggableError::ReadRegs)?;
2015         sregs.cs.selector = regs.segments.cs as u16;
2016         sregs.ss.selector = regs.segments.ss as u16;
2017         sregs.ds.selector = regs.segments.ds as u16;
2018         sregs.es.selector = regs.segments.es as u16;
2019         sregs.fs.selector = regs.segments.fs as u16;
2020         sregs.gs.selector = regs.segments.gs as u16;
2021 
2022         self.set_sregs(cpu_id as u8, &sregs)
2023             .map_err(DebuggableError::WriteRegs)?;
2024 
2025         // TODO: Add other registers
2026 
2027         Ok(())
2028     }
2029 
2030     #[cfg(target_arch = "x86_64")]
2031     fn read_mem(
2032         &self,
2033         cpu_id: usize,
2034         vaddr: GuestAddress,
2035         len: usize,
2036     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2037         let mut buf = vec![0; len];
2038         let mut total_read = 0_u64;
2039 
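             // Read page by page: a GVA range is only guaranteed to map to
             // contiguous GPAs within a single page, so each page must be
             // translated separately.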
2040         while total_read < len as u64 {
2041             let gaddr = vaddr.0 + total_read;
2042             let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
2043                 Ok(paddr) => paddr,
2044                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2045                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2046             };
2047             let psize = arch::PAGE_SIZE as u64;
2048             let read_len = std::cmp::min(len as u64 - total_read, psize - (paddr & (psize - 1)));
2049             self.vmmops
2050                 .guest_mem_read(
2051                     paddr,
2052                     &mut buf[total_read as usize..total_read as usize + read_len as usize],
2053                 )
2054                 .map_err(DebuggableError::ReadMem)?;
2055             total_read += read_len;
2056         }
2057         Ok(buf)
2058     }
2059 
2060     #[cfg(target_arch = "x86_64")]
2061     fn write_mem(
2062         &self,
2063         cpu_id: usize,
2064         vaddr: &GuestAddress,
2065         data: &[u8],
2066     ) -> std::result::Result<(), DebuggableError> {
2067         let mut total_written = 0_u64;
2068 
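             // Mirror read_mem: translate and write one page at a time, since the
             // GVA->GPA mapping is only contiguous within a page.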
2069         while total_written < data.len() as u64 {
2070             let gaddr = vaddr.0 + total_written;
2071             let paddr = match self.translate_gva(cpu_id as u8, gaddr) {
2072                 Ok(paddr) => paddr,
2073                 Err(_) if gaddr == u64::MIN => gaddr, // Silently return GVA as GPA if GVA == 0.
2074                 Err(e) => return Err(DebuggableError::TranslateGva(e)),
2075             };
2076             let psize = arch::PAGE_SIZE as u64;
2077             let write_len = std::cmp::min(
2078                 data.len() as u64 - total_written,
2079                 psize - (paddr & (psize - 1)),
2080             );
2081             self.vmmops
2082                 .guest_mem_write(
2083                     paddr,
2084                     &data[total_written as usize..total_written as usize + write_len as usize],
2085                 )
2086                 .map_err(DebuggableError::WriteMem)?;
2087             total_written += write_len;
2088         }
2089         Ok(())
2090     }
2091 
2092     fn active_vcpus(&self) -> usize {
2093         self.present_vcpus() as usize
2094     }
2095 }
2096 
2097 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2098 #[cfg(test)]
2099 mod tests {
2100     use arch::x86_64::interrupts::*;
2101     use arch::x86_64::regs::*;
2102     use hypervisor::x86_64::{FpuState, LapicState, StandardRegisters};
2103 
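         // These tests exercise a live hypervisor and therefore require access
         // to /dev/kvm on the host they run on.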
2104     #[test]
2105     fn test_setlint() {
2106         let hv = hypervisor::new().unwrap();
2107         let vm = hv.create_vm().expect("new VM fd creation failed");
2108         assert!(hv.check_required_extensions().is_ok());
2109         // Calling get_lapic will fail if there is no irqchip beforehand.
2110         assert!(vm.create_irq_chip().is_ok());
2111         let vcpu = vm.create_vcpu(0, None).unwrap();
2112         let klapic_before: LapicState = vcpu.get_lapic().unwrap();
2113 
2114         // Compute the value that is expected to represent LVT0 and LVT1.
2115         let lint0 = get_klapic_reg(&klapic_before, APIC_LVT0);
2116         let lint1 = get_klapic_reg(&klapic_before, APIC_LVT1);
2117         let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
2118         let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);
2119 
2120         set_lint(&vcpu).unwrap();
2121 
2122         // Compute the value that represents LVT0 and LVT1 after set_lint.
2123         let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
2124         let lint0_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT0);
2125         let lint1_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT1);
2126         assert_eq!(lint0_mode_expected, lint0_mode_actual);
2127         assert_eq!(lint1_mode_expected, lint1_mode_actual);
2128     }
2129 
2130     #[test]
2131     fn test_setup_fpu() {
2132         let hv = hypervisor::new().unwrap();
2133         let vm = hv.create_vm().expect("new VM fd creation failed");
2134         let vcpu = vm.create_vcpu(0, None).unwrap();
2135         setup_fpu(&vcpu).unwrap();
2136 
2137         let expected_fpu: FpuState = FpuState {
2138             fcw: 0x37f,
2139             mxcsr: 0x1f80,
2140             ..Default::default()
2141         };
2142         let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
2143         // TODO: auto-generate kvm related structures with PartialEq on.
2144         assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
2145         // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
2146         // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
2147         // The mxcsr will stay 0 and the assert below would fail. Decide whether this
2148         // assertion should be removed altogether.
2149         // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
2150     }
2151 
2152     #[test]
2153     fn test_setup_msrs() {
2154         use hypervisor::arch::x86::msr_index;
2155         use hypervisor::x86_64::{MsrEntries, MsrEntry};
2156 
2157         let hv = hypervisor::new().unwrap();
2158         let vm = hv.create_vm().expect("new VM fd creation failed");
2159         let vcpu = vm.create_vcpu(0, None).unwrap();
2160         setup_msrs(&vcpu).unwrap();
2161 
2162         // This test will check against the last MSR entry configured (the tenth one).
2163         // See create_msr_entries for details.
2164         let mut msrs = MsrEntries::from_entries(&[MsrEntry {
2165             index: msr_index::MSR_IA32_MISC_ENABLE,
2166             ..Default::default()
2167         }])
2168         .unwrap();
2169 
2170         // get_msrs returns the number of MSRs that it succeeds in reading. We only want
2171         // to read one in this test case.
2172         let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
2173         assert_eq!(read_msrs, 1);
2174 
2175         // Official entries that were set up when we called setup_msrs. We need to assert
2176         // that the tenth one (i.e. the one with index msr_index::MSR_IA32_MISC_ENABLE)
2177         // has the data we expect.
2178         let entry_vec = hypervisor::x86_64::boot_msr_entries();
2179         assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
2180     }
2181 
2182     #[test]
2183     fn test_setup_regs() {
2184         let hv = hypervisor::new().unwrap();
2185         let vm = hv.create_vm().expect("new VM fd creation failed");
2186         let vcpu = vm.create_vcpu(0, None).unwrap();
2187 
2188         let expected_regs: StandardRegisters = StandardRegisters {
2189             rflags: 0x0000000000000002u64,
2190             rbx: arch::layout::PVH_INFO_START.0,
2191             rip: 1,
2192             ..Default::default()
2193         };
2194 
2195         setup_regs(&vcpu, expected_regs.rip).unwrap();
2196 
2197         let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
2198         assert_eq!(actual_regs, expected_regs);
2199     }
2200 }
2201 
2202 #[cfg(target_arch = "aarch64")]
2203 #[cfg(test)]
2204 mod tests {
2205     use arch::aarch64::regs::*;
2206     use hypervisor::kvm::aarch64::{is_system_register, MPIDR_EL1};
2207     use hypervisor::kvm::kvm_bindings::{
2208         kvm_one_reg, kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
2209         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
2210     };
2211     use hypervisor::{arm64_core_reg_id, offset__of};
2212     use std::mem;
2213 
2214     #[test]
2215     fn test_setup_regs() {
2216         let hv = hypervisor::new().unwrap();
2217         let vm = hv.create_vm().unwrap();
2218         let vcpu = vm.create_vcpu(0, None).unwrap();
2219 
2220         let res = setup_regs(&vcpu, 0, 0x0);
2221         // Must fail when vcpu is not initialized yet.
2222         assert!(res.is_err());
2223 
2224         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2225         vm.get_preferred_target(&mut kvi).unwrap();
2226         vcpu.vcpu_init(&kvi).unwrap();
2227 
2228         assert!(setup_regs(&vcpu, 0, 0x0).is_ok());
2229     }
2230 
2231     #[test]
2232     fn test_read_mpidr() {
2233         let hv = hypervisor::new().unwrap();
2234         let vm = hv.create_vm().unwrap();
2235         let vcpu = vm.create_vcpu(0, None).unwrap();
2236         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2237         vm.get_preferred_target(&mut kvi).unwrap();
2238 
2239         // Must fail when vcpu is not initialized yet.
2240         assert!(vcpu.read_mpidr().is_err());
2241 
2242         vcpu.vcpu_init(&kvi).unwrap();
2243         assert_eq!(vcpu.read_mpidr().unwrap(), 0x80000000);
2244     }
2245 
2246     #[test]
2247     fn test_is_system_register() {
2248         let offset = offset__of!(user_pt_regs, pc);
2249         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
2250         assert!(!is_system_register(regid));
2251         let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64;
2252         assert!(is_system_register(regid));
2253     }
2254 
2255     #[test]
2256     fn test_save_restore_core_regs() {
2257         let hv = hypervisor::new().unwrap();
2258         let vm = hv.create_vm().unwrap();
2259         let vcpu = vm.create_vcpu(0, None).unwrap();
2260         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2261         vm.get_preferred_target(&mut kvi).unwrap();
2262 
2263         // Must fail when vcpu is not initialized yet.
2264         let mut state = kvm_regs::default();
2265         let res = vcpu.core_registers(&mut state);
2266         assert!(res.is_err());
2267         assert_eq!(
2268             format!("{}", res.unwrap_err()),
2269             "Failed to get core register: Exec format error (os error 8)"
2270         );
2271 
2272         let res = vcpu.set_core_registers(&state);
2273         assert!(res.is_err());
2274         assert_eq!(
2275             format!("{}", res.unwrap_err()),
2276             "Failed to set core register: Exec format error (os error 8)"
2277         );
2278 
2279         vcpu.vcpu_init(&kvi).unwrap();
2280         assert!(vcpu.core_registers(&mut state).is_ok());
2281         assert_eq!(state.regs.pstate, 0x3C5);
2282 
2283         assert!(vcpu.set_core_registers(&state).is_ok());
2284         let off = offset__of!(user_pt_regs, pstate);
2285         let pstate = vcpu
2286             .get_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
2287             .expect("Failed to call kvm get one reg");
2288         assert_eq!(state.regs.pstate, pstate);
2289     }
2290 
2291     #[test]
2292     fn test_save_restore_system_regs() {
2293         let hv = hypervisor::new().unwrap();
2294         let vm = hv.create_vm().unwrap();
2295         let vcpu = vm.create_vcpu(0, None).unwrap();
2296         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2297         vm.get_preferred_target(&mut kvi).unwrap();
2298 
2299         // Must fail when vcpu is not initialized yet.
2300         let mut state: Vec<kvm_one_reg> = Vec::new();
2301         let res = vcpu.system_registers(&mut state);
2302         assert!(res.is_err());
2303         assert_eq!(
2304             format!("{}", res.unwrap_err()),
2305             "Failed to retrieve list of registers: Exec format error (os error 8)"
2306         );
2307 
2308         state.push(kvm_one_reg {
2309             id: MPIDR_EL1,
2310             addr: 0x00,
2311         });
2312         let res = vcpu.set_system_registers(&state);
2313         assert!(res.is_err());
2314         assert_eq!(
2315             format!("{}", res.unwrap_err()),
2316             "Failed to set system register: Exec format error (os error 8)"
2317         );
2318 
2319         vcpu.vcpu_init(&kvi).unwrap();
2320         assert!(vcpu.system_registers(&mut state).is_ok());
2321         let initial_mpidr: u64 = vcpu.read_mpidr().expect("Failed to read mpidr");
2322         assert!(state.contains(&kvm_one_reg {
2323             id: MPIDR_EL1,
2324             addr: initial_mpidr
2325         }));
2326 
2327         assert!(vcpu.set_system_registers(&state).is_ok());
2328         let mpidr: u64 = vcpu.read_mpidr().expect("Failed to read mpidr");
2329         assert_eq!(initial_mpidr, mpidr);
2330     }
2331 
2332     #[test]
2333     fn test_get_set_mpstate() {
2334         let hv = hypervisor::new().unwrap();
2335         let vm = hv.create_vm().unwrap();
2336         let vcpu = vm.create_vcpu(0, None).unwrap();
2337         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
2338         vm.get_preferred_target(&mut kvi).unwrap();
2339 
2340         let res = vcpu.get_mp_state();
2341         assert!(res.is_ok());
2342         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
2343     }
2344 }
2345