xref: /cloud-hypervisor/vmm/src/cpu.rs (revision 9af2968a7dc47b89bf07ea9dc5e735084efcfa3a)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::CpusConfig;
15 use crate::device_manager::DeviceManager;
16 use crate::memory_manager::MemoryManager;
17 use crate::seccomp_filters::{get_seccomp_filter, Thread};
18 #[cfg(target_arch = "x86_64")]
19 use crate::vm::physical_bits;
20 #[cfg(feature = "acpi")]
21 use crate::vm::NumaNodes;
22 use crate::GuestMemoryMmap;
23 use crate::CPU_MANAGER_SNAPSHOT_ID;
24 #[cfg(feature = "acpi")]
25 use acpi_tables::{aml, aml::Aml, sdt::Sdt};
26 use anyhow::anyhow;
27 use arch::EntryPoint;
28 use devices::interrupt_controller::InterruptController;
29 #[cfg(target_arch = "aarch64")]
30 use hypervisor::kvm::kvm_bindings;
31 #[cfg(target_arch = "x86_64")]
32 use hypervisor::CpuId;
33 use hypervisor::{vm::VmmOps, CpuState, HypervisorCpuError, VmExit};
34 use libc::{c_void, siginfo_t};
35 use seccomp::{SeccompAction, SeccompFilter};
36 #[cfg(feature = "acpi")]
37 use std::collections::BTreeMap;
38 use std::os::unix::thread::JoinHandleExt;
39 use std::sync::atomic::{AtomicBool, Ordering};
40 use std::sync::{Arc, Barrier, Mutex};
41 use std::{cmp, io, result, thread};
42 use vm_device::BusDevice;
43 #[cfg(feature = "acpi")]
44 use vm_memory::GuestAddress;
45 use vm_memory::GuestMemoryAtomic;
46 use vm_migration::{
47     Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
48     Transportable,
49 };
50 use vmm_sys_util::eventfd::EventFd;
51 use vmm_sys_util::signal::{register_signal_handler, SIGRTMIN};
52 
#[cfg(feature = "acpi")]
// Size in bytes of the MMIO window reserved for the CPU manager ACPI device
// (see the BusDevice impl below for the register layout within this window).
pub const CPU_MANAGER_ACPI_SIZE: usize = 0xc;
55 
/// Errors that can be returned by the CPU manager and vCPU handling code.
#[derive(Debug)]
pub enum Error {
    /// Cannot create the vCPU.
    VcpuCreate(anyhow::Error),

    /// Cannot run the VCPUs.
    VcpuRun(anyhow::Error),

    /// Cannot spawn a new vCPU thread.
    VcpuSpawn(io::Error),

    /// Cannot generate common CPUID
    CommonCpuId(arch::Error),

    /// Error configuring VCPU
    VcpuConfiguration(arch::Error),

    #[cfg(target_arch = "aarch64")]
    /// Error fetching preferred target
    VcpuArmPreferredTarget(hypervisor::HypervisorVmError),

    #[cfg(target_arch = "aarch64")]
    /// Error doing vCPU init on Arm.
    VcpuArmInit(hypervisor::HypervisorCpuError),

    /// Failed to join on vCPU threads
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    /// Cannot add legacy device to Bus.
    BusError(vm_device::BusError),

    /// Asking for more vCPUs than we can have
    DesiredVCpuCountExceedsMax,

    /// Cannot create seccomp filter
    CreateSeccompFilter(seccomp::SeccompError),

    /// Cannot apply seccomp filter
    ApplySeccompFilter(seccomp::Error),

    /// Error starting vCPU after restore
    StartRestoreVcpu(anyhow::Error),

    /// Error because an unexpected VmExit type was received.
    UnexpectedVmExit,

    /// Failed to allocate MMIO address
    // NOTE(review): the variant name has a historical "Mmmio" spelling; it is
    // kept as-is since renaming would break callers elsewhere in the crate.
    AllocateMmmioAddress,

    #[cfg(feature = "tdx")]
    /// Cannot initialize TDX on the vCPU.
    InitializeTdx(hypervisor::HypervisorCpuError),
}
/// Shorthand result type used throughout this module.
pub type Result<T> = result::Result<T, Error>;
109 
#[cfg(all(target_arch = "x86_64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
// Packed layout mirroring the ACPI MADT "Processor Local APIC" entry
// (field order and sizes are ABI — do not reorder).
struct LocalApic {
    pub r#type: u8,
    pub length: u8,
    pub processor_id: u8,
    pub apic_id: u8,
    pub flags: u32,
}
120 
#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
// Packed layout mirroring the ACPI MADT "I/O APIC" entry
// (field order and sizes are ABI — do not reorder).
struct Ioapic {
    pub r#type: u8,
    pub length: u8,
    pub ioapic_id: u8,
    _reserved: u8,
    pub apic_address: u32,
    pub gsi_base: u32,
}
132 
#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
// Packed layout mirroring the ACPI MADT "GIC CPU Interface (GICC)" entry
// (field order and sizes are ABI — do not reorder).
struct GicC {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub cpu_interface_number: u32,
    pub uid: u32,
    pub flags: u32,
    pub parking_version: u32,
    pub performance_interrupt: u32,
    pub parked_address: u64,
    pub base_address: u64,
    pub gicv_base_address: u64,
    pub gich_base_address: u64,
    pub vgic_interrupt: u32,
    pub gicr_base_address: u64,
    pub mpidr: u64,
    pub proc_power_effi_class: u8,
    pub reserved1: u8,
    pub spe_overflow_interrupt: u16,
}
156 
#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
// Packed layout mirroring the ACPI MADT "GIC Distributor (GICD)" entry
// (field order and sizes are ABI — do not reorder).
struct GicD {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub gic_id: u32,
    pub base_address: u64,
    pub global_irq_base: u32,
    pub version: u8,
    pub reserved1: [u8; 3],
}
170 
#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
// Packed layout mirroring the ACPI MADT "GIC Redistributor (GICR)" entry
// (field order and sizes are ABI — do not reorder).
struct GicR {
    pub r#type: u8,
    pub length: u8,
    pub reserved: u16,
    pub base_address: u64,
    pub range_length: u32,
}
181 
#[cfg(all(target_arch = "aarch64", feature = "acpi"))]
#[allow(dead_code)]
#[repr(packed)]
// Packed layout mirroring the ACPI MADT "GIC ITS" entry
// (field order and sizes are ABI — do not reorder).
struct GicIts {
    pub r#type: u8,
    pub length: u8,
    pub reserved0: u16,
    pub translation_id: u32,
    pub base_address: u64,
    pub reserved1: u32,
}
193 
#[allow(dead_code)]
#[repr(packed)]
#[derive(Default)]
// Packed layout mirroring the ACPI MADT "Interrupt Source Override" entry
// (field order and sizes are ABI — do not reorder).
struct InterruptSourceOverride {
    pub r#type: u8,
    pub length: u8,
    pub bus: u8,
    pub source: u8,
    pub gsi: u32,
    pub flags: u16,
}
205 
/// A wrapper around creating and using a kvm-based VCPU.
pub struct Vcpu {
    // The hypervisor abstracted CPU.
    vcpu: Arc<dyn hypervisor::Vcpu>,
    // Index of this vCPU, in [0, max vcpus).
    id: u8,
    // MPIDR register value, captured when the vCPU is configured.
    #[cfg(target_arch = "aarch64")]
    mpidr: u64,
    // State captured by pause(); used by resume() and snapshot().
    saved_state: Option<CpuState>,
}
215 
impl Vcpu {
    /// Constructs a new VCPU for `vm`.
    ///
    /// # Arguments
    ///
    /// * `id` - Represents the CPU number between [0, max vcpus).
    /// * `vm` - The virtual machine this vcpu will get attached to.
    /// * `vmmops` - Optional object for exit handling.
    pub fn new(
        id: u8,
        vm: &Arc<dyn hypervisor::Vm>,
        vmmops: Option<Arc<Box<dyn VmmOps>>>,
    ) -> Result<Arc<Mutex<Self>>> {
        let vcpu = vm
            .create_vcpu(id, vmmops)
            .map_err(|e| Error::VcpuCreate(e.into()))?;
        // Initially the cpuid per vCPU is the one supported by this VM.
        Ok(Arc::new(Mutex::new(Vcpu {
            vcpu,
            id,
            #[cfg(target_arch = "aarch64")]
            mpidr: 0,
            saved_state: None,
        })))
    }

    /// Configures a vcpu and should be called once per vcpu when created.
    ///
    /// On aarch64 this also performs the mandatory `vcpu_init` (via `init()`)
    /// before configuring registers, and records the resulting MPIDR.
    ///
    /// # Arguments
    ///
    /// * `kernel_entry_point` - Kernel entry point address in guest memory and boot protocol used.
    /// * `vm_memory` - Guest memory.
    /// * `cpuid` - (x86_64) CpuId, wrapper over the `kvm_cpuid2` structure.
    pub fn configure(
        &mut self,
        #[cfg(target_arch = "aarch64")] vm: &Arc<dyn hypervisor::Vm>,
        kernel_entry_point: Option<EntryPoint>,
        vm_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        #[cfg(target_arch = "x86_64")] cpuid: CpuId,
        #[cfg(target_arch = "x86_64")] kvm_hyperv: bool,
    ) -> Result<()> {
        #[cfg(target_arch = "aarch64")]
        {
            // init() must run before register configuration on aarch64.
            self.init(vm)?;
            self.mpidr = arch::configure_vcpu(&self.vcpu, self.id, kernel_entry_point, vm_memory)
                .map_err(Error::VcpuConfiguration)?;
        }
        info!("Configuring vCPU: cpu_id = {}", self.id);
        #[cfg(target_arch = "x86_64")]
        arch::configure_vcpu(
            &self.vcpu,
            self.id,
            kernel_entry_point,
            vm_memory,
            cpuid,
            kvm_hyperv,
        )
        .map_err(Error::VcpuConfiguration)?;

        Ok(())
    }

    /// Gets the MPIDR register value.
    #[cfg(target_arch = "aarch64")]
    pub fn get_mpidr(&self) -> u64 {
        self.mpidr
    }

    /// Gets the saved vCPU state.
    #[cfg(target_arch = "aarch64")]
    pub fn get_saved_state(&self) -> Option<CpuState> {
        self.saved_state.clone()
    }

    /// Initializes an aarch64 specific vcpu for booting Linux.
    #[cfg(target_arch = "aarch64")]
    pub fn init(&self, vm: &Arc<dyn hypervisor::Vm>) -> Result<()> {
        let mut kvi: kvm_bindings::kvm_vcpu_init = kvm_bindings::kvm_vcpu_init::default();

        // This reads back the kernel's preferred target type.
        vm.get_preferred_target(&mut kvi)
            .map_err(Error::VcpuArmPreferredTarget)?;
        // We already checked that the capability is supported.
        kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_PSCI_0_2;
        // Non-boot cpus are powered off initially.
        if self.id > 0 {
            kvi.features[0] |= 1 << kvm_bindings::KVM_ARM_VCPU_POWER_OFF;
        }
        self.vcpu.vcpu_init(&kvi).map_err(Error::VcpuArmInit)
    }

    /// Runs the VCPU until it exits, returning the reason.
    ///
    /// Note that the state of the VCPU and associated VM must be setup first for this to do
    /// anything useful.
    pub fn run(&self) -> std::result::Result<VmExit, HypervisorCpuError> {
        self.vcpu.run()
    }
}
315 
316 const VCPU_SNAPSHOT_ID: &str = "vcpu";
317 impl Pausable for Vcpu {
318     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
319         self.saved_state =
320             Some(self.vcpu.state().map_err(|e| {
321                 MigratableError::Pause(anyhow!("Could not get vCPU state {:?}", e))
322             })?);
323 
324         Ok(())
325     }
326 
327     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
328         if let Some(vcpu_state) = &self.saved_state {
329             self.vcpu.set_state(vcpu_state).map_err(|e| {
330                 MigratableError::Pause(anyhow!("Could not set the vCPU state {:?}", e))
331             })?;
332         }
333 
334         Ok(())
335     }
336 }
337 impl Snapshottable for Vcpu {
338     fn id(&self) -> String {
339         VCPU_SNAPSHOT_ID.to_string()
340     }
341 
342     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
343         let mut vcpu_snapshot = Snapshot::new(&format!("{}", self.id));
344         vcpu_snapshot.add_data_section(SnapshotDataSection::new_from_state(
345             VCPU_SNAPSHOT_ID,
346             &self.saved_state,
347         )?);
348 
349         Ok(vcpu_snapshot)
350     }
351 
352     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
353         self.saved_state = Some(snapshot.to_state(VCPU_SNAPSHOT_ID)?);
354         Ok(())
355     }
356 }
357 
/// Owns every vCPU of the VM and drives their lifecycle: creation, boot,
/// pause/resume, hotplug (insert/remove) and snapshot/restore.
pub struct CpuManager {
    // vCPU configuration (boot/max counts, topology, ...).
    config: CpusConfig,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    interrupt_controller: Option<Arc<Mutex<dyn InterruptController>>>,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    // Common CPUID applied to every vCPU at configuration time.
    #[cfg(target_arch = "x86_64")]
    cpuid: CpuId,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    vm: Arc<dyn hypervisor::Vm>,
    // Global "terminate" flag observed by every vCPU thread.
    vcpus_kill_signalled: Arc<AtomicBool>,
    // Global "pause" flag; vCPU threads park while it is set.
    vcpus_pause_signalled: Arc<AtomicBool>,
    exit_evt: EventFd,
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    reset_evt: EventFd,
    // Per-vCPU bookkeeping (thread handle, hotplug flags); sized to max_vcpus.
    vcpu_states: Vec<VcpuState>,
    // vCPU index targeted by the next status read/write on the ACPI device.
    selected_cpu: u8,
    // All allocated vCPU objects, indexed by cpu_id.
    vcpus: Vec<Arc<Mutex<Vcpu>>>,
    seccomp_action: SeccompAction,
    vmmops: Arc<Box<dyn VmmOps>>,
    #[cfg(feature = "acpi")]
    #[cfg_attr(target_arch = "aarch64", allow(dead_code))]
    acpi_address: GuestAddress,
    // Maps each cpu_id to its NUMA proximity domain (for the SRAT table).
    #[cfg(feature = "acpi")]
    proximity_domain_per_cpu: BTreeMap<u8, u32>,
}
384 
// Bit positions within the status byte exposed at CPU_STATUS_OFFSET.
const CPU_ENABLE_FLAG: usize = 0;
// Set while a hotplugged vCPU awaits guest acknowledgement of the insertion.
const CPU_INSERTING_FLAG: usize = 1;
// Set while a vCPU is marked for removal, pending guest acknowledgement.
const CPU_REMOVING_FLAG: usize = 2;
// Written by the guest to trigger ejection of the selected vCPU.
const CPU_EJECT_FLAG: usize = 3;

// MMIO register layout of the CPU manager device:
// byte 0 selects a vCPU, byte 4 is that vCPU's status/command register.
const CPU_STATUS_OFFSET: u64 = 4;
const CPU_SELECTION_OFFSET: u64 = 0;
392 
393 impl BusDevice for CpuManager {
394     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
395         // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
396         data.copy_from_slice(&[0; 8][0..data.len()]);
397 
398         match offset {
399             CPU_SELECTION_OFFSET => {
400                 data[0] = self.selected_cpu;
401             }
402             CPU_STATUS_OFFSET => {
403                 if self.selected_cpu < self.present_vcpus() {
404                     let state = &self.vcpu_states[usize::from(self.selected_cpu)];
405                     if state.active() {
406                         data[0] |= 1 << CPU_ENABLE_FLAG;
407                     }
408                     if state.inserting {
409                         data[0] |= 1 << CPU_INSERTING_FLAG;
410                     }
411                     if state.removing {
412                         data[0] |= 1 << CPU_REMOVING_FLAG;
413                     }
414                 }
415             }
416             _ => {
417                 warn!(
418                     "Unexpected offset for accessing CPU manager device: {:#}",
419                     offset
420                 );
421             }
422         }
423     }
424 
425     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
426         match offset {
427             CPU_SELECTION_OFFSET => {
428                 self.selected_cpu = data[0];
429             }
430             CPU_STATUS_OFFSET => {
431                 let state = &mut self.vcpu_states[usize::from(self.selected_cpu)];
432                 // The ACPI code writes back a 1 to acknowledge the insertion
433                 if (data[0] & (1 << CPU_INSERTING_FLAG) == 1 << CPU_INSERTING_FLAG)
434                     && state.inserting
435                 {
436                     state.inserting = false;
437                 }
438                 // Ditto for removal
439                 if (data[0] & (1 << CPU_REMOVING_FLAG) == 1 << CPU_REMOVING_FLAG) && state.removing
440                 {
441                     state.removing = false;
442                 }
443                 // Trigger removal of vCPU
444                 if data[0] & (1 << CPU_EJECT_FLAG) == 1 << CPU_EJECT_FLAG {
445                     if let Err(e) = self.remove_vcpu(self.selected_cpu) {
446                         error!("Error removing vCPU: {:?}", e);
447                     }
448                 }
449             }
450             _ => {
451                 warn!(
452                     "Unexpected offset for accessing CPU manager device: {:#}",
453                     offset
454                 );
455             }
456         }
457         None
458     }
459 }
460 
/// Per-vCPU bookkeeping used by the CpuManager for hotplug and thread control.
#[derive(Default)]
struct VcpuState {
    // True while the guest has not yet acknowledged a hotplug insertion.
    inserting: bool,
    // True while the vCPU is marked for removal, awaiting guest ejection.
    removing: bool,
    // Join handle of the vCPU thread; Some(..) means the vCPU is active.
    handle: Option<thread::JoinHandle<()>>,
    // Per-vCPU terminate flag (in addition to the global kill signal).
    kill: Arc<AtomicBool>,
    // Set by the vCPU thread when it has left the run loop (paused/exiting).
    vcpu_run_interrupted: Arc<AtomicBool>,
}
469 
impl VcpuState {
    // A vCPU counts as active while its thread handle is held.
    fn active(&self) -> bool {
        self.handle.is_some()
    }

    // Repeatedly signals the vCPU thread with SIGRTMIN until it confirms (via
    // vcpu_run_interrupted) that it has been kicked out of the run loop.
    fn signal_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            loop {
                // SAFETY comment: FFI call; the pthread_t comes from a live
                // JoinHandle we still own, and the vCPU thread registers a
                // no-op SIGRTMIN handler (see start_vcpu), so the signal only
                // interrupts the blocking vcpu run call.
                unsafe {
                    libc::pthread_kill(handle.as_pthread_t() as _, SIGRTMIN());
                }
                if self.vcpu_run_interrupted.load(Ordering::SeqCst) {
                    break;
                } else {
                    // This is more effective than thread::yield_now() at
                    // avoiding a priority inversion with the vCPU thread
                    thread::sleep(std::time::Duration::from_millis(1));
                }
            }
        }
    }

    // Joins the vCPU thread, taking the handle so active() becomes false.
    // Propagates a panic from the thread as Error::ThreadCleanup.
    fn join_thread(&mut self) -> Result<()> {
        if let Some(handle) = self.handle.take() {
            handle.join().map_err(Error::ThreadCleanup)?
        }

        Ok(())
    }

    // Unparks a paused vCPU thread (used on resume).
    fn unpark_thread(&self) {
        if let Some(handle) = self.handle.as_ref() {
            handle.thread().unpark()
        }
    }
}
506 
507 impl CpuManager {
    /// Builds the CpuManager: allocates per-vCPU state slots, computes the
    /// common CPUID (x86_64), reserves and registers the ACPI MMIO region,
    /// and derives the cpu -> NUMA proximity-domain map.
    ///
    /// No vCPUs are created or started here; see create_boot_vcpus() /
    /// start_boot_vcpus().
    #[allow(unused_variables)]
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: &CpusConfig,
        device_manager: &Arc<Mutex<DeviceManager>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        seccomp_action: SeccompAction,
        vmmops: Arc<Box<dyn VmmOps>>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(feature = "acpi")] numa_nodes: &NumaNodes,
    ) -> Result<Arc<Mutex<CpuManager>>> {
        let guest_memory = memory_manager.lock().unwrap().guest_memory();
        // One state slot per possible vCPU, up front, so hotplug never
        // reallocates the vector.
        let mut vcpu_states = Vec::with_capacity(usize::from(config.max_vcpus));
        vcpu_states.resize_with(usize::from(config.max_vcpus), VcpuState::default);

        #[cfg(target_arch = "x86_64")]
        let sgx_epc_sections = memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .map(|sgx_epc_region| sgx_epc_region.epc_sections().values().cloned().collect());
        // The common CPUID is computed once and cloned into each vCPU at
        // configuration time.
        #[cfg(target_arch = "x86_64")]
        let cpuid = {
            let phys_bits = physical_bits(
                config.max_phys_bits,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            );
            arch::generate_common_cpuid(
                hypervisor,
                config
                    .topology
                    .clone()
                    .map(|t| (t.threads_per_core, t.cores_per_die, t.dies_per_package)),
                sgx_epc_sections,
                phys_bits,
                config.kvm_hyperv,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CommonCpuId)?
        };

        let device_manager = device_manager.lock().unwrap();
        // Reserve the MMIO window backing the CPU hotplug ACPI device.
        #[cfg(feature = "acpi")]
        let acpi_address = device_manager
            .allocator()
            .lock()
            .unwrap()
            .allocate_mmio_addresses(None, CPU_MANAGER_ACPI_SIZE as u64, None)
            .ok_or(Error::AllocateMmmioAddress)?;

        // Invert the NumaNodes map (node -> cpus) into cpu -> node.
        #[cfg(feature = "acpi")]
        let proximity_domain_per_cpu: BTreeMap<u8, u32> = {
            let mut cpu_list = Vec::new();
            for (proximity_domain, numa_node) in numa_nodes.iter() {
                for cpu in numa_node.cpus().iter() {
                    cpu_list.push((*cpu, *proximity_domain))
                }
            }
            cpu_list
        }
        .into_iter()
        .collect();

        let cpu_manager = Arc::new(Mutex::new(CpuManager {
            config: config.clone(),
            interrupt_controller: device_manager.interrupt_controller().clone(),
            vm_memory: guest_memory,
            #[cfg(target_arch = "x86_64")]
            cpuid,
            vm,
            vcpus_kill_signalled: Arc::new(AtomicBool::new(false)),
            vcpus_pause_signalled: Arc::new(AtomicBool::new(false)),
            vcpu_states,
            exit_evt,
            reset_evt,
            selected_cpu: 0,
            vcpus: Vec::with_capacity(usize::from(config.max_vcpus)),
            seccomp_action,
            vmmops,
            #[cfg(feature = "acpi")]
            acpi_address,
            #[cfg(feature = "acpi")]
            proximity_domain_per_cpu,
        }));

        // Expose the manager itself as the BusDevice behind the reserved
        // MMIO window so the guest's ACPI code can drive CPU hotplug.
        #[cfg(feature = "acpi")]
        device_manager
            .mmio_bus()
            .insert(
                cpu_manager.clone(),
                acpi_address.0,
                CPU_MANAGER_ACPI_SIZE as u64,
            )
            .map_err(Error::BusError)?;

        Ok(cpu_manager)
    }
612 
    /// Creates one vCPU and appends it to the manager's list.
    ///
    /// With a `snapshot`, the vCPU state is restored from it (applied to the
    /// hypervisor later, on resume); otherwise the vCPU is configured fresh
    /// from `entry_point` and the common CPUID/memory settings.
    fn create_vcpu(
        &mut self,
        cpu_id: u8,
        entry_point: Option<EntryPoint>,
        snapshot: Option<Snapshot>,
    ) -> Result<Arc<Mutex<Vcpu>>> {
        info!("Creating vCPU: cpu_id = {}", cpu_id);

        let vcpu = Vcpu::new(cpu_id, &self.vm, Some(self.vmmops.clone()))?;

        if let Some(snapshot) = snapshot {
            // AArch64 vCPUs should be initialized after created.
            #[cfg(target_arch = "aarch64")]
            vcpu.lock().unwrap().init(&self.vm)?;

            vcpu.lock()
                .unwrap()
                .restore(snapshot)
                .expect("Failed to restore vCPU");
        } else {
            let vm_memory = self.vm_memory.clone();

            // The two configure() signatures differ per architecture.
            #[cfg(target_arch = "x86_64")]
            vcpu.lock()
                .unwrap()
                .configure(
                    entry_point,
                    &vm_memory,
                    self.cpuid.clone(),
                    self.config.kvm_hyperv,
                )
                .expect("Failed to configure vCPU");

            #[cfg(target_arch = "aarch64")]
            vcpu.lock()
                .unwrap()
                .configure(&self.vm, entry_point, &vm_memory)
                .expect("Failed to configure vCPU");
        }

        // Adding vCPU to the CpuManager's vCPU list.
        self.vcpus.push(Arc::clone(&vcpu));

        Ok(vcpu)
    }
658 
659     /// Only create new vCPUs if there aren't any inactive ones to reuse
660     fn create_vcpus(&mut self, desired_vcpus: u8, entry_point: Option<EntryPoint>) -> Result<()> {
661         info!(
662             "Request to create new vCPUs: desired = {}, max = {}, allocated = {}, present = {}",
663             desired_vcpus,
664             self.config.max_vcpus,
665             self.vcpus.len(),
666             self.present_vcpus()
667         );
668 
669         if desired_vcpus > self.config.max_vcpus {
670             return Err(Error::DesiredVCpuCountExceedsMax);
671         }
672 
673         // Only create vCPUs in excess of all the allocated vCPUs.
674         for cpu_id in self.vcpus.len() as u8..desired_vcpus {
675             self.create_vcpu(cpu_id, entry_point, None)?;
676         }
677 
678         Ok(())
679     }
680 
    /// Spawns the thread that runs `vcpu` and records its handle in the
    /// per-vCPU state. The thread applies its seccomp filter, registers a
    /// no-op SIGRTMIN handler (so signal_thread() can interrupt the blocking
    /// run call), waits on the barrier, then enters the run loop until a
    /// kill/reset/shutdown condition breaks it.
    fn start_vcpu(
        &mut self,
        vcpu: Arc<Mutex<Vcpu>>,
        vcpu_thread_barrier: Arc<Barrier>,
        inserting: bool,
    ) -> Result<()> {
        let cpu_id = vcpu.lock().unwrap().id;
        let reset_evt = self.reset_evt.try_clone().unwrap();
        let exit_evt = self.exit_evt.try_clone().unwrap();
        let vcpu_kill_signalled = self.vcpus_kill_signalled.clone();
        let vcpu_pause_signalled = self.vcpus_pause_signalled.clone();

        let vcpu_kill = self.vcpu_states[usize::from(cpu_id)].kill.clone();
        let vcpu_run_interrupted = self.vcpu_states[usize::from(cpu_id)]
            .vcpu_run_interrupted
            .clone();

        info!("Starting vCPU: cpu_id = {}", cpu_id);

        // Retrieve seccomp filter for vcpu thread
        let vcpu_seccomp_filter = get_seccomp_filter(&self.seccomp_action, Thread::Vcpu)
            .map_err(Error::CreateSeccompFilter)?;

        #[cfg(target_arch = "x86_64")]
        let interrupt_controller_clone = self.interrupt_controller.as_ref().cloned();

        let handle = Some(
            thread::Builder::new()
                .name(format!("vcpu{}", cpu_id))
                .spawn(move || {
                    // Apply seccomp filter for vcpu thread.
                    if let Err(e) =
                        SeccompFilter::apply(vcpu_seccomp_filter).map_err(Error::ApplySeccompFilter)
                    {
                        error!("Error applying seccomp filter: {:?}", e);
                        return;
                    }

                    extern "C" fn handle_signal(_: i32, _: *mut siginfo_t, _: *mut c_void) {}
                    // This uses an async signal safe handler to kill the vcpu handles.
                    register_signal_handler(SIGRTMIN(), handle_signal)
                        .expect("Failed to register vcpu signal handler");

                    // Block until all CPUs are ready.
                    vcpu_thread_barrier.wait();

                    loop {
                        // If we are being told to pause, we park the thread
                        // until the pause boolean is toggled.
                        // The resume operation is responsible for toggling
                        // the boolean and unpark the thread.
                        // We enter a loop because park() could spuriously
                        // return. We will then park() again unless the
                        // pause boolean has been toggled.

                        // Need to use Ordering::SeqCst as we have multiple
                        // loads and stores to different atomics and we need
                        // to see them in a consistent order in all threads

                        if vcpu_pause_signalled.load(Ordering::SeqCst) {
                            vcpu_run_interrupted.store(true, Ordering::SeqCst);
                            while vcpu_pause_signalled.load(Ordering::SeqCst) {
                                thread::park();
                            }
                            vcpu_run_interrupted.store(false, Ordering::SeqCst);
                        }

                        // We've been told to terminate
                        if vcpu_kill_signalled.load(Ordering::SeqCst)
                            || vcpu_kill.load(Ordering::SeqCst)
                        {
                            vcpu_run_interrupted.store(true, Ordering::SeqCst);
                            break;
                        }

                        // vcpu.run() returns false on a triple-fault so trigger a reset
                        match vcpu.lock().unwrap().run() {
                            Ok(run) => match run {
                                #[cfg(target_arch = "x86_64")]
                                VmExit::IoapicEoi(vector) => {
                                    if let Some(interrupt_controller) = &interrupt_controller_clone
                                    {
                                        interrupt_controller
                                            .lock()
                                            .unwrap()
                                            .end_of_interrupt(vector);
                                    }
                                }
                                VmExit::Ignore => {}
                                VmExit::Hyperv => {}
                                VmExit::Reset => {
                                    debug!("VmExit::Reset");
                                    vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                    reset_evt.write(1).unwrap();
                                    break;
                                }
                                VmExit::Shutdown => {
                                    debug!("VmExit::Shutdown");
                                    vcpu_run_interrupted.store(true, Ordering::SeqCst);
                                    exit_evt.write(1).unwrap();
                                    break;
                                }
                                _ => {
                                    error!("VCPU generated error: {:?}", Error::UnexpectedVmExit);
                                    break;
                                }
                            },

                            Err(e) => {
                                error!("VCPU generated error: {:?}", Error::VcpuRun(e.into()));
                                break;
                            }
                        }

                        // We've been told to terminate
                        if vcpu_kill_signalled.load(Ordering::SeqCst)
                            || vcpu_kill.load(Ordering::SeqCst)
                        {
                            vcpu_run_interrupted.store(true, Ordering::SeqCst);
                            break;
                        }
                    }
                })
                .map_err(Error::VcpuSpawn)?,
        );

        // On hot plug calls into this function entry_point is None. It is for
        // those hotplug CPU additions that we need to set the inserting flag.
        self.vcpu_states[usize::from(cpu_id)].handle = handle;
        self.vcpu_states[usize::from(cpu_id)].inserting = inserting;

        Ok(())
    }
814 
    /// Start up as many vCPUs threads as needed to reach `desired_vcpus`
    fn activate_vcpus(&mut self, desired_vcpus: u8, inserting: bool) -> Result<()> {
        if desired_vcpus > self.config.max_vcpus {
            return Err(Error::DesiredVCpuCountExceedsMax);
        }

        // Barrier sized for every newly started thread plus this one, so all
        // vCPU threads start running together after the wait() below.
        // NOTE(review): the subtraction underflows (panicking) if
        // desired_vcpus < present_vcpus(); current callers only invoke this
        // with desired >= present — confirm before adding new call sites.
        let vcpu_thread_barrier = Arc::new(Barrier::new(
            (desired_vcpus - self.present_vcpus() + 1) as usize,
        ));

        info!(
            "Starting vCPUs: desired = {}, allocated = {}, present = {}",
            desired_vcpus,
            self.vcpus.len(),
            self.present_vcpus()
        );

        // This reuses any inactive vCPUs as well as any that were newly created
        for cpu_id in self.present_vcpus()..desired_vcpus {
            let vcpu = Arc::clone(&self.vcpus[cpu_id as usize]);
            self.start_vcpu(vcpu, vcpu_thread_barrier.clone(), inserting)?;
        }

        // Unblock all CPU threads.
        vcpu_thread_barrier.wait();
        Ok(())
    }
842 
843     fn mark_vcpus_for_removal(&mut self, desired_vcpus: u8) {
844         // Mark vCPUs for removal, actual removal happens on ejection
845         for cpu_id in desired_vcpus..self.present_vcpus() {
846             self.vcpu_states[usize::from(cpu_id)].removing = true;
847         }
848     }
849 
850     fn remove_vcpu(&mut self, cpu_id: u8) -> Result<()> {
851         info!("Removing vCPU: cpu_id = {}", cpu_id);
852         let mut state = &mut self.vcpu_states[usize::from(cpu_id)];
853         state.kill.store(true, Ordering::SeqCst);
854         state.signal_thread();
855         state.join_thread()?;
856         state.handle = None;
857 
858         // Once the thread has exited, clear the "kill" so that it can reused
859         state.kill.store(false, Ordering::SeqCst);
860 
861         Ok(())
862     }
863 
    /// Create (but do not start) all the vCPUs the VM boots with.
    ///
    /// `entry_point` may be `None`, e.g. when the vCPU state will come from a
    /// snapshot instead of a fresh boot.
    pub fn create_boot_vcpus(&mut self, entry_point: Option<EntryPoint>) -> Result<()> {
        self.create_vcpus(self.boot_vcpus(), entry_point)
    }
867 
868     // Starts all the vCPUs that the VM is booting with. Blocks until all vCPUs are running.
869     pub fn start_boot_vcpus(&mut self) -> Result<()> {
870         self.activate_vcpus(self.boot_vcpus(), false)
871     }
872 
873     pub fn start_restored_vcpus(&mut self) -> Result<()> {
874         let vcpu_numbers = self.vcpus.len();
875         let vcpu_thread_barrier = Arc::new(Barrier::new((vcpu_numbers + 1) as usize));
876         // Restore the vCPUs in "paused" state.
877         self.vcpus_pause_signalled.store(true, Ordering::SeqCst);
878 
879         for vcpu_index in 0..vcpu_numbers {
880             let vcpu = Arc::clone(&self.vcpus[vcpu_index as usize]);
881 
882             self.start_vcpu(vcpu, vcpu_thread_barrier.clone(), false)
883                 .map_err(|e| {
884                     Error::StartRestoreVcpu(anyhow!("Failed to start restored vCPUs: {:#?}", e))
885                 })?;
886         }
887         // Unblock all restored CPU threads.
888         vcpu_thread_barrier.wait();
889         Ok(())
890     }
891 
892     pub fn resize(&mut self, desired_vcpus: u8) -> Result<bool> {
893         match desired_vcpus.cmp(&self.present_vcpus()) {
894             cmp::Ordering::Greater => {
895                 self.create_vcpus(desired_vcpus, None)?;
896                 self.activate_vcpus(desired_vcpus, true)?;
897                 Ok(true)
898             }
899             cmp::Ordering::Less => {
900                 self.mark_vcpus_for_removal(desired_vcpus);
901                 Ok(true)
902             }
903             _ => Ok(false),
904         }
905     }
906 
    /// Stop every vCPU thread and join them, draining all per-vCPU state.
    pub fn shutdown(&mut self) -> Result<()> {
        // Tell the vCPUs to stop themselves next time they go through the loop
        self.vcpus_kill_signalled.store(true, Ordering::SeqCst);

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        // Wait for all the threads to finish. This removes the state from the vector.
        for mut state in self.vcpu_states.drain(..) {
            state.join_thread()?;
        }

        Ok(())
    }
933 
934     #[cfg(feature = "tdx")]
935     pub fn initialize_tdx(&self, hob_address: u64) -> Result<()> {
936         for vcpu in &self.vcpus {
937             vcpu.lock()
938                 .unwrap()
939                 .vcpu
940                 .tdx_init(hob_address)
941                 .map_err(Error::InitializeTdx)?;
942         }
943         Ok(())
944     }
945 
    /// Number of vCPUs the VM boots with.
    pub fn boot_vcpus(&self) -> u8 {
        self.config.boot_vcpus
    }
949 
    /// Maximum number of vCPUs the VM can ever be resized up to.
    pub fn max_vcpus(&self) -> u8 {
        self.config.max_vcpus
    }
953 
    #[cfg(target_arch = "x86_64")]
    /// A copy of the CPUID configuration shared by all vCPUs.
    pub fn common_cpuid(&self) -> CpuId {
        self.cpuid.clone()
    }
958 
959     fn present_vcpus(&self) -> u8 {
960         self.vcpu_states
961             .iter()
962             .fold(0, |acc, state| acc + state.active() as u8)
963     }
964 
965     #[cfg(target_arch = "aarch64")]
966     pub fn get_mpidrs(&self) -> Vec<u64> {
967         self.vcpus
968             .iter()
969             .map(|cpu| cpu.lock().unwrap().get_mpidr())
970             .collect()
971     }
972 
973     #[cfg(target_arch = "aarch64")]
974     pub fn get_saved_states(&self) -> Vec<CpuState> {
975         self.vcpus
976             .iter()
977             .map(|cpu| cpu.lock().unwrap().get_saved_state().unwrap())
978             .collect()
979     }
980 
    #[cfg(feature = "acpi")]
    /// Build the ACPI MADT (Multiple APIC Description Table, signature
    /// "APIC") describing the guest's interrupt controller topology:
    /// Local APICs, the I/O APIC and an interrupt override on x86_64;
    /// GICC/GICD/GICR/ITS structures on AArch64.
    pub fn create_madt(&self) -> Sdt {
        use crate::acpi;
        // This is also checked in the commandline parsing.
        assert!(self.config.boot_vcpus <= self.config.max_vcpus);

        let mut madt = Sdt::new(*b"APIC", 44, 5, *b"CLOUDH", *b"CHMADT  ", 1);
        #[cfg(target_arch = "x86_64")]
        {
            // Local Interrupt Controller Address field lives at byte offset 36.
            madt.write(36, arch::layout::APIC_START);

            // One Local APIC entry per possible vCPU; only boot vCPUs get the
            // enabled flag, the rest are brought online via hotplug later.
            for cpu in 0..self.config.max_vcpus {
                let lapic = LocalApic {
                    r#type: acpi::ACPI_APIC_PROCESSOR,
                    length: 8,
                    processor_id: cpu,
                    apic_id: cpu,
                    flags: if cpu < self.config.boot_vcpus {
                        1 << MADT_CPU_ENABLE_FLAG
                    } else {
                        0
                    },
                };
                madt.append(lapic);
            }

            // Single I/O APIC handling GSIs starting at 0.
            madt.append(Ioapic {
                r#type: acpi::ACPI_APIC_IO,
                length: 12,
                ioapic_id: 0,
                apic_address: arch::layout::IOAPIC_START.0 as u32,
                gsi_base: 0,
                ..Default::default()
            });

            // Identity override for ISA IRQ4 — presumably for the serial
            // port; TODO confirm against the device manager's IRQ layout.
            madt.append(InterruptSourceOverride {
                r#type: acpi::ACPI_APIC_XRUPT_OVERRIDE,
                length: 10,
                bus: 0,
                source: 4,
                gsi: 4,
                flags: 0,
            });
        }

        #[cfg(target_arch = "aarch64")]
        {
            /* Notes:
             * Ignore Local Interrupt Controller Address at byte offset 36 of MADT table.
             */

            // See section 5.2.12.14 GIC CPU Interface (GICC) Structure in ACPI spec.
            for cpu in 0..self.config.boot_vcpus {
                let vcpu = &self.vcpus[cpu as usize];
                let mpidr = vcpu.lock().unwrap().get_mpidr();
                /* ARMv8 MPIDR format:
                     Bits [63:40] Must be zero
                     Bits [39:32] Aff3 : Match Aff3 of target processor MPIDR
                     Bits [31:24] Must be zero
                     Bits [23:16] Aff2 : Match Aff2 of target processor MPIDR
                     Bits [15:8] Aff1 : Match Aff1 of target processor MPIDR
                     Bits [7:0] Aff0 : Match Aff0 of target processor MPIDR
                */
                let mpidr_mask = 0xff_00ff_ffff;
                let gicc = GicC {
                    r#type: acpi::ACPI_APIC_GENERIC_CPU_INTERFACE,
                    length: 80,
                    reserved0: 0,
                    cpu_interface_number: cpu as u32,
                    uid: cpu as u32,
                    flags: 1,
                    parking_version: 0,
                    performance_interrupt: 0,
                    parked_address: 0,
                    base_address: 0,
                    gicv_base_address: 0,
                    gich_base_address: 0,
                    vgic_interrupt: 0,
                    gicr_base_address: 0,
                    mpidr: mpidr & mpidr_mask,
                    proc_power_effi_class: 0,
                    reserved1: 0,
                    spe_overflow_interrupt: 0,
                };

                madt.append(gicc);
            }

            // GIC Distributor structure. See section 5.2.12.15 in ACPI spec.
            let gicd = GicD {
                r#type: acpi::ACPI_APIC_GENERIC_DISTRIBUTOR,
                length: 24,
                reserved0: 0,
                gic_id: 0,
                base_address: arch::layout::MAPPED_IO_START - 0x0001_0000,
                global_irq_base: 0,
                version: 3,
                reserved1: [0; 3],
            };
            madt.append(gicd);

            // See 5.2.12.17 GIC Redistributor (GICR) Structure in ACPI spec.
            // Two 64 KiB frames per boot vCPU, laid out below the distributor.
            let gicr_size: u32 = 0x0001_0000 * 2 * (self.config.boot_vcpus as u32);
            let gicr_base: u64 = arch::layout::MAPPED_IO_START - 0x0001_0000 - gicr_size as u64;
            let gicr = GicR {
                r#type: acpi::ACPI_APIC_GENERIC_REDISTRIBUTOR,
                length: 16,
                reserved: 0,
                base_address: gicr_base,
                range_length: gicr_size,
            };
            madt.append(gicr);

            // See 5.2.12.18 GIC Interrupt Translation Service (ITS) Structure in ACPI spec.
            let gicits = GicIts {
                r#type: acpi::ACPI_APIC_GENERIC_TRANSLATOR,
                length: 20,
                reserved0: 0,
                translation_id: 0,
                base_address: gicr_base - 2 * 0x0001_0000,
                reserved1: 0,
            };
            madt.append(gicits);

            madt.update_checksum();
        }

        madt
    }
1110 }
1111 
#[cfg(feature = "acpi")]
/// Helper describing one guest CPU for ACPI (AML device and _MAT) generation.
struct Cpu {
    // Zero-based CPU identifier; used as both processor id and APIC id.
    cpu_id: u8,
    // NUMA proximity domain reported through the device's _PXM method.
    proximity_domain: u32,
}
1117 
#[cfg(all(target_arch = "x86_64", feature = "acpi"))]
// Bit position of the "enabled" flag within a MADT Local APIC entry's flags.
const MADT_CPU_ENABLE_FLAG: usize = 0;
1120 
#[cfg(feature = "acpi")]
impl Cpu {
    /// Serialize a Local APIC MADT entry for this CPU with the enabled flag
    /// set, for use as the _MAT payload of the CPU's ACPI device.
    #[cfg(target_arch = "x86_64")]
    fn generate_mat(&self) -> Vec<u8> {
        let lapic = LocalApic {
            // 0 is the Processor Local APIC structure type.
            r#type: 0,
            length: 8,
            processor_id: self.cpu_id,
            apic_id: self.cpu_id,
            flags: 1 << MADT_CPU_ENABLE_FLAG,
        };

        // Allocate a zeroed buffer of exactly the structure's size
        // (idiomatic vec![] instead of Vec::new() + resize()).
        let mut mat_data: Vec<u8> = vec![0; std::mem::size_of_val(&lapic)];
        // SAFETY: mat_data is exactly size_of_val(&lapic) bytes long. The raw
        // write assumes `LocalApic` tolerates unaligned stores (i.e. is a
        // packed plain-data struct) — same assumption as the original code;
        // TODO confirm against the LocalApic declaration.
        unsafe { *(mat_data.as_mut_ptr() as *mut LocalApic) = lapic };

        mat_data
    }
}
1140 
#[cfg(feature = "acpi")]
impl Aml for Cpu {
    /// Emit this CPU as an ACPI device named `C###` with _HID "ACPI0007",
    /// a _PXM method returning its proximity domain and, on x86_64, the
    /// _STA/_MAT/_EJ0 hotplug plumbing delegating to the shared CSTA/CEJ0
    /// methods.
    fn to_aml_bytes(&self) -> Vec<u8> {
        #[cfg(target_arch = "x86_64")]
        let mat_data: Vec<u8> = self.generate_mat();

        aml::Device::new(
            format!("C{:03}", self.cpu_id).as_str().into(),
            vec![
                &aml::Name::new("_HID".into(), &"ACPI0007"),
                &aml::Name::new("_UID".into(), &self.cpu_id),
                // Currently, AArch64 cannot support following fields.
                /*
                _STA return value:
                Bit [0] – Set if the device is present.
                Bit [1] – Set if the device is enabled and decoding its resources.
                Bit [2] – Set if the device should be shown in the UI.
                Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                Bit [4] – Set if the battery is present.
                Bits [31:5] – Reserved (must be cleared).
                */
                #[cfg(target_arch = "x86_64")]
                &aml::Method::new(
                    "_STA".into(),
                    0,
                    false,
                    // Call into CSTA method which will interrogate device
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "CSTA".into(),
                        vec![&self.cpu_id],
                    ))],
                ),
                &aml::Method::new(
                    "_PXM".into(),
                    0,
                    false,
                    vec![&aml::Return::new(&self.proximity_domain)],
                ),
                // The Linux kernel expects every CPU device to have a _MAT entry
                // containing the LAPIC for this processor with the enabled bit set
                // even it if is disabled in the MADT (non-boot CPU)
                #[cfg(target_arch = "x86_64")]
                &aml::Name::new("_MAT".into(), &aml::Buffer::new(mat_data)),
                // Trigger CPU ejection
                #[cfg(target_arch = "x86_64")]
                &aml::Method::new(
                    "_EJ0".into(),
                    1,
                    false,
                    // Call into CEJ0 method which will actually eject device
                    vec![&aml::MethodCall::new("CEJ0".into(), vec![&self.cpu_id])],
                ),
            ],
        )
        .to_aml_bytes()
    }
}
1198 
#[cfg(feature = "acpi")]
/// Helper emitting the AML that notifies one CPU device (`C###`) from the
/// CTFY dispatch method.
struct CpuNotify {
    cpu_id: u8,
}
1203 
#[cfg(feature = "acpi")]
impl Aml for CpuNotify {
    /// Emit: `If (Arg0 == cpu_id) { Notify(C###, Arg1) }`.
    fn to_aml_bytes(&self) -> Vec<u8> {
        let cpu_path = aml::Path::new(&format!("C{:03}", self.cpu_id));
        let notify = aml::Notify::new(&cpu_path, &aml::Arg(1));
        let guard = aml::Equal::new(&aml::Arg(0), &self.cpu_id);
        aml::If::new(&guard, vec![&notify]).to_aml_bytes()
    }
}
1215 
#[cfg(feature = "acpi")]
/// Generator for the shared CPU hotplug AML methods (CSTA, CTFY, CEJ0, CSCN).
struct CpuMethods {
    // Number of CPU slots the generated methods have to cover.
    max_vcpus: u8,
}
1220 
#[cfg(feature = "acpi")]
impl Aml for CpuMethods {
    /// Emit the shared hotplug helper methods. They all talk to the
    /// `\_SB_.PRES` hotplug controller fields (CSEL/CPEN/CINS/CRMV/CEJ0)
    /// under the CPLK mutex.
    fn to_aml_bytes(&self) -> Vec<u8> {
        let mut bytes = Vec::new();
        bytes.extend_from_slice(
            // CPU status method
            &aml::Method::new(
                "CSTA".into(),
                1,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    // Check if CPEN bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                    &aml::If::new(
                        &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CPEN"), &aml::ONE),
                        vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                    // Return 0 or 0xf
                    &aml::Return::new(&aml::Local(0)),
                ],
            )
            .to_aml_bytes(),
        );

        // One notification stanza per possible CPU; CTFY(cpu_id, event)
        // dispatches a Notify to the matching C### device.
        let mut cpu_notifies = Vec::new();
        for cpu_id in 0..self.max_vcpus {
            cpu_notifies.push(CpuNotify { cpu_id });
        }

        let mut cpu_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
        for cpu_id in 0..self.max_vcpus {
            cpu_notifies_refs.push(&cpu_notifies[usize::from(cpu_id)]);
        }

        bytes.extend_from_slice(
            &aml::Method::new("CTFY".into(), 2, true, cpu_notifies_refs).to_aml_bytes(),
        );

        // CEJ0(cpu_id): tell the hotplug controller to eject the given CPU.
        bytes.extend_from_slice(
            &aml::Method::new(
                "CEJ0".into(),
                1,
                true,
                vec![
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    // Write CPU number (in first argument) to I/O port via field
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Arg(0)),
                    // Set CEJ0 bit
                    &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CEJ0"), &aml::ONE),
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(),
        );

        // CSCN(): scan all CPU slots and notify the guest about any pending
        // insertions or removals.
        bytes.extend_from_slice(
            &aml::Method::new(
                "CSCN".into(),
                0,
                true,
                vec![
                    // Take lock defined above
                    &aml::Acquire::new("\\_SB_.PRES.CPLK".into(), 0xffff),
                    &aml::Store::new(&aml::Local(0), &aml::ZERO),
                    &aml::While::new(
                        &aml::LessThan::new(&aml::Local(0), &self.max_vcpus),
                        vec![
                            // Write CPU number (in first argument) to I/O port via field
                            &aml::Store::new(&aml::Path::new("\\_SB_.PRES.CSEL"), &aml::Local(0)),
                            // Check if CINS bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CINS"), &aml::ONE),
                                // Notify device if it is
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &aml::ONE],
                                    ),
                                    // Reset CINS bit (the write of ONE is
                                    // interpreted by the controller; the
                                    // field's update rule is WriteAsZeroes)
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CINS"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            // Check if CRMV bit is set
                            &aml::If::new(
                                &aml::Equal::new(&aml::Path::new("\\_SB_.PRES.CRMV"), &aml::ONE),
                                // Notify device if it is (with the eject constant 0x3)
                                vec![
                                    &aml::MethodCall::new(
                                        "CTFY".into(),
                                        vec![&aml::Local(0), &3u8],
                                    ),
                                    // Reset CRMV bit
                                    &aml::Store::new(
                                        &aml::Path::new("\\_SB_.PRES.CRMV"),
                                        &aml::ONE,
                                    ),
                                ],
                            ),
                            &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                        ],
                    ),
                    // Release lock
                    &aml::Release::new("\\_SB_.PRES.CPLK".into()),
                ],
            )
            .to_aml_bytes(),
        );
        bytes
    }
}
1340 
#[cfg(feature = "acpi")]
impl Aml for CpuManager {
    /// Emit the CPU description/hotplug AML: the `\_SB_.PRES` hotplug
    /// controller device (x86_64 only) plus a `\_SB_.CPUS` container holding
    /// the shared hotplug methods and one device per possible CPU.
    fn to_aml_bytes(&self) -> Vec<u8> {
        let mut bytes = Vec::new();
        // CPU hotplug controller
        #[cfg(target_arch = "x86_64")]
        bytes.extend_from_slice(
            &aml::Device::new(
                "_SB_.PRES".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"CPU Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose CPU and then read back status
                    &aml::Mutex::new("CPLK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCachable::NotCacheable,
                            true,
                            self.acpi_address.0 as u64,
                            self.acpi_address.0 + CPU_MANAGER_ACPI_SIZE as u64 - 1,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "PRST".into(),
                        aml::OpRegionSpace::SystemMemory,
                        self.acpi_address.0 as usize,
                        CPU_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CPEN", 1),
                            aml::FieldEntry::Named(*b"CINS", 1),
                            aml::FieldEntry::Named(*b"CRMV", 1),
                            aml::FieldEntry::Named(*b"CEJ0", 1),
                            aml::FieldEntry::Reserved(4),
                            aml::FieldEntry::Named(*b"CCMD", 8),
                        ],
                    ),
                    &aml::Field::new(
                        "PRST".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"CSEL", 32),
                            aml::FieldEntry::Reserved(32),
                            aml::FieldEntry::Named(*b"CDAT", 32),
                        ],
                    ),
                ],
            )
            .to_aml_bytes(),
        );

        // CPU devices
        let hid = aml::Name::new("_HID".into(), &"ACPI0010");
        let uid = aml::Name::new("_CID".into(), &aml::EisaName::new("PNP0A05"));
        // Bundle methods together under a common object
        let methods = CpuMethods {
            max_vcpus: self.config.max_vcpus,
        };
        let mut cpu_data_inner: Vec<&dyn aml::Aml> = vec![&hid, &uid, &methods];

        // One Cpu device per possible vCPU, each tagged with its NUMA
        // proximity domain (0 when the CPU is not in any configured node).
        let mut cpu_devices = Vec::new();
        for cpu_id in 0..self.config.max_vcpus {
            let proximity_domain = *self.proximity_domain_per_cpu.get(&cpu_id).unwrap_or(&0);
            let cpu_device = Cpu {
                cpu_id,
                proximity_domain,
            };

            cpu_devices.push(cpu_device);
        }

        for cpu_device in cpu_devices.iter() {
            cpu_data_inner.push(cpu_device);
        }

        bytes.extend_from_slice(
            &aml::Device::new("_SB_.CPUS".into(), cpu_data_inner).to_aml_bytes(),
        );
        bytes
    }
}
1430 
impl Pausable for CpuManager {
    /// Pause all vCPUs: raise the pause flag, interrupt the vCPU threads so
    /// they observe it, then pause each vCPU object.
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        // Tell the vCPUs to pause themselves next time they exit
        self.vcpus_pause_signalled.store(true, Ordering::SeqCst);

        // Signal to the spawned threads (vCPUs and console signal handler). For the vCPU threads
        // this will interrupt the KVM_RUN ioctl() allowing the loop to check the boolean set
        // above.
        for state in self.vcpu_states.iter() {
            state.signal_thread();
        }

        for vcpu in self.vcpus.iter() {
            let mut vcpu = vcpu.lock().unwrap();
            vcpu.pause()?;
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            if !self.config.kvm_hyperv {
                // Tell KVM the guest clock is stopped so guest time does not
                // jump on resume. Skipped with Hyper-V emulation enabled —
                // presumably incompatible; see the kvm_hyperv config option.
                vcpu.vcpu.notify_guest_clock_paused().map_err(|e| {
                    MigratableError::Pause(anyhow!(
                        "Could not notify guest it has been paused {:?}",
                        e
                    ))
                })?;
            }
        }

        Ok(())
    }

    /// Resume all vCPUs: clear the pause flag, then unpark the threads so
    /// they leave their pause loop.
    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        for vcpu in self.vcpus.iter() {
            vcpu.lock().unwrap().resume()?;
        }

        // Toggle the vCPUs pause boolean
        self.vcpus_pause_signalled.store(false, Ordering::SeqCst);

        // Unpark all the VCPU threads.
        // Once unparked, the next thing they will do is checking for the pause
        // boolean. Since it'll be set to false, they will exit their pause loop
        // and go back to vmx root.
        for state in self.vcpu_states.iter() {
            state.unpark_thread();
        }
        Ok(())
    }
}
1478 
impl Snapshottable for CpuManager {
    fn id(&self) -> String {
        CPU_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        let mut cpu_manager_snapshot = Snapshot::new(CPU_MANAGER_SNAPSHOT_ID);

        // The CpuManager snapshot is a collection of all vCPUs snapshots.
        for vcpu in &self.vcpus {
            let cpu_snapshot = vcpu.lock().unwrap().snapshot()?;
            cpu_manager_snapshot.add_snapshot(cpu_snapshot);
        }

        Ok(cpu_manager_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        // Sub-snapshots are keyed by the vCPU id (parsed back to u8 below).
        // Only the vCPU objects are recreated here; the threads are restarted
        // separately via start_restored_vcpus().
        for (cpu_id, snapshot) in snapshot.snapshots.iter() {
            debug!("Restoring VCPU {}", cpu_id);
            self.create_vcpu(cpu_id.parse::<u8>().unwrap(), None, Some(*snapshot.clone()))
                .map_err(|e| MigratableError::Restore(anyhow!("Could not create vCPU {:?}", e)))?;
        }

        Ok(())
    }
}
1506 
// No transport- or migration-specific behaviour is needed beyond the
// Pausable and Snapshottable implementations; the default trait methods
// are sufficient.
impl Transportable for CpuManager {}
impl Migratable for CpuManager {}
1509 
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use arch::x86_64::interrupts::*;
    use arch::x86_64::regs::*;
    use hypervisor::x86_64::{FpuState, LapicState, StandardRegisters};

    // Verify that set_lint() switches LVT0 to ExtINT delivery and LVT1 to
    // NMI delivery while leaving the rest of the registers intact.
    #[test]
    fn test_setlint() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        assert!(hv.check_required_extensions().is_ok());
        // Calling get_lapic will fail if there is no irqchip before hand.
        assert!(vm.create_irq_chip().is_ok());
        let vcpu = vm.create_vcpu(0, None).unwrap();
        let klapic_before: LapicState = vcpu.get_lapic().unwrap();

        // Compute the value that is expected to represent LVT0 and LVT1.
        let lint0 = get_klapic_reg(&klapic_before, APIC_LVT0);
        let lint1 = get_klapic_reg(&klapic_before, APIC_LVT1);
        let lint0_mode_expected = set_apic_delivery_mode(lint0, APIC_MODE_EXTINT);
        let lint1_mode_expected = set_apic_delivery_mode(lint1, APIC_MODE_NMI);

        set_lint(&vcpu).unwrap();

        // Compute the value that represents LVT0 and LVT1 after set_lint.
        let klapic_actual: LapicState = vcpu.get_lapic().unwrap();
        let lint0_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT0);
        let lint1_mode_actual = get_klapic_reg(&klapic_actual, APIC_LVT1);
        assert_eq!(lint0_mode_expected, lint0_mode_actual);
        assert_eq!(lint1_mode_expected, lint1_mode_actual);
    }

    // Verify that setup_fpu() programs the expected x87 control word.
    #[test]
    fn test_setup_fpu() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_fpu(&vcpu).unwrap();

        let expected_fpu: FpuState = FpuState {
            fcw: 0x37f,
            mxcsr: 0x1f80,
            ..Default::default()
        };
        let actual_fpu: FpuState = vcpu.get_fpu().unwrap();
        // TODO: auto-generate kvm related structures with PartialEq on.
        assert_eq!(expected_fpu.fcw, actual_fpu.fcw);
        // Setting the mxcsr register from FpuState inside setup_fpu does not influence anything.
        // See 'kvm_arch_vcpu_ioctl_set_fpu' from arch/x86/kvm/x86.c.
        // The mxcsr will stay 0 and the assert below fails. Decide whether or not we should
        // remove it at all.
        // assert!(expected_fpu.mxcsr == actual_fpu.mxcsr);
    }

    // Verify that setup_msrs() wrote the boot MSR list by spot-checking the
    // MSR_IA32_MISC_ENABLE entry.
    #[test]
    fn test_setup_msrs() {
        use hypervisor::arch::x86::msr_index;
        use hypervisor::x86_64::{MsrEntries, MsrEntry};

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();
        setup_msrs(&vcpu).unwrap();

        // This test will check against the last MSR entry configured (the tenth one).
        // See create_msr_entries for details.
        let mut msrs = MsrEntries::from_entries(&[MsrEntry {
            index: msr_index::MSR_IA32_MISC_ENABLE,
            ..Default::default()
        }])
        .unwrap();

        // get_msrs returns the number of msrs that it succeed in reading. We only want to read 1
        // in this test case scenario.
        let read_msrs = vcpu.get_msrs(&mut msrs).unwrap();
        assert_eq!(read_msrs, 1);

        // Official entries that were setup when we did setup_msrs. We need to assert that the
        // tenth one (i.e the one with index msr_index::MSR_IA32_MISC_ENABLE has the data we
        // expect.
        let entry_vec = hypervisor::x86_64::boot_msr_entries();
        assert_eq!(entry_vec.as_slice()[9], msrs.as_slice()[0]);
    }

    // Verify that setup_regs() installs the PVH boot register state
    // (rflags, rbx pointing at the PVH start info, and the given rip).
    #[test]
    fn test_setup_regs() {
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().expect("new VM fd creation failed");
        let vcpu = vm.create_vcpu(0, None).unwrap();

        let expected_regs: StandardRegisters = StandardRegisters {
            rflags: 0x0000000000000002u64,
            rbx: arch::layout::PVH_INFO_START.0,
            rip: 1,
            ..Default::default()
        };

        setup_regs(&vcpu, expected_regs.rip).unwrap();

        let actual_regs: StandardRegisters = vcpu.get_regs().unwrap();
        assert_eq!(actual_regs, expected_regs);
    }
}
1614 
1615 #[cfg(target_arch = "aarch64")]
1616 #[cfg(test)]
1617 mod tests {
1618     use crate::GuestMemoryMmap;
1619     use arch::aarch64::layout;
1620     use arch::aarch64::regs::*;
1621     use hypervisor::kvm::aarch64::{is_system_register, MPIDR_EL1};
1622     use hypervisor::kvm::kvm_bindings::{
1623         kvm_one_reg, kvm_regs, kvm_vcpu_init, user_pt_regs, KVM_REG_ARM64, KVM_REG_ARM64_SYSREG,
1624         KVM_REG_ARM_CORE, KVM_REG_SIZE_U64,
1625     };
1626     use hypervisor::{arm64_core_reg_id, offset__of};
1627     use std::mem;
1628     use vm_memory::GuestAddress;
1629 
1630     #[test]
1631     fn test_setup_regs() {
1632         let hv = hypervisor::new().unwrap();
1633         let vm = hv.create_vm().unwrap();
1634         let vcpu = vm.create_vcpu(0, None).unwrap();
1635         let regions = vec![(
1636             GuestAddress(layout::RAM_64BIT_START),
1637             (layout::FDT_MAX_SIZE + 0x1000) as usize,
1638         )];
1639         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
1640 
1641         let res = setup_regs(&vcpu, 0, 0x0, &mem);
1642         // Must fail when vcpu is not initialized yet.
1643         assert!(res.is_err());
1644 
1645         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
1646         vm.get_preferred_target(&mut kvi).unwrap();
1647         vcpu.vcpu_init(&kvi).unwrap();
1648 
1649         assert!(setup_regs(&vcpu, 0, 0x0, &mem).is_ok());
1650     }
1651 
1652     #[test]
1653     fn test_read_mpidr() {
1654         let hv = hypervisor::new().unwrap();
1655         let vm = hv.create_vm().unwrap();
1656         let vcpu = vm.create_vcpu(0, None).unwrap();
1657         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
1658         vm.get_preferred_target(&mut kvi).unwrap();
1659 
1660         // Must fail when vcpu is not initialized yet.
1661         assert!(vcpu.read_mpidr().is_err());
1662 
1663         vcpu.vcpu_init(&kvi).unwrap();
1664         assert_eq!(vcpu.read_mpidr().unwrap(), 0x80000000);
1665     }
1666 
1667     #[test]
1668     fn test_is_system_register() {
1669         let offset = offset__of!(user_pt_regs, pc);
1670         let regid = arm64_core_reg_id!(KVM_REG_SIZE_U64, offset);
1671         assert!(!is_system_register(regid));
1672         let regid = KVM_REG_ARM64 as u64 | KVM_REG_SIZE_U64 as u64 | KVM_REG_ARM64_SYSREG as u64;
1673         assert!(is_system_register(regid));
1674     }
1675 
1676     #[test]
1677     fn test_save_restore_core_regs() {
1678         let hv = hypervisor::new().unwrap();
1679         let vm = hv.create_vm().unwrap();
1680         let vcpu = vm.create_vcpu(0, None).unwrap();
1681         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
1682         vm.get_preferred_target(&mut kvi).unwrap();
1683 
1684         // Must fail when vcpu is not initialized yet.
1685         let mut state = kvm_regs::default();
1686         let res = vcpu.core_registers(&mut state);
1687         assert!(res.is_err());
1688         assert_eq!(
1689             format!("{}", res.unwrap_err()),
1690             "Failed to get core register: Exec format error (os error 8)"
1691         );
1692 
1693         let res = vcpu.set_core_registers(&state);
1694         assert!(res.is_err());
1695         assert_eq!(
1696             format!("{}", res.unwrap_err()),
1697             "Failed to set core register: Exec format error (os error 8)"
1698         );
1699 
1700         vcpu.vcpu_init(&kvi).unwrap();
1701         assert!(vcpu.core_registers(&mut state).is_ok());
1702         assert_eq!(state.regs.pstate, 0x3C5);
1703 
1704         assert!(vcpu.set_core_registers(&state).is_ok());
1705         let off = offset__of!(user_pt_regs, pstate);
1706         let pstate = vcpu
1707             .get_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
1708             .expect("Failed to call kvm get one reg");
1709         assert_eq!(state.regs.pstate, pstate);
1710     }
1711 
1712     #[test]
1713     fn test_save_restore_system_regs() {
1714         let hv = hypervisor::new().unwrap();
1715         let vm = hv.create_vm().unwrap();
1716         let vcpu = vm.create_vcpu(0, None).unwrap();
1717         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
1718         vm.get_preferred_target(&mut kvi).unwrap();
1719 
1720         // Must fail when vcpu is not initialized yet.
1721         let mut state: Vec<kvm_one_reg> = Vec::new();
1722         let res = vcpu.system_registers(&mut state);
1723         assert!(res.is_err());
1724         assert_eq!(
1725             format!("{}", res.unwrap_err()),
1726             "Failed to retrieve list of registers: Exec format error (os error 8)"
1727         );
1728 
1729         state.push(kvm_one_reg {
1730             id: MPIDR_EL1,
1731             addr: 0x00,
1732         });
1733         let res = vcpu.set_system_registers(&state);
1734         assert!(res.is_err());
1735         assert_eq!(
1736             format!("{}", res.unwrap_err()),
1737             "Failed to set system register: Exec format error (os error 8)"
1738         );
1739 
1740         vcpu.vcpu_init(&kvi).unwrap();
1741         assert!(vcpu.system_registers(&mut state).is_ok());
1742         let initial_mpidr: u64 = vcpu.read_mpidr().expect("Fail to read mpidr");
1743         assert!(state.contains(&kvm_one_reg {
1744             id: MPIDR_EL1,
1745             addr: initial_mpidr
1746         }));
1747 
1748         assert!(vcpu.set_system_registers(&state).is_ok());
1749         let mpidr: u64 = vcpu.read_mpidr().expect("Fail to read mpidr");
1750         assert_eq!(initial_mpidr, mpidr);
1751     }
1752 
1753     #[test]
1754     fn test_get_set_mpstate() {
1755         let hv = hypervisor::new().unwrap();
1756         let vm = hv.create_vm().unwrap();
1757         let vcpu = vm.create_vcpu(0, None).unwrap();
1758         let mut kvi: kvm_vcpu_init = kvm_vcpu_init::default();
1759         vm.get_preferred_target(&mut kvi).unwrap();
1760 
1761         let res = vcpu.get_mp_state();
1762         assert!(res.is_ok());
1763         assert!(vcpu.set_mp_state(res.unwrap()).is_ok());
1764     }
1765 }
1766