xref: /cloud-hypervisor/vmm/src/vm.rs (revision 5f814308d6b19037f2afb3d36fe49b0aa14c0b22)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 use crate::console_devices::{ConsoleDeviceError, ConsoleInfo};
20 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
21 use crate::coredump::{
22     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
23 };
24 use crate::cpu;
25 use crate::device_manager::{DeviceManager, DeviceManagerError};
26 use crate::device_tree::DeviceTree;
27 #[cfg(feature = "guest_debug")]
28 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
29 #[cfg(feature = "igvm")]
30 use crate::igvm::igvm_loader;
31 use crate::landlock::LandlockError;
32 use crate::memory_manager::{
33     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
34 };
35 #[cfg(target_arch = "x86_64")]
36 use crate::migration::get_vm_snapshot;
37 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
38 use crate::migration::url_to_file;
39 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
40 use crate::GuestMemoryMmap;
41 use crate::{
42     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
43 };
44 use anyhow::anyhow;
45 use arch::get_host_cpu_phys_bits;
46 #[cfg(target_arch = "x86_64")]
47 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
48 #[cfg(feature = "tdx")]
49 use arch::x86_64::tdx::TdvfSection;
50 use arch::EntryPoint;
51 #[cfg(target_arch = "aarch64")]
52 use arch::PciSpaceInfo;
53 use arch::{NumaNode, NumaNodes};
54 #[cfg(target_arch = "aarch64")]
55 use devices::interrupt_controller;
56 use devices::AcpiNotificationFlags;
57 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
58 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
59 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
60 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
61 use hypervisor::{HypervisorVmError, VmOps};
62 use libc::{termios, SIGWINCH};
63 use linux_loader::cmdline::Cmdline;
64 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
65 use linux_loader::elf;
66 #[cfg(target_arch = "x86_64")]
67 use linux_loader::loader::bzimage::BzImage;
68 #[cfg(target_arch = "x86_64")]
69 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
70 #[cfg(target_arch = "aarch64")]
71 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
72 use linux_loader::loader::KernelLoader;
73 use seccompiler::SeccompAction;
74 use serde::{Deserialize, Serialize};
75 use std::cmp;
76 use std::collections::BTreeMap;
77 use std::collections::HashMap;
78 use std::fs::{File, OpenOptions};
79 use std::io::{self, Seek, SeekFrom, Write};
80 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
81 use std::mem::size_of;
82 use std::num::Wrapping;
83 use std::ops::Deref;
84 use std::os::unix::net::UnixStream;
85 use std::sync::{Arc, Mutex, RwLock};
86 use std::time::Instant;
87 use std::{result, str, thread};
88 use thiserror::Error;
89 use tracer::trace_scoped;
90 use vm_device::Bus;
91 #[cfg(feature = "tdx")]
92 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
93 use vm_memory::{
94     Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
95 };
96 use vm_migration::protocol::{Request, Response};
97 use vm_migration::{
98     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
99     Snapshottable, Transportable,
100 };
101 use vmm_sys_util::eventfd::EventFd;
102 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
103 
104 /// Errors associated with VM management
105 #[derive(Debug, Error)]
106 pub enum Error {
107     #[error("Cannot open kernel file: {0}")]
108     KernelFile(#[source] io::Error),
109 
110     #[error("Cannot open initramfs file: {0}")]
111     InitramfsFile(#[source] io::Error),
112 
113     #[error("Cannot load the kernel into memory: {0}")]
114     KernelLoad(#[source] linux_loader::loader::Error),
115 
116     #[cfg(target_arch = "aarch64")]
117     #[error("Cannot load the UEFI binary in memory: {0:?}")]
118     UefiLoad(arch::aarch64::uefi::Error),
119 
120     #[error("Cannot load the initramfs into memory")]
121     InitramfsLoad,
122 
123     #[error("Cannot load the kernel command line in memory: {0}")]
124     LoadCmdLine(#[source] linux_loader::loader::Error),
125 
126     #[error("Failed to apply landlock config during vm_create: {0}")]
127     ApplyLandlock(#[source] LandlockError),
128 
129     #[error("Cannot modify the kernel command line: {0}")]
130     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
131 
132     #[error("Cannot create the kernel command line: {0}")]
133     CmdLineCreate(#[source] linux_loader::cmdline::Error),
134 
135     #[error("Cannot configure system: {0}")]
136     ConfigureSystem(#[source] arch::Error),
137 
138     #[cfg(target_arch = "aarch64")]
139     #[error("Cannot enable interrupt controller: {0:?}")]
140     EnableInterruptController(interrupt_controller::Error),
141 
142     #[error("VM state is poisoned")]
143     PoisonedState,
144 
145     #[error("Error from device manager: {0:?}")]
146     DeviceManager(DeviceManagerError),
147 
148     #[error("No device with id {0:?} to remove")]
149     NoDeviceToRemove(String),
150 
151     #[error("Cannot spawn a signal handler thread: {0}")]
152     SignalHandlerSpawn(#[source] io::Error),
153 
154     #[error("Failed to join on threads: {0:?}")]
155     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
156 
157     #[error("VM config is missing")]
158     VmMissingConfig,
159 
160     #[error("VM is not created")]
161     VmNotCreated,
162 
163     #[error("VM is already created")]
164     VmAlreadyCreated,
165 
166     #[error("VM is not running")]
167     VmNotRunning,
168 
169     #[error("Cannot clone EventFd: {0}")]
170     EventFdClone(#[source] io::Error),
171 
172     #[error("invalid VM state transition: {0:?} to {1:?}")]
173     InvalidStateTransition(VmState, VmState),
174 
175     #[error("Error from CPU manager: {0}")]
176     CpuManager(#[source] cpu::Error),
177 
178     #[error("Cannot pause devices: {0}")]
179     PauseDevices(#[source] MigratableError),
180 
181     #[error("Cannot resume devices: {0}")]
182     ResumeDevices(#[source] MigratableError),
183 
184     #[error("Cannot pause CPUs: {0}")]
185     PauseCpus(#[source] MigratableError),
186 
187     #[error("Cannot resume cpus: {0}")]
188     ResumeCpus(#[source] MigratableError),
189 
190     #[error("Cannot pause VM: {0}")]
191     Pause(#[source] MigratableError),
192 
193     #[error("Cannot resume VM: {0}")]
194     Resume(#[source] MigratableError),
195 
196     #[error("Memory manager error: {0:?}")]
197     MemoryManager(MemoryManagerError),
198 
199     #[error("Eventfd write error: {0}")]
200     EventfdError(#[source] std::io::Error),
201 
202     #[error("Cannot snapshot VM: {0}")]
203     Snapshot(#[source] MigratableError),
204 
205     #[error("Cannot restore VM: {0}")]
206     Restore(#[source] MigratableError),
207 
208     #[error("Cannot send VM snapshot: {0}")]
209     SnapshotSend(#[source] MigratableError),
210 
211     #[error("Invalid restore source URL")]
212     InvalidRestoreSourceUrl,
213 
214     #[error("Failed to validate config: {0}")]
215     ConfigValidation(#[source] ValidationError),
216 
217     #[error("Too many virtio-vsock devices")]
218     TooManyVsockDevices,
219 
220     #[error("Failed serializing into JSON: {0}")]
221     SerializeJson(#[source] serde_json::Error),
222 
223     #[error("Invalid NUMA configuration")]
224     InvalidNumaConfig,
225 
226     #[error("Cannot create seccomp filter: {0}")]
227     CreateSeccompFilter(#[source] seccompiler::Error),
228 
229     #[error("Cannot apply seccomp filter: {0}")]
230     ApplySeccompFilter(#[source] seccompiler::Error),
231 
232     #[error("Failed resizing a memory zone")]
233     ResizeZone,
234 
235     #[error("Cannot activate virtio devices: {0:?}")]
236     ActivateVirtioDevices(DeviceManagerError),
237 
238     #[error("Error triggering power button: {0:?}")]
239     PowerButton(DeviceManagerError),
240 
241     #[error("Kernel lacks PVH header")]
242     KernelMissingPvhHeader,
243 
244     #[error("Failed to allocate firmware RAM: {0:?}")]
245     AllocateFirmwareMemory(MemoryManagerError),
246 
247     #[error("Error manipulating firmware file: {0}")]
248     FirmwareFile(#[source] std::io::Error),
249 
250     #[error("Firmware too big")]
251     FirmwareTooLarge,
252 
253     #[error("Failed to copy firmware to memory: {0}")]
254     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
255 
256     #[cfg(feature = "sev_snp")]
257     #[error("Error enabling SEV-SNP VM: {0}")]
258     InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),
259 
260     #[cfg(feature = "tdx")]
261     #[error("Error performing I/O on TDX firmware file: {0}")]
262     LoadTdvf(#[source] std::io::Error),
263 
264     #[cfg(feature = "tdx")]
265     #[error("Error performing I/O on the TDX payload file: {0}")]
266     LoadPayload(#[source] std::io::Error),
267 
268     #[cfg(feature = "tdx")]
269     #[error("Error parsing TDVF: {0}")]
270     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
271 
272     #[cfg(feature = "tdx")]
273     #[error("Error populating TDX HOB: {0}")]
274     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
275 
276     #[cfg(feature = "tdx")]
277     #[error("Error allocating TDVF memory: {0:?}")]
278     AllocatingTdvfMemory(crate::memory_manager::Error),
279 
280     #[cfg(feature = "tdx")]
281     #[error("Error enabling TDX VM: {0}")]
282     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
283 
284     #[cfg(feature = "tdx")]
285     #[error("Error enabling TDX memory region: {0}")]
286     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
287 
288     #[cfg(feature = "tdx")]
289     #[error("Error finalizing TDX VM: {0}")]
290     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
291 
292     #[cfg(feature = "tdx")]
293     #[error("TDX firmware missing")]
294     TdxFirmwareMissing,
295 
296     #[cfg(feature = "tdx")]
297     #[error("Invalid TDX payload type")]
298     InvalidPayloadType,
299 
300     #[cfg(feature = "guest_debug")]
301     #[error("Error debugging VM: {0:?}")]
302     Debug(DebuggableError),
303 
304     #[error("Error spawning kernel loading thread")]
305     KernelLoadThreadSpawn(std::io::Error),
306 
307     #[error("Error joining kernel loading thread")]
308     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
309 
310     #[error("Payload configuration is not bootable")]
311     InvalidPayload,
312 
313     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
314     #[error("Error coredumping VM: {0:?}")]
315     Coredump(GuestDebuggableError),
316 
317     #[cfg(feature = "igvm")]
318     #[error("Cannot open igvm file: {0}")]
319     IgvmFile(#[source] io::Error),
320 
321     #[cfg(feature = "igvm")]
322     #[error("Cannot load the igvm into memory: {0}")]
323     IgvmLoad(#[source] igvm_loader::Error),
324 
325     #[error("Error injecting NMI")]
326     ErrorNmi,
327 
328     #[error("Error resuming the VM: {0}")]
329     ResumeVm(#[source] hypervisor::HypervisorVmError),
330 
331     #[error("Error creating console devices")]
332     CreateConsoleDevices(ConsoleDeviceError),
333 }
/// Result alias whose error type is this module's [`Error`].
pub type Result<T> = result::Result<T, Error>;
335 
/// Lifecycle state of a VM.
///
/// Which changes from one state to another are legal is decided by
/// `VmState::valid_transition`.
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    /// Initial state of a VM that was built without a snapshot.
    Created,
    Running,
    Shutdown,
    /// Also the initial state of a VM that was built from a snapshot.
    Paused,
    BreakPoint,
}
344 
345 impl VmState {
346     fn valid_transition(self, new_state: VmState) -> Result<()> {
347         match self {
348             VmState::Created => match new_state {
349                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
350                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
351                     Ok(())
352                 }
353             },
354 
355             VmState::Running => match new_state {
356                 VmState::Created | VmState::Running => {
357                     Err(Error::InvalidStateTransition(self, new_state))
358                 }
359                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
360             },
361 
362             VmState::Shutdown => match new_state {
363                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
364                     Err(Error::InvalidStateTransition(self, new_state))
365                 }
366                 VmState::Running => Ok(()),
367             },
368 
369             VmState::Paused => match new_state {
370                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
371                     Err(Error::InvalidStateTransition(self, new_state))
372                 }
373                 VmState::Running | VmState::Shutdown => Ok(()),
374             },
375             VmState::BreakPoint => match new_state {
376                 VmState::Created | VmState::Running => Ok(()),
377                 _ => Err(Error::InvalidStateTransition(self, new_state)),
378             },
379         }
380     }
381 }
382 
/// Backend for the hypervisor's `VmOps` callbacks: guest memory accesses go
/// to `memory`, while MMIO and (on x86_64) port I/O accesses are dispatched to
/// the corresponding bus.
struct VmOpsHandler {
    // Guest memory used by the `guest_mem_read`/`guest_mem_write` callbacks.
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    // Port I/O bus; only present on x86_64.
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    // Memory-mapped I/O bus.
    mmio_bus: Arc<Bus>,
}
389 
390 impl VmOps for VmOpsHandler {
391     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
392         self.memory
393             .memory()
394             .write(buf, GuestAddress(gpa))
395             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
396     }
397 
398     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
399         self.memory
400             .memory()
401             .read(buf, GuestAddress(gpa))
402             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
403     }
404 
405     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
406         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
407             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
408         }
409         Ok(())
410     }
411 
412     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
413         match self.mmio_bus.write(gpa, data) {
414             Err(vm_device::BusError::MissingAddressRange) => {
415                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
416             }
417             Ok(Some(barrier)) => {
418                 info!("Waiting for barrier");
419                 barrier.wait();
420                 info!("Barrier released");
421             }
422             _ => {}
423         };
424         Ok(())
425     }
426 
427     #[cfg(target_arch = "x86_64")]
428     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
429         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
430             info!("Guest PIO read to unregistered address 0x{:x}", port);
431         }
432         Ok(())
433     }
434 
435     #[cfg(target_arch = "x86_64")]
436     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
437         match self.io_bus.write(port, data) {
438             Err(vm_device::BusError::MissingAddressRange) => {
439                 info!("Guest PIO write to unregistered address 0x{:x}", port);
440             }
441             Ok(Some(barrier)) => {
442                 info!("Waiting for barrier");
443                 barrier.wait();
444                 info!("Barrier released");
445             }
446             _ => {}
447         };
448         Ok(())
449     }
450 }
451 
452 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
453     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
454 
455     cmp::min(host_phys_bits, max_phys_bits)
456 }
457 
/// A virtual machine together with the managers (devices, CPUs, memory) and
/// the hypervisor handles it is built from.
pub struct Vm {
    // Kernel file opened from the payload configuration, if one is set
    // (TDX builds only).
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    // Initramfs file opened from the payload configuration, if one is set.
    initramfs: Option<File>,
    // Join handles of threads owned by the VM.
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    // Current lifecycle state; starts as `Paused` when restoring from a
    // snapshot and as `Created` otherwise.
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    // Clock data taken from the VM snapshot when restoring, `None` otherwise
    // (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
    // NUMA nodes derived from the NUMA section of the configuration.
    numa_nodes: NumaNodes,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    // Mirrors the `gdb` configuration flag when the `guest_debug` feature is
    // enabled; always `false` otherwise.
    stop_on_boot: bool,
    // Handle of the asynchronous payload load; `None` when restoring from a
    // snapshot.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}
479 
480 impl Vm {
    /// Signals the VM declares as handled; currently only `SIGWINCH`.
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
482 
483     #[allow(clippy::too_many_arguments)]
484     pub fn new_from_memory_manager(
485         config: Arc<Mutex<VmConfig>>,
486         memory_manager: Arc<Mutex<MemoryManager>>,
487         vm: Arc<dyn hypervisor::Vm>,
488         exit_evt: EventFd,
489         reset_evt: EventFd,
490         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
491         seccomp_action: &SeccompAction,
492         hypervisor: Arc<dyn hypervisor::Hypervisor>,
493         activate_evt: EventFd,
494         timestamp: Instant,
495         console_info: Option<ConsoleInfo>,
496         console_resize_pipe: Option<File>,
497         original_termios: Arc<Mutex<Option<termios>>>,
498         snapshot: Option<Snapshot>,
499     ) -> Result<Self> {
500         trace_scoped!("Vm::new_from_memory_manager");
501 
502         let boot_id_list = config
503             .lock()
504             .unwrap()
505             .validate()
506             .map_err(Error::ConfigValidation)?;
507 
508         #[cfg(not(feature = "igvm"))]
509         let load_payload_handle = if snapshot.is_none() {
510             Self::load_payload_async(&memory_manager, &config)?
511         } else {
512             None
513         };
514 
515         info!("Booting VM from config: {:?}", &config);
516 
517         // Create NUMA nodes based on NumaConfig.
518         let numa_nodes =
519             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
520 
521         #[cfg(feature = "tdx")]
522         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
523         #[cfg(feature = "sev_snp")]
524         let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
525         #[cfg(feature = "tdx")]
526         let force_iommu = tdx_enabled;
527         #[cfg(feature = "sev_snp")]
528         let force_iommu = sev_snp_enabled;
529         #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
530         let force_iommu = false;
531 
532         #[cfg(feature = "guest_debug")]
533         let stop_on_boot = config.lock().unwrap().gdb;
534         #[cfg(not(feature = "guest_debug"))]
535         let stop_on_boot = false;
536 
537         let memory = memory_manager.lock().unwrap().guest_memory();
538         #[cfg(target_arch = "x86_64")]
539         let io_bus = Arc::new(Bus::new());
540         let mmio_bus = Arc::new(Bus::new());
541 
542         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
543             memory,
544             #[cfg(target_arch = "x86_64")]
545             io_bus: io_bus.clone(),
546             mmio_bus: mmio_bus.clone(),
547         });
548 
549         let cpus_config = { &config.lock().unwrap().cpus.clone() };
550         let cpu_manager = cpu::CpuManager::new(
551             cpus_config,
552             vm.clone(),
553             exit_evt.try_clone().map_err(Error::EventFdClone)?,
554             reset_evt.try_clone().map_err(Error::EventFdClone)?,
555             #[cfg(feature = "guest_debug")]
556             vm_debug_evt,
557             &hypervisor,
558             seccomp_action.clone(),
559             vm_ops,
560             #[cfg(feature = "tdx")]
561             tdx_enabled,
562             &numa_nodes,
563             #[cfg(feature = "sev_snp")]
564             sev_snp_enabled,
565         )
566         .map_err(Error::CpuManager)?;
567 
568         #[cfg(target_arch = "x86_64")]
569         cpu_manager
570             .lock()
571             .unwrap()
572             .populate_cpuid(
573                 &memory_manager,
574                 &hypervisor,
575                 #[cfg(feature = "tdx")]
576                 tdx_enabled,
577             )
578             .map_err(Error::CpuManager)?;
579 
580         // Loading the igvm file is pushed down here because
581         // igvm parser needs cpu_manager to retrieve cpuid leaf.
582         // For the regular case, we can start loading early, but for
583         // igvm case we have to wait until cpu_manager is created.
584         // Currently, Microsoft Hypervisor does not provide any
585         // Hypervisor specific common cpuid, we need to call get_cpuid_values
586         // per cpuid through cpu_manager.
587         #[cfg(feature = "igvm")]
588         let load_payload_handle = if snapshot.is_none() {
589             Self::load_payload_async(
590                 &memory_manager,
591                 &config,
592                 &cpu_manager,
593                 #[cfg(feature = "sev_snp")]
594                 sev_snp_enabled,
595             )?
596         } else {
597             None
598         };
599         // The initial TDX configuration must be done before the vCPUs are
600         // created
601         #[cfg(feature = "tdx")]
602         if tdx_enabled {
603             let cpuid = cpu_manager.lock().unwrap().common_cpuid();
604             let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
605             vm.tdx_init(&cpuid, max_vcpus)
606                 .map_err(Error::InitializeTdxVm)?;
607         }
608 
609         cpu_manager
610             .lock()
611             .unwrap()
612             .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
613             .map_err(Error::CpuManager)?;
614 
615         // This initial SEV-SNP configuration must be done immediately after
616         // vCPUs are created. As part of this initialization we are
617         // transitioning the guest into secure state.
618         #[cfg(feature = "sev_snp")]
619         if sev_snp_enabled {
620             vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
621         }
622 
623         #[cfg(feature = "tdx")]
624         let dynamic = !tdx_enabled;
625         #[cfg(not(feature = "tdx"))]
626         let dynamic = true;
627 
628         let device_manager = DeviceManager::new(
629             #[cfg(target_arch = "x86_64")]
630             io_bus,
631             mmio_bus,
632             vm.clone(),
633             config.clone(),
634             memory_manager.clone(),
635             cpu_manager.clone(),
636             exit_evt.try_clone().map_err(Error::EventFdClone)?,
637             reset_evt,
638             seccomp_action.clone(),
639             numa_nodes.clone(),
640             &activate_evt,
641             force_iommu,
642             boot_id_list,
643             timestamp,
644             snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
645             dynamic,
646         )
647         .map_err(Error::DeviceManager)?;
648 
649         device_manager
650             .lock()
651             .unwrap()
652             .create_devices(console_info, console_resize_pipe, original_termios)
653             .map_err(Error::DeviceManager)?;
654 
655         #[cfg(feature = "tdx")]
656         let kernel = config
657             .lock()
658             .unwrap()
659             .payload
660             .as_ref()
661             .map(|p| p.kernel.as_ref().map(File::open))
662             .unwrap_or_default()
663             .transpose()
664             .map_err(Error::KernelFile)?;
665 
666         let initramfs = config
667             .lock()
668             .unwrap()
669             .payload
670             .as_ref()
671             .map(|p| p.initramfs.as_ref().map(File::open))
672             .unwrap_or_default()
673             .transpose()
674             .map_err(Error::InitramfsFile)?;
675 
676         #[cfg(target_arch = "x86_64")]
677         let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
678             let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
679             vm_snapshot.clock
680         } else {
681             None
682         };
683 
684         let vm_state = if snapshot.is_some() {
685             VmState::Paused
686         } else {
687             VmState::Created
688         };
689 
690         Ok(Vm {
691             #[cfg(feature = "tdx")]
692             kernel,
693             initramfs,
694             device_manager,
695             config,
696             threads: Vec::with_capacity(1),
697             state: RwLock::new(vm_state),
698             cpu_manager,
699             memory_manager,
700             vm,
701             #[cfg(target_arch = "x86_64")]
702             saved_clock,
703             numa_nodes,
704             hypervisor,
705             stop_on_boot,
706             load_payload_handle,
707         })
708     }
709 
710     fn create_numa_nodes(
711         configs: Option<Vec<NumaConfig>>,
712         memory_manager: &Arc<Mutex<MemoryManager>>,
713     ) -> Result<NumaNodes> {
714         let mm = memory_manager.lock().unwrap();
715         let mm_zones = mm.memory_zones();
716         let mut numa_nodes = BTreeMap::new();
717 
718         if let Some(configs) = &configs {
719             for config in configs.iter() {
720                 if numa_nodes.contains_key(&config.guest_numa_id) {
721                     error!("Can't define twice the same NUMA node");
722                     return Err(Error::InvalidNumaConfig);
723                 }
724 
725                 let mut node = NumaNode::default();
726 
727                 if let Some(memory_zones) = &config.memory_zones {
728                     for memory_zone in memory_zones.iter() {
729                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
730                             node.memory_regions.extend(mm_zone.regions().clone());
731                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
732                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
733                             }
734                             node.memory_zones.push(memory_zone.clone());
735                         } else {
736                             error!("Unknown memory zone '{}'", memory_zone);
737                             return Err(Error::InvalidNumaConfig);
738                         }
739                     }
740                 }
741 
742                 if let Some(cpus) = &config.cpus {
743                     node.cpus.extend(cpus);
744                 }
745 
746                 if let Some(pci_segments) = &config.pci_segments {
747                     node.pci_segments.extend(pci_segments);
748                 }
749 
750                 if let Some(distances) = &config.distances {
751                     for distance in distances.iter() {
752                         let dest = distance.destination;
753                         let dist = distance.distance;
754 
755                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
756                             error!("Unknown destination NUMA node {}", dest);
757                             return Err(Error::InvalidNumaConfig);
758                         }
759 
760                         if node.distances.contains_key(&dest) {
761                             error!("Destination NUMA node {} has been already set", dest);
762                             return Err(Error::InvalidNumaConfig);
763                         }
764 
765                         node.distances.insert(dest, dist);
766                     }
767                 }
768 
769                 #[cfg(target_arch = "x86_64")]
770                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
771                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
772                         let mm_sections = sgx_epc_region.epc_sections();
773                         for sgx_epc_section in sgx_epc_sections.iter() {
774                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
775                                 node.sgx_epc_sections.push(mm_section.clone());
776                             } else {
777                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
778                                 return Err(Error::InvalidNumaConfig);
779                             }
780                         }
781                     } else {
782                         error!("Missing SGX EPC region");
783                         return Err(Error::InvalidNumaConfig);
784                     }
785                 }
786 
787                 numa_nodes.insert(config.guest_numa_id, node);
788             }
789         }
790 
791         Ok(numa_nodes)
792     }
793 
794     #[allow(clippy::too_many_arguments)]
795     pub fn new(
796         vm_config: Arc<Mutex<VmConfig>>,
797         exit_evt: EventFd,
798         reset_evt: EventFd,
799         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
800         seccomp_action: &SeccompAction,
801         hypervisor: Arc<dyn hypervisor::Hypervisor>,
802         activate_evt: EventFd,
803         console_info: Option<ConsoleInfo>,
804         console_resize_pipe: Option<File>,
805         original_termios: Arc<Mutex<Option<termios>>>,
806         snapshot: Option<Snapshot>,
807         source_url: Option<&str>,
808         prefault: Option<bool>,
809     ) -> Result<Self> {
810         trace_scoped!("Vm::new");
811 
812         let timestamp = Instant::now();
813 
814         #[cfg(feature = "tdx")]
815         let tdx_enabled = if snapshot.is_some() {
816             false
817         } else {
818             vm_config.lock().unwrap().is_tdx_enabled()
819         };
820 
821         #[cfg(feature = "sev_snp")]
822         let sev_snp_enabled = if snapshot.is_some() {
823             false
824         } else {
825             vm_config.lock().unwrap().is_sev_snp_enabled()
826         };
827 
828         let vm = Self::create_hypervisor_vm(
829             &hypervisor,
830             #[cfg(feature = "tdx")]
831             tdx_enabled,
832             #[cfg(feature = "sev_snp")]
833             sev_snp_enabled,
834         )?;
835 
836         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
837 
838         let memory_manager = if let Some(snapshot) =
839             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
840         {
841             MemoryManager::new_from_snapshot(
842                 &snapshot,
843                 vm.clone(),
844                 &vm_config.lock().unwrap().memory.clone(),
845                 source_url,
846                 prefault.unwrap(),
847                 phys_bits,
848             )
849             .map_err(Error::MemoryManager)?
850         } else {
851             #[cfg(target_arch = "x86_64")]
852             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
853 
854             MemoryManager::new(
855                 vm.clone(),
856                 &vm_config.lock().unwrap().memory.clone(),
857                 None,
858                 phys_bits,
859                 #[cfg(feature = "tdx")]
860                 tdx_enabled,
861                 None,
862                 None,
863                 #[cfg(target_arch = "x86_64")]
864                 sgx_epc_config,
865             )
866             .map_err(Error::MemoryManager)?
867         };
868 
869         Vm::new_from_memory_manager(
870             vm_config,
871             memory_manager,
872             vm,
873             exit_evt,
874             reset_evt,
875             #[cfg(feature = "guest_debug")]
876             vm_debug_evt,
877             seccomp_action,
878             hypervisor,
879             activate_evt,
880             timestamp,
881             console_info,
882             console_resize_pipe,
883             original_termios,
884             snapshot,
885         )
886     }
887 
    /// Creates the hypervisor-level VM object and, on x86_64, applies the
    /// initial per-VM setup (identity map address, TSS address, split IRQ).
    ///
    /// Which creation call is used depends on the compiled-in features:
    /// - with `tdx`, the VM type is derived from `tdx_enabled`;
    /// - otherwise with `sev_snp`, the VM type is derived from `sev_snp_enabled`;
    /// - otherwise the plain `create_vm()` call is used.
    ///
    /// # Panics
    ///
    /// Every hypervisor call made here is unwrapped, so a failure of the
    /// extension check, of the VM creation or of any x86_64 setup step panics
    /// rather than being reported through the returned `Result`.
    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        hypervisor.check_required_extensions().unwrap();

        cfg_if::cfg_if! {
            if #[cfg(feature = "tdx")] {
                // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true
                // Otherwise KVM_X86_LEGACY_VM: 0
                // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM
                let vm = hypervisor
                    .create_vm_with_type(u64::from(tdx_enabled))
                    .unwrap();
            } else if #[cfg(feature = "sev_snp")] {
                // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true
                // Otherwise SEV_SNP_DISABLED: 0
                // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false
                let vm = hypervisor
                    .create_vm_with_type(u64::from(sev_snp_enabled))
                    .unwrap();
            } else {
                let vm = hypervisor.create_vm().unwrap();
            }
        }

        // x86_64 only: configure the identity map and TSS addresses and enable
        // split IRQ handling before the VM is handed back.
        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }
925 
926     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
927         let initramfs = self.initramfs.as_mut().unwrap();
928         let size: usize = initramfs
929             .seek(SeekFrom::End(0))
930             .map_err(|_| Error::InitramfsLoad)?
931             .try_into()
932             .unwrap();
933         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
934 
935         let address =
936             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
937         let address = GuestAddress(address);
938 
939         guest_mem
940             .read_volatile_from(address, initramfs, size)
941             .map_err(|_| Error::InitramfsLoad)?;
942 
943         info!("Initramfs loaded: address = 0x{:x}", address.0);
944         Ok(arch::InitramfsConfig { address, size })
945     }
946 
947     pub fn generate_cmdline(
948         payload: &PayloadConfig,
949         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
950     ) -> Result<Cmdline> {
951         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
952         if let Some(s) = payload.cmdline.as_ref() {
953             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
954         }
955 
956         #[cfg(target_arch = "aarch64")]
957         for entry in device_manager.lock().unwrap().cmdline_additions() {
958             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
959         }
960         Ok(cmdline)
961     }
962 
963     #[cfg(target_arch = "aarch64")]
964     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
965         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
966         let mem = uefi_flash.memory();
967         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
968             .map_err(Error::UefiLoad)?;
969         Ok(())
970     }
971 
972     #[cfg(target_arch = "aarch64")]
973     fn load_kernel(
974         firmware: Option<File>,
975         kernel: Option<File>,
976         memory_manager: Arc<Mutex<MemoryManager>>,
977     ) -> Result<EntryPoint> {
978         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
979         let mem = guest_memory.memory();
980         let entry_addr = match (firmware, kernel) {
981             (None, Some(mut kernel)) => {
982                 match linux_loader::loader::pe::PE::load(
983                     mem.deref(),
984                     Some(arch::layout::KERNEL_START),
985                     &mut kernel,
986                     None,
987                 ) {
988                     Ok(entry_addr) => entry_addr.kernel_load,
989                     // Try to load the binary as kernel PE file at first.
990                     // If failed, retry to load it as UEFI binary.
991                     // As the UEFI binary is formatless, it must be the last option to try.
992                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
993                         Self::load_firmware(&kernel, memory_manager)?;
994                         arch::layout::UEFI_START
995                     }
996                     Err(e) => {
997                         return Err(Error::KernelLoad(e));
998                     }
999                 }
1000             }
1001             (Some(firmware), None) => {
1002                 Self::load_firmware(&firmware, memory_manager)?;
1003                 arch::layout::UEFI_START
1004             }
1005             _ => return Err(Error::InvalidPayload),
1006         };
1007 
1008         Ok(EntryPoint { entry_addr })
1009     }
1010 
1011     #[cfg(feature = "igvm")]
1012     fn load_igvm(
1013         igvm: File,
1014         memory_manager: Arc<Mutex<MemoryManager>>,
1015         cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1016         #[cfg(feature = "sev_snp")] host_data: &Option<String>,
1017     ) -> Result<EntryPoint> {
1018         let res = igvm_loader::load_igvm(
1019             &igvm,
1020             memory_manager,
1021             cpu_manager.clone(),
1022             "",
1023             #[cfg(feature = "sev_snp")]
1024             host_data,
1025         )
1026         .map_err(Error::IgvmLoad)?;
1027 
1028         cfg_if::cfg_if! {
1029             if #[cfg(feature = "sev_snp")] {
1030                 let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
1031                     EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
1032                 } else {
1033                     EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
1034                 };
1035             } else {
1036                let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
1037             }
1038         };
1039         Ok(entry_point)
1040     }
1041 
1042     #[cfg(target_arch = "x86_64")]
1043     fn load_kernel(
1044         mut kernel: File,
1045         cmdline: Option<Cmdline>,
1046         memory_manager: Arc<Mutex<MemoryManager>>,
1047     ) -> Result<EntryPoint> {
1048         info!("Loading kernel");
1049 
1050         let mem = {
1051             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1052             guest_memory.memory()
1053         };
1054 
1055         // Try ELF binary with PVH boot.
1056         let entry_addr = linux_loader::loader::elf::Elf::load(
1057             mem.deref(),
1058             None,
1059             &mut kernel,
1060             Some(arch::layout::HIGH_RAM_START),
1061         )
1062         // Try loading kernel as bzImage.
1063         .or_else(|_| {
1064             BzImage::load(
1065                 mem.deref(),
1066                 None,
1067                 &mut kernel,
1068                 Some(arch::layout::HIGH_RAM_START),
1069             )
1070         })
1071         .map_err(Error::KernelLoad)?;
1072 
1073         if let Some(cmdline) = cmdline {
1074             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1075                 .map_err(Error::LoadCmdLine)?;
1076         }
1077 
1078         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1079             // Use the PVH kernel entry point to boot the guest
1080             info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1081             Ok(EntryPoint {
1082                 entry_addr,
1083                 setup_header: None,
1084             })
1085         } else if entry_addr.setup_header.is_some() {
1086             // Use the bzImage 32bit entry point to boot the guest
1087             info!(
1088                 "bzImage kernel loaded: entry_addr = 0x{:x}",
1089                 entry_addr.kernel_load.0
1090             );
1091             Ok(EntryPoint {
1092                 entry_addr: entry_addr.kernel_load,
1093                 setup_header: entry_addr.setup_header,
1094             })
1095         } else {
1096             Err(Error::KernelMissingPvhHeader)
1097         }
1098     }
1099 
1100     #[cfg(target_arch = "x86_64")]
1101     fn load_payload(
1102         payload: &PayloadConfig,
1103         memory_manager: Arc<Mutex<MemoryManager>>,
1104         #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1105         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1106     ) -> Result<EntryPoint> {
1107         trace_scoped!("load_payload");
1108         #[cfg(feature = "igvm")]
1109         {
1110             if let Some(_igvm_file) = &payload.igvm {
1111                 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
1112                 #[cfg(feature = "sev_snp")]
1113                 if sev_snp_enabled {
1114                     return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
1115                 }
1116                 #[cfg(not(feature = "sev_snp"))]
1117                 return Self::load_igvm(igvm, memory_manager, cpu_manager);
1118             }
1119         }
1120         match (
1121             &payload.firmware,
1122             &payload.kernel,
1123             &payload.initramfs,
1124             &payload.cmdline,
1125         ) {
1126             (Some(firmware), None, None, None) => {
1127                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1128                 Self::load_kernel(firmware, None, memory_manager)
1129             }
1130             (None, Some(kernel), _, _) => {
1131                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1132                 let cmdline = Self::generate_cmdline(payload)?;
1133                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1134             }
1135             _ => Err(Error::InvalidPayload),
1136         }
1137     }
1138 
1139     #[cfg(target_arch = "aarch64")]
1140     fn load_payload(
1141         payload: &PayloadConfig,
1142         memory_manager: Arc<Mutex<MemoryManager>>,
1143     ) -> Result<EntryPoint> {
1144         match (&payload.firmware, &payload.kernel) {
1145             (Some(firmware), None) => {
1146                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1147                 Self::load_kernel(Some(firmware), None, memory_manager)
1148             }
1149             (None, Some(kernel)) => {
1150                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1151                 Self::load_kernel(None, Some(kernel), memory_manager)
1152             }
1153             _ => Err(Error::InvalidPayload),
1154         }
1155     }
1156 
1157     fn load_payload_async(
1158         memory_manager: &Arc<Mutex<MemoryManager>>,
1159         config: &Arc<Mutex<VmConfig>>,
1160         #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
1161         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1162     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1163         // Kernel with TDX is loaded in a different manner
1164         #[cfg(feature = "tdx")]
1165         if config.lock().unwrap().is_tdx_enabled() {
1166             return Ok(None);
1167         }
1168 
1169         config
1170             .lock()
1171             .unwrap()
1172             .payload
1173             .as_ref()
1174             .map(|payload| {
1175                 let memory_manager = memory_manager.clone();
1176                 let payload = payload.clone();
1177                 #[cfg(feature = "igvm")]
1178                 let cpu_manager = cpu_manager.clone();
1179 
1180                 std::thread::Builder::new()
1181                     .name("payload_loader".into())
1182                     .spawn(move || {
1183                         Self::load_payload(
1184                             &payload,
1185                             memory_manager,
1186                             #[cfg(feature = "igvm")]
1187                             cpu_manager,
1188                             #[cfg(feature = "sev_snp")]
1189                             sev_snp_enabled,
1190                         )
1191                     })
1192                     .map_err(Error::KernelLoadThreadSpawn)
1193             })
1194             .transpose()
1195     }
1196 
1197     #[cfg(target_arch = "x86_64")]
1198     fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
1199         trace_scoped!("configure_system");
1200         info!("Configuring system");
1201         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1202 
1203         let initramfs_config = match self.initramfs {
1204             Some(_) => Some(self.load_initramfs(&mem)?),
1205             None => None,
1206         };
1207 
1208         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1209         let rsdp_addr = Some(rsdp_addr);
1210         let sgx_epc_region = self
1211             .memory_manager
1212             .lock()
1213             .unwrap()
1214             .sgx_epc_region()
1215             .as_ref()
1216             .cloned();
1217 
1218         let serial_number = self
1219             .config
1220             .lock()
1221             .unwrap()
1222             .platform
1223             .as_ref()
1224             .and_then(|p| p.serial_number.clone());
1225 
1226         let uuid = self
1227             .config
1228             .lock()
1229             .unwrap()
1230             .platform
1231             .as_ref()
1232             .and_then(|p| p.uuid.clone());
1233 
1234         let oem_strings = self
1235             .config
1236             .lock()
1237             .unwrap()
1238             .platform
1239             .as_ref()
1240             .and_then(|p| p.oem_strings.clone());
1241 
1242         let oem_strings = oem_strings
1243             .as_deref()
1244             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1245 
1246         let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1247 
1248         arch::configure_system(
1249             &mem,
1250             arch::layout::CMDLINE_START,
1251             arch::layout::CMDLINE_MAX_SIZE,
1252             &initramfs_config,
1253             boot_vcpus,
1254             entry_addr.setup_header,
1255             rsdp_addr,
1256             sgx_epc_region,
1257             serial_number.as_deref(),
1258             uuid.as_deref(),
1259             oem_strings.as_deref(),
1260             topology,
1261         )
1262         .map_err(Error::ConfigureSystem)?;
1263         Ok(())
1264     }
1265 
1266     #[cfg(target_arch = "aarch64")]
1267     fn configure_system(
1268         &mut self,
1269         _rsdp_addr: GuestAddress,
1270         _entry_addr: EntryPoint,
1271     ) -> Result<()> {
1272         let cmdline = Self::generate_cmdline(
1273             self.config.lock().unwrap().payload.as_ref().unwrap(),
1274             &self.device_manager,
1275         )?;
1276         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1277         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1278         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1279         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1280         let initramfs_config = match self.initramfs {
1281             Some(_) => Some(self.load_initramfs(&mem)?),
1282             None => None,
1283         };
1284 
1285         let device_info = &self
1286             .device_manager
1287             .lock()
1288             .unwrap()
1289             .get_device_info()
1290             .clone();
1291 
1292         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1293             let pci_space = PciSpaceInfo {
1294                 pci_segment_id: pci_segment.id,
1295                 mmio_config_address: pci_segment.mmio_config_address,
1296                 pci_device_space_start: pci_segment.start_of_mem64_area,
1297                 pci_device_space_size: pci_segment.end_of_mem64_area
1298                     - pci_segment.start_of_mem64_area
1299                     + 1,
1300             };
1301             pci_space_info.push(pci_space);
1302         }
1303 
1304         let virtio_iommu_bdf = self
1305             .device_manager
1306             .lock()
1307             .unwrap()
1308             .iommu_attached_devices()
1309             .as_ref()
1310             .map(|(v, _)| *v);
1311 
1312         let vgic = self
1313             .device_manager
1314             .lock()
1315             .unwrap()
1316             .get_interrupt_controller()
1317             .unwrap()
1318             .lock()
1319             .unwrap()
1320             .get_vgic()
1321             .map_err(|_| {
1322                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1323                     arch::aarch64::Error::SetupGic,
1324                 ))
1325             })?;
1326 
1327         // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
1328         let pmu_supported = self
1329             .cpu_manager
1330             .lock()
1331             .unwrap()
1332             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1333             .map_err(|_| {
1334                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1335                     arch::aarch64::Error::VcpuInitPmu,
1336                 ))
1337             })?;
1338 
1339         arch::configure_system(
1340             &mem,
1341             cmdline.as_cstring().unwrap().to_str().unwrap(),
1342             vcpu_mpidrs,
1343             vcpu_topology,
1344             device_info,
1345             &initramfs_config,
1346             &pci_space_info,
1347             virtio_iommu_bdf.map(|bdf| bdf.into()),
1348             &vgic,
1349             &self.numa_nodes,
1350             pmu_supported,
1351         )
1352         .map_err(Error::ConfigureSystem)?;
1353 
1354         Ok(())
1355     }
1356 
    /// Returns the console resize pipe reported by the device manager, if any.
    ///
    /// The device manager mutex is locked for the duration of the call.
    pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
        self.device_manager.lock().unwrap().console_resize_pipe()
    }
1360 
1361     pub fn shutdown(&mut self) -> Result<()> {
1362         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1363         let new_state = VmState::Shutdown;
1364 
1365         state.valid_transition(new_state)?;
1366 
1367         // Wake up the DeviceManager threads so they will get terminated cleanly
1368         self.device_manager
1369             .lock()
1370             .unwrap()
1371             .resume()
1372             .map_err(Error::Resume)?;
1373 
1374         self.cpu_manager
1375             .lock()
1376             .unwrap()
1377             .shutdown()
1378             .map_err(Error::CpuManager)?;
1379 
1380         // Wait for all the threads to finish
1381         for thread in self.threads.drain(..) {
1382             thread.join().map_err(Error::ThreadCleanup)?
1383         }
1384         *state = new_state;
1385 
1386         Ok(())
1387     }
1388 
1389     pub fn resize(
1390         &mut self,
1391         desired_vcpus: Option<u8>,
1392         desired_memory: Option<u64>,
1393         desired_balloon: Option<u64>,
1394     ) -> Result<()> {
1395         event!("vm", "resizing");
1396 
1397         if let Some(desired_vcpus) = desired_vcpus {
1398             if self
1399                 .cpu_manager
1400                 .lock()
1401                 .unwrap()
1402                 .resize(desired_vcpus)
1403                 .map_err(Error::CpuManager)?
1404             {
1405                 self.device_manager
1406                     .lock()
1407                     .unwrap()
1408                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1409                     .map_err(Error::DeviceManager)?;
1410             }
1411             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1412         }
1413 
1414         if let Some(desired_memory) = desired_memory {
1415             let new_region = self
1416                 .memory_manager
1417                 .lock()
1418                 .unwrap()
1419                 .resize(desired_memory)
1420                 .map_err(Error::MemoryManager)?;
1421 
1422             let memory_config = &mut self.config.lock().unwrap().memory;
1423 
1424             if let Some(new_region) = &new_region {
1425                 self.device_manager
1426                     .lock()
1427                     .unwrap()
1428                     .update_memory(new_region)
1429                     .map_err(Error::DeviceManager)?;
1430 
1431                 match memory_config.hotplug_method {
1432                     HotplugMethod::Acpi => {
1433                         self.device_manager
1434                             .lock()
1435                             .unwrap()
1436                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1437                             .map_err(Error::DeviceManager)?;
1438                     }
1439                     HotplugMethod::VirtioMem => {}
1440                 }
1441             }
1442 
1443             // We update the VM config regardless of the actual guest resize
1444             // operation result (happened or not), so that if the VM reboots
1445             // it will be running with the last configure memory size.
1446             match memory_config.hotplug_method {
1447                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1448                 HotplugMethod::VirtioMem => {
1449                     if desired_memory > memory_config.size {
1450                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1451                     } else {
1452                         memory_config.hotplugged_size = None;
1453                     }
1454                 }
1455             }
1456         }
1457 
1458         if let Some(desired_balloon) = desired_balloon {
1459             self.device_manager
1460                 .lock()
1461                 .unwrap()
1462                 .resize_balloon(desired_balloon)
1463                 .map_err(Error::DeviceManager)?;
1464 
1465             // Update the configuration value for the balloon size to ensure
1466             // a reboot would use the right value.
1467             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1468                 balloon_config.size = desired_balloon;
1469             }
1470         }
1471 
1472         event!("vm", "resized");
1473 
1474         Ok(())
1475     }
1476 
1477     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1478         let memory_config = &mut self.config.lock().unwrap().memory;
1479 
1480         if let Some(zones) = &mut memory_config.zones {
1481             for zone in zones.iter_mut() {
1482                 if zone.id == id {
1483                     if desired_memory >= zone.size {
1484                         let hotplugged_size = desired_memory - zone.size;
1485                         self.memory_manager
1486                             .lock()
1487                             .unwrap()
1488                             .resize_zone(&id, desired_memory - zone.size)
1489                             .map_err(Error::MemoryManager)?;
1490                         // We update the memory zone config regardless of the
1491                         // actual 'resize-zone' operation result (happened or
1492                         // not), so that if the VM reboots it will be running
1493                         // with the last configured memory zone size.
1494                         zone.hotplugged_size = Some(hotplugged_size);
1495 
1496                         return Ok(());
1497                     } else {
1498                         error!(
1499                             "Invalid to ask less ({}) than boot RAM ({}) for \
1500                             this memory zone",
1501                             desired_memory, zone.size,
1502                         );
1503                         return Err(Error::ResizeZone);
1504                     }
1505                 }
1506             }
1507         }
1508 
1509         error!("Could not find the memory zone {} for the resize", id);
1510         Err(Error::ResizeZone)
1511     }
1512 
1513     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1514         let pci_device_info = self
1515             .device_manager
1516             .lock()
1517             .unwrap()
1518             .add_device(&mut device_cfg)
1519             .map_err(Error::DeviceManager)?;
1520 
1521         // Update VmConfig by adding the new device. This is important to
1522         // ensure the device would be created in case of a reboot.
1523         {
1524             let mut config = self.config.lock().unwrap();
1525             add_to_config(&mut config.devices, device_cfg);
1526         }
1527 
1528         self.device_manager
1529             .lock()
1530             .unwrap()
1531             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1532             .map_err(Error::DeviceManager)?;
1533 
1534         Ok(pci_device_info)
1535     }
1536 
1537     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1538         let pci_device_info = self
1539             .device_manager
1540             .lock()
1541             .unwrap()
1542             .add_user_device(&mut device_cfg)
1543             .map_err(Error::DeviceManager)?;
1544 
1545         // Update VmConfig by adding the new device. This is important to
1546         // ensure the device would be created in case of a reboot.
1547         {
1548             let mut config = self.config.lock().unwrap();
1549             add_to_config(&mut config.user_devices, device_cfg);
1550         }
1551 
1552         self.device_manager
1553             .lock()
1554             .unwrap()
1555             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1556             .map_err(Error::DeviceManager)?;
1557 
1558         Ok(pci_device_info)
1559     }
1560 
    /// Removes the device identified by `id`.
    ///
    /// The device is first removed through the device manager, then dropped
    /// from the VM config, and finally the guest is notified with
    /// `PCI_DEVICES_CHANGED`. A failure of the first step leaves the VM config
    /// untouched.
    pub fn remove_device(&mut self, id: String) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .remove_device(id.clone())
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by removing the device. This is important to
        // ensure the device would not be created in case of a reboot.
        self.config.lock().unwrap().remove_device(&id);

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;
        Ok(())
    }
1579 
1580     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1581         let pci_device_info = self
1582             .device_manager
1583             .lock()
1584             .unwrap()
1585             .add_disk(&mut disk_cfg)
1586             .map_err(Error::DeviceManager)?;
1587 
1588         // Update VmConfig by adding the new device. This is important to
1589         // ensure the device would be created in case of a reboot.
1590         {
1591             let mut config = self.config.lock().unwrap();
1592             add_to_config(&mut config.disks, disk_cfg);
1593         }
1594 
1595         self.device_manager
1596             .lock()
1597             .unwrap()
1598             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1599             .map_err(Error::DeviceManager)?;
1600 
1601         Ok(pci_device_info)
1602     }
1603 
1604     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1605         let pci_device_info = self
1606             .device_manager
1607             .lock()
1608             .unwrap()
1609             .add_fs(&mut fs_cfg)
1610             .map_err(Error::DeviceManager)?;
1611 
1612         // Update VmConfig by adding the new device. This is important to
1613         // ensure the device would be created in case of a reboot.
1614         {
1615             let mut config = self.config.lock().unwrap();
1616             add_to_config(&mut config.fs, fs_cfg);
1617         }
1618 
1619         self.device_manager
1620             .lock()
1621             .unwrap()
1622             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1623             .map_err(Error::DeviceManager)?;
1624 
1625         Ok(pci_device_info)
1626     }
1627 
1628     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1629         let pci_device_info = self
1630             .device_manager
1631             .lock()
1632             .unwrap()
1633             .add_pmem(&mut pmem_cfg)
1634             .map_err(Error::DeviceManager)?;
1635 
1636         // Update VmConfig by adding the new device. This is important to
1637         // ensure the device would be created in case of a reboot.
1638         {
1639             let mut config = self.config.lock().unwrap();
1640             add_to_config(&mut config.pmem, pmem_cfg);
1641         }
1642 
1643         self.device_manager
1644             .lock()
1645             .unwrap()
1646             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1647             .map_err(Error::DeviceManager)?;
1648 
1649         Ok(pci_device_info)
1650     }
1651 
1652     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1653         let pci_device_info = self
1654             .device_manager
1655             .lock()
1656             .unwrap()
1657             .add_net(&mut net_cfg)
1658             .map_err(Error::DeviceManager)?;
1659 
1660         // Update VmConfig by adding the new device. This is important to
1661         // ensure the device would be created in case of a reboot.
1662         {
1663             let mut config = self.config.lock().unwrap();
1664             add_to_config(&mut config.net, net_cfg);
1665         }
1666 
1667         self.device_manager
1668             .lock()
1669             .unwrap()
1670             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1671             .map_err(Error::DeviceManager)?;
1672 
1673         Ok(pci_device_info)
1674     }
1675 
1676     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1677         let pci_device_info = self
1678             .device_manager
1679             .lock()
1680             .unwrap()
1681             .add_vdpa(&mut vdpa_cfg)
1682             .map_err(Error::DeviceManager)?;
1683 
1684         // Update VmConfig by adding the new device. This is important to
1685         // ensure the device would be created in case of a reboot.
1686         {
1687             let mut config = self.config.lock().unwrap();
1688             add_to_config(&mut config.vdpa, vdpa_cfg);
1689         }
1690 
1691         self.device_manager
1692             .lock()
1693             .unwrap()
1694             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1695             .map_err(Error::DeviceManager)?;
1696 
1697         Ok(pci_device_info)
1698     }
1699 
1700     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1701         let pci_device_info = self
1702             .device_manager
1703             .lock()
1704             .unwrap()
1705             .add_vsock(&mut vsock_cfg)
1706             .map_err(Error::DeviceManager)?;
1707 
1708         // Update VmConfig by adding the new device. This is important to
1709         // ensure the device would be created in case of a reboot.
1710         {
1711             let mut config = self.config.lock().unwrap();
1712             config.vsock = Some(vsock_cfg);
1713         }
1714 
1715         self.device_manager
1716             .lock()
1717             .unwrap()
1718             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1719             .map_err(Error::DeviceManager)?;
1720 
1721         Ok(pci_device_info)
1722     }
1723 
1724     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1725         Ok(self.device_manager.lock().unwrap().counters())
1726     }
1727 
1728     #[cfg(feature = "tdx")]
1729     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1730         use arch::x86_64::tdx::*;
1731 
1732         let firmware_path = self
1733             .config
1734             .lock()
1735             .unwrap()
1736             .payload
1737             .as_ref()
1738             .unwrap()
1739             .firmware
1740             .clone()
1741             .ok_or(Error::TdxFirmwareMissing)?;
1742         // The TDVF file contains a table of section as well as code
1743         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1744 
1745         // For all the sections allocate some RAM backing them
1746         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1747     }
1748 
    /// Builds the list of memory resources to describe in the HOB by walking
    /// the guest memory regions and interleaving the given sections with them.
    ///
    /// Each returned tuple is `(start, size, ram)`: `ram` is `true` for a
    /// range of guest RAM and `false` for a range taken from a section.
    ///
    /// `sorted_sections` is consumed from the back with `pop()`, so the
    /// section that should be handled first must be the last element.
    /// NOTE(review): the in-file caller sorts by address and then reverses,
    /// i.e. passes the sections in descending address order — confirm that
    /// any other caller does the same.
    #[cfg(feature = "tdx")]
    fn hob_memory_resources(
        mut sorted_sections: Vec<TdvfSection>,
        guest_memory: &GuestMemoryMmap,
    ) -> Vec<(u64, u64, bool)> {
        let mut list = Vec::new();

        // Next section waiting to be emitted, if any.
        let mut current_section = sorted_sections.pop();

        // RAM regions interleaved with TDVF sections
        // `next_start_addr` is the first address that has not been described
        // by an entry of `list` yet.
        let mut next_start_addr = 0;
        for region in guest_memory.iter() {
            let region_start = region.start_addr().0;
            let region_end = region.last_addr().0;
            // Skip any hole between the previous entry and this region.
            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            loop {
                let (start, size, ram) = if let Some(section) = &current_section {
                    if section.address <= next_start_addr {
                        // The pending section starts at or before the
                        // cursor: emit it as a non-RAM entry.
                        (section.address, section.size, false)
                    } else {
                        // Emit RAM from the cursor up to just before the
                        // pending section, or up to the end of the region,
                        // whichever comes first.
                        let last_addr = std::cmp::min(section.address - 1, region_end);
                        (next_start_addr, last_addr - next_start_addr + 1, true)
                    }
                } else {
                    // No section left: the rest of the region is RAM.
                    (next_start_addr, region_end - next_start_addr + 1, true)
                };

                list.push((start, size, ram));

                // A section was consumed only if the entry is not RAM.
                if !ram {
                    current_section = sorted_sections.pop();
                }

                next_start_addr = start + size;

                // A section emitted from below the region must not pull the
                // cursor back before the region start.
                if region_start > next_start_addr {
                    next_start_addr = region_start;
                }

                // The whole region has been described.
                if next_start_addr > region_end {
                    break;
                }
            }
        }

        // Once all the interleaved sections have been processed, let's simply
        // pull the remaining ones.
        if let Some(section) = current_section {
            list.push((section.address, section.size, false));
        }
        while let Some(section) = sorted_sections.pop() {
            list.push((section.address, section.size, false));
        }

        list
    }
1808 
    /// Writes the content of the TDVF sections into guest memory and builds
    /// the HOB.
    ///
    /// The steps are, in order:
    /// 1. add a RAM region for every section whose start address is not
    ///    already inside the boot guest memory;
    /// 2. copy the firmware volumes, the payload and the payload parameters
    ///    (command line) to the addresses given by their sections;
    /// 3. build the HOB at the address of the `TdHob` section: memory
    ///    resources, MMIO resources, ACPI tables and, if a payload was
    ///    copied, the payload info.
    ///
    /// Returns the address of the `TdHob` section wrapped in `Some`.
    ///
    /// # Errors
    ///
    /// Returns `Error::AllocatingTdvfMemory`, `Error::TdxFirmwareMissing`,
    /// `Error::LoadTdvf`, `Error::LoadPayload`, `Error::InvalidPayloadType`,
    /// `Error::PopulateHob`, or whatever `generate_cmdline` reports.
    ///
    /// # Panics
    ///
    /// NOTE(review): several steps `unwrap()` instead of returning an error:
    /// a missing payload configuration, a failing copy into guest memory, a
    /// failing read of the payload header, a command line that cannot be
    /// turned into a C string, and — most notably — the absence of a `TdHob`
    /// section (`hob_offset.unwrap()`).
    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(
        &mut self,
        sections: &[TdvfSection],
        guid_found: bool,
    ) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            // (only the start address of the section is checked here).
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of sections as well as code
        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut payload_info = None;
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    // Copy `data_size` bytes found at `data_offset` in the
                    // firmware file to the section address.
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    mem.read_volatile_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    // Only remember where the HOB goes; it is built below.
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    // Nothing is copied when no kernel file is set.
                    if let Some(payload_file) = self.kernel.as_mut() {
                        // Seeking to the end yields the size of the file.
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        // 0x1f1 is where the setup header is read from.
                        // NOTE(review): this matches the header offset of the
                        // Linux x86 boot protocol — confirm.
                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_file
                            .read_volatile(&mut payload_header.as_bytes())
                            .unwrap();

                        // 0x5372_6448 is "HdrS" read as a little-endian u32.
                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Reject headers older than version 0x0200 or with
                        // bit 0 of `loadflags` cleared.
                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Copy the whole file, from its beginning, to the
                        // section address.
                        payload_file.rewind().map_err(Error::LoadPayload)?;
                        mem.read_volatile_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    // The command line is written as a NUL-terminated string.
                    info!("Copying payload parameters to guest memory");
                    let cmdline = Self::generate_cmdline(
                        self.config.lock().unwrap().payload.as_ref().unwrap(),
                    )?;
                    mem.write_slice(
                        cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                        GuestAddress(section.address),
                    )
                    .unwrap();
                }
                _ => {}
            }
        }

        // Generate HOB
        let mut hob = TdHob::start(hob_offset.unwrap());

        // Only the TempMem sections are interleaved with the RAM regions.
        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        // Descending address order: hob_memory_resources() pops from the
        // back, so the lowest address is handled first.
        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();

        for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
            hob.add_memory_resource(&mem, start, size, ram, guid_found)
                .map_err(Error::PopulateHob)?;
        }

        // MMIO regions
        // First the range from MEM_32BIT_DEVICES_START up to APIC_START...
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        // ...then the device area reported by the memory manager.
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.

        for acpi_table in crate::acpi::create_acpi_tables_tdx(
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        ) {
            hob.add_acpi_table(&mem, acpi_table.as_slice())
                .map_err(Error::PopulateHob)?;
        }

        // If a payload info has been created, let's insert it into the HOB.
        if let Some(payload_info) = payload_info {
            hob.add_payload(&mem, payload_info)
                .map_err(Error::PopulateHob)?;
        }

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }
1998 
1999     #[cfg(feature = "tdx")]
2000     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
2001         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2002         let mem = guest_memory.memory();
2003 
2004         for section in sections {
2005             self.vm
2006                 .tdx_init_memory_region(
2007                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
2008                     section.address,
2009                     section.size,
2010                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
2011                     section.attributes == 1,
2012                 )
2013                 .map_err(Error::InitializeTdxMemoryRegion)?;
2014         }
2015 
2016         Ok(())
2017     }
2018 
2019     // Creates ACPI tables
2020     // In case of TDX being used, this is a no-op since the tables will be
2021     // created and passed when populating the HOB.
2022 
2023     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2024         #[cfg(feature = "tdx")]
2025         if self.config.lock().unwrap().is_tdx_enabled() {
2026             return None;
2027         }
2028         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2029         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2030         let rsdp_addr = crate::acpi::create_acpi_tables(
2031             &mem,
2032             &self.device_manager,
2033             &self.cpu_manager,
2034             &self.memory_manager,
2035             &self.numa_nodes,
2036             tpm_enabled,
2037         );
2038         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2039 
2040         Some(rsdp_addr)
2041     }
2042 
2043     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2044         trace_scoped!("entry_point");
2045 
2046         self.load_payload_handle
2047             .take()
2048             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2049             .transpose()
2050     }
2051 
    /// Boots the VM, or resumes it if it is currently paused.
    ///
    /// For a VM that is not paused the sequence is:
    /// 1. validate the transition to `Running` (or `BreakPoint` when
    ///    `stop_on_boot` is set);
    /// 2. on x86_64, create the ACPI tables early (skipped for SEV-SNP);
    /// 3. wait for the payload to be loaded and configure every vCPU;
    /// 4. with TDX enabled, extract and populate the TDVF sections;
    /// 5. on aarch64, create the ACPI tables now that vCPUs are configured;
    /// 6. configure the system for the loaded entry point, if any;
    /// 7. on x86_64, allocate the address space;
    /// 8. with TDX enabled, finish the TDX setup;
    /// 9. resume the hypervisor VM if the state was `Created`, start the
    ///    boot vCPUs and record the new state.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by any of the steps above, and
    /// `Error::PoisonedState` if the state lock cannot be acquired.
    pub fn boot(&mut self) -> Result<()> {
        trace_scoped!("Vm::boot");
        let current_state = self.get_state()?;
        // Booting a paused VM simply means resuming it.
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        let new_state = if self.stop_on_boot {
            VmState::BreakPoint
        } else {
            VmState::Running
        };
        current_state.valid_transition(new_state)?;

        // Do earlier to parallelise with loading kernel
        #[cfg(target_arch = "x86_64")]
        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
                let rsdp_addr = if sev_snp_enabled {
                    // In case of SEV-SNP guest ACPI tables are provided via
                    // IGVM. So skip the creation of ACPI tables and set the
                    // rsdp addr to None.
                    None
                } else {
                    self.create_acpi_tables()
                };
            } else {
                let rsdp_addr = self.create_acpi_tables();
            }
        }

        // Load kernel synchronously or if asynchronous then wait for load to
        // finish.
        let entry_point = self.entry_point()?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        // Configure the vcpus that have been created
        let vcpus = self.cpu_manager.lock().unwrap().vcpus();
        for vcpu in vcpus {
            let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
            // The boot setup is only passed along when a payload was loaded.
            let boot_setup = entry_point.map(|e| (e, guest_memory));
            self.cpu_manager
                .lock()
                .unwrap()
                .configure_vcpu(vcpu, boot_setup)
                .map_err(Error::CpuManager)?;
        }

        #[cfg(feature = "tdx")]
        let (sections, guid_found) = if tdx_enabled {
            self.extract_tdvf_sections()?
        } else {
            (Vec::new(), false)
        };

        // Configuring the TDX regions requires that the vCPUs are created.
        #[cfg(feature = "tdx")]
        let hob_address = if tdx_enabled {
            // TDX sections are written to memory.
            self.populate_tdx_sections(&sections, guid_found)?
        } else {
            None
        };

        // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
        // available after they are configured
        #[cfg(target_arch = "aarch64")]
        let rsdp_addr = self.create_acpi_tables();

        // Configure shared state based on loaded kernel
        entry_point
            .map(|entry_point| {
                // Safe to unwrap rsdp_addr as we know it can't be None when
                // the entry_point is Some.
                self.configure_system(rsdp_addr.unwrap(), entry_point)
            })
            .transpose()?;

        #[cfg(target_arch = "x86_64")]
        // Note: For x86, always call this function before invoking start boot vcpus.
        // Otherwise guest would fail to boot because we haven't created the
        // userspace mappings to update the hypervisor about the memory mappings.
        // These mappings must be created before we start the vCPU threads for
        // the very first time.
        self.memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            // Let the hypervisor know which memory ranges are shared with the
            // guest. This prevents the guest from ignoring/discarding memory
            // regions provided by the host.
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured TDX setup is complete
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        // Resume the vm for MSHV
        if current_state == VmState::Created {
            self.vm.resume().map_err(Error::ResumeVm)?;
        }

        // The vCPUs are asked to stop right away when booting into a
        // breakpoint.
        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus(new_state == VmState::BreakPoint)
            .map_err(Error::CpuManager)?;

        // Only record the new state once everything above has succeeded.
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        Ok(())
    }
2177 
2178     pub fn restore(&mut self) -> Result<()> {
2179         event!("vm", "restoring");
2180 
2181         #[cfg(target_arch = "x86_64")]
2182         // Note: For x86, always call this function before invoking start boot vcpus.
2183         // Otherwise guest would fail to boot because we haven't created the
2184         // userspace mappings to update the hypervisor about the memory mappings.
2185         // These mappings must be created before we start the vCPU threads for
2186         // the very first time for the restored VM.
2187         self.memory_manager
2188             .lock()
2189             .unwrap()
2190             .allocate_address_space()
2191             .map_err(Error::MemoryManager)?;
2192 
2193         // Now we can start all vCPUs from here.
2194         self.cpu_manager
2195             .lock()
2196             .unwrap()
2197             .start_restored_vcpus()
2198             .map_err(Error::CpuManager)?;
2199 
2200         event!("vm", "restored");
2201         Ok(())
2202     }
2203 
2204     /// Gets a thread-safe reference counted pointer to the VM configuration.
2205     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2206         Arc::clone(&self.config)
2207     }
2208 
2209     /// Get the VM state. Returns an error if the state is poisoned.
2210     pub fn get_state(&self) -> Result<VmState> {
2211         self.state
2212             .try_read()
2213             .map_err(|_| Error::PoisonedState)
2214             .map(|state| *state)
2215     }
2216 
2217     /// Gets the actual size of the balloon.
2218     pub fn balloon_size(&self) -> u64 {
2219         self.device_manager.lock().unwrap().balloon_size()
2220     }
2221 
2222     pub fn send_memory_fds(
2223         &mut self,
2224         socket: &mut UnixStream,
2225     ) -> std::result::Result<(), MigratableError> {
2226         for (slot, fd) in self
2227             .memory_manager
2228             .lock()
2229             .unwrap()
2230             .memory_slot_fds()
2231             .drain()
2232         {
2233             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2234                 .write_to(socket)
2235                 .map_err(|e| {
2236                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2237                 })?;
2238             socket
2239                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2240                 .map_err(|e| {
2241                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2242                 })?;
2243 
2244             Response::read_from(socket)?.ok_or_abandon(
2245                 socket,
2246                 MigratableError::MigrateSend(anyhow!("Error during memory fd migration")),
2247             )?;
2248         }
2249 
2250         Ok(())
2251     }
2252 
2253     pub fn send_memory_regions<F>(
2254         &mut self,
2255         ranges: &MemoryRangeTable,
2256         fd: &mut F,
2257     ) -> std::result::Result<(), MigratableError>
2258     where
2259         F: WriteVolatile,
2260     {
2261         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2262         let mem = guest_memory.memory();
2263 
2264         for range in ranges.regions() {
2265             let mut offset: u64 = 0;
2266             // Here we are manually handling the retry in case we can't the
2267             // whole region at once because we can't use the implementation
2268             // from vm-memory::GuestMemory of write_all_to() as it is not
2269             // following the correct behavior. For more info about this issue
2270             // see: https://github.com/rust-vmm/vm-memory/issues/174
2271             loop {
2272                 let bytes_written = mem
2273                     .write_volatile_to(
2274                         GuestAddress(range.gpa + offset),
2275                         fd,
2276                         (range.length - offset) as usize,
2277                     )
2278                     .map_err(|e| {
2279                         MigratableError::MigrateSend(anyhow!(
2280                             "Error transferring memory to socket: {}",
2281                             e
2282                         ))
2283                     })?;
2284                 offset += bytes_written as u64;
2285 
2286                 if offset == range.length {
2287                     break;
2288                 }
2289             }
2290         }
2291 
2292         Ok(())
2293     }
2294 
2295     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2296         self.memory_manager
2297             .lock()
2298             .unwrap()
2299             .memory_range_table(false)
2300     }
2301 
2302     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2303         self.device_manager.lock().unwrap().device_tree()
2304     }
2305 
2306     pub fn activate_virtio_devices(&self) -> Result<()> {
2307         self.device_manager
2308             .lock()
2309             .unwrap()
2310             .activate_virtio_devices()
2311             .map_err(Error::ActivateVirtioDevices)
2312     }
2313 
2314     #[cfg(target_arch = "x86_64")]
2315     pub fn power_button(&self) -> Result<()> {
2316         return self
2317             .device_manager
2318             .lock()
2319             .unwrap()
2320             .notify_power_button()
2321             .map_err(Error::PowerButton);
2322     }
2323 
2324     #[cfg(target_arch = "aarch64")]
2325     pub fn power_button(&self) -> Result<()> {
2326         self.device_manager
2327             .lock()
2328             .unwrap()
2329             .notify_power_button()
2330             .map_err(Error::PowerButton)
2331     }
2332 
2333     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2334         self.memory_manager.lock().unwrap().snapshot_data()
2335     }
2336 
2337     #[cfg(feature = "guest_debug")]
2338     pub fn debug_request(
2339         &mut self,
2340         gdb_request: &GdbRequestPayload,
2341         cpu_id: usize,
2342     ) -> Result<GdbResponsePayload> {
2343         use GdbRequestPayload::*;
2344         match gdb_request {
2345             SetSingleStep(single_step) => {
2346                 self.set_guest_debug(cpu_id, &[], *single_step)
2347                     .map_err(Error::Debug)?;
2348             }
2349             SetHwBreakPoint(addrs) => {
2350                 self.set_guest_debug(cpu_id, addrs, false)
2351                     .map_err(Error::Debug)?;
2352             }
2353             Pause => {
2354                 self.debug_pause().map_err(Error::Debug)?;
2355             }
2356             Resume => {
2357                 self.debug_resume().map_err(Error::Debug)?;
2358             }
2359             ReadRegs => {
2360                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2361                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2362             }
2363             WriteRegs(regs) => {
2364                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2365             }
2366             ReadMem(vaddr, len) => {
2367                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2368                 let mem = self
2369                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2370                     .map_err(Error::Debug)?;
2371                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2372             }
2373             WriteMem(vaddr, data) => {
2374                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2375                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2376                     .map_err(Error::Debug)?;
2377             }
2378             ActiveVcpus => {
2379                 let active_vcpus = self.active_vcpus();
2380                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2381             }
2382         }
2383         Ok(GdbResponsePayload::CommandComplete)
2384     }
2385 
2386     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2387     fn get_dump_state(
2388         &mut self,
2389         destination_url: &str,
2390     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2391         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2392         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2393         let mut elf_phdr_num = 1;
2394         let elf_sh_info = 0;
2395         let coredump_file_path = url_to_file(destination_url)?;
2396         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2397 
2398         if mapping_num < UINT16_MAX - 2 {
2399             elf_phdr_num += mapping_num as u16;
2400         } else {
2401             panic!("mapping num beyond 65535 not supported");
2402         }
2403         let coredump_file = OpenOptions::new()
2404             .read(true)
2405             .write(true)
2406             .create_new(true)
2407             .open(coredump_file_path)
2408             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2409 
2410         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2411         let mem_data = self
2412             .memory_manager
2413             .lock()
2414             .unwrap()
2415             .coredump_memory_regions(mem_offset);
2416 
2417         Ok(DumpState {
2418             elf_note_size,
2419             elf_phdr_num,
2420             elf_sh_info,
2421             mem_offset,
2422             mem_info: Some(mem_data),
2423             file: Some(coredump_file),
2424         })
2425     }
2426 
2427     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2428     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2429         size_of::<elf::Elf64_Ehdr>() as u64
2430             + note_size as u64
2431             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2432     }
2433 
2434     pub fn nmi(&self) -> Result<()> {
2435         return self
2436             .cpu_manager
2437             .lock()
2438             .unwrap()
2439             .nmi()
2440             .map_err(|_| Error::ErrorNmi);
2441     }
2442 }
2443 
2444 impl Pausable for Vm {
2445     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2446         event!("vm", "pausing");
2447         let mut state = self
2448             .state
2449             .try_write()
2450             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2451         let new_state = VmState::Paused;
2452 
2453         state
2454             .valid_transition(new_state)
2455             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2456 
2457         #[cfg(target_arch = "x86_64")]
2458         {
2459             let mut clock = self
2460                 .vm
2461                 .get_clock()
2462                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2463             clock.reset_flags();
2464             self.saved_clock = Some(clock);
2465         }
2466 
2467         // Before pausing the vCPUs activate any pending virtio devices that might
2468         // need activation between starting the pause (or e.g. a migration it's part of)
2469         self.activate_virtio_devices().map_err(|e| {
2470             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2471         })?;
2472 
2473         self.cpu_manager.lock().unwrap().pause()?;
2474         self.device_manager.lock().unwrap().pause()?;
2475 
2476         self.vm
2477             .pause()
2478             .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?;
2479 
2480         *state = new_state;
2481 
2482         event!("vm", "paused");
2483         Ok(())
2484     }
2485 
2486     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2487         event!("vm", "resuming");
2488         let current_state = self.get_state().unwrap();
2489         let mut state = self
2490             .state
2491             .try_write()
2492             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2493         let new_state = VmState::Running;
2494 
2495         state
2496             .valid_transition(new_state)
2497             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2498 
2499         self.cpu_manager.lock().unwrap().resume()?;
2500         #[cfg(target_arch = "x86_64")]
2501         {
2502             if let Some(clock) = &self.saved_clock {
2503                 self.vm.set_clock(clock).map_err(|e| {
2504                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2505                 })?;
2506             }
2507         }
2508 
2509         if current_state == VmState::Paused {
2510             self.vm
2511                 .resume()
2512                 .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?;
2513         }
2514 
2515         self.device_manager.lock().unwrap().resume()?;
2516 
2517         // And we're back to the Running state.
2518         *state = new_state;
2519         event!("vm", "resumed");
2520         Ok(())
2521     }
2522 }
2523 
/// VM-level state serialized into a snapshot by `Vm::snapshot()`, next to the
/// CPU, memory and device manager snapshots it attaches.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    /// Clock data; `Vm::snapshot()` fills this from the VM's `saved_clock`.
    #[cfg(target_arch = "x86_64")]
    pub clock: Option<hypervisor::ClockData>,
    /// CPUID entries produced by `arch::generate_common_cpuid()` at the time
    /// the snapshot is taken.
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}
2531 
/// Identifier returned by the `Snapshottable::id()` implementation of `Vm`.
pub const VM_SNAPSHOT_ID: &str = "vm";
2533 impl Snapshottable for Vm {
2534     fn id(&self) -> String {
2535         VM_SNAPSHOT_ID.to_string()
2536     }
2537 
2538     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2539         event!("vm", "snapshotting");
2540 
2541         #[cfg(feature = "tdx")]
2542         {
2543             if self.config.lock().unwrap().is_tdx_enabled() {
2544                 return Err(MigratableError::Snapshot(anyhow!(
2545                     "Snapshot not possible with TDX VM"
2546                 )));
2547             }
2548         }
2549 
2550         let current_state = self.get_state().unwrap();
2551         if current_state != VmState::Paused {
2552             return Err(MigratableError::Snapshot(anyhow!(
2553                 "Trying to snapshot while VM is running"
2554             )));
2555         }
2556 
2557         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2558         let common_cpuid = {
2559             let amx = self.config.lock().unwrap().cpus.features.amx;
2560             let phys_bits = physical_bits(
2561                 &self.hypervisor,
2562                 self.config.lock().unwrap().cpus.max_phys_bits,
2563             );
2564             arch::generate_common_cpuid(
2565                 &self.hypervisor,
2566                 &arch::CpuidConfig {
2567                     sgx_epc_sections: None,
2568                     phys_bits,
2569                     kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2570                     #[cfg(feature = "tdx")]
2571                     tdx: false,
2572                     amx,
2573                 },
2574             )
2575             .map_err(|e| {
2576                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2577             })?
2578         };
2579 
2580         let vm_snapshot_state = VmSnapshot {
2581             #[cfg(target_arch = "x86_64")]
2582             clock: self.saved_clock,
2583             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2584             common_cpuid,
2585         };
2586 
2587         let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?;
2588 
2589         let (id, snapshot) = {
2590             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2591             (cpu_manager.id(), cpu_manager.snapshot()?)
2592         };
2593         vm_snapshot.add_snapshot(id, snapshot);
2594         let (id, snapshot) = {
2595             let mut memory_manager = self.memory_manager.lock().unwrap();
2596             (memory_manager.id(), memory_manager.snapshot()?)
2597         };
2598         vm_snapshot.add_snapshot(id, snapshot);
2599         let (id, snapshot) = {
2600             let mut device_manager = self.device_manager.lock().unwrap();
2601             (device_manager.id(), device_manager.snapshot()?)
2602         };
2603         vm_snapshot.add_snapshot(id, snapshot);
2604 
2605         event!("vm", "snapshotted");
2606         Ok(vm_snapshot)
2607     }
2608 }
2609 
2610 impl Transportable for Vm {
2611     fn send(
2612         &self,
2613         snapshot: &Snapshot,
2614         destination_url: &str,
2615     ) -> std::result::Result<(), MigratableError> {
2616         let mut snapshot_config_path = url_to_path(destination_url)?;
2617         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2618 
2619         // Create the snapshot config file
2620         let mut snapshot_config_file = OpenOptions::new()
2621             .read(true)
2622             .write(true)
2623             .create_new(true)
2624             .open(snapshot_config_path)
2625             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2626 
2627         // Serialize and write the snapshot config
2628         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2629             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2630 
2631         snapshot_config_file
2632             .write(vm_config.as_bytes())
2633             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2634 
2635         let mut snapshot_state_path = url_to_path(destination_url)?;
2636         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2637 
2638         // Create the snapshot state file
2639         let mut snapshot_state_file = OpenOptions::new()
2640             .read(true)
2641             .write(true)
2642             .create_new(true)
2643             .open(snapshot_state_path)
2644             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2645 
2646         // Serialize and write the snapshot state
2647         let vm_state =
2648             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2649 
2650         snapshot_state_file
2651             .write(&vm_state)
2652             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2653 
2654         // Tell the memory manager to also send/write its own snapshot.
2655         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2656             self.memory_manager
2657                 .lock()
2658                 .unwrap()
2659                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2660         } else {
2661             return Err(MigratableError::Restore(anyhow!(
2662                 "Missing memory manager snapshot"
2663             )));
2664         }
2665 
2666         Ok(())
2667     }
2668 }
2669 
2670 impl Migratable for Vm {
2671     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2672         self.memory_manager.lock().unwrap().start_dirty_log()?;
2673         self.device_manager.lock().unwrap().start_dirty_log()
2674     }
2675 
2676     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2677         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2678         self.device_manager.lock().unwrap().stop_dirty_log()
2679     }
2680 
2681     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2682         Ok(MemoryRangeTable::new_from_tables(vec![
2683             self.memory_manager.lock().unwrap().dirty_log()?,
2684             self.device_manager.lock().unwrap().dirty_log()?,
2685         ]))
2686     }
2687 
2688     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2689         self.memory_manager.lock().unwrap().start_migration()?;
2690         self.device_manager.lock().unwrap().start_migration()
2691     }
2692 
2693     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2694         self.memory_manager.lock().unwrap().complete_migration()?;
2695         self.device_manager.lock().unwrap().complete_migration()
2696     }
2697 }
2698 
// Guest debugging support. Apart from the pause/resume handling, every method
// forwards its arguments unchanged to the CPU manager.
#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Forwards the debug configuration (`addrs`, `singlestep`) for vCPU
    /// `cpu_id` to the CPU manager.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM if it is currently running, then marks it as being in
    /// the `VmState::BreakPoint` state.
    ///
    /// Returns `DebuggableError::Pause` if pausing fails and
    /// `DebuggableError::PoisonedState` if the state lock cannot be taken.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        // The state is overwritten directly here; `valid_transition()` is not
        // consulted for the move to `BreakPoint`.
        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes the VM if it is in the `VmState::BreakPoint` state; does
    /// nothing in any other state.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            // NOTE(review): a resume failure is reported through the `Pause`
            // variant. This looks like a copy/paste slip; confirm whether a
            // dedicated resume variant exists on `DebuggableError`.
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    /// Reads the core registers of vCPU `cpu_id` through the CPU manager.
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    /// Writes `regs` to vCPU `cpu_id` through the CPU manager.
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    /// Reads `len` bytes at `vaddr` from `guest_memory` on behalf of vCPU
    /// `cpu_id`, through the CPU manager.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    /// Writes `data` at `vaddr` into `guest_memory` on behalf of vCPU
    /// `cpu_id`, through the CPU manager.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Returns the number of active vCPUs, or the number of boot vCPUs when
    /// none is active yet.
    fn active_vcpus(&self) -> usize {
        let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            self.cpu_manager.lock().unwrap().boot_vcpus() as usize
        }
    }
}
2782 
2783 #[cfg(feature = "guest_debug")]
/// Largest value representable in a `u16`, widened to `u32` so it can be
/// compared against 32-bit counts.
pub const UINT16_MAX: u32 = u16::MAX as u32;
2785 
// Nothing is overridden here: `Vm` uses the `Elf64Writable` methods exactly as
// the trait defines them.
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}
2788 
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes a coredump of the guest to `destination_url`.
    ///
    /// A running VM is paused for the duration of the dump and resumed
    /// afterwards, whether or not the dump succeeded; a VM that was already
    /// paused is left paused.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` when TDX is enabled (with the
    /// `tdx` feature), when the VM state cannot be read, or when the VM is
    /// neither running nor paused. Returns `GuestDebuggableError::Pause` or
    /// `GuestDebuggableError::Resume` when pausing or resuming fails. Errors
    /// from the individual dump steps are propagated; if both the dump and
    /// the resume fail, the dump error is the one returned.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        // Report a state lookup failure to the caller instead of panicking.
        let current_state = self.get_state().map_err(|e| {
            GuestDebuggableError::Coredump(anyhow!("Could not get VM state: {:?}", e))
        })?;

        // `resume` records whether this call paused the VM and therefore has
        // to resume it again before returning.
        let resume = match current_state {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                true
            }
            VmState::Paused => false,
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        };

        // Run every dump step without returning early, so that a failure
        // cannot skip the resume below and leave the guest paused.
        let dump_result = self
            .get_dump_state(destination_url)
            .and_then(|coredump_state| {
                self.write_header(&coredump_state)?;
                self.write_note(&coredump_state)?;
                self.write_loads(&coredump_state)?;

                self.cpu_manager
                    .lock()
                    .unwrap()
                    .cpu_write_elf64_note(&coredump_state)?;
                self.cpu_manager
                    .lock()
                    .unwrap()
                    .cpu_write_vmm_note(&coredump_state)?;

                self.memory_manager
                    .lock()
                    .unwrap()
                    .coredump_iterate_save_mem(&coredump_state)?;

                Ok(())
            });

        if resume {
            let resume_result = self.resume().map_err(GuestDebuggableError::Resume);
            // A dump failure takes precedence over a resume failure.
            dump_result.and(resume_result)
        } else {
            dump_result
        }
    }
}
2847 
2848 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2849 #[cfg(test)]
2850 mod tests {
2851     use super::*;
2852 
2853     fn test_vm_state_transitions(state: VmState) {
2854         match state {
2855             VmState::Created => {
2856                 // Check the transitions from Created
2857                 assert!(state.valid_transition(VmState::Created).is_err());
2858                 assert!(state.valid_transition(VmState::Running).is_ok());
2859                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2860                 assert!(state.valid_transition(VmState::Paused).is_ok());
2861                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2862             }
2863             VmState::Running => {
2864                 // Check the transitions from Running
2865                 assert!(state.valid_transition(VmState::Created).is_err());
2866                 assert!(state.valid_transition(VmState::Running).is_err());
2867                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2868                 assert!(state.valid_transition(VmState::Paused).is_ok());
2869                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2870             }
2871             VmState::Shutdown => {
2872                 // Check the transitions from Shutdown
2873                 assert!(state.valid_transition(VmState::Created).is_err());
2874                 assert!(state.valid_transition(VmState::Running).is_ok());
2875                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2876                 assert!(state.valid_transition(VmState::Paused).is_err());
2877                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2878             }
2879             VmState::Paused => {
2880                 // Check the transitions from Paused
2881                 assert!(state.valid_transition(VmState::Created).is_err());
2882                 assert!(state.valid_transition(VmState::Running).is_ok());
2883                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2884                 assert!(state.valid_transition(VmState::Paused).is_err());
2885                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2886             }
2887             VmState::BreakPoint => {
2888                 // Check the transitions from Breakpoint
2889                 assert!(state.valid_transition(VmState::Created).is_ok());
2890                 assert!(state.valid_transition(VmState::Running).is_ok());
2891                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2892                 assert!(state.valid_transition(VmState::Paused).is_err());
2893                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2894             }
2895         }
2896     }
2897 
2898     #[test]
2899     fn test_vm_created_transitions() {
2900         test_vm_state_transitions(VmState::Created);
2901     }
2902 
2903     #[test]
2904     fn test_vm_running_transitions() {
2905         test_vm_state_transitions(VmState::Running);
2906     }
2907 
2908     #[test]
2909     fn test_vm_shutdown_transitions() {
2910         test_vm_state_transitions(VmState::Shutdown);
2911     }
2912 
2913     #[test]
2914     fn test_vm_paused_transitions() {
2915         test_vm_state_transitions(VmState::Paused);
2916     }
2917 
2918     #[cfg(feature = "tdx")]
2919     #[test]
2920     fn test_hob_memory_resources() {
2921         // Case 1: Two TDVF sections in the middle of the RAM
2922         let sections = vec![
2923             TdvfSection {
2924                 address: 0xc000,
2925                 size: 0x1000,
2926                 ..Default::default()
2927             },
2928             TdvfSection {
2929                 address: 0x1000,
2930                 size: 0x4000,
2931                 ..Default::default()
2932             },
2933         ];
2934         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
2935         let expected = vec![
2936             (0, 0x1000, true),
2937             (0x1000, 0x4000, false),
2938             (0x5000, 0x7000, true),
2939             (0xc000, 0x1000, false),
2940             (0xd000, 0x0fff_3000, true),
2941         ];
2942         assert_eq!(
2943             expected,
2944             Vm::hob_memory_resources(
2945                 sections,
2946                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2947             )
2948         );
2949 
2950         // Case 2: Two TDVF sections with no conflict with the RAM
2951         let sections = vec![
2952             TdvfSection {
2953                 address: 0x1000_1000,
2954                 size: 0x1000,
2955                 ..Default::default()
2956             },
2957             TdvfSection {
2958                 address: 0,
2959                 size: 0x1000,
2960                 ..Default::default()
2961             },
2962         ];
2963         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2964         let expected = vec![
2965             (0, 0x1000, false),
2966             (0x1000, 0x1000_0000, true),
2967             (0x1000_1000, 0x1000, false),
2968         ];
2969         assert_eq!(
2970             expected,
2971             Vm::hob_memory_resources(
2972                 sections,
2973                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2974             )
2975         );
2976 
2977         // Case 3: Two TDVF sections with partial conflicts with the RAM
2978         let sections = vec![
2979             TdvfSection {
2980                 address: 0x1000_0000,
2981                 size: 0x2000,
2982                 ..Default::default()
2983             },
2984             TdvfSection {
2985                 address: 0,
2986                 size: 0x2000,
2987                 ..Default::default()
2988             },
2989         ];
2990         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2991         let expected = vec![
2992             (0, 0x2000, false),
2993             (0x2000, 0x0fff_e000, true),
2994             (0x1000_0000, 0x2000, false),
2995         ];
2996         assert_eq!(
2997             expected,
2998             Vm::hob_memory_resources(
2999                 sections,
3000                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3001             )
3002         );
3003 
3004         // Case 4: Two TDVF sections with no conflict before the RAM and two
3005         // more additional sections with no conflict after the RAM.
3006         let sections = vec![
3007             TdvfSection {
3008                 address: 0x2000_1000,
3009                 size: 0x1000,
3010                 ..Default::default()
3011             },
3012             TdvfSection {
3013                 address: 0x2000_0000,
3014                 size: 0x1000,
3015                 ..Default::default()
3016             },
3017             TdvfSection {
3018                 address: 0x1000,
3019                 size: 0x1000,
3020                 ..Default::default()
3021             },
3022             TdvfSection {
3023                 address: 0,
3024                 size: 0x1000,
3025                 ..Default::default()
3026             },
3027         ];
3028         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
3029         let expected = vec![
3030             (0, 0x1000, false),
3031             (0x1000, 0x1000, false),
3032             (0x4000, 0x1000_0000, true),
3033             (0x2000_0000, 0x1000, false),
3034             (0x2000_1000, 0x1000, false),
3035         ];
3036         assert_eq!(
3037             expected,
3038             Vm::hob_memory_resources(
3039                 sections,
3040                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3041             )
3042         );
3043 
3044         // Case 5: One TDVF section overriding the entire RAM
3045         let sections = vec![TdvfSection {
3046             address: 0,
3047             size: 0x2000_0000,
3048             ..Default::default()
3049         }];
3050         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3051         let expected = vec![(0, 0x2000_0000, false)];
3052         assert_eq!(
3053             expected,
3054             Vm::hob_memory_resources(
3055                 sections,
3056                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3057             )
3058         );
3059 
3060         // Case 6: Two TDVF sections with no conflict with 2 RAM regions
3061         let sections = vec![
3062             TdvfSection {
3063                 address: 0x1000_2000,
3064                 size: 0x2000,
3065                 ..Default::default()
3066             },
3067             TdvfSection {
3068                 address: 0,
3069                 size: 0x2000,
3070                 ..Default::default()
3071             },
3072         ];
3073         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3074             (GuestAddress(0x2000), 0x1000_0000),
3075             (GuestAddress(0x1000_4000), 0x1000_0000),
3076         ];
3077         let expected = vec![
3078             (0, 0x2000, false),
3079             (0x2000, 0x1000_0000, true),
3080             (0x1000_2000, 0x2000, false),
3081             (0x1000_4000, 0x1000_0000, true),
3082         ];
3083         assert_eq!(
3084             expected,
3085             Vm::hob_memory_resources(
3086                 sections,
3087                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3088             )
3089         );
3090 
3091         // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
3092         let sections = vec![
3093             TdvfSection {
3094                 address: 0x1000_0000,
3095                 size: 0x4000,
3096                 ..Default::default()
3097             },
3098             TdvfSection {
3099                 address: 0,
3100                 size: 0x4000,
3101                 ..Default::default()
3102             },
3103         ];
3104         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3105             (GuestAddress(0x1000), 0x1000_0000),
3106             (GuestAddress(0x1000_3000), 0x1000_0000),
3107         ];
3108         let expected = vec![
3109             (0, 0x4000, false),
3110             (0x4000, 0x0fff_c000, true),
3111             (0x1000_0000, 0x4000, false),
3112             (0x1000_4000, 0x0fff_f000, true),
3113         ];
3114         assert_eq!(
3115             expected,
3116             Vm::hob_memory_resources(
3117                 sections,
3118                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3119             )
3120         );
3121     }
3122 }
3123 
3124 #[cfg(target_arch = "aarch64")]
3125 #[cfg(test)]
3126 mod tests {
3127     use super::*;
3128     use arch::aarch64::fdt::create_fdt;
3129     use arch::aarch64::layout;
3130     use arch::{DeviceType, MmioDeviceInfo};
3131     use devices::gic::Gic;
3132 
3133     const LEN: u64 = 4096;
3134 
3135     #[test]
3136     fn test_create_fdt_with_devices() {
3137         let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
3138         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
3139 
3140         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
3141             (
3142                 (DeviceType::Serial, DeviceType::Serial.to_string()),
3143                 MmioDeviceInfo {
3144                     addr: 0x00,
3145                     len: LEN,
3146                     irq: 33,
3147                 },
3148             ),
3149             (
3150                 (DeviceType::Virtio(1), "virtio".to_string()),
3151                 MmioDeviceInfo {
3152                     addr: LEN,
3153                     len: LEN,
3154                     irq: 34,
3155                 },
3156             ),
3157             (
3158                 (DeviceType::Rtc, "rtc".to_string()),
3159                 MmioDeviceInfo {
3160                     addr: 2 * LEN,
3161                     len: LEN,
3162                     irq: 35,
3163                 },
3164             ),
3165         ]
3166         .iter()
3167         .cloned()
3168         .collect();
3169 
3170         let hv = hypervisor::new().unwrap();
3171         let vm = hv.create_vm().unwrap();
3172         let gic = vm
3173             .create_vgic(Gic::create_default_config(1))
3174             .expect("Cannot create gic");
3175         assert!(create_fdt(
3176             &mem,
3177             "console=tty0",
3178             vec![0],
3179             Some((0, 0, 0)),
3180             &dev_info,
3181             &gic,
3182             &None,
3183             &Vec::new(),
3184             &BTreeMap::new(),
3185             None,
3186             true,
3187         )
3188         .is_ok())
3189     }
3190 }
3191 
/// Loads a tiny real-mode program into a fresh VM, runs it on a single vCPU
/// and expects the run loop to end with a `VmExit::Reset`.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};

    // This example is based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee,  /* out %al, (%dx) */
        0xf4,  /* hlt */
    ];

    // A single 0x1000-byte region, with the code placed at its start.
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, 0x1000)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor VM.
    for (slot, region) in mem.iter().enumerate() {
        let user_region = vm.make_user_memory_region(
            slot as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(user_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Zero the code segment base and selector.
    let mut sregs = vcpu.get_sregs().expect("get sregs failed");
    sregs.cs.base = 0;
    sregs.cs.selector = 0;
    vcpu.set_sregs(&sregs).expect("set sregs failed");

    // Start executing at the load address with the operands in rax and rbx.
    let mut regs = vcpu.get_regs().expect("get regs failed");
    regs.set_rip(0x1000);
    regs.set_rax(2);
    regs.set_rbx(3);
    regs.set_rflags(2);
    vcpu.set_regs(&regs).expect("set regs failed");

    // Keep running until the vCPU reports a reset; any other exit than
    // `Ignore` fails the test.
    loop {
        match vcpu.run().expect("run failed") {
            VmExit::Ignore => continue,
            VmExit::Reset => break,
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
    println!("HLT");
}
3255 }
3256