xref: /cloud-hypervisor/vmm/src/vm.rs (revision 88a9f799449c04180c6b9a21d3b9c0c4b57e2bd6)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use std::cmp;
15 use std::collections::BTreeMap;
16 use std::collections::HashMap;
17 use std::fs::{File, OpenOptions};
18 use std::io::{self, Seek, SeekFrom, Write};
19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
20 use std::mem::size_of;
21 use std::num::Wrapping;
22 use std::ops::Deref;
23 use std::os::unix::net::UnixStream;
24 use std::sync::{Arc, Mutex, RwLock};
25 use std::time::Instant;
26 use std::{result, str, thread};
27 
28 use anyhow::anyhow;
29 use arch::get_host_cpu_phys_bits;
30 #[cfg(target_arch = "x86_64")]
31 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
32 #[cfg(feature = "tdx")]
33 use arch::x86_64::tdx::TdvfSection;
34 use arch::EntryPoint;
35 #[cfg(target_arch = "aarch64")]
36 use arch::PciSpaceInfo;
37 use arch::{NumaNode, NumaNodes};
38 #[cfg(target_arch = "aarch64")]
39 use devices::interrupt_controller;
40 use devices::AcpiNotificationFlags;
41 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
42 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
43 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
44 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
45 use hypervisor::{HypervisorVmError, VmOps};
46 use libc::{termios, SIGWINCH};
47 use linux_loader::cmdline::Cmdline;
48 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
49 use linux_loader::elf;
50 #[cfg(target_arch = "x86_64")]
51 use linux_loader::loader::bzimage::BzImage;
52 #[cfg(target_arch = "x86_64")]
53 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
54 #[cfg(target_arch = "aarch64")]
55 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
56 use linux_loader::loader::KernelLoader;
57 use seccompiler::SeccompAction;
58 use serde::{Deserialize, Serialize};
59 use thiserror::Error;
60 use tracer::trace_scoped;
61 use vm_device::Bus;
62 #[cfg(feature = "tdx")]
63 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
64 use vm_memory::{
65     Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
66 };
67 use vm_migration::protocol::{Request, Response};
68 use vm_migration::{
69     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
70     Snapshottable, Transportable,
71 };
72 use vmm_sys_util::eventfd::EventFd;
73 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
74 
75 use crate::config::{
76     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
77     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
78 };
79 use crate::config::{NumaConfig, PayloadConfig};
80 use crate::console_devices::{ConsoleDeviceError, ConsoleInfo};
81 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
82 use crate::coredump::{
83     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
84 };
85 use crate::cpu;
86 use crate::device_manager::{DeviceManager, DeviceManagerError};
87 use crate::device_tree::DeviceTree;
88 #[cfg(feature = "guest_debug")]
89 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
90 #[cfg(feature = "igvm")]
91 use crate::igvm::igvm_loader;
92 use crate::landlock::LandlockError;
93 use crate::memory_manager::{
94     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
95 };
96 #[cfg(target_arch = "x86_64")]
97 use crate::migration::get_vm_snapshot;
98 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
99 use crate::migration::url_to_file;
100 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
101 use crate::GuestMemoryMmap;
102 use crate::{
103     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
104 };
105 
106 /// Errors associated with VM management
107 #[derive(Debug, Error)]
108 pub enum Error {
109     #[error("Cannot open kernel file: {0}")]
110     KernelFile(#[source] io::Error),
111 
112     #[error("Cannot open initramfs file: {0}")]
113     InitramfsFile(#[source] io::Error),
114 
115     #[error("Cannot load the kernel into memory: {0}")]
116     KernelLoad(#[source] linux_loader::loader::Error),
117 
118     #[cfg(target_arch = "aarch64")]
119     #[error("Cannot load the UEFI binary in memory: {0:?}")]
120     UefiLoad(arch::aarch64::uefi::Error),
121 
122     #[error("Cannot load the initramfs into memory")]
123     InitramfsLoad,
124 
125     #[error("Cannot load the kernel command line in memory: {0}")]
126     LoadCmdLine(#[source] linux_loader::loader::Error),
127 
128     #[error("Failed to apply landlock config during vm_create: {0}")]
129     ApplyLandlock(#[source] LandlockError),
130 
131     #[error("Cannot modify the kernel command line: {0}")]
132     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
133 
134     #[error("Cannot create the kernel command line: {0}")]
135     CmdLineCreate(#[source] linux_loader::cmdline::Error),
136 
137     #[error("Cannot configure system: {0}")]
138     ConfigureSystem(#[source] arch::Error),
139 
140     #[cfg(target_arch = "aarch64")]
141     #[error("Cannot enable interrupt controller: {0:?}")]
142     EnableInterruptController(interrupt_controller::Error),
143 
144     #[error("VM state is poisoned")]
145     PoisonedState,
146 
147     #[error("Error from device manager: {0:?}")]
148     DeviceManager(DeviceManagerError),
149 
150     #[error("No device with id {0:?} to remove")]
151     NoDeviceToRemove(String),
152 
153     #[error("Cannot spawn a signal handler thread: {0}")]
154     SignalHandlerSpawn(#[source] io::Error),
155 
156     #[error("Failed to join on threads: {0:?}")]
157     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
158 
159     #[error("VM config is missing")]
160     VmMissingConfig,
161 
162     #[error("VM is not created")]
163     VmNotCreated,
164 
165     #[error("VM is already created")]
166     VmAlreadyCreated,
167 
168     #[error("VM is not running")]
169     VmNotRunning,
170 
171     #[error("Cannot clone EventFd: {0}")]
172     EventFdClone(#[source] io::Error),
173 
174     #[error("invalid VM state transition: {0:?} to {1:?}")]
175     InvalidStateTransition(VmState, VmState),
176 
177     #[error("Error from CPU manager: {0}")]
178     CpuManager(#[source] cpu::Error),
179 
180     #[error("Cannot pause devices: {0}")]
181     PauseDevices(#[source] MigratableError),
182 
183     #[error("Cannot resume devices: {0}")]
184     ResumeDevices(#[source] MigratableError),
185 
186     #[error("Cannot pause CPUs: {0}")]
187     PauseCpus(#[source] MigratableError),
188 
189     #[error("Cannot resume cpus: {0}")]
190     ResumeCpus(#[source] MigratableError),
191 
192     #[error("Cannot pause VM: {0}")]
193     Pause(#[source] MigratableError),
194 
195     #[error("Cannot resume VM: {0}")]
196     Resume(#[source] MigratableError),
197 
198     #[error("Memory manager error: {0:?}")]
199     MemoryManager(MemoryManagerError),
200 
201     #[error("Eventfd write error: {0}")]
202     EventfdError(#[source] std::io::Error),
203 
204     #[error("Cannot snapshot VM: {0}")]
205     Snapshot(#[source] MigratableError),
206 
207     #[error("Cannot restore VM: {0}")]
208     Restore(#[source] MigratableError),
209 
210     #[error("Cannot send VM snapshot: {0}")]
211     SnapshotSend(#[source] MigratableError),
212 
213     #[error("Invalid restore source URL")]
214     InvalidRestoreSourceUrl,
215 
216     #[error("Failed to validate config: {0}")]
217     ConfigValidation(#[source] ValidationError),
218 
219     #[error("Too many virtio-vsock devices")]
220     TooManyVsockDevices,
221 
222     #[error("Failed serializing into JSON: {0}")]
223     SerializeJson(#[source] serde_json::Error),
224 
225     #[error("Invalid NUMA configuration")]
226     InvalidNumaConfig,
227 
228     #[error("Cannot create seccomp filter: {0}")]
229     CreateSeccompFilter(#[source] seccompiler::Error),
230 
231     #[error("Cannot apply seccomp filter: {0}")]
232     ApplySeccompFilter(#[source] seccompiler::Error),
233 
234     #[error("Failed resizing a memory zone")]
235     ResizeZone,
236 
237     #[error("Cannot activate virtio devices: {0:?}")]
238     ActivateVirtioDevices(DeviceManagerError),
239 
240     #[error("Error triggering power button: {0:?}")]
241     PowerButton(DeviceManagerError),
242 
243     #[error("Kernel lacks PVH header")]
244     KernelMissingPvhHeader,
245 
246     #[error("Failed to allocate firmware RAM: {0:?}")]
247     AllocateFirmwareMemory(MemoryManagerError),
248 
249     #[error("Error manipulating firmware file: {0}")]
250     FirmwareFile(#[source] std::io::Error),
251 
252     #[error("Firmware too big")]
253     FirmwareTooLarge,
254 
255     #[error("Failed to copy firmware to memory: {0}")]
256     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
257 
258     #[cfg(feature = "sev_snp")]
259     #[error("Error enabling SEV-SNP VM: {0}")]
260     InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),
261 
262     #[cfg(feature = "tdx")]
263     #[error("Error performing I/O on TDX firmware file: {0}")]
264     LoadTdvf(#[source] std::io::Error),
265 
266     #[cfg(feature = "tdx")]
267     #[error("Error performing I/O on the TDX payload file: {0}")]
268     LoadPayload(#[source] std::io::Error),
269 
270     #[cfg(feature = "tdx")]
271     #[error("Error parsing TDVF: {0}")]
272     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
273 
274     #[cfg(feature = "tdx")]
275     #[error("Error populating TDX HOB: {0}")]
276     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
277 
278     #[cfg(feature = "tdx")]
279     #[error("Error allocating TDVF memory: {0:?}")]
280     AllocatingTdvfMemory(crate::memory_manager::Error),
281 
282     #[cfg(feature = "tdx")]
283     #[error("Error enabling TDX VM: {0}")]
284     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
285 
286     #[cfg(feature = "tdx")]
287     #[error("Error enabling TDX memory region: {0}")]
288     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
289 
290     #[cfg(feature = "tdx")]
291     #[error("Error finalizing TDX VM: {0}")]
292     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
293 
294     #[cfg(feature = "tdx")]
295     #[error("TDX firmware missing")]
296     TdxFirmwareMissing,
297 
298     #[cfg(feature = "tdx")]
299     #[error("Invalid TDX payload type")]
300     InvalidPayloadType,
301 
302     #[cfg(feature = "guest_debug")]
303     #[error("Error debugging VM: {0:?}")]
304     Debug(DebuggableError),
305 
306     #[error("Error spawning kernel loading thread")]
307     KernelLoadThreadSpawn(std::io::Error),
308 
309     #[error("Error joining kernel loading thread")]
310     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
311 
312     #[error("Payload configuration is not bootable")]
313     InvalidPayload,
314 
315     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
316     #[error("Error coredumping VM: {0:?}")]
317     Coredump(GuestDebuggableError),
318 
319     #[cfg(feature = "igvm")]
320     #[error("Cannot open igvm file: {0}")]
321     IgvmFile(#[source] io::Error),
322 
323     #[cfg(feature = "igvm")]
324     #[error("Cannot load the igvm into memory: {0}")]
325     IgvmLoad(#[source] igvm_loader::Error),
326 
327     #[error("Error injecting NMI")]
328     ErrorNmi,
329 
330     #[error("Error resuming the VM: {0}")]
331     ResumeVm(#[source] hypervisor::HypervisorVmError),
332 
333     #[error("Error creating console devices")]
334     CreateConsoleDevices(ConsoleDeviceError),
335 }
/// Result type used throughout this module, with [`Error`] as the error type.
pub type Result<T> = result::Result<T, Error>;
337 
/// Lifecycle state of a [`Vm`].
///
/// Which moves between states are permitted is decided by
/// `VmState::valid_transition`.
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    /// Initial state of a VM that was built without a snapshot
    /// (see `Vm::new_from_memory_manager`).
    Created,
    /// Reachable from every other state.
    // NOTE(review): presumably means the vCPUs are executing — confirm.
    Running,
    /// Only `Running` can be entered from this state.
    Shutdown,
    /// Initial state of a VM that was built from a snapshot
    /// (see `Vm::new_from_memory_manager`).
    Paused,
    /// Only `Created` and `Running` can be entered from this state.
    // NOTE(review): presumably entered when a guest debugger stops the VM — confirm.
    BreakPoint,
}
346 
347 impl VmState {
348     fn valid_transition(self, new_state: VmState) -> Result<()> {
349         match self {
350             VmState::Created => match new_state {
351                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
352                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
353                     Ok(())
354                 }
355             },
356 
357             VmState::Running => match new_state {
358                 VmState::Created | VmState::Running => {
359                     Err(Error::InvalidStateTransition(self, new_state))
360                 }
361                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
362             },
363 
364             VmState::Shutdown => match new_state {
365                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
366                     Err(Error::InvalidStateTransition(self, new_state))
367                 }
368                 VmState::Running => Ok(()),
369             },
370 
371             VmState::Paused => match new_state {
372                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
373                     Err(Error::InvalidStateTransition(self, new_state))
374                 }
375                 VmState::Running | VmState::Shutdown => Ok(()),
376             },
377             VmState::BreakPoint => match new_state {
378                 VmState::Created | VmState::Running => Ok(()),
379                 _ => Err(Error::InvalidStateTransition(self, new_state)),
380             },
381         }
382     }
383 }
384 
/// Implementation of the hypervisor `VmOps` callbacks, backed by the guest
/// memory and the device buses of the VM.
struct VmOpsHandler {
    /// Guest memory used to serve `guest_mem_read` / `guest_mem_write`.
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    /// Bus used to serve `pio_read` / `pio_write` (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    /// Bus used to serve `mmio_read` / `mmio_write`.
    mmio_bus: Arc<Bus>,
}
391 
392 impl VmOps for VmOpsHandler {
393     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
394         self.memory
395             .memory()
396             .write(buf, GuestAddress(gpa))
397             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
398     }
399 
400     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
401         self.memory
402             .memory()
403             .read(buf, GuestAddress(gpa))
404             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
405     }
406 
407     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
408         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
409             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
410         }
411         Ok(())
412     }
413 
414     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
415         match self.mmio_bus.write(gpa, data) {
416             Err(vm_device::BusError::MissingAddressRange) => {
417                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
418             }
419             Ok(Some(barrier)) => {
420                 info!("Waiting for barrier");
421                 barrier.wait();
422                 info!("Barrier released");
423             }
424             _ => {}
425         };
426         Ok(())
427     }
428 
429     #[cfg(target_arch = "x86_64")]
430     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
431         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
432             info!("Guest PIO read to unregistered address 0x{:x}", port);
433         }
434         Ok(())
435     }
436 
437     #[cfg(target_arch = "x86_64")]
438     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
439         match self.io_bus.write(port, data) {
440             Err(vm_device::BusError::MissingAddressRange) => {
441                 info!("Guest PIO write to unregistered address 0x{:x}", port);
442             }
443             Ok(Some(barrier)) => {
444                 info!("Waiting for barrier");
445                 barrier.wait();
446                 info!("Barrier released");
447             }
448             _ => {}
449         };
450         Ok(())
451     }
452 }
453 
454 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
455     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
456 
457     cmp::min(host_phys_bits, max_phys_bits)
458 }
459 
/// A virtual machine: its configuration, its CPU, memory and device managers,
/// and the hypervisor objects backing it.
pub struct Vm {
    /// Kernel file opened from the payload configuration, if one is
    /// configured (TDX builds only).
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    /// Initramfs file opened from the payload configuration, if one is
    /// configured.
    initramfs: Option<File>,
    /// Join handles of threads owned by the VM.
    // NOTE(review): which threads end up here is decided outside this view — confirm.
    threads: Vec<thread::JoinHandle<()>>,
    /// Shared device manager.
    device_manager: Arc<Mutex<DeviceManager>>,
    /// Shared VM configuration.
    config: Arc<Mutex<VmConfig>>,
    /// Current lifecycle state; starts as `Paused` when the VM is built from
    /// a snapshot and as `Created` otherwise.
    state: RwLock<VmState>,
    /// Shared CPU manager.
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    /// Shared memory manager.
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    /// Clock data taken from the VM snapshot when restoring; `None` for a
    /// VM that was not built from a snapshot (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
    /// Guest NUMA nodes built from the NUMA section of the configuration.
    numa_nodes: NumaNodes,
    /// Hypervisor the VM was created with.
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    /// Copied from the `gdb` configuration flag when the `guest_debug`
    /// feature is enabled; always `false` otherwise.
    stop_on_boot: bool,
    /// Handle of the thread loading the payload, yielding the guest entry
    /// point; `None` when the VM is built from a snapshot.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}
481 
482 impl Vm {
    /// Signal numbers associated with the VM; currently only `SIGWINCH`.
    // NOTE(review): presumably the set installed by the signal handler thread —
    // the consumer of this constant is outside this view, confirm there.
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
484 
485     #[allow(clippy::too_many_arguments)]
486     pub fn new_from_memory_manager(
487         config: Arc<Mutex<VmConfig>>,
488         memory_manager: Arc<Mutex<MemoryManager>>,
489         vm: Arc<dyn hypervisor::Vm>,
490         exit_evt: EventFd,
491         reset_evt: EventFd,
492         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
493         seccomp_action: &SeccompAction,
494         hypervisor: Arc<dyn hypervisor::Hypervisor>,
495         activate_evt: EventFd,
496         timestamp: Instant,
497         console_info: Option<ConsoleInfo>,
498         console_resize_pipe: Option<Arc<File>>,
499         original_termios: Arc<Mutex<Option<termios>>>,
500         snapshot: Option<Snapshot>,
501     ) -> Result<Self> {
502         trace_scoped!("Vm::new_from_memory_manager");
503 
504         let boot_id_list = config
505             .lock()
506             .unwrap()
507             .validate()
508             .map_err(Error::ConfigValidation)?;
509 
510         #[cfg(not(feature = "igvm"))]
511         let load_payload_handle = if snapshot.is_none() {
512             Self::load_payload_async(&memory_manager, &config)?
513         } else {
514             None
515         };
516 
517         info!("Booting VM from config: {:?}", &config);
518 
519         // Create NUMA nodes based on NumaConfig.
520         let numa_nodes =
521             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
522 
523         #[cfg(feature = "tdx")]
524         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
525         #[cfg(feature = "sev_snp")]
526         let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
527         #[cfg(feature = "tdx")]
528         let force_iommu = tdx_enabled;
529         #[cfg(feature = "sev_snp")]
530         let force_iommu = sev_snp_enabled;
531         #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
532         let force_iommu = false;
533 
534         #[cfg(feature = "guest_debug")]
535         let stop_on_boot = config.lock().unwrap().gdb;
536         #[cfg(not(feature = "guest_debug"))]
537         let stop_on_boot = false;
538 
539         let memory = memory_manager.lock().unwrap().guest_memory();
540         #[cfg(target_arch = "x86_64")]
541         let io_bus = Arc::new(Bus::new());
542         let mmio_bus = Arc::new(Bus::new());
543 
544         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
545             memory,
546             #[cfg(target_arch = "x86_64")]
547             io_bus: io_bus.clone(),
548             mmio_bus: mmio_bus.clone(),
549         });
550 
551         let cpus_config = { &config.lock().unwrap().cpus.clone() };
552         let cpu_manager = cpu::CpuManager::new(
553             cpus_config,
554             vm.clone(),
555             exit_evt.try_clone().map_err(Error::EventFdClone)?,
556             reset_evt.try_clone().map_err(Error::EventFdClone)?,
557             #[cfg(feature = "guest_debug")]
558             vm_debug_evt,
559             &hypervisor,
560             seccomp_action.clone(),
561             vm_ops,
562             #[cfg(feature = "tdx")]
563             tdx_enabled,
564             &numa_nodes,
565             #[cfg(feature = "sev_snp")]
566             sev_snp_enabled,
567         )
568         .map_err(Error::CpuManager)?;
569 
570         #[cfg(target_arch = "x86_64")]
571         cpu_manager
572             .lock()
573             .unwrap()
574             .populate_cpuid(
575                 &memory_manager,
576                 &hypervisor,
577                 #[cfg(feature = "tdx")]
578                 tdx_enabled,
579             )
580             .map_err(Error::CpuManager)?;
581 
582         // Loading the igvm file is pushed down here because
583         // igvm parser needs cpu_manager to retrieve cpuid leaf.
584         // For the regular case, we can start loading early, but for
585         // igvm case we have to wait until cpu_manager is created.
586         // Currently, Microsoft Hypervisor does not provide any
587         // Hypervisor specific common cpuid, we need to call get_cpuid_values
588         // per cpuid through cpu_manager.
589         #[cfg(feature = "igvm")]
590         let load_payload_handle = if snapshot.is_none() {
591             Self::load_payload_async(
592                 &memory_manager,
593                 &config,
594                 &cpu_manager,
595                 #[cfg(feature = "sev_snp")]
596                 sev_snp_enabled,
597             )?
598         } else {
599             None
600         };
601         // The initial TDX configuration must be done before the vCPUs are
602         // created
603         #[cfg(feature = "tdx")]
604         if tdx_enabled {
605             let cpuid = cpu_manager.lock().unwrap().common_cpuid();
606             let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
607             vm.tdx_init(&cpuid, max_vcpus)
608                 .map_err(Error::InitializeTdxVm)?;
609         }
610 
611         cpu_manager
612             .lock()
613             .unwrap()
614             .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
615             .map_err(Error::CpuManager)?;
616 
617         // This initial SEV-SNP configuration must be done immediately after
618         // vCPUs are created. As part of this initialization we are
619         // transitioning the guest into secure state.
620         #[cfg(feature = "sev_snp")]
621         if sev_snp_enabled {
622             vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
623         }
624 
625         #[cfg(feature = "tdx")]
626         let dynamic = !tdx_enabled;
627         #[cfg(not(feature = "tdx"))]
628         let dynamic = true;
629 
630         let device_manager = DeviceManager::new(
631             #[cfg(target_arch = "x86_64")]
632             io_bus,
633             mmio_bus,
634             vm.clone(),
635             config.clone(),
636             memory_manager.clone(),
637             cpu_manager.clone(),
638             exit_evt.try_clone().map_err(Error::EventFdClone)?,
639             reset_evt,
640             seccomp_action.clone(),
641             numa_nodes.clone(),
642             &activate_evt,
643             force_iommu,
644             boot_id_list,
645             timestamp,
646             snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
647             dynamic,
648         )
649         .map_err(Error::DeviceManager)?;
650 
651         device_manager
652             .lock()
653             .unwrap()
654             .create_devices(console_info, console_resize_pipe, original_termios)
655             .map_err(Error::DeviceManager)?;
656 
657         #[cfg(feature = "tdx")]
658         let kernel = config
659             .lock()
660             .unwrap()
661             .payload
662             .as_ref()
663             .map(|p| p.kernel.as_ref().map(File::open))
664             .unwrap_or_default()
665             .transpose()
666             .map_err(Error::KernelFile)?;
667 
668         let initramfs = config
669             .lock()
670             .unwrap()
671             .payload
672             .as_ref()
673             .map(|p| p.initramfs.as_ref().map(File::open))
674             .unwrap_or_default()
675             .transpose()
676             .map_err(Error::InitramfsFile)?;
677 
678         #[cfg(target_arch = "x86_64")]
679         let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
680             let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
681             vm_snapshot.clock
682         } else {
683             None
684         };
685 
686         let vm_state = if snapshot.is_some() {
687             VmState::Paused
688         } else {
689             VmState::Created
690         };
691 
692         Ok(Vm {
693             #[cfg(feature = "tdx")]
694             kernel,
695             initramfs,
696             device_manager,
697             config,
698             threads: Vec::with_capacity(1),
699             state: RwLock::new(vm_state),
700             cpu_manager,
701             memory_manager,
702             vm,
703             #[cfg(target_arch = "x86_64")]
704             saved_clock,
705             numa_nodes,
706             hypervisor,
707             stop_on_boot,
708             load_payload_handle,
709         })
710     }
711 
712     fn create_numa_nodes(
713         configs: Option<Vec<NumaConfig>>,
714         memory_manager: &Arc<Mutex<MemoryManager>>,
715     ) -> Result<NumaNodes> {
716         let mm = memory_manager.lock().unwrap();
717         let mm_zones = mm.memory_zones();
718         let mut numa_nodes = BTreeMap::new();
719 
720         if let Some(configs) = &configs {
721             for config in configs.iter() {
722                 if numa_nodes.contains_key(&config.guest_numa_id) {
723                     error!("Can't define twice the same NUMA node");
724                     return Err(Error::InvalidNumaConfig);
725                 }
726 
727                 let mut node = NumaNode::default();
728 
729                 if let Some(memory_zones) = &config.memory_zones {
730                     for memory_zone in memory_zones.iter() {
731                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
732                             node.memory_regions.extend(mm_zone.regions().clone());
733                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
734                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
735                             }
736                             node.memory_zones.push(memory_zone.clone());
737                         } else {
738                             error!("Unknown memory zone '{}'", memory_zone);
739                             return Err(Error::InvalidNumaConfig);
740                         }
741                     }
742                 }
743 
744                 if let Some(cpus) = &config.cpus {
745                     node.cpus.extend(cpus);
746                 }
747 
748                 if let Some(pci_segments) = &config.pci_segments {
749                     node.pci_segments.extend(pci_segments);
750                 }
751 
752                 if let Some(distances) = &config.distances {
753                     for distance in distances.iter() {
754                         let dest = distance.destination;
755                         let dist = distance.distance;
756 
757                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
758                             error!("Unknown destination NUMA node {}", dest);
759                             return Err(Error::InvalidNumaConfig);
760                         }
761 
762                         if node.distances.contains_key(&dest) {
763                             error!("Destination NUMA node {} has been already set", dest);
764                             return Err(Error::InvalidNumaConfig);
765                         }
766 
767                         node.distances.insert(dest, dist);
768                     }
769                 }
770 
771                 #[cfg(target_arch = "x86_64")]
772                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
773                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
774                         let mm_sections = sgx_epc_region.epc_sections();
775                         for sgx_epc_section in sgx_epc_sections.iter() {
776                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
777                                 node.sgx_epc_sections.push(mm_section.clone());
778                             } else {
779                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
780                                 return Err(Error::InvalidNumaConfig);
781                             }
782                         }
783                     } else {
784                         error!("Missing SGX EPC region");
785                         return Err(Error::InvalidNumaConfig);
786                     }
787                 }
788 
789                 numa_nodes.insert(config.guest_numa_id, node);
790             }
791         }
792 
793         Ok(numa_nodes)
794     }
795 
796     #[allow(clippy::too_many_arguments)]
797     pub fn new(
798         vm_config: Arc<Mutex<VmConfig>>,
799         exit_evt: EventFd,
800         reset_evt: EventFd,
801         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
802         seccomp_action: &SeccompAction,
803         hypervisor: Arc<dyn hypervisor::Hypervisor>,
804         activate_evt: EventFd,
805         console_info: Option<ConsoleInfo>,
806         console_resize_pipe: Option<Arc<File>>,
807         original_termios: Arc<Mutex<Option<termios>>>,
808         snapshot: Option<Snapshot>,
809         source_url: Option<&str>,
810         prefault: Option<bool>,
811     ) -> Result<Self> {
812         trace_scoped!("Vm::new");
813 
814         let timestamp = Instant::now();
815 
816         #[cfg(feature = "tdx")]
817         let tdx_enabled = if snapshot.is_some() {
818             false
819         } else {
820             vm_config.lock().unwrap().is_tdx_enabled()
821         };
822 
823         #[cfg(feature = "sev_snp")]
824         let sev_snp_enabled = if snapshot.is_some() {
825             false
826         } else {
827             vm_config.lock().unwrap().is_sev_snp_enabled()
828         };
829 
830         let vm = Self::create_hypervisor_vm(
831             &hypervisor,
832             #[cfg(feature = "tdx")]
833             tdx_enabled,
834             #[cfg(feature = "sev_snp")]
835             sev_snp_enabled,
836         )?;
837 
838         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
839 
840         let memory_manager = if let Some(snapshot) =
841             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
842         {
843             MemoryManager::new_from_snapshot(
844                 &snapshot,
845                 vm.clone(),
846                 &vm_config.lock().unwrap().memory.clone(),
847                 source_url,
848                 prefault.unwrap(),
849                 phys_bits,
850             )
851             .map_err(Error::MemoryManager)?
852         } else {
853             #[cfg(target_arch = "x86_64")]
854             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
855 
856             MemoryManager::new(
857                 vm.clone(),
858                 &vm_config.lock().unwrap().memory.clone(),
859                 None,
860                 phys_bits,
861                 #[cfg(feature = "tdx")]
862                 tdx_enabled,
863                 None,
864                 None,
865                 #[cfg(target_arch = "x86_64")]
866                 sgx_epc_config,
867             )
868             .map_err(Error::MemoryManager)?
869         };
870 
871         Vm::new_from_memory_manager(
872             vm_config,
873             memory_manager,
874             vm,
875             exit_evt,
876             reset_evt,
877             #[cfg(feature = "guest_debug")]
878             vm_debug_evt,
879             seccomp_action,
880             hypervisor,
881             activate_evt,
882             timestamp,
883             console_info,
884             console_resize_pipe,
885             original_termios,
886             snapshot,
887         )
888     }
889 
    /// Creates the hypervisor-level VM object and, on x86_64, applies the
    /// initial address/IRQ setup to it.
    ///
    /// The VM type passed to the hypervisor depends on the compiled features:
    /// with `tdx`, `tdx_enabled` is converted to the type value; otherwise
    /// with `sev_snp`, `sev_snp_enabled` is converted; otherwise a plain
    /// `create_vm()` is used.
    ///
    /// NOTE(review): `cfg_if!` picks the first matching branch, so when both
    /// `tdx` and `sev_snp` are compiled in, `sev_snp_enabled` does not appear
    /// to be consulted here — confirm this is intended.
    ///
    /// # Panics
    ///
    /// Although the signature returns `Result`, every hypervisor call in this
    /// function is unwrapped, so a failure of the extension check, of the VM
    /// creation or of any of the x86_64 setup calls panics instead of being
    /// returned as an error. The only value returned by the visible code is
    /// `Ok(vm)`.
    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        // Abort early (panic) if the hypervisor lacks required extensions.
        hypervisor.check_required_extensions().unwrap();

        cfg_if::cfg_if! {
            if #[cfg(feature = "tdx")] {
                // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true
                // Otherwise KVM_X86_LEGACY_VM: 0
                // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM
                let vm = hypervisor
                    .create_vm_with_type(u64::from(tdx_enabled))
                    .unwrap();
            } else if #[cfg(feature = "sev_snp")] {
                // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true
                // Otherwise SEV_SNP_DISABLED: 0
                // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false
                let vm = hypervisor
                    .create_vm_with_type(u64::from(sev_snp_enabled))
                    .unwrap();
            } else {
                let vm = hypervisor.create_vm().unwrap();
            }
        }

        // x86_64 only: set the identity map and TSS addresses from the arch
        // layout constants, then enable split IRQ handling.
        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }
927 
928     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
929         let initramfs = self.initramfs.as_mut().unwrap();
930         let size: usize = initramfs
931             .seek(SeekFrom::End(0))
932             .map_err(|_| Error::InitramfsLoad)?
933             .try_into()
934             .unwrap();
935         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
936 
937         let address =
938             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
939         let address = GuestAddress(address);
940 
941         guest_mem
942             .read_volatile_from(address, initramfs, size)
943             .map_err(|_| Error::InitramfsLoad)?;
944 
945         info!("Initramfs loaded: address = 0x{:x}", address.0);
946         Ok(arch::InitramfsConfig { address, size })
947     }
948 
949     pub fn generate_cmdline(
950         payload: &PayloadConfig,
951         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
952     ) -> Result<Cmdline> {
953         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
954         if let Some(s) = payload.cmdline.as_ref() {
955             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
956         }
957 
958         #[cfg(target_arch = "aarch64")]
959         for entry in device_manager.lock().unwrap().cmdline_additions() {
960             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
961         }
962         Ok(cmdline)
963     }
964 
965     #[cfg(target_arch = "aarch64")]
966     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
967         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
968         let mem = uefi_flash.memory();
969         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
970             .map_err(Error::UefiLoad)?;
971         Ok(())
972     }
973 
974     #[cfg(target_arch = "aarch64")]
975     fn load_kernel(
976         firmware: Option<File>,
977         kernel: Option<File>,
978         memory_manager: Arc<Mutex<MemoryManager>>,
979     ) -> Result<EntryPoint> {
980         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
981         let mem = guest_memory.memory();
982         let entry_addr = match (firmware, kernel) {
983             (None, Some(mut kernel)) => {
984                 match linux_loader::loader::pe::PE::load(
985                     mem.deref(),
986                     Some(arch::layout::KERNEL_START),
987                     &mut kernel,
988                     None,
989                 ) {
990                     Ok(entry_addr) => entry_addr.kernel_load,
991                     // Try to load the binary as kernel PE file at first.
992                     // If failed, retry to load it as UEFI binary.
993                     // As the UEFI binary is formatless, it must be the last option to try.
994                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
995                         Self::load_firmware(&kernel, memory_manager)?;
996                         arch::layout::UEFI_START
997                     }
998                     Err(e) => {
999                         return Err(Error::KernelLoad(e));
1000                     }
1001                 }
1002             }
1003             (Some(firmware), None) => {
1004                 Self::load_firmware(&firmware, memory_manager)?;
1005                 arch::layout::UEFI_START
1006             }
1007             _ => return Err(Error::InvalidPayload),
1008         };
1009 
1010         Ok(EntryPoint { entry_addr })
1011     }
1012 
1013     #[cfg(feature = "igvm")]
1014     fn load_igvm(
1015         igvm: File,
1016         memory_manager: Arc<Mutex<MemoryManager>>,
1017         cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1018         #[cfg(feature = "sev_snp")] host_data: &Option<String>,
1019     ) -> Result<EntryPoint> {
1020         let res = igvm_loader::load_igvm(
1021             &igvm,
1022             memory_manager,
1023             cpu_manager.clone(),
1024             "",
1025             #[cfg(feature = "sev_snp")]
1026             host_data,
1027         )
1028         .map_err(Error::IgvmLoad)?;
1029 
1030         cfg_if::cfg_if! {
1031             if #[cfg(feature = "sev_snp")] {
1032                 let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
1033                     EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
1034                 } else {
1035                     EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
1036                 };
1037             } else {
1038                let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
1039             }
1040         };
1041         Ok(entry_point)
1042     }
1043 
1044     #[cfg(target_arch = "x86_64")]
1045     fn load_kernel(
1046         mut kernel: File,
1047         cmdline: Option<Cmdline>,
1048         memory_manager: Arc<Mutex<MemoryManager>>,
1049     ) -> Result<EntryPoint> {
1050         info!("Loading kernel");
1051 
1052         let mem = {
1053             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1054             guest_memory.memory()
1055         };
1056 
1057         // Try ELF binary with PVH boot.
1058         let entry_addr = linux_loader::loader::elf::Elf::load(
1059             mem.deref(),
1060             None,
1061             &mut kernel,
1062             Some(arch::layout::HIGH_RAM_START),
1063         )
1064         // Try loading kernel as bzImage.
1065         .or_else(|_| {
1066             BzImage::load(
1067                 mem.deref(),
1068                 None,
1069                 &mut kernel,
1070                 Some(arch::layout::HIGH_RAM_START),
1071             )
1072         })
1073         .map_err(Error::KernelLoad)?;
1074 
1075         if let Some(cmdline) = cmdline {
1076             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1077                 .map_err(Error::LoadCmdLine)?;
1078         }
1079 
1080         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1081             // Use the PVH kernel entry point to boot the guest
1082             info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1083             Ok(EntryPoint {
1084                 entry_addr,
1085                 setup_header: None,
1086             })
1087         } else if entry_addr.setup_header.is_some() {
1088             // Use the bzImage 32bit entry point to boot the guest
1089             info!(
1090                 "bzImage kernel loaded: entry_addr = 0x{:x}",
1091                 entry_addr.kernel_load.0
1092             );
1093             Ok(EntryPoint {
1094                 entry_addr: entry_addr.kernel_load,
1095                 setup_header: entry_addr.setup_header,
1096             })
1097         } else {
1098             Err(Error::KernelMissingPvhHeader)
1099         }
1100     }
1101 
1102     #[cfg(target_arch = "x86_64")]
1103     fn load_payload(
1104         payload: &PayloadConfig,
1105         memory_manager: Arc<Mutex<MemoryManager>>,
1106         #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1107         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1108     ) -> Result<EntryPoint> {
1109         trace_scoped!("load_payload");
1110         #[cfg(feature = "igvm")]
1111         {
1112             if let Some(_igvm_file) = &payload.igvm {
1113                 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
1114                 #[cfg(feature = "sev_snp")]
1115                 if sev_snp_enabled {
1116                     return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
1117                 }
1118                 #[cfg(not(feature = "sev_snp"))]
1119                 return Self::load_igvm(igvm, memory_manager, cpu_manager);
1120             }
1121         }
1122         match (
1123             &payload.firmware,
1124             &payload.kernel,
1125             &payload.initramfs,
1126             &payload.cmdline,
1127         ) {
1128             (Some(firmware), None, None, None) => {
1129                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1130                 Self::load_kernel(firmware, None, memory_manager)
1131             }
1132             (None, Some(kernel), _, _) => {
1133                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1134                 let cmdline = Self::generate_cmdline(payload)?;
1135                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1136             }
1137             _ => Err(Error::InvalidPayload),
1138         }
1139     }
1140 
1141     #[cfg(target_arch = "aarch64")]
1142     fn load_payload(
1143         payload: &PayloadConfig,
1144         memory_manager: Arc<Mutex<MemoryManager>>,
1145     ) -> Result<EntryPoint> {
1146         match (&payload.firmware, &payload.kernel) {
1147             (Some(firmware), None) => {
1148                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1149                 Self::load_kernel(Some(firmware), None, memory_manager)
1150             }
1151             (None, Some(kernel)) => {
1152                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1153                 Self::load_kernel(None, Some(kernel), memory_manager)
1154             }
1155             _ => Err(Error::InvalidPayload),
1156         }
1157     }
1158 
1159     fn load_payload_async(
1160         memory_manager: &Arc<Mutex<MemoryManager>>,
1161         config: &Arc<Mutex<VmConfig>>,
1162         #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
1163         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1164     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1165         // Kernel with TDX is loaded in a different manner
1166         #[cfg(feature = "tdx")]
1167         if config.lock().unwrap().is_tdx_enabled() {
1168             return Ok(None);
1169         }
1170 
1171         config
1172             .lock()
1173             .unwrap()
1174             .payload
1175             .as_ref()
1176             .map(|payload| {
1177                 let memory_manager = memory_manager.clone();
1178                 let payload = payload.clone();
1179                 #[cfg(feature = "igvm")]
1180                 let cpu_manager = cpu_manager.clone();
1181 
1182                 std::thread::Builder::new()
1183                     .name("payload_loader".into())
1184                     .spawn(move || {
1185                         Self::load_payload(
1186                             &payload,
1187                             memory_manager,
1188                             #[cfg(feature = "igvm")]
1189                             cpu_manager,
1190                             #[cfg(feature = "sev_snp")]
1191                             sev_snp_enabled,
1192                         )
1193                     })
1194                     .map_err(Error::KernelLoadThreadSpawn)
1195             })
1196             .transpose()
1197     }
1198 
1199     #[cfg(target_arch = "x86_64")]
1200     fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
1201         trace_scoped!("configure_system");
1202         info!("Configuring system");
1203         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1204 
1205         let initramfs_config = match self.initramfs {
1206             Some(_) => Some(self.load_initramfs(&mem)?),
1207             None => None,
1208         };
1209 
1210         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1211         let rsdp_addr = Some(rsdp_addr);
1212         let sgx_epc_region = self
1213             .memory_manager
1214             .lock()
1215             .unwrap()
1216             .sgx_epc_region()
1217             .as_ref()
1218             .cloned();
1219 
1220         let serial_number = self
1221             .config
1222             .lock()
1223             .unwrap()
1224             .platform
1225             .as_ref()
1226             .and_then(|p| p.serial_number.clone());
1227 
1228         let uuid = self
1229             .config
1230             .lock()
1231             .unwrap()
1232             .platform
1233             .as_ref()
1234             .and_then(|p| p.uuid.clone());
1235 
1236         let oem_strings = self
1237             .config
1238             .lock()
1239             .unwrap()
1240             .platform
1241             .as_ref()
1242             .and_then(|p| p.oem_strings.clone());
1243 
1244         let oem_strings = oem_strings
1245             .as_deref()
1246             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1247 
1248         let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1249 
1250         arch::configure_system(
1251             &mem,
1252             arch::layout::CMDLINE_START,
1253             arch::layout::CMDLINE_MAX_SIZE,
1254             &initramfs_config,
1255             boot_vcpus,
1256             entry_addr.setup_header,
1257             rsdp_addr,
1258             sgx_epc_region,
1259             serial_number.as_deref(),
1260             uuid.as_deref(),
1261             oem_strings.as_deref(),
1262             topology,
1263         )
1264         .map_err(Error::ConfigureSystem)?;
1265         Ok(())
1266     }
1267 
1268     #[cfg(target_arch = "aarch64")]
1269     fn configure_system(
1270         &mut self,
1271         _rsdp_addr: GuestAddress,
1272         _entry_addr: EntryPoint,
1273     ) -> Result<()> {
1274         let cmdline = Self::generate_cmdline(
1275             self.config.lock().unwrap().payload.as_ref().unwrap(),
1276             &self.device_manager,
1277         )?;
1278         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1279         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1280         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1281         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1282         let initramfs_config = match self.initramfs {
1283             Some(_) => Some(self.load_initramfs(&mem)?),
1284             None => None,
1285         };
1286 
1287         let device_info = &self
1288             .device_manager
1289             .lock()
1290             .unwrap()
1291             .get_device_info()
1292             .clone();
1293 
1294         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1295             let pci_space = PciSpaceInfo {
1296                 pci_segment_id: pci_segment.id,
1297                 mmio_config_address: pci_segment.mmio_config_address,
1298                 pci_device_space_start: pci_segment.start_of_mem64_area,
1299                 pci_device_space_size: pci_segment.end_of_mem64_area
1300                     - pci_segment.start_of_mem64_area
1301                     + 1,
1302             };
1303             pci_space_info.push(pci_space);
1304         }
1305 
1306         let virtio_iommu_bdf = self
1307             .device_manager
1308             .lock()
1309             .unwrap()
1310             .iommu_attached_devices()
1311             .as_ref()
1312             .map(|(v, _)| *v);
1313 
1314         let vgic = self
1315             .device_manager
1316             .lock()
1317             .unwrap()
1318             .get_interrupt_controller()
1319             .unwrap()
1320             .lock()
1321             .unwrap()
1322             .get_vgic()
1323             .map_err(|_| {
1324                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1325                     arch::aarch64::Error::SetupGic,
1326                 ))
1327             })?;
1328 
1329         // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
1330         let pmu_supported = self
1331             .cpu_manager
1332             .lock()
1333             .unwrap()
1334             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1335             .map_err(|_| {
1336                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1337                     arch::aarch64::Error::VcpuInitPmu,
1338                 ))
1339             })?;
1340 
1341         arch::configure_system(
1342             &mem,
1343             cmdline.as_cstring().unwrap().to_str().unwrap(),
1344             vcpu_mpidrs,
1345             vcpu_topology,
1346             device_info,
1347             &initramfs_config,
1348             &pci_space_info,
1349             virtio_iommu_bdf.map(|bdf| bdf.into()),
1350             &vgic,
1351             &self.numa_nodes,
1352             pmu_supported,
1353         )
1354         .map_err(Error::ConfigureSystem)?;
1355 
1356         Ok(())
1357     }
1358 
    /// Returns the console resize pipe as reported by the device manager, if
    /// there is one.
    ///
    /// # Panics
    ///
    /// Panics if the device manager mutex is poisoned.
    pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
        self.device_manager.lock().unwrap().console_resize_pipe()
    }
1362 
1363     pub fn shutdown(&mut self) -> Result<()> {
1364         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1365         let new_state = VmState::Shutdown;
1366 
1367         state.valid_transition(new_state)?;
1368 
1369         // Wake up the DeviceManager threads so they will get terminated cleanly
1370         self.device_manager
1371             .lock()
1372             .unwrap()
1373             .resume()
1374             .map_err(Error::Resume)?;
1375 
1376         self.cpu_manager
1377             .lock()
1378             .unwrap()
1379             .shutdown()
1380             .map_err(Error::CpuManager)?;
1381 
1382         // Wait for all the threads to finish
1383         for thread in self.threads.drain(..) {
1384             thread.join().map_err(Error::ThreadCleanup)?
1385         }
1386         *state = new_state;
1387 
1388         Ok(())
1389     }
1390 
    /// Resizes the vCPU count, the guest memory and/or the balloon, and
    /// records the new values in the VM configuration.
    ///
    /// Each argument is optional; a `None` leaves that resource untouched.
    /// The three resources are processed in order (vCPUs, memory, balloon)
    /// and the first error aborts the call, leaving earlier steps applied.
    ///
    /// # Errors
    ///
    /// Returns `Error::CpuManager`, `Error::MemoryManager` or
    /// `Error::DeviceManager` depending on which component failed.
    ///
    /// # Panics
    ///
    /// Panics if one of the mutexes involved is poisoned.
    pub fn resize(
        &mut self,
        desired_vcpus: Option<u8>,
        desired_memory: Option<u64>,
        desired_balloon: Option<u64>,
    ) -> Result<()> {
        event!("vm", "resizing");

        if let Some(desired_vcpus) = desired_vcpus {
            // The guest is only notified when the CPU manager's `resize`
            // returns `true`.
            if self
                .cpu_manager
                .lock()
                .unwrap()
                .resize(desired_vcpus)
                .map_err(Error::CpuManager)?
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            // The stored boot vCPU count is updated whether or not a
            // notification was sent.
            self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
        }

        if let Some(desired_memory) = desired_memory {
            let new_region = self
                .memory_manager
                .lock()
                .unwrap()
                .resize(desired_memory)
                .map_err(Error::MemoryManager)?;

            // NOTE(review): this borrow appears to keep the configuration
            // lock held until the end of this `if let` block, i.e. while the
            // device manager is locked below — confirm the lock ordering.
            let memory_config = &mut self.config.lock().unwrap().memory;

            // A new region is only reported in some cases; when there is
            // one, devices are told about it and, for ACPI hotplug, the guest
            // is notified.
            if let Some(new_region) = &new_region {
                self.device_manager
                    .lock()
                    .unwrap()
                    .update_memory(new_region)
                    .map_err(Error::DeviceManager)?;

                match memory_config.hotplug_method {
                    HotplugMethod::Acpi => {
                        self.device_manager
                            .lock()
                            .unwrap()
                            .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                            .map_err(Error::DeviceManager)?;
                    }
                    HotplugMethod::VirtioMem => {}
                }
            }

            // We update the VM config regardless of the actual guest resize
            // operation result (happened or not), so that if the VM reboots
            // it will be running with the last configured memory size.
            match memory_config.hotplug_method {
                HotplugMethod::Acpi => memory_config.size = desired_memory,
                HotplugMethod::VirtioMem => {
                    // With virtio-mem the boot size is kept and only the
                    // amount above it is recorded as hotplugged.
                    if desired_memory > memory_config.size {
                        memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                    } else {
                        memory_config.hotplugged_size = None;
                    }
                }
            }
        }

        if let Some(desired_balloon) = desired_balloon {
            self.device_manager
                .lock()
                .unwrap()
                .resize_balloon(desired_balloon)
                .map_err(Error::DeviceManager)?;

            // Update the configuration value for the balloon size to ensure
            // a reboot would use the right value.
            if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
                balloon_config.size = desired_balloon;
            }
        }

        event!("vm", "resized");

        Ok(())
    }
1478 
1479     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1480         let memory_config = &mut self.config.lock().unwrap().memory;
1481 
1482         if let Some(zones) = &mut memory_config.zones {
1483             for zone in zones.iter_mut() {
1484                 if zone.id == id {
1485                     if desired_memory >= zone.size {
1486                         let hotplugged_size = desired_memory - zone.size;
1487                         self.memory_manager
1488                             .lock()
1489                             .unwrap()
1490                             .resize_zone(&id, desired_memory - zone.size)
1491                             .map_err(Error::MemoryManager)?;
1492                         // We update the memory zone config regardless of the
1493                         // actual 'resize-zone' operation result (happened or
1494                         // not), so that if the VM reboots it will be running
1495                         // with the last configured memory zone size.
1496                         zone.hotplugged_size = Some(hotplugged_size);
1497 
1498                         return Ok(());
1499                     } else {
1500                         error!(
1501                             "Invalid to ask less ({}) than boot RAM ({}) for \
1502                             this memory zone",
1503                             desired_memory, zone.size,
1504                         );
1505                         return Err(Error::ResizeZone);
1506                     }
1507                 }
1508             }
1509         }
1510 
1511         error!("Could not find the memory zone {} for the resize", id);
1512         Err(Error::ResizeZone)
1513     }
1514 
1515     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1516         let pci_device_info = self
1517             .device_manager
1518             .lock()
1519             .unwrap()
1520             .add_device(&mut device_cfg)
1521             .map_err(Error::DeviceManager)?;
1522 
1523         // Update VmConfig by adding the new device. This is important to
1524         // ensure the device would be created in case of a reboot.
1525         {
1526             let mut config = self.config.lock().unwrap();
1527             add_to_config(&mut config.devices, device_cfg);
1528         }
1529 
1530         self.device_manager
1531             .lock()
1532             .unwrap()
1533             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1534             .map_err(Error::DeviceManager)?;
1535 
1536         Ok(pci_device_info)
1537     }
1538 
1539     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1540         let pci_device_info = self
1541             .device_manager
1542             .lock()
1543             .unwrap()
1544             .add_user_device(&mut device_cfg)
1545             .map_err(Error::DeviceManager)?;
1546 
1547         // Update VmConfig by adding the new device. This is important to
1548         // ensure the device would be created in case of a reboot.
1549         {
1550             let mut config = self.config.lock().unwrap();
1551             add_to_config(&mut config.user_devices, device_cfg);
1552         }
1553 
1554         self.device_manager
1555             .lock()
1556             .unwrap()
1557             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1558             .map_err(Error::DeviceManager)?;
1559 
1560         Ok(pci_device_info)
1561     }
1562 
1563     pub fn remove_device(&mut self, id: String) -> Result<()> {
1564         self.device_manager
1565             .lock()
1566             .unwrap()
1567             .remove_device(id.clone())
1568             .map_err(Error::DeviceManager)?;
1569 
1570         // Update VmConfig by removing the device. This is important to
1571         // ensure the device would not be created in case of a reboot.
1572         self.config.lock().unwrap().remove_device(&id);
1573 
1574         self.device_manager
1575             .lock()
1576             .unwrap()
1577             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1578             .map_err(Error::DeviceManager)?;
1579         Ok(())
1580     }
1581 
1582     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1583         let pci_device_info = self
1584             .device_manager
1585             .lock()
1586             .unwrap()
1587             .add_disk(&mut disk_cfg)
1588             .map_err(Error::DeviceManager)?;
1589 
1590         // Update VmConfig by adding the new device. This is important to
1591         // ensure the device would be created in case of a reboot.
1592         {
1593             let mut config = self.config.lock().unwrap();
1594             add_to_config(&mut config.disks, disk_cfg);
1595         }
1596 
1597         self.device_manager
1598             .lock()
1599             .unwrap()
1600             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1601             .map_err(Error::DeviceManager)?;
1602 
1603         Ok(pci_device_info)
1604     }
1605 
1606     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1607         let pci_device_info = self
1608             .device_manager
1609             .lock()
1610             .unwrap()
1611             .add_fs(&mut fs_cfg)
1612             .map_err(Error::DeviceManager)?;
1613 
1614         // Update VmConfig by adding the new device. This is important to
1615         // ensure the device would be created in case of a reboot.
1616         {
1617             let mut config = self.config.lock().unwrap();
1618             add_to_config(&mut config.fs, fs_cfg);
1619         }
1620 
1621         self.device_manager
1622             .lock()
1623             .unwrap()
1624             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1625             .map_err(Error::DeviceManager)?;
1626 
1627         Ok(pci_device_info)
1628     }
1629 
1630     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1631         let pci_device_info = self
1632             .device_manager
1633             .lock()
1634             .unwrap()
1635             .add_pmem(&mut pmem_cfg)
1636             .map_err(Error::DeviceManager)?;
1637 
1638         // Update VmConfig by adding the new device. This is important to
1639         // ensure the device would be created in case of a reboot.
1640         {
1641             let mut config = self.config.lock().unwrap();
1642             add_to_config(&mut config.pmem, pmem_cfg);
1643         }
1644 
1645         self.device_manager
1646             .lock()
1647             .unwrap()
1648             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1649             .map_err(Error::DeviceManager)?;
1650 
1651         Ok(pci_device_info)
1652     }
1653 
1654     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1655         let pci_device_info = self
1656             .device_manager
1657             .lock()
1658             .unwrap()
1659             .add_net(&mut net_cfg)
1660             .map_err(Error::DeviceManager)?;
1661 
1662         // Update VmConfig by adding the new device. This is important to
1663         // ensure the device would be created in case of a reboot.
1664         {
1665             let mut config = self.config.lock().unwrap();
1666             add_to_config(&mut config.net, net_cfg);
1667         }
1668 
1669         self.device_manager
1670             .lock()
1671             .unwrap()
1672             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1673             .map_err(Error::DeviceManager)?;
1674 
1675         Ok(pci_device_info)
1676     }
1677 
1678     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1679         let pci_device_info = self
1680             .device_manager
1681             .lock()
1682             .unwrap()
1683             .add_vdpa(&mut vdpa_cfg)
1684             .map_err(Error::DeviceManager)?;
1685 
1686         // Update VmConfig by adding the new device. This is important to
1687         // ensure the device would be created in case of a reboot.
1688         {
1689             let mut config = self.config.lock().unwrap();
1690             add_to_config(&mut config.vdpa, vdpa_cfg);
1691         }
1692 
1693         self.device_manager
1694             .lock()
1695             .unwrap()
1696             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1697             .map_err(Error::DeviceManager)?;
1698 
1699         Ok(pci_device_info)
1700     }
1701 
1702     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1703         let pci_device_info = self
1704             .device_manager
1705             .lock()
1706             .unwrap()
1707             .add_vsock(&mut vsock_cfg)
1708             .map_err(Error::DeviceManager)?;
1709 
1710         // Update VmConfig by adding the new device. This is important to
1711         // ensure the device would be created in case of a reboot.
1712         {
1713             let mut config = self.config.lock().unwrap();
1714             config.vsock = Some(vsock_cfg);
1715         }
1716 
1717         self.device_manager
1718             .lock()
1719             .unwrap()
1720             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1721             .map_err(Error::DeviceManager)?;
1722 
1723         Ok(pci_device_info)
1724     }
1725 
1726     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1727         Ok(self.device_manager.lock().unwrap().counters())
1728     }
1729 
1730     #[cfg(feature = "tdx")]
1731     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1732         use arch::x86_64::tdx::*;
1733 
1734         let firmware_path = self
1735             .config
1736             .lock()
1737             .unwrap()
1738             .payload
1739             .as_ref()
1740             .unwrap()
1741             .firmware
1742             .clone()
1743             .ok_or(Error::TdxFirmwareMissing)?;
1744         // The TDVF file contains a table of section as well as code
1745         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1746 
1747         // For all the sections allocate some RAM backing them
1748         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1749     }
1750 
1751     #[cfg(feature = "tdx")]
1752     fn hob_memory_resources(
1753         mut sorted_sections: Vec<TdvfSection>,
1754         guest_memory: &GuestMemoryMmap,
1755     ) -> Vec<(u64, u64, bool)> {
1756         let mut list = Vec::new();
1757 
1758         let mut current_section = sorted_sections.pop();
1759 
1760         // RAM regions interleaved with TDVF sections
1761         let mut next_start_addr = 0;
1762         for region in guest_memory.iter() {
1763             let region_start = region.start_addr().0;
1764             let region_end = region.last_addr().0;
1765             if region_start > next_start_addr {
1766                 next_start_addr = region_start;
1767             }
1768 
1769             loop {
1770                 let (start, size, ram) = if let Some(section) = &current_section {
1771                     if section.address <= next_start_addr {
1772                         (section.address, section.size, false)
1773                     } else {
1774                         let last_addr = std::cmp::min(section.address - 1, region_end);
1775                         (next_start_addr, last_addr - next_start_addr + 1, true)
1776                     }
1777                 } else {
1778                     (next_start_addr, region_end - next_start_addr + 1, true)
1779                 };
1780 
1781                 list.push((start, size, ram));
1782 
1783                 if !ram {
1784                     current_section = sorted_sections.pop();
1785                 }
1786 
1787                 next_start_addr = start + size;
1788 
1789                 if region_start > next_start_addr {
1790                     next_start_addr = region_start;
1791                 }
1792 
1793                 if next_start_addr > region_end {
1794                     break;
1795                 }
1796             }
1797         }
1798 
1799         // Once all the interleaved sections have been processed, let's simply
1800         // pull the remaining ones.
1801         if let Some(section) = current_section {
1802             list.push((section.address, section.size, false));
1803         }
1804         while let Some(section) = sorted_sections.pop() {
1805             list.push((section.address, section.size, false));
1806         }
1807 
1808         list
1809     }
1810 
    /// Populates guest memory with the content of the TDVF sections and
    /// builds the HOB.
    ///
    /// The steps are, in order:
    /// 1. add a RAM region for every section whose address is not already
    ///    within the boot guest memory;
    /// 2. copy the firmware volumes, the payload and the payload parameters
    ///    (kernel command line) into guest memory;
    /// 3. build the HOB: memory resources, MMIO resources, ACPI tables and,
    ///    if a payload was copied, the payload information.
    ///
    /// `guid_found` is forwarded untouched to `TdHob::add_memory_resource`.
    ///
    /// Returns the address of the `TdHob` section wrapped in `Some`.
    ///
    /// # Panics
    ///
    /// Panics if the configuration has no payload, if `sections` contains no
    /// `TdHob` section, or if copying data into guest memory fails (those
    /// results are unwrapped rather than propagated).
    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(
        &mut self,
        sections: &[TdvfSection],
        guid_found: bool,
    ) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of section as well as code
        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        // Filled in while walking the sections and consumed when building
        // the HOB below.
        let mut payload_info = None;
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                // Firmware volumes are copied verbatim from the firmware file.
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    // NOTE(review): a failed copy panics here instead of
                    // being returned as an error.
                    mem.read_volatile_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                // Nothing is copied for the HOB section, only its address is
                // remembered.
                TdvfSectionType::TdHob => {
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    // The payload is only copied when a kernel file is
                    // available; otherwise the section is left untouched.
                    if let Some(payload_file) = self.kernel.as_mut() {
                        // Seeking to the end yields the size of the payload.
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        // 0x1f1 is presumably the offset of the setup header
                        // in a bzImage (Linux x86 boot protocol) — TODO
                        // confirm.
                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_file
                            .read_volatile(&mut payload_header.as_bytes())
                            .unwrap();

                        // 0x5372_6448 is "HdrS" read as a little-endian u32.
                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Reject headers older than version 0x0200 and
                        // payloads whose loadflags bit 0 is clear (presumably
                        // LOADED_HIGH — TODO confirm against the boot
                        // protocol).
                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Copy the whole payload file, starting from its
                        // beginning, at the section address.
                        payload_file.rewind().map_err(Error::LoadPayload)?;
                        mem.read_volatile_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    info!("Copying payload parameters to guest memory");
                    // The parameters are the NUL-terminated command line
                    // generated from the payload configuration.
                    let cmdline = Self::generate_cmdline(
                        self.config.lock().unwrap().payload.as_ref().unwrap(),
                    )?;
                    mem.write_slice(
                        cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                        GuestAddress(section.address),
                    )
                    .unwrap();
                }
                _ => {}
            }
        }

        // Generate HOB
        // NOTE(review): panics if no TdHob section was found above.
        let mut hob = TdHob::start(hob_offset.unwrap());

        // Only the TempMem sections are interleaved with the RAM regions.
        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        // Sort by descending address: hob_memory_resources() pops from the
        // back, hence consumes the lowest address first.
        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();

        for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
            hob.add_memory_resource(&mem, start, size, ram, guid_found)
                .map_err(Error::PopulateHob)?;
        }

        // MMIO regions
        // First the range from MEM_32BIT_DEVICES_START up to APIC_START...
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        // ...then the device area reported by the memory manager.
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.

        for acpi_table in crate::acpi::create_acpi_tables_tdx(
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        ) {
            hob.add_acpi_table(&mem, acpi_table.as_slice())
                .map_err(Error::PopulateHob)?;
        }

        // If a payload info has been created, let's insert it into the HOB.
        if let Some(payload_info) = payload_info {
            hob.add_payload(&mem, payload_info)
                .map_err(Error::PopulateHob)?;
        }

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }
2000 
2001     #[cfg(feature = "tdx")]
2002     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
2003         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2004         let mem = guest_memory.memory();
2005 
2006         for section in sections {
2007             self.vm
2008                 .tdx_init_memory_region(
2009                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
2010                     section.address,
2011                     section.size,
2012                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
2013                     section.attributes == 1,
2014                 )
2015                 .map_err(Error::InitializeTdxMemoryRegion)?;
2016         }
2017 
2018         Ok(())
2019     }
2020 
2021     // Creates ACPI tables
2022     // In case of TDX being used, this is a no-op since the tables will be
2023     // created and passed when populating the HOB.
2024 
2025     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2026         #[cfg(feature = "tdx")]
2027         if self.config.lock().unwrap().is_tdx_enabled() {
2028             return None;
2029         }
2030         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2031         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2032         let rsdp_addr = crate::acpi::create_acpi_tables(
2033             &mem,
2034             &self.device_manager,
2035             &self.cpu_manager,
2036             &self.memory_manager,
2037             &self.numa_nodes,
2038             tpm_enabled,
2039         );
2040         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2041 
2042         Some(rsdp_addr)
2043     }
2044 
2045     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2046         trace_scoped!("entry_point");
2047 
2048         self.load_payload_handle
2049             .take()
2050             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2051             .transpose()
2052     }
2053 
    /// Boots the VM, or resumes it if it is currently paused.
    ///
    /// When the VM is not paused, the target state is `BreakPoint` if
    /// `stop_on_boot` is set and `Running` otherwise, and the transition is
    /// validated before anything else happens. The boot sequence then:
    /// creates the ACPI tables (x86_64: early, aarch64: after vCPU
    /// configuration), waits for the payload to be loaded, configures the
    /// vCPUs, performs the TDX specific setup when enabled, configures the
    /// system from the loaded entry point, allocates the address space
    /// (x86_64), starts the boot vCPUs and finally records the new state.
    ///
    /// Every failing step is returned as an error; the state is only updated
    /// once all the steps have succeeded.
    pub fn boot(&mut self) -> Result<()> {
        trace_scoped!("Vm::boot");
        let current_state = self.get_state()?;
        // Booting a paused VM simply resumes it.
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        let new_state = if self.stop_on_boot {
            VmState::BreakPoint
        } else {
            VmState::Running
        };
        current_state.valid_transition(new_state)?;

        // Do earlier to parallelise with loading kernel
        #[cfg(target_arch = "x86_64")]
        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
                let rsdp_addr = if sev_snp_enabled {
                    // In case of SEV-SNP guest ACPI tables are provided via
                    // IGVM. So skip the creation of ACPI tables and set the
                    // rsdp addr to None.
                    None
                } else {
                    self.create_acpi_tables()
                };
            } else {
                let rsdp_addr = self.create_acpi_tables();
            }
        }

        // Load kernel synchronously or if asynchronous then wait for load to
        // finish.
        let entry_point = self.entry_point()?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        // Configure the vcpus that have been created
        let vcpus = self.cpu_manager.lock().unwrap().vcpus();
        for vcpu in vcpus {
            let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
            // The boot setup is only provided when an entry point is known.
            let boot_setup = entry_point.map(|e| (e, guest_memory));
            self.cpu_manager
                .lock()
                .unwrap()
                .configure_vcpu(vcpu, boot_setup)
                .map_err(Error::CpuManager)?;
        }

        // Without TDX there are no sections to handle.
        #[cfg(feature = "tdx")]
        let (sections, guid_found) = if tdx_enabled {
            self.extract_tdvf_sections()?
        } else {
            (Vec::new(), false)
        };

        // Configuring the TDX regions requires that the vCPUs are created.
        #[cfg(feature = "tdx")]
        let hob_address = if tdx_enabled {
            // TDX sections are written to memory.
            self.populate_tdx_sections(&sections, guid_found)?
        } else {
            None
        };

        // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
        // available after they are configured
        #[cfg(target_arch = "aarch64")]
        let rsdp_addr = self.create_acpi_tables();

        // Configure shared state based on loaded kernel
        entry_point
            .map(|entry_point| {
                // Safe to unwrap rsdp_addr as we know it can't be None when
                // the entry_point is Some.
                self.configure_system(rsdp_addr.unwrap(), entry_point)
            })
            .transpose()?;

        #[cfg(target_arch = "x86_64")]
        // Note: For x86, always call this function before invoking start boot vcpus.
        // Otherwise guest would fail to boot because we haven't created the
        // userspace mappings to update the hypervisor about the memory mappings.
        // These mappings must be created before we start the vCPU threads for
        // the very first time.
        self.memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        // A HOB address is only available when TDX is enabled.
        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            // Let the hypervisor know which memory ranges are shared with the
            // guest. This prevents the guest from ignoring/discarding memory
            // regions provided by the host.
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured TDX setup is complete
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        // Resume the vm for MSHV
        if current_state == VmState::Created {
            self.vm.resume().map_err(Error::ResumeVm)?;
        }

        // The vCPUs are asked to start paused when booting to a breakpoint.
        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus(new_state == VmState::BreakPoint)
            .map_err(Error::CpuManager)?;

        // Failing to acquire the state lock is reported as PoisonedState.
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        Ok(())
    }
2179 
2180     pub fn restore(&mut self) -> Result<()> {
2181         event!("vm", "restoring");
2182 
2183         #[cfg(target_arch = "x86_64")]
2184         // Note: For x86, always call this function before invoking start boot vcpus.
2185         // Otherwise guest would fail to boot because we haven't created the
2186         // userspace mappings to update the hypervisor about the memory mappings.
2187         // These mappings must be created before we start the vCPU threads for
2188         // the very first time for the restored VM.
2189         self.memory_manager
2190             .lock()
2191             .unwrap()
2192             .allocate_address_space()
2193             .map_err(Error::MemoryManager)?;
2194 
2195         // Now we can start all vCPUs from here.
2196         self.cpu_manager
2197             .lock()
2198             .unwrap()
2199             .start_restored_vcpus()
2200             .map_err(Error::CpuManager)?;
2201 
2202         event!("vm", "restored");
2203         Ok(())
2204     }
2205 
2206     /// Gets a thread-safe reference counted pointer to the VM configuration.
2207     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2208         Arc::clone(&self.config)
2209     }
2210 
2211     /// Get the VM state. Returns an error if the state is poisoned.
2212     pub fn get_state(&self) -> Result<VmState> {
2213         self.state
2214             .try_read()
2215             .map_err(|_| Error::PoisonedState)
2216             .map(|state| *state)
2217     }
2218 
2219     /// Gets the actual size of the balloon.
2220     pub fn balloon_size(&self) -> u64 {
2221         self.device_manager.lock().unwrap().balloon_size()
2222     }
2223 
2224     pub fn send_memory_fds(
2225         &mut self,
2226         socket: &mut UnixStream,
2227     ) -> std::result::Result<(), MigratableError> {
2228         for (slot, fd) in self
2229             .memory_manager
2230             .lock()
2231             .unwrap()
2232             .memory_slot_fds()
2233             .drain()
2234         {
2235             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2236                 .write_to(socket)
2237                 .map_err(|e| {
2238                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2239                 })?;
2240             socket
2241                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2242                 .map_err(|e| {
2243                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2244                 })?;
2245 
2246             Response::read_from(socket)?.ok_or_abandon(
2247                 socket,
2248                 MigratableError::MigrateSend(anyhow!("Error during memory fd migration")),
2249             )?;
2250         }
2251 
2252         Ok(())
2253     }
2254 
2255     pub fn send_memory_regions<F>(
2256         &mut self,
2257         ranges: &MemoryRangeTable,
2258         fd: &mut F,
2259     ) -> std::result::Result<(), MigratableError>
2260     where
2261         F: WriteVolatile,
2262     {
2263         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2264         let mem = guest_memory.memory();
2265 
2266         for range in ranges.regions() {
2267             let mut offset: u64 = 0;
2268             // Here we are manually handling the retry in case we can't the
2269             // whole region at once because we can't use the implementation
2270             // from vm-memory::GuestMemory of write_all_to() as it is not
2271             // following the correct behavior. For more info about this issue
2272             // see: https://github.com/rust-vmm/vm-memory/issues/174
2273             loop {
2274                 let bytes_written = mem
2275                     .write_volatile_to(
2276                         GuestAddress(range.gpa + offset),
2277                         fd,
2278                         (range.length - offset) as usize,
2279                     )
2280                     .map_err(|e| {
2281                         MigratableError::MigrateSend(anyhow!(
2282                             "Error transferring memory to socket: {}",
2283                             e
2284                         ))
2285                     })?;
2286                 offset += bytes_written as u64;
2287 
2288                 if offset == range.length {
2289                     break;
2290                 }
2291             }
2292         }
2293 
2294         Ok(())
2295     }
2296 
2297     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2298         self.memory_manager
2299             .lock()
2300             .unwrap()
2301             .memory_range_table(false)
2302     }
2303 
2304     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2305         self.device_manager.lock().unwrap().device_tree()
2306     }
2307 
2308     pub fn activate_virtio_devices(&self) -> Result<()> {
2309         self.device_manager
2310             .lock()
2311             .unwrap()
2312             .activate_virtio_devices()
2313             .map_err(Error::ActivateVirtioDevices)
2314     }
2315 
2316     #[cfg(target_arch = "x86_64")]
2317     pub fn power_button(&self) -> Result<()> {
2318         return self
2319             .device_manager
2320             .lock()
2321             .unwrap()
2322             .notify_power_button()
2323             .map_err(Error::PowerButton);
2324     }
2325 
2326     #[cfg(target_arch = "aarch64")]
2327     pub fn power_button(&self) -> Result<()> {
2328         self.device_manager
2329             .lock()
2330             .unwrap()
2331             .notify_power_button()
2332             .map_err(Error::PowerButton)
2333     }
2334 
2335     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2336         self.memory_manager.lock().unwrap().snapshot_data()
2337     }
2338 
2339     #[cfg(feature = "guest_debug")]
2340     pub fn debug_request(
2341         &mut self,
2342         gdb_request: &GdbRequestPayload,
2343         cpu_id: usize,
2344     ) -> Result<GdbResponsePayload> {
2345         use GdbRequestPayload::*;
2346         match gdb_request {
2347             SetSingleStep(single_step) => {
2348                 self.set_guest_debug(cpu_id, &[], *single_step)
2349                     .map_err(Error::Debug)?;
2350             }
2351             SetHwBreakPoint(addrs) => {
2352                 self.set_guest_debug(cpu_id, addrs, false)
2353                     .map_err(Error::Debug)?;
2354             }
2355             Pause => {
2356                 self.debug_pause().map_err(Error::Debug)?;
2357             }
2358             Resume => {
2359                 self.debug_resume().map_err(Error::Debug)?;
2360             }
2361             ReadRegs => {
2362                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2363                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2364             }
2365             WriteRegs(regs) => {
2366                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2367             }
2368             ReadMem(vaddr, len) => {
2369                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2370                 let mem = self
2371                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2372                     .map_err(Error::Debug)?;
2373                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2374             }
2375             WriteMem(vaddr, data) => {
2376                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2377                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2378                     .map_err(Error::Debug)?;
2379             }
2380             ActiveVcpus => {
2381                 let active_vcpus = self.active_vcpus();
2382                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2383             }
2384         }
2385         Ok(GdbResponsePayload::CommandComplete)
2386     }
2387 
2388     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2389     fn get_dump_state(
2390         &mut self,
2391         destination_url: &str,
2392     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2393         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2394         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2395         let mut elf_phdr_num = 1;
2396         let elf_sh_info = 0;
2397         let coredump_file_path = url_to_file(destination_url)?;
2398         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2399 
2400         if mapping_num < UINT16_MAX - 2 {
2401             elf_phdr_num += mapping_num as u16;
2402         } else {
2403             panic!("mapping num beyond 65535 not supported");
2404         }
2405         let coredump_file = OpenOptions::new()
2406             .read(true)
2407             .write(true)
2408             .create_new(true)
2409             .open(coredump_file_path)
2410             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2411 
2412         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2413         let mem_data = self
2414             .memory_manager
2415             .lock()
2416             .unwrap()
2417             .coredump_memory_regions(mem_offset);
2418 
2419         Ok(DumpState {
2420             elf_note_size,
2421             elf_phdr_num,
2422             elf_sh_info,
2423             mem_offset,
2424             mem_info: Some(mem_data),
2425             file: Some(coredump_file),
2426         })
2427     }
2428 
2429     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2430     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2431         size_of::<elf::Elf64_Ehdr>() as u64
2432             + note_size as u64
2433             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2434     }
2435 
2436     pub fn nmi(&self) -> Result<()> {
2437         return self
2438             .cpu_manager
2439             .lock()
2440             .unwrap()
2441             .nmi()
2442             .map_err(|_| Error::ErrorNmi);
2443     }
2444 }
2445 
2446 impl Pausable for Vm {
2447     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2448         event!("vm", "pausing");
2449         let mut state = self
2450             .state
2451             .try_write()
2452             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2453         let new_state = VmState::Paused;
2454 
2455         state
2456             .valid_transition(new_state)
2457             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2458 
2459         #[cfg(target_arch = "x86_64")]
2460         {
2461             let mut clock = self
2462                 .vm
2463                 .get_clock()
2464                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2465             clock.reset_flags();
2466             self.saved_clock = Some(clock);
2467         }
2468 
2469         // Before pausing the vCPUs activate any pending virtio devices that might
2470         // need activation between starting the pause (or e.g. a migration it's part of)
2471         self.activate_virtio_devices().map_err(|e| {
2472             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2473         })?;
2474 
2475         self.cpu_manager.lock().unwrap().pause()?;
2476         self.device_manager.lock().unwrap().pause()?;
2477 
2478         self.vm
2479             .pause()
2480             .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?;
2481 
2482         *state = new_state;
2483 
2484         event!("vm", "paused");
2485         Ok(())
2486     }
2487 
2488     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2489         event!("vm", "resuming");
2490         let current_state = self.get_state().unwrap();
2491         let mut state = self
2492             .state
2493             .try_write()
2494             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2495         let new_state = VmState::Running;
2496 
2497         state
2498             .valid_transition(new_state)
2499             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2500 
2501         self.cpu_manager.lock().unwrap().resume()?;
2502         #[cfg(target_arch = "x86_64")]
2503         {
2504             if let Some(clock) = &self.saved_clock {
2505                 self.vm.set_clock(clock).map_err(|e| {
2506                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2507                 })?;
2508             }
2509         }
2510 
2511         if current_state == VmState::Paused {
2512             self.vm
2513                 .resume()
2514                 .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?;
2515         }
2516 
2517         self.device_manager.lock().unwrap().resume()?;
2518 
2519         // And we're back to the Running state.
2520         *state = new_state;
2521         event!("vm", "resumed");
2522         Ok(())
2523     }
2524 }
2525 
/// Serializable VM-level state that `Vm::snapshot()` stores at the root of the
/// snapshot it produces.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    /// Copy of the VM's `saved_clock` at snapshot time; `None` if no clock was
    /// saved.
    #[cfg(target_arch = "x86_64")]
    pub clock: Option<hypervisor::ClockData>,
    /// CPUID entries produced by `arch::generate_common_cpuid` at snapshot time.
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}
2533 
/// Identifier returned by `Vm`'s `Snapshottable::id()` implementation.
pub const VM_SNAPSHOT_ID: &str = "vm";
2535 impl Snapshottable for Vm {
2536     fn id(&self) -> String {
2537         VM_SNAPSHOT_ID.to_string()
2538     }
2539 
2540     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2541         event!("vm", "snapshotting");
2542 
2543         #[cfg(feature = "tdx")]
2544         {
2545             if self.config.lock().unwrap().is_tdx_enabled() {
2546                 return Err(MigratableError::Snapshot(anyhow!(
2547                     "Snapshot not possible with TDX VM"
2548                 )));
2549             }
2550         }
2551 
2552         let current_state = self.get_state().unwrap();
2553         if current_state != VmState::Paused {
2554             return Err(MigratableError::Snapshot(anyhow!(
2555                 "Trying to snapshot while VM is running"
2556             )));
2557         }
2558 
2559         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2560         let common_cpuid = {
2561             let amx = self.config.lock().unwrap().cpus.features.amx;
2562             let phys_bits = physical_bits(
2563                 &self.hypervisor,
2564                 self.config.lock().unwrap().cpus.max_phys_bits,
2565             );
2566             arch::generate_common_cpuid(
2567                 &self.hypervisor,
2568                 &arch::CpuidConfig {
2569                     sgx_epc_sections: None,
2570                     phys_bits,
2571                     kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2572                     #[cfg(feature = "tdx")]
2573                     tdx: false,
2574                     amx,
2575                 },
2576             )
2577             .map_err(|e| {
2578                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2579             })?
2580         };
2581 
2582         let vm_snapshot_state = VmSnapshot {
2583             #[cfg(target_arch = "x86_64")]
2584             clock: self.saved_clock,
2585             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2586             common_cpuid,
2587         };
2588 
2589         let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?;
2590 
2591         let (id, snapshot) = {
2592             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2593             (cpu_manager.id(), cpu_manager.snapshot()?)
2594         };
2595         vm_snapshot.add_snapshot(id, snapshot);
2596         let (id, snapshot) = {
2597             let mut memory_manager = self.memory_manager.lock().unwrap();
2598             (memory_manager.id(), memory_manager.snapshot()?)
2599         };
2600         vm_snapshot.add_snapshot(id, snapshot);
2601         let (id, snapshot) = {
2602             let mut device_manager = self.device_manager.lock().unwrap();
2603             (device_manager.id(), device_manager.snapshot()?)
2604         };
2605         vm_snapshot.add_snapshot(id, snapshot);
2606 
2607         event!("vm", "snapshotted");
2608         Ok(vm_snapshot)
2609     }
2610 }
2611 
2612 impl Transportable for Vm {
2613     fn send(
2614         &self,
2615         snapshot: &Snapshot,
2616         destination_url: &str,
2617     ) -> std::result::Result<(), MigratableError> {
2618         let mut snapshot_config_path = url_to_path(destination_url)?;
2619         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2620 
2621         // Create the snapshot config file
2622         let mut snapshot_config_file = OpenOptions::new()
2623             .read(true)
2624             .write(true)
2625             .create_new(true)
2626             .open(snapshot_config_path)
2627             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2628 
2629         // Serialize and write the snapshot config
2630         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2631             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2632 
2633         snapshot_config_file
2634             .write(vm_config.as_bytes())
2635             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2636 
2637         let mut snapshot_state_path = url_to_path(destination_url)?;
2638         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2639 
2640         // Create the snapshot state file
2641         let mut snapshot_state_file = OpenOptions::new()
2642             .read(true)
2643             .write(true)
2644             .create_new(true)
2645             .open(snapshot_state_path)
2646             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2647 
2648         // Serialize and write the snapshot state
2649         let vm_state =
2650             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2651 
2652         snapshot_state_file
2653             .write(&vm_state)
2654             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2655 
2656         // Tell the memory manager to also send/write its own snapshot.
2657         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2658             self.memory_manager
2659                 .lock()
2660                 .unwrap()
2661                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2662         } else {
2663             return Err(MigratableError::Restore(anyhow!(
2664                 "Missing memory manager snapshot"
2665             )));
2666         }
2667 
2668         Ok(())
2669     }
2670 }
2671 
2672 impl Migratable for Vm {
2673     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2674         self.memory_manager.lock().unwrap().start_dirty_log()?;
2675         self.device_manager.lock().unwrap().start_dirty_log()
2676     }
2677 
2678     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2679         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2680         self.device_manager.lock().unwrap().stop_dirty_log()
2681     }
2682 
2683     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2684         Ok(MemoryRangeTable::new_from_tables(vec![
2685             self.memory_manager.lock().unwrap().dirty_log()?,
2686             self.device_manager.lock().unwrap().dirty_log()?,
2687         ]))
2688     }
2689 
2690     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2691         self.memory_manager.lock().unwrap().start_migration()?;
2692         self.device_manager.lock().unwrap().start_migration()
2693     }
2694 
2695     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2696         self.memory_manager.lock().unwrap().complete_migration()?;
2697         self.device_manager.lock().unwrap().complete_migration()
2698     }
2699 }
2700 
#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Delegates to the CPU manager's `set_guest_debug` for vCPU `cpu_id`.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM if it is running, then records `VmState::BreakPoint`.
    ///
    /// A pause failure is returned as `DebuggableError::Pause`; failing to
    /// take the state write lock is returned as
    /// `DebuggableError::PoisonedState`.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        // The read guard is released at the end of this statement, before
        // `pause()` takes the write lock.
        let is_running = *self.state.read().unwrap() == VmState::Running;
        if is_running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut vm_state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *vm_state = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes the VM if it is in `VmState::BreakPoint`; otherwise does
    /// nothing.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        // The read guard is released at the end of this statement, before
        // `resume()` takes the write lock.
        let at_breakpoint = *self.state.read().unwrap() == VmState::BreakPoint;
        if !at_breakpoint {
            return Ok(());
        }

        // NOTE(review): a resume failure is reported through the `Pause`
        // variant; confirm whether `DebuggableError` has a more fitting one.
        self.resume().map_err(DebuggableError::Pause)
    }

    /// Delegates to the CPU manager's `read_regs` for vCPU `cpu_id`.
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    /// Delegates to the CPU manager's `write_regs` for vCPU `cpu_id`.
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    /// Delegates to the CPU manager's `read_mem` to read `len` bytes at
    /// `vaddr` for vCPU `cpu_id`.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    /// Delegates to the CPU manager's `write_mem` to write `data` at `vaddr`
    /// for vCPU `cpu_id`.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Returns the CPU manager's active vCPU count, or its boot vCPU count
    /// when no vCPU is active.
    fn active_vcpus(&self) -> usize {
        let running = self.cpu_manager.lock().unwrap().active_vcpus();
        if running == 0 {
            // The VM is not booted yet. Report boot_vcpus() instead.
            return self.cpu_manager.lock().unwrap().boot_vcpus() as usize;
        }
        running
    }
}
2784 
2785 #[cfg(feature = "guest_debug")]
/// Largest value a `u16` can hold, widened to `u32`.
pub const UINT16_MAX: u32 = u16::MAX as u32;
2787 
// The impl body is empty: `Vm` overrides nothing, so every `Elf64Writable`
// item used on `Vm` comes from the trait's own definitions.
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}
2790 
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes a core dump of the guest to the file designated by
    /// `destination_url`.
    ///
    /// A running VM is paused for the duration of the dump and resumed
    /// afterwards, whether or not the dump succeeded. A VM that was already
    /// paused stays paused.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` for a TDX VM (with the `tdx`
    /// feature), when the VM state cannot be read, or when the VM is neither
    /// running nor paused. Returns `GuestDebuggableError::Pause` or
    /// `GuestDebuggableError::Resume` when pausing or resuming fails. Errors
    /// from the dump steps are propagated; if both the dump and the resume
    /// fail, the dump error is the one returned.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        // Report an unreadable state as an error instead of panicking.
        let current_state = self.get_state().map_err(|e| {
            GuestDebuggableError::Coredump(anyhow!("Could not get VM state: {:?}", e))
        })?;

        // `resume` records whether this call paused the VM itself.
        let resume = match current_state {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                true
            }
            VmState::Paused => false,
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        };

        let dump_result = self.coredump_write_sections(destination_url);

        if resume {
            // Resume even when the dump failed, so that an error does not
            // leave a previously running guest paused.
            let resume_result = self.resume().map_err(GuestDebuggableError::Resume);
            return dump_result.and(resume_result);
        }

        dump_result
    }
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Vm {
    /// Runs every step that writes the core dump: header, note, loads, the
    /// CPU manager's ELF and VMM notes, and the guest memory.
    fn coredump_write_sections(
        &mut self,
        destination_url: &str,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let coredump_state = self.get_dump_state(destination_url)?;

        self.write_header(&coredump_state)?;
        self.write_note(&coredump_state)?;
        self.write_loads(&coredump_state)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_elf64_note(&coredump_state)?;
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_vmm_note(&coredump_state)?;

        self.memory_manager
            .lock()
            .unwrap()
            .coredump_iterate_save_mem(&coredump_state)?;

        Ok(())
    }
}
2849 
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks, for every possible target state, whether `valid_transition`
    /// accepts or rejects leaving `state` for it.
    fn test_vm_state_transitions(state: VmState) {
        let allowed = |next: VmState| state.valid_transition(next).is_ok();

        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(!allowed(VmState::Created));
                assert!(allowed(VmState::Running));
                assert!(allowed(VmState::Shutdown));
                assert!(allowed(VmState::Paused));
                assert!(allowed(VmState::BreakPoint));
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(!allowed(VmState::Created));
                assert!(!allowed(VmState::Running));
                assert!(allowed(VmState::Shutdown));
                assert!(allowed(VmState::Paused));
                assert!(allowed(VmState::BreakPoint));
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(!allowed(VmState::Created));
                assert!(allowed(VmState::Running));
                assert!(!allowed(VmState::Shutdown));
                assert!(!allowed(VmState::Paused));
                assert!(!allowed(VmState::BreakPoint));
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(!allowed(VmState::Created));
                assert!(allowed(VmState::Running));
                assert!(allowed(VmState::Shutdown));
                assert!(!allowed(VmState::Paused));
                assert!(!allowed(VmState::BreakPoint));
            }
            VmState::BreakPoint => {
                // Check the transitions from Breakpoint
                assert!(allowed(VmState::Created));
                assert!(allowed(VmState::Running));
                assert!(!allowed(VmState::Shutdown));
                assert!(!allowed(VmState::Paused));
                assert!(!allowed(VmState::BreakPoint));
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    /// Feeds `Vm::hob_memory_resources` several layouts of TDVF sections and
    /// guest RAM ranges and compares the result with the expected list.
    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Shorthand for a TDVF section that only sets its address and size.
        let section = |address, size| TdvfSection {
            address,
            size,
            ..Default::default()
        };

        // Case 1: Two TDVF sections in the middle of the RAM
        let tdvf_sections = vec![section(0xc000, 0x1000), section(0x1000, 0x4000)];
        let ram_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let want = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        let guest_memory = GuestMemoryMmap::from_ranges(&ram_ranges).unwrap();
        let got = Vm::hob_memory_resources(tdvf_sections, &guest_memory);
        assert_eq!(want, got);

        // Case 2: Two TDVF sections with no conflict with the RAM
        let tdvf_sections = vec![section(0x1000_1000, 0x1000), section(0, 0x1000)];
        let ram_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let want = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        let guest_memory = GuestMemoryMmap::from_ranges(&ram_ranges).unwrap();
        let got = Vm::hob_memory_resources(tdvf_sections, &guest_memory);
        assert_eq!(want, got);

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let tdvf_sections = vec![section(0x1000_0000, 0x2000), section(0, 0x2000)];
        let ram_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let want = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        let guest_memory = GuestMemoryMmap::from_ranges(&ram_ranges).unwrap();
        let got = Vm::hob_memory_resources(tdvf_sections, &guest_memory);
        assert_eq!(want, got);

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let tdvf_sections = vec![
            section(0x2000_1000, 0x1000),
            section(0x2000_0000, 0x1000),
            section(0x1000, 0x1000),
            section(0, 0x1000),
        ];
        let ram_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let want = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        let guest_memory = GuestMemoryMmap::from_ranges(&ram_ranges).unwrap();
        let got = Vm::hob_memory_resources(tdvf_sections, &guest_memory);
        assert_eq!(want, got);

        // Case 5: One TDVF section overriding the entire RAM
        let tdvf_sections = vec![section(0, 0x2000_0000)];
        let ram_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let want = vec![(0, 0x2000_0000, false)];
        let guest_memory = GuestMemoryMmap::from_ranges(&ram_ranges).unwrap();
        let got = Vm::hob_memory_resources(tdvf_sections, &guest_memory);
        assert_eq!(want, got);

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let tdvf_sections = vec![section(0x1000_2000, 0x2000), section(0, 0x2000)];
        let ram_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let want = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        let guest_memory = GuestMemoryMmap::from_ranges(&ram_ranges).unwrap();
        let got = Vm::hob_memory_resources(tdvf_sections, &guest_memory);
        assert_eq!(want, got);

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let tdvf_sections = vec![section(0x1000_0000, 0x4000), section(0, 0x4000)];
        let ram_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let want = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        let guest_memory = GuestMemoryMmap::from_ranges(&ram_ranges).unwrap();
        let got = Vm::hob_memory_resources(tdvf_sections, &guest_memory);
        assert_eq!(want, got);
    }
}
3125 
#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    use super::*;

    /// Length given to every MMIO device region in the test below.
    const LEN: u64 = 4096;

    /// Checks that `create_fdt` succeeds for a guest with a serial, a virtio
    /// and an RTC MMIO device.
    #[test]
    fn test_create_fdt_with_devices() {
        let ram_regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let guest_mem =
            GuestMemoryMmap::from_ranges(&ram_regions).expect("Cannot initialize memory");

        // The three devices are laid out back to back, `LEN` bytes each.
        let mmio_devices: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> =
            HashMap::from([
                (
                    (DeviceType::Serial, DeviceType::Serial.to_string()),
                    MmioDeviceInfo {
                        addr: 0x00,
                        len: LEN,
                        irq: 33,
                    },
                ),
                (
                    (DeviceType::Virtio(1), "virtio".to_string()),
                    MmioDeviceInfo {
                        addr: LEN,
                        len: LEN,
                        irq: 34,
                    },
                ),
                (
                    (DeviceType::Rtc, "rtc".to_string()),
                    MmioDeviceInfo {
                        addr: 2 * LEN,
                        len: LEN,
                        irq: 35,
                    },
                ),
            ]);

        let hypervisor = hypervisor::new().unwrap();
        let guest_vm = hypervisor.create_vm().unwrap();
        let vgic = guest_vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");

        assert!(create_fdt(
            &guest_mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &mmio_devices,
            &vgic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .is_ok());
    }
}
3194 
/// Boots a minimal guest that writes to port 0x3f8 and halts, and runs its
/// single vCPU until a `VmExit::Reset` exit is reported.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let guest_code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee,  /* out %al, (%dx) */
        0xf4,  /* hlt */
    ];

    let ram_size = 0x1000;
    let entry_addr = GuestAddress(0x1000);
    let guest_mem = GuestMemoryMmap::from_ranges(&[(entry_addr, ram_size)]).unwrap();

    let hypervisor = hypervisor::new().unwrap();
    let guest_vm = hypervisor.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor, one slot each.
    for (slot, region) in guest_mem.iter().enumerate() {
        let user_region = guest_vm.make_user_memory_region(
            slot as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        guest_vm
            .create_user_memory_region(user_region)
            .expect("Cannot configure guest memory");
    }
    guest_mem
        .write_slice(&guest_code, entry_addr)
        .expect("Writing code to memory failed");

    let vcpu = guest_vm.create_vcpu(0, None).expect("new Vcpu failed");

    let mut sregs = vcpu.get_sregs().expect("get sregs failed");
    sregs.cs.base = 0;
    sregs.cs.selector = 0;
    vcpu.set_sregs(&sregs).expect("set sregs failed");

    let mut regs = vcpu.get_regs().expect("get regs failed");
    regs.set_rip(0x1000);
    regs.set_rax(2);
    regs.set_rbx(3);
    regs.set_rflags(2);
    vcpu.set_regs(&regs).expect("set regs failed");

    loop {
        let exit = vcpu.run().expect("run failed");
        if matches!(exit, VmExit::Reset) {
            println!("HLT");
            break;
        }
        // Any exit other than `Ignore` is a test failure.
        assert!(
            matches!(exit, VmExit::Ignore),
            "unexpected exit reason: {exit:?}"
        );
    }
}
3259