xref: /cloud-hypervisor/vmm/src/vm.rs (revision eb0b14f70ed5ed44b76579145fd2a741c0100ae4)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use std::collections::{BTreeMap, HashMap};
15 use std::fs::{File, OpenOptions};
16 use std::io::{self, Seek, SeekFrom, Write};
17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
18 use std::mem::size_of;
19 use std::num::Wrapping;
20 use std::ops::Deref;
21 use std::os::unix::net::UnixStream;
22 use std::sync::{Arc, Mutex, RwLock};
23 #[cfg(not(target_arch = "riscv64"))]
24 use std::time::Instant;
25 use std::{cmp, result, str, thread};
26 
27 use anyhow::anyhow;
28 #[cfg(target_arch = "x86_64")]
29 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
30 #[cfg(feature = "tdx")]
31 use arch::x86_64::tdx::TdvfSection;
32 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
33 use arch::PciSpaceInfo;
34 use arch::{get_host_cpu_phys_bits, EntryPoint, NumaNode, NumaNodes};
35 #[cfg(target_arch = "aarch64")]
36 use devices::interrupt_controller;
37 use devices::AcpiNotificationFlags;
38 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
39 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
40 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
41 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
42 #[cfg(target_arch = "aarch64")]
43 use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ;
44 use hypervisor::{HypervisorVmError, VmOps};
45 use libc::{termios, SIGWINCH};
46 use linux_loader::cmdline::Cmdline;
47 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
48 use linux_loader::elf;
49 #[cfg(target_arch = "x86_64")]
50 use linux_loader::loader::bzimage::BzImage;
51 #[cfg(target_arch = "x86_64")]
52 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
53 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
54 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
55 use linux_loader::loader::KernelLoader;
56 use seccompiler::SeccompAction;
57 use serde::{Deserialize, Serialize};
58 use thiserror::Error;
59 use tracer::trace_scoped;
60 use vm_device::Bus;
61 #[cfg(feature = "tdx")]
62 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
63 use vm_memory::{
64     Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
65 };
66 use vm_migration::protocol::{MemoryRangeTable, Request, Response};
67 use vm_migration::{
68     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable,
69 };
70 use vmm_sys_util::eventfd::EventFd;
71 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
72 
73 use crate::config::{add_to_config, ValidationError};
74 use crate::console_devices::{ConsoleDeviceError, ConsoleInfo};
75 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
76 use crate::coredump::{
77     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
78 };
79 use crate::device_manager::{DeviceManager, DeviceManagerError};
80 use crate::device_tree::DeviceTree;
81 #[cfg(feature = "guest_debug")]
82 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
83 #[cfg(feature = "igvm")]
84 use crate::igvm::igvm_loader;
85 use crate::landlock::LandlockError;
86 use crate::memory_manager::{
87     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
88 };
89 #[cfg(target_arch = "x86_64")]
90 use crate::migration::get_vm_snapshot;
91 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
92 use crate::migration::url_to_file;
93 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
94 use crate::vm_config::{
95     DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, NumaConfig, PayloadConfig,
96     PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig,
97 };
98 use crate::{
99     cpu, GuestMemoryMmap, PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID,
100     MEMORY_MANAGER_SNAPSHOT_ID,
101 };
102 
/// Errors associated with VM management
///
/// Variants carry the underlying cause via `#[source]` where the wrapped
/// type implements `std::error::Error`; a few wrap project types that do
/// not, and are formatted with `{0:?}` instead.
#[derive(Debug, Error)]
pub enum Error {
    #[error("Cannot open kernel file: {0}")]
    KernelFile(#[source] io::Error),

    #[error("Cannot open initramfs file: {0}")]
    InitramfsFile(#[source] io::Error),

    #[error("Cannot load the kernel into memory: {0}")]
    KernelLoad(#[source] linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot load the UEFI binary in memory: {0:?}")]
    UefiLoad(arch::aarch64::uefi::Error),

    #[error("Cannot load the initramfs into memory")]
    InitramfsLoad,

    #[error("Cannot load the kernel command line in memory: {0}")]
    LoadCmdLine(#[source] linux_loader::loader::Error),

    #[error("Failed to apply landlock config during vm_create: {0}")]
    ApplyLandlock(#[source] LandlockError),

    #[error("Cannot modify the kernel command line: {0}")]
    CmdLineInsertStr(#[source] linux_loader::cmdline::Error),

    #[error("Cannot create the kernel command line: {0}")]
    CmdLineCreate(#[source] linux_loader::cmdline::Error),

    #[error("Cannot configure system: {0}")]
    ConfigureSystem(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot enable interrupt controller: {0:?}")]
    EnableInterruptController(interrupt_controller::Error),

    // Raised when the state RwLock was poisoned by a panicking holder.
    #[error("VM state is poisoned")]
    PoisonedState,

    #[error("Error from device manager: {0:?}")]
    DeviceManager(#[source] DeviceManagerError),

    #[error("Error initializing VM: {0:?}")]
    InitializeVm(#[source] hypervisor::HypervisorVmError),

    #[error("No device with id {0:?} to remove")]
    NoDeviceToRemove(String),

    #[error("Cannot spawn a signal handler thread: {0}")]
    SignalHandlerSpawn(#[source] io::Error),

    // Payload is the boxed panic value returned by JoinHandle::join.
    #[error("Failed to join on threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("VM config is missing")]
    VmMissingConfig,

    #[error("VM is not created")]
    VmNotCreated,

    #[error("VM is already created")]
    VmAlreadyCreated,

    #[error("VM is not running")]
    VmNotRunning,

    #[error("Cannot clone EventFd: {0}")]
    EventFdClone(#[source] io::Error),

    // Emitted by VmState::valid_transition for a disallowed state change.
    #[error("invalid VM state transition: {0:?} to {1:?}")]
    InvalidStateTransition(VmState, VmState),

    #[error("Error from CPU manager: {0}")]
    CpuManager(#[source] cpu::Error),

    #[error("Cannot pause devices: {0}")]
    PauseDevices(#[source] MigratableError),

    #[error("Cannot resume devices: {0}")]
    ResumeDevices(#[source] MigratableError),

    #[error("Cannot pause CPUs: {0}")]
    PauseCpus(#[source] MigratableError),

    #[error("Cannot resume cpus: {0}")]
    ResumeCpus(#[source] MigratableError),

    #[error("Cannot pause VM: {0}")]
    Pause(#[source] MigratableError),

    #[error("Cannot resume VM: {0}")]
    Resume(#[source] MigratableError),

    #[error("Memory manager error: {0:?}")]
    MemoryManager(#[source] MemoryManagerError),

    #[error("Eventfd write error: {0}")]
    EventfdError(#[source] std::io::Error),

    #[error("Cannot snapshot VM: {0}")]
    Snapshot(#[source] MigratableError),

    #[error("Cannot restore VM: {0}")]
    Restore(#[source] MigratableError),

    #[error("Cannot send VM snapshot: {0}")]
    SnapshotSend(#[source] MigratableError),

    #[error("Invalid restore source URL")]
    InvalidRestoreSourceUrl,

    #[error("Failed to validate config: {0}")]
    ConfigValidation(#[source] ValidationError),

    #[error("Too many virtio-vsock devices")]
    TooManyVsockDevices,

    #[error("Failed serializing into JSON: {0}")]
    SerializeJson(#[source] serde_json::Error),

    #[error("Invalid NUMA configuration")]
    InvalidNumaConfig,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Failed resizing a memory zone")]
    ResizeZone,

    #[error("Cannot activate virtio devices: {0:?}")]
    ActivateVirtioDevices(#[source] DeviceManagerError),

    #[error("Error triggering power button: {0:?}")]
    PowerButton(#[source] DeviceManagerError),

    #[error("Kernel lacks PVH header")]
    KernelMissingPvhHeader,

    #[error("Failed to allocate firmware RAM: {0:?}")]
    AllocateFirmwareMemory(#[source] MemoryManagerError),

    #[error("Error manipulating firmware file: {0}")]
    FirmwareFile(#[source] std::io::Error),

    #[error("Firmware too big")]
    FirmwareTooLarge,

    #[error("Failed to copy firmware to memory: {0}")]
    FirmwareLoad(#[source] vm_memory::GuestMemoryError),

    #[cfg(feature = "sev_snp")]
    #[error("Error enabling SEV-SNP VM: {0}")]
    InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on TDX firmware file: {0}")]
    LoadTdvf(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on the TDX payload file: {0}")]
    LoadPayload(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error parsing TDVF: {0}")]
    ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error populating TDX HOB: {0}")]
    PopulateHob(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error allocating TDVF memory: {0:?}")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX VM: {0}")]
    InitializeTdxVm(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX memory region: {0}")]
    InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error finalizing TDX VM: {0}")]
    FinalizeTdx(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("TDX firmware missing")]
    TdxFirmwareMissing,

    #[cfg(feature = "tdx")]
    #[error("Invalid TDX payload type")]
    InvalidPayloadType,

    #[cfg(feature = "guest_debug")]
    #[error("Error debugging VM: {0:?}")]
    Debug(DebuggableError),

    #[error("Error spawning kernel loading thread")]
    KernelLoadThreadSpawn(#[source] std::io::Error),

    // Payload is the boxed panic value returned by JoinHandle::join.
    #[error("Error joining kernel loading thread")]
    KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Payload configuration is not bootable")]
    InvalidPayload,

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    #[error("Error coredumping VM: {0:?}")]
    Coredump(#[source] GuestDebuggableError),

    #[cfg(feature = "igvm")]
    #[error("Cannot open igvm file: {0}")]
    IgvmFile(#[source] io::Error),

    #[cfg(feature = "igvm")]
    #[error("Cannot load the igvm into memory: {0}")]
    IgvmLoad(#[source] igvm_loader::Error),

    #[error("Error injecting NMI")]
    ErrorNmi,

    #[error("Error resuming the VM: {0}")]
    ResumeVm(#[source] hypervisor::HypervisorVmError),

    #[error("Error creating console devices")]
    CreateConsoleDevices(#[source] ConsoleDeviceError),

    #[error("Error locking disk images: Another instance likely holds a lock")]
    LockingError(#[source] DeviceManagerError),
}

/// Shorthand result type for VM management operations.
pub type Result<T> = result::Result<T, Error>;
340 
/// Lifecycle states of a [`Vm`].
///
/// Allowed transitions are enforced by [`VmState::valid_transition`].
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    /// VM object exists but has not been booted yet.
    Created,
    /// Guest vCPUs are running.
    Running,
    /// VM has been shut down; can only transition back to `Running`.
    Shutdown,
    /// Execution is suspended (also the initial state after a snapshot restore).
    Paused,
    /// Stopped under the guest debugger (guest_debug feature).
    BreakPoint,
}
349 
350 impl VmState {
351     fn valid_transition(self, new_state: VmState) -> Result<()> {
352         match self {
353             VmState::Created => match new_state {
354                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
355                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
356                     Ok(())
357                 }
358             },
359 
360             VmState::Running => match new_state {
361                 VmState::Created | VmState::Running => {
362                     Err(Error::InvalidStateTransition(self, new_state))
363                 }
364                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
365             },
366 
367             VmState::Shutdown => match new_state {
368                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
369                     Err(Error::InvalidStateTransition(self, new_state))
370                 }
371                 VmState::Running => Ok(()),
372             },
373 
374             VmState::Paused => match new_state {
375                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
376                     Err(Error::InvalidStateTransition(self, new_state))
377                 }
378                 VmState::Running | VmState::Shutdown => Ok(()),
379             },
380             VmState::BreakPoint => match new_state {
381                 VmState::Created | VmState::Running => Ok(()),
382                 _ => Err(Error::InvalidStateTransition(self, new_state)),
383             },
384         }
385     }
386 }
387 
/// Bridge handed to the hypervisor vCPU loop (as `Arc<dyn VmOps>`) giving
/// it access to guest memory and the PIO/MMIO device buses.
struct VmOpsHandler {
    // Atomic view of the guest RAM used for guest_mem_read/write.
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    // Port I/O bus; only exists on x86_64.
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    // Memory-mapped I/O bus shared with the device manager.
    mmio_bus: Arc<Bus>,
}
394 
395 impl VmOps for VmOpsHandler {
396     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
397         self.memory
398             .memory()
399             .write(buf, GuestAddress(gpa))
400             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
401     }
402 
403     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
404         self.memory
405             .memory()
406             .read(buf, GuestAddress(gpa))
407             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
408     }
409 
410     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
411         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
412             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
413         }
414         Ok(())
415     }
416 
417     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
418         match self.mmio_bus.write(gpa, data) {
419             Err(vm_device::BusError::MissingAddressRange) => {
420                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
421             }
422             Ok(Some(barrier)) => {
423                 info!("Waiting for barrier");
424                 barrier.wait();
425                 info!("Barrier released");
426             }
427             _ => {}
428         };
429         Ok(())
430     }
431 
432     #[cfg(target_arch = "x86_64")]
433     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
434         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
435             info!("Guest PIO read to unregistered address 0x{:x}", port);
436         }
437         Ok(())
438     }
439 
440     #[cfg(target_arch = "x86_64")]
441     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
442         match self.io_bus.write(port, data) {
443             Err(vm_device::BusError::MissingAddressRange) => {
444                 info!("Guest PIO write to unregistered address 0x{:x}", port);
445             }
446             Ok(Some(barrier)) => {
447                 info!("Waiting for barrier");
448                 barrier.wait();
449                 info!("Barrier released");
450             }
451             _ => {}
452         };
453         Ok(())
454     }
455 }
456 
457 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
458     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
459 
460     cmp::min(host_phys_bits, max_phys_bits)
461 }
462 
/// A single virtual machine instance tying together the CPU, memory and
/// device managers on top of a hypervisor-abstracted VM.
pub struct Vm {
    // Kernel file opened from the payload config; only kept around for TDX.
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    // Initramfs file opened from the payload config, if any.
    initramfs: Option<File>,
    // Worker threads (e.g. signal handling) to join on teardown.
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    // Current lifecycle state; transitions validated by VmState::valid_transition.
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    // Guest clock captured in a snapshot and restored on resume (x86 only).
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
    #[cfg(not(target_arch = "riscv64"))]
    numa_nodes: NumaNodes,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    #[cfg(not(target_arch = "riscv64"))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    // When true (gdb enabled), the guest stays stopped after boot.
    stop_on_boot: bool,
    // Handle of the async payload-loading thread; joined to get the entry point.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}
486 
487 impl Vm {
    // Signals the VM's signal-handling thread listens for. SIGWINCH is the
    // terminal window-resize signal, forwarded for console resizing.
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
489 
    /// Build a [`Vm`] on top of an already-constructed `MemoryManager`.
    ///
    /// This is the common construction path for both cold boot (`snapshot`
    /// is `None`) and restore (`snapshot` is `Some`). Ordering in this
    /// function is deliberate and hypervisor-specific: MSHV needs the
    /// interrupt controller before `vm.init()`, KVM needs it after the boot
    /// vCPUs are created; TDX must be initialized before vCPU creation and
    /// SEV-SNP immediately after. Do not reorder without checking those
    /// constraints.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        #[cfg(not(target_arch = "riscv64"))] timestamp: Instant,
        console_info: Option<ConsoleInfo>,
        console_resize_pipe: Option<Arc<File>>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new_from_memory_manager");

        // Validate the configuration up front; this also yields the list of
        // device ids present at boot, which the device manager needs.
        let boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
        // Confidential guests (TDX / SEV-SNP) force the iommu flag passed to
        // the device manager below.
        #[cfg(feature = "tdx")]
        let force_iommu = tdx_enabled;
        #[cfg(feature = "sev_snp")]
        let force_iommu = sev_snp_enabled;
        #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
        let force_iommu = false;

        // With gdb enabled in the config, keep the guest stopped at boot so
        // the debugger can attach first.
        #[cfg(feature = "guest_debug")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "guest_debug"))]
        let stop_on_boot = false;

        let memory = memory_manager.lock().unwrap().guest_memory();
        let io_bus = Arc::new(Bus::new());
        let mmio_bus = Arc::new(Bus::new());

        // Handler through which the vCPU run loop reaches guest memory and
        // the PIO/MMIO buses.
        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus: io_bus.clone(),
            mmio_bus: mmio_bus.clone(),
        });

        // NOTE(review): the config lock is held only long enough to clone
        // the cpus section; the clone is what gets borrowed below.
        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            vm.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt.try_clone().map_err(Error::EventFdClone)?,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            &hypervisor,
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        )
        .map_err(Error::CpuManager)?;

        #[cfg(target_arch = "x86_64")]
        cpu_manager
            .lock()
            .unwrap()
            .populate_cpuid(
                &memory_manager,
                &hypervisor,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CpuManager)?;

        // The initial TDX configuration must be done before the vCPUs are
        // created
        #[cfg(feature = "tdx")]
        if tdx_enabled {
            let cpuid = cpu_manager.lock().unwrap().common_cpuid();
            let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
            vm.tdx_init(&cpuid, max_vcpus)
                .map_err(Error::InitializeTdxVm)?;
        }

        // TDX guests are not "dynamic"; the flag is forwarded to the device
        // manager below.
        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        #[cfg(feature = "kvm")]
        let is_kvm = matches!(
            hypervisor.hypervisor_type(),
            hypervisor::HypervisorType::Kvm
        );
        #[cfg(feature = "mshv")]
        let is_mshv = matches!(
            hypervisor.hypervisor_type(),
            hypervisor::HypervisorType::Mshv
        );

        let device_manager = DeviceManager::new(
            io_bus,
            mmio_bus,
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            cpu_manager.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            boot_id_list,
            #[cfg(not(target_arch = "riscv64"))]
            timestamp,
            snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
            dynamic,
        )
        .map_err(Error::DeviceManager)?;

        // For MSHV, we need to create the interrupt controller before we initialize the VM.
        // Because we need to set the base address of GICD before we initialize the VM.
        #[cfg(feature = "mshv")]
        {
            if is_mshv {
                let ic = device_manager
                    .lock()
                    .unwrap()
                    .create_interrupt_controller()
                    .map_err(Error::DeviceManager)?;

                vm.init().map_err(Error::InitializeVm)?;

                device_manager
                    .lock()
                    .unwrap()
                    .create_devices(
                        console_info.clone(),
                        console_resize_pipe.clone(),
                        original_termios.clone(),
                        ic,
                    )
                    .map_err(Error::DeviceManager)?;
            }
        }

        memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        #[cfg(target_arch = "aarch64")]
        memory_manager
            .lock()
            .unwrap()
            .add_uefi_flash()
            .map_err(Error::MemoryManager)?;

        // Loading the igvm file is pushed down here because
        // igvm parser needs cpu_manager to retrieve cpuid leaf.
        // Currently, Microsoft Hypervisor does not provide any
        // Hypervisor specific common cpuid, we need to call get_cpuid_values
        // per cpuid through cpu_manager.
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(
                &memory_manager,
                &config,
                #[cfg(feature = "igvm")]
                &cpu_manager,
                #[cfg(feature = "sev_snp")]
                sev_snp_enabled,
            )?
        } else {
            None
        };

        cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
            .map_err(Error::CpuManager)?;

        // For KVM, we need to create interrupt controller after we create boot vcpus.
        // Because we restore GIC state from the snapshot as part of boot vcpu creation.
        // This means that we need to create interrupt controller after we restore in case of KVM guests.
        #[cfg(feature = "kvm")]
        {
            if is_kvm {
                let ic = device_manager
                    .lock()
                    .unwrap()
                    .create_interrupt_controller()
                    .map_err(Error::DeviceManager)?;

                vm.init().map_err(Error::InitializeVm)?;

                device_manager
                    .lock()
                    .unwrap()
                    .create_devices(console_info, console_resize_pipe, original_termios, ic)
                    .map_err(Error::DeviceManager)?;
            }
        }

        // This initial SEV-SNP configuration must be done immediately after
        // vCPUs are created. As part of this initialization we are
        // transitioning the guest into secure state.
        #[cfg(feature = "sev_snp")]
        if sev_snp_enabled {
            vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
        }

        // Keep the kernel file open for the TDX boot path.
        #[cfg(feature = "tdx")]
        let kernel = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.kernel.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::KernelFile)?;

        // Open the initramfs now so a bad path fails the build early.
        let initramfs = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.initramfs.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::InitramfsFile)?;

        // When restoring, carry over the guest clock captured in the snapshot.
        #[cfg(target_arch = "x86_64")]
        let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
            let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
            vm_snapshot.clock
        } else {
            None
        };

        // A restored VM starts paused; a freshly built one starts in Created.
        let vm_state = if snapshot.is_some() {
            VmState::Paused
        } else {
            VmState::Created
        };

        Ok(Vm {
            #[cfg(feature = "tdx")]
            kernel,
            initramfs,
            device_manager,
            config,
            threads: Vec::with_capacity(1),
            state: RwLock::new(vm_state),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(target_arch = "x86_64")]
            saved_clock,
            #[cfg(not(target_arch = "riscv64"))]
            numa_nodes,
            #[cfg(not(target_arch = "riscv64"))]
            hypervisor,
            stop_on_boot,
            load_payload_handle,
        })
    }
775 
776     fn create_numa_nodes(
777         configs: Option<Vec<NumaConfig>>,
778         memory_manager: &Arc<Mutex<MemoryManager>>,
779     ) -> Result<NumaNodes> {
780         let mm = memory_manager.lock().unwrap();
781         let mm_zones = mm.memory_zones();
782         let mut numa_nodes = BTreeMap::new();
783 
784         if let Some(configs) = &configs {
785             for config in configs.iter() {
786                 if numa_nodes.contains_key(&config.guest_numa_id) {
787                     error!("Can't define twice the same NUMA node");
788                     return Err(Error::InvalidNumaConfig);
789                 }
790 
791                 let mut node = NumaNode::default();
792 
793                 if let Some(memory_zones) = &config.memory_zones {
794                     for memory_zone in memory_zones.iter() {
795                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
796                             node.memory_regions.extend(mm_zone.regions().clone());
797                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
798                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
799                             }
800                             node.memory_zones.push(memory_zone.clone());
801                         } else {
802                             error!("Unknown memory zone '{}'", memory_zone);
803                             return Err(Error::InvalidNumaConfig);
804                         }
805                     }
806                 }
807 
808                 if let Some(cpus) = &config.cpus {
809                     node.cpus.extend(cpus);
810                 }
811 
812                 if let Some(pci_segments) = &config.pci_segments {
813                     node.pci_segments.extend(pci_segments);
814                 }
815 
816                 if let Some(distances) = &config.distances {
817                     for distance in distances.iter() {
818                         let dest = distance.destination;
819                         let dist = distance.distance;
820 
821                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
822                             error!("Unknown destination NUMA node {}", dest);
823                             return Err(Error::InvalidNumaConfig);
824                         }
825 
826                         if node.distances.contains_key(&dest) {
827                             error!("Destination NUMA node {} has been already set", dest);
828                             return Err(Error::InvalidNumaConfig);
829                         }
830 
831                         node.distances.insert(dest, dist);
832                     }
833                 }
834 
835                 #[cfg(target_arch = "x86_64")]
836                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
837                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
838                         let mm_sections = sgx_epc_region.epc_sections();
839                         for sgx_epc_section in sgx_epc_sections.iter() {
840                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
841                                 node.sgx_epc_sections.push(mm_section.clone());
842                             } else {
843                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
844                                 return Err(Error::InvalidNumaConfig);
845                             }
846                         }
847                     } else {
848                         error!("Missing SGX EPC region");
849                         return Err(Error::InvalidNumaConfig);
850                     }
851                 }
852 
853                 numa_nodes.insert(config.guest_numa_id, node);
854             }
855         }
856 
857         Ok(numa_nodes)
858     }
859 
    /// Create a new `Vm` instance, either from scratch or restored from a
    /// snapshot.
    ///
    /// Steps: create the hypervisor-level VM object, build the
    /// `MemoryManager` (from `snapshot` when one is supplied, otherwise from
    /// the memory configuration), then delegate the remaining construction
    /// to `Vm::new_from_memory_manager`.
    ///
    /// `source_url` and `prefault` are only meaningful on the restore path:
    /// `source_url` points at the snapshot data and `prefault` controls how
    /// restored memory is mapped.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        vm_config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        console_info: Option<ConsoleInfo>,
        console_resize_pipe: Option<Arc<File>>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
        source_url: Option<&str>,
        prefault: Option<bool>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new");

        // Capture the construction start time; forwarded to
        // new_from_memory_manager (riscv64 builds do not use it).
        #[cfg(not(target_arch = "riscv64"))]
        let timestamp = Instant::now();

        // When restoring from a snapshot, TDX is forced off regardless of
        // the stored configuration.
        #[cfg(feature = "tdx")]
        let tdx_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_tdx_enabled()
        };

        // Likewise SEV-SNP is disabled on the restore path.
        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_sev_snp_enabled()
        };

        let vm = Self::create_hypervisor_vm(
            &hypervisor,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
            #[cfg(feature = "sev_snp")]
            vm_config.lock().unwrap().memory.total_size(),
        )?;

        let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);

        let memory_manager = if let Some(snapshot) =
            snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
        {
            // Restore path: rebuild guest memory from the snapshot.
            // NOTE(review): `prefault` is unwrapped here, so callers taking
            // the restore path must always provide it — confirm call sites.
            MemoryManager::new_from_snapshot(
                &snapshot,
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                source_url,
                prefault.unwrap(),
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            #[cfg(target_arch = "x86_64")]
            let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();

            // Cold-boot path: build guest memory from the configuration.
            MemoryManager::new(
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                None,
                phys_bits,
                #[cfg(feature = "tdx")]
                tdx_enabled,
                None,
                None,
                #[cfg(target_arch = "x86_64")]
                sgx_epc_config,
            )
            .map_err(Error::MemoryManager)?
        };

        Vm::new_from_memory_manager(
            vm_config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            #[cfg(not(target_arch = "riscv64"))]
            timestamp,
            console_info,
            console_resize_pipe,
            original_termios,
            snapshot,
        )
    }
957 
    /// Create the hypervisor-level VM object and apply the baseline
    /// arch-specific configuration.
    ///
    /// With the `tdx` feature, the VM type passed to the hypervisor encodes
    /// TDX on (1) / off (0); with `sev_snp`, the same applies for SEV-SNP
    /// plus the total guest memory size. Otherwise a plain VM is created.
    ///
    /// # Panics
    /// Uses `unwrap` throughout: a missing hypervisor extension or a failed
    /// creation/setup call aborts the process — these are treated as
    /// unrecoverable at this stage.
    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
        #[cfg(feature = "sev_snp")] mem_size: u64,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        hypervisor.check_required_extensions().unwrap();

        cfg_if::cfg_if! {
            if #[cfg(feature = "tdx")] {
                // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true
                // Otherwise KVM_X86_LEGACY_VM: 0
                // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM
                let vm = hypervisor
                    .create_vm_with_type(u64::from(tdx_enabled))
                    .unwrap();
            } else if #[cfg(feature = "sev_snp")] {
                // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true
                // Otherwise SEV_SNP_DISABLED: 0
                // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false
                let vm = hypervisor
                    .create_vm_with_type_and_memory(u64::from(sev_snp_enabled), mem_size)
                    .unwrap();
            } else {
                let vm = hypervisor.create_vm().unwrap();
            }
        }

        // x86_64: program the KVM identity-map and TSS addresses and enable
        // the split IRQ chip.
        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }
996 
997     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
998         let initramfs = self.initramfs.as_mut().unwrap();
999         let size: usize = initramfs
1000             .seek(SeekFrom::End(0))
1001             .map_err(|_| Error::InitramfsLoad)?
1002             .try_into()
1003             .unwrap();
1004         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
1005 
1006         let address =
1007             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
1008         let address = GuestAddress(address);
1009 
1010         guest_mem
1011             .read_volatile_from(address, initramfs, size)
1012             .map_err(|_| Error::InitramfsLoad)?;
1013 
1014         info!("Initramfs loaded: address = 0x{:x}", address.0);
1015         Ok(arch::InitramfsConfig { address, size })
1016     }
1017 
1018     pub fn generate_cmdline(
1019         payload: &PayloadConfig,
1020         #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] device_manager: &Arc<
1021             Mutex<DeviceManager>,
1022         >,
1023     ) -> Result<Cmdline> {
1024         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
1025         if let Some(s) = payload.cmdline.as_ref() {
1026             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
1027         }
1028 
1029         #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
1030         for entry in device_manager.lock().unwrap().cmdline_additions() {
1031             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
1032         }
1033         Ok(cmdline)
1034     }
1035 
1036     #[cfg(target_arch = "aarch64")]
1037     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
1038         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
1039         let mem = uefi_flash.memory();
1040         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
1041             .map_err(Error::UefiLoad)?;
1042         Ok(())
1043     }
1044 
    /// Determine the aarch64 guest entry point by loading the configured
    /// payload into guest memory.
    ///
    /// Exactly one of `firmware`/`kernel` must be provided:
    /// - kernel only: try it as a PE image first; if the PE magic does not
    ///   match, fall back to treating the same file as a raw UEFI binary;
    /// - firmware only: load it as a UEFI binary.
    ///
    /// Any other combination yields `Error::InvalidPayload`.
    #[cfg(target_arch = "aarch64")]
    fn load_kernel(
        firmware: Option<File>,
        kernel: Option<File>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let entry_addr = match (firmware, kernel) {
            (None, Some(mut kernel)) => {
                match linux_loader::loader::pe::PE::load(
                    mem.deref(),
                    Some(arch::layout::KERNEL_START),
                    &mut kernel,
                    None,
                ) {
                    Ok(entry_addr) => entry_addr.kernel_load,
                    // Try to load the binary as kernel PE file at first.
                    // If failed, retry to load it as UEFI binary.
                    // As the UEFI binary is formatless, it must be the last option to try.
                    Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                        Self::load_firmware(&kernel, memory_manager)?;
                        arch::layout::UEFI_START
                    }
                    Err(e) => {
                        return Err(Error::KernelLoad(e));
                    }
                }
            }
            (Some(firmware), None) => {
                // Firmware-only boot: copy into flash and start at UEFI_START.
                Self::load_firmware(&firmware, memory_manager)?;
                arch::layout::UEFI_START
            }
            _ => return Err(Error::InvalidPayload),
        };

        Ok(EntryPoint { entry_addr })
    }
1083 
1084     #[cfg(target_arch = "riscv64")]
1085     fn load_kernel(
1086         firmware: Option<File>,
1087         kernel: Option<File>,
1088         memory_manager: Arc<Mutex<MemoryManager>>,
1089     ) -> Result<EntryPoint> {
1090         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1091         let mem = guest_memory.memory();
1092         let alignment = 0x20_0000;
1093         let aligned_kernel_addr = arch::layout::KERNEL_START.0 + (alignment - 1) & !(alignment - 1);
1094         let entry_addr = match (firmware, kernel) {
1095             (None, Some(mut kernel)) => {
1096                 match linux_loader::loader::pe::PE::load(
1097                     mem.deref(),
1098                     Some(GuestAddress(aligned_kernel_addr)),
1099                     &mut kernel,
1100                     None,
1101                 ) {
1102                     Ok(entry_addr) => entry_addr.kernel_load,
1103                     // Try to load the binary as kernel PE file at first.
1104                     // If failed, retry to load it as UEFI binary.
1105                     // As the UEFI binary is formatless, it must be the last option to try.
1106                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
1107                         // TODO: UEFI for riscv64 is scheduled to next stage.
1108                         unimplemented!()
1109                     }
1110                     Err(e) => {
1111                         return Err(Error::KernelLoad(e));
1112                     }
1113                 }
1114             }
1115             (Some(_firmware), None) => {
1116                 // TODO: UEFI for riscv64 is scheduled to next stage.
1117                 unimplemented!()
1118             }
1119             _ => return Err(Error::InvalidPayload),
1120         };
1121 
1122         Ok(EntryPoint { entry_addr })
1123     }
1124 
    /// Load an IGVM file and derive the guest entry point from it.
    ///
    /// With SEV-SNP active on the vCPUs, the entry address is the VMSA GPA
    /// reported by the loader; otherwise it is the RIP captured in the
    /// loaded VMSA. `setup_header` is always `None` for IGVM boots.
    #[cfg(feature = "igvm")]
    fn load_igvm(
        igvm: File,
        memory_manager: Arc<Mutex<MemoryManager>>,
        cpu_manager: Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] host_data: &Option<String>,
    ) -> Result<EntryPoint> {
        let res = igvm_loader::load_igvm(
            &igvm,
            memory_manager,
            cpu_manager.clone(),
            "",
            #[cfg(feature = "sev_snp")]
            host_data,
        )
        .map_err(Error::IgvmLoad)?;

        // The cfg_if! block binds `entry_point` for use after the macro.
        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
                    EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
                } else {
                    EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
                };
            } else {
               let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
            }
        };
        Ok(entry_point)
    }
1155 
1156     #[cfg(target_arch = "x86_64")]
1157     fn load_kernel(
1158         mut kernel: File,
1159         cmdline: Option<Cmdline>,
1160         memory_manager: Arc<Mutex<MemoryManager>>,
1161     ) -> Result<EntryPoint> {
1162         info!("Loading kernel");
1163 
1164         let mem = {
1165             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1166             guest_memory.memory()
1167         };
1168 
1169         // Try ELF binary with PVH boot.
1170         let entry_addr = linux_loader::loader::elf::Elf::load(
1171             mem.deref(),
1172             None,
1173             &mut kernel,
1174             Some(arch::layout::HIGH_RAM_START),
1175         )
1176         // Try loading kernel as bzImage.
1177         .or_else(|_| {
1178             BzImage::load(
1179                 mem.deref(),
1180                 None,
1181                 &mut kernel,
1182                 Some(arch::layout::HIGH_RAM_START),
1183             )
1184         })
1185         .map_err(Error::KernelLoad)?;
1186 
1187         if let Some(cmdline) = cmdline {
1188             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1189                 .map_err(Error::LoadCmdLine)?;
1190         }
1191 
1192         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1193             // Use the PVH kernel entry point to boot the guest
1194             info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1195             Ok(EntryPoint {
1196                 entry_addr,
1197                 setup_header: None,
1198             })
1199         } else if entry_addr.setup_header.is_some() {
1200             // Use the bzImage 32bit entry point to boot the guest
1201             info!(
1202                 "bzImage kernel loaded: entry_addr = 0x{:x}",
1203                 entry_addr.kernel_load.0
1204             );
1205             Ok(EntryPoint {
1206                 entry_addr: entry_addr.kernel_load,
1207                 setup_header: entry_addr.setup_header,
1208             })
1209         } else {
1210             Err(Error::KernelMissingPvhHeader)
1211         }
1212     }
1213 
    /// Load the configured x86_64 boot payload and return the guest entry
    /// point.
    ///
    /// Order of consideration:
    /// 1. IGVM file (when the `igvm` feature is compiled in);
    /// 2. firmware alone (loaded through the kernel loader, no cmdline);
    /// 3. kernel (with a generated command line; initramfs is loaded later
    ///    by `configure_system`).
    ///
    /// Any other combination is `Error::InvalidPayload`.
    #[cfg(target_arch = "x86_64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
        #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<EntryPoint> {
        trace_scoped!("load_payload");
        #[cfg(feature = "igvm")]
        {
            if let Some(_igvm_file) = &payload.igvm {
                let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
                // NOTE(review): with `sev_snp` compiled in but disabled at
                // runtime, an IGVM payload falls through to the
                // firmware/kernel match below — confirm this is intended.
                #[cfg(feature = "sev_snp")]
                if sev_snp_enabled {
                    return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
                }
                #[cfg(not(feature = "sev_snp"))]
                return Self::load_igvm(igvm, memory_manager, cpu_manager);
            }
        }
        match (
            &payload.firmware,
            &payload.kernel,
            &payload.initramfs,
            &payload.cmdline,
        ) {
            (Some(firmware), None, None, None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(firmware, None, memory_manager)
            }
            (None, Some(kernel), _, _) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                let cmdline = Self::generate_cmdline(payload)?;
                Self::load_kernel(kernel, Some(cmdline), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }
1252 
1253     #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
1254     fn load_payload(
1255         payload: &PayloadConfig,
1256         memory_manager: Arc<Mutex<MemoryManager>>,
1257     ) -> Result<EntryPoint> {
1258         match (&payload.firmware, &payload.kernel) {
1259             (Some(firmware), None) => {
1260                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1261                 Self::load_kernel(Some(firmware), None, memory_manager)
1262             }
1263             (None, Some(kernel)) => {
1264                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1265                 Self::load_kernel(None, Some(kernel), memory_manager)
1266             }
1267             _ => Err(Error::InvalidPayload),
1268         }
1269     }
1270 
    /// Kick off payload loading on a background thread.
    ///
    /// Returns `Ok(None)` when there is nothing to load asynchronously:
    /// no payload configured, or TDX is enabled (TDX kernels are loaded
    /// through a different path). Otherwise returns the join handle of the
    /// "payload_loader" thread, which yields the guest `EntryPoint`.
    fn load_payload_async(
        memory_manager: &Arc<Mutex<MemoryManager>>,
        config: &Arc<Mutex<VmConfig>>,
        #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
        // Kernel with TDX is loaded in a different manner
        #[cfg(feature = "tdx")]
        if config.lock().unwrap().is_tdx_enabled() {
            return Ok(None);
        }

        config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|payload| {
                // Clone everything the worker needs so it owns its inputs.
                let memory_manager = memory_manager.clone();
                let payload = payload.clone();
                #[cfg(feature = "igvm")]
                let cpu_manager = cpu_manager.clone();

                std::thread::Builder::new()
                    .name("payload_loader".into())
                    .spawn(move || {
                        Self::load_payload(
                            &payload,
                            memory_manager,
                            #[cfg(feature = "igvm")]
                            cpu_manager,
                            #[cfg(feature = "sev_snp")]
                            sev_snp_enabled,
                        )
                    })
                    .map_err(Error::KernelLoadThreadSpawn)
            })
            .transpose()
    }
1310 
    /// Final x86_64 boot-time configuration: load the initramfs (if any),
    /// gather platform identity and topology data, and hand everything to
    /// `arch::configure_system`.
    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
        trace_scoped!("configure_system");
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        // Copy the initramfs into guest memory first so its final location
        // can be advertised to the kernel.
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
        let rsdp_addr = Some(rsdp_addr);
        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        // Optional platform identity fields (serial number, UUID, OEM
        // strings) from the platform configuration.
        let serial_number = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.serial_number.clone());

        let uuid = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.uuid.clone());

        let oem_strings = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.oem_strings.clone());

        // Re-borrow the OEM strings as `&str` slices for the arch layer.
        let oem_strings = oem_strings
            .as_deref()
            .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());

        let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            arch::layout::CMDLINE_MAX_SIZE,
            &initramfs_config,
            boot_vcpus,
            entry_addr.setup_header,
            rsdp_addr,
            sgx_epc_region,
            serial_number.as_deref(),
            uuid.as_deref(),
            oem_strings.as_deref(),
            topology,
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }
1379 
    /// aarch64 boot-time configuration: build the command line, collect
    /// MPIDRs, topology, device info, PCI space layout, the vGIC and PMU
    /// state, then call `arch::configure_system`.
    ///
    /// `_rsdp_addr` and `_entry_addr` are unused on this architecture.
    #[cfg(target_arch = "aarch64")]
    fn configure_system(
        &mut self,
        _rsdp_addr: GuestAddress,
        _entry_addr: EntryPoint,
    ) -> Result<()> {
        let cmdline = Self::generate_cmdline(
            self.config.lock().unwrap().payload.as_ref().unwrap(),
            &self.device_manager,
        )?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
        // Load the initramfs (if configured) so its placement can be passed on.
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        // Describe each PCI segment: config-space address plus its 64-bit
        // device window (start and inclusive size).
        for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
            let pci_space = PciSpaceInfo {
                pci_segment_id: pci_segment.id,
                mmio_config_address: pci_segment.mmio_config_address,
                pci_device_space_start: pci_segment.start_of_mem64_area,
                pci_device_space_size: pci_segment.end_of_mem64_area
                    - pci_segment.start_of_mem64_area
                    + 1,
            };
            pci_space_info.push(pci_space);
        }

        let virtio_iommu_bdf = self
            .device_manager
            .lock()
            .unwrap()
            .iommu_attached_devices()
            .as_ref()
            .map(|(v, _)| *v);

        let vgic = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .get_vgic()
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::SetupGic,
                ))
            })?;

        // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
        let pmu_supported = self
            .cpu_manager
            .lock()
            .unwrap()
            .init_pmu(AARCH64_PMU_IRQ + 16)
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::VcpuInitPmu,
                ))
            })?;

        arch::configure_system(
            &mem,
            cmdline.as_cstring().unwrap().to_str().unwrap(),
            vcpu_mpidrs,
            vcpu_topology,
            device_info,
            &initramfs_config,
            &pci_space_info,
            virtio_iommu_bdf.map(|bdf| bdf.into()),
            &vgic,
            &self.numa_nodes,
            pmu_supported,
        )
        .map_err(Error::ConfigureSystem)?;

        Ok(())
    }
1470 
    /// riscv64 boot-time configuration: build the command line, collect
    /// vCPU count, device info, PCI space layout and the vAIA interrupt
    /// controller, then call `arch::configure_system`.
    #[cfg(target_arch = "riscv64")]
    fn configure_system(&mut self) -> Result<()> {
        let cmdline = Self::generate_cmdline(
            self.config.lock().unwrap().payload.as_ref().unwrap(),
            &self.device_manager,
        )?;
        let num_vcpu = self.cpu_manager.lock().unwrap().vcpus().len();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
        // Load the initramfs (if configured) so its placement can be passed on.
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        // Describe each PCI segment: config-space address plus its 64-bit
        // device window (start and inclusive size).
        for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
            let pci_space = PciSpaceInfo {
                pci_segment_id: pci_segment.id,
                mmio_config_address: pci_segment.mmio_config_address,
                pci_device_space_start: pci_segment.start_of_mem64_area,
                pci_device_space_size: pci_segment.end_of_mem64_area
                    - pci_segment.start_of_mem64_area
                    + 1,
            };
            pci_space_info.push(pci_space);
        }

        // TODO: IOMMU for riscv64 is not yet support in kernel.

        let vaia = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .get_vaia()
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::riscv64::Error::SetupAia,
                ))
            })?;

        // TODO: PMU support for riscv64 is scheduled to next stage.

        arch::configure_system(
            &mem,
            cmdline.as_cstring().unwrap().to_str().unwrap(),
            num_vcpu as u32,
            device_info,
            &initramfs_config,
            &pci_space_info,
            &vaia,
        )
        .map_err(Error::ConfigureSystem)?;

        Ok(())
    }
1536 
1537     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1538         self.device_manager.lock().unwrap().console_resize_pipe()
1539     }
1540 
1541     pub fn shutdown(&mut self) -> Result<()> {
1542         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1543         let new_state = VmState::Shutdown;
1544 
1545         state.valid_transition(new_state)?;
1546 
1547         // Wake up the DeviceManager threads so they will get terminated cleanly
1548         self.device_manager
1549             .lock()
1550             .unwrap()
1551             .resume()
1552             .map_err(Error::Resume)?;
1553 
1554         self.cpu_manager
1555             .lock()
1556             .unwrap()
1557             .shutdown()
1558             .map_err(Error::CpuManager)?;
1559 
1560         // Wait for all the threads to finish
1561         for thread in self.threads.drain(..) {
1562             thread.join().map_err(Error::ThreadCleanup)?
1563         }
1564         *state = new_state;
1565 
1566         Ok(())
1567     }
1568 
    /// Hot-resize the VM: vCPU count, RAM size and/or balloon target.
    ///
    /// Each `Option` argument is independent; `None` means "leave as is".
    /// The persisted `VmConfig` is updated in all cases so that a
    /// subsequent reboot keeps the new values.
    pub fn resize(
        &mut self,
        desired_vcpus: Option<u8>,
        desired_memory: Option<u64>,
        desired_balloon: Option<u64>,
    ) -> Result<()> {
        event!("vm", "resizing");

        if let Some(desired_vcpus) = desired_vcpus {
            // Notify the guest over ACPI only when CpuManager::resize()
            // reports a change (returns true).
            if self
                .cpu_manager
                .lock()
                .unwrap()
                .resize(desired_vcpus)
                .map_err(Error::CpuManager)?
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
        }

        if let Some(desired_memory) = desired_memory {
            let new_region = self
                .memory_manager
                .lock()
                .unwrap()
                .resize(desired_memory)
                .map_err(Error::MemoryManager)?;

            let memory_config = &mut self.config.lock().unwrap().memory;

            // A new region means memory was actually hotplugged: devices
            // must learn about it, and ACPI guests must be notified.
            if let Some(new_region) = &new_region {
                self.device_manager
                    .lock()
                    .unwrap()
                    .update_memory(new_region)
                    .map_err(Error::DeviceManager)?;

                match memory_config.hotplug_method {
                    HotplugMethod::Acpi => {
                        self.device_manager
                            .lock()
                            .unwrap()
                            .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                            .map_err(Error::DeviceManager)?;
                    }
                    HotplugMethod::VirtioMem => {}
                }
            }

            // We update the VM config regardless of the actual guest resize
            // operation result (happened or not), so that if the VM reboots
            // it will be running with the last configure memory size.
            match memory_config.hotplug_method {
                HotplugMethod::Acpi => memory_config.size = desired_memory,
                HotplugMethod::VirtioMem => {
                    if desired_memory > memory_config.size {
                        memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                    } else {
                        memory_config.hotplugged_size = None;
                    }
                }
            }
        }

        if let Some(desired_balloon) = desired_balloon {
            self.device_manager
                .lock()
                .unwrap()
                .resize_balloon(desired_balloon)
                .map_err(Error::DeviceManager)?;

            // Update the configuration value for the balloon size to ensure
            // a reboot would use the right value.
            if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
                balloon_config.size = desired_balloon;
            }
        }

        event!("vm", "resized");

        Ok(())
    }
1656 
1657     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1658         let memory_config = &mut self.config.lock().unwrap().memory;
1659 
1660         if let Some(zones) = &mut memory_config.zones {
1661             for zone in zones.iter_mut() {
1662                 if zone.id == id {
1663                     if desired_memory >= zone.size {
1664                         let hotplugged_size = desired_memory - zone.size;
1665                         self.memory_manager
1666                             .lock()
1667                             .unwrap()
1668                             .resize_zone(&id, desired_memory - zone.size)
1669                             .map_err(Error::MemoryManager)?;
1670                         // We update the memory zone config regardless of the
1671                         // actual 'resize-zone' operation result (happened or
1672                         // not), so that if the VM reboots it will be running
1673                         // with the last configured memory zone size.
1674                         zone.hotplugged_size = Some(hotplugged_size);
1675 
1676                         return Ok(());
1677                     } else {
1678                         error!(
1679                             "Invalid to ask less ({}) than boot RAM ({}) for \
1680                             this memory zone",
1681                             desired_memory, zone.size,
1682                         );
1683                         return Err(Error::ResizeZone);
1684                     }
1685                 }
1686             }
1687         }
1688 
1689         error!("Could not find the memory zone {} for the resize", id);
1690         Err(Error::ResizeZone)
1691     }
1692 
1693     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1694         let pci_device_info = self
1695             .device_manager
1696             .lock()
1697             .unwrap()
1698             .add_device(&mut device_cfg)
1699             .map_err(Error::DeviceManager)?;
1700 
1701         // Update VmConfig by adding the new device. This is important to
1702         // ensure the device would be created in case of a reboot.
1703         {
1704             let mut config = self.config.lock().unwrap();
1705             add_to_config(&mut config.devices, device_cfg);
1706         }
1707 
1708         self.device_manager
1709             .lock()
1710             .unwrap()
1711             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1712             .map_err(Error::DeviceManager)?;
1713 
1714         Ok(pci_device_info)
1715     }
1716 
1717     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1718         let pci_device_info = self
1719             .device_manager
1720             .lock()
1721             .unwrap()
1722             .add_user_device(&mut device_cfg)
1723             .map_err(Error::DeviceManager)?;
1724 
1725         // Update VmConfig by adding the new device. This is important to
1726         // ensure the device would be created in case of a reboot.
1727         {
1728             let mut config = self.config.lock().unwrap();
1729             add_to_config(&mut config.user_devices, device_cfg);
1730         }
1731 
1732         self.device_manager
1733             .lock()
1734             .unwrap()
1735             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1736             .map_err(Error::DeviceManager)?;
1737 
1738         Ok(pci_device_info)
1739     }
1740 
1741     pub fn remove_device(&mut self, id: String) -> Result<()> {
1742         self.device_manager
1743             .lock()
1744             .unwrap()
1745             .remove_device(id.clone())
1746             .map_err(Error::DeviceManager)?;
1747 
1748         // Update VmConfig by removing the device. This is important to
1749         // ensure the device would not be created in case of a reboot.
1750         self.config.lock().unwrap().remove_device(&id);
1751 
1752         self.device_manager
1753             .lock()
1754             .unwrap()
1755             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1756             .map_err(Error::DeviceManager)?;
1757         Ok(())
1758     }
1759 
1760     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1761         let pci_device_info = self
1762             .device_manager
1763             .lock()
1764             .unwrap()
1765             .add_disk(&mut disk_cfg)
1766             .map_err(Error::DeviceManager)?;
1767 
1768         // Update VmConfig by adding the new device. This is important to
1769         // ensure the device would be created in case of a reboot.
1770         {
1771             let mut config = self.config.lock().unwrap();
1772             add_to_config(&mut config.disks, disk_cfg);
1773         }
1774 
1775         self.device_manager
1776             .lock()
1777             .unwrap()
1778             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1779             .map_err(Error::DeviceManager)?;
1780 
1781         Ok(pci_device_info)
1782     }
1783 
1784     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1785         let pci_device_info = self
1786             .device_manager
1787             .lock()
1788             .unwrap()
1789             .add_fs(&mut fs_cfg)
1790             .map_err(Error::DeviceManager)?;
1791 
1792         // Update VmConfig by adding the new device. This is important to
1793         // ensure the device would be created in case of a reboot.
1794         {
1795             let mut config = self.config.lock().unwrap();
1796             add_to_config(&mut config.fs, fs_cfg);
1797         }
1798 
1799         self.device_manager
1800             .lock()
1801             .unwrap()
1802             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1803             .map_err(Error::DeviceManager)?;
1804 
1805         Ok(pci_device_info)
1806     }
1807 
1808     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1809         let pci_device_info = self
1810             .device_manager
1811             .lock()
1812             .unwrap()
1813             .add_pmem(&mut pmem_cfg)
1814             .map_err(Error::DeviceManager)?;
1815 
1816         // Update VmConfig by adding the new device. This is important to
1817         // ensure the device would be created in case of a reboot.
1818         {
1819             let mut config = self.config.lock().unwrap();
1820             add_to_config(&mut config.pmem, pmem_cfg);
1821         }
1822 
1823         self.device_manager
1824             .lock()
1825             .unwrap()
1826             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1827             .map_err(Error::DeviceManager)?;
1828 
1829         Ok(pci_device_info)
1830     }
1831 
1832     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1833         let pci_device_info = self
1834             .device_manager
1835             .lock()
1836             .unwrap()
1837             .add_net(&mut net_cfg)
1838             .map_err(Error::DeviceManager)?;
1839 
1840         // Update VmConfig by adding the new device. This is important to
1841         // ensure the device would be created in case of a reboot.
1842         {
1843             let mut config = self.config.lock().unwrap();
1844             add_to_config(&mut config.net, net_cfg);
1845         }
1846 
1847         self.device_manager
1848             .lock()
1849             .unwrap()
1850             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1851             .map_err(Error::DeviceManager)?;
1852 
1853         Ok(pci_device_info)
1854     }
1855 
1856     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1857         let pci_device_info = self
1858             .device_manager
1859             .lock()
1860             .unwrap()
1861             .add_vdpa(&mut vdpa_cfg)
1862             .map_err(Error::DeviceManager)?;
1863 
1864         // Update VmConfig by adding the new device. This is important to
1865         // ensure the device would be created in case of a reboot.
1866         {
1867             let mut config = self.config.lock().unwrap();
1868             add_to_config(&mut config.vdpa, vdpa_cfg);
1869         }
1870 
1871         self.device_manager
1872             .lock()
1873             .unwrap()
1874             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1875             .map_err(Error::DeviceManager)?;
1876 
1877         Ok(pci_device_info)
1878     }
1879 
1880     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1881         let pci_device_info = self
1882             .device_manager
1883             .lock()
1884             .unwrap()
1885             .add_vsock(&mut vsock_cfg)
1886             .map_err(Error::DeviceManager)?;
1887 
1888         // Update VmConfig by adding the new device. This is important to
1889         // ensure the device would be created in case of a reboot.
1890         {
1891             let mut config = self.config.lock().unwrap();
1892             config.vsock = Some(vsock_cfg);
1893         }
1894 
1895         self.device_manager
1896             .lock()
1897             .unwrap()
1898             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1899             .map_err(Error::DeviceManager)?;
1900 
1901         Ok(pci_device_info)
1902     }
1903 
1904     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1905         Ok(self.device_manager.lock().unwrap().counters())
1906     }
1907 
1908     #[cfg(feature = "tdx")]
1909     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1910         use arch::x86_64::tdx::*;
1911 
1912         let firmware_path = self
1913             .config
1914             .lock()
1915             .unwrap()
1916             .payload
1917             .as_ref()
1918             .unwrap()
1919             .firmware
1920             .clone()
1921             .ok_or(Error::TdxFirmwareMissing)?;
1922         // The TDVF file contains a table of section as well as code
1923         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1924 
1925         // For all the sections allocate some RAM backing them
1926         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1927     }
1928 
    // Builds the list of memory resources to report in the TD HOB, as
    // (start, size, is_ram) tuples.
    //
    // RAM ranges from `guest_memory` are interleaved with the (non-RAM) TDVF
    // sections. `sorted_sections` is expected in descending address order so
    // that `pop()` yields the lowest-addressed remaining section (the caller
    // sorts ascending and then reverses).
    #[cfg(feature = "tdx")]
    fn hob_memory_resources(
        mut sorted_sections: Vec<TdvfSection>,
        guest_memory: &GuestMemoryMmap,
    ) -> Vec<(u64, u64, bool)> {
        let mut list = Vec::new();

        // Lowest-addressed TDVF section not yet emitted.
        let mut current_section = sorted_sections.pop();

        // RAM regions interleaved with TDVF sections
        let mut next_start_addr = 0;
        for region in guest_memory.iter() {
            let region_start = region.start_addr().0;
            let region_end = region.last_addr().0;
            // Skip any gap between the previous region and this one.
            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            loop {
                let (start, size, ram) = if let Some(section) = &current_section {
                    if section.address <= next_start_addr {
                        // The section comes first: emit it as a non-RAM range.
                        (section.address, section.size, false)
                    } else {
                        // Emit RAM up to the start of the next section, or to
                        // the end of the current region, whichever is lower.
                        let last_addr = std::cmp::min(section.address - 1, region_end);
                        (next_start_addr, last_addr - next_start_addr + 1, true)
                    }
                } else {
                    // No sections left: the rest of the region is plain RAM.
                    (next_start_addr, region_end - next_start_addr + 1, true)
                };

                list.push((start, size, ram));

                // A section was consumed; move on to the next one.
                if !ram {
                    current_section = sorted_sections.pop();
                }

                next_start_addr = start + size;

                if region_start > next_start_addr {
                    next_start_addr = region_start;
                }

                if next_start_addr > region_end {
                    break;
                }
            }
        }

        // Once all the interleaved sections have been processed, let's simply
        // pull the remaining ones.
        if let Some(section) = current_section {
            list.push((section.address, section.size, false));
        }
        while let Some(section) = sorted_sections.pop() {
            list.push((section.address, section.size, false));
        }

        list
    }
1988 
    // Allocates guest RAM for the TDVF sections (when needed), copies the
    // firmware, payload and command line into guest memory, and builds the
    // TD HOB describing memory, MMIO and ACPI resources.
    //
    // Returns the guest address of the HOB (taken from the TdHob section),
    // which the vCPUs need for their TDX initialization.
    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(
        &mut self,
        sections: &[TdvfSection],
        guid_found: bool,
    ) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of sections as well as code
        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut payload_info = None;
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    // Firmware code (BFV) and configuration (CFV) volumes are
                    // copied verbatim from the file into guest memory.
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    mem.read_volatile_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    // Remember where the HOB must be generated; see below.
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    if let Some(payload_file) = self.kernel.as_mut() {
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        // 0x1f1 is the offset of the Linux boot protocol
                        // setup header within a bzImage.
                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_file
                            .read_volatile(&mut payload_header.as_bytes())
                            .unwrap();

                        // 0x5372_6448 is the "HdrS" boot protocol magic:
                        // reject anything that is not a bzImage.
                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Require boot protocol >= 2.00 and the LOADED_HIGH
                        // flag (bit 0 of loadflags).
                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        payload_file.rewind().map_err(Error::LoadPayload)?;
                        mem.read_volatile_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    // The kernel command line, NUL-terminated.
                    info!("Copying payload parameters to guest memory");
                    let cmdline = Self::generate_cmdline(
                        self.config.lock().unwrap().payload.as_ref().unwrap(),
                    )?;
                    mem.write_slice(
                        cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                        GuestAddress(section.address),
                    )
                    .unwrap();
                }
                _ => {}
            }
        }

        // Generate HOB
        let mut hob = TdHob::start(hob_offset.unwrap());

        // Only TempMem sections are reported as memory resources in the HOB.
        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        // Descending order so hob_memory_resources() can pop() the
        // lowest-addressed section first.
        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();

        for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
            hob.add_memory_resource(&mem, start, size, ram, guid_found)
                .map_err(Error::PopulateHob)?;
        }

        // MMIO regions
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.

        for acpi_table in crate::acpi::create_acpi_tables_tdx(
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        ) {
            hob.add_acpi_table(&mem, acpi_table.as_slice())
                .map_err(Error::PopulateHob)?;
        }

        // If a payload info has been created, let's insert it into the HOB.
        if let Some(payload_info) = payload_info {
            hob.add_payload(&mem, payload_info)
                .map_err(Error::PopulateHob)?;
        }

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }
2178 
2179     #[cfg(feature = "tdx")]
2180     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
2181         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2182         let mem = guest_memory.memory();
2183 
2184         for section in sections {
2185             self.vm
2186                 .tdx_init_memory_region(
2187                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
2188                     section.address,
2189                     section.size,
2190                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
2191                     section.attributes == 1,
2192                 )
2193                 .map_err(Error::InitializeTdxMemoryRegion)?;
2194         }
2195 
2196         Ok(())
2197     }
2198 
2199     // Creates ACPI tables
2200     // In case of TDX being used, this is a no-op since the tables will be
2201     // created and passed when populating the HOB.
2202 
2203     #[cfg(not(target_arch = "riscv64"))]
2204     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2205         #[cfg(feature = "tdx")]
2206         if self.config.lock().unwrap().is_tdx_enabled() {
2207             return None;
2208         }
2209         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2210         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2211         let rsdp_addr = crate::acpi::create_acpi_tables(
2212             &mem,
2213             &self.device_manager,
2214             &self.cpu_manager,
2215             &self.memory_manager,
2216             &self.numa_nodes,
2217             tpm_enabled,
2218         );
2219         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2220 
2221         Some(rsdp_addr)
2222     }
2223 
2224     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2225         trace_scoped!("entry_point");
2226 
2227         self.load_payload_handle
2228             .take()
2229             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2230             .transpose()
2231     }
2232 
    // Boots the VM: acquires the disk locks, creates the ACPI tables, waits
    // for the kernel load to complete, configures the vCPUs (and the TDX
    // state when enabled), then starts the boot vCPUs and transitions the
    // state to Running (or BreakPoint when `stop_on_boot` is set).
    //
    // Booting a paused VM is simply a resume.
    pub fn boot(&mut self) -> Result<()> {
        trace_scoped!("Vm::boot");
        let current_state = self.get_state()?;
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        // We acquire all advisory disk image locks here and not on device creation
        // to enable live-migration without locking issues.
        self.device_manager
            .lock()
            .unwrap()
            .try_lock_disks()
            .map_err(Error::LockingError)?;

        // Compute the target state and validate the transition up front.
        let new_state = if self.stop_on_boot {
            VmState::BreakPoint
        } else {
            VmState::Running
        };
        current_state.valid_transition(new_state)?;

        // Do earlier to parallelise with loading kernel
        #[cfg(target_arch = "x86_64")]
        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
                let rsdp_addr = if sev_snp_enabled {
                    // In case of SEV-SNP guest ACPI tables are provided via
                    // IGVM. So skip the creation of ACPI tables and set the
                    // rsdp addr to None.
                    None
                } else {
                    self.create_acpi_tables()
                };
            } else {
                let rsdp_addr = self.create_acpi_tables();
            }
        }

        // Load kernel synchronously or if asynchronous then wait for load to
        // finish.
        let entry_point = self.entry_point()?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        // On aarch64, fetch the GIC redistributor properties needed to
        // configure each vCPU below.
        #[cfg(target_arch = "aarch64")]
        let vgic = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .get_vgic()
            .unwrap();

        #[cfg(target_arch = "aarch64")]
        let redist_addr = vgic.lock().unwrap().device_properties();

        // Configure the vcpus that have been created
        let vcpus = self.cpu_manager.lock().unwrap().vcpus();
        for vcpu in vcpus {
            let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
            let boot_setup = entry_point.map(|e| (e, guest_memory));
            self.cpu_manager
                .lock()
                .unwrap()
                .configure_vcpu(vcpu.clone(), boot_setup)
                .map_err(Error::CpuManager)?;

            #[cfg(target_arch = "aarch64")]
            vcpu.lock()
                .unwrap()
                .set_gic_redistributor_addr(redist_addr[2], redist_addr[3])
                .map_err(Error::CpuManager)?;
        }

        #[cfg(feature = "tdx")]
        let (sections, guid_found) = if tdx_enabled {
            self.extract_tdvf_sections()?
        } else {
            (Vec::new(), false)
        };

        // Configuring the TDX regions requires that the vCPUs are created.
        #[cfg(feature = "tdx")]
        let hob_address = if tdx_enabled {
            // TDX sections are written to memory.
            self.populate_tdx_sections(&sections, guid_found)?
        } else {
            None
        };

        // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
        // available after they are configured
        #[cfg(target_arch = "aarch64")]
        let rsdp_addr = self.create_acpi_tables();

        #[cfg(not(target_arch = "riscv64"))]
        // Configure shared state based on loaded kernel
        entry_point
            .map(|entry_point| {
                // Safe to unwrap rsdp_addr as we know it can't be None when
                // the entry_point is Some.
                self.configure_system(rsdp_addr.unwrap(), entry_point)
            })
            .transpose()?;

        #[cfg(target_arch = "riscv64")]
        self.configure_system().unwrap();

        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            // Let the hypervisor know which memory ranges are shared with the
            // guest. This prevents the guest from ignoring/discarding memory
            // regions provided by the host.
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured TDX setup is complete
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        // Resume the vm for MSHV
        if current_state == VmState::Created {
            self.vm.resume().map_err(Error::ResumeVm)?;
        }

        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus(new_state == VmState::BreakPoint)
            .map_err(Error::CpuManager)?;

        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        Ok(())
    }
2379 
2380     pub fn restore(&mut self) -> Result<()> {
2381         event!("vm", "restoring");
2382 
2383         // We acquire all advisory disk image locks again.
2384         self.device_manager
2385             .lock()
2386             .unwrap()
2387             .try_lock_disks()
2388             .map_err(Error::LockingError)?;
2389 
2390         // Now we can start all vCPUs from here.
2391         self.cpu_manager
2392             .lock()
2393             .unwrap()
2394             .start_restored_vcpus()
2395             .map_err(Error::CpuManager)?;
2396 
2397         event!("vm", "restored");
2398         Ok(())
2399     }
2400 
2401     /// Gets a thread-safe reference counted pointer to the VM configuration.
2402     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2403         Arc::clone(&self.config)
2404     }
2405 
2406     /// Get the VM state. Returns an error if the state is poisoned.
2407     pub fn get_state(&self) -> Result<VmState> {
2408         self.state
2409             .try_read()
2410             .map_err(|_| Error::PoisonedState)
2411             .map(|state| *state)
2412     }
2413 
2414     /// Gets the actual size of the balloon.
2415     pub fn balloon_size(&self) -> u64 {
2416         self.device_manager.lock().unwrap().balloon_size()
2417     }
2418 
    // Sends every guest memory slot file descriptor to the migration
    // destination over `socket`.
    //
    // For each (slot, fd) pair: send a `memory_fd` request sized to the slot
    // id, then send the slot id bytes with the fd attached as ancillary data,
    // and finally wait for the destination's acknowledgment before moving on.
    pub fn send_memory_fds(
        &mut self,
        socket: &mut UnixStream,
    ) -> std::result::Result<(), MigratableError> {
        for (slot, fd) in self
            .memory_manager
            .lock()
            .unwrap()
            .memory_slot_fds()
            .drain()
        {
            Request::memory_fd(std::mem::size_of_val(&slot) as u64)
                .write_to(socket)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
                })?;
            socket
                .send_with_fd(&slot.to_le_bytes()[..], fd)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
                })?;

            // Abandon the migration if the destination did not acknowledge
            // this fd.
            Response::read_from(socket)?.ok_or_abandon(
                socket,
                MigratableError::MigrateSend(anyhow!("Error during memory fd migration")),
            )?;
        }

        Ok(())
    }
2449 
2450     pub fn send_memory_regions<F>(
2451         &mut self,
2452         ranges: &MemoryRangeTable,
2453         fd: &mut F,
2454     ) -> std::result::Result<(), MigratableError>
2455     where
2456         F: WriteVolatile,
2457     {
2458         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2459         let mem = guest_memory.memory();
2460 
2461         for range in ranges.regions() {
2462             let mut offset: u64 = 0;
2463             // Here we are manually handling the retry in case we can't the
2464             // whole region at once because we can't use the implementation
2465             // from vm-memory::GuestMemory of write_all_to() as it is not
2466             // following the correct behavior. For more info about this issue
2467             // see: https://github.com/rust-vmm/vm-memory/issues/174
2468             loop {
2469                 let bytes_written = mem
2470                     .write_volatile_to(
2471                         GuestAddress(range.gpa + offset),
2472                         fd,
2473                         (range.length - offset) as usize,
2474                     )
2475                     .map_err(|e| {
2476                         MigratableError::MigrateSend(anyhow!(
2477                             "Error transferring memory to socket: {}",
2478                             e
2479                         ))
2480                     })?;
2481                 offset += bytes_written as u64;
2482 
2483                 if offset == range.length {
2484                     break;
2485                 }
2486             }
2487         }
2488 
2489         Ok(())
2490     }
2491 
2492     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2493         self.memory_manager
2494             .lock()
2495             .unwrap()
2496             .memory_range_table(false)
2497     }
2498 
2499     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2500         self.device_manager.lock().unwrap().device_tree()
2501     }
2502 
2503     /// Release all advisory locks held for the disk images.
2504     ///
2505     /// This should only be called when the VM is stopped and the VMM supposed
2506     /// to shut down. A new VMM, either after a live migration or a
2507     /// state save/resume cycle, should then acquire all locks before the VM
2508     /// starts to run.
2509     pub fn release_disk_locks(&self) -> Result<()> {
2510         self.device_manager
2511             .lock()
2512             .unwrap()
2513             .release_disk_locks()
2514             .map_err(Error::LockingError)?;
2515         Ok(())
2516     }
2517 
2518     pub fn activate_virtio_devices(&self) -> Result<()> {
2519         self.device_manager
2520             .lock()
2521             .unwrap()
2522             .activate_virtio_devices()
2523             .map_err(Error::ActivateVirtioDevices)
2524     }
2525 
2526     #[cfg(target_arch = "x86_64")]
2527     pub fn power_button(&self) -> Result<()> {
2528         return self
2529             .device_manager
2530             .lock()
2531             .unwrap()
2532             .notify_power_button()
2533             .map_err(Error::PowerButton);
2534     }
2535 
2536     #[cfg(target_arch = "aarch64")]
2537     pub fn power_button(&self) -> Result<()> {
2538         self.device_manager
2539             .lock()
2540             .unwrap()
2541             .notify_power_button()
2542             .map_err(Error::PowerButton)
2543     }
2544 
    // Power-button notification is not implemented for riscv64; calling
    // this panics via unimplemented!().
    #[cfg(target_arch = "riscv64")]
    pub fn power_button(&self) -> Result<()> {
        unimplemented!()
    }
2549 
2550     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2551         self.memory_manager.lock().unwrap().snapshot_data()
2552     }
2553 
2554     #[cfg(feature = "guest_debug")]
2555     pub fn debug_request(
2556         &mut self,
2557         gdb_request: &GdbRequestPayload,
2558         cpu_id: usize,
2559     ) -> Result<GdbResponsePayload> {
2560         use GdbRequestPayload::*;
2561         match gdb_request {
2562             SetSingleStep(single_step) => {
2563                 self.set_guest_debug(cpu_id, &[], *single_step)
2564                     .map_err(Error::Debug)?;
2565             }
2566             SetHwBreakPoint(addrs) => {
2567                 self.set_guest_debug(cpu_id, addrs, false)
2568                     .map_err(Error::Debug)?;
2569             }
2570             Pause => {
2571                 self.debug_pause().map_err(Error::Debug)?;
2572             }
2573             Resume => {
2574                 self.debug_resume().map_err(Error::Debug)?;
2575             }
2576             ReadRegs => {
2577                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2578                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2579             }
2580             WriteRegs(regs) => {
2581                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2582             }
2583             ReadMem(vaddr, len) => {
2584                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2585                 let mem = self
2586                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2587                     .map_err(Error::Debug)?;
2588                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2589             }
2590             WriteMem(vaddr, data) => {
2591                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2592                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2593                     .map_err(Error::Debug)?;
2594             }
2595             ActiveVcpus => {
2596                 let active_vcpus = self.active_vcpus();
2597                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2598             }
2599         }
2600         Ok(GdbResponsePayload::CommandComplete)
2601     }
2602 
2603     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2604     fn get_dump_state(
2605         &mut self,
2606         destination_url: &str,
2607     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2608         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2609         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2610         let mut elf_phdr_num = 1;
2611         let elf_sh_info = 0;
2612         let coredump_file_path = url_to_file(destination_url)?;
2613         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2614 
2615         if mapping_num < UINT16_MAX - 2 {
2616             elf_phdr_num += mapping_num as u16;
2617         } else {
2618             panic!("mapping num beyond 65535 not supported");
2619         }
2620         let coredump_file = OpenOptions::new()
2621             .read(true)
2622             .write(true)
2623             .create_new(true)
2624             .open(coredump_file_path)
2625             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2626 
2627         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2628         let mem_data = self
2629             .memory_manager
2630             .lock()
2631             .unwrap()
2632             .coredump_memory_regions(mem_offset);
2633 
2634         Ok(DumpState {
2635             elf_note_size,
2636             elf_phdr_num,
2637             elf_sh_info,
2638             mem_offset,
2639             mem_info: Some(mem_data),
2640             file: Some(coredump_file),
2641         })
2642     }
2643 
2644     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2645     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2646         size_of::<elf::Elf64_Ehdr>() as u64
2647             + note_size as u64
2648             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2649     }
2650 
2651     pub fn nmi(&self) -> Result<()> {
2652         return self
2653             .cpu_manager
2654             .lock()
2655             .unwrap()
2656             .nmi()
2657             .map_err(|_| Error::ErrorNmi);
2658     }
2659 }
2660 
impl Pausable for Vm {
    /// Transitions the VM to `Paused`.
    ///
    /// Sequencing: validate the state transition, save the guest clock
    /// (x86_64), activate pending virtio devices, pause the vCPUs, then the
    /// devices, then the hypervisor VM, and only then commit the new state.
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "pausing");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        // Reject pausing from states where it is not a valid transition.
        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        #[cfg(target_arch = "x86_64")]
        {
            // Save the guest clock now so resume() can restore it later.
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
            clock.reset_flags();
            self.saved_clock = Some(clock);
        }

        // Before pausing the vCPUs activate any pending virtio devices that might
        // need activation between starting the pause (or e.g. a migration it's part of)
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        // vCPUs are paused before the devices.
        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        self.vm
            .pause()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?;

        // Commit the state change only after every step above succeeded.
        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    /// Transitions the VM back to `Running`.
    ///
    /// Mirrors `pause()`: vCPUs are resumed first, the saved clock (x86_64)
    /// is restored, then the hypervisor VM (only if we were `Paused`) and
    /// finally the devices.
    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        // Remember the pre-transition state: the hypervisor VM itself is
        // only resumed below when we were actually in `Paused`.
        let current_state = self.get_state().unwrap();
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;
        #[cfg(target_arch = "x86_64")]
        {
            // Restore the clock captured by pause(), if any.
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }

        if current_state == VmState::Paused {
            self.vm
                .resume()
                .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?;
        }

        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
        *state = new_state;
        event!("vm", "resumed");
        Ok(())
    }
}
2740 
/// VM-level state stored in a snapshot, alongside the per-component
/// (CPU/memory/device manager) child snapshots.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    /// Guest clock captured when the VM was paused (see `Pausable::pause`),
    /// if any.
    #[cfg(target_arch = "x86_64")]
    pub clock: Option<hypervisor::ClockData>,
    /// CPUID entries common to all vCPUs, regenerated at snapshot time.
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}
2748 
2749 pub const VM_SNAPSHOT_ID: &str = "vm";
2750 impl Snapshottable for Vm {
2751     fn id(&self) -> String {
2752         VM_SNAPSHOT_ID.to_string()
2753     }
2754 
2755     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2756         event!("vm", "snapshotting");
2757 
2758         #[cfg(feature = "tdx")]
2759         {
2760             if self.config.lock().unwrap().is_tdx_enabled() {
2761                 return Err(MigratableError::Snapshot(anyhow!(
2762                     "Snapshot not possible with TDX VM"
2763                 )));
2764             }
2765         }
2766 
2767         let current_state = self.get_state().unwrap();
2768         if current_state != VmState::Paused {
2769             return Err(MigratableError::Snapshot(anyhow!(
2770                 "Trying to snapshot while VM is running"
2771             )));
2772         }
2773 
2774         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2775         let common_cpuid = {
2776             let amx = self.config.lock().unwrap().cpus.features.amx;
2777             let phys_bits = physical_bits(
2778                 &self.hypervisor,
2779                 self.config.lock().unwrap().cpus.max_phys_bits,
2780             );
2781             arch::generate_common_cpuid(
2782                 &self.hypervisor,
2783                 &arch::CpuidConfig {
2784                     sgx_epc_sections: None,
2785                     phys_bits,
2786                     kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2787                     #[cfg(feature = "tdx")]
2788                     tdx: false,
2789                     amx,
2790                 },
2791             )
2792             .map_err(|e| {
2793                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2794             })?
2795         };
2796 
2797         let vm_snapshot_state = VmSnapshot {
2798             #[cfg(target_arch = "x86_64")]
2799             clock: self.saved_clock,
2800             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2801             common_cpuid,
2802         };
2803 
2804         let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?;
2805 
2806         let (id, snapshot) = {
2807             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2808             (cpu_manager.id(), cpu_manager.snapshot()?)
2809         };
2810         vm_snapshot.add_snapshot(id, snapshot);
2811         let (id, snapshot) = {
2812             let mut memory_manager = self.memory_manager.lock().unwrap();
2813             (memory_manager.id(), memory_manager.snapshot()?)
2814         };
2815         vm_snapshot.add_snapshot(id, snapshot);
2816         let (id, snapshot) = {
2817             let mut device_manager = self.device_manager.lock().unwrap();
2818             (device_manager.id(), device_manager.snapshot()?)
2819         };
2820         vm_snapshot.add_snapshot(id, snapshot);
2821 
2822         event!("vm", "snapshotted");
2823         Ok(vm_snapshot)
2824     }
2825 }
2826 
2827 impl Transportable for Vm {
2828     fn send(
2829         &self,
2830         snapshot: &Snapshot,
2831         destination_url: &str,
2832     ) -> std::result::Result<(), MigratableError> {
2833         let mut snapshot_config_path = url_to_path(destination_url)?;
2834         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2835 
2836         // Create the snapshot config file
2837         let mut snapshot_config_file = OpenOptions::new()
2838             .read(true)
2839             .write(true)
2840             .create_new(true)
2841             .open(snapshot_config_path)
2842             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2843 
2844         // Serialize and write the snapshot config
2845         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2846             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2847 
2848         snapshot_config_file
2849             .write(vm_config.as_bytes())
2850             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2851 
2852         let mut snapshot_state_path = url_to_path(destination_url)?;
2853         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2854 
2855         // Create the snapshot state file
2856         let mut snapshot_state_file = OpenOptions::new()
2857             .read(true)
2858             .write(true)
2859             .create_new(true)
2860             .open(snapshot_state_path)
2861             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2862 
2863         // Serialize and write the snapshot state
2864         let vm_state =
2865             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2866 
2867         snapshot_state_file
2868             .write(&vm_state)
2869             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2870 
2871         // Tell the memory manager to also send/write its own snapshot.
2872         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2873             self.memory_manager
2874                 .lock()
2875                 .unwrap()
2876                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2877         } else {
2878             return Err(MigratableError::Restore(anyhow!(
2879                 "Missing memory manager snapshot"
2880             )));
2881         }
2882 
2883         Ok(())
2884     }
2885 }
2886 
2887 impl Migratable for Vm {
2888     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2889         self.memory_manager.lock().unwrap().start_dirty_log()?;
2890         self.device_manager.lock().unwrap().start_dirty_log()
2891     }
2892 
2893     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2894         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2895         self.device_manager.lock().unwrap().stop_dirty_log()
2896     }
2897 
2898     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2899         Ok(MemoryRangeTable::new_from_tables(vec![
2900             self.memory_manager.lock().unwrap().dirty_log()?,
2901             self.device_manager.lock().unwrap().dirty_log()?,
2902         ]))
2903     }
2904 
2905     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2906         self.memory_manager.lock().unwrap().start_migration()?;
2907         self.device_manager.lock().unwrap().start_migration()
2908     }
2909 
2910     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2911         self.memory_manager.lock().unwrap().complete_migration()?;
2912         self.device_manager.lock().unwrap().complete_migration()
2913     }
2914 }
2915 
#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Configures hardware breakpoints and single-stepping for `cpu_id`.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses a running VM and marks it as stopped at a breakpoint.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes a VM previously stopped at a breakpoint.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            // NOTE(review): the error is wrapped as `DebuggableError::Pause`
            // on this resume path — confirm whether a dedicated Resume
            // variant exists and should be used instead.
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    /// Reads the core registers of `cpu_id`.
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    /// Writes the core registers of `cpu_id`.
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    /// Reads `len` bytes of guest memory at `vaddr` on behalf of `cpu_id`.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    /// Writes `data` to guest memory at `vaddr` on behalf of `cpu_id`.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Number of vCPUs to report to the debugger: the active count once the
    /// VM has booted, otherwise the configured boot vCPU count.
    fn active_vcpus(&self) -> usize {
        // Take the CPU manager lock once for both queries instead of
        // locking it twice.
        let cpu_manager = self.cpu_manager.lock().unwrap();
        let active_vcpus = cpu_manager.active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            cpu_manager.boot_vcpus() as usize
        }
    }
}
2999 
/// Upper bound (as a `u32`, for comparisons against wider counters) for
/// values that must fit in 16-bit ELF header fields; derived from
/// `u16::MAX` instead of a magic literal.
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = u16::MAX as u32;
3002 
// Marker impl pulling in `Elf64Writable`'s provided ELF64 writing methods
// for `Vm` — presumably `write_header`/`write_note`/`write_loads` used by
// `coredump()`; confirm against the trait definition.
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}
3005 
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes an ELF core dump of the guest to `destination_url`.
    ///
    /// A running VM is paused for the duration of the dump and resumed
    /// afterwards; an already-paused VM is dumped as-is. Not supported for
    /// TDX guests.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        let mut resume = false;

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        match self.get_state().unwrap() {
            VmState::Running => {
                // Pause to get a consistent dump; remember to resume below.
                self.pause().map_err(GuestDebuggableError::Pause)?;
                resume = true;
            }
            VmState::Paused => {}
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        }

        let coredump_state = self.get_dump_state(destination_url)?;

        // Dump layout: ELF header, then the note region, then the loads.
        self.write_header(&coredump_state)?;
        self.write_note(&coredump_state)?;
        self.write_loads(&coredump_state)?;

        // Per-CPU ELF note and VMM-specific note contents.
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_elf64_note(&coredump_state)?;
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_vmm_note(&coredump_state)?;

        // Finally, dump the guest RAM contents.
        self.memory_manager
            .lock()
            .unwrap()
            .coredump_iterate_save_mem(&coredump_state)?;

        if resume {
            self.resume().map_err(GuestDebuggableError::Resume)?;
        }

        Ok(())
    }
}
3064 
3065 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
3066 #[cfg(test)]
3067 mod tests {
3068     use super::*;
3069 
3070     fn test_vm_state_transitions(state: VmState) {
3071         match state {
3072             VmState::Created => {
3073                 // Check the transitions from Created
3074                 state.valid_transition(VmState::Created).unwrap_err();
3075                 state.valid_transition(VmState::Running).unwrap();
3076                 state.valid_transition(VmState::Shutdown).unwrap();
3077                 state.valid_transition(VmState::Paused).unwrap();
3078                 state.valid_transition(VmState::BreakPoint).unwrap();
3079             }
3080             VmState::Running => {
3081                 // Check the transitions from Running
3082                 state.valid_transition(VmState::Created).unwrap_err();
3083                 state.valid_transition(VmState::Running).unwrap_err();
3084                 state.valid_transition(VmState::Shutdown).unwrap();
3085                 state.valid_transition(VmState::Paused).unwrap();
3086                 state.valid_transition(VmState::BreakPoint).unwrap();
3087             }
3088             VmState::Shutdown => {
3089                 // Check the transitions from Shutdown
3090                 state.valid_transition(VmState::Created).unwrap_err();
3091                 state.valid_transition(VmState::Running).unwrap();
3092                 state.valid_transition(VmState::Shutdown).unwrap_err();
3093                 state.valid_transition(VmState::Paused).unwrap_err();
3094                 state.valid_transition(VmState::BreakPoint).unwrap_err();
3095             }
3096             VmState::Paused => {
3097                 // Check the transitions from Paused
3098                 state.valid_transition(VmState::Created).unwrap_err();
3099                 state.valid_transition(VmState::Running).unwrap();
3100                 state.valid_transition(VmState::Shutdown).unwrap();
3101                 state.valid_transition(VmState::Paused).unwrap_err();
3102                 state.valid_transition(VmState::BreakPoint).unwrap_err();
3103             }
3104             VmState::BreakPoint => {
3105                 // Check the transitions from Breakpoint
3106                 state.valid_transition(VmState::Created).unwrap();
3107                 state.valid_transition(VmState::Running).unwrap();
3108                 state.valid_transition(VmState::Shutdown).unwrap_err();
3109                 state.valid_transition(VmState::Paused).unwrap_err();
3110                 state.valid_transition(VmState::BreakPoint).unwrap_err();
3111             }
3112         }
3113     }
3114 
    // Each test below runs the exhaustive transition check from one
    // specific starting state.
    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }
3134 
3135     #[cfg(feature = "tdx")]
3136     #[test]
3137     fn test_hob_memory_resources() {
3138         // Case 1: Two TDVF sections in the middle of the RAM
3139         let sections = vec![
3140             TdvfSection {
3141                 address: 0xc000,
3142                 size: 0x1000,
3143                 ..Default::default()
3144             },
3145             TdvfSection {
3146                 address: 0x1000,
3147                 size: 0x4000,
3148                 ..Default::default()
3149             },
3150         ];
3151         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
3152         let expected = vec![
3153             (0, 0x1000, true),
3154             (0x1000, 0x4000, false),
3155             (0x5000, 0x7000, true),
3156             (0xc000, 0x1000, false),
3157             (0xd000, 0x0fff_3000, true),
3158         ];
3159         assert_eq!(
3160             expected,
3161             Vm::hob_memory_resources(
3162                 sections,
3163                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3164             )
3165         );
3166 
3167         // Case 2: Two TDVF sections with no conflict with the RAM
3168         let sections = vec![
3169             TdvfSection {
3170                 address: 0x1000_1000,
3171                 size: 0x1000,
3172                 ..Default::default()
3173             },
3174             TdvfSection {
3175                 address: 0,
3176                 size: 0x1000,
3177                 ..Default::default()
3178             },
3179         ];
3180         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3181         let expected = vec![
3182             (0, 0x1000, false),
3183             (0x1000, 0x1000_0000, true),
3184             (0x1000_1000, 0x1000, false),
3185         ];
3186         assert_eq!(
3187             expected,
3188             Vm::hob_memory_resources(
3189                 sections,
3190                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3191             )
3192         );
3193 
3194         // Case 3: Two TDVF sections with partial conflicts with the RAM
3195         let sections = vec![
3196             TdvfSection {
3197                 address: 0x1000_0000,
3198                 size: 0x2000,
3199                 ..Default::default()
3200             },
3201             TdvfSection {
3202                 address: 0,
3203                 size: 0x2000,
3204                 ..Default::default()
3205             },
3206         ];
3207         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3208         let expected = vec![
3209             (0, 0x2000, false),
3210             (0x2000, 0x0fff_e000, true),
3211             (0x1000_0000, 0x2000, false),
3212         ];
3213         assert_eq!(
3214             expected,
3215             Vm::hob_memory_resources(
3216                 sections,
3217                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3218             )
3219         );
3220 
3221         // Case 4: Two TDVF sections with no conflict before the RAM and two
3222         // more additional sections with no conflict after the RAM.
3223         let sections = vec![
3224             TdvfSection {
3225                 address: 0x2000_1000,
3226                 size: 0x1000,
3227                 ..Default::default()
3228             },
3229             TdvfSection {
3230                 address: 0x2000_0000,
3231                 size: 0x1000,
3232                 ..Default::default()
3233             },
3234             TdvfSection {
3235                 address: 0x1000,
3236                 size: 0x1000,
3237                 ..Default::default()
3238             },
3239             TdvfSection {
3240                 address: 0,
3241                 size: 0x1000,
3242                 ..Default::default()
3243             },
3244         ];
3245         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
3246         let expected = vec![
3247             (0, 0x1000, false),
3248             (0x1000, 0x1000, false),
3249             (0x4000, 0x1000_0000, true),
3250             (0x2000_0000, 0x1000, false),
3251             (0x2000_1000, 0x1000, false),
3252         ];
3253         assert_eq!(
3254             expected,
3255             Vm::hob_memory_resources(
3256                 sections,
3257                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3258             )
3259         );
3260 
3261         // Case 5: One TDVF section overriding the entire RAM
3262         let sections = vec![TdvfSection {
3263             address: 0,
3264             size: 0x2000_0000,
3265             ..Default::default()
3266         }];
3267         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3268         let expected = vec![(0, 0x2000_0000, false)];
3269         assert_eq!(
3270             expected,
3271             Vm::hob_memory_resources(
3272                 sections,
3273                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3274             )
3275         );
3276 
3277         // Case 6: Two TDVF sections with no conflict with 2 RAM regions
3278         let sections = vec![
3279             TdvfSection {
3280                 address: 0x1000_2000,
3281                 size: 0x2000,
3282                 ..Default::default()
3283             },
3284             TdvfSection {
3285                 address: 0,
3286                 size: 0x2000,
3287                 ..Default::default()
3288             },
3289         ];
3290         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3291             (GuestAddress(0x2000), 0x1000_0000),
3292             (GuestAddress(0x1000_4000), 0x1000_0000),
3293         ];
3294         let expected = vec![
3295             (0, 0x2000, false),
3296             (0x2000, 0x1000_0000, true),
3297             (0x1000_2000, 0x2000, false),
3298             (0x1000_4000, 0x1000_0000, true),
3299         ];
3300         assert_eq!(
3301             expected,
3302             Vm::hob_memory_resources(
3303                 sections,
3304                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3305             )
3306         );
3307 
3308         // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
3309         let sections = vec![
3310             TdvfSection {
3311                 address: 0x1000_0000,
3312                 size: 0x4000,
3313                 ..Default::default()
3314             },
3315             TdvfSection {
3316                 address: 0,
3317                 size: 0x4000,
3318                 ..Default::default()
3319             },
3320         ];
3321         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3322             (GuestAddress(0x1000), 0x1000_0000),
3323             (GuestAddress(0x1000_3000), 0x1000_0000),
3324         ];
3325         let expected = vec![
3326             (0, 0x4000, false),
3327             (0x4000, 0x0fff_c000, true),
3328             (0x1000_0000, 0x4000, false),
3329             (0x1000_4000, 0x0fff_f000, true),
3330         ];
3331         assert_eq!(
3332             expected,
3333             Vm::hob_memory_resources(
3334                 sections,
3335                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3336             )
3337         );
3338     }
3339 }
3340 
#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    use super::*;

    const LEN: u64 = 4096;

    // Smoke test: building an FDT that advertises a serial port, a virtio
    // device and an RTC must succeed on a minimal guest memory layout.
    #[test]
    fn test_create_fdt_with_devices() {
        // One RAM region, just large enough to hold the FDT blob.
        let ram = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let guest_mem = GuestMemoryMmap::from_ranges(&ram).expect("Cannot initialize memory");

        // Three MMIO devices placed back to back, each LEN bytes wide,
        // with distinct SPI interrupt numbers.
        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> =
            HashMap::from([
                (
                    (DeviceType::Serial, DeviceType::Serial.to_string()),
                    MmioDeviceInfo {
                        addr: 0x00,
                        len: LEN,
                        irq: 33,
                    },
                ),
                (
                    (DeviceType::Virtio(1), "virtio".to_string()),
                    MmioDeviceInfo {
                        addr: LEN,
                        len: LEN,
                        irq: 34,
                    },
                ),
                (
                    (DeviceType::Rtc, "rtc".to_string()),
                    MmioDeviceInfo {
                        addr: 2 * LEN,
                        len: LEN,
                        irq: 35,
                    },
                ),
            ]);

        // A vGIC is mandatory input for the aarch64 FDT builder.
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");

        create_fdt(
            &guest_mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .unwrap();
    }
}
3409 
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};

    // Minimal real-mode guest adapted from https://lwn.net/Articles/658511/:
    // it writes a digit followed by a newline to port 0x3f8 and then halts.
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    // A single 4 KiB region mapped at guest physical address 0x1000.
    let load_addr = GuestAddress(0x1000);
    let guest_mem = GuestMemoryMmap::from_ranges(&[(load_addr, 0x1000)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor, one slot each.
    for (slot, region) in guest_mem.iter().enumerate() {
        let user_region = vm.make_user_memory_region(
            slot as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(user_region)
            .expect("Cannot configure guest memory");
    }

    guest_mem
        .write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Flat real-mode code segment: base and selector both zero.
    let mut sregs = vcpu.get_sregs().expect("get sregs failed");
    sregs.cs.base = 0;
    sregs.cs.selector = 0;
    vcpu.set_sregs(&sregs).expect("set sregs failed");

    // Start executing at the load address with rax=2, rbx=3; bit 1 of
    // rflags must always be set on x86.
    let mut regs = vcpu.get_regs().expect("get regs failed");
    regs.set_rip(load_addr.raw_value());
    regs.set_rax(2);
    regs.set_rbx(3);
    regs.set_rflags(2);
    vcpu.set_regs(&regs).expect("set regs failed");

    // Run until the guest executes hlt (surfaced here as VmExit::Reset).
    loop {
        match vcpu.run().expect("run failed") {
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            VmExit::Ignore => {}
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}
3474