xref: /cloud-hypervisor/vmm/src/vm.rs (revision b686a5bb24f949e3b201308d69b01e85c14f1ad6)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use std::collections::{BTreeMap, HashMap};
15 use std::fs::{File, OpenOptions};
16 use std::io::{self, Seek, SeekFrom, Write};
17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
18 use std::mem::size_of;
19 use std::num::Wrapping;
20 use std::ops::Deref;
21 use std::os::unix::net::UnixStream;
22 use std::sync::{Arc, Mutex, RwLock};
23 #[cfg(not(target_arch = "riscv64"))]
24 use std::time::Instant;
25 use std::{cmp, result, str, thread};
26 
27 use anyhow::anyhow;
28 #[cfg(target_arch = "x86_64")]
29 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
30 #[cfg(feature = "tdx")]
31 use arch::x86_64::tdx::TdvfSection;
32 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
33 use arch::PciSpaceInfo;
34 use arch::{get_host_cpu_phys_bits, EntryPoint, NumaNode, NumaNodes};
35 #[cfg(target_arch = "aarch64")]
36 use devices::interrupt_controller;
37 use devices::AcpiNotificationFlags;
38 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
39 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
40 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
41 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
42 use hypervisor::{HypervisorVmError, VmOps};
43 use libc::{termios, SIGWINCH};
44 use linux_loader::cmdline::Cmdline;
45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
46 use linux_loader::elf;
47 #[cfg(target_arch = "x86_64")]
48 use linux_loader::loader::bzimage::BzImage;
49 #[cfg(target_arch = "x86_64")]
50 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
51 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
52 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
53 use linux_loader::loader::KernelLoader;
54 use seccompiler::SeccompAction;
55 use serde::{Deserialize, Serialize};
56 use thiserror::Error;
57 use tracer::trace_scoped;
58 use vm_device::Bus;
59 #[cfg(feature = "tdx")]
60 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
61 use vm_memory::{
62     Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
63 };
64 use vm_migration::protocol::{MemoryRangeTable, Request, Response};
65 use vm_migration::{
66     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable,
67 };
68 use vmm_sys_util::eventfd::EventFd;
69 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
70 
71 use crate::config::{add_to_config, ValidationError};
72 use crate::console_devices::{ConsoleDeviceError, ConsoleInfo};
73 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
74 use crate::coredump::{
75     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
76 };
77 use crate::device_manager::{DeviceManager, DeviceManagerError};
78 use crate::device_tree::DeviceTree;
79 #[cfg(feature = "guest_debug")]
80 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
81 #[cfg(feature = "igvm")]
82 use crate::igvm::igvm_loader;
83 use crate::landlock::LandlockError;
84 use crate::memory_manager::{
85     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
86 };
87 #[cfg(target_arch = "x86_64")]
88 use crate::migration::get_vm_snapshot;
89 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
90 use crate::migration::url_to_file;
91 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
92 use crate::vm_config::{
93     DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, NumaConfig, PayloadConfig,
94     PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig,
95 };
96 use crate::{
97     cpu, GuestMemoryMmap, PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID,
98     MEMORY_MANAGER_SNAPSHOT_ID,
99 };
100 
101 /// Errors associated with VM management
102 #[derive(Debug, Error)]
103 pub enum Error {
104     #[error("Cannot open kernel file: {0}")]
105     KernelFile(#[source] io::Error),
106 
107     #[error("Cannot open initramfs file: {0}")]
108     InitramfsFile(#[source] io::Error),
109 
110     #[error("Cannot load the kernel into memory: {0}")]
111     KernelLoad(#[source] linux_loader::loader::Error),
112 
113     #[cfg(target_arch = "aarch64")]
114     #[error("Cannot load the UEFI binary in memory: {0:?}")]
115     UefiLoad(arch::aarch64::uefi::Error),
116 
117     #[error("Cannot load the initramfs into memory")]
118     InitramfsLoad,
119 
120     #[error("Cannot load the kernel command line in memory: {0}")]
121     LoadCmdLine(#[source] linux_loader::loader::Error),
122 
123     #[error("Failed to apply landlock config during vm_create: {0}")]
124     ApplyLandlock(#[source] LandlockError),
125 
126     #[error("Cannot modify the kernel command line: {0}")]
127     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
128 
129     #[error("Cannot create the kernel command line: {0}")]
130     CmdLineCreate(#[source] linux_loader::cmdline::Error),
131 
132     #[error("Cannot configure system: {0}")]
133     ConfigureSystem(#[source] arch::Error),
134 
135     #[cfg(target_arch = "aarch64")]
136     #[error("Cannot enable interrupt controller: {0:?}")]
137     EnableInterruptController(interrupt_controller::Error),
138 
139     #[error("VM state is poisoned")]
140     PoisonedState,
141 
142     #[error("Error from device manager: {0:?}")]
143     DeviceManager(DeviceManagerError),
144 
145     #[error("No device with id {0:?} to remove")]
146     NoDeviceToRemove(String),
147 
148     #[error("Cannot spawn a signal handler thread: {0}")]
149     SignalHandlerSpawn(#[source] io::Error),
150 
151     #[error("Failed to join on threads: {0:?}")]
152     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
153 
154     #[error("VM config is missing")]
155     VmMissingConfig,
156 
157     #[error("VM is not created")]
158     VmNotCreated,
159 
160     #[error("VM is already created")]
161     VmAlreadyCreated,
162 
163     #[error("VM is not running")]
164     VmNotRunning,
165 
166     #[error("Cannot clone EventFd: {0}")]
167     EventFdClone(#[source] io::Error),
168 
169     #[error("invalid VM state transition: {0:?} to {1:?}")]
170     InvalidStateTransition(VmState, VmState),
171 
172     #[error("Error from CPU manager: {0}")]
173     CpuManager(#[source] cpu::Error),
174 
175     #[error("Cannot pause devices: {0}")]
176     PauseDevices(#[source] MigratableError),
177 
178     #[error("Cannot resume devices: {0}")]
179     ResumeDevices(#[source] MigratableError),
180 
181     #[error("Cannot pause CPUs: {0}")]
182     PauseCpus(#[source] MigratableError),
183 
184     #[error("Cannot resume cpus: {0}")]
185     ResumeCpus(#[source] MigratableError),
186 
187     #[error("Cannot pause VM: {0}")]
188     Pause(#[source] MigratableError),
189 
190     #[error("Cannot resume VM: {0}")]
191     Resume(#[source] MigratableError),
192 
193     #[error("Memory manager error: {0:?}")]
194     MemoryManager(MemoryManagerError),
195 
196     #[error("Eventfd write error: {0}")]
197     EventfdError(#[source] std::io::Error),
198 
199     #[error("Cannot snapshot VM: {0}")]
200     Snapshot(#[source] MigratableError),
201 
202     #[error("Cannot restore VM: {0}")]
203     Restore(#[source] MigratableError),
204 
205     #[error("Cannot send VM snapshot: {0}")]
206     SnapshotSend(#[source] MigratableError),
207 
208     #[error("Invalid restore source URL")]
209     InvalidRestoreSourceUrl,
210 
211     #[error("Failed to validate config: {0}")]
212     ConfigValidation(#[source] ValidationError),
213 
214     #[error("Too many virtio-vsock devices")]
215     TooManyVsockDevices,
216 
217     #[error("Failed serializing into JSON: {0}")]
218     SerializeJson(#[source] serde_json::Error),
219 
220     #[error("Invalid NUMA configuration")]
221     InvalidNumaConfig,
222 
223     #[error("Cannot create seccomp filter: {0}")]
224     CreateSeccompFilter(#[source] seccompiler::Error),
225 
226     #[error("Cannot apply seccomp filter: {0}")]
227     ApplySeccompFilter(#[source] seccompiler::Error),
228 
229     #[error("Failed resizing a memory zone")]
230     ResizeZone,
231 
232     #[error("Cannot activate virtio devices: {0:?}")]
233     ActivateVirtioDevices(DeviceManagerError),
234 
235     #[error("Error triggering power button: {0:?}")]
236     PowerButton(DeviceManagerError),
237 
238     #[error("Kernel lacks PVH header")]
239     KernelMissingPvhHeader,
240 
241     #[error("Failed to allocate firmware RAM: {0:?}")]
242     AllocateFirmwareMemory(MemoryManagerError),
243 
244     #[error("Error manipulating firmware file: {0}")]
245     FirmwareFile(#[source] std::io::Error),
246 
247     #[error("Firmware too big")]
248     FirmwareTooLarge,
249 
250     #[error("Failed to copy firmware to memory: {0}")]
251     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
252 
253     #[cfg(feature = "sev_snp")]
254     #[error("Error enabling SEV-SNP VM: {0}")]
255     InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),
256 
257     #[cfg(feature = "tdx")]
258     #[error("Error performing I/O on TDX firmware file: {0}")]
259     LoadTdvf(#[source] std::io::Error),
260 
261     #[cfg(feature = "tdx")]
262     #[error("Error performing I/O on the TDX payload file: {0}")]
263     LoadPayload(#[source] std::io::Error),
264 
265     #[cfg(feature = "tdx")]
266     #[error("Error parsing TDVF: {0}")]
267     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
268 
269     #[cfg(feature = "tdx")]
270     #[error("Error populating TDX HOB: {0}")]
271     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
272 
273     #[cfg(feature = "tdx")]
274     #[error("Error allocating TDVF memory: {0:?}")]
275     AllocatingTdvfMemory(crate::memory_manager::Error),
276 
277     #[cfg(feature = "tdx")]
278     #[error("Error enabling TDX VM: {0}")]
279     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
280 
281     #[cfg(feature = "tdx")]
282     #[error("Error enabling TDX memory region: {0}")]
283     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
284 
285     #[cfg(feature = "tdx")]
286     #[error("Error finalizing TDX VM: {0}")]
287     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
288 
289     #[cfg(feature = "tdx")]
290     #[error("TDX firmware missing")]
291     TdxFirmwareMissing,
292 
293     #[cfg(feature = "tdx")]
294     #[error("Invalid TDX payload type")]
295     InvalidPayloadType,
296 
297     #[cfg(feature = "guest_debug")]
298     #[error("Error debugging VM: {0:?}")]
299     Debug(DebuggableError),
300 
301     #[error("Error spawning kernel loading thread")]
302     KernelLoadThreadSpawn(std::io::Error),
303 
304     #[error("Error joining kernel loading thread")]
305     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
306 
307     #[error("Payload configuration is not bootable")]
308     InvalidPayload,
309 
310     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
311     #[error("Error coredumping VM: {0:?}")]
312     Coredump(GuestDebuggableError),
313 
314     #[cfg(feature = "igvm")]
315     #[error("Cannot open igvm file: {0}")]
316     IgvmFile(#[source] io::Error),
317 
318     #[cfg(feature = "igvm")]
319     #[error("Cannot load the igvm into memory: {0}")]
320     IgvmLoad(#[source] igvm_loader::Error),
321 
322     #[error("Error injecting NMI")]
323     ErrorNmi,
324 
325     #[error("Error resuming the VM: {0}")]
326     ResumeVm(#[source] hypervisor::HypervisorVmError),
327 
328     #[error("Error creating console devices")]
329     CreateConsoleDevices(ConsoleDeviceError),
330 }
/// Convenience alias for results whose error type is this module's [`Error`].
pub type Result<T> = result::Result<T, Error>;
332 
/// Lifecycle state of a VM.
///
/// Which moves between states are legal is decided by
/// `VmState::valid_transition`.
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    /// Initial state of a VM that was not restored from a snapshot.
    Created,
    /// NOTE(review): presumably the guest is executing — confirm against the boot/resume paths.
    Running,
    /// Only `Running` can be entered from this state.
    Shutdown,
    /// Also the initial state of a VM restored from a snapshot.
    Paused,
    /// NOTE(review): presumably the guest is halted under a debugger — confirm against the gdb code.
    BreakPoint,
}
341 
342 impl VmState {
343     fn valid_transition(self, new_state: VmState) -> Result<()> {
344         match self {
345             VmState::Created => match new_state {
346                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
347                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
348                     Ok(())
349                 }
350             },
351 
352             VmState::Running => match new_state {
353                 VmState::Created | VmState::Running => {
354                     Err(Error::InvalidStateTransition(self, new_state))
355                 }
356                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
357             },
358 
359             VmState::Shutdown => match new_state {
360                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
361                     Err(Error::InvalidStateTransition(self, new_state))
362                 }
363                 VmState::Running => Ok(()),
364             },
365 
366             VmState::Paused => match new_state {
367                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
368                     Err(Error::InvalidStateTransition(self, new_state))
369                 }
370                 VmState::Running | VmState::Shutdown => Ok(()),
371             },
372             VmState::BreakPoint => match new_state {
373                 VmState::Created | VmState::Running => Ok(()),
374                 _ => Err(Error::InvalidStateTransition(self, new_state)),
375             },
376         }
377     }
378 }
379 
/// State backing the [`VmOps`] implementation: guest memory plus the device
/// buses that guest accesses are forwarded to.
struct VmOpsHandler {
    /// Guest memory served by `guest_mem_read` / `guest_mem_write`.
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    /// Bus that `pio_read` / `pio_write` are forwarded to (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    /// Bus that `mmio_read` / `mmio_write` are forwarded to.
    mmio_bus: Arc<Bus>,
}
386 
387 impl VmOps for VmOpsHandler {
388     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
389         self.memory
390             .memory()
391             .write(buf, GuestAddress(gpa))
392             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
393     }
394 
395     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
396         self.memory
397             .memory()
398             .read(buf, GuestAddress(gpa))
399             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
400     }
401 
402     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
403         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
404             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
405         }
406         Ok(())
407     }
408 
409     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
410         match self.mmio_bus.write(gpa, data) {
411             Err(vm_device::BusError::MissingAddressRange) => {
412                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
413             }
414             Ok(Some(barrier)) => {
415                 info!("Waiting for barrier");
416                 barrier.wait();
417                 info!("Barrier released");
418             }
419             _ => {}
420         };
421         Ok(())
422     }
423 
424     #[cfg(target_arch = "x86_64")]
425     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
426         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
427             info!("Guest PIO read to unregistered address 0x{:x}", port);
428         }
429         Ok(())
430     }
431 
432     #[cfg(target_arch = "x86_64")]
433     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
434         match self.io_bus.write(port, data) {
435             Err(vm_device::BusError::MissingAddressRange) => {
436                 info!("Guest PIO write to unregistered address 0x{:x}", port);
437             }
438             Ok(Some(barrier)) => {
439                 info!("Waiting for barrier");
440                 barrier.wait();
441                 info!("Barrier released");
442             }
443             _ => {}
444         };
445         Ok(())
446     }
447 }
448 
449 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
450     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
451 
452     cmp::min(host_phys_bits, max_phys_bits)
453 }
454 
/// A virtual machine together with the managers that drive it.
///
/// Instances are assembled by `Vm::new_from_memory_manager`.
pub struct Vm {
    /// Kernel file opened from the payload configuration, if one is set
    /// (only with the `tdx` feature).
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    /// Initramfs file opened from the payload configuration, if one is set.
    initramfs: Option<File>,
    /// Join handles of threads owned by the VM; starts out empty.
    threads: Vec<thread::JoinHandle<()>>,
    /// Shared handle to the device manager.
    device_manager: Arc<Mutex<DeviceManager>>,
    /// Shared handle to the VM configuration.
    config: Arc<Mutex<VmConfig>>,
    /// Current lifecycle state; starts as `Paused` when restored from a
    /// snapshot and as `Created` otherwise.
    state: RwLock<VmState>,
    /// Shared handle to the CPU manager.
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    /// Shared handle to the memory manager.
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    /// Clock data taken from the VM snapshot when restoring, `None` otherwise
    /// (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
    /// NUMA nodes built from the `numa` section of the configuration.
    #[cfg(not(target_arch = "riscv64"))]
    numa_nodes: NumaNodes,
    /// Handle to the hypervisor the VM was created with.
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    #[cfg(not(target_arch = "riscv64"))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    /// Taken from the `gdb` config flag when `guest_debug` is compiled in,
    /// `false` otherwise.
    stop_on_boot: bool,
    /// Handle of the payload loading thread, if one was started; `None` when
    /// restoring from a snapshot.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}
478 
479 impl Vm {
    /// Signals handled on behalf of the VM; currently only `SIGWINCH`.
    // NOTE(review): presumably consumed when setting up the signal handler thread — confirm.
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
481 
482     #[allow(clippy::too_many_arguments)]
483     pub fn new_from_memory_manager(
484         config: Arc<Mutex<VmConfig>>,
485         memory_manager: Arc<Mutex<MemoryManager>>,
486         vm: Arc<dyn hypervisor::Vm>,
487         exit_evt: EventFd,
488         reset_evt: EventFd,
489         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
490         seccomp_action: &SeccompAction,
491         hypervisor: Arc<dyn hypervisor::Hypervisor>,
492         activate_evt: EventFd,
493         #[cfg(not(target_arch = "riscv64"))] timestamp: Instant,
494         console_info: Option<ConsoleInfo>,
495         console_resize_pipe: Option<Arc<File>>,
496         original_termios: Arc<Mutex<Option<termios>>>,
497         snapshot: Option<Snapshot>,
498     ) -> Result<Self> {
499         trace_scoped!("Vm::new_from_memory_manager");
500 
501         let boot_id_list = config
502             .lock()
503             .unwrap()
504             .validate()
505             .map_err(Error::ConfigValidation)?;
506 
507         #[cfg(not(feature = "igvm"))]
508         let load_payload_handle = if snapshot.is_none() {
509             Self::load_payload_async(&memory_manager, &config)?
510         } else {
511             None
512         };
513 
514         info!("Booting VM from config: {:?}", &config);
515 
516         // Create NUMA nodes based on NumaConfig.
517         let numa_nodes =
518             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
519 
520         #[cfg(feature = "tdx")]
521         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
522         #[cfg(feature = "sev_snp")]
523         let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
524         #[cfg(feature = "tdx")]
525         let force_iommu = tdx_enabled;
526         #[cfg(feature = "sev_snp")]
527         let force_iommu = sev_snp_enabled;
528         #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
529         let force_iommu = false;
530 
531         #[cfg(feature = "guest_debug")]
532         let stop_on_boot = config.lock().unwrap().gdb;
533         #[cfg(not(feature = "guest_debug"))]
534         let stop_on_boot = false;
535 
536         let memory = memory_manager.lock().unwrap().guest_memory();
537         let io_bus = Arc::new(Bus::new());
538         let mmio_bus = Arc::new(Bus::new());
539 
540         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
541             memory,
542             #[cfg(target_arch = "x86_64")]
543             io_bus: io_bus.clone(),
544             mmio_bus: mmio_bus.clone(),
545         });
546 
547         let cpus_config = { &config.lock().unwrap().cpus.clone() };
548         let cpu_manager = cpu::CpuManager::new(
549             cpus_config,
550             vm.clone(),
551             exit_evt.try_clone().map_err(Error::EventFdClone)?,
552             reset_evt.try_clone().map_err(Error::EventFdClone)?,
553             #[cfg(feature = "guest_debug")]
554             vm_debug_evt,
555             &hypervisor,
556             seccomp_action.clone(),
557             vm_ops,
558             #[cfg(feature = "tdx")]
559             tdx_enabled,
560             &numa_nodes,
561             #[cfg(feature = "sev_snp")]
562             sev_snp_enabled,
563         )
564         .map_err(Error::CpuManager)?;
565 
566         #[cfg(target_arch = "x86_64")]
567         cpu_manager
568             .lock()
569             .unwrap()
570             .populate_cpuid(
571                 &memory_manager,
572                 &hypervisor,
573                 #[cfg(feature = "tdx")]
574                 tdx_enabled,
575             )
576             .map_err(Error::CpuManager)?;
577 
578         // Loading the igvm file is pushed down here because
579         // igvm parser needs cpu_manager to retrieve cpuid leaf.
580         // For the regular case, we can start loading early, but for
581         // igvm case we have to wait until cpu_manager is created.
582         // Currently, Microsoft Hypervisor does not provide any
583         // Hypervisor specific common cpuid, we need to call get_cpuid_values
584         // per cpuid through cpu_manager.
585         #[cfg(feature = "igvm")]
586         let load_payload_handle = if snapshot.is_none() {
587             Self::load_payload_async(
588                 &memory_manager,
589                 &config,
590                 &cpu_manager,
591                 #[cfg(feature = "sev_snp")]
592                 sev_snp_enabled,
593             )?
594         } else {
595             None
596         };
597         // The initial TDX configuration must be done before the vCPUs are
598         // created
599         #[cfg(feature = "tdx")]
600         if tdx_enabled {
601             let cpuid = cpu_manager.lock().unwrap().common_cpuid();
602             let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
603             vm.tdx_init(&cpuid, max_vcpus)
604                 .map_err(Error::InitializeTdxVm)?;
605         }
606 
607         cpu_manager
608             .lock()
609             .unwrap()
610             .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
611             .map_err(Error::CpuManager)?;
612 
613         // This initial SEV-SNP configuration must be done immediately after
614         // vCPUs are created. As part of this initialization we are
615         // transitioning the guest into secure state.
616         #[cfg(feature = "sev_snp")]
617         if sev_snp_enabled {
618             vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
619         }
620 
621         #[cfg(feature = "tdx")]
622         let dynamic = !tdx_enabled;
623         #[cfg(not(feature = "tdx"))]
624         let dynamic = true;
625 
626         let device_manager = DeviceManager::new(
627             io_bus,
628             mmio_bus,
629             vm.clone(),
630             config.clone(),
631             memory_manager.clone(),
632             cpu_manager.clone(),
633             exit_evt.try_clone().map_err(Error::EventFdClone)?,
634             reset_evt,
635             seccomp_action.clone(),
636             numa_nodes.clone(),
637             &activate_evt,
638             force_iommu,
639             boot_id_list,
640             #[cfg(not(target_arch = "riscv64"))]
641             timestamp,
642             snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
643             dynamic,
644         )
645         .map_err(Error::DeviceManager)?;
646 
647         device_manager
648             .lock()
649             .unwrap()
650             .create_devices(console_info, console_resize_pipe, original_termios)
651             .map_err(Error::DeviceManager)?;
652 
653         #[cfg(feature = "tdx")]
654         let kernel = config
655             .lock()
656             .unwrap()
657             .payload
658             .as_ref()
659             .map(|p| p.kernel.as_ref().map(File::open))
660             .unwrap_or_default()
661             .transpose()
662             .map_err(Error::KernelFile)?;
663 
664         let initramfs = config
665             .lock()
666             .unwrap()
667             .payload
668             .as_ref()
669             .map(|p| p.initramfs.as_ref().map(File::open))
670             .unwrap_or_default()
671             .transpose()
672             .map_err(Error::InitramfsFile)?;
673 
674         #[cfg(target_arch = "x86_64")]
675         let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
676             let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
677             vm_snapshot.clock
678         } else {
679             None
680         };
681 
682         let vm_state = if snapshot.is_some() {
683             VmState::Paused
684         } else {
685             VmState::Created
686         };
687 
688         Ok(Vm {
689             #[cfg(feature = "tdx")]
690             kernel,
691             initramfs,
692             device_manager,
693             config,
694             threads: Vec::with_capacity(1),
695             state: RwLock::new(vm_state),
696             cpu_manager,
697             memory_manager,
698             vm,
699             #[cfg(target_arch = "x86_64")]
700             saved_clock,
701             #[cfg(not(target_arch = "riscv64"))]
702             numa_nodes,
703             #[cfg(not(target_arch = "riscv64"))]
704             hypervisor,
705             stop_on_boot,
706             load_payload_handle,
707         })
708     }
709 
710     fn create_numa_nodes(
711         configs: Option<Vec<NumaConfig>>,
712         memory_manager: &Arc<Mutex<MemoryManager>>,
713     ) -> Result<NumaNodes> {
714         let mm = memory_manager.lock().unwrap();
715         let mm_zones = mm.memory_zones();
716         let mut numa_nodes = BTreeMap::new();
717 
718         if let Some(configs) = &configs {
719             for config in configs.iter() {
720                 if numa_nodes.contains_key(&config.guest_numa_id) {
721                     error!("Can't define twice the same NUMA node");
722                     return Err(Error::InvalidNumaConfig);
723                 }
724 
725                 let mut node = NumaNode::default();
726 
727                 if let Some(memory_zones) = &config.memory_zones {
728                     for memory_zone in memory_zones.iter() {
729                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
730                             node.memory_regions.extend(mm_zone.regions().clone());
731                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
732                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
733                             }
734                             node.memory_zones.push(memory_zone.clone());
735                         } else {
736                             error!("Unknown memory zone '{}'", memory_zone);
737                             return Err(Error::InvalidNumaConfig);
738                         }
739                     }
740                 }
741 
742                 if let Some(cpus) = &config.cpus {
743                     node.cpus.extend(cpus);
744                 }
745 
746                 if let Some(pci_segments) = &config.pci_segments {
747                     node.pci_segments.extend(pci_segments);
748                 }
749 
750                 if let Some(distances) = &config.distances {
751                     for distance in distances.iter() {
752                         let dest = distance.destination;
753                         let dist = distance.distance;
754 
755                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
756                             error!("Unknown destination NUMA node {}", dest);
757                             return Err(Error::InvalidNumaConfig);
758                         }
759 
760                         if node.distances.contains_key(&dest) {
761                             error!("Destination NUMA node {} has been already set", dest);
762                             return Err(Error::InvalidNumaConfig);
763                         }
764 
765                         node.distances.insert(dest, dist);
766                     }
767                 }
768 
769                 #[cfg(target_arch = "x86_64")]
770                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
771                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
772                         let mm_sections = sgx_epc_region.epc_sections();
773                         for sgx_epc_section in sgx_epc_sections.iter() {
774                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
775                                 node.sgx_epc_sections.push(mm_section.clone());
776                             } else {
777                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
778                                 return Err(Error::InvalidNumaConfig);
779                             }
780                         }
781                     } else {
782                         error!("Missing SGX EPC region");
783                         return Err(Error::InvalidNumaConfig);
784                     }
785                 }
786 
787                 numa_nodes.insert(config.guest_numa_id, node);
788             }
789         }
790 
791         Ok(numa_nodes)
792     }
793 
794     #[allow(clippy::too_many_arguments)]
795     pub fn new(
796         vm_config: Arc<Mutex<VmConfig>>,
797         exit_evt: EventFd,
798         reset_evt: EventFd,
799         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
800         seccomp_action: &SeccompAction,
801         hypervisor: Arc<dyn hypervisor::Hypervisor>,
802         activate_evt: EventFd,
803         console_info: Option<ConsoleInfo>,
804         console_resize_pipe: Option<Arc<File>>,
805         original_termios: Arc<Mutex<Option<termios>>>,
806         snapshot: Option<Snapshot>,
807         source_url: Option<&str>,
808         prefault: Option<bool>,
809     ) -> Result<Self> {
810         trace_scoped!("Vm::new");
811 
812         #[cfg(not(target_arch = "riscv64"))]
813         let timestamp = Instant::now();
814 
815         #[cfg(feature = "tdx")]
816         let tdx_enabled = if snapshot.is_some() {
817             false
818         } else {
819             vm_config.lock().unwrap().is_tdx_enabled()
820         };
821 
822         #[cfg(feature = "sev_snp")]
823         let sev_snp_enabled = if snapshot.is_some() {
824             false
825         } else {
826             vm_config.lock().unwrap().is_sev_snp_enabled()
827         };
828 
829         let vm = Self::create_hypervisor_vm(
830             &hypervisor,
831             #[cfg(feature = "tdx")]
832             tdx_enabled,
833             #[cfg(feature = "sev_snp")]
834             sev_snp_enabled,
835             #[cfg(feature = "sev_snp")]
836             vm_config.lock().unwrap().memory.total_size(),
837         )?;
838 
839         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
840 
841         let memory_manager = if let Some(snapshot) =
842             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
843         {
844             MemoryManager::new_from_snapshot(
845                 &snapshot,
846                 vm.clone(),
847                 &vm_config.lock().unwrap().memory.clone(),
848                 source_url,
849                 prefault.unwrap(),
850                 phys_bits,
851             )
852             .map_err(Error::MemoryManager)?
853         } else {
854             #[cfg(target_arch = "x86_64")]
855             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
856 
857             MemoryManager::new(
858                 vm.clone(),
859                 &vm_config.lock().unwrap().memory.clone(),
860                 None,
861                 phys_bits,
862                 #[cfg(feature = "tdx")]
863                 tdx_enabled,
864                 None,
865                 None,
866                 #[cfg(target_arch = "x86_64")]
867                 sgx_epc_config,
868             )
869             .map_err(Error::MemoryManager)?
870         };
871 
872         Vm::new_from_memory_manager(
873             vm_config,
874             memory_manager,
875             vm,
876             exit_evt,
877             reset_evt,
878             #[cfg(feature = "guest_debug")]
879             vm_debug_evt,
880             seccomp_action,
881             hypervisor,
882             activate_evt,
883             #[cfg(not(target_arch = "riscv64"))]
884             timestamp,
885             console_info,
886             console_resize_pipe,
887             original_termios,
888             snapshot,
889         )
890     }
891 
892     pub fn create_hypervisor_vm(
893         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
894         #[cfg(feature = "tdx")] tdx_enabled: bool,
895         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
896         #[cfg(feature = "sev_snp")] mem_size: u64,
897     ) -> Result<Arc<dyn hypervisor::Vm>> {
898         hypervisor.check_required_extensions().unwrap();
899 
900         cfg_if::cfg_if! {
901             if #[cfg(feature = "tdx")] {
902                 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true
903                 // Otherwise KVM_X86_LEGACY_VM: 0
904                 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM
905                 let vm = hypervisor
906                     .create_vm_with_type(u64::from(tdx_enabled))
907                     .unwrap();
908             } else if #[cfg(feature = "sev_snp")] {
909                 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true
910                 // Otherwise SEV_SNP_DISABLED: 0
911                 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false
912                 let vm = hypervisor
913                     .create_vm_with_type_and_memory(u64::from(sev_snp_enabled), mem_size)
914                     .unwrap();
915             } else {
916                 let vm = hypervisor.create_vm().unwrap();
917             }
918         }
919 
920         #[cfg(target_arch = "x86_64")]
921         {
922             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
923                 .unwrap();
924             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
925             vm.enable_split_irq().unwrap();
926         }
927 
928         Ok(vm)
929     }
930 
931     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
932         let initramfs = self.initramfs.as_mut().unwrap();
933         let size: usize = initramfs
934             .seek(SeekFrom::End(0))
935             .map_err(|_| Error::InitramfsLoad)?
936             .try_into()
937             .unwrap();
938         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
939 
940         let address =
941             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
942         let address = GuestAddress(address);
943 
944         guest_mem
945             .read_volatile_from(address, initramfs, size)
946             .map_err(|_| Error::InitramfsLoad)?;
947 
948         info!("Initramfs loaded: address = 0x{:x}", address.0);
949         Ok(arch::InitramfsConfig { address, size })
950     }
951 
952     pub fn generate_cmdline(
953         payload: &PayloadConfig,
954         #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] device_manager: &Arc<
955             Mutex<DeviceManager>,
956         >,
957     ) -> Result<Cmdline> {
958         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
959         if let Some(s) = payload.cmdline.as_ref() {
960             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
961         }
962 
963         #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
964         for entry in device_manager.lock().unwrap().cmdline_additions() {
965             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
966         }
967         Ok(cmdline)
968     }
969 
970     #[cfg(target_arch = "aarch64")]
971     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
972         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
973         let mem = uefi_flash.memory();
974         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
975             .map_err(Error::UefiLoad)?;
976         Ok(())
977     }
978 
979     #[cfg(target_arch = "aarch64")]
980     fn load_kernel(
981         firmware: Option<File>,
982         kernel: Option<File>,
983         memory_manager: Arc<Mutex<MemoryManager>>,
984     ) -> Result<EntryPoint> {
985         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
986         let mem = guest_memory.memory();
987         let entry_addr = match (firmware, kernel) {
988             (None, Some(mut kernel)) => {
989                 match linux_loader::loader::pe::PE::load(
990                     mem.deref(),
991                     Some(arch::layout::KERNEL_START),
992                     &mut kernel,
993                     None,
994                 ) {
995                     Ok(entry_addr) => entry_addr.kernel_load,
996                     // Try to load the binary as kernel PE file at first.
997                     // If failed, retry to load it as UEFI binary.
998                     // As the UEFI binary is formatless, it must be the last option to try.
999                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
1000                         Self::load_firmware(&kernel, memory_manager)?;
1001                         arch::layout::UEFI_START
1002                     }
1003                     Err(e) => {
1004                         return Err(Error::KernelLoad(e));
1005                     }
1006                 }
1007             }
1008             (Some(firmware), None) => {
1009                 Self::load_firmware(&firmware, memory_manager)?;
1010                 arch::layout::UEFI_START
1011             }
1012             _ => return Err(Error::InvalidPayload),
1013         };
1014 
1015         Ok(EntryPoint { entry_addr })
1016     }
1017 
1018     #[cfg(target_arch = "riscv64")]
1019     fn load_kernel(
1020         firmware: Option<File>,
1021         kernel: Option<File>,
1022         memory_manager: Arc<Mutex<MemoryManager>>,
1023     ) -> Result<EntryPoint> {
1024         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1025         let mem = guest_memory.memory();
1026         let alignment = 0x20_0000;
1027         let aligned_kernel_addr = arch::layout::KERNEL_START.0 + (alignment - 1) & !(alignment - 1);
1028         let entry_addr = match (firmware, kernel) {
1029             (None, Some(mut kernel)) => {
1030                 match linux_loader::loader::pe::PE::load(
1031                     mem.deref(),
1032                     Some(GuestAddress(aligned_kernel_addr)),
1033                     &mut kernel,
1034                     None,
1035                 ) {
1036                     Ok(entry_addr) => entry_addr.kernel_load,
1037                     // Try to load the binary as kernel PE file at first.
1038                     // If failed, retry to load it as UEFI binary.
1039                     // As the UEFI binary is formatless, it must be the last option to try.
1040                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
1041                         // TODO: UEFI for riscv64 is scheduled to next stage.
1042                         unimplemented!()
1043                     }
1044                     Err(e) => {
1045                         return Err(Error::KernelLoad(e));
1046                     }
1047                 }
1048             }
1049             (Some(_firmware), None) => {
1050                 // TODO: UEFI for riscv64 is scheduled to next stage.
1051                 unimplemented!()
1052             }
1053             _ => return Err(Error::InvalidPayload),
1054         };
1055 
1056         Ok(EntryPoint { entry_addr })
1057     }
1058 
1059     #[cfg(feature = "igvm")]
1060     fn load_igvm(
1061         igvm: File,
1062         memory_manager: Arc<Mutex<MemoryManager>>,
1063         cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1064         #[cfg(feature = "sev_snp")] host_data: &Option<String>,
1065     ) -> Result<EntryPoint> {
1066         let res = igvm_loader::load_igvm(
1067             &igvm,
1068             memory_manager,
1069             cpu_manager.clone(),
1070             "",
1071             #[cfg(feature = "sev_snp")]
1072             host_data,
1073         )
1074         .map_err(Error::IgvmLoad)?;
1075 
1076         cfg_if::cfg_if! {
1077             if #[cfg(feature = "sev_snp")] {
1078                 let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
1079                     EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
1080                 } else {
1081                     EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
1082                 };
1083             } else {
1084                let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
1085             }
1086         };
1087         Ok(entry_point)
1088     }
1089 
1090     #[cfg(target_arch = "x86_64")]
1091     fn load_kernel(
1092         mut kernel: File,
1093         cmdline: Option<Cmdline>,
1094         memory_manager: Arc<Mutex<MemoryManager>>,
1095     ) -> Result<EntryPoint> {
1096         info!("Loading kernel");
1097 
1098         let mem = {
1099             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1100             guest_memory.memory()
1101         };
1102 
1103         // Try ELF binary with PVH boot.
1104         let entry_addr = linux_loader::loader::elf::Elf::load(
1105             mem.deref(),
1106             None,
1107             &mut kernel,
1108             Some(arch::layout::HIGH_RAM_START),
1109         )
1110         // Try loading kernel as bzImage.
1111         .or_else(|_| {
1112             BzImage::load(
1113                 mem.deref(),
1114                 None,
1115                 &mut kernel,
1116                 Some(arch::layout::HIGH_RAM_START),
1117             )
1118         })
1119         .map_err(Error::KernelLoad)?;
1120 
1121         if let Some(cmdline) = cmdline {
1122             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1123                 .map_err(Error::LoadCmdLine)?;
1124         }
1125 
1126         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1127             // Use the PVH kernel entry point to boot the guest
1128             info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1129             Ok(EntryPoint {
1130                 entry_addr,
1131                 setup_header: None,
1132             })
1133         } else if entry_addr.setup_header.is_some() {
1134             // Use the bzImage 32bit entry point to boot the guest
1135             info!(
1136                 "bzImage kernel loaded: entry_addr = 0x{:x}",
1137                 entry_addr.kernel_load.0
1138             );
1139             Ok(EntryPoint {
1140                 entry_addr: entry_addr.kernel_load,
1141                 setup_header: entry_addr.setup_header,
1142             })
1143         } else {
1144             Err(Error::KernelMissingPvhHeader)
1145         }
1146     }
1147 
1148     #[cfg(target_arch = "x86_64")]
1149     fn load_payload(
1150         payload: &PayloadConfig,
1151         memory_manager: Arc<Mutex<MemoryManager>>,
1152         #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1153         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1154     ) -> Result<EntryPoint> {
1155         trace_scoped!("load_payload");
1156         #[cfg(feature = "igvm")]
1157         {
1158             if let Some(_igvm_file) = &payload.igvm {
1159                 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
1160                 #[cfg(feature = "sev_snp")]
1161                 if sev_snp_enabled {
1162                     return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
1163                 }
1164                 #[cfg(not(feature = "sev_snp"))]
1165                 return Self::load_igvm(igvm, memory_manager, cpu_manager);
1166             }
1167         }
1168         match (
1169             &payload.firmware,
1170             &payload.kernel,
1171             &payload.initramfs,
1172             &payload.cmdline,
1173         ) {
1174             (Some(firmware), None, None, None) => {
1175                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1176                 Self::load_kernel(firmware, None, memory_manager)
1177             }
1178             (None, Some(kernel), _, _) => {
1179                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1180                 let cmdline = Self::generate_cmdline(payload)?;
1181                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1182             }
1183             _ => Err(Error::InvalidPayload),
1184         }
1185     }
1186 
1187     #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
1188     fn load_payload(
1189         payload: &PayloadConfig,
1190         memory_manager: Arc<Mutex<MemoryManager>>,
1191     ) -> Result<EntryPoint> {
1192         match (&payload.firmware, &payload.kernel) {
1193             (Some(firmware), None) => {
1194                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1195                 Self::load_kernel(Some(firmware), None, memory_manager)
1196             }
1197             (None, Some(kernel)) => {
1198                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1199                 Self::load_kernel(None, Some(kernel), memory_manager)
1200             }
1201             _ => Err(Error::InvalidPayload),
1202         }
1203     }
1204 
1205     fn load_payload_async(
1206         memory_manager: &Arc<Mutex<MemoryManager>>,
1207         config: &Arc<Mutex<VmConfig>>,
1208         #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
1209         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1210     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1211         // Kernel with TDX is loaded in a different manner
1212         #[cfg(feature = "tdx")]
1213         if config.lock().unwrap().is_tdx_enabled() {
1214             return Ok(None);
1215         }
1216 
1217         config
1218             .lock()
1219             .unwrap()
1220             .payload
1221             .as_ref()
1222             .map(|payload| {
1223                 let memory_manager = memory_manager.clone();
1224                 let payload = payload.clone();
1225                 #[cfg(feature = "igvm")]
1226                 let cpu_manager = cpu_manager.clone();
1227 
1228                 std::thread::Builder::new()
1229                     .name("payload_loader".into())
1230                     .spawn(move || {
1231                         Self::load_payload(
1232                             &payload,
1233                             memory_manager,
1234                             #[cfg(feature = "igvm")]
1235                             cpu_manager,
1236                             #[cfg(feature = "sev_snp")]
1237                             sev_snp_enabled,
1238                         )
1239                     })
1240                     .map_err(Error::KernelLoadThreadSpawn)
1241             })
1242             .transpose()
1243     }
1244 
1245     #[cfg(target_arch = "x86_64")]
1246     fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
1247         trace_scoped!("configure_system");
1248         info!("Configuring system");
1249         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1250 
1251         let initramfs_config = match self.initramfs {
1252             Some(_) => Some(self.load_initramfs(&mem)?),
1253             None => None,
1254         };
1255 
1256         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1257         let rsdp_addr = Some(rsdp_addr);
1258         let sgx_epc_region = self
1259             .memory_manager
1260             .lock()
1261             .unwrap()
1262             .sgx_epc_region()
1263             .as_ref()
1264             .cloned();
1265 
1266         let serial_number = self
1267             .config
1268             .lock()
1269             .unwrap()
1270             .platform
1271             .as_ref()
1272             .and_then(|p| p.serial_number.clone());
1273 
1274         let uuid = self
1275             .config
1276             .lock()
1277             .unwrap()
1278             .platform
1279             .as_ref()
1280             .and_then(|p| p.uuid.clone());
1281 
1282         let oem_strings = self
1283             .config
1284             .lock()
1285             .unwrap()
1286             .platform
1287             .as_ref()
1288             .and_then(|p| p.oem_strings.clone());
1289 
1290         let oem_strings = oem_strings
1291             .as_deref()
1292             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1293 
1294         let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1295 
1296         arch::configure_system(
1297             &mem,
1298             arch::layout::CMDLINE_START,
1299             arch::layout::CMDLINE_MAX_SIZE,
1300             &initramfs_config,
1301             boot_vcpus,
1302             entry_addr.setup_header,
1303             rsdp_addr,
1304             sgx_epc_region,
1305             serial_number.as_deref(),
1306             uuid.as_deref(),
1307             oem_strings.as_deref(),
1308             topology,
1309         )
1310         .map_err(Error::ConfigureSystem)?;
1311         Ok(())
1312     }
1313 
1314     #[cfg(target_arch = "aarch64")]
1315     fn configure_system(
1316         &mut self,
1317         _rsdp_addr: GuestAddress,
1318         _entry_addr: EntryPoint,
1319     ) -> Result<()> {
1320         let cmdline = Self::generate_cmdline(
1321             self.config.lock().unwrap().payload.as_ref().unwrap(),
1322             &self.device_manager,
1323         )?;
1324         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1325         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1326         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1327         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1328         let initramfs_config = match self.initramfs {
1329             Some(_) => Some(self.load_initramfs(&mem)?),
1330             None => None,
1331         };
1332 
1333         let device_info = &self
1334             .device_manager
1335             .lock()
1336             .unwrap()
1337             .get_device_info()
1338             .clone();
1339 
1340         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1341             let pci_space = PciSpaceInfo {
1342                 pci_segment_id: pci_segment.id,
1343                 mmio_config_address: pci_segment.mmio_config_address,
1344                 pci_device_space_start: pci_segment.start_of_mem64_area,
1345                 pci_device_space_size: pci_segment.end_of_mem64_area
1346                     - pci_segment.start_of_mem64_area
1347                     + 1,
1348             };
1349             pci_space_info.push(pci_space);
1350         }
1351 
1352         let virtio_iommu_bdf = self
1353             .device_manager
1354             .lock()
1355             .unwrap()
1356             .iommu_attached_devices()
1357             .as_ref()
1358             .map(|(v, _)| *v);
1359 
1360         let vgic = self
1361             .device_manager
1362             .lock()
1363             .unwrap()
1364             .get_interrupt_controller()
1365             .unwrap()
1366             .lock()
1367             .unwrap()
1368             .get_vgic()
1369             .map_err(|_| {
1370                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1371                     arch::aarch64::Error::SetupGic,
1372                 ))
1373             })?;
1374 
1375         // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
1376         let pmu_supported = self
1377             .cpu_manager
1378             .lock()
1379             .unwrap()
1380             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1381             .map_err(|_| {
1382                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1383                     arch::aarch64::Error::VcpuInitPmu,
1384                 ))
1385             })?;
1386 
1387         arch::configure_system(
1388             &mem,
1389             cmdline.as_cstring().unwrap().to_str().unwrap(),
1390             vcpu_mpidrs,
1391             vcpu_topology,
1392             device_info,
1393             &initramfs_config,
1394             &pci_space_info,
1395             virtio_iommu_bdf.map(|bdf| bdf.into()),
1396             &vgic,
1397             &self.numa_nodes,
1398             pmu_supported,
1399         )
1400         .map_err(Error::ConfigureSystem)?;
1401 
1402         Ok(())
1403     }
1404 
1405     #[cfg(target_arch = "riscv64")]
1406     fn configure_system(&mut self) -> Result<()> {
1407         let cmdline = Self::generate_cmdline(
1408             self.config.lock().unwrap().payload.as_ref().unwrap(),
1409             &self.device_manager,
1410         )?;
1411         let num_vcpu = self.cpu_manager.lock().unwrap().vcpus().len();
1412         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1413         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1414         let initramfs_config = match self.initramfs {
1415             Some(_) => Some(self.load_initramfs(&mem)?),
1416             None => None,
1417         };
1418 
1419         let device_info = &self
1420             .device_manager
1421             .lock()
1422             .unwrap()
1423             .get_device_info()
1424             .clone();
1425 
1426         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1427             let pci_space = PciSpaceInfo {
1428                 pci_segment_id: pci_segment.id,
1429                 mmio_config_address: pci_segment.mmio_config_address,
1430                 pci_device_space_start: pci_segment.start_of_mem64_area,
1431                 pci_device_space_size: pci_segment.end_of_mem64_area
1432                     - pci_segment.start_of_mem64_area
1433                     + 1,
1434             };
1435             pci_space_info.push(pci_space);
1436         }
1437 
1438         // TODO: IOMMU for riscv64 is not yet support in kernel.
1439 
1440         let vaia = self
1441             .device_manager
1442             .lock()
1443             .unwrap()
1444             .get_interrupt_controller()
1445             .unwrap()
1446             .lock()
1447             .unwrap()
1448             .get_vaia()
1449             .map_err(|_| {
1450                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1451                     arch::riscv64::Error::SetupAia,
1452                 ))
1453             })?;
1454 
1455         // TODO: PMU support for riscv64 is scheduled to next stage.
1456 
1457         arch::configure_system(
1458             &mem,
1459             cmdline.as_cstring().unwrap().to_str().unwrap(),
1460             num_vcpu as u32,
1461             device_info,
1462             &initramfs_config,
1463             &pci_space_info,
1464             &vaia,
1465         )
1466         .map_err(Error::ConfigureSystem)?;
1467 
1468         Ok(())
1469     }
1470 
1471     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1472         self.device_manager.lock().unwrap().console_resize_pipe()
1473     }
1474 
1475     pub fn shutdown(&mut self) -> Result<()> {
1476         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1477         let new_state = VmState::Shutdown;
1478 
1479         state.valid_transition(new_state)?;
1480 
1481         // Wake up the DeviceManager threads so they will get terminated cleanly
1482         self.device_manager
1483             .lock()
1484             .unwrap()
1485             .resume()
1486             .map_err(Error::Resume)?;
1487 
1488         self.cpu_manager
1489             .lock()
1490             .unwrap()
1491             .shutdown()
1492             .map_err(Error::CpuManager)?;
1493 
1494         // Wait for all the threads to finish
1495         for thread in self.threads.drain(..) {
1496             thread.join().map_err(Error::ThreadCleanup)?
1497         }
1498         *state = new_state;
1499 
1500         Ok(())
1501     }
1502 
1503     pub fn resize(
1504         &mut self,
1505         desired_vcpus: Option<u8>,
1506         desired_memory: Option<u64>,
1507         desired_balloon: Option<u64>,
1508     ) -> Result<()> {
1509         event!("vm", "resizing");
1510 
1511         if let Some(desired_vcpus) = desired_vcpus {
1512             if self
1513                 .cpu_manager
1514                 .lock()
1515                 .unwrap()
1516                 .resize(desired_vcpus)
1517                 .map_err(Error::CpuManager)?
1518             {
1519                 self.device_manager
1520                     .lock()
1521                     .unwrap()
1522                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1523                     .map_err(Error::DeviceManager)?;
1524             }
1525             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1526         }
1527 
1528         if let Some(desired_memory) = desired_memory {
1529             let new_region = self
1530                 .memory_manager
1531                 .lock()
1532                 .unwrap()
1533                 .resize(desired_memory)
1534                 .map_err(Error::MemoryManager)?;
1535 
1536             let memory_config = &mut self.config.lock().unwrap().memory;
1537 
1538             if let Some(new_region) = &new_region {
1539                 self.device_manager
1540                     .lock()
1541                     .unwrap()
1542                     .update_memory(new_region)
1543                     .map_err(Error::DeviceManager)?;
1544 
1545                 match memory_config.hotplug_method {
1546                     HotplugMethod::Acpi => {
1547                         self.device_manager
1548                             .lock()
1549                             .unwrap()
1550                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1551                             .map_err(Error::DeviceManager)?;
1552                     }
1553                     HotplugMethod::VirtioMem => {}
1554                 }
1555             }
1556 
1557             // We update the VM config regardless of the actual guest resize
1558             // operation result (happened or not), so that if the VM reboots
1559             // it will be running with the last configure memory size.
1560             match memory_config.hotplug_method {
1561                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1562                 HotplugMethod::VirtioMem => {
1563                     if desired_memory > memory_config.size {
1564                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1565                     } else {
1566                         memory_config.hotplugged_size = None;
1567                     }
1568                 }
1569             }
1570         }
1571 
1572         if let Some(desired_balloon) = desired_balloon {
1573             self.device_manager
1574                 .lock()
1575                 .unwrap()
1576                 .resize_balloon(desired_balloon)
1577                 .map_err(Error::DeviceManager)?;
1578 
1579             // Update the configuration value for the balloon size to ensure
1580             // a reboot would use the right value.
1581             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1582                 balloon_config.size = desired_balloon;
1583             }
1584         }
1585 
1586         event!("vm", "resized");
1587 
1588         Ok(())
1589     }
1590 
1591     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1592         let memory_config = &mut self.config.lock().unwrap().memory;
1593 
1594         if let Some(zones) = &mut memory_config.zones {
1595             for zone in zones.iter_mut() {
1596                 if zone.id == id {
1597                     if desired_memory >= zone.size {
1598                         let hotplugged_size = desired_memory - zone.size;
1599                         self.memory_manager
1600                             .lock()
1601                             .unwrap()
1602                             .resize_zone(&id, desired_memory - zone.size)
1603                             .map_err(Error::MemoryManager)?;
1604                         // We update the memory zone config regardless of the
1605                         // actual 'resize-zone' operation result (happened or
1606                         // not), so that if the VM reboots it will be running
1607                         // with the last configured memory zone size.
1608                         zone.hotplugged_size = Some(hotplugged_size);
1609 
1610                         return Ok(());
1611                     } else {
1612                         error!(
1613                             "Invalid to ask less ({}) than boot RAM ({}) for \
1614                             this memory zone",
1615                             desired_memory, zone.size,
1616                         );
1617                         return Err(Error::ResizeZone);
1618                     }
1619                 }
1620             }
1621         }
1622 
1623         error!("Could not find the memory zone {} for the resize", id);
1624         Err(Error::ResizeZone)
1625     }
1626 
1627     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1628         let pci_device_info = self
1629             .device_manager
1630             .lock()
1631             .unwrap()
1632             .add_device(&mut device_cfg)
1633             .map_err(Error::DeviceManager)?;
1634 
1635         // Update VmConfig by adding the new device. This is important to
1636         // ensure the device would be created in case of a reboot.
1637         {
1638             let mut config = self.config.lock().unwrap();
1639             add_to_config(&mut config.devices, device_cfg);
1640         }
1641 
1642         self.device_manager
1643             .lock()
1644             .unwrap()
1645             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1646             .map_err(Error::DeviceManager)?;
1647 
1648         Ok(pci_device_info)
1649     }
1650 
1651     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1652         let pci_device_info = self
1653             .device_manager
1654             .lock()
1655             .unwrap()
1656             .add_user_device(&mut device_cfg)
1657             .map_err(Error::DeviceManager)?;
1658 
1659         // Update VmConfig by adding the new device. This is important to
1660         // ensure the device would be created in case of a reboot.
1661         {
1662             let mut config = self.config.lock().unwrap();
1663             add_to_config(&mut config.user_devices, device_cfg);
1664         }
1665 
1666         self.device_manager
1667             .lock()
1668             .unwrap()
1669             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1670             .map_err(Error::DeviceManager)?;
1671 
1672         Ok(pci_device_info)
1673     }
1674 
1675     pub fn remove_device(&mut self, id: String) -> Result<()> {
1676         self.device_manager
1677             .lock()
1678             .unwrap()
1679             .remove_device(id.clone())
1680             .map_err(Error::DeviceManager)?;
1681 
1682         // Update VmConfig by removing the device. This is important to
1683         // ensure the device would not be created in case of a reboot.
1684         self.config.lock().unwrap().remove_device(&id);
1685 
1686         self.device_manager
1687             .lock()
1688             .unwrap()
1689             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1690             .map_err(Error::DeviceManager)?;
1691         Ok(())
1692     }
1693 
1694     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1695         let pci_device_info = self
1696             .device_manager
1697             .lock()
1698             .unwrap()
1699             .add_disk(&mut disk_cfg)
1700             .map_err(Error::DeviceManager)?;
1701 
1702         // Update VmConfig by adding the new device. This is important to
1703         // ensure the device would be created in case of a reboot.
1704         {
1705             let mut config = self.config.lock().unwrap();
1706             add_to_config(&mut config.disks, disk_cfg);
1707         }
1708 
1709         self.device_manager
1710             .lock()
1711             .unwrap()
1712             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1713             .map_err(Error::DeviceManager)?;
1714 
1715         Ok(pci_device_info)
1716     }
1717 
1718     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1719         let pci_device_info = self
1720             .device_manager
1721             .lock()
1722             .unwrap()
1723             .add_fs(&mut fs_cfg)
1724             .map_err(Error::DeviceManager)?;
1725 
1726         // Update VmConfig by adding the new device. This is important to
1727         // ensure the device would be created in case of a reboot.
1728         {
1729             let mut config = self.config.lock().unwrap();
1730             add_to_config(&mut config.fs, fs_cfg);
1731         }
1732 
1733         self.device_manager
1734             .lock()
1735             .unwrap()
1736             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1737             .map_err(Error::DeviceManager)?;
1738 
1739         Ok(pci_device_info)
1740     }
1741 
1742     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1743         let pci_device_info = self
1744             .device_manager
1745             .lock()
1746             .unwrap()
1747             .add_pmem(&mut pmem_cfg)
1748             .map_err(Error::DeviceManager)?;
1749 
1750         // Update VmConfig by adding the new device. This is important to
1751         // ensure the device would be created in case of a reboot.
1752         {
1753             let mut config = self.config.lock().unwrap();
1754             add_to_config(&mut config.pmem, pmem_cfg);
1755         }
1756 
1757         self.device_manager
1758             .lock()
1759             .unwrap()
1760             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1761             .map_err(Error::DeviceManager)?;
1762 
1763         Ok(pci_device_info)
1764     }
1765 
1766     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1767         let pci_device_info = self
1768             .device_manager
1769             .lock()
1770             .unwrap()
1771             .add_net(&mut net_cfg)
1772             .map_err(Error::DeviceManager)?;
1773 
1774         // Update VmConfig by adding the new device. This is important to
1775         // ensure the device would be created in case of a reboot.
1776         {
1777             let mut config = self.config.lock().unwrap();
1778             add_to_config(&mut config.net, net_cfg);
1779         }
1780 
1781         self.device_manager
1782             .lock()
1783             .unwrap()
1784             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1785             .map_err(Error::DeviceManager)?;
1786 
1787         Ok(pci_device_info)
1788     }
1789 
1790     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1791         let pci_device_info = self
1792             .device_manager
1793             .lock()
1794             .unwrap()
1795             .add_vdpa(&mut vdpa_cfg)
1796             .map_err(Error::DeviceManager)?;
1797 
1798         // Update VmConfig by adding the new device. This is important to
1799         // ensure the device would be created in case of a reboot.
1800         {
1801             let mut config = self.config.lock().unwrap();
1802             add_to_config(&mut config.vdpa, vdpa_cfg);
1803         }
1804 
1805         self.device_manager
1806             .lock()
1807             .unwrap()
1808             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1809             .map_err(Error::DeviceManager)?;
1810 
1811         Ok(pci_device_info)
1812     }
1813 
1814     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1815         let pci_device_info = self
1816             .device_manager
1817             .lock()
1818             .unwrap()
1819             .add_vsock(&mut vsock_cfg)
1820             .map_err(Error::DeviceManager)?;
1821 
1822         // Update VmConfig by adding the new device. This is important to
1823         // ensure the device would be created in case of a reboot.
1824         {
1825             let mut config = self.config.lock().unwrap();
1826             config.vsock = Some(vsock_cfg);
1827         }
1828 
1829         self.device_manager
1830             .lock()
1831             .unwrap()
1832             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1833             .map_err(Error::DeviceManager)?;
1834 
1835         Ok(pci_device_info)
1836     }
1837 
1838     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1839         Ok(self.device_manager.lock().unwrap().counters())
1840     }
1841 
1842     #[cfg(feature = "tdx")]
1843     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1844         use arch::x86_64::tdx::*;
1845 
1846         let firmware_path = self
1847             .config
1848             .lock()
1849             .unwrap()
1850             .payload
1851             .as_ref()
1852             .unwrap()
1853             .firmware
1854             .clone()
1855             .ok_or(Error::TdxFirmwareMissing)?;
1856         // The TDVF file contains a table of section as well as code
1857         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1858 
1859         // For all the sections allocate some RAM backing them
1860         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1861     }
1862 
1863     #[cfg(feature = "tdx")]
1864     fn hob_memory_resources(
1865         mut sorted_sections: Vec<TdvfSection>,
1866         guest_memory: &GuestMemoryMmap,
1867     ) -> Vec<(u64, u64, bool)> {
1868         let mut list = Vec::new();
1869 
1870         let mut current_section = sorted_sections.pop();
1871 
1872         // RAM regions interleaved with TDVF sections
1873         let mut next_start_addr = 0;
1874         for region in guest_memory.iter() {
1875             let region_start = region.start_addr().0;
1876             let region_end = region.last_addr().0;
1877             if region_start > next_start_addr {
1878                 next_start_addr = region_start;
1879             }
1880 
1881             loop {
1882                 let (start, size, ram) = if let Some(section) = &current_section {
1883                     if section.address <= next_start_addr {
1884                         (section.address, section.size, false)
1885                     } else {
1886                         let last_addr = std::cmp::min(section.address - 1, region_end);
1887                         (next_start_addr, last_addr - next_start_addr + 1, true)
1888                     }
1889                 } else {
1890                     (next_start_addr, region_end - next_start_addr + 1, true)
1891                 };
1892 
1893                 list.push((start, size, ram));
1894 
1895                 if !ram {
1896                     current_section = sorted_sections.pop();
1897                 }
1898 
1899                 next_start_addr = start + size;
1900 
1901                 if region_start > next_start_addr {
1902                     next_start_addr = region_start;
1903                 }
1904 
1905                 if next_start_addr > region_end {
1906                     break;
1907                 }
1908             }
1909         }
1910 
1911         // Once all the interleaved sections have been processed, let's simply
1912         // pull the remaining ones.
1913         if let Some(section) = current_section {
1914             list.push((section.address, section.size, false));
1915         }
1916         while let Some(section) = sorted_sections.pop() {
1917             list.push((section.address, section.size, false));
1918         }
1919 
1920         list
1921     }
1922 
    /// Writes the TDVF sections into guest memory and builds the TD HOB.
    ///
    /// The work is done in three steps:
    ///
    /// 1. A RAM region is added for every section whose address is not
    ///    already inside the boot guest memory.
    /// 2. Each section is populated according to its type: firmware volumes
    ///    are copied from the firmware file, the payload (if `self.kernel` is
    ///    set) and the payload parameters (kernel command line) are copied
    ///    in, and the address of the `TdHob` section is recorded.
    /// 3. The HOB is generated at the recorded address: memory resources,
    ///    two MMIO resources, the ACPI tables and, if one was created, the
    ///    payload info.
    ///
    /// Returns the address of the `TdHob` section wrapped in `Some`.
    ///
    /// # Errors
    ///
    /// * `Error::AllocatingTdvfMemory` if a RAM region cannot be added.
    /// * `Error::TdxFirmwareMissing` if the payload has no firmware path.
    /// * `Error::LoadTdvf` if the firmware file cannot be opened or seeked.
    /// * `Error::LoadPayload` if seeking in the payload file fails.
    /// * `Error::InvalidPayloadType` if the payload header checks fail.
    /// * `Error::PopulateHob` if any HOB operation fails.
    /// * Any error returned by `Self::generate_cmdline`.
    ///
    /// # Panics
    ///
    /// Panics if the VM configuration has no payload, if any copy into guest
    /// memory fails, if the payload header cannot be read, or if `sections`
    /// contains no `TdHob` section.
    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(
        &mut self,
        sections: &[TdvfSection],
        guid_found: bool,
    ) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of sections as well as code
        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut payload_info = None;
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    // Copy `data_size` bytes, found at `data_offset` in the
                    // firmware file, to the section's guest address.
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    mem.read_volatile_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    // Only remember where the HOB goes; it is generated once
                    // all sections have been processed.
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    if let Some(payload_file) = self.kernel.as_mut() {
                        // Seeking to the end yields the size of the file.
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        // The setup header is read from file offset 0x1f1.
                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_file
                            .read_volatile(&mut payload_header.as_bytes())
                            .unwrap();

                        // 0x5372_6448 is the ASCII string "HdrS" read as a
                        // little-endian u32.
                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Reject headers older than version 0x0200 or with
                        // bit 0 of `loadflags` cleared.
                        // NOTE(review): presumably the boot protocol 2.00
                        // minimum and the LOADED_HIGH flag; confirm against
                        // the Linux x86 boot protocol.
                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Copy the whole file, from its start, to the
                        // section's guest address.
                        payload_file.rewind().map_err(Error::LoadPayload)?;
                        mem.read_volatile_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    // The command line is written as a NUL-terminated string.
                    info!("Copying payload parameters to guest memory");
                    let cmdline = Self::generate_cmdline(
                        self.config.lock().unwrap().payload.as_ref().unwrap(),
                    )?;
                    mem.write_slice(
                        cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                        GuestAddress(section.address),
                    )
                    .unwrap();
                }
                _ => {}
            }
        }

        // Generate HOB
        let mut hob = TdHob::start(hob_offset.unwrap());

        // Keep only the TempMem sections, ordered by descending address so
        // that hob_memory_resources() pops them lowest address first.
        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();

        for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
            hob.add_memory_resource(&mem, start, size, ram, guid_found)
                .map_err(Error::PopulateHob)?;
        }

        // MMIO regions
        // First range: from MEM_32BIT_DEVICES_START up to APIC_START.
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        // Second range: the device area reported by the memory manager.
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.

        for acpi_table in crate::acpi::create_acpi_tables_tdx(
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        ) {
            hob.add_acpi_table(&mem, acpi_table.as_slice())
                .map_err(Error::PopulateHob)?;
        }

        // If a payload info has been created, let's insert it into the HOB.
        if let Some(payload_info) = payload_info {
            hob.add_payload(&mem, payload_info)
                .map_err(Error::PopulateHob)?;
        }

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }
2112 
2113     #[cfg(feature = "tdx")]
2114     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
2115         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2116         let mem = guest_memory.memory();
2117 
2118         for section in sections {
2119             self.vm
2120                 .tdx_init_memory_region(
2121                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
2122                     section.address,
2123                     section.size,
2124                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
2125                     section.attributes == 1,
2126                 )
2127                 .map_err(Error::InitializeTdxMemoryRegion)?;
2128         }
2129 
2130         Ok(())
2131     }
2132 
2133     // Creates ACPI tables
2134     // In case of TDX being used, this is a no-op since the tables will be
2135     // created and passed when populating the HOB.
2136 
2137     #[cfg(not(target_arch = "riscv64"))]
2138     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2139         #[cfg(feature = "tdx")]
2140         if self.config.lock().unwrap().is_tdx_enabled() {
2141             return None;
2142         }
2143         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2144         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2145         let rsdp_addr = crate::acpi::create_acpi_tables(
2146             &mem,
2147             &self.device_manager,
2148             &self.cpu_manager,
2149             &self.memory_manager,
2150             &self.numa_nodes,
2151             tpm_enabled,
2152         );
2153         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2154 
2155         Some(rsdp_addr)
2156     }
2157 
2158     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2159         trace_scoped!("entry_point");
2160 
2161         self.load_payload_handle
2162             .take()
2163             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2164             .transpose()
2165     }
2166 
    /// Boots the VM, or resumes it if it is currently paused.
    ///
    /// When the current state is `VmState::Paused` this only calls
    /// `resume()`. Otherwise the target state is `VmState::BreakPoint` if
    /// `stop_on_boot` is set and `VmState::Running` if not, and the
    /// transition is validated before anything else happens. The boot
    /// sequence then:
    ///
    /// 1. creates the ACPI tables (on x86_64 before waiting for the payload,
    ///    on aarch64 after the vCPUs are configured),
    /// 2. waits for the payload to be loaded and configures every vCPU,
    /// 3. with the `tdx` feature and TDX enabled, extracts and populates the
    ///    TDVF sections,
    /// 4. configures the system,
    /// 5. on x86_64, allocates the address space in the memory manager,
    /// 6. with TDX, initializes the vCPU TDX state and memory regions and
    ///    finalizes TDX,
    /// 7. resumes the hypervisor VM if the state was `VmState::Created`,
    /// 8. starts the boot vCPUs and records the new state.
    ///
    /// # Errors
    ///
    /// Returns the error of whichever step fails; see the `map_err` calls
    /// below for the `Error` variants involved.
    ///
    /// # Panics
    ///
    /// Panics if a lock is poisoned, if an entry point exists without an
    /// RSDP address, or (riscv64 only) if `configure_system()` fails.
    pub fn boot(&mut self) -> Result<()> {
        trace_scoped!("Vm::boot");
        let current_state = self.get_state()?;
        // Booting a paused VM is just a resume.
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        let new_state = if self.stop_on_boot {
            VmState::BreakPoint
        } else {
            VmState::Running
        };
        current_state.valid_transition(new_state)?;

        // Do earlier to parallelise with loading kernel
        #[cfg(target_arch = "x86_64")]
        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
                let rsdp_addr = if sev_snp_enabled {
                    // In case of SEV-SNP guest ACPI tables are provided via
                    // IGVM. So skip the creation of ACPI tables and set the
                    // rsdp addr to None.
                    None
                } else {
                    self.create_acpi_tables()
                };
            } else {
                let rsdp_addr = self.create_acpi_tables();
            }
        }

        // Load kernel synchronously or if asynchronous then wait for load to
        // finish.
        let entry_point = self.entry_point()?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        // Configure the vcpus that have been created
        let vcpus = self.cpu_manager.lock().unwrap().vcpus();
        for vcpu in vcpus {
            // The boot setup is only provided when an entry point exists.
            let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
            let boot_setup = entry_point.map(|e| (e, guest_memory));
            self.cpu_manager
                .lock()
                .unwrap()
                .configure_vcpu(vcpu, boot_setup)
                .map_err(Error::CpuManager)?;
        }

        #[cfg(feature = "tdx")]
        let (sections, guid_found) = if tdx_enabled {
            self.extract_tdvf_sections()?
        } else {
            (Vec::new(), false)
        };

        // Configuring the TDX regions requires that the vCPUs are created.
        #[cfg(feature = "tdx")]
        let hob_address = if tdx_enabled {
            // TDX sections are written to memory.
            self.populate_tdx_sections(&sections, guid_found)?
        } else {
            None
        };

        // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
        // available after they are configured
        #[cfg(target_arch = "aarch64")]
        let rsdp_addr = self.create_acpi_tables();

        #[cfg(not(target_arch = "riscv64"))]
        // Configure shared state based on loaded kernel
        entry_point
            .map(|entry_point| {
                // Safe to unwrap rsdp_addr as we know it can't be None when
                // the entry_point is Some.
                self.configure_system(rsdp_addr.unwrap(), entry_point)
            })
            .transpose()?;

        // NOTE(review): a failure here panics instead of being propagated
        // like the other steps; confirm this is intended.
        #[cfg(target_arch = "riscv64")]
        self.configure_system().unwrap();

        #[cfg(target_arch = "x86_64")]
        // Note: For x86, always call this function before invoking start boot vcpus.
        // Otherwise guest would fail to boot because we haven't created the
        // userspace mappings to update the hypervisor about the memory mappings.
        // These mappings must be created before we start the vCPU threads for
        // the very first time.
        self.memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            // Let the hypervisor know which memory ranges are shared with the
            // guest. This prevents the guest from ignoring/discarding memory
            // regions provided by the host.
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured TDX setup is complete
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        // Resume the vm for MSHV
        if current_state == VmState::Created {
            self.vm.resume().map_err(Error::ResumeVm)?;
        }

        // The vCPUs are started in a stopped-for-debug fashion when the
        // target state is BreakPoint (the flag passed below).
        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus(new_state == VmState::BreakPoint)
            .map_err(Error::CpuManager)?;

        // Record the new state; any failure to take the write lock without
        // blocking is reported as a poisoned state.
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        Ok(())
    }
2296 
2297     pub fn restore(&mut self) -> Result<()> {
2298         event!("vm", "restoring");
2299 
2300         #[cfg(target_arch = "x86_64")]
2301         // Note: For x86, always call this function before invoking start boot vcpus.
2302         // Otherwise guest would fail to boot because we haven't created the
2303         // userspace mappings to update the hypervisor about the memory mappings.
2304         // These mappings must be created before we start the vCPU threads for
2305         // the very first time for the restored VM.
2306         self.memory_manager
2307             .lock()
2308             .unwrap()
2309             .allocate_address_space()
2310             .map_err(Error::MemoryManager)?;
2311 
2312         // Now we can start all vCPUs from here.
2313         self.cpu_manager
2314             .lock()
2315             .unwrap()
2316             .start_restored_vcpus()
2317             .map_err(Error::CpuManager)?;
2318 
2319         event!("vm", "restored");
2320         Ok(())
2321     }
2322 
2323     /// Gets a thread-safe reference counted pointer to the VM configuration.
2324     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2325         Arc::clone(&self.config)
2326     }
2327 
2328     /// Get the VM state. Returns an error if the state is poisoned.
2329     pub fn get_state(&self) -> Result<VmState> {
2330         self.state
2331             .try_read()
2332             .map_err(|_| Error::PoisonedState)
2333             .map(|state| *state)
2334     }
2335 
2336     /// Gets the actual size of the balloon.
2337     pub fn balloon_size(&self) -> u64 {
2338         self.device_manager.lock().unwrap().balloon_size()
2339     }
2340 
2341     pub fn send_memory_fds(
2342         &mut self,
2343         socket: &mut UnixStream,
2344     ) -> std::result::Result<(), MigratableError> {
2345         for (slot, fd) in self
2346             .memory_manager
2347             .lock()
2348             .unwrap()
2349             .memory_slot_fds()
2350             .drain()
2351         {
2352             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2353                 .write_to(socket)
2354                 .map_err(|e| {
2355                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2356                 })?;
2357             socket
2358                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2359                 .map_err(|e| {
2360                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2361                 })?;
2362 
2363             Response::read_from(socket)?.ok_or_abandon(
2364                 socket,
2365                 MigratableError::MigrateSend(anyhow!("Error during memory fd migration")),
2366             )?;
2367         }
2368 
2369         Ok(())
2370     }
2371 
2372     pub fn send_memory_regions<F>(
2373         &mut self,
2374         ranges: &MemoryRangeTable,
2375         fd: &mut F,
2376     ) -> std::result::Result<(), MigratableError>
2377     where
2378         F: WriteVolatile,
2379     {
2380         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2381         let mem = guest_memory.memory();
2382 
2383         for range in ranges.regions() {
2384             let mut offset: u64 = 0;
2385             // Here we are manually handling the retry in case we can't the
2386             // whole region at once because we can't use the implementation
2387             // from vm-memory::GuestMemory of write_all_to() as it is not
2388             // following the correct behavior. For more info about this issue
2389             // see: https://github.com/rust-vmm/vm-memory/issues/174
2390             loop {
2391                 let bytes_written = mem
2392                     .write_volatile_to(
2393                         GuestAddress(range.gpa + offset),
2394                         fd,
2395                         (range.length - offset) as usize,
2396                     )
2397                     .map_err(|e| {
2398                         MigratableError::MigrateSend(anyhow!(
2399                             "Error transferring memory to socket: {}",
2400                             e
2401                         ))
2402                     })?;
2403                 offset += bytes_written as u64;
2404 
2405                 if offset == range.length {
2406                     break;
2407                 }
2408             }
2409         }
2410 
2411         Ok(())
2412     }
2413 
2414     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2415         self.memory_manager
2416             .lock()
2417             .unwrap()
2418             .memory_range_table(false)
2419     }
2420 
2421     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2422         self.device_manager.lock().unwrap().device_tree()
2423     }
2424 
2425     pub fn activate_virtio_devices(&self) -> Result<()> {
2426         self.device_manager
2427             .lock()
2428             .unwrap()
2429             .activate_virtio_devices()
2430             .map_err(Error::ActivateVirtioDevices)
2431     }
2432 
2433     #[cfg(target_arch = "x86_64")]
2434     pub fn power_button(&self) -> Result<()> {
2435         return self
2436             .device_manager
2437             .lock()
2438             .unwrap()
2439             .notify_power_button()
2440             .map_err(Error::PowerButton);
2441     }
2442 
2443     #[cfg(target_arch = "aarch64")]
2444     pub fn power_button(&self) -> Result<()> {
2445         self.device_manager
2446             .lock()
2447             .unwrap()
2448             .notify_power_button()
2449             .map_err(Error::PowerButton)
2450     }
2451 
    /// Power button notification is not implemented on riscv64.
    ///
    /// # Panics
    ///
    /// Always panics through `unimplemented!()`.
    #[cfg(target_arch = "riscv64")]
    pub fn power_button(&self) -> Result<()> {
        unimplemented!()
    }
2456 
    /// Returns the snapshot data produced by the memory manager's
    /// `snapshot_data()`.
    ///
    /// # Panics
    ///
    /// Panics if locking the memory manager fails (`unwrap` on the lock result).
    pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
        self.memory_manager.lock().unwrap().snapshot_data()
    }
2460 
2461     #[cfg(feature = "guest_debug")]
2462     pub fn debug_request(
2463         &mut self,
2464         gdb_request: &GdbRequestPayload,
2465         cpu_id: usize,
2466     ) -> Result<GdbResponsePayload> {
2467         use GdbRequestPayload::*;
2468         match gdb_request {
2469             SetSingleStep(single_step) => {
2470                 self.set_guest_debug(cpu_id, &[], *single_step)
2471                     .map_err(Error::Debug)?;
2472             }
2473             SetHwBreakPoint(addrs) => {
2474                 self.set_guest_debug(cpu_id, addrs, false)
2475                     .map_err(Error::Debug)?;
2476             }
2477             Pause => {
2478                 self.debug_pause().map_err(Error::Debug)?;
2479             }
2480             Resume => {
2481                 self.debug_resume().map_err(Error::Debug)?;
2482             }
2483             ReadRegs => {
2484                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2485                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2486             }
2487             WriteRegs(regs) => {
2488                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2489             }
2490             ReadMem(vaddr, len) => {
2491                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2492                 let mem = self
2493                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2494                     .map_err(Error::Debug)?;
2495                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2496             }
2497             WriteMem(vaddr, data) => {
2498                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2499                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2500                     .map_err(Error::Debug)?;
2501             }
2502             ActiveVcpus => {
2503                 let active_vcpus = self.active_vcpus();
2504                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2505             }
2506         }
2507         Ok(GdbResponsePayload::CommandComplete)
2508     }
2509 
2510     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2511     fn get_dump_state(
2512         &mut self,
2513         destination_url: &str,
2514     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2515         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2516         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2517         let mut elf_phdr_num = 1;
2518         let elf_sh_info = 0;
2519         let coredump_file_path = url_to_file(destination_url)?;
2520         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2521 
2522         if mapping_num < UINT16_MAX - 2 {
2523             elf_phdr_num += mapping_num as u16;
2524         } else {
2525             panic!("mapping num beyond 65535 not supported");
2526         }
2527         let coredump_file = OpenOptions::new()
2528             .read(true)
2529             .write(true)
2530             .create_new(true)
2531             .open(coredump_file_path)
2532             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2533 
2534         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2535         let mem_data = self
2536             .memory_manager
2537             .lock()
2538             .unwrap()
2539             .coredump_memory_regions(mem_offset);
2540 
2541         Ok(DumpState {
2542             elf_note_size,
2543             elf_phdr_num,
2544             elf_sh_info,
2545             mem_offset,
2546             mem_info: Some(mem_data),
2547             file: Some(coredump_file),
2548         })
2549     }
2550 
2551     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2552     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2553         size_of::<elf::Elf64_Ehdr>() as u64
2554             + note_size as u64
2555             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2556     }
2557 
2558     pub fn nmi(&self) -> Result<()> {
2559         return self
2560             .cpu_manager
2561             .lock()
2562             .unwrap()
2563             .nmi()
2564             .map_err(|_| Error::ErrorNmi);
2565     }
2566 }
2567 
2568 impl Pausable for Vm {
2569     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2570         event!("vm", "pausing");
2571         let mut state = self
2572             .state
2573             .try_write()
2574             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2575         let new_state = VmState::Paused;
2576 
2577         state
2578             .valid_transition(new_state)
2579             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2580 
2581         #[cfg(target_arch = "x86_64")]
2582         {
2583             let mut clock = self
2584                 .vm
2585                 .get_clock()
2586                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2587             clock.reset_flags();
2588             self.saved_clock = Some(clock);
2589         }
2590 
2591         // Before pausing the vCPUs activate any pending virtio devices that might
2592         // need activation between starting the pause (or e.g. a migration it's part of)
2593         self.activate_virtio_devices().map_err(|e| {
2594             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2595         })?;
2596 
2597         self.cpu_manager.lock().unwrap().pause()?;
2598         self.device_manager.lock().unwrap().pause()?;
2599 
2600         self.vm
2601             .pause()
2602             .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?;
2603 
2604         *state = new_state;
2605 
2606         event!("vm", "paused");
2607         Ok(())
2608     }
2609 
2610     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2611         event!("vm", "resuming");
2612         let current_state = self.get_state().unwrap();
2613         let mut state = self
2614             .state
2615             .try_write()
2616             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2617         let new_state = VmState::Running;
2618 
2619         state
2620             .valid_transition(new_state)
2621             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2622 
2623         self.cpu_manager.lock().unwrap().resume()?;
2624         #[cfg(target_arch = "x86_64")]
2625         {
2626             if let Some(clock) = &self.saved_clock {
2627                 self.vm.set_clock(clock).map_err(|e| {
2628                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2629                 })?;
2630             }
2631         }
2632 
2633         if current_state == VmState::Paused {
2634             self.vm
2635                 .resume()
2636                 .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?;
2637         }
2638 
2639         self.device_manager.lock().unwrap().resume()?;
2640 
2641         // And we're back to the Running state.
2642         *state = new_state;
2643         event!("vm", "resumed");
2644         Ok(())
2645     }
2646 }
2647 
/// State owned by the `Vm` itself that is serialized into a VM snapshot.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    /// Copy of the VM's `saved_clock` at the time the snapshot is built
    /// (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    pub clock: Option<hypervisor::ClockData>,
    /// CPUID entries produced by `arch::generate_common_cpuid` at the time
    /// the snapshot is built (KVM on x86_64 only).
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}
2655 
/// Identifier returned by `Vm`'s `Snapshottable::id()`.
pub const VM_SNAPSHOT_ID: &str = "vm";
2657 impl Snapshottable for Vm {
2658     fn id(&self) -> String {
2659         VM_SNAPSHOT_ID.to_string()
2660     }
2661 
2662     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2663         event!("vm", "snapshotting");
2664 
2665         #[cfg(feature = "tdx")]
2666         {
2667             if self.config.lock().unwrap().is_tdx_enabled() {
2668                 return Err(MigratableError::Snapshot(anyhow!(
2669                     "Snapshot not possible with TDX VM"
2670                 )));
2671             }
2672         }
2673 
2674         let current_state = self.get_state().unwrap();
2675         if current_state != VmState::Paused {
2676             return Err(MigratableError::Snapshot(anyhow!(
2677                 "Trying to snapshot while VM is running"
2678             )));
2679         }
2680 
2681         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2682         let common_cpuid = {
2683             let amx = self.config.lock().unwrap().cpus.features.amx;
2684             let phys_bits = physical_bits(
2685                 &self.hypervisor,
2686                 self.config.lock().unwrap().cpus.max_phys_bits,
2687             );
2688             arch::generate_common_cpuid(
2689                 &self.hypervisor,
2690                 &arch::CpuidConfig {
2691                     sgx_epc_sections: None,
2692                     phys_bits,
2693                     kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2694                     #[cfg(feature = "tdx")]
2695                     tdx: false,
2696                     amx,
2697                 },
2698             )
2699             .map_err(|e| {
2700                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2701             })?
2702         };
2703 
2704         let vm_snapshot_state = VmSnapshot {
2705             #[cfg(target_arch = "x86_64")]
2706             clock: self.saved_clock,
2707             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2708             common_cpuid,
2709         };
2710 
2711         let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?;
2712 
2713         let (id, snapshot) = {
2714             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2715             (cpu_manager.id(), cpu_manager.snapshot()?)
2716         };
2717         vm_snapshot.add_snapshot(id, snapshot);
2718         let (id, snapshot) = {
2719             let mut memory_manager = self.memory_manager.lock().unwrap();
2720             (memory_manager.id(), memory_manager.snapshot()?)
2721         };
2722         vm_snapshot.add_snapshot(id, snapshot);
2723         let (id, snapshot) = {
2724             let mut device_manager = self.device_manager.lock().unwrap();
2725             (device_manager.id(), device_manager.snapshot()?)
2726         };
2727         vm_snapshot.add_snapshot(id, snapshot);
2728 
2729         event!("vm", "snapshotted");
2730         Ok(vm_snapshot)
2731     }
2732 }
2733 
2734 impl Transportable for Vm {
2735     fn send(
2736         &self,
2737         snapshot: &Snapshot,
2738         destination_url: &str,
2739     ) -> std::result::Result<(), MigratableError> {
2740         let mut snapshot_config_path = url_to_path(destination_url)?;
2741         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2742 
2743         // Create the snapshot config file
2744         let mut snapshot_config_file = OpenOptions::new()
2745             .read(true)
2746             .write(true)
2747             .create_new(true)
2748             .open(snapshot_config_path)
2749             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2750 
2751         // Serialize and write the snapshot config
2752         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2753             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2754 
2755         snapshot_config_file
2756             .write(vm_config.as_bytes())
2757             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2758 
2759         let mut snapshot_state_path = url_to_path(destination_url)?;
2760         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2761 
2762         // Create the snapshot state file
2763         let mut snapshot_state_file = OpenOptions::new()
2764             .read(true)
2765             .write(true)
2766             .create_new(true)
2767             .open(snapshot_state_path)
2768             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2769 
2770         // Serialize and write the snapshot state
2771         let vm_state =
2772             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2773 
2774         snapshot_state_file
2775             .write(&vm_state)
2776             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2777 
2778         // Tell the memory manager to also send/write its own snapshot.
2779         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2780             self.memory_manager
2781                 .lock()
2782                 .unwrap()
2783                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2784         } else {
2785             return Err(MigratableError::Restore(anyhow!(
2786                 "Missing memory manager snapshot"
2787             )));
2788         }
2789 
2790         Ok(())
2791     }
2792 }
2793 
2794 impl Migratable for Vm {
2795     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2796         self.memory_manager.lock().unwrap().start_dirty_log()?;
2797         self.device_manager.lock().unwrap().start_dirty_log()
2798     }
2799 
2800     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2801         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2802         self.device_manager.lock().unwrap().stop_dirty_log()
2803     }
2804 
2805     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2806         Ok(MemoryRangeTable::new_from_tables(vec![
2807             self.memory_manager.lock().unwrap().dirty_log()?,
2808             self.device_manager.lock().unwrap().dirty_log()?,
2809         ]))
2810     }
2811 
2812     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2813         self.memory_manager.lock().unwrap().start_migration()?;
2814         self.device_manager.lock().unwrap().start_migration()
2815     }
2816 
2817     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2818         self.memory_manager.lock().unwrap().complete_migration()?;
2819         self.device_manager.lock().unwrap().complete_migration()
2820     }
2821 }
2822 
#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Forwards the debug configuration (hardware breakpoint addresses and
    /// single-step flag) for vCPU `cpu_id` to the CPU manager.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM if it is currently `Running`, then marks it as being
    /// in the `BreakPoint` state.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        // The read guard is released at the end of this statement, before
        // `pause()` takes the write lock.
        let is_running = *self.state.read().unwrap() == VmState::Running;
        if is_running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        *self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)? = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes the VM if it is currently in the `BreakPoint` state;
    /// otherwise does nothing.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        let at_breakpoint = *self.state.read().unwrap() == VmState::BreakPoint;
        if !at_breakpoint {
            return Ok(());
        }

        // NOTE(review): a resume failure is reported through the `Pause`
        // variant here; confirm whether a dedicated variant should be used.
        self.resume().map_err(DebuggableError::Pause)
    }

    /// Reads the core registers of vCPU `cpu_id` through the CPU manager.
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    /// Writes `regs` to vCPU `cpu_id` through the CPU manager.
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    /// Reads `len` bytes at guest address `vaddr`, as seen by vCPU
    /// `cpu_id`, through the CPU manager.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    /// Writes `data` at guest address `vaddr`, as seen by vCPU `cpu_id`,
    /// through the CPU manager.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Returns the number of active vCPUs, falling back to the number of
    /// boot vCPUs when none is active.
    fn active_vcpus(&self) -> usize {
        // The lock is released at the end of this statement; it must not be
        // held across the second acquisition below.
        let running = self.cpu_manager.lock().unwrap().active_vcpus();
        if running == 0 {
            // The VM is not booted yet. Report boot_vcpus() instead.
            return self.cpu_manager.lock().unwrap().boot_vcpus() as usize;
        }

        running
    }
}
2906 
2907 #[cfg(feature = "guest_debug")]
/// Largest value representable by a `u16`, held as a `u32`.
pub const UINT16_MAX: u32 = u16::MAX as u32;
2909 
// The impl body is empty: `Vm` overrides nothing, so any `Elf64Writable`
// method called on it comes from the trait's own definitions.
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}
2912 
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes a coredump of the guest to the file designated by
    /// `destination_url`.
    ///
    /// A `Running` VM is paused for the duration of the dump and resumed
    /// afterwards, whether or not the dump succeeded. A VM that is already
    /// `Paused` is left paused. The dump consists of the ELF header, notes
    /// and load headers, followed by the per-CPU notes and the guest memory
    /// contents.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` for a TDX VM or a VM that is
    /// neither running nor paused, `GuestDebuggableError::Pause` /
    /// `GuestDebuggableError::Resume` if pausing or resuming fails, and
    /// propagates any error raised while writing the dump. When both the
    /// dump and the subsequent resume fail, the dump error is returned.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        // Remember whether this call paused the VM so that it can be put
        // back into its original state afterwards.
        let resume = match self.get_state().unwrap() {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                true
            }
            VmState::Paused => false,
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        };

        // Collect the outcome instead of returning early, so that a failed
        // dump does not leave a previously running VM paused.
        let dump_result = self
            .get_dump_state(destination_url)
            .and_then(|coredump_state| {
                self.write_header(&coredump_state)?;
                self.write_note(&coredump_state)?;
                self.write_loads(&coredump_state)?;

                self.cpu_manager
                    .lock()
                    .unwrap()
                    .cpu_write_elf64_note(&coredump_state)?;
                self.cpu_manager
                    .lock()
                    .unwrap()
                    .cpu_write_vmm_note(&coredump_state)?;

                self.memory_manager
                    .lock()
                    .unwrap()
                    .coredump_iterate_save_mem(&coredump_state)?;

                Ok(())
            });

        if resume {
            let resume_result = self.resume().map_err(GuestDebuggableError::Resume);
            // A dump failure takes precedence over a resume failure.
            return dump_result.and(resume_result);
        }

        dump_result
    }
}
2971 
2972 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2973 #[cfg(test)]
2974 mod tests {
2975     use super::*;
2976 
2977     fn test_vm_state_transitions(state: VmState) {
2978         match state {
2979             VmState::Created => {
2980                 // Check the transitions from Created
2981                 state.valid_transition(VmState::Created).unwrap_err();
2982                 state.valid_transition(VmState::Running).unwrap();
2983                 state.valid_transition(VmState::Shutdown).unwrap();
2984                 state.valid_transition(VmState::Paused).unwrap();
2985                 state.valid_transition(VmState::BreakPoint).unwrap();
2986             }
2987             VmState::Running => {
2988                 // Check the transitions from Running
2989                 state.valid_transition(VmState::Created).unwrap_err();
2990                 state.valid_transition(VmState::Running).unwrap_err();
2991                 state.valid_transition(VmState::Shutdown).unwrap();
2992                 state.valid_transition(VmState::Paused).unwrap();
2993                 state.valid_transition(VmState::BreakPoint).unwrap();
2994             }
2995             VmState::Shutdown => {
2996                 // Check the transitions from Shutdown
2997                 state.valid_transition(VmState::Created).unwrap_err();
2998                 state.valid_transition(VmState::Running).unwrap();
2999                 state.valid_transition(VmState::Shutdown).unwrap_err();
3000                 state.valid_transition(VmState::Paused).unwrap_err();
3001                 state.valid_transition(VmState::BreakPoint).unwrap_err();
3002             }
3003             VmState::Paused => {
3004                 // Check the transitions from Paused
3005                 state.valid_transition(VmState::Created).unwrap_err();
3006                 state.valid_transition(VmState::Running).unwrap();
3007                 state.valid_transition(VmState::Shutdown).unwrap();
3008                 state.valid_transition(VmState::Paused).unwrap_err();
3009                 state.valid_transition(VmState::BreakPoint).unwrap_err();
3010             }
3011             VmState::BreakPoint => {
3012                 // Check the transitions from Breakpoint
3013                 state.valid_transition(VmState::Created).unwrap();
3014                 state.valid_transition(VmState::Running).unwrap();
3015                 state.valid_transition(VmState::Shutdown).unwrap_err();
3016                 state.valid_transition(VmState::Paused).unwrap_err();
3017                 state.valid_transition(VmState::BreakPoint).unwrap_err();
3018             }
3019         }
3020     }
3021 
3022     #[test]
3023     fn test_vm_created_transitions() {
3024         test_vm_state_transitions(VmState::Created);
3025     }
3026 
3027     #[test]
3028     fn test_vm_running_transitions() {
3029         test_vm_state_transitions(VmState::Running);
3030     }
3031 
3032     #[test]
3033     fn test_vm_shutdown_transitions() {
3034         test_vm_state_transitions(VmState::Shutdown);
3035     }
3036 
3037     #[test]
3038     fn test_vm_paused_transitions() {
3039         test_vm_state_transitions(VmState::Paused);
3040     }
3041 
3042     #[cfg(feature = "tdx")]
3043     #[test]
3044     fn test_hob_memory_resources() {
3045         // Case 1: Two TDVF sections in the middle of the RAM
3046         let sections = vec![
3047             TdvfSection {
3048                 address: 0xc000,
3049                 size: 0x1000,
3050                 ..Default::default()
3051             },
3052             TdvfSection {
3053                 address: 0x1000,
3054                 size: 0x4000,
3055                 ..Default::default()
3056             },
3057         ];
3058         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
3059         let expected = vec![
3060             (0, 0x1000, true),
3061             (0x1000, 0x4000, false),
3062             (0x5000, 0x7000, true),
3063             (0xc000, 0x1000, false),
3064             (0xd000, 0x0fff_3000, true),
3065         ];
3066         assert_eq!(
3067             expected,
3068             Vm::hob_memory_resources(
3069                 sections,
3070                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3071             )
3072         );
3073 
3074         // Case 2: Two TDVF sections with no conflict with the RAM
3075         let sections = vec![
3076             TdvfSection {
3077                 address: 0x1000_1000,
3078                 size: 0x1000,
3079                 ..Default::default()
3080             },
3081             TdvfSection {
3082                 address: 0,
3083                 size: 0x1000,
3084                 ..Default::default()
3085             },
3086         ];
3087         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3088         let expected = vec![
3089             (0, 0x1000, false),
3090             (0x1000, 0x1000_0000, true),
3091             (0x1000_1000, 0x1000, false),
3092         ];
3093         assert_eq!(
3094             expected,
3095             Vm::hob_memory_resources(
3096                 sections,
3097                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3098             )
3099         );
3100 
3101         // Case 3: Two TDVF sections with partial conflicts with the RAM
3102         let sections = vec![
3103             TdvfSection {
3104                 address: 0x1000_0000,
3105                 size: 0x2000,
3106                 ..Default::default()
3107             },
3108             TdvfSection {
3109                 address: 0,
3110                 size: 0x2000,
3111                 ..Default::default()
3112             },
3113         ];
3114         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3115         let expected = vec![
3116             (0, 0x2000, false),
3117             (0x2000, 0x0fff_e000, true),
3118             (0x1000_0000, 0x2000, false),
3119         ];
3120         assert_eq!(
3121             expected,
3122             Vm::hob_memory_resources(
3123                 sections,
3124                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3125             )
3126         );
3127 
3128         // Case 4: Two TDVF sections with no conflict before the RAM and two
3129         // more additional sections with no conflict after the RAM.
3130         let sections = vec![
3131             TdvfSection {
3132                 address: 0x2000_1000,
3133                 size: 0x1000,
3134                 ..Default::default()
3135             },
3136             TdvfSection {
3137                 address: 0x2000_0000,
3138                 size: 0x1000,
3139                 ..Default::default()
3140             },
3141             TdvfSection {
3142                 address: 0x1000,
3143                 size: 0x1000,
3144                 ..Default::default()
3145             },
3146             TdvfSection {
3147                 address: 0,
3148                 size: 0x1000,
3149                 ..Default::default()
3150             },
3151         ];
3152         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
3153         let expected = vec![
3154             (0, 0x1000, false),
3155             (0x1000, 0x1000, false),
3156             (0x4000, 0x1000_0000, true),
3157             (0x2000_0000, 0x1000, false),
3158             (0x2000_1000, 0x1000, false),
3159         ];
3160         assert_eq!(
3161             expected,
3162             Vm::hob_memory_resources(
3163                 sections,
3164                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3165             )
3166         );
3167 
3168         // Case 5: One TDVF section overriding the entire RAM
3169         let sections = vec![TdvfSection {
3170             address: 0,
3171             size: 0x2000_0000,
3172             ..Default::default()
3173         }];
3174         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3175         let expected = vec![(0, 0x2000_0000, false)];
3176         assert_eq!(
3177             expected,
3178             Vm::hob_memory_resources(
3179                 sections,
3180                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3181             )
3182         );
3183 
3184         // Case 6: Two TDVF sections with no conflict with 2 RAM regions
3185         let sections = vec![
3186             TdvfSection {
3187                 address: 0x1000_2000,
3188                 size: 0x2000,
3189                 ..Default::default()
3190             },
3191             TdvfSection {
3192                 address: 0,
3193                 size: 0x2000,
3194                 ..Default::default()
3195             },
3196         ];
3197         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3198             (GuestAddress(0x2000), 0x1000_0000),
3199             (GuestAddress(0x1000_4000), 0x1000_0000),
3200         ];
3201         let expected = vec![
3202             (0, 0x2000, false),
3203             (0x2000, 0x1000_0000, true),
3204             (0x1000_2000, 0x2000, false),
3205             (0x1000_4000, 0x1000_0000, true),
3206         ];
3207         assert_eq!(
3208             expected,
3209             Vm::hob_memory_resources(
3210                 sections,
3211                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3212             )
3213         );
3214 
3215         // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
3216         let sections = vec![
3217             TdvfSection {
3218                 address: 0x1000_0000,
3219                 size: 0x4000,
3220                 ..Default::default()
3221             },
3222             TdvfSection {
3223                 address: 0,
3224                 size: 0x4000,
3225                 ..Default::default()
3226             },
3227         ];
3228         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3229             (GuestAddress(0x1000), 0x1000_0000),
3230             (GuestAddress(0x1000_3000), 0x1000_0000),
3231         ];
3232         let expected = vec![
3233             (0, 0x4000, false),
3234             (0x4000, 0x0fff_c000, true),
3235             (0x1000_0000, 0x4000, false),
3236             (0x1000_4000, 0x0fff_f000, true),
3237         ];
3238         assert_eq!(
3239             expected,
3240             Vm::hob_memory_resources(
3241                 sections,
3242                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3243             )
3244         );
3245     }
3246 }
3247 
3248 #[cfg(target_arch = "aarch64")]
3249 #[cfg(test)]
3250 mod tests {
3251     use arch::aarch64::fdt::create_fdt;
3252     use arch::aarch64::layout;
3253     use arch::{DeviceType, MmioDeviceInfo};
3254     use devices::gic::Gic;
3255 
3256     use super::*;
3257 
    // Length of each MMIO device window declared by the test below; also
    // used as the spacing between the device base addresses.
    const LEN: u64 = 4096;
3259 
3260     #[test]
3261     fn test_create_fdt_with_devices() {
3262         let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
3263         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
3264 
3265         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
3266             (
3267                 (DeviceType::Serial, DeviceType::Serial.to_string()),
3268                 MmioDeviceInfo {
3269                     addr: 0x00,
3270                     len: LEN,
3271                     irq: 33,
3272                 },
3273             ),
3274             (
3275                 (DeviceType::Virtio(1), "virtio".to_string()),
3276                 MmioDeviceInfo {
3277                     addr: LEN,
3278                     len: LEN,
3279                     irq: 34,
3280                 },
3281             ),
3282             (
3283                 (DeviceType::Rtc, "rtc".to_string()),
3284                 MmioDeviceInfo {
3285                     addr: 2 * LEN,
3286                     len: LEN,
3287                     irq: 35,
3288                 },
3289             ),
3290         ]
3291         .iter()
3292         .cloned()
3293         .collect();
3294 
3295         let hv = hypervisor::new().unwrap();
3296         let vm = hv.create_vm().unwrap();
3297         let gic = vm
3298             .create_vgic(Gic::create_default_config(1))
3299             .expect("Cannot create gic");
3300         create_fdt(
3301             &mem,
3302             "console=tty0",
3303             vec![0],
3304             Some((0, 0, 0)),
3305             &dev_info,
3306             &gic,
3307             &None,
3308             &Vec::new(),
3309             &BTreeMap::new(),
3310             None,
3311             true,
3312         )
3313         .unwrap();
3314     }
3315 }
3316 
/// Smoke test: loads a tiny hand-assembled program into guest memory, points
/// a single vCPU at it and runs the vCPU until a `VmExit::Reset` is reported.
///
/// Any exit other than `VmExit::Reset` or `VmExit::Ignore` fails the test.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};

    // This example is based on https://lwn.net/Articles/658511/
    let code: [u8; 12] = [
        0xba, 0xf8, 0x03, // mov $0x3f8, %dx
        0x00, 0xd8, // add %bl, %al
        0x04, b'0', // add $'0', %al
        0xee, // out %al, (%dx)
        0xb0, b'\n', // mov $'\n', %al
        0xee, // out %al, (%dx)
        0xf4, // hlt
    ];

    let hypervisor = hypervisor::new().unwrap();
    let guest = hypervisor.create_vm().expect("new VM creation failed");

    // A single 0x1000-byte region of guest memory starting at 0x1000.
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, 0x1000)]).unwrap();

    // Register every guest memory region with the VM, using the region's
    // index as its slot number.
    for (slot, region) in mem.iter().enumerate() {
        let guest_start = region.start_addr().raw_value();
        let host_ptr = region.as_ptr() as u64;
        let user_region = guest.make_user_memory_region(
            slot as u32,
            guest_start,
            region.len(),
            host_ptr,
            false,
            false,
        );

        guest
            .create_user_memory_region(user_region)
            .expect("Cannot configure guest memory");
    }

    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = guest.create_vcpu(0, None).expect("new Vcpu failed");

    // Zero the code segment base and selector.
    let mut sregs = vcpu.get_sregs().expect("get sregs failed");
    sregs.cs.selector = 0;
    sregs.cs.base = 0;
    vcpu.set_sregs(&sregs).expect("set sregs failed");

    // Start executing at the load address with the operands of the guest's
    // `add %bl, %al` preloaded (rax = 2, rbx = 3).
    let mut regs = vcpu.get_regs().expect("get regs failed");
    regs.set_rflags(2);
    regs.set_rbx(3);
    regs.set_rax(2);
    regs.set_rip(0x1000);
    vcpu.set_regs(&regs).expect("set regs failed");

    // Keep running the vCPU until it reports a reset; ignorable exits simply
    // re-enter the guest.
    loop {
        let exit = vcpu.run().expect("run failed");
        match exit {
            VmExit::Ignore => continue,
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            other => panic!("unexpected exit reason: {other:?}"),
        }
    }
}
3381