xref: /cloud-hypervisor/vmm/src/vm.rs (revision 5d0d56f50ba2b69a6d00379d446792e063da9a4f)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
20 use crate::coredump::{
21     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
22 };
23 use crate::cpu;
24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair};
25 use crate::device_tree::DeviceTree;
26 #[cfg(feature = "guest_debug")]
27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
28 #[cfg(feature = "igvm")]
29 use crate::igvm::igvm_loader;
30 use crate::memory_manager::{
31     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
32 };
33 #[cfg(target_arch = "x86_64")]
34 use crate::migration::get_vm_snapshot;
35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
36 use crate::migration::url_to_file;
37 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
38 use crate::GuestMemoryMmap;
39 use crate::{
40     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
41 };
42 use anyhow::anyhow;
43 use arch::get_host_cpu_phys_bits;
44 #[cfg(target_arch = "x86_64")]
45 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
46 #[cfg(feature = "tdx")]
47 use arch::x86_64::tdx::TdvfSection;
48 use arch::EntryPoint;
49 #[cfg(target_arch = "aarch64")]
50 use arch::PciSpaceInfo;
51 use arch::{NumaNode, NumaNodes};
52 #[cfg(target_arch = "aarch64")]
53 use devices::interrupt_controller;
54 use devices::AcpiNotificationFlags;
55 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
56 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
57 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
58 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
59 use hypervisor::{HypervisorVmError, VmOps};
60 use libc::{termios, SIGWINCH};
61 use linux_loader::cmdline::Cmdline;
62 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
63 use linux_loader::elf;
64 #[cfg(target_arch = "x86_64")]
65 use linux_loader::loader::bzimage::BzImage;
66 #[cfg(target_arch = "x86_64")]
67 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
68 #[cfg(target_arch = "aarch64")]
69 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
70 use linux_loader::loader::KernelLoader;
71 use seccompiler::SeccompAction;
72 use serde::{Deserialize, Serialize};
73 use std::cmp;
74 use std::collections::BTreeMap;
75 use std::collections::HashMap;
76 use std::fs::{File, OpenOptions};
77 use std::io::{self, Seek, SeekFrom, Write};
78 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
79 use std::mem::size_of;
80 use std::num::Wrapping;
81 use std::ops::Deref;
82 use std::os::unix::net::UnixStream;
83 use std::sync::{Arc, Mutex, RwLock};
84 use std::time::Instant;
85 use std::{result, str, thread};
86 use thiserror::Error;
87 use tracer::trace_scoped;
88 use vm_device::Bus;
89 #[cfg(feature = "tdx")]
90 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
91 use vm_memory::{
92     Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
93 };
94 use vm_migration::protocol::{Request, Response, Status};
95 use vm_migration::{
96     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
97     Snapshottable, Transportable,
98 };
99 use vmm_sys_util::eventfd::EventFd;
100 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
101 
102 /// Errors associated with VM management
103 #[derive(Debug, Error)]
104 pub enum Error {
105     #[error("Cannot open kernel file: {0}")]
106     KernelFile(#[source] io::Error),
107 
108     #[error("Cannot open initramfs file: {0}")]
109     InitramfsFile(#[source] io::Error),
110 
111     #[error("Cannot load the kernel into memory: {0}")]
112     KernelLoad(#[source] linux_loader::loader::Error),
113 
114     #[cfg(target_arch = "aarch64")]
115     #[error("Cannot load the UEFI binary in memory: {0:?}")]
116     UefiLoad(arch::aarch64::uefi::Error),
117 
118     #[error("Cannot load the initramfs into memory")]
119     InitramfsLoad,
120 
121     #[error("Cannot load the kernel command line in memory: {0}")]
122     LoadCmdLine(#[source] linux_loader::loader::Error),
123 
124     #[error("Cannot modify the kernel command line: {0}")]
125     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
126 
127     #[error("Cannot create the kernel command line: {0}")]
128     CmdLineCreate(#[source] linux_loader::cmdline::Error),
129 
130     #[error("Cannot configure system: {0}")]
131     ConfigureSystem(#[source] arch::Error),
132 
133     #[cfg(target_arch = "aarch64")]
134     #[error("Cannot enable interrupt controller: {0:?}")]
135     EnableInterruptController(interrupt_controller::Error),
136 
137     #[error("VM state is poisoned")]
138     PoisonedState,
139 
140     #[error("Error from device manager: {0:?}")]
141     DeviceManager(DeviceManagerError),
142 
143     #[error("No device with id {0:?} to remove")]
144     NoDeviceToRemove(String),
145 
146     #[error("Cannot spawn a signal handler thread: {0}")]
147     SignalHandlerSpawn(#[source] io::Error),
148 
149     #[error("Failed to join on threads: {0:?}")]
150     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
151 
152     #[error("VM config is missing")]
153     VmMissingConfig,
154 
155     #[error("VM is not created")]
156     VmNotCreated,
157 
158     #[error("VM is already created")]
159     VmAlreadyCreated,
160 
161     #[error("VM is not running")]
162     VmNotRunning,
163 
164     #[error("Cannot clone EventFd: {0}")]
165     EventFdClone(#[source] io::Error),
166 
167     #[error("invalid VM state transition: {0:?} to {1:?}")]
168     InvalidStateTransition(VmState, VmState),
169 
170     #[error("Error from CPU manager: {0}")]
171     CpuManager(#[source] cpu::Error),
172 
173     #[error("Cannot pause devices: {0}")]
174     PauseDevices(#[source] MigratableError),
175 
176     #[error("Cannot resume devices: {0}")]
177     ResumeDevices(#[source] MigratableError),
178 
179     #[error("Cannot pause CPUs: {0}")]
180     PauseCpus(#[source] MigratableError),
181 
182     #[error("Cannot resume cpus: {0}")]
183     ResumeCpus(#[source] MigratableError),
184 
185     #[error("Cannot pause VM: {0}")]
186     Pause(#[source] MigratableError),
187 
188     #[error("Cannot resume VM: {0}")]
189     Resume(#[source] MigratableError),
190 
191     #[error("Memory manager error: {0:?}")]
192     MemoryManager(MemoryManagerError),
193 
194     #[error("Eventfd write error: {0}")]
195     EventfdError(#[source] std::io::Error),
196 
197     #[error("Cannot snapshot VM: {0}")]
198     Snapshot(#[source] MigratableError),
199 
200     #[error("Cannot restore VM: {0}")]
201     Restore(#[source] MigratableError),
202 
203     #[error("Cannot send VM snapshot: {0}")]
204     SnapshotSend(#[source] MigratableError),
205 
206     #[error("Invalid restore source URL")]
207     InvalidRestoreSourceUrl,
208 
209     #[error("Failed to validate config: {0}")]
210     ConfigValidation(#[source] ValidationError),
211 
212     #[error("Too many virtio-vsock devices")]
213     TooManyVsockDevices,
214 
215     #[error("Failed serializing into JSON: {0}")]
216     SerializeJson(#[source] serde_json::Error),
217 
218     #[error("Invalid NUMA configuration")]
219     InvalidNumaConfig,
220 
221     #[error("Cannot create seccomp filter: {0}")]
222     CreateSeccompFilter(#[source] seccompiler::Error),
223 
224     #[error("Cannot apply seccomp filter: {0}")]
225     ApplySeccompFilter(#[source] seccompiler::Error),
226 
227     #[error("Failed resizing a memory zone")]
228     ResizeZone,
229 
230     #[error("Cannot activate virtio devices: {0:?}")]
231     ActivateVirtioDevices(DeviceManagerError),
232 
233     #[error("Error triggering power button: {0:?}")]
234     PowerButton(DeviceManagerError),
235 
236     #[error("Kernel lacks PVH header")]
237     KernelMissingPvhHeader,
238 
239     #[error("Failed to allocate firmware RAM: {0:?}")]
240     AllocateFirmwareMemory(MemoryManagerError),
241 
242     #[error("Error manipulating firmware file: {0}")]
243     FirmwareFile(#[source] std::io::Error),
244 
245     #[error("Firmware too big")]
246     FirmwareTooLarge,
247 
248     #[error("Failed to copy firmware to memory: {0}")]
249     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
250 
251     #[cfg(feature = "sev_snp")]
252     #[error("Error enabling SEV-SNP VM: {0}")]
253     InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),
254 
255     #[cfg(feature = "tdx")]
256     #[error("Error performing I/O on TDX firmware file: {0}")]
257     LoadTdvf(#[source] std::io::Error),
258 
259     #[cfg(feature = "tdx")]
260     #[error("Error performing I/O on the TDX payload file: {0}")]
261     LoadPayload(#[source] std::io::Error),
262 
263     #[cfg(feature = "tdx")]
264     #[error("Error parsing TDVF: {0}")]
265     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
266 
267     #[cfg(feature = "tdx")]
268     #[error("Error populating TDX HOB: {0}")]
269     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
270 
271     #[cfg(feature = "tdx")]
272     #[error("Error allocating TDVF memory: {0:?}")]
273     AllocatingTdvfMemory(crate::memory_manager::Error),
274 
275     #[cfg(feature = "tdx")]
276     #[error("Error enabling TDX VM: {0}")]
277     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
278 
279     #[cfg(feature = "tdx")]
280     #[error("Error enabling TDX memory region: {0}")]
281     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
282 
283     #[cfg(feature = "tdx")]
284     #[error("Error finalizing TDX VM: {0}")]
285     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
286 
287     #[cfg(feature = "tdx")]
288     #[error("TDX firmware missing")]
289     TdxFirmwareMissing,
290 
291     #[cfg(feature = "tdx")]
292     #[error("Invalid TDX payload type")]
293     InvalidPayloadType,
294 
295     #[cfg(feature = "guest_debug")]
296     #[error("Error debugging VM: {0:?}")]
297     Debug(DebuggableError),
298 
299     #[error("Error spawning kernel loading thread")]
300     KernelLoadThreadSpawn(std::io::Error),
301 
302     #[error("Error joining kernel loading thread")]
303     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
304 
305     #[error("Payload configuration is not bootable")]
306     InvalidPayload,
307 
308     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
309     #[error("Error coredumping VM: {0:?}")]
310     Coredump(GuestDebuggableError),
311 
312     #[cfg(feature = "igvm")]
313     #[error("Cannot open igvm file: {0}")]
314     IgvmFile(#[source] io::Error),
315 
316     #[cfg(feature = "igvm")]
317     #[error("Cannot load the igvm into memory: {0}")]
318     IgvmLoad(#[source] igvm_loader::Error),
319 
320     #[error("Error injecting NMI")]
321     ErrorNmi,
322 
323     #[error("Error resuming the VM: {0}")]
324     ResumeVm(#[source] hypervisor::HypervisorVmError),
325 }
326 pub type Result<T> = result::Result<T, Error>;
327 
/// Lifecycle state of a [`Vm`].
///
/// The transitions allowed between these states are defined by
/// `VmState::valid_transition`.
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    /// Initial state of a VM that is not restored from a snapshot.
    Created,
    /// Reachable from every other state; cannot be re-entered from itself.
    Running,
    /// Can only be left by going back to `Running`.
    Shutdown,
    /// Initial state of a VM restored from a snapshot; also reachable from
    /// `Created` and `Running`.
    Paused,
    /// Reachable from `Created` and `Running`.
    /// NOTE(review): presumably entered while a debugger holds the guest —
    /// confirm against the guest debug code.
    BreakPoint,
}
336 
337 impl VmState {
338     fn valid_transition(self, new_state: VmState) -> Result<()> {
339         match self {
340             VmState::Created => match new_state {
341                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
342                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
343                     Ok(())
344                 }
345             },
346 
347             VmState::Running => match new_state {
348                 VmState::Created | VmState::Running => {
349                     Err(Error::InvalidStateTransition(self, new_state))
350                 }
351                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
352             },
353 
354             VmState::Shutdown => match new_state {
355                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
356                     Err(Error::InvalidStateTransition(self, new_state))
357                 }
358                 VmState::Running => Ok(()),
359             },
360 
361             VmState::Paused => match new_state {
362                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
363                     Err(Error::InvalidStateTransition(self, new_state))
364                 }
365                 VmState::Running | VmState::Shutdown => Ok(()),
366             },
367             VmState::BreakPoint => match new_state {
368                 VmState::Created | VmState::Running => Ok(()),
369                 _ => Err(Error::InvalidStateTransition(self, new_state)),
370             },
371         }
372     }
373 }
374 
/// Implementation of the hypervisor `VmOps` callbacks, backed by the guest
/// memory and the device buses of the VM.
struct VmOpsHandler {
    /// Guest memory used to serve `guest_mem_read` / `guest_mem_write`.
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    /// Bus used to serve `pio_read` / `pio_write` (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    /// Bus used to serve `mmio_read` / `mmio_write`.
    mmio_bus: Arc<Bus>,
}
381 
382 impl VmOps for VmOpsHandler {
383     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
384         self.memory
385             .memory()
386             .write(buf, GuestAddress(gpa))
387             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
388     }
389 
390     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
391         self.memory
392             .memory()
393             .read(buf, GuestAddress(gpa))
394             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
395     }
396 
397     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
398         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
399             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
400         }
401         Ok(())
402     }
403 
404     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
405         match self.mmio_bus.write(gpa, data) {
406             Err(vm_device::BusError::MissingAddressRange) => {
407                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
408             }
409             Ok(Some(barrier)) => {
410                 info!("Waiting for barrier");
411                 barrier.wait();
412                 info!("Barrier released");
413             }
414             _ => {}
415         };
416         Ok(())
417     }
418 
419     #[cfg(target_arch = "x86_64")]
420     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
421         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
422             info!("Guest PIO read to unregistered address 0x{:x}", port);
423         }
424         Ok(())
425     }
426 
427     #[cfg(target_arch = "x86_64")]
428     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
429         match self.io_bus.write(port, data) {
430             Err(vm_device::BusError::MissingAddressRange) => {
431                 info!("Guest PIO write to unregistered address 0x{:x}", port);
432             }
433             Ok(Some(barrier)) => {
434                 info!("Waiting for barrier");
435                 barrier.wait();
436                 info!("Barrier released");
437             }
438             _ => {}
439         };
440         Ok(())
441     }
442 }
443 
444 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
445     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
446 
447     cmp::min(host_phys_bits, max_phys_bits)
448 }
449 
/// A virtual machine together with the managers that drive its CPUs, memory
/// and devices.
pub struct Vm {
    /// Kernel file opened from the payload configuration, if one is
    /// configured (TDX builds only).
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    /// Initramfs file opened from the payload configuration, if one is
    /// configured.
    initramfs: Option<File>,
    /// Join handles of threads owned by the VM.
    /// NOTE(review): presumably the signal handler thread — confirm against
    /// the code that pushes into this vector.
    threads: Vec<thread::JoinHandle<()>>,
    /// Shared handle to the device manager.
    device_manager: Arc<Mutex<DeviceManager>>,
    /// Shared VM configuration.
    config: Arc<Mutex<VmConfig>>,
    /// Current lifecycle state; starts as `Paused` when the VM is built from
    /// a snapshot and as `Created` otherwise.
    state: RwLock<VmState>,
    /// Shared handle to the CPU manager.
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    /// Shared handle to the memory manager.
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    /// Clock data taken from the VM snapshot when restoring; `None` when the
    /// VM is not built from a snapshot (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
    /// NUMA nodes built from the `numa` section of the configuration.
    numa_nodes: NumaNodes,
    /// The hypervisor this VM was created on.
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    /// Copy of the configuration's `gdb` flag when the `guest_debug` feature
    /// is enabled, always `false` otherwise.
    stop_on_boot: bool,
    /// Handle returned by `load_payload_async`, yielding the payload entry
    /// point; `None` when the VM is built from a snapshot.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}
471 
472 impl Vm {
    /// Signal numbers handled on behalf of the VM; currently only `SIGWINCH`.
    /// NOTE(review): presumably consumed by the VM signal handler thread —
    /// confirm against the code that registers these signals.
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
474 
475     #[allow(clippy::too_many_arguments)]
476     pub fn new_from_memory_manager(
477         config: Arc<Mutex<VmConfig>>,
478         memory_manager: Arc<Mutex<MemoryManager>>,
479         vm: Arc<dyn hypervisor::Vm>,
480         exit_evt: EventFd,
481         reset_evt: EventFd,
482         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
483         seccomp_action: &SeccompAction,
484         hypervisor: Arc<dyn hypervisor::Hypervisor>,
485         activate_evt: EventFd,
486         timestamp: Instant,
487         serial_pty: Option<PtyPair>,
488         console_pty: Option<PtyPair>,
489         debug_console_pty: Option<PtyPair>,
490         console_resize_pipe: Option<File>,
491         original_termios: Arc<Mutex<Option<termios>>>,
492         snapshot: Option<Snapshot>,
493     ) -> Result<Self> {
494         trace_scoped!("Vm::new_from_memory_manager");
495 
496         let boot_id_list = config
497             .lock()
498             .unwrap()
499             .validate()
500             .map_err(Error::ConfigValidation)?;
501 
502         #[cfg(not(feature = "igvm"))]
503         let load_payload_handle = if snapshot.is_none() {
504             Self::load_payload_async(&memory_manager, &config)?
505         } else {
506             None
507         };
508 
509         info!("Booting VM from config: {:?}", &config);
510 
511         // Create NUMA nodes based on NumaConfig.
512         let numa_nodes =
513             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
514 
515         #[cfg(feature = "tdx")]
516         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
517         #[cfg(feature = "sev_snp")]
518         let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
519         #[cfg(feature = "tdx")]
520         let force_iommu = tdx_enabled;
521         #[cfg(feature = "sev_snp")]
522         let force_iommu = sev_snp_enabled;
523         #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
524         let force_iommu = false;
525 
526         #[cfg(feature = "guest_debug")]
527         let stop_on_boot = config.lock().unwrap().gdb;
528         #[cfg(not(feature = "guest_debug"))]
529         let stop_on_boot = false;
530 
531         let memory = memory_manager.lock().unwrap().guest_memory();
532         #[cfg(target_arch = "x86_64")]
533         let io_bus = Arc::new(Bus::new());
534         let mmio_bus = Arc::new(Bus::new());
535 
536         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
537             memory,
538             #[cfg(target_arch = "x86_64")]
539             io_bus: io_bus.clone(),
540             mmio_bus: mmio_bus.clone(),
541         });
542 
543         let cpus_config = { &config.lock().unwrap().cpus.clone() };
544         let cpu_manager = cpu::CpuManager::new(
545             cpus_config,
546             vm.clone(),
547             exit_evt.try_clone().map_err(Error::EventFdClone)?,
548             reset_evt.try_clone().map_err(Error::EventFdClone)?,
549             #[cfg(feature = "guest_debug")]
550             vm_debug_evt,
551             &hypervisor,
552             seccomp_action.clone(),
553             vm_ops,
554             #[cfg(feature = "tdx")]
555             tdx_enabled,
556             &numa_nodes,
557             #[cfg(feature = "sev_snp")]
558             sev_snp_enabled,
559         )
560         .map_err(Error::CpuManager)?;
561 
562         #[cfg(target_arch = "x86_64")]
563         cpu_manager
564             .lock()
565             .unwrap()
566             .populate_cpuid(
567                 &memory_manager,
568                 &hypervisor,
569                 #[cfg(feature = "tdx")]
570                 tdx_enabled,
571             )
572             .map_err(Error::CpuManager)?;
573 
574         // Loading the igvm file is pushed down here because
575         // igvm parser needs cpu_manager to retrieve cpuid leaf.
576         // For the regular case, we can start loading early, but for
577         // igvm case we have to wait until cpu_manager is created.
578         // Currently, Microsoft Hypervisor does not provide any
579         // Hypervisor specific common cpuid, we need to call get_cpuid_values
580         // per cpuid through cpu_manager.
581         #[cfg(feature = "igvm")]
582         let load_payload_handle = if snapshot.is_none() {
583             Self::load_payload_async(
584                 &memory_manager,
585                 &config,
586                 &cpu_manager,
587                 #[cfg(feature = "sev_snp")]
588                 sev_snp_enabled,
589             )?
590         } else {
591             None
592         };
593         // The initial TDX configuration must be done before the vCPUs are
594         // created
595         #[cfg(feature = "tdx")]
596         if tdx_enabled {
597             let cpuid = cpu_manager.lock().unwrap().common_cpuid();
598             let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
599             vm.tdx_init(&cpuid, max_vcpus)
600                 .map_err(Error::InitializeTdxVm)?;
601         }
602 
603         cpu_manager
604             .lock()
605             .unwrap()
606             .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
607             .map_err(Error::CpuManager)?;
608 
609         // This initial SEV-SNP configuration must be done immediately after
610         // vCPUs are created. As part of this initialization we are
611         // transitioning the guest into secure state.
612         #[cfg(feature = "sev_snp")]
613         if sev_snp_enabled {
614             vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
615         }
616 
617         #[cfg(feature = "tdx")]
618         let dynamic = !tdx_enabled;
619         #[cfg(not(feature = "tdx"))]
620         let dynamic = true;
621 
622         let device_manager = DeviceManager::new(
623             #[cfg(target_arch = "x86_64")]
624             io_bus,
625             mmio_bus,
626             hypervisor.hypervisor_type(),
627             vm.clone(),
628             config.clone(),
629             memory_manager.clone(),
630             cpu_manager.clone(),
631             exit_evt.try_clone().map_err(Error::EventFdClone)?,
632             reset_evt,
633             seccomp_action.clone(),
634             numa_nodes.clone(),
635             &activate_evt,
636             force_iommu,
637             boot_id_list,
638             timestamp,
639             snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
640             dynamic,
641         )
642         .map_err(Error::DeviceManager)?;
643 
644         device_manager
645             .lock()
646             .unwrap()
647             .create_devices(
648                 serial_pty,
649                 console_pty,
650                 debug_console_pty,
651                 console_resize_pipe,
652                 original_termios,
653             )
654             .map_err(Error::DeviceManager)?;
655 
656         #[cfg(feature = "tdx")]
657         let kernel = config
658             .lock()
659             .unwrap()
660             .payload
661             .as_ref()
662             .map(|p| p.kernel.as_ref().map(File::open))
663             .unwrap_or_default()
664             .transpose()
665             .map_err(Error::KernelFile)?;
666 
667         let initramfs = config
668             .lock()
669             .unwrap()
670             .payload
671             .as_ref()
672             .map(|p| p.initramfs.as_ref().map(File::open))
673             .unwrap_or_default()
674             .transpose()
675             .map_err(Error::InitramfsFile)?;
676 
677         #[cfg(target_arch = "x86_64")]
678         let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
679             let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
680             vm_snapshot.clock
681         } else {
682             None
683         };
684 
685         let vm_state = if snapshot.is_some() {
686             VmState::Paused
687         } else {
688             VmState::Created
689         };
690 
691         Ok(Vm {
692             #[cfg(feature = "tdx")]
693             kernel,
694             initramfs,
695             device_manager,
696             config,
697             threads: Vec::with_capacity(1),
698             state: RwLock::new(vm_state),
699             cpu_manager,
700             memory_manager,
701             vm,
702             #[cfg(target_arch = "x86_64")]
703             saved_clock,
704             numa_nodes,
705             hypervisor,
706             stop_on_boot,
707             load_payload_handle,
708         })
709     }
710 
711     fn create_numa_nodes(
712         configs: Option<Vec<NumaConfig>>,
713         memory_manager: &Arc<Mutex<MemoryManager>>,
714     ) -> Result<NumaNodes> {
715         let mm = memory_manager.lock().unwrap();
716         let mm_zones = mm.memory_zones();
717         let mut numa_nodes = BTreeMap::new();
718 
719         if let Some(configs) = &configs {
720             for config in configs.iter() {
721                 if numa_nodes.contains_key(&config.guest_numa_id) {
722                     error!("Can't define twice the same NUMA node");
723                     return Err(Error::InvalidNumaConfig);
724                 }
725 
726                 let mut node = NumaNode::default();
727 
728                 if let Some(memory_zones) = &config.memory_zones {
729                     for memory_zone in memory_zones.iter() {
730                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
731                             node.memory_regions.extend(mm_zone.regions().clone());
732                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
733                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
734                             }
735                             node.memory_zones.push(memory_zone.clone());
736                         } else {
737                             error!("Unknown memory zone '{}'", memory_zone);
738                             return Err(Error::InvalidNumaConfig);
739                         }
740                     }
741                 }
742 
743                 if let Some(cpus) = &config.cpus {
744                     node.cpus.extend(cpus);
745                 }
746 
747                 if let Some(pci_segments) = &config.pci_segments {
748                     node.pci_segments.extend(pci_segments);
749                 }
750 
751                 if let Some(distances) = &config.distances {
752                     for distance in distances.iter() {
753                         let dest = distance.destination;
754                         let dist = distance.distance;
755 
756                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
757                             error!("Unknown destination NUMA node {}", dest);
758                             return Err(Error::InvalidNumaConfig);
759                         }
760 
761                         if node.distances.contains_key(&dest) {
762                             error!("Destination NUMA node {} has been already set", dest);
763                             return Err(Error::InvalidNumaConfig);
764                         }
765 
766                         node.distances.insert(dest, dist);
767                     }
768                 }
769 
770                 #[cfg(target_arch = "x86_64")]
771                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
772                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
773                         let mm_sections = sgx_epc_region.epc_sections();
774                         for sgx_epc_section in sgx_epc_sections.iter() {
775                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
776                                 node.sgx_epc_sections.push(mm_section.clone());
777                             } else {
778                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
779                                 return Err(Error::InvalidNumaConfig);
780                             }
781                         }
782                     } else {
783                         error!("Missing SGX EPC region");
784                         return Err(Error::InvalidNumaConfig);
785                     }
786                 }
787 
788                 numa_nodes.insert(config.guest_numa_id, node);
789             }
790         }
791 
792         Ok(numa_nodes)
793     }
794 
795     #[allow(clippy::too_many_arguments)]
796     pub fn new(
797         vm_config: Arc<Mutex<VmConfig>>,
798         exit_evt: EventFd,
799         reset_evt: EventFd,
800         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
801         seccomp_action: &SeccompAction,
802         hypervisor: Arc<dyn hypervisor::Hypervisor>,
803         activate_evt: EventFd,
804         serial_pty: Option<PtyPair>,
805         console_pty: Option<PtyPair>,
806         debug_console_pty: Option<PtyPair>,
807         console_resize_pipe: Option<File>,
808         original_termios: Arc<Mutex<Option<termios>>>,
809         snapshot: Option<Snapshot>,
810         source_url: Option<&str>,
811         prefault: Option<bool>,
812     ) -> Result<Self> {
813         trace_scoped!("Vm::new");
814 
815         let timestamp = Instant::now();
816 
817         #[cfg(feature = "tdx")]
818         let tdx_enabled = if snapshot.is_some() {
819             false
820         } else {
821             vm_config.lock().unwrap().is_tdx_enabled()
822         };
823 
824         #[cfg(feature = "sev_snp")]
825         let sev_snp_enabled = if snapshot.is_some() {
826             false
827         } else {
828             vm_config.lock().unwrap().is_sev_snp_enabled()
829         };
830 
831         let vm = Self::create_hypervisor_vm(
832             &hypervisor,
833             #[cfg(feature = "tdx")]
834             tdx_enabled,
835             #[cfg(feature = "sev_snp")]
836             sev_snp_enabled,
837         )?;
838 
839         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
840 
841         let memory_manager = if let Some(snapshot) =
842             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
843         {
844             MemoryManager::new_from_snapshot(
845                 &snapshot,
846                 vm.clone(),
847                 &vm_config.lock().unwrap().memory.clone(),
848                 source_url,
849                 prefault.unwrap(),
850                 phys_bits,
851             )
852             .map_err(Error::MemoryManager)?
853         } else {
854             #[cfg(target_arch = "x86_64")]
855             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
856 
857             MemoryManager::new(
858                 vm.clone(),
859                 &vm_config.lock().unwrap().memory.clone(),
860                 None,
861                 phys_bits,
862                 #[cfg(feature = "tdx")]
863                 tdx_enabled,
864                 None,
865                 None,
866                 #[cfg(target_arch = "x86_64")]
867                 sgx_epc_config,
868             )
869             .map_err(Error::MemoryManager)?
870         };
871 
872         Vm::new_from_memory_manager(
873             vm_config,
874             memory_manager,
875             vm,
876             exit_evt,
877             reset_evt,
878             #[cfg(feature = "guest_debug")]
879             vm_debug_evt,
880             seccomp_action,
881             hypervisor,
882             activate_evt,
883             timestamp,
884             serial_pty,
885             console_pty,
886             debug_console_pty,
887             console_resize_pipe,
888             original_termios,
889             snapshot,
890         )
891     }
892 
    /// Creates the hypervisor-level VM object and applies the initial
    /// architecture-specific setup.
    ///
    /// Which creation call is compiled in depends on the enabled features:
    /// with `tdx` the VM type is derived from `tdx_enabled`, otherwise with
    /// `sev_snp` it is derived from `sev_snp_enabled`, and without either
    /// feature a plain `create_vm()` is used. Because the selection is an
    /// `if`/`else if` chain, `sev_snp_enabled` is not consulted when the `tdx`
    /// feature is compiled in.
    ///
    /// On x86_64 the identity map address and TSS address are then set and
    /// split IRQ is enabled.
    ///
    /// # Panics
    ///
    /// Every hypervisor call in this function is unwrapped, so any failure
    /// (missing extensions, VM creation, x86_64 setup) panics instead of
    /// being returned as an `Err`.
    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        hypervisor.check_required_extensions().unwrap();

        // The `let vm` binding introduced by the selected branch is used by
        // the code following the macro invocation.
        cfg_if::cfg_if! {
            if #[cfg(feature = "tdx")] {
                // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true
                // Otherwise KVM_X86_LEGACY_VM: 0
                // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM
                let vm = hypervisor
                    .create_vm_with_type(u64::from(tdx_enabled))
                    .unwrap();
            } else if #[cfg(feature = "sev_snp")] {
                // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true
                // Otherwise SEV_SNP_DISABLED: 0
                // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false
                let vm = hypervisor
                    .create_vm_with_type(u64::from(sev_snp_enabled))
                    .unwrap();
            } else {
                let vm = hypervisor.create_vm().unwrap();
            }
        }

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }
930 
931     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
932         let initramfs = self.initramfs.as_mut().unwrap();
933         let size: usize = initramfs
934             .seek(SeekFrom::End(0))
935             .map_err(|_| Error::InitramfsLoad)?
936             .try_into()
937             .unwrap();
938         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
939 
940         let address =
941             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
942         let address = GuestAddress(address);
943 
944         guest_mem
945             .read_volatile_from(address, initramfs, size)
946             .map_err(|_| Error::InitramfsLoad)?;
947 
948         info!("Initramfs loaded: address = 0x{:x}", address.0);
949         Ok(arch::InitramfsConfig { address, size })
950     }
951 
952     pub fn generate_cmdline(
953         payload: &PayloadConfig,
954         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
955     ) -> Result<Cmdline> {
956         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
957         if let Some(s) = payload.cmdline.as_ref() {
958             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
959         }
960 
961         #[cfg(target_arch = "aarch64")]
962         for entry in device_manager.lock().unwrap().cmdline_additions() {
963             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
964         }
965         Ok(cmdline)
966     }
967 
968     #[cfg(target_arch = "aarch64")]
969     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
970         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
971         let mem = uefi_flash.memory();
972         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
973             .map_err(Error::UefiLoad)?;
974         Ok(())
975     }
976 
977     #[cfg(target_arch = "aarch64")]
978     fn load_kernel(
979         firmware: Option<File>,
980         kernel: Option<File>,
981         memory_manager: Arc<Mutex<MemoryManager>>,
982     ) -> Result<EntryPoint> {
983         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
984         let mem = guest_memory.memory();
985         let entry_addr = match (firmware, kernel) {
986             (None, Some(mut kernel)) => {
987                 match linux_loader::loader::pe::PE::load(
988                     mem.deref(),
989                     Some(arch::layout::KERNEL_START),
990                     &mut kernel,
991                     None,
992                 ) {
993                     Ok(entry_addr) => entry_addr.kernel_load,
994                     // Try to load the binary as kernel PE file at first.
995                     // If failed, retry to load it as UEFI binary.
996                     // As the UEFI binary is formatless, it must be the last option to try.
997                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
998                         Self::load_firmware(&kernel, memory_manager)?;
999                         arch::layout::UEFI_START
1000                     }
1001                     Err(e) => {
1002                         return Err(Error::KernelLoad(e));
1003                     }
1004                 }
1005             }
1006             (Some(firmware), None) => {
1007                 Self::load_firmware(&firmware, memory_manager)?;
1008                 arch::layout::UEFI_START
1009             }
1010             _ => return Err(Error::InvalidPayload),
1011         };
1012 
1013         Ok(EntryPoint { entry_addr })
1014     }
1015 
    /// Loads an IGVM file through `igvm_loader::load_igvm` and derives the
    /// guest entry point from the loader result.
    ///
    /// With the `sev_snp` feature compiled in and SEV-SNP enabled on the CPU
    /// manager, the entry address is taken from `res.vmsa_gpa`; in every other
    /// case it is taken from `res.vmsa.rip`. The returned `EntryPoint` never
    /// carries a setup header.
    ///
    /// # Errors
    ///
    /// Returns `Error::IgvmLoad` if the loader fails.
    #[cfg(feature = "igvm")]
    fn load_igvm(
        igvm: File,
        memory_manager: Arc<Mutex<MemoryManager>>,
        cpu_manager: Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] host_data: &Option<String>,
    ) -> Result<EntryPoint> {
        // NOTE(review): an empty string is passed as the loader's fourth
        // argument; its meaning is defined by `igvm_loader::load_igvm` — confirm.
        let res = igvm_loader::load_igvm(
            &igvm,
            memory_manager,
            cpu_manager.clone(),
            "",
            #[cfg(feature = "sev_snp")]
            host_data,
        )
        .map_err(Error::IgvmLoad)?;

        // The `entry_point` binding introduced by the selected branch is
        // returned after the macro invocation.
        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
                    EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
                } else {
                    EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
                };
            } else {
               let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
            }
        };
        Ok(entry_point)
    }
1046 
1047     #[cfg(target_arch = "x86_64")]
1048     fn load_kernel(
1049         mut kernel: File,
1050         cmdline: Option<Cmdline>,
1051         memory_manager: Arc<Mutex<MemoryManager>>,
1052     ) -> Result<EntryPoint> {
1053         info!("Loading kernel");
1054 
1055         let mem = {
1056             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1057             guest_memory.memory()
1058         };
1059 
1060         // Try ELF binary with PVH boot.
1061         let entry_addr = linux_loader::loader::elf::Elf::load(
1062             mem.deref(),
1063             None,
1064             &mut kernel,
1065             Some(arch::layout::HIGH_RAM_START),
1066         )
1067         // Try loading kernel as bzImage.
1068         .or_else(|_| {
1069             BzImage::load(
1070                 mem.deref(),
1071                 None,
1072                 &mut kernel,
1073                 Some(arch::layout::HIGH_RAM_START),
1074             )
1075         })
1076         .map_err(Error::KernelLoad)?;
1077 
1078         if let Some(cmdline) = cmdline {
1079             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1080                 .map_err(Error::LoadCmdLine)?;
1081         }
1082 
1083         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1084             // Use the PVH kernel entry point to boot the guest
1085             info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1086             Ok(EntryPoint {
1087                 entry_addr,
1088                 setup_header: None,
1089             })
1090         } else if entry_addr.setup_header.is_some() {
1091             // Use the bzImage 32bit entry point to boot the guest
1092             info!(
1093                 "bzImage kernel loaded: entry_addr = 0x{:x}",
1094                 entry_addr.kernel_load.0
1095             );
1096             Ok(EntryPoint {
1097                 entry_addr: entry_addr.kernel_load,
1098                 setup_header: entry_addr.setup_header,
1099             })
1100         } else {
1101             Err(Error::KernelMissingPvhHeader)
1102         }
1103     }
1104 
1105     #[cfg(target_arch = "x86_64")]
1106     fn load_payload(
1107         payload: &PayloadConfig,
1108         memory_manager: Arc<Mutex<MemoryManager>>,
1109         #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1110         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1111     ) -> Result<EntryPoint> {
1112         trace_scoped!("load_payload");
1113         #[cfg(feature = "igvm")]
1114         {
1115             if let Some(_igvm_file) = &payload.igvm {
1116                 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
1117                 #[cfg(feature = "sev_snp")]
1118                 if sev_snp_enabled {
1119                     return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
1120                 }
1121                 #[cfg(not(feature = "sev_snp"))]
1122                 return Self::load_igvm(igvm, memory_manager, cpu_manager);
1123             }
1124         }
1125         match (
1126             &payload.firmware,
1127             &payload.kernel,
1128             &payload.initramfs,
1129             &payload.cmdline,
1130         ) {
1131             (Some(firmware), None, None, None) => {
1132                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1133                 Self::load_kernel(firmware, None, memory_manager)
1134             }
1135             (None, Some(kernel), _, _) => {
1136                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1137                 let cmdline = Self::generate_cmdline(payload)?;
1138                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1139             }
1140             _ => Err(Error::InvalidPayload),
1141         }
1142     }
1143 
1144     #[cfg(target_arch = "aarch64")]
1145     fn load_payload(
1146         payload: &PayloadConfig,
1147         memory_manager: Arc<Mutex<MemoryManager>>,
1148     ) -> Result<EntryPoint> {
1149         match (&payload.firmware, &payload.kernel) {
1150             (Some(firmware), None) => {
1151                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1152                 Self::load_kernel(Some(firmware), None, memory_manager)
1153             }
1154             (None, Some(kernel)) => {
1155                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1156                 Self::load_kernel(None, Some(kernel), memory_manager)
1157             }
1158             _ => Err(Error::InvalidPayload),
1159         }
1160     }
1161 
1162     fn load_payload_async(
1163         memory_manager: &Arc<Mutex<MemoryManager>>,
1164         config: &Arc<Mutex<VmConfig>>,
1165         #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
1166         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1167     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1168         // Kernel with TDX is loaded in a different manner
1169         #[cfg(feature = "tdx")]
1170         if config.lock().unwrap().is_tdx_enabled() {
1171             return Ok(None);
1172         }
1173 
1174         config
1175             .lock()
1176             .unwrap()
1177             .payload
1178             .as_ref()
1179             .map(|payload| {
1180                 let memory_manager = memory_manager.clone();
1181                 let payload = payload.clone();
1182                 #[cfg(feature = "igvm")]
1183                 let cpu_manager = cpu_manager.clone();
1184 
1185                 std::thread::Builder::new()
1186                     .name("payload_loader".into())
1187                     .spawn(move || {
1188                         Self::load_payload(
1189                             &payload,
1190                             memory_manager,
1191                             #[cfg(feature = "igvm")]
1192                             cpu_manager,
1193                             #[cfg(feature = "sev_snp")]
1194                             sev_snp_enabled,
1195                         )
1196                     })
1197                     .map_err(Error::KernelLoadThreadSpawn)
1198             })
1199             .transpose()
1200     }
1201 
1202     #[cfg(target_arch = "x86_64")]
1203     fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
1204         trace_scoped!("configure_system");
1205         info!("Configuring system");
1206         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1207 
1208         let initramfs_config = match self.initramfs {
1209             Some(_) => Some(self.load_initramfs(&mem)?),
1210             None => None,
1211         };
1212 
1213         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1214         let rsdp_addr = Some(rsdp_addr);
1215         let sgx_epc_region = self
1216             .memory_manager
1217             .lock()
1218             .unwrap()
1219             .sgx_epc_region()
1220             .as_ref()
1221             .cloned();
1222 
1223         let serial_number = self
1224             .config
1225             .lock()
1226             .unwrap()
1227             .platform
1228             .as_ref()
1229             .and_then(|p| p.serial_number.clone());
1230 
1231         let uuid = self
1232             .config
1233             .lock()
1234             .unwrap()
1235             .platform
1236             .as_ref()
1237             .and_then(|p| p.uuid.clone());
1238 
1239         let oem_strings = self
1240             .config
1241             .lock()
1242             .unwrap()
1243             .platform
1244             .as_ref()
1245             .and_then(|p| p.oem_strings.clone());
1246 
1247         let oem_strings = oem_strings
1248             .as_deref()
1249             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1250 
1251         let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1252 
1253         arch::configure_system(
1254             &mem,
1255             arch::layout::CMDLINE_START,
1256             arch::layout::CMDLINE_MAX_SIZE,
1257             &initramfs_config,
1258             boot_vcpus,
1259             entry_addr.setup_header,
1260             rsdp_addr,
1261             sgx_epc_region,
1262             serial_number.as_deref(),
1263             uuid.as_deref(),
1264             oem_strings.as_deref(),
1265             topology,
1266         )
1267         .map_err(Error::ConfigureSystem)?;
1268         Ok(())
1269     }
1270 
1271     #[cfg(target_arch = "aarch64")]
1272     fn configure_system(
1273         &mut self,
1274         _rsdp_addr: GuestAddress,
1275         _entry_addr: EntryPoint,
1276     ) -> Result<()> {
1277         let cmdline = Self::generate_cmdline(
1278             self.config.lock().unwrap().payload.as_ref().unwrap(),
1279             &self.device_manager,
1280         )?;
1281         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1282         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1283         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1284         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1285         let initramfs_config = match self.initramfs {
1286             Some(_) => Some(self.load_initramfs(&mem)?),
1287             None => None,
1288         };
1289 
1290         let device_info = &self
1291             .device_manager
1292             .lock()
1293             .unwrap()
1294             .get_device_info()
1295             .clone();
1296 
1297         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1298             let pci_space = PciSpaceInfo {
1299                 pci_segment_id: pci_segment.id,
1300                 mmio_config_address: pci_segment.mmio_config_address,
1301                 pci_device_space_start: pci_segment.start_of_mem64_area,
1302                 pci_device_space_size: pci_segment.end_of_mem64_area
1303                     - pci_segment.start_of_mem64_area
1304                     + 1,
1305             };
1306             pci_space_info.push(pci_space);
1307         }
1308 
1309         let virtio_iommu_bdf = self
1310             .device_manager
1311             .lock()
1312             .unwrap()
1313             .iommu_attached_devices()
1314             .as_ref()
1315             .map(|(v, _)| *v);
1316 
1317         let vgic = self
1318             .device_manager
1319             .lock()
1320             .unwrap()
1321             .get_interrupt_controller()
1322             .unwrap()
1323             .lock()
1324             .unwrap()
1325             .get_vgic()
1326             .map_err(|_| {
1327                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1328                     arch::aarch64::Error::SetupGic,
1329                 ))
1330             })?;
1331 
1332         // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
1333         let pmu_supported = self
1334             .cpu_manager
1335             .lock()
1336             .unwrap()
1337             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1338             .map_err(|_| {
1339                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1340                     arch::aarch64::Error::VcpuInitPmu,
1341                 ))
1342             })?;
1343 
1344         arch::configure_system(
1345             &mem,
1346             cmdline.as_cstring().unwrap().to_str().unwrap(),
1347             vcpu_mpidrs,
1348             vcpu_topology,
1349             device_info,
1350             &initramfs_config,
1351             &pci_space_info,
1352             virtio_iommu_bdf.map(|bdf| bdf.into()),
1353             &vgic,
1354             &self.numa_nodes,
1355             pmu_supported,
1356         )
1357         .map_err(Error::ConfigureSystem)?;
1358 
1359         Ok(())
1360     }
1361 
    /// Returns the serial PTY pair reported by the device manager, if any.
    ///
    /// # Panics
    ///
    /// Panics if the device manager mutex is poisoned.
    pub fn serial_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().serial_pty()
    }
1365 
    /// Returns the console PTY pair reported by the device manager, if any.
    ///
    /// # Panics
    ///
    /// Panics if the device manager mutex is poisoned.
    pub fn console_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().console_pty()
    }
1369 
    /// Returns the debug console PTY pair reported by the device manager, if
    /// any.
    ///
    /// # Panics
    ///
    /// Panics if the device manager mutex is poisoned.
    pub fn debug_console_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().debug_console_pty()
    }
1373 
    /// Returns the console resize pipe reported by the device manager, if any.
    ///
    /// # Panics
    ///
    /// Panics if the device manager mutex is poisoned.
    pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
        self.device_manager.lock().unwrap().console_resize_pipe()
    }
1377 
1378     pub fn shutdown(&mut self) -> Result<()> {
1379         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1380         let new_state = VmState::Shutdown;
1381 
1382         state.valid_transition(new_state)?;
1383 
1384         // Wake up the DeviceManager threads so they will get terminated cleanly
1385         self.device_manager
1386             .lock()
1387             .unwrap()
1388             .resume()
1389             .map_err(Error::Resume)?;
1390 
1391         self.cpu_manager
1392             .lock()
1393             .unwrap()
1394             .shutdown()
1395             .map_err(Error::CpuManager)?;
1396 
1397         // Wait for all the threads to finish
1398         for thread in self.threads.drain(..) {
1399             thread.join().map_err(Error::ThreadCleanup)?
1400         }
1401         *state = new_state;
1402 
1403         Ok(())
1404     }
1405 
1406     pub fn resize(
1407         &mut self,
1408         desired_vcpus: Option<u8>,
1409         desired_memory: Option<u64>,
1410         desired_balloon: Option<u64>,
1411     ) -> Result<()> {
1412         event!("vm", "resizing");
1413 
1414         if let Some(desired_vcpus) = desired_vcpus {
1415             if self
1416                 .cpu_manager
1417                 .lock()
1418                 .unwrap()
1419                 .resize(desired_vcpus)
1420                 .map_err(Error::CpuManager)?
1421             {
1422                 self.device_manager
1423                     .lock()
1424                     .unwrap()
1425                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1426                     .map_err(Error::DeviceManager)?;
1427             }
1428             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1429         }
1430 
1431         if let Some(desired_memory) = desired_memory {
1432             let new_region = self
1433                 .memory_manager
1434                 .lock()
1435                 .unwrap()
1436                 .resize(desired_memory)
1437                 .map_err(Error::MemoryManager)?;
1438 
1439             let memory_config = &mut self.config.lock().unwrap().memory;
1440 
1441             if let Some(new_region) = &new_region {
1442                 self.device_manager
1443                     .lock()
1444                     .unwrap()
1445                     .update_memory(new_region)
1446                     .map_err(Error::DeviceManager)?;
1447 
1448                 match memory_config.hotplug_method {
1449                     HotplugMethod::Acpi => {
1450                         self.device_manager
1451                             .lock()
1452                             .unwrap()
1453                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1454                             .map_err(Error::DeviceManager)?;
1455                     }
1456                     HotplugMethod::VirtioMem => {}
1457                 }
1458             }
1459 
1460             // We update the VM config regardless of the actual guest resize
1461             // operation result (happened or not), so that if the VM reboots
1462             // it will be running with the last configure memory size.
1463             match memory_config.hotplug_method {
1464                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1465                 HotplugMethod::VirtioMem => {
1466                     if desired_memory > memory_config.size {
1467                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1468                     } else {
1469                         memory_config.hotplugged_size = None;
1470                     }
1471                 }
1472             }
1473         }
1474 
1475         if let Some(desired_balloon) = desired_balloon {
1476             self.device_manager
1477                 .lock()
1478                 .unwrap()
1479                 .resize_balloon(desired_balloon)
1480                 .map_err(Error::DeviceManager)?;
1481 
1482             // Update the configuration value for the balloon size to ensure
1483             // a reboot would use the right value.
1484             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1485                 balloon_config.size = desired_balloon;
1486             }
1487         }
1488 
1489         event!("vm", "resized");
1490 
1491         Ok(())
1492     }
1493 
1494     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1495         let memory_config = &mut self.config.lock().unwrap().memory;
1496 
1497         if let Some(zones) = &mut memory_config.zones {
1498             for zone in zones.iter_mut() {
1499                 if zone.id == id {
1500                     if desired_memory >= zone.size {
1501                         let hotplugged_size = desired_memory - zone.size;
1502                         self.memory_manager
1503                             .lock()
1504                             .unwrap()
1505                             .resize_zone(&id, desired_memory - zone.size)
1506                             .map_err(Error::MemoryManager)?;
1507                         // We update the memory zone config regardless of the
1508                         // actual 'resize-zone' operation result (happened or
1509                         // not), so that if the VM reboots it will be running
1510                         // with the last configured memory zone size.
1511                         zone.hotplugged_size = Some(hotplugged_size);
1512 
1513                         return Ok(());
1514                     } else {
1515                         error!(
1516                             "Invalid to ask less ({}) than boot RAM ({}) for \
1517                             this memory zone",
1518                             desired_memory, zone.size,
1519                         );
1520                         return Err(Error::ResizeZone);
1521                     }
1522                 }
1523             }
1524         }
1525 
1526         error!("Could not find the memory zone {} for the resize", id);
1527         Err(Error::ResizeZone)
1528     }
1529 
1530     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1531         let pci_device_info = self
1532             .device_manager
1533             .lock()
1534             .unwrap()
1535             .add_device(&mut device_cfg)
1536             .map_err(Error::DeviceManager)?;
1537 
1538         // Update VmConfig by adding the new device. This is important to
1539         // ensure the device would be created in case of a reboot.
1540         {
1541             let mut config = self.config.lock().unwrap();
1542             add_to_config(&mut config.devices, device_cfg);
1543         }
1544 
1545         self.device_manager
1546             .lock()
1547             .unwrap()
1548             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1549             .map_err(Error::DeviceManager)?;
1550 
1551         Ok(pci_device_info)
1552     }
1553 
1554     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1555         let pci_device_info = self
1556             .device_manager
1557             .lock()
1558             .unwrap()
1559             .add_user_device(&mut device_cfg)
1560             .map_err(Error::DeviceManager)?;
1561 
1562         // Update VmConfig by adding the new device. This is important to
1563         // ensure the device would be created in case of a reboot.
1564         {
1565             let mut config = self.config.lock().unwrap();
1566             add_to_config(&mut config.user_devices, device_cfg);
1567         }
1568 
1569         self.device_manager
1570             .lock()
1571             .unwrap()
1572             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1573             .map_err(Error::DeviceManager)?;
1574 
1575         Ok(pci_device_info)
1576     }
1577 
    /// Removes the device identified by `id` through the device manager.
    ///
    /// On success the device is also removed from the VM configuration and a
    /// `PCI_DEVICES_CHANGED` hotplug notification is sent.
    ///
    /// # Errors
    ///
    /// Returns `Error::DeviceManager` if the removal or the notification
    /// fails.
    pub fn remove_device(&mut self, id: String) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .remove_device(id.clone())
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by removing the device. This is important to
        // ensure the device would not be created in case of a reboot.
        self.config.lock().unwrap().remove_device(&id);

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;
        Ok(())
    }
1596 
1597     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1598         let pci_device_info = self
1599             .device_manager
1600             .lock()
1601             .unwrap()
1602             .add_disk(&mut disk_cfg)
1603             .map_err(Error::DeviceManager)?;
1604 
1605         // Update VmConfig by adding the new device. This is important to
1606         // ensure the device would be created in case of a reboot.
1607         {
1608             let mut config = self.config.lock().unwrap();
1609             add_to_config(&mut config.disks, disk_cfg);
1610         }
1611 
1612         self.device_manager
1613             .lock()
1614             .unwrap()
1615             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1616             .map_err(Error::DeviceManager)?;
1617 
1618         Ok(pci_device_info)
1619     }
1620 
1621     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1622         let pci_device_info = self
1623             .device_manager
1624             .lock()
1625             .unwrap()
1626             .add_fs(&mut fs_cfg)
1627             .map_err(Error::DeviceManager)?;
1628 
1629         // Update VmConfig by adding the new device. This is important to
1630         // ensure the device would be created in case of a reboot.
1631         {
1632             let mut config = self.config.lock().unwrap();
1633             add_to_config(&mut config.fs, fs_cfg);
1634         }
1635 
1636         self.device_manager
1637             .lock()
1638             .unwrap()
1639             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1640             .map_err(Error::DeviceManager)?;
1641 
1642         Ok(pci_device_info)
1643     }
1644 
1645     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1646         let pci_device_info = self
1647             .device_manager
1648             .lock()
1649             .unwrap()
1650             .add_pmem(&mut pmem_cfg)
1651             .map_err(Error::DeviceManager)?;
1652 
1653         // Update VmConfig by adding the new device. This is important to
1654         // ensure the device would be created in case of a reboot.
1655         {
1656             let mut config = self.config.lock().unwrap();
1657             add_to_config(&mut config.pmem, pmem_cfg);
1658         }
1659 
1660         self.device_manager
1661             .lock()
1662             .unwrap()
1663             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1664             .map_err(Error::DeviceManager)?;
1665 
1666         Ok(pci_device_info)
1667     }
1668 
1669     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1670         let pci_device_info = self
1671             .device_manager
1672             .lock()
1673             .unwrap()
1674             .add_net(&mut net_cfg)
1675             .map_err(Error::DeviceManager)?;
1676 
1677         // Update VmConfig by adding the new device. This is important to
1678         // ensure the device would be created in case of a reboot.
1679         {
1680             let mut config = self.config.lock().unwrap();
1681             add_to_config(&mut config.net, net_cfg);
1682         }
1683 
1684         self.device_manager
1685             .lock()
1686             .unwrap()
1687             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1688             .map_err(Error::DeviceManager)?;
1689 
1690         Ok(pci_device_info)
1691     }
1692 
1693     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1694         let pci_device_info = self
1695             .device_manager
1696             .lock()
1697             .unwrap()
1698             .add_vdpa(&mut vdpa_cfg)
1699             .map_err(Error::DeviceManager)?;
1700 
1701         // Update VmConfig by adding the new device. This is important to
1702         // ensure the device would be created in case of a reboot.
1703         {
1704             let mut config = self.config.lock().unwrap();
1705             add_to_config(&mut config.vdpa, vdpa_cfg);
1706         }
1707 
1708         self.device_manager
1709             .lock()
1710             .unwrap()
1711             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1712             .map_err(Error::DeviceManager)?;
1713 
1714         Ok(pci_device_info)
1715     }
1716 
1717     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1718         let pci_device_info = self
1719             .device_manager
1720             .lock()
1721             .unwrap()
1722             .add_vsock(&mut vsock_cfg)
1723             .map_err(Error::DeviceManager)?;
1724 
1725         // Update VmConfig by adding the new device. This is important to
1726         // ensure the device would be created in case of a reboot.
1727         {
1728             let mut config = self.config.lock().unwrap();
1729             config.vsock = Some(vsock_cfg);
1730         }
1731 
1732         self.device_manager
1733             .lock()
1734             .unwrap()
1735             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1736             .map_err(Error::DeviceManager)?;
1737 
1738         Ok(pci_device_info)
1739     }
1740 
1741     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1742         Ok(self.device_manager.lock().unwrap().counters())
1743     }
1744 
1745     #[cfg(feature = "tdx")]
1746     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1747         use arch::x86_64::tdx::*;
1748 
1749         let firmware_path = self
1750             .config
1751             .lock()
1752             .unwrap()
1753             .payload
1754             .as_ref()
1755             .unwrap()
1756             .firmware
1757             .clone()
1758             .ok_or(Error::TdxFirmwareMissing)?;
1759         // The TDVF file contains a table of section as well as code
1760         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1761 
1762         // For all the sections allocate some RAM backing them
1763         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1764     }
1765 
1766     #[cfg(feature = "tdx")]
1767     fn hob_memory_resources(
1768         mut sorted_sections: Vec<TdvfSection>,
1769         guest_memory: &GuestMemoryMmap,
1770     ) -> Vec<(u64, u64, bool)> {
1771         let mut list = Vec::new();
1772 
1773         let mut current_section = sorted_sections.pop();
1774 
1775         // RAM regions interleaved with TDVF sections
1776         let mut next_start_addr = 0;
1777         for region in guest_memory.iter() {
1778             let region_start = region.start_addr().0;
1779             let region_end = region.last_addr().0;
1780             if region_start > next_start_addr {
1781                 next_start_addr = region_start;
1782             }
1783 
1784             loop {
1785                 let (start, size, ram) = if let Some(section) = &current_section {
1786                     if section.address <= next_start_addr {
1787                         (section.address, section.size, false)
1788                     } else {
1789                         let last_addr = std::cmp::min(section.address - 1, region_end);
1790                         (next_start_addr, last_addr - next_start_addr + 1, true)
1791                     }
1792                 } else {
1793                     (next_start_addr, region_end - next_start_addr + 1, true)
1794                 };
1795 
1796                 list.push((start, size, ram));
1797 
1798                 if !ram {
1799                     current_section = sorted_sections.pop();
1800                 }
1801 
1802                 next_start_addr = start + size;
1803 
1804                 if region_start > next_start_addr {
1805                     next_start_addr = region_start;
1806                 }
1807 
1808                 if next_start_addr > region_end {
1809                     break;
1810                 }
1811             }
1812         }
1813 
1814         // Once all the interleaved sections have been processed, let's simply
1815         // pull the remaining ones.
1816         if let Some(section) = current_section {
1817             list.push((section.address, section.size, false));
1818         }
1819         while let Some(section) = sorted_sections.pop() {
1820             list.push((section.address, section.size, false));
1821         }
1822 
1823         list
1824     }
1825 
    /// Allocates and fills the guest memory backing the TDVF `sections`, then
    /// builds the TD HOB.
    ///
    /// Steps, in order:
    /// 1. add a RAM region for every section that is not already inside the
    ///    boot guest memory;
    /// 2. copy firmware volumes, the payload and the payload parameters into
    ///    guest memory, remembering the address of the `TdHob` section;
    /// 3. write the HOB: memory resources, MMIO resources, ACPI tables and,
    ///    when a payload was copied, the payload info.
    ///
    /// `guid_found` is forwarded unchanged to `TdHob::add_memory_resource`.
    ///
    /// Returns the address of the `TdHob` section.
    ///
    /// # Panics
    ///
    /// Panics if the configuration has no payload, if a guest memory copy
    /// fails (the `unwrap()` calls below), or if `sections` contains no
    /// `TdHob` section.
    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(
        &mut self,
        sections: &[TdvfSection],
        guid_found: bool,
    ) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of section as well as code
        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut payload_info = None;
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    // NOTE(review): the returned byte count is discarded, so a
                    // short read would go unnoticed; a failure panics instead
                    // of being returned as an error.
                    mem.read_volatile_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    // Nothing is copied here; the HOB is generated further down
                    // at this address.
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    // The payload is only copied when a kernel file is set.
                    if let Some(payload_file) = self.kernel.as_mut() {
                        // Seeking to the end yields the size of the file.
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        // Presumably the offset of the Linux boot protocol
                        // setup header within a bzImage — TODO confirm.
                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_file
                            .read_volatile(&mut payload_header.as_bytes())
                            .unwrap();

                        // Presumably the "HdrS" magic of the setup header —
                        // TODO confirm.
                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Reject headers older than version 0x0200 and images
                        // without bit 0 of `loadflags` set.
                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Copy the whole file, starting from its beginning.
                        payload_file.rewind().map_err(Error::LoadPayload)?;
                        mem.read_volatile_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    info!("Copying payload parameters to guest memory");
                    let cmdline = Self::generate_cmdline(
                        self.config.lock().unwrap().payload.as_ref().unwrap(),
                    )?;
                    // The command line is written as a NUL-terminated string.
                    mem.write_slice(
                        cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                        GuestAddress(section.address),
                    )
                    .unwrap();
                }
                // Other section types need no content copied.
                _ => {}
            }
        }

        // Generate HOB
        // NOTE(review): this panics when no `TdHob` section was present.
        let mut hob = TdHob::start(hob_offset.unwrap());

        // Keep only the `TempMem` sections, ordered by descending address,
        // because `hob_memory_resources` pops them from the end of the vector.
        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();

        for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
            hob.add_memory_resource(&mem, start, size, ram, guid_found)
                .map_err(Error::PopulateHob)?;
        }

        // MMIO regions
        // First the range from MEM_32BIT_DEVICES_START up to APIC_START.
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        // Then the device area reported by the memory manager.
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.

        for acpi_table in crate::acpi::create_acpi_tables_tdx(
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        ) {
            hob.add_acpi_table(&mem, acpi_table.as_slice())
                .map_err(Error::PopulateHob)?;
        }

        // If a payload info has been created, let's insert it into the HOB.
        if let Some(payload_info) = payload_info {
            hob.add_payload(&mem, payload_info)
                .map_err(Error::PopulateHob)?;
        }

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }
2015 
2016     #[cfg(feature = "tdx")]
2017     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
2018         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2019         let mem = guest_memory.memory();
2020 
2021         for section in sections {
2022             self.vm
2023                 .tdx_init_memory_region(
2024                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
2025                     section.address,
2026                     section.size,
2027                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
2028                     section.attributes == 1,
2029                 )
2030                 .map_err(Error::InitializeTdxMemoryRegion)?;
2031         }
2032 
2033         Ok(())
2034     }
2035 
2036     // Creates ACPI tables
2037     // In case of TDX being used, this is a no-op since the tables will be
2038     // created and passed when populating the HOB.
2039 
2040     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2041         #[cfg(feature = "tdx")]
2042         if self.config.lock().unwrap().is_tdx_enabled() {
2043             return None;
2044         }
2045         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2046         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2047         let rsdp_addr = crate::acpi::create_acpi_tables(
2048             &mem,
2049             &self.device_manager,
2050             &self.cpu_manager,
2051             &self.memory_manager,
2052             &self.numa_nodes,
2053             tpm_enabled,
2054         );
2055         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2056 
2057         Some(rsdp_addr)
2058     }
2059 
2060     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2061         trace_scoped!("entry_point");
2062 
2063         self.load_payload_handle
2064             .take()
2065             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2066             .transpose()
2067     }
2068 
    /// Boots the VM, or resumes it if it is currently paused.
    ///
    /// For a real boot this validates the state transition, creates the ACPI
    /// tables, waits for the payload to be loaded, configures the vCPUs,
    /// performs the TDX specific setup when enabled, allocates the address
    /// space (x86_64 only) and finally starts the boot vCPUs before recording
    /// the new state.
    ///
    /// The target state is `VmState::BreakPoint` when `stop_on_boot` is set
    /// and `VmState::Running` otherwise.
    ///
    /// # Errors
    ///
    /// Returns the error of the first step that fails, including an invalid
    /// state transition and `Error::PoisonedState` when the state lock cannot
    /// be taken.
    pub fn boot(&mut self) -> Result<()> {
        trace_scoped!("Vm::boot");
        let current_state = self.get_state()?;
        // Booting a paused VM is simply a resume.
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        let new_state = if self.stop_on_boot {
            VmState::BreakPoint
        } else {
            VmState::Running
        };
        current_state.valid_transition(new_state)?;

        // Do earlier to parallelise with loading kernel
        #[cfg(target_arch = "x86_64")]
        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
                let rsdp_addr = if sev_snp_enabled {
                    // In case of SEV-SNP guest ACPI tables are provided via
                    // IGVM. So skip the creation of ACPI tables and set the
                    // rsdp addr to None.
                    None
                } else {
                    self.create_acpi_tables()
                };
            } else {
                let rsdp_addr = self.create_acpi_tables();
            }
        }

        // Load kernel synchronously or if asynchronous then wait for load to
        // finish.
        let entry_point = self.entry_point()?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        // Configure the vcpus that have been created
        let vcpus = self.cpu_manager.lock().unwrap().vcpus();
        for vcpu in vcpus {
            let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
            // The boot setup is only passed along when an entry point exists.
            let boot_setup = entry_point.map(|e| (e, guest_memory));
            self.cpu_manager
                .lock()
                .unwrap()
                .configure_vcpu(vcpu, boot_setup)
                .map_err(Error::CpuManager)?;
        }

        #[cfg(feature = "tdx")]
        let (sections, guid_found) = if tdx_enabled {
            self.extract_tdvf_sections()?
        } else {
            (Vec::new(), false)
        };

        // Configuring the TDX regions requires that the vCPUs are created.
        #[cfg(feature = "tdx")]
        let hob_address = if tdx_enabled {
            // TDX sections are written to memory.
            self.populate_tdx_sections(&sections, guid_found)?
        } else {
            None
        };

        // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
        // available after they are configured
        #[cfg(target_arch = "aarch64")]
        let rsdp_addr = self.create_acpi_tables();

        // Configure shared state based on loaded kernel
        entry_point
            .map(|entry_point| {
                // Safe to unwrap rsdp_addr as we know it can't be None when
                // the entry_point is Some.
                self.configure_system(rsdp_addr.unwrap(), entry_point)
            })
            .transpose()?;

        #[cfg(target_arch = "x86_64")]
        // Note: For x86, always call this function before invoking start boot vcpus.
        // Otherwise guest would fail to boot because we haven't created the
        // userspace mappings to update the hypervisor about the memory mappings.
        // These mappings must be created before we start the vCPU threads for
        // the very first time.
        self.memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            // Let the hypervisor know which memory ranges are shared with the
            // guest. This prevents the guest from ignoring/discarding memory
            // regions provided by the host.
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured TDX setup is complete
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        // Resume the vm for MSHV
        if current_state == VmState::Created {
            self.vm.resume().map_err(Error::ResumeVm)?;
        }

        // The vCPUs are started paused when the target state is BreakPoint.
        // NOTE(review): meaning of the boolean inferred from the call site —
        // confirm against `start_boot_vcpus`.
        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus(new_state == VmState::BreakPoint)
            .map_err(Error::CpuManager)?;

        // Only record the new state once every step above has succeeded.
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        Ok(())
    }
2194 
2195     pub fn restore(&mut self) -> Result<()> {
2196         event!("vm", "restoring");
2197 
2198         #[cfg(target_arch = "x86_64")]
2199         // Note: For x86, always call this function before invoking start boot vcpus.
2200         // Otherwise guest would fail to boot because we haven't created the
2201         // userspace mappings to update the hypervisor about the memory mappings.
2202         // These mappings must be created before we start the vCPU threads for
2203         // the very first time for the restored VM.
2204         self.memory_manager
2205             .lock()
2206             .unwrap()
2207             .allocate_address_space()
2208             .map_err(Error::MemoryManager)?;
2209 
2210         // Now we can start all vCPUs from here.
2211         self.cpu_manager
2212             .lock()
2213             .unwrap()
2214             .start_restored_vcpus()
2215             .map_err(Error::CpuManager)?;
2216 
2217         event!("vm", "restored");
2218         Ok(())
2219     }
2220 
2221     /// Gets a thread-safe reference counted pointer to the VM configuration.
2222     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2223         Arc::clone(&self.config)
2224     }
2225 
2226     /// Get the VM state. Returns an error if the state is poisoned.
2227     pub fn get_state(&self) -> Result<VmState> {
2228         self.state
2229             .try_read()
2230             .map_err(|_| Error::PoisonedState)
2231             .map(|state| *state)
2232     }
2233 
2234     /// Gets the actual size of the balloon.
2235     pub fn balloon_size(&self) -> u64 {
2236         self.device_manager.lock().unwrap().balloon_size()
2237     }
2238 
2239     pub fn send_memory_fds(
2240         &mut self,
2241         socket: &mut UnixStream,
2242     ) -> std::result::Result<(), MigratableError> {
2243         for (slot, fd) in self
2244             .memory_manager
2245             .lock()
2246             .unwrap()
2247             .memory_slot_fds()
2248             .drain()
2249         {
2250             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2251                 .write_to(socket)
2252                 .map_err(|e| {
2253                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2254                 })?;
2255             socket
2256                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2257                 .map_err(|e| {
2258                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2259                 })?;
2260 
2261             let res = Response::read_from(socket)?;
2262             if res.status() != Status::Ok {
2263                 warn!("Error during memory fd migration");
2264                 Request::abandon().write_to(socket)?;
2265                 Response::read_from(socket).ok();
2266                 return Err(MigratableError::MigrateSend(anyhow!(
2267                     "Error during memory fd migration"
2268                 )));
2269             }
2270         }
2271 
2272         Ok(())
2273     }
2274 
2275     pub fn send_memory_regions<F>(
2276         &mut self,
2277         ranges: &MemoryRangeTable,
2278         fd: &mut F,
2279     ) -> std::result::Result<(), MigratableError>
2280     where
2281         F: WriteVolatile,
2282     {
2283         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2284         let mem = guest_memory.memory();
2285 
2286         for range in ranges.regions() {
2287             let mut offset: u64 = 0;
2288             // Here we are manually handling the retry in case we can't the
2289             // whole region at once because we can't use the implementation
2290             // from vm-memory::GuestMemory of write_all_to() as it is not
2291             // following the correct behavior. For more info about this issue
2292             // see: https://github.com/rust-vmm/vm-memory/issues/174
2293             loop {
2294                 let bytes_written = mem
2295                     .write_volatile_to(
2296                         GuestAddress(range.gpa + offset),
2297                         fd,
2298                         (range.length - offset) as usize,
2299                     )
2300                     .map_err(|e| {
2301                         MigratableError::MigrateSend(anyhow!(
2302                             "Error transferring memory to socket: {}",
2303                             e
2304                         ))
2305                     })?;
2306                 offset += bytes_written as u64;
2307 
2308                 if offset == range.length {
2309                     break;
2310                 }
2311             }
2312         }
2313 
2314         Ok(())
2315     }
2316 
2317     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2318         self.memory_manager
2319             .lock()
2320             .unwrap()
2321             .memory_range_table(false)
2322     }
2323 
2324     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2325         self.device_manager.lock().unwrap().device_tree()
2326     }
2327 
2328     pub fn activate_virtio_devices(&self) -> Result<()> {
2329         self.device_manager
2330             .lock()
2331             .unwrap()
2332             .activate_virtio_devices()
2333             .map_err(Error::ActivateVirtioDevices)
2334     }
2335 
2336     #[cfg(target_arch = "x86_64")]
2337     pub fn power_button(&self) -> Result<()> {
2338         return self
2339             .device_manager
2340             .lock()
2341             .unwrap()
2342             .notify_power_button()
2343             .map_err(Error::PowerButton);
2344     }
2345 
2346     #[cfg(target_arch = "aarch64")]
2347     pub fn power_button(&self) -> Result<()> {
2348         self.device_manager
2349             .lock()
2350             .unwrap()
2351             .notify_power_button()
2352             .map_err(Error::PowerButton)
2353     }
2354 
2355     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2356         self.memory_manager.lock().unwrap().snapshot_data()
2357     }
2358 
2359     #[cfg(feature = "guest_debug")]
2360     pub fn debug_request(
2361         &mut self,
2362         gdb_request: &GdbRequestPayload,
2363         cpu_id: usize,
2364     ) -> Result<GdbResponsePayload> {
2365         use GdbRequestPayload::*;
2366         match gdb_request {
2367             SetSingleStep(single_step) => {
2368                 self.set_guest_debug(cpu_id, &[], *single_step)
2369                     .map_err(Error::Debug)?;
2370             }
2371             SetHwBreakPoint(addrs) => {
2372                 self.set_guest_debug(cpu_id, addrs, false)
2373                     .map_err(Error::Debug)?;
2374             }
2375             Pause => {
2376                 self.debug_pause().map_err(Error::Debug)?;
2377             }
2378             Resume => {
2379                 self.debug_resume().map_err(Error::Debug)?;
2380             }
2381             ReadRegs => {
2382                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2383                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2384             }
2385             WriteRegs(regs) => {
2386                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2387             }
2388             ReadMem(vaddr, len) => {
2389                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2390                 let mem = self
2391                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2392                     .map_err(Error::Debug)?;
2393                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2394             }
2395             WriteMem(vaddr, data) => {
2396                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2397                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2398                     .map_err(Error::Debug)?;
2399             }
2400             ActiveVcpus => {
2401                 let active_vcpus = self.active_vcpus();
2402                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2403             }
2404         }
2405         Ok(GdbResponsePayload::CommandComplete)
2406     }
2407 
2408     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2409     fn get_dump_state(
2410         &mut self,
2411         destination_url: &str,
2412     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2413         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2414         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2415         let mut elf_phdr_num = 1;
2416         let elf_sh_info = 0;
2417         let coredump_file_path = url_to_file(destination_url)?;
2418         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2419 
2420         if mapping_num < UINT16_MAX - 2 {
2421             elf_phdr_num += mapping_num as u16;
2422         } else {
2423             panic!("mapping num beyond 65535 not supported");
2424         }
2425         let coredump_file = OpenOptions::new()
2426             .read(true)
2427             .write(true)
2428             .create_new(true)
2429             .open(coredump_file_path)
2430             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2431 
2432         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2433         let mem_data = self
2434             .memory_manager
2435             .lock()
2436             .unwrap()
2437             .coredump_memory_regions(mem_offset);
2438 
2439         Ok(DumpState {
2440             elf_note_size,
2441             elf_phdr_num,
2442             elf_sh_info,
2443             mem_offset,
2444             mem_info: Some(mem_data),
2445             file: Some(coredump_file),
2446         })
2447     }
2448 
2449     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2450     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2451         size_of::<elf::Elf64_Ehdr>() as u64
2452             + note_size as u64
2453             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2454     }
2455 
2456     pub fn nmi(&self) -> Result<()> {
2457         return self
2458             .cpu_manager
2459             .lock()
2460             .unwrap()
2461             .nmi()
2462             .map_err(|_| Error::ErrorNmi);
2463     }
2464 }
2465 
2466 impl Pausable for Vm {
2467     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2468         event!("vm", "pausing");
2469         let mut state = self
2470             .state
2471             .try_write()
2472             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2473         let new_state = VmState::Paused;
2474 
2475         state
2476             .valid_transition(new_state)
2477             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2478 
2479         #[cfg(target_arch = "x86_64")]
2480         {
2481             let mut clock = self
2482                 .vm
2483                 .get_clock()
2484                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2485             clock.reset_flags();
2486             self.saved_clock = Some(clock);
2487         }
2488 
2489         // Before pausing the vCPUs activate any pending virtio devices that might
2490         // need activation between starting the pause (or e.g. a migration it's part of)
2491         self.activate_virtio_devices().map_err(|e| {
2492             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2493         })?;
2494 
2495         self.cpu_manager.lock().unwrap().pause()?;
2496         self.device_manager.lock().unwrap().pause()?;
2497 
2498         self.vm
2499             .pause()
2500             .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?;
2501 
2502         *state = new_state;
2503 
2504         event!("vm", "paused");
2505         Ok(())
2506     }
2507 
2508     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2509         event!("vm", "resuming");
2510         let current_state = self.get_state().unwrap();
2511         let mut state = self
2512             .state
2513             .try_write()
2514             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2515         let new_state = VmState::Running;
2516 
2517         state
2518             .valid_transition(new_state)
2519             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2520 
2521         self.cpu_manager.lock().unwrap().resume()?;
2522         #[cfg(target_arch = "x86_64")]
2523         {
2524             if let Some(clock) = &self.saved_clock {
2525                 self.vm.set_clock(clock).map_err(|e| {
2526                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2527                 })?;
2528             }
2529         }
2530 
2531         if current_state == VmState::Paused {
2532             self.vm
2533                 .resume()
2534                 .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?;
2535         }
2536 
2537         self.device_manager.lock().unwrap().resume()?;
2538 
2539         // And we're back to the Running state.
2540         *state = new_state;
2541         event!("vm", "resumed");
2542         Ok(())
2543     }
2544 }
2545 
/// VM-level state that is serialized as the root of a VM snapshot.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    /// Clock data captured for the VM, if any (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    pub clock: Option<hypervisor::ClockData>,
    /// CPUID entries recorded with the snapshot (KVM on x86_64 only).
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}
2553 
2554 pub const VM_SNAPSHOT_ID: &str = "vm";
2555 impl Snapshottable for Vm {
2556     fn id(&self) -> String {
2557         VM_SNAPSHOT_ID.to_string()
2558     }
2559 
2560     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2561         event!("vm", "snapshotting");
2562 
2563         #[cfg(feature = "tdx")]
2564         {
2565             if self.config.lock().unwrap().is_tdx_enabled() {
2566                 return Err(MigratableError::Snapshot(anyhow!(
2567                     "Snapshot not possible with TDX VM"
2568                 )));
2569             }
2570         }
2571 
2572         let current_state = self.get_state().unwrap();
2573         if current_state != VmState::Paused {
2574             return Err(MigratableError::Snapshot(anyhow!(
2575                 "Trying to snapshot while VM is running"
2576             )));
2577         }
2578 
2579         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2580         let common_cpuid = {
2581             let amx = self.config.lock().unwrap().cpus.features.amx;
2582             let phys_bits = physical_bits(
2583                 &self.hypervisor,
2584                 self.config.lock().unwrap().cpus.max_phys_bits,
2585             );
2586             arch::generate_common_cpuid(
2587                 &self.hypervisor,
2588                 &arch::CpuidConfig {
2589                     sgx_epc_sections: None,
2590                     phys_bits,
2591                     kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2592                     #[cfg(feature = "tdx")]
2593                     tdx: false,
2594                     amx,
2595                 },
2596             )
2597             .map_err(|e| {
2598                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2599             })?
2600         };
2601 
2602         let vm_snapshot_state = VmSnapshot {
2603             #[cfg(target_arch = "x86_64")]
2604             clock: self.saved_clock,
2605             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2606             common_cpuid,
2607         };
2608 
2609         let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?;
2610 
2611         let (id, snapshot) = {
2612             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2613             (cpu_manager.id(), cpu_manager.snapshot()?)
2614         };
2615         vm_snapshot.add_snapshot(id, snapshot);
2616         let (id, snapshot) = {
2617             let mut memory_manager = self.memory_manager.lock().unwrap();
2618             (memory_manager.id(), memory_manager.snapshot()?)
2619         };
2620         vm_snapshot.add_snapshot(id, snapshot);
2621         let (id, snapshot) = {
2622             let mut device_manager = self.device_manager.lock().unwrap();
2623             (device_manager.id(), device_manager.snapshot()?)
2624         };
2625         vm_snapshot.add_snapshot(id, snapshot);
2626 
2627         event!("vm", "snapshotted");
2628         Ok(vm_snapshot)
2629     }
2630 }
2631 
2632 impl Transportable for Vm {
2633     fn send(
2634         &self,
2635         snapshot: &Snapshot,
2636         destination_url: &str,
2637     ) -> std::result::Result<(), MigratableError> {
2638         let mut snapshot_config_path = url_to_path(destination_url)?;
2639         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2640 
2641         // Create the snapshot config file
2642         let mut snapshot_config_file = OpenOptions::new()
2643             .read(true)
2644             .write(true)
2645             .create_new(true)
2646             .open(snapshot_config_path)
2647             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2648 
2649         // Serialize and write the snapshot config
2650         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2651             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2652 
2653         snapshot_config_file
2654             .write(vm_config.as_bytes())
2655             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2656 
2657         let mut snapshot_state_path = url_to_path(destination_url)?;
2658         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2659 
2660         // Create the snapshot state file
2661         let mut snapshot_state_file = OpenOptions::new()
2662             .read(true)
2663             .write(true)
2664             .create_new(true)
2665             .open(snapshot_state_path)
2666             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2667 
2668         // Serialize and write the snapshot state
2669         let vm_state =
2670             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2671 
2672         snapshot_state_file
2673             .write(&vm_state)
2674             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2675 
2676         // Tell the memory manager to also send/write its own snapshot.
2677         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2678             self.memory_manager
2679                 .lock()
2680                 .unwrap()
2681                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2682         } else {
2683             return Err(MigratableError::Restore(anyhow!(
2684                 "Missing memory manager snapshot"
2685             )));
2686         }
2687 
2688         Ok(())
2689     }
2690 }
2691 
2692 impl Migratable for Vm {
2693     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2694         self.memory_manager.lock().unwrap().start_dirty_log()?;
2695         self.device_manager.lock().unwrap().start_dirty_log()
2696     }
2697 
2698     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2699         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2700         self.device_manager.lock().unwrap().stop_dirty_log()
2701     }
2702 
2703     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2704         Ok(MemoryRangeTable::new_from_tables(vec![
2705             self.memory_manager.lock().unwrap().dirty_log()?,
2706             self.device_manager.lock().unwrap().dirty_log()?,
2707         ]))
2708     }
2709 
2710     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2711         self.memory_manager.lock().unwrap().start_migration()?;
2712         self.device_manager.lock().unwrap().start_migration()
2713     }
2714 
2715     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2716         self.memory_manager.lock().unwrap().complete_migration()?;
2717         self.device_manager.lock().unwrap().complete_migration()
2718     }
2719 }
2720 
#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Passes the debug settings (`addrs`, `singlestep`) for `cpu_id` on to the CPU manager.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        let outcome = self
            .cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep);
        outcome
    }

    /// Pauses the VM if it is running, then records the `BreakPoint` state.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        // The read guard is released at the end of this statement, before `pause`
        // tries to take the write lock.
        let is_running = *self.state.read().unwrap() == VmState::Running;
        if is_running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state_guard = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state_guard = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes the VM when it is currently stopped in the `BreakPoint` state.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        // The read guard is released at the end of this statement, before `resume`
        // tries to take the write lock.
        let at_breakpoint = *self.state.read().unwrap() == VmState::BreakPoint;
        if !at_breakpoint {
            return Ok(());
        }

        // NOTE(review): a resume failure is reported through the `Pause` variant —
        // confirm whether a dedicated variant should be used instead.
        self.resume().map_err(DebuggableError::Pause)?;
        Ok(())
    }

    /// Reads the core registers of vCPU `cpu_id` through the CPU manager.
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        let regs = self.cpu_manager.lock().unwrap().read_regs(cpu_id);
        regs
    }

    /// Writes `regs` to vCPU `cpu_id` through the CPU manager.
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        let outcome = self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs);
        outcome
    }

    /// Reads `len` bytes at `vaddr`, as seen by vCPU `cpu_id`, through the CPU manager.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        let bytes = self
            .cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len);
        bytes
    }

    /// Writes `data` at `vaddr`, as seen by vCPU `cpu_id`, through the CPU manager.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        let outcome = self
            .cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data);
        outcome
    }

    /// Returns the number of active vCPUs, or the boot vCPU count when none is active.
    fn active_vcpus(&self) -> usize {
        let running = self.cpu_manager.lock().unwrap().active_vcpus();
        if running > 0 {
            return running;
        }

        // The VM is not booted yet. Report boot_vcpus() instead.
        self.cpu_manager.lock().unwrap().boot_vcpus() as usize
    }
}
2804 
/// Largest value representable in 16 bits, widened to `u32`.
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = u16::MAX as u32;
2807 
// No methods are overridden here: `Vm` relies entirely on what the `Elf64Writable`
// trait itself provides.
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}
2810 
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes a coredump of the guest to `destination_url`.
    ///
    /// A running VM is paused for the duration of the dump and resumed afterwards, whether
    /// or not the dump succeeded. A VM that is already paused is left paused.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` for a TDX VM, when the VM state cannot be
    /// read, or when the VM is neither running nor paused. Failures while pausing, dumping
    /// or resuming are reported through their respective error values; if both the dump and
    /// the subsequent resume fail, the dump error is the one returned.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        // Report an unreadable state to the caller rather than panicking.
        let current_state = self.get_state().map_err(|e| {
            GuestDebuggableError::Coredump(anyhow!("Could not get VM state: {:?}", e))
        })?;

        // Remember whether this call paused the VM, so only then is it resumed below.
        let resume = match current_state {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                true
            }
            VmState::Paused => false,
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        };

        // Collect the outcome of the dump instead of returning early, so that a VM paused
        // above is not left paused when one of the steps fails.
        let dump_result = self.get_dump_state(destination_url).and_then(
            |coredump_state| -> std::result::Result<(), GuestDebuggableError> {
                self.write_header(&coredump_state)?;
                self.write_note(&coredump_state)?;
                self.write_loads(&coredump_state)?;

                self.cpu_manager
                    .lock()
                    .unwrap()
                    .cpu_write_elf64_note(&coredump_state)?;
                self.cpu_manager
                    .lock()
                    .unwrap()
                    .cpu_write_vmm_note(&coredump_state)?;

                self.memory_manager
                    .lock()
                    .unwrap()
                    .coredump_iterate_save_mem(&coredump_state)?;

                Ok(())
            },
        );

        let resume_result = if resume {
            self.resume().map_err(GuestDebuggableError::Resume)
        } else {
            Ok(())
        };

        // A dump failure takes precedence; otherwise report the resume outcome.
        dump_result.and(resume_result)
    }
}
2869 
2870 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2871 #[cfg(test)]
2872 mod tests {
2873     use super::*;
2874 
2875     fn test_vm_state_transitions(state: VmState) {
2876         match state {
2877             VmState::Created => {
2878                 // Check the transitions from Created
2879                 assert!(state.valid_transition(VmState::Created).is_err());
2880                 assert!(state.valid_transition(VmState::Running).is_ok());
2881                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2882                 assert!(state.valid_transition(VmState::Paused).is_ok());
2883                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2884             }
2885             VmState::Running => {
2886                 // Check the transitions from Running
2887                 assert!(state.valid_transition(VmState::Created).is_err());
2888                 assert!(state.valid_transition(VmState::Running).is_err());
2889                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2890                 assert!(state.valid_transition(VmState::Paused).is_ok());
2891                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2892             }
2893             VmState::Shutdown => {
2894                 // Check the transitions from Shutdown
2895                 assert!(state.valid_transition(VmState::Created).is_err());
2896                 assert!(state.valid_transition(VmState::Running).is_ok());
2897                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2898                 assert!(state.valid_transition(VmState::Paused).is_err());
2899                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2900             }
2901             VmState::Paused => {
2902                 // Check the transitions from Paused
2903                 assert!(state.valid_transition(VmState::Created).is_err());
2904                 assert!(state.valid_transition(VmState::Running).is_ok());
2905                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2906                 assert!(state.valid_transition(VmState::Paused).is_err());
2907                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2908             }
2909             VmState::BreakPoint => {
2910                 // Check the transitions from Breakpoint
2911                 assert!(state.valid_transition(VmState::Created).is_ok());
2912                 assert!(state.valid_transition(VmState::Running).is_ok());
2913                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2914                 assert!(state.valid_transition(VmState::Paused).is_err());
2915                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2916             }
2917         }
2918     }
2919 
2920     #[test]
2921     fn test_vm_created_transitions() {
2922         test_vm_state_transitions(VmState::Created);
2923     }
2924 
2925     #[test]
2926     fn test_vm_running_transitions() {
2927         test_vm_state_transitions(VmState::Running);
2928     }
2929 
2930     #[test]
2931     fn test_vm_shutdown_transitions() {
2932         test_vm_state_transitions(VmState::Shutdown);
2933     }
2934 
2935     #[test]
2936     fn test_vm_paused_transitions() {
2937         test_vm_state_transitions(VmState::Paused);
2938     }
2939 
2940     #[cfg(feature = "tdx")]
2941     #[test]
2942     fn test_hob_memory_resources() {
2943         // Case 1: Two TDVF sections in the middle of the RAM
2944         let sections = vec![
2945             TdvfSection {
2946                 address: 0xc000,
2947                 size: 0x1000,
2948                 ..Default::default()
2949             },
2950             TdvfSection {
2951                 address: 0x1000,
2952                 size: 0x4000,
2953                 ..Default::default()
2954             },
2955         ];
2956         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
2957         let expected = vec![
2958             (0, 0x1000, true),
2959             (0x1000, 0x4000, false),
2960             (0x5000, 0x7000, true),
2961             (0xc000, 0x1000, false),
2962             (0xd000, 0x0fff_3000, true),
2963         ];
2964         assert_eq!(
2965             expected,
2966             Vm::hob_memory_resources(
2967                 sections,
2968                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2969             )
2970         );
2971 
2972         // Case 2: Two TDVF sections with no conflict with the RAM
2973         let sections = vec![
2974             TdvfSection {
2975                 address: 0x1000_1000,
2976                 size: 0x1000,
2977                 ..Default::default()
2978             },
2979             TdvfSection {
2980                 address: 0,
2981                 size: 0x1000,
2982                 ..Default::default()
2983             },
2984         ];
2985         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2986         let expected = vec![
2987             (0, 0x1000, false),
2988             (0x1000, 0x1000_0000, true),
2989             (0x1000_1000, 0x1000, false),
2990         ];
2991         assert_eq!(
2992             expected,
2993             Vm::hob_memory_resources(
2994                 sections,
2995                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2996             )
2997         );
2998 
2999         // Case 3: Two TDVF sections with partial conflicts with the RAM
3000         let sections = vec![
3001             TdvfSection {
3002                 address: 0x1000_0000,
3003                 size: 0x2000,
3004                 ..Default::default()
3005             },
3006             TdvfSection {
3007                 address: 0,
3008                 size: 0x2000,
3009                 ..Default::default()
3010             },
3011         ];
3012         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3013         let expected = vec![
3014             (0, 0x2000, false),
3015             (0x2000, 0x0fff_e000, true),
3016             (0x1000_0000, 0x2000, false),
3017         ];
3018         assert_eq!(
3019             expected,
3020             Vm::hob_memory_resources(
3021                 sections,
3022                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3023             )
3024         );
3025 
3026         // Case 4: Two TDVF sections with no conflict before the RAM and two
3027         // more additional sections with no conflict after the RAM.
3028         let sections = vec![
3029             TdvfSection {
3030                 address: 0x2000_1000,
3031                 size: 0x1000,
3032                 ..Default::default()
3033             },
3034             TdvfSection {
3035                 address: 0x2000_0000,
3036                 size: 0x1000,
3037                 ..Default::default()
3038             },
3039             TdvfSection {
3040                 address: 0x1000,
3041                 size: 0x1000,
3042                 ..Default::default()
3043             },
3044             TdvfSection {
3045                 address: 0,
3046                 size: 0x1000,
3047                 ..Default::default()
3048             },
3049         ];
3050         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
3051         let expected = vec![
3052             (0, 0x1000, false),
3053             (0x1000, 0x1000, false),
3054             (0x4000, 0x1000_0000, true),
3055             (0x2000_0000, 0x1000, false),
3056             (0x2000_1000, 0x1000, false),
3057         ];
3058         assert_eq!(
3059             expected,
3060             Vm::hob_memory_resources(
3061                 sections,
3062                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3063             )
3064         );
3065 
3066         // Case 5: One TDVF section overriding the entire RAM
3067         let sections = vec![TdvfSection {
3068             address: 0,
3069             size: 0x2000_0000,
3070             ..Default::default()
3071         }];
3072         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3073         let expected = vec![(0, 0x2000_0000, false)];
3074         assert_eq!(
3075             expected,
3076             Vm::hob_memory_resources(
3077                 sections,
3078                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3079             )
3080         );
3081 
3082         // Case 6: Two TDVF sections with no conflict with 2 RAM regions
3083         let sections = vec![
3084             TdvfSection {
3085                 address: 0x1000_2000,
3086                 size: 0x2000,
3087                 ..Default::default()
3088             },
3089             TdvfSection {
3090                 address: 0,
3091                 size: 0x2000,
3092                 ..Default::default()
3093             },
3094         ];
3095         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3096             (GuestAddress(0x2000), 0x1000_0000),
3097             (GuestAddress(0x1000_4000), 0x1000_0000),
3098         ];
3099         let expected = vec![
3100             (0, 0x2000, false),
3101             (0x2000, 0x1000_0000, true),
3102             (0x1000_2000, 0x2000, false),
3103             (0x1000_4000, 0x1000_0000, true),
3104         ];
3105         assert_eq!(
3106             expected,
3107             Vm::hob_memory_resources(
3108                 sections,
3109                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3110             )
3111         );
3112 
3113         // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
3114         let sections = vec![
3115             TdvfSection {
3116                 address: 0x1000_0000,
3117                 size: 0x4000,
3118                 ..Default::default()
3119             },
3120             TdvfSection {
3121                 address: 0,
3122                 size: 0x4000,
3123                 ..Default::default()
3124             },
3125         ];
3126         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3127             (GuestAddress(0x1000), 0x1000_0000),
3128             (GuestAddress(0x1000_3000), 0x1000_0000),
3129         ];
3130         let expected = vec![
3131             (0, 0x4000, false),
3132             (0x4000, 0x0fff_c000, true),
3133             (0x1000_0000, 0x4000, false),
3134             (0x1000_4000, 0x0fff_f000, true),
3135         ];
3136         assert_eq!(
3137             expected,
3138             Vm::hob_memory_resources(
3139                 sections,
3140                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3141             )
3142         );
3143     }
3144 }
3145 
#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    /// Length of each MMIO device window used by the test; also used as the address stride.
    const LEN: u64 = 4096;

    /// Checks that `create_fdt` succeeds for a guest with a serial, a virtio and an RTC
    /// MMIO device and a default GIC configuration.
    #[test]
    fn test_create_fdt_with_devices() {
        let guest_ranges = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&guest_ranges).expect("Cannot initialize memory");

        // Three devices laid out back to back, each with its own interrupt line.
        let mut dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> =
            HashMap::new();
        dev_info.insert(
            (DeviceType::Serial, DeviceType::Serial.to_string()),
            MmioDeviceInfo {
                addr: 0x00,
                len: LEN,
                irq: 33,
            },
        );
        dev_info.insert(
            (DeviceType::Virtio(1), "virtio".to_string()),
            MmioDeviceInfo {
                addr: LEN,
                len: LEN,
                irq: 34,
            },
        );
        dev_info.insert(
            (DeviceType::Rtc, "rtc".to_string()),
            MmioDeviceInfo {
                addr: 2 * LEN,
                len: LEN,
                irq: 35,
            },
        );

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");

        let fdt = create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        );
        assert!(fdt.is_ok())
    }
}
3213 
/// Boots a tiny real-mode guest through the hypervisor API: registers one memory region,
/// loads a few instructions into it, sets up one vCPU and runs it until a `VmExit::Reset`
/// is reported. Any exit other than `Reset` or `Ignore` fails the test.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee,  /* out %al, (%dx) */
        0xf4,  /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let guest_ranges = [(load_addr, mem_size)];
    let mem = GuestMemoryMmap::from_ranges(&guest_ranges).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor, using its index as slot.
    for (slot, region) in mem.iter().enumerate() {
        let guest_phys_addr = region.start_addr().raw_value();
        let userspace_addr = region.as_ptr() as u64;
        let mem_region = vm.make_user_memory_region(
            slot as u32,
            guest_phys_addr,
            region.len(),
            userspace_addr,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Flat code segment starting at address zero.
    let mut sregs = vcpu.get_sregs().expect("get sregs failed");
    sregs.cs.base = 0;
    sregs.cs.selector = 0;
    vcpu.set_sregs(&sregs).expect("set sregs failed");

    // Start executing at the load address with the operands for the `add` in place.
    let mut regs = vcpu.get_regs().expect("get regs failed");
    regs.rip = 0x1000;
    regs.rax = 2;
    regs.rbx = 3;
    regs.rflags = 2;
    vcpu.set_regs(&regs).expect("set regs failed");

    loop {
        let exit = vcpu.run().expect("run failed");
        match exit {
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            VmExit::Ignore => {}
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}
3278