xref: /cloud-hypervisor/vmm/src/vm.rs (revision 08cf983d420af7bce0cd67f34e660324ef219de6)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 use crate::console_devices::{ConsoleDeviceError, ConsoleInfo};
20 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
21 use crate::coredump::{
22     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
23 };
24 use crate::cpu;
25 use crate::device_manager::{DeviceManager, DeviceManagerError};
26 use crate::device_tree::DeviceTree;
27 #[cfg(feature = "guest_debug")]
28 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
29 #[cfg(feature = "igvm")]
30 use crate::igvm::igvm_loader;
31 use crate::memory_manager::{
32     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
33 };
34 #[cfg(target_arch = "x86_64")]
35 use crate::migration::get_vm_snapshot;
36 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
37 use crate::migration::url_to_file;
38 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
39 use crate::GuestMemoryMmap;
40 use crate::{
41     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
42 };
43 use anyhow::anyhow;
44 use arch::get_host_cpu_phys_bits;
45 #[cfg(target_arch = "x86_64")]
46 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
47 #[cfg(feature = "tdx")]
48 use arch::x86_64::tdx::TdvfSection;
49 use arch::EntryPoint;
50 #[cfg(target_arch = "aarch64")]
51 use arch::PciSpaceInfo;
52 use arch::{NumaNode, NumaNodes};
53 #[cfg(target_arch = "aarch64")]
54 use devices::interrupt_controller;
55 use devices::AcpiNotificationFlags;
56 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
57 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
58 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
59 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
60 use hypervisor::{HypervisorVmError, VmOps};
61 use libc::{termios, SIGWINCH};
62 use linux_loader::cmdline::Cmdline;
63 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
64 use linux_loader::elf;
65 #[cfg(target_arch = "x86_64")]
66 use linux_loader::loader::bzimage::BzImage;
67 #[cfg(target_arch = "x86_64")]
68 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
69 #[cfg(target_arch = "aarch64")]
70 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
71 use linux_loader::loader::KernelLoader;
72 use seccompiler::SeccompAction;
73 use serde::{Deserialize, Serialize};
74 use std::cmp;
75 use std::collections::BTreeMap;
76 use std::collections::HashMap;
77 use std::fs::{File, OpenOptions};
78 use std::io::{self, Seek, SeekFrom, Write};
79 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
80 use std::mem::size_of;
81 use std::num::Wrapping;
82 use std::ops::Deref;
83 use std::os::unix::net::UnixStream;
84 use std::sync::{Arc, Mutex, RwLock};
85 use std::time::Instant;
86 use std::{result, str, thread};
87 use thiserror::Error;
88 use tracer::trace_scoped;
89 use vm_device::Bus;
90 #[cfg(feature = "tdx")]
91 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
92 use vm_memory::{
93     Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
94 };
95 use vm_migration::protocol::{Request, Response, Status};
96 use vm_migration::{
97     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
98     Snapshottable, Transportable,
99 };
100 use vmm_sys_util::eventfd::EventFd;
101 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
102 
103 /// Errors associated with VM management
104 #[derive(Debug, Error)]
105 pub enum Error {
106     #[error("Cannot open kernel file: {0}")]
107     KernelFile(#[source] io::Error),
108 
109     #[error("Cannot open initramfs file: {0}")]
110     InitramfsFile(#[source] io::Error),
111 
112     #[error("Cannot load the kernel into memory: {0}")]
113     KernelLoad(#[source] linux_loader::loader::Error),
114 
115     #[cfg(target_arch = "aarch64")]
116     #[error("Cannot load the UEFI binary in memory: {0:?}")]
117     UefiLoad(arch::aarch64::uefi::Error),
118 
119     #[error("Cannot load the initramfs into memory")]
120     InitramfsLoad,
121 
122     #[error("Cannot load the kernel command line in memory: {0}")]
123     LoadCmdLine(#[source] linux_loader::loader::Error),
124 
125     #[error("Cannot modify the kernel command line: {0}")]
126     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
127 
128     #[error("Cannot create the kernel command line: {0}")]
129     CmdLineCreate(#[source] linux_loader::cmdline::Error),
130 
131     #[error("Cannot configure system: {0}")]
132     ConfigureSystem(#[source] arch::Error),
133 
134     #[cfg(target_arch = "aarch64")]
135     #[error("Cannot enable interrupt controller: {0:?}")]
136     EnableInterruptController(interrupt_controller::Error),
137 
138     #[error("VM state is poisoned")]
139     PoisonedState,
140 
141     #[error("Error from device manager: {0:?}")]
142     DeviceManager(DeviceManagerError),
143 
144     #[error("No device with id {0:?} to remove")]
145     NoDeviceToRemove(String),
146 
147     #[error("Cannot spawn a signal handler thread: {0}")]
148     SignalHandlerSpawn(#[source] io::Error),
149 
150     #[error("Failed to join on threads: {0:?}")]
151     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
152 
153     #[error("VM config is missing")]
154     VmMissingConfig,
155 
156     #[error("VM is not created")]
157     VmNotCreated,
158 
159     #[error("VM is already created")]
160     VmAlreadyCreated,
161 
162     #[error("VM is not running")]
163     VmNotRunning,
164 
165     #[error("Cannot clone EventFd: {0}")]
166     EventFdClone(#[source] io::Error),
167 
168     #[error("invalid VM state transition: {0:?} to {1:?}")]
169     InvalidStateTransition(VmState, VmState),
170 
171     #[error("Error from CPU manager: {0}")]
172     CpuManager(#[source] cpu::Error),
173 
174     #[error("Cannot pause devices: {0}")]
175     PauseDevices(#[source] MigratableError),
176 
177     #[error("Cannot resume devices: {0}")]
178     ResumeDevices(#[source] MigratableError),
179 
180     #[error("Cannot pause CPUs: {0}")]
181     PauseCpus(#[source] MigratableError),
182 
183     #[error("Cannot resume cpus: {0}")]
184     ResumeCpus(#[source] MigratableError),
185 
186     #[error("Cannot pause VM: {0}")]
187     Pause(#[source] MigratableError),
188 
189     #[error("Cannot resume VM: {0}")]
190     Resume(#[source] MigratableError),
191 
192     #[error("Memory manager error: {0:?}")]
193     MemoryManager(MemoryManagerError),
194 
195     #[error("Eventfd write error: {0}")]
196     EventfdError(#[source] std::io::Error),
197 
198     #[error("Cannot snapshot VM: {0}")]
199     Snapshot(#[source] MigratableError),
200 
201     #[error("Cannot restore VM: {0}")]
202     Restore(#[source] MigratableError),
203 
204     #[error("Cannot send VM snapshot: {0}")]
205     SnapshotSend(#[source] MigratableError),
206 
207     #[error("Invalid restore source URL")]
208     InvalidRestoreSourceUrl,
209 
210     #[error("Failed to validate config: {0}")]
211     ConfigValidation(#[source] ValidationError),
212 
213     #[error("Too many virtio-vsock devices")]
214     TooManyVsockDevices,
215 
216     #[error("Failed serializing into JSON: {0}")]
217     SerializeJson(#[source] serde_json::Error),
218 
219     #[error("Invalid NUMA configuration")]
220     InvalidNumaConfig,
221 
222     #[error("Cannot create seccomp filter: {0}")]
223     CreateSeccompFilter(#[source] seccompiler::Error),
224 
225     #[error("Cannot apply seccomp filter: {0}")]
226     ApplySeccompFilter(#[source] seccompiler::Error),
227 
228     #[error("Failed resizing a memory zone")]
229     ResizeZone,
230 
231     #[error("Cannot activate virtio devices: {0:?}")]
232     ActivateVirtioDevices(DeviceManagerError),
233 
234     #[error("Error triggering power button: {0:?}")]
235     PowerButton(DeviceManagerError),
236 
237     #[error("Kernel lacks PVH header")]
238     KernelMissingPvhHeader,
239 
240     #[error("Failed to allocate firmware RAM: {0:?}")]
241     AllocateFirmwareMemory(MemoryManagerError),
242 
243     #[error("Error manipulating firmware file: {0}")]
244     FirmwareFile(#[source] std::io::Error),
245 
246     #[error("Firmware too big")]
247     FirmwareTooLarge,
248 
249     #[error("Failed to copy firmware to memory: {0}")]
250     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
251 
252     #[cfg(feature = "sev_snp")]
253     #[error("Error enabling SEV-SNP VM: {0}")]
254     InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),
255 
256     #[cfg(feature = "tdx")]
257     #[error("Error performing I/O on TDX firmware file: {0}")]
258     LoadTdvf(#[source] std::io::Error),
259 
260     #[cfg(feature = "tdx")]
261     #[error("Error performing I/O on the TDX payload file: {0}")]
262     LoadPayload(#[source] std::io::Error),
263 
264     #[cfg(feature = "tdx")]
265     #[error("Error parsing TDVF: {0}")]
266     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
267 
268     #[cfg(feature = "tdx")]
269     #[error("Error populating TDX HOB: {0}")]
270     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
271 
272     #[cfg(feature = "tdx")]
273     #[error("Error allocating TDVF memory: {0:?}")]
274     AllocatingTdvfMemory(crate::memory_manager::Error),
275 
276     #[cfg(feature = "tdx")]
277     #[error("Error enabling TDX VM: {0}")]
278     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
279 
280     #[cfg(feature = "tdx")]
281     #[error("Error enabling TDX memory region: {0}")]
282     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
283 
284     #[cfg(feature = "tdx")]
285     #[error("Error finalizing TDX VM: {0}")]
286     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
287 
288     #[cfg(feature = "tdx")]
289     #[error("TDX firmware missing")]
290     TdxFirmwareMissing,
291 
292     #[cfg(feature = "tdx")]
293     #[error("Invalid TDX payload type")]
294     InvalidPayloadType,
295 
296     #[cfg(feature = "guest_debug")]
297     #[error("Error debugging VM: {0:?}")]
298     Debug(DebuggableError),
299 
300     #[error("Error spawning kernel loading thread")]
301     KernelLoadThreadSpawn(std::io::Error),
302 
303     #[error("Error joining kernel loading thread")]
304     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
305 
306     #[error("Payload configuration is not bootable")]
307     InvalidPayload,
308 
309     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
310     #[error("Error coredumping VM: {0:?}")]
311     Coredump(GuestDebuggableError),
312 
313     #[cfg(feature = "igvm")]
314     #[error("Cannot open igvm file: {0}")]
315     IgvmFile(#[source] io::Error),
316 
317     #[cfg(feature = "igvm")]
318     #[error("Cannot load the igvm into memory: {0}")]
319     IgvmLoad(#[source] igvm_loader::Error),
320 
321     #[error("Error injecting NMI")]
322     ErrorNmi,
323 
324     #[error("Error resuming the VM: {0}")]
325     ResumeVm(#[source] hypervisor::HypervisorVmError),
326 
327     #[error("Error creating console devices")]
328     CreateConsoleDevices(ConsoleDeviceError),
329 }
/// Convenience alias for results whose error type is [`Error`].
pub type Result<T> = result::Result<T, Error>;
331 
/// Lifecycle state of a VM.
///
/// Which moves between states are accepted is decided by
/// `VmState::valid_transition`.
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    // Initial state of a `Vm` built without a snapshot
    // (see `Vm::new_from_memory_manager`).
    Created,
    Running,
    Shutdown,
    // Also the initial state of a `Vm` built from a snapshot
    // (see `Vm::new_from_memory_manager`).
    Paused,
    // NOTE(review): presumably entered while the guest is stopped under the
    // debugger — confirm against the gdb code paths.
    BreakPoint,
}
340 
341 impl VmState {
342     fn valid_transition(self, new_state: VmState) -> Result<()> {
343         match self {
344             VmState::Created => match new_state {
345                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
346                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
347                     Ok(())
348                 }
349             },
350 
351             VmState::Running => match new_state {
352                 VmState::Created | VmState::Running => {
353                     Err(Error::InvalidStateTransition(self, new_state))
354                 }
355                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
356             },
357 
358             VmState::Shutdown => match new_state {
359                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
360                     Err(Error::InvalidStateTransition(self, new_state))
361                 }
362                 VmState::Running => Ok(()),
363             },
364 
365             VmState::Paused => match new_state {
366                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
367                     Err(Error::InvalidStateTransition(self, new_state))
368                 }
369                 VmState::Running | VmState::Shutdown => Ok(()),
370             },
371             VmState::BreakPoint => match new_state {
372                 VmState::Created | VmState::Running => Ok(()),
373                 _ => Err(Error::InvalidStateTransition(self, new_state)),
374             },
375         }
376     }
377 }
378 
/// State backing the `VmOps` implementation below: the guest memory and the
/// buses that guest accesses are dispatched to.
struct VmOpsHandler {
    // Guest memory served by `guest_mem_read` / `guest_mem_write`.
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    // Bus receiving port I/O accesses (`pio_read` / `pio_write`), x86_64 only.
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    // Bus receiving MMIO accesses (`mmio_read` / `mmio_write`).
    mmio_bus: Arc<Bus>,
}
385 
386 impl VmOps for VmOpsHandler {
387     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
388         self.memory
389             .memory()
390             .write(buf, GuestAddress(gpa))
391             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
392     }
393 
394     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
395         self.memory
396             .memory()
397             .read(buf, GuestAddress(gpa))
398             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
399     }
400 
401     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
402         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
403             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
404         }
405         Ok(())
406     }
407 
408     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
409         match self.mmio_bus.write(gpa, data) {
410             Err(vm_device::BusError::MissingAddressRange) => {
411                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
412             }
413             Ok(Some(barrier)) => {
414                 info!("Waiting for barrier");
415                 barrier.wait();
416                 info!("Barrier released");
417             }
418             _ => {}
419         };
420         Ok(())
421     }
422 
423     #[cfg(target_arch = "x86_64")]
424     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
425         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
426             info!("Guest PIO read to unregistered address 0x{:x}", port);
427         }
428         Ok(())
429     }
430 
431     #[cfg(target_arch = "x86_64")]
432     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
433         match self.io_bus.write(port, data) {
434             Err(vm_device::BusError::MissingAddressRange) => {
435                 info!("Guest PIO write to unregistered address 0x{:x}", port);
436             }
437             Ok(Some(barrier)) => {
438                 info!("Waiting for barrier");
439                 barrier.wait();
440                 info!("Barrier released");
441             }
442             _ => {}
443         };
444         Ok(())
445     }
446 }
447 
448 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
449     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
450 
451     cmp::min(host_phys_bits, max_phys_bits)
452 }
453 
/// A virtual machine: the hypervisor VM object together with the managers
/// (CPU, memory, devices), its configuration and its lifecycle state.
pub struct Vm {
    // Kernel file opened from the payload configuration, if one is set
    // (TDX builds only).
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    // Initramfs file opened from the payload configuration, if one is set.
    initramfs: Option<File>,
    // Join handles of threads owned by the VM; starts out empty.
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    // Current lifecycle state: `Paused` when built from a snapshot,
    // `Created` otherwise.
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    // Clock data taken from the VM snapshot when restoring, `None` otherwise
    // (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
    // NUMA nodes built from the `numa` section of the configuration.
    numa_nodes: NumaNodes,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    // Copied from the `gdb` configuration flag when the `guest_debug` feature
    // is enabled, always `false` otherwise.
    stop_on_boot: bool,
    // Handle returned by `load_payload_async`; `None` when restoring from a
    // snapshot. Joining it yields the payload entry point or an error.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}
475 
476 impl Vm {
    /// Signals handled on behalf of the VM; currently only `SIGWINCH`.
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
478 
479     #[allow(clippy::too_many_arguments)]
480     pub fn new_from_memory_manager(
481         config: Arc<Mutex<VmConfig>>,
482         memory_manager: Arc<Mutex<MemoryManager>>,
483         vm: Arc<dyn hypervisor::Vm>,
484         exit_evt: EventFd,
485         reset_evt: EventFd,
486         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
487         seccomp_action: &SeccompAction,
488         hypervisor: Arc<dyn hypervisor::Hypervisor>,
489         activate_evt: EventFd,
490         timestamp: Instant,
491         console_info: Option<ConsoleInfo>,
492         console_resize_pipe: Option<File>,
493         original_termios: Arc<Mutex<Option<termios>>>,
494         snapshot: Option<Snapshot>,
495     ) -> Result<Self> {
496         trace_scoped!("Vm::new_from_memory_manager");
497 
498         let boot_id_list = config
499             .lock()
500             .unwrap()
501             .validate()
502             .map_err(Error::ConfigValidation)?;
503 
504         #[cfg(not(feature = "igvm"))]
505         let load_payload_handle = if snapshot.is_none() {
506             Self::load_payload_async(&memory_manager, &config)?
507         } else {
508             None
509         };
510 
511         info!("Booting VM from config: {:?}", &config);
512 
513         // Create NUMA nodes based on NumaConfig.
514         let numa_nodes =
515             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
516 
517         #[cfg(feature = "tdx")]
518         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
519         #[cfg(feature = "sev_snp")]
520         let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
521         #[cfg(feature = "tdx")]
522         let force_iommu = tdx_enabled;
523         #[cfg(feature = "sev_snp")]
524         let force_iommu = sev_snp_enabled;
525         #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
526         let force_iommu = false;
527 
528         #[cfg(feature = "guest_debug")]
529         let stop_on_boot = config.lock().unwrap().gdb;
530         #[cfg(not(feature = "guest_debug"))]
531         let stop_on_boot = false;
532 
533         let memory = memory_manager.lock().unwrap().guest_memory();
534         #[cfg(target_arch = "x86_64")]
535         let io_bus = Arc::new(Bus::new());
536         let mmio_bus = Arc::new(Bus::new());
537 
538         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
539             memory,
540             #[cfg(target_arch = "x86_64")]
541             io_bus: io_bus.clone(),
542             mmio_bus: mmio_bus.clone(),
543         });
544 
545         let cpus_config = { &config.lock().unwrap().cpus.clone() };
546         let cpu_manager = cpu::CpuManager::new(
547             cpus_config,
548             vm.clone(),
549             exit_evt.try_clone().map_err(Error::EventFdClone)?,
550             reset_evt.try_clone().map_err(Error::EventFdClone)?,
551             #[cfg(feature = "guest_debug")]
552             vm_debug_evt,
553             &hypervisor,
554             seccomp_action.clone(),
555             vm_ops,
556             #[cfg(feature = "tdx")]
557             tdx_enabled,
558             &numa_nodes,
559             #[cfg(feature = "sev_snp")]
560             sev_snp_enabled,
561         )
562         .map_err(Error::CpuManager)?;
563 
564         #[cfg(target_arch = "x86_64")]
565         cpu_manager
566             .lock()
567             .unwrap()
568             .populate_cpuid(
569                 &memory_manager,
570                 &hypervisor,
571                 #[cfg(feature = "tdx")]
572                 tdx_enabled,
573             )
574             .map_err(Error::CpuManager)?;
575 
576         // Loading the igvm file is pushed down here because
577         // igvm parser needs cpu_manager to retrieve cpuid leaf.
578         // For the regular case, we can start loading early, but for
579         // igvm case we have to wait until cpu_manager is created.
580         // Currently, Microsoft Hypervisor does not provide any
581         // Hypervisor specific common cpuid, we need to call get_cpuid_values
582         // per cpuid through cpu_manager.
583         #[cfg(feature = "igvm")]
584         let load_payload_handle = if snapshot.is_none() {
585             Self::load_payload_async(
586                 &memory_manager,
587                 &config,
588                 &cpu_manager,
589                 #[cfg(feature = "sev_snp")]
590                 sev_snp_enabled,
591             )?
592         } else {
593             None
594         };
595         // The initial TDX configuration must be done before the vCPUs are
596         // created
597         #[cfg(feature = "tdx")]
598         if tdx_enabled {
599             let cpuid = cpu_manager.lock().unwrap().common_cpuid();
600             let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
601             vm.tdx_init(&cpuid, max_vcpus)
602                 .map_err(Error::InitializeTdxVm)?;
603         }
604 
605         cpu_manager
606             .lock()
607             .unwrap()
608             .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
609             .map_err(Error::CpuManager)?;
610 
611         // This initial SEV-SNP configuration must be done immediately after
612         // vCPUs are created. As part of this initialization we are
613         // transitioning the guest into secure state.
614         #[cfg(feature = "sev_snp")]
615         if sev_snp_enabled {
616             vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
617         }
618 
619         #[cfg(feature = "tdx")]
620         let dynamic = !tdx_enabled;
621         #[cfg(not(feature = "tdx"))]
622         let dynamic = true;
623 
624         let device_manager = DeviceManager::new(
625             #[cfg(target_arch = "x86_64")]
626             io_bus,
627             mmio_bus,
628             vm.clone(),
629             config.clone(),
630             memory_manager.clone(),
631             cpu_manager.clone(),
632             exit_evt.try_clone().map_err(Error::EventFdClone)?,
633             reset_evt,
634             seccomp_action.clone(),
635             numa_nodes.clone(),
636             &activate_evt,
637             force_iommu,
638             boot_id_list,
639             timestamp,
640             snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
641             dynamic,
642         )
643         .map_err(Error::DeviceManager)?;
644 
645         device_manager
646             .lock()
647             .unwrap()
648             .create_devices(console_info, console_resize_pipe, original_termios)
649             .map_err(Error::DeviceManager)?;
650 
651         #[cfg(feature = "tdx")]
652         let kernel = config
653             .lock()
654             .unwrap()
655             .payload
656             .as_ref()
657             .map(|p| p.kernel.as_ref().map(File::open))
658             .unwrap_or_default()
659             .transpose()
660             .map_err(Error::KernelFile)?;
661 
662         let initramfs = config
663             .lock()
664             .unwrap()
665             .payload
666             .as_ref()
667             .map(|p| p.initramfs.as_ref().map(File::open))
668             .unwrap_or_default()
669             .transpose()
670             .map_err(Error::InitramfsFile)?;
671 
672         #[cfg(target_arch = "x86_64")]
673         let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
674             let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
675             vm_snapshot.clock
676         } else {
677             None
678         };
679 
680         let vm_state = if snapshot.is_some() {
681             VmState::Paused
682         } else {
683             VmState::Created
684         };
685 
686         Ok(Vm {
687             #[cfg(feature = "tdx")]
688             kernel,
689             initramfs,
690             device_manager,
691             config,
692             threads: Vec::with_capacity(1),
693             state: RwLock::new(vm_state),
694             cpu_manager,
695             memory_manager,
696             vm,
697             #[cfg(target_arch = "x86_64")]
698             saved_clock,
699             numa_nodes,
700             hypervisor,
701             stop_on_boot,
702             load_payload_handle,
703         })
704     }
705 
706     fn create_numa_nodes(
707         configs: Option<Vec<NumaConfig>>,
708         memory_manager: &Arc<Mutex<MemoryManager>>,
709     ) -> Result<NumaNodes> {
710         let mm = memory_manager.lock().unwrap();
711         let mm_zones = mm.memory_zones();
712         let mut numa_nodes = BTreeMap::new();
713 
714         if let Some(configs) = &configs {
715             for config in configs.iter() {
716                 if numa_nodes.contains_key(&config.guest_numa_id) {
717                     error!("Can't define twice the same NUMA node");
718                     return Err(Error::InvalidNumaConfig);
719                 }
720 
721                 let mut node = NumaNode::default();
722 
723                 if let Some(memory_zones) = &config.memory_zones {
724                     for memory_zone in memory_zones.iter() {
725                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
726                             node.memory_regions.extend(mm_zone.regions().clone());
727                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
728                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
729                             }
730                             node.memory_zones.push(memory_zone.clone());
731                         } else {
732                             error!("Unknown memory zone '{}'", memory_zone);
733                             return Err(Error::InvalidNumaConfig);
734                         }
735                     }
736                 }
737 
738                 if let Some(cpus) = &config.cpus {
739                     node.cpus.extend(cpus);
740                 }
741 
742                 if let Some(pci_segments) = &config.pci_segments {
743                     node.pci_segments.extend(pci_segments);
744                 }
745 
746                 if let Some(distances) = &config.distances {
747                     for distance in distances.iter() {
748                         let dest = distance.destination;
749                         let dist = distance.distance;
750 
751                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
752                             error!("Unknown destination NUMA node {}", dest);
753                             return Err(Error::InvalidNumaConfig);
754                         }
755 
756                         if node.distances.contains_key(&dest) {
757                             error!("Destination NUMA node {} has been already set", dest);
758                             return Err(Error::InvalidNumaConfig);
759                         }
760 
761                         node.distances.insert(dest, dist);
762                     }
763                 }
764 
765                 #[cfg(target_arch = "x86_64")]
766                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
767                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
768                         let mm_sections = sgx_epc_region.epc_sections();
769                         for sgx_epc_section in sgx_epc_sections.iter() {
770                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
771                                 node.sgx_epc_sections.push(mm_section.clone());
772                             } else {
773                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
774                                 return Err(Error::InvalidNumaConfig);
775                             }
776                         }
777                     } else {
778                         error!("Missing SGX EPC region");
779                         return Err(Error::InvalidNumaConfig);
780                     }
781                 }
782 
783                 numa_nodes.insert(config.guest_numa_id, node);
784             }
785         }
786 
787         Ok(numa_nodes)
788     }
789 
790     #[allow(clippy::too_many_arguments)]
791     pub fn new(
792         vm_config: Arc<Mutex<VmConfig>>,
793         exit_evt: EventFd,
794         reset_evt: EventFd,
795         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
796         seccomp_action: &SeccompAction,
797         hypervisor: Arc<dyn hypervisor::Hypervisor>,
798         activate_evt: EventFd,
799         console_info: Option<ConsoleInfo>,
800         console_resize_pipe: Option<File>,
801         original_termios: Arc<Mutex<Option<termios>>>,
802         snapshot: Option<Snapshot>,
803         source_url: Option<&str>,
804         prefault: Option<bool>,
805     ) -> Result<Self> {
806         trace_scoped!("Vm::new");
807 
808         let timestamp = Instant::now();
809 
810         #[cfg(feature = "tdx")]
811         let tdx_enabled = if snapshot.is_some() {
812             false
813         } else {
814             vm_config.lock().unwrap().is_tdx_enabled()
815         };
816 
817         #[cfg(feature = "sev_snp")]
818         let sev_snp_enabled = if snapshot.is_some() {
819             false
820         } else {
821             vm_config.lock().unwrap().is_sev_snp_enabled()
822         };
823 
824         let vm = Self::create_hypervisor_vm(
825             &hypervisor,
826             #[cfg(feature = "tdx")]
827             tdx_enabled,
828             #[cfg(feature = "sev_snp")]
829             sev_snp_enabled,
830         )?;
831 
832         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
833 
834         let memory_manager = if let Some(snapshot) =
835             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
836         {
837             MemoryManager::new_from_snapshot(
838                 &snapshot,
839                 vm.clone(),
840                 &vm_config.lock().unwrap().memory.clone(),
841                 source_url,
842                 prefault.unwrap(),
843                 phys_bits,
844             )
845             .map_err(Error::MemoryManager)?
846         } else {
847             #[cfg(target_arch = "x86_64")]
848             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
849 
850             MemoryManager::new(
851                 vm.clone(),
852                 &vm_config.lock().unwrap().memory.clone(),
853                 None,
854                 phys_bits,
855                 #[cfg(feature = "tdx")]
856                 tdx_enabled,
857                 None,
858                 None,
859                 #[cfg(target_arch = "x86_64")]
860                 sgx_epc_config,
861             )
862             .map_err(Error::MemoryManager)?
863         };
864 
865         Vm::new_from_memory_manager(
866             vm_config,
867             memory_manager,
868             vm,
869             exit_evt,
870             reset_evt,
871             #[cfg(feature = "guest_debug")]
872             vm_debug_evt,
873             seccomp_action,
874             hypervisor,
875             activate_evt,
876             timestamp,
877             console_info,
878             console_resize_pipe,
879             original_termios,
880             snapshot,
881         )
882     }
883 
    /// Creates the hypervisor-level VM object.
    ///
    /// The required hypervisor extensions are checked first. The VM is then
    /// created with a type derived from `tdx_enabled` (feature `tdx`) or
    /// `sev_snp_enabled` (feature `sev_snp`), or with the default type when
    /// neither feature is compiled in. When both features are compiled in,
    /// the `tdx` branch is the one selected by `cfg_if!`.
    ///
    /// On x86_64 the identity map address, the TSS address and the split IRQ
    /// chip are configured before the VM is returned.
    ///
    /// # Panics
    ///
    /// Every hypervisor call in this function is unwrapped, so any failure
    /// (extension check, VM creation, x86_64 setup) panics instead of being
    /// returned as an `Err`.
    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        hypervisor.check_required_extensions().unwrap();

        cfg_if::cfg_if! {
            if #[cfg(feature = "tdx")] {
                // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true
                // Otherwise KVM_X86_LEGACY_VM: 0
                // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM
                let vm = hypervisor
                    .create_vm_with_type(u64::from(tdx_enabled))
                    .unwrap();
            } else if #[cfg(feature = "sev_snp")] {
                // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true
                // Otherwise SEV_SNP_DISABLED: 0
                // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false
                let vm = hypervisor
                    .create_vm_with_type(u64::from(sev_snp_enabled))
                    .unwrap();
            } else {
                // No confidential-computing feature compiled in: default VM type.
                let vm = hypervisor.create_vm().unwrap();
            }
        }

        // x86_64-only VM setup: identity map, TSS and split IRQ chip.
        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }
921 
922     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
923         let initramfs = self.initramfs.as_mut().unwrap();
924         let size: usize = initramfs
925             .seek(SeekFrom::End(0))
926             .map_err(|_| Error::InitramfsLoad)?
927             .try_into()
928             .unwrap();
929         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
930 
931         let address =
932             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
933         let address = GuestAddress(address);
934 
935         guest_mem
936             .read_volatile_from(address, initramfs, size)
937             .map_err(|_| Error::InitramfsLoad)?;
938 
939         info!("Initramfs loaded: address = 0x{:x}", address.0);
940         Ok(arch::InitramfsConfig { address, size })
941     }
942 
943     pub fn generate_cmdline(
944         payload: &PayloadConfig,
945         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
946     ) -> Result<Cmdline> {
947         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
948         if let Some(s) = payload.cmdline.as_ref() {
949             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
950         }
951 
952         #[cfg(target_arch = "aarch64")]
953         for entry in device_manager.lock().unwrap().cmdline_additions() {
954             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
955         }
956         Ok(cmdline)
957     }
958 
959     #[cfg(target_arch = "aarch64")]
960     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
961         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
962         let mem = uefi_flash.memory();
963         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
964             .map_err(Error::UefiLoad)?;
965         Ok(())
966     }
967 
968     #[cfg(target_arch = "aarch64")]
969     fn load_kernel(
970         firmware: Option<File>,
971         kernel: Option<File>,
972         memory_manager: Arc<Mutex<MemoryManager>>,
973     ) -> Result<EntryPoint> {
974         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
975         let mem = guest_memory.memory();
976         let entry_addr = match (firmware, kernel) {
977             (None, Some(mut kernel)) => {
978                 match linux_loader::loader::pe::PE::load(
979                     mem.deref(),
980                     Some(arch::layout::KERNEL_START),
981                     &mut kernel,
982                     None,
983                 ) {
984                     Ok(entry_addr) => entry_addr.kernel_load,
985                     // Try to load the binary as kernel PE file at first.
986                     // If failed, retry to load it as UEFI binary.
987                     // As the UEFI binary is formatless, it must be the last option to try.
988                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
989                         Self::load_firmware(&kernel, memory_manager)?;
990                         arch::layout::UEFI_START
991                     }
992                     Err(e) => {
993                         return Err(Error::KernelLoad(e));
994                     }
995                 }
996             }
997             (Some(firmware), None) => {
998                 Self::load_firmware(&firmware, memory_manager)?;
999                 arch::layout::UEFI_START
1000             }
1001             _ => return Err(Error::InvalidPayload),
1002         };
1003 
1004         Ok(EntryPoint { entry_addr })
1005     }
1006 
1007     #[cfg(feature = "igvm")]
1008     fn load_igvm(
1009         igvm: File,
1010         memory_manager: Arc<Mutex<MemoryManager>>,
1011         cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1012         #[cfg(feature = "sev_snp")] host_data: &Option<String>,
1013     ) -> Result<EntryPoint> {
1014         let res = igvm_loader::load_igvm(
1015             &igvm,
1016             memory_manager,
1017             cpu_manager.clone(),
1018             "",
1019             #[cfg(feature = "sev_snp")]
1020             host_data,
1021         )
1022         .map_err(Error::IgvmLoad)?;
1023 
1024         cfg_if::cfg_if! {
1025             if #[cfg(feature = "sev_snp")] {
1026                 let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
1027                     EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
1028                 } else {
1029                     EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
1030                 };
1031             } else {
1032                let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
1033             }
1034         };
1035         Ok(entry_point)
1036     }
1037 
1038     #[cfg(target_arch = "x86_64")]
1039     fn load_kernel(
1040         mut kernel: File,
1041         cmdline: Option<Cmdline>,
1042         memory_manager: Arc<Mutex<MemoryManager>>,
1043     ) -> Result<EntryPoint> {
1044         info!("Loading kernel");
1045 
1046         let mem = {
1047             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1048             guest_memory.memory()
1049         };
1050 
1051         // Try ELF binary with PVH boot.
1052         let entry_addr = linux_loader::loader::elf::Elf::load(
1053             mem.deref(),
1054             None,
1055             &mut kernel,
1056             Some(arch::layout::HIGH_RAM_START),
1057         )
1058         // Try loading kernel as bzImage.
1059         .or_else(|_| {
1060             BzImage::load(
1061                 mem.deref(),
1062                 None,
1063                 &mut kernel,
1064                 Some(arch::layout::HIGH_RAM_START),
1065             )
1066         })
1067         .map_err(Error::KernelLoad)?;
1068 
1069         if let Some(cmdline) = cmdline {
1070             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1071                 .map_err(Error::LoadCmdLine)?;
1072         }
1073 
1074         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1075             // Use the PVH kernel entry point to boot the guest
1076             info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1077             Ok(EntryPoint {
1078                 entry_addr,
1079                 setup_header: None,
1080             })
1081         } else if entry_addr.setup_header.is_some() {
1082             // Use the bzImage 32bit entry point to boot the guest
1083             info!(
1084                 "bzImage kernel loaded: entry_addr = 0x{:x}",
1085                 entry_addr.kernel_load.0
1086             );
1087             Ok(EntryPoint {
1088                 entry_addr: entry_addr.kernel_load,
1089                 setup_header: entry_addr.setup_header,
1090             })
1091         } else {
1092             Err(Error::KernelMissingPvhHeader)
1093         }
1094     }
1095 
1096     #[cfg(target_arch = "x86_64")]
1097     fn load_payload(
1098         payload: &PayloadConfig,
1099         memory_manager: Arc<Mutex<MemoryManager>>,
1100         #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1101         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1102     ) -> Result<EntryPoint> {
1103         trace_scoped!("load_payload");
1104         #[cfg(feature = "igvm")]
1105         {
1106             if let Some(_igvm_file) = &payload.igvm {
1107                 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
1108                 #[cfg(feature = "sev_snp")]
1109                 if sev_snp_enabled {
1110                     return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
1111                 }
1112                 #[cfg(not(feature = "sev_snp"))]
1113                 return Self::load_igvm(igvm, memory_manager, cpu_manager);
1114             }
1115         }
1116         match (
1117             &payload.firmware,
1118             &payload.kernel,
1119             &payload.initramfs,
1120             &payload.cmdline,
1121         ) {
1122             (Some(firmware), None, None, None) => {
1123                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1124                 Self::load_kernel(firmware, None, memory_manager)
1125             }
1126             (None, Some(kernel), _, _) => {
1127                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1128                 let cmdline = Self::generate_cmdline(payload)?;
1129                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1130             }
1131             _ => Err(Error::InvalidPayload),
1132         }
1133     }
1134 
1135     #[cfg(target_arch = "aarch64")]
1136     fn load_payload(
1137         payload: &PayloadConfig,
1138         memory_manager: Arc<Mutex<MemoryManager>>,
1139     ) -> Result<EntryPoint> {
1140         match (&payload.firmware, &payload.kernel) {
1141             (Some(firmware), None) => {
1142                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1143                 Self::load_kernel(Some(firmware), None, memory_manager)
1144             }
1145             (None, Some(kernel)) => {
1146                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1147                 Self::load_kernel(None, Some(kernel), memory_manager)
1148             }
1149             _ => Err(Error::InvalidPayload),
1150         }
1151     }
1152 
1153     fn load_payload_async(
1154         memory_manager: &Arc<Mutex<MemoryManager>>,
1155         config: &Arc<Mutex<VmConfig>>,
1156         #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
1157         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1158     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1159         // Kernel with TDX is loaded in a different manner
1160         #[cfg(feature = "tdx")]
1161         if config.lock().unwrap().is_tdx_enabled() {
1162             return Ok(None);
1163         }
1164 
1165         config
1166             .lock()
1167             .unwrap()
1168             .payload
1169             .as_ref()
1170             .map(|payload| {
1171                 let memory_manager = memory_manager.clone();
1172                 let payload = payload.clone();
1173                 #[cfg(feature = "igvm")]
1174                 let cpu_manager = cpu_manager.clone();
1175 
1176                 std::thread::Builder::new()
1177                     .name("payload_loader".into())
1178                     .spawn(move || {
1179                         Self::load_payload(
1180                             &payload,
1181                             memory_manager,
1182                             #[cfg(feature = "igvm")]
1183                             cpu_manager,
1184                             #[cfg(feature = "sev_snp")]
1185                             sev_snp_enabled,
1186                         )
1187                     })
1188                     .map_err(Error::KernelLoadThreadSpawn)
1189             })
1190             .transpose()
1191     }
1192 
1193     #[cfg(target_arch = "x86_64")]
1194     fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
1195         trace_scoped!("configure_system");
1196         info!("Configuring system");
1197         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1198 
1199         let initramfs_config = match self.initramfs {
1200             Some(_) => Some(self.load_initramfs(&mem)?),
1201             None => None,
1202         };
1203 
1204         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1205         let rsdp_addr = Some(rsdp_addr);
1206         let sgx_epc_region = self
1207             .memory_manager
1208             .lock()
1209             .unwrap()
1210             .sgx_epc_region()
1211             .as_ref()
1212             .cloned();
1213 
1214         let serial_number = self
1215             .config
1216             .lock()
1217             .unwrap()
1218             .platform
1219             .as_ref()
1220             .and_then(|p| p.serial_number.clone());
1221 
1222         let uuid = self
1223             .config
1224             .lock()
1225             .unwrap()
1226             .platform
1227             .as_ref()
1228             .and_then(|p| p.uuid.clone());
1229 
1230         let oem_strings = self
1231             .config
1232             .lock()
1233             .unwrap()
1234             .platform
1235             .as_ref()
1236             .and_then(|p| p.oem_strings.clone());
1237 
1238         let oem_strings = oem_strings
1239             .as_deref()
1240             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1241 
1242         let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1243 
1244         arch::configure_system(
1245             &mem,
1246             arch::layout::CMDLINE_START,
1247             arch::layout::CMDLINE_MAX_SIZE,
1248             &initramfs_config,
1249             boot_vcpus,
1250             entry_addr.setup_header,
1251             rsdp_addr,
1252             sgx_epc_region,
1253             serial_number.as_deref(),
1254             uuid.as_deref(),
1255             oem_strings.as_deref(),
1256             topology,
1257         )
1258         .map_err(Error::ConfigureSystem)?;
1259         Ok(())
1260     }
1261 
    /// Gathers the boot-time parameters and passes them to
    /// `arch::configure_system`.
    ///
    /// Neither `_rsdp_addr` nor `_entry_addr` is used by this implementation.
    /// The command line is regenerated from the configured payload and the
    /// device manager, the initramfs (if any) is loaded into the boot guest
    /// memory, and one `PciSpaceInfo` is built per PCI segment. The vGIC is
    /// fetched from the interrupt controller and the PMU is initialized on the
    /// CPU manager before the final call.
    ///
    /// # Errors
    ///
    /// vGIC and PMU failures are wrapped as `Error::ConfigureSystem` with the
    /// `SetupGic` and `VcpuInitPmu` platform errors respectively; failures of
    /// `arch::configure_system` map to `Error::ConfigureSystem` as well.
    ///
    /// # Panics
    ///
    /// Panics if the configuration has no payload, if no interrupt controller
    /// is available, or if the command line cannot be converted to a UTF-8
    /// C string.
    #[cfg(target_arch = "aarch64")]
    fn configure_system(
        &mut self,
        _rsdp_addr: GuestAddress,
        _entry_addr: EntryPoint,
    ) -> Result<()> {
        let cmdline = Self::generate_cmdline(
            self.config.lock().unwrap().payload.as_ref().unwrap(),
            &self.device_manager,
        )?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        // Work on a clone so the device manager lock is not held afterwards.
        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        // Describe the config space and 64-bit MMIO window of each PCI segment.
        for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
            let pci_space = PciSpaceInfo {
                pci_segment_id: pci_segment.id,
                mmio_config_address: pci_segment.mmio_config_address,
                pci_device_space_start: pci_segment.start_of_mem64_area,
                // The end address is inclusive, hence the `+ 1`.
                pci_device_space_size: pci_segment.end_of_mem64_area
                    - pci_segment.start_of_mem64_area
                    + 1,
            };
            pci_space_info.push(pci_space);
        }

        // Only the first element of the IOMMU attachment tuple is needed.
        let virtio_iommu_bdf = self
            .device_manager
            .lock()
            .unwrap()
            .iommu_attached_devices()
            .as_ref()
            .map(|(v, _)| *v);

        let vgic = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .get_vgic()
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::SetupGic,
                ))
            })?;

        // The PMU interrupt is a PPI, so 16 must be added to get the real IRQ number.
        let pmu_supported = self
            .cpu_manager
            .lock()
            .unwrap()
            .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::VcpuInitPmu,
                ))
            })?;

        arch::configure_system(
            &mem,
            cmdline.as_cstring().unwrap().to_str().unwrap(),
            vcpu_mpidrs,
            vcpu_topology,
            device_info,
            &initramfs_config,
            &pci_space_info,
            virtio_iommu_bdf.map(|bdf| bdf.into()),
            &vgic,
            &self.numa_nodes,
            pmu_supported,
        )
        .map_err(Error::ConfigureSystem)?;

        Ok(())
    }
1352 
    /// Returns the console resize pipe reported by the device manager, if any.
    ///
    /// Panics if the device manager mutex is poisoned.
    pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
        self.device_manager.lock().unwrap().console_resize_pipe()
    }
1356 
1357     pub fn shutdown(&mut self) -> Result<()> {
1358         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1359         let new_state = VmState::Shutdown;
1360 
1361         state.valid_transition(new_state)?;
1362 
1363         // Wake up the DeviceManager threads so they will get terminated cleanly
1364         self.device_manager
1365             .lock()
1366             .unwrap()
1367             .resume()
1368             .map_err(Error::Resume)?;
1369 
1370         self.cpu_manager
1371             .lock()
1372             .unwrap()
1373             .shutdown()
1374             .map_err(Error::CpuManager)?;
1375 
1376         // Wait for all the threads to finish
1377         for thread in self.threads.drain(..) {
1378             thread.join().map_err(Error::ThreadCleanup)?
1379         }
1380         *state = new_state;
1381 
1382         Ok(())
1383     }
1384 
1385     pub fn resize(
1386         &mut self,
1387         desired_vcpus: Option<u8>,
1388         desired_memory: Option<u64>,
1389         desired_balloon: Option<u64>,
1390     ) -> Result<()> {
1391         event!("vm", "resizing");
1392 
1393         if let Some(desired_vcpus) = desired_vcpus {
1394             if self
1395                 .cpu_manager
1396                 .lock()
1397                 .unwrap()
1398                 .resize(desired_vcpus)
1399                 .map_err(Error::CpuManager)?
1400             {
1401                 self.device_manager
1402                     .lock()
1403                     .unwrap()
1404                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1405                     .map_err(Error::DeviceManager)?;
1406             }
1407             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1408         }
1409 
1410         if let Some(desired_memory) = desired_memory {
1411             let new_region = self
1412                 .memory_manager
1413                 .lock()
1414                 .unwrap()
1415                 .resize(desired_memory)
1416                 .map_err(Error::MemoryManager)?;
1417 
1418             let memory_config = &mut self.config.lock().unwrap().memory;
1419 
1420             if let Some(new_region) = &new_region {
1421                 self.device_manager
1422                     .lock()
1423                     .unwrap()
1424                     .update_memory(new_region)
1425                     .map_err(Error::DeviceManager)?;
1426 
1427                 match memory_config.hotplug_method {
1428                     HotplugMethod::Acpi => {
1429                         self.device_manager
1430                             .lock()
1431                             .unwrap()
1432                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1433                             .map_err(Error::DeviceManager)?;
1434                     }
1435                     HotplugMethod::VirtioMem => {}
1436                 }
1437             }
1438 
1439             // We update the VM config regardless of the actual guest resize
1440             // operation result (happened or not), so that if the VM reboots
1441             // it will be running with the last configure memory size.
1442             match memory_config.hotplug_method {
1443                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1444                 HotplugMethod::VirtioMem => {
1445                     if desired_memory > memory_config.size {
1446                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1447                     } else {
1448                         memory_config.hotplugged_size = None;
1449                     }
1450                 }
1451             }
1452         }
1453 
1454         if let Some(desired_balloon) = desired_balloon {
1455             self.device_manager
1456                 .lock()
1457                 .unwrap()
1458                 .resize_balloon(desired_balloon)
1459                 .map_err(Error::DeviceManager)?;
1460 
1461             // Update the configuration value for the balloon size to ensure
1462             // a reboot would use the right value.
1463             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1464                 balloon_config.size = desired_balloon;
1465             }
1466         }
1467 
1468         event!("vm", "resized");
1469 
1470         Ok(())
1471     }
1472 
1473     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1474         let memory_config = &mut self.config.lock().unwrap().memory;
1475 
1476         if let Some(zones) = &mut memory_config.zones {
1477             for zone in zones.iter_mut() {
1478                 if zone.id == id {
1479                     if desired_memory >= zone.size {
1480                         let hotplugged_size = desired_memory - zone.size;
1481                         self.memory_manager
1482                             .lock()
1483                             .unwrap()
1484                             .resize_zone(&id, desired_memory - zone.size)
1485                             .map_err(Error::MemoryManager)?;
1486                         // We update the memory zone config regardless of the
1487                         // actual 'resize-zone' operation result (happened or
1488                         // not), so that if the VM reboots it will be running
1489                         // with the last configured memory zone size.
1490                         zone.hotplugged_size = Some(hotplugged_size);
1491 
1492                         return Ok(());
1493                     } else {
1494                         error!(
1495                             "Invalid to ask less ({}) than boot RAM ({}) for \
1496                             this memory zone",
1497                             desired_memory, zone.size,
1498                         );
1499                         return Err(Error::ResizeZone);
1500                     }
1501                 }
1502             }
1503         }
1504 
1505         error!("Could not find the memory zone {} for the resize", id);
1506         Err(Error::ResizeZone)
1507     }
1508 
1509     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1510         let pci_device_info = self
1511             .device_manager
1512             .lock()
1513             .unwrap()
1514             .add_device(&mut device_cfg)
1515             .map_err(Error::DeviceManager)?;
1516 
1517         // Update VmConfig by adding the new device. This is important to
1518         // ensure the device would be created in case of a reboot.
1519         {
1520             let mut config = self.config.lock().unwrap();
1521             add_to_config(&mut config.devices, device_cfg);
1522         }
1523 
1524         self.device_manager
1525             .lock()
1526             .unwrap()
1527             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1528             .map_err(Error::DeviceManager)?;
1529 
1530         Ok(pci_device_info)
1531     }
1532 
1533     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1534         let pci_device_info = self
1535             .device_manager
1536             .lock()
1537             .unwrap()
1538             .add_user_device(&mut device_cfg)
1539             .map_err(Error::DeviceManager)?;
1540 
1541         // Update VmConfig by adding the new device. This is important to
1542         // ensure the device would be created in case of a reboot.
1543         {
1544             let mut config = self.config.lock().unwrap();
1545             add_to_config(&mut config.user_devices, device_cfg);
1546         }
1547 
1548         self.device_manager
1549             .lock()
1550             .unwrap()
1551             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1552             .map_err(Error::DeviceManager)?;
1553 
1554         Ok(pci_device_info)
1555     }
1556 
1557     pub fn remove_device(&mut self, id: String) -> Result<()> {
1558         self.device_manager
1559             .lock()
1560             .unwrap()
1561             .remove_device(id.clone())
1562             .map_err(Error::DeviceManager)?;
1563 
1564         // Update VmConfig by removing the device. This is important to
1565         // ensure the device would not be created in case of a reboot.
1566         self.config.lock().unwrap().remove_device(&id);
1567 
1568         self.device_manager
1569             .lock()
1570             .unwrap()
1571             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1572             .map_err(Error::DeviceManager)?;
1573         Ok(())
1574     }
1575 
1576     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1577         let pci_device_info = self
1578             .device_manager
1579             .lock()
1580             .unwrap()
1581             .add_disk(&mut disk_cfg)
1582             .map_err(Error::DeviceManager)?;
1583 
1584         // Update VmConfig by adding the new device. This is important to
1585         // ensure the device would be created in case of a reboot.
1586         {
1587             let mut config = self.config.lock().unwrap();
1588             add_to_config(&mut config.disks, disk_cfg);
1589         }
1590 
1591         self.device_manager
1592             .lock()
1593             .unwrap()
1594             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1595             .map_err(Error::DeviceManager)?;
1596 
1597         Ok(pci_device_info)
1598     }
1599 
1600     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1601         let pci_device_info = self
1602             .device_manager
1603             .lock()
1604             .unwrap()
1605             .add_fs(&mut fs_cfg)
1606             .map_err(Error::DeviceManager)?;
1607 
1608         // Update VmConfig by adding the new device. This is important to
1609         // ensure the device would be created in case of a reboot.
1610         {
1611             let mut config = self.config.lock().unwrap();
1612             add_to_config(&mut config.fs, fs_cfg);
1613         }
1614 
1615         self.device_manager
1616             .lock()
1617             .unwrap()
1618             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1619             .map_err(Error::DeviceManager)?;
1620 
1621         Ok(pci_device_info)
1622     }
1623 
1624     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1625         let pci_device_info = self
1626             .device_manager
1627             .lock()
1628             .unwrap()
1629             .add_pmem(&mut pmem_cfg)
1630             .map_err(Error::DeviceManager)?;
1631 
1632         // Update VmConfig by adding the new device. This is important to
1633         // ensure the device would be created in case of a reboot.
1634         {
1635             let mut config = self.config.lock().unwrap();
1636             add_to_config(&mut config.pmem, pmem_cfg);
1637         }
1638 
1639         self.device_manager
1640             .lock()
1641             .unwrap()
1642             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1643             .map_err(Error::DeviceManager)?;
1644 
1645         Ok(pci_device_info)
1646     }
1647 
1648     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1649         let pci_device_info = self
1650             .device_manager
1651             .lock()
1652             .unwrap()
1653             .add_net(&mut net_cfg)
1654             .map_err(Error::DeviceManager)?;
1655 
1656         // Update VmConfig by adding the new device. This is important to
1657         // ensure the device would be created in case of a reboot.
1658         {
1659             let mut config = self.config.lock().unwrap();
1660             add_to_config(&mut config.net, net_cfg);
1661         }
1662 
1663         self.device_manager
1664             .lock()
1665             .unwrap()
1666             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1667             .map_err(Error::DeviceManager)?;
1668 
1669         Ok(pci_device_info)
1670     }
1671 
1672     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1673         let pci_device_info = self
1674             .device_manager
1675             .lock()
1676             .unwrap()
1677             .add_vdpa(&mut vdpa_cfg)
1678             .map_err(Error::DeviceManager)?;
1679 
1680         // Update VmConfig by adding the new device. This is important to
1681         // ensure the device would be created in case of a reboot.
1682         {
1683             let mut config = self.config.lock().unwrap();
1684             add_to_config(&mut config.vdpa, vdpa_cfg);
1685         }
1686 
1687         self.device_manager
1688             .lock()
1689             .unwrap()
1690             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1691             .map_err(Error::DeviceManager)?;
1692 
1693         Ok(pci_device_info)
1694     }
1695 
1696     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1697         let pci_device_info = self
1698             .device_manager
1699             .lock()
1700             .unwrap()
1701             .add_vsock(&mut vsock_cfg)
1702             .map_err(Error::DeviceManager)?;
1703 
1704         // Update VmConfig by adding the new device. This is important to
1705         // ensure the device would be created in case of a reboot.
1706         {
1707             let mut config = self.config.lock().unwrap();
1708             config.vsock = Some(vsock_cfg);
1709         }
1710 
1711         self.device_manager
1712             .lock()
1713             .unwrap()
1714             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1715             .map_err(Error::DeviceManager)?;
1716 
1717         Ok(pci_device_info)
1718     }
1719 
1720     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1721         Ok(self.device_manager.lock().unwrap().counters())
1722     }
1723 
1724     #[cfg(feature = "tdx")]
1725     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1726         use arch::x86_64::tdx::*;
1727 
1728         let firmware_path = self
1729             .config
1730             .lock()
1731             .unwrap()
1732             .payload
1733             .as_ref()
1734             .unwrap()
1735             .firmware
1736             .clone()
1737             .ok_or(Error::TdxFirmwareMissing)?;
1738         // The TDVF file contains a table of section as well as code
1739         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1740 
1741         // For all the sections allocate some RAM backing them
1742         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1743     }
1744 
1745     #[cfg(feature = "tdx")]
1746     fn hob_memory_resources(
1747         mut sorted_sections: Vec<TdvfSection>,
1748         guest_memory: &GuestMemoryMmap,
1749     ) -> Vec<(u64, u64, bool)> {
1750         let mut list = Vec::new();
1751 
1752         let mut current_section = sorted_sections.pop();
1753 
1754         // RAM regions interleaved with TDVF sections
1755         let mut next_start_addr = 0;
1756         for region in guest_memory.iter() {
1757             let region_start = region.start_addr().0;
1758             let region_end = region.last_addr().0;
1759             if region_start > next_start_addr {
1760                 next_start_addr = region_start;
1761             }
1762 
1763             loop {
1764                 let (start, size, ram) = if let Some(section) = &current_section {
1765                     if section.address <= next_start_addr {
1766                         (section.address, section.size, false)
1767                     } else {
1768                         let last_addr = std::cmp::min(section.address - 1, region_end);
1769                         (next_start_addr, last_addr - next_start_addr + 1, true)
1770                     }
1771                 } else {
1772                     (next_start_addr, region_end - next_start_addr + 1, true)
1773                 };
1774 
1775                 list.push((start, size, ram));
1776 
1777                 if !ram {
1778                     current_section = sorted_sections.pop();
1779                 }
1780 
1781                 next_start_addr = start + size;
1782 
1783                 if region_start > next_start_addr {
1784                     next_start_addr = region_start;
1785                 }
1786 
1787                 if next_start_addr > region_end {
1788                     break;
1789                 }
1790             }
1791         }
1792 
1793         // Once all the interleaved sections have been processed, let's simply
1794         // pull the remaining ones.
1795         if let Some(section) = current_section {
1796             list.push((section.address, section.size, false));
1797         }
1798         while let Some(section) = sorted_sections.pop() {
1799             list.push((section.address, section.size, false));
1800         }
1801 
1802         list
1803     }
1804 
1805     #[cfg(feature = "tdx")]
         /// Backs the TDVF sections with guest RAM, fills them in and builds the
         /// TD HOB.
         ///
         /// The work is done in three passes over `sections`:
         /// 1. every section whose address is not inside the boot guest memory
         ///    gets a new RAM region from the memory manager;
         /// 2. each section is populated according to its type (firmware
         ///    volumes copied from the firmware file, payload copied from
         ///    `self.kernel`, command line written for the payload parameters,
         ///    HOB address recorded);
         /// 3. the HOB is generated: memory resources, MMIO resources, ACPI
         ///    tables and, when a payload was copied, the payload info.
         ///
         /// `guid_found` is forwarded untouched to `TdHob::add_memory_resource`.
         ///
         /// Returns the address of the `TdHob` section.
         ///
         /// # Panics
         ///
         /// Panics if the configuration has no payload, if copying data into
         /// guest memory fails, or if `sections` contains no `TdHob` section.
1806     fn populate_tdx_sections(
1807         &mut self,
1808         sections: &[TdvfSection],
1809         guid_found: bool,
1810     ) -> Result<Option<u64>> {
1811         use arch::x86_64::tdx::*;
1812         // Get the memory end *before* we start adding TDVF ram regions
1813         let boot_guest_memory = self
1814             .memory_manager
1815             .lock()
1816             .as_ref()
1817             .unwrap()
1818             .boot_guest_memory();
1819         for section in sections {
1820             // No need to allocate if the section falls within guest RAM ranges
1821             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1822                 info!(
1823                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1824                     section
1825                 );
1826                 continue;
1827             }
1828 
1829             info!("Allocating TDVF Section: {:x?}", section);
1830             self.memory_manager
1831                 .lock()
1832                 .unwrap()
1833                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1834                 .map_err(Error::AllocatingTdvfMemory)?;
1835         }
1836 
1837         // The TDVF file contains a table of section as well as code
1838         let firmware_path = self
1839             .config
1840             .lock()
1841             .unwrap()
1842             .payload
1843             .as_ref()
1844             .unwrap()
1845             .firmware
1846             .clone()
1847             .ok_or(Error::TdxFirmwareMissing)?;
1848         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1849 
1850         // The guest memory at this point now has all the required regions so it
1851         // is safe to copy from the TDVF file into it.
1852         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1853         let mem = guest_memory.memory();
1854         let mut payload_info = None;
1855         let mut hob_offset = None;
1856         for section in sections {
1857             info!("Populating TDVF Section: {:x?}", section);
1858             match section.r#type {
1859                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1860                     info!("Copying section to guest memory");
                         // Copy `data_size` bytes found at `data_offset` in the
                         // firmware file to the section address.
1861                     firmware_file
1862                         .seek(SeekFrom::Start(section.data_offset as u64))
1863                         .map_err(Error::LoadTdvf)?;
1864                     mem.read_volatile_from(
1865                         GuestAddress(section.address),
1866                         &mut firmware_file,
1867                         section.data_size as usize,
1868                     )
1869                     .unwrap();
1870                 }
1871                 TdvfSectionType::TdHob => {
                         // Only remember where the HOB goes, it is generated
                         // once every section has been handled.
1872                     hob_offset = Some(section.address);
1873                 }
1874                 TdvfSectionType::Payload => {
1875                     info!("Copying payload to guest memory");
                         // Nothing is copied when no kernel file is attached.
1876                     if let Some(payload_file) = self.kernel.as_mut() {
                             // Seeking to the end yields the size of the file.
1877                         let payload_size = payload_file
1878                             .seek(SeekFrom::End(0))
1879                             .map_err(Error::LoadPayload)?;
1880 
                             // The setup header is read from file offset 0x1f1.
1881                         payload_file
1882                             .seek(SeekFrom::Start(0x1f1))
1883                             .map_err(Error::LoadPayload)?;
1884 
1885                         let mut payload_header = linux_loader::bootparam::setup_header::default();
1886                         payload_file
1887                             .read_volatile(&mut payload_header.as_bytes())
1888                             .unwrap();
1889 
                             // 0x5372_6448 is "HdrS" read as a little-endian u32.
1890                         if payload_header.header != 0x5372_6448 {
1891                             return Err(Error::InvalidPayloadType);
1892                         }
1893 
                             // NOTE(review): presumably boot protocol >= 2.00 with
                             // the LOADED_HIGH bit of `loadflags` set, as described
                             // by the Linux x86 boot protocol - confirm.
1894                         if (payload_header.version < 0x0200)
1895                             || ((payload_header.loadflags & 0x1) == 0x0)
1896                         {
1897                             return Err(Error::InvalidPayloadType);
1898                         }
1899 
                             // The whole file, header included, is copied to the
                             // section address.
1900                         payload_file.rewind().map_err(Error::LoadPayload)?;
1901                         mem.read_volatile_from(
1902                             GuestAddress(section.address),
1903                             payload_file,
1904                             payload_size as usize,
1905                         )
1906                         .unwrap();
1907 
1908                         // Create the payload info that will be inserted into
1909                         // the HOB.
1910                         payload_info = Some(PayloadInfo {
1911                             image_type: PayloadImageType::BzImage,
1912                             entry_point: section.address,
1913                         });
1914                     }
1915                 }
1916                 TdvfSectionType::PayloadParam => {
1917                     info!("Copying payload parameters to guest memory");
                         // The command line is written as a NUL-terminated string.
1918                     let cmdline = Self::generate_cmdline(
1919                         self.config.lock().unwrap().payload.as_ref().unwrap(),
1920                     )?;
1921                     mem.write_slice(
1922                         cmdline.as_cstring().unwrap().as_bytes_with_nul(),
1923                         GuestAddress(section.address),
1924                     )
1925                     .unwrap();
1926                 }
1927                 _ => {}
1928             }
1929         }
1930 
1931         // Generate HOB
1932         let mut hob = TdHob::start(hob_offset.unwrap());
1933 
             // Only the TempMem sections take part in the memory resources.
1934         let mut sorted_sections = sections.to_vec();
1935         sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));
1936 
             // Ascending sort followed by a reverse: the lowest address ends up
             // last, which is the element `hob_memory_resources` pops first.
1937         sorted_sections.sort_by_key(|section| section.address);
1938         sorted_sections.reverse();
1939 
1940         for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
1941             hob.add_memory_resource(&mem, start, size, ram, guid_found)
1942                 .map_err(Error::PopulateHob)?;
1943         }
1944 
1945         // MMIO regions
1946         hob.add_mmio_resource(
1947             &mem,
1948             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1949             arch::layout::APIC_START.raw_value()
1950                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1951         )
1952         .map_err(Error::PopulateHob)?;
1953         let start_of_device_area = self
1954             .memory_manager
1955             .lock()
1956             .unwrap()
1957             .start_of_device_area()
1958             .raw_value();
1959         let end_of_device_area = self
1960             .memory_manager
1961             .lock()
1962             .unwrap()
1963             .end_of_device_area()
1964             .raw_value();
1965         hob.add_mmio_resource(
1966             &mem,
1967             start_of_device_area,
1968             end_of_device_area - start_of_device_area,
1969         )
1970         .map_err(Error::PopulateHob)?;
1971 
1972         // Loop over the ACPI tables and copy them to the HOB.
1973 
1974         for acpi_table in crate::acpi::create_acpi_tables_tdx(
1975             &self.device_manager,
1976             &self.cpu_manager,
1977             &self.memory_manager,
1978             &self.numa_nodes,
1979         ) {
1980             hob.add_acpi_table(&mem, acpi_table.as_slice())
1981                 .map_err(Error::PopulateHob)?;
1982         }
1983 
1984         // If a payload info has been created, let's insert it into the HOB.
1985         if let Some(payload_info) = payload_info {
1986             hob.add_payload(&mem, payload_info)
1987                 .map_err(Error::PopulateHob)?;
1988         }
1989 
1990         hob.finish(&mem).map_err(Error::PopulateHob)?;
1991 
             // Always `Some` at this point, `hob_offset.unwrap()` above would
             // have panicked otherwise.
1992         Ok(hob_offset)
1993     }
1994 
1995     #[cfg(feature = "tdx")]
1996     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1997         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1998         let mem = guest_memory.memory();
1999 
2000         for section in sections {
2001             self.vm
2002                 .tdx_init_memory_region(
2003                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
2004                     section.address,
2005                     section.size,
2006                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
2007                     section.attributes == 1,
2008                 )
2009                 .map_err(Error::InitializeTdxMemoryRegion)?;
2010         }
2011 
2012         Ok(())
2013     }
2014 
2015     // Creates ACPI tables
2016     // In case of TDX being used, this is a no-op since the tables will be
2017     // created and passed when populating the HOB.
2018 
2019     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2020         #[cfg(feature = "tdx")]
2021         if self.config.lock().unwrap().is_tdx_enabled() {
2022             return None;
2023         }
2024         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2025         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2026         let rsdp_addr = crate::acpi::create_acpi_tables(
2027             &mem,
2028             &self.device_manager,
2029             &self.cpu_manager,
2030             &self.memory_manager,
2031             &self.numa_nodes,
2032             tpm_enabled,
2033         );
2034         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2035 
2036         Some(rsdp_addr)
2037     }
2038 
2039     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2040         trace_scoped!("entry_point");
2041 
2042         self.load_payload_handle
2043             .take()
2044             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2045             .transpose()
2046     }
2047 
2048     pub fn boot(&mut self) -> Result<()> {
             // Boots the VM: creates the ACPI tables, waits for the payload to
             // be loaded, configures the vCPUs and the system, performs the TDX
             // setup when enabled and finally starts the boot vCPUs. On success
             // the VM state becomes `Running`, or `BreakPoint` when
             // `stop_on_boot` is set.
             //
             // Booting a paused VM is handled as a resume.
2049         trace_scoped!("Vm::boot");
2050         let current_state = self.get_state()?;
2051         if current_state == VmState::Paused {
2052             return self.resume().map_err(Error::Resume);
2053         }
2054 
2055         let new_state = if self.stop_on_boot {
2056             VmState::BreakPoint
2057         } else {
2058             VmState::Running
2059         };
             // Refuse to go any further if the transition is not allowed.
2060         current_state.valid_transition(new_state)?;
2061 
2062         // Do earlier to parallelise with loading kernel
2063         #[cfg(target_arch = "x86_64")]
2064         cfg_if::cfg_if! {
2065             if #[cfg(feature = "sev_snp")] {
2066                 let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
2067                 let rsdp_addr = if sev_snp_enabled {
2068                     // In case of SEV-SNP guest ACPI tables are provided via
2069                     // IGVM. So skip the creation of ACPI tables and set the
2070                     // rsdp addr to None.
2071                     None
2072                 } else {
2073                     self.create_acpi_tables()
2074                 };
2075             } else {
2076                 let rsdp_addr = self.create_acpi_tables();
2077             }
2078         }
2079 
2080         // Load kernel synchronously or if asynchronous then wait for load to
2081         // finish.
2082         let entry_point = self.entry_point()?;
2083 
2084         #[cfg(feature = "tdx")]
2085         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2086 
2087         // Configure the vcpus that have been created
2088         let vcpus = self.cpu_manager.lock().unwrap().vcpus();
2089         for vcpu in vcpus {
2090             let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
                 // The boot setup is only provided when an entry point exists.
2091             let boot_setup = entry_point.map(|e| (e, guest_memory));
2092             self.cpu_manager
2093                 .lock()
2094                 .unwrap()
2095                 .configure_vcpu(vcpu, boot_setup)
2096                 .map_err(Error::CpuManager)?;
2097         }
2098 
2099         #[cfg(feature = "tdx")]
2100         let (sections, guid_found) = if tdx_enabled {
2101             self.extract_tdvf_sections()?
2102         } else {
2103             (Vec::new(), false)
2104         };
2105 
2106         // Configuring the TDX regions requires that the vCPUs are created.
2107         #[cfg(feature = "tdx")]
2108         let hob_address = if tdx_enabled {
2109             // TDX sections are written to memory.
2110             self.populate_tdx_sections(&sections, guid_found)?
2111         } else {
2112             None
2113         };
2114 
2115         // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
2116         // available after they are configured
2117         #[cfg(target_arch = "aarch64")]
2118         let rsdp_addr = self.create_acpi_tables();
2119 
2120         // Configure shared state based on loaded kernel
2121         entry_point
2122             .map(|entry_point| {
2123                 // Safe to unwrap rsdp_addr as we know it can't be None when
2124                 // the entry_point is Some.
2125                 self.configure_system(rsdp_addr.unwrap(), entry_point)
2126             })
2127             .transpose()?;
2128 
2129         #[cfg(target_arch = "x86_64")]
2130         // Note: For x86, always call this function before invoking start boot vcpus.
2131         // Otherwise guest would fail to boot because we haven't created the
2132         // userspace mappings to update the hypervisor about the memory mappings.
2133         // These mappings must be created before we start the vCPU threads for
2134         // the very first time.
2135         self.memory_manager
2136             .lock()
2137             .unwrap()
2138             .allocate_address_space()
2139             .map_err(Error::MemoryManager)?;
2140 
             // `hob_address` is only `Some` when TDX is enabled.
2141         #[cfg(feature = "tdx")]
2142         if let Some(hob_address) = hob_address {
2143             // With the HOB address extracted the vCPUs can have
2144             // their TDX state configured.
2145             self.cpu_manager
2146                 .lock()
2147                 .unwrap()
2148                 .initialize_tdx(hob_address)
2149                 .map_err(Error::CpuManager)?;
2150             // Let the hypervisor know which memory ranges are shared with the
2151             // guest. This prevents the guest from ignoring/discarding memory
2152             // regions provided by the host.
2153             self.init_tdx_memory(&sections)?;
2154             // With TDX memory and CPU state configured TDX setup is complete
2155             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
2156         }
2157 
2158         // Resume the vm for MSHV
2159         if current_state == VmState::Created {
2160             self.vm.resume().map_err(Error::ResumeVm)?;
2161         }
2162 
2163         self.cpu_manager
2164             .lock()
2165             .unwrap()
2166             .start_boot_vcpus(new_state == VmState::BreakPoint)
2167             .map_err(Error::CpuManager)?;
2168 
             // The new state is only recorded once the vCPUs have been started.
2169         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
2170         *state = new_state;
2171         Ok(())
2172     }
2173 
2174     pub fn restore(&mut self) -> Result<()> {
2175         event!("vm", "restoring");
2176 
2177         #[cfg(target_arch = "x86_64")]
2178         // Note: For x86, always call this function before invoking start boot vcpus.
2179         // Otherwise guest would fail to boot because we haven't created the
2180         // userspace mappings to update the hypervisor about the memory mappings.
2181         // These mappings must be created before we start the vCPU threads for
2182         // the very first time for the restored VM.
2183         self.memory_manager
2184             .lock()
2185             .unwrap()
2186             .allocate_address_space()
2187             .map_err(Error::MemoryManager)?;
2188 
2189         // Now we can start all vCPUs from here.
2190         self.cpu_manager
2191             .lock()
2192             .unwrap()
2193             .start_restored_vcpus()
2194             .map_err(Error::CpuManager)?;
2195 
2196         event!("vm", "restored");
2197         Ok(())
2198     }
2199 
2200     /// Gets a thread-safe reference counted pointer to the VM configuration.
2201     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2202         Arc::clone(&self.config)
2203     }
2204 
2205     /// Get the VM state. Returns an error if the state is poisoned.
2206     pub fn get_state(&self) -> Result<VmState> {
2207         self.state
2208             .try_read()
2209             .map_err(|_| Error::PoisonedState)
2210             .map(|state| *state)
2211     }
2212 
2213     /// Gets the actual size of the balloon.
2214     pub fn balloon_size(&self) -> u64 {
2215         self.device_manager.lock().unwrap().balloon_size()
2216     }
2217 
2218     pub fn send_memory_fds(
2219         &mut self,
2220         socket: &mut UnixStream,
2221     ) -> std::result::Result<(), MigratableError> {
2222         for (slot, fd) in self
2223             .memory_manager
2224             .lock()
2225             .unwrap()
2226             .memory_slot_fds()
2227             .drain()
2228         {
2229             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2230                 .write_to(socket)
2231                 .map_err(|e| {
2232                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2233                 })?;
2234             socket
2235                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2236                 .map_err(|e| {
2237                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2238                 })?;
2239 
2240             let res = Response::read_from(socket)?;
2241             if res.status() != Status::Ok {
2242                 warn!("Error during memory fd migration");
2243                 Request::abandon().write_to(socket)?;
2244                 Response::read_from(socket).ok();
2245                 return Err(MigratableError::MigrateSend(anyhow!(
2246                     "Error during memory fd migration"
2247                 )));
2248             }
2249         }
2250 
2251         Ok(())
2252     }
2253 
2254     pub fn send_memory_regions<F>(
2255         &mut self,
2256         ranges: &MemoryRangeTable,
2257         fd: &mut F,
2258     ) -> std::result::Result<(), MigratableError>
2259     where
2260         F: WriteVolatile,
2261     {
2262         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2263         let mem = guest_memory.memory();
2264 
2265         for range in ranges.regions() {
2266             let mut offset: u64 = 0;
2267             // Here we are manually handling the retry in case we can't the
2268             // whole region at once because we can't use the implementation
2269             // from vm-memory::GuestMemory of write_all_to() as it is not
2270             // following the correct behavior. For more info about this issue
2271             // see: https://github.com/rust-vmm/vm-memory/issues/174
2272             loop {
2273                 let bytes_written = mem
2274                     .write_volatile_to(
2275                         GuestAddress(range.gpa + offset),
2276                         fd,
2277                         (range.length - offset) as usize,
2278                     )
2279                     .map_err(|e| {
2280                         MigratableError::MigrateSend(anyhow!(
2281                             "Error transferring memory to socket: {}",
2282                             e
2283                         ))
2284                     })?;
2285                 offset += bytes_written as u64;
2286 
2287                 if offset == range.length {
2288                     break;
2289                 }
2290             }
2291         }
2292 
2293         Ok(())
2294     }
2295 
2296     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2297         self.memory_manager
2298             .lock()
2299             .unwrap()
2300             .memory_range_table(false)
2301     }
2302 
2303     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2304         self.device_manager.lock().unwrap().device_tree()
2305     }
2306 
2307     pub fn activate_virtio_devices(&self) -> Result<()> {
2308         self.device_manager
2309             .lock()
2310             .unwrap()
2311             .activate_virtio_devices()
2312             .map_err(Error::ActivateVirtioDevices)
2313     }
2314 
2315     #[cfg(target_arch = "x86_64")]
2316     pub fn power_button(&self) -> Result<()> {
2317         return self
2318             .device_manager
2319             .lock()
2320             .unwrap()
2321             .notify_power_button()
2322             .map_err(Error::PowerButton);
2323     }
2324 
2325     #[cfg(target_arch = "aarch64")]
2326     pub fn power_button(&self) -> Result<()> {
2327         self.device_manager
2328             .lock()
2329             .unwrap()
2330             .notify_power_button()
2331             .map_err(Error::PowerButton)
2332     }
2333 
2334     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2335         self.memory_manager.lock().unwrap().snapshot_data()
2336     }
2337 
2338     #[cfg(feature = "guest_debug")]
2339     pub fn debug_request(
2340         &mut self,
2341         gdb_request: &GdbRequestPayload,
2342         cpu_id: usize,
2343     ) -> Result<GdbResponsePayload> {
2344         use GdbRequestPayload::*;
2345         match gdb_request {
2346             SetSingleStep(single_step) => {
2347                 self.set_guest_debug(cpu_id, &[], *single_step)
2348                     .map_err(Error::Debug)?;
2349             }
2350             SetHwBreakPoint(addrs) => {
2351                 self.set_guest_debug(cpu_id, addrs, false)
2352                     .map_err(Error::Debug)?;
2353             }
2354             Pause => {
2355                 self.debug_pause().map_err(Error::Debug)?;
2356             }
2357             Resume => {
2358                 self.debug_resume().map_err(Error::Debug)?;
2359             }
2360             ReadRegs => {
2361                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2362                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2363             }
2364             WriteRegs(regs) => {
2365                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2366             }
2367             ReadMem(vaddr, len) => {
2368                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2369                 let mem = self
2370                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2371                     .map_err(Error::Debug)?;
2372                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2373             }
2374             WriteMem(vaddr, data) => {
2375                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2376                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2377                     .map_err(Error::Debug)?;
2378             }
2379             ActiveVcpus => {
2380                 let active_vcpus = self.active_vcpus();
2381                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2382             }
2383         }
2384         Ok(GdbResponsePayload::CommandComplete)
2385     }
2386 
2387     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2388     fn get_dump_state(
2389         &mut self,
2390         destination_url: &str,
2391     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2392         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2393         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2394         let mut elf_phdr_num = 1;
2395         let elf_sh_info = 0;
2396         let coredump_file_path = url_to_file(destination_url)?;
2397         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2398 
2399         if mapping_num < UINT16_MAX - 2 {
2400             elf_phdr_num += mapping_num as u16;
2401         } else {
2402             panic!("mapping num beyond 65535 not supported");
2403         }
2404         let coredump_file = OpenOptions::new()
2405             .read(true)
2406             .write(true)
2407             .create_new(true)
2408             .open(coredump_file_path)
2409             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2410 
2411         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2412         let mem_data = self
2413             .memory_manager
2414             .lock()
2415             .unwrap()
2416             .coredump_memory_regions(mem_offset);
2417 
2418         Ok(DumpState {
2419             elf_note_size,
2420             elf_phdr_num,
2421             elf_sh_info,
2422             mem_offset,
2423             mem_info: Some(mem_data),
2424             file: Some(coredump_file),
2425         })
2426     }
2427 
2428     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2429     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2430         size_of::<elf::Elf64_Ehdr>() as u64
2431             + note_size as u64
2432             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2433     }
2434 
2435     pub fn nmi(&self) -> Result<()> {
2436         return self
2437             .cpu_manager
2438             .lock()
2439             .unwrap()
2440             .nmi()
2441             .map_err(|_| Error::ErrorNmi);
2442     }
2443 }
2444 
2445 impl Pausable for Vm {
2446     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2447         event!("vm", "pausing");
2448         let mut state = self
2449             .state
2450             .try_write()
2451             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2452         let new_state = VmState::Paused;
2453 
2454         state
2455             .valid_transition(new_state)
2456             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2457 
2458         #[cfg(target_arch = "x86_64")]
2459         {
2460             let mut clock = self
2461                 .vm
2462                 .get_clock()
2463                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2464             clock.reset_flags();
2465             self.saved_clock = Some(clock);
2466         }
2467 
2468         // Before pausing the vCPUs activate any pending virtio devices that might
2469         // need activation between starting the pause (or e.g. a migration it's part of)
2470         self.activate_virtio_devices().map_err(|e| {
2471             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2472         })?;
2473 
2474         self.cpu_manager.lock().unwrap().pause()?;
2475         self.device_manager.lock().unwrap().pause()?;
2476 
2477         self.vm
2478             .pause()
2479             .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?;
2480 
2481         *state = new_state;
2482 
2483         event!("vm", "paused");
2484         Ok(())
2485     }
2486 
2487     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2488         event!("vm", "resuming");
2489         let current_state = self.get_state().unwrap();
2490         let mut state = self
2491             .state
2492             .try_write()
2493             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2494         let new_state = VmState::Running;
2495 
2496         state
2497             .valid_transition(new_state)
2498             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2499 
2500         self.cpu_manager.lock().unwrap().resume()?;
2501         #[cfg(target_arch = "x86_64")]
2502         {
2503             if let Some(clock) = &self.saved_clock {
2504                 self.vm.set_clock(clock).map_err(|e| {
2505                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2506                 })?;
2507             }
2508         }
2509 
2510         if current_state == VmState::Paused {
2511             self.vm
2512                 .resume()
2513                 .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?;
2514         }
2515 
2516         self.device_manager.lock().unwrap().resume()?;
2517 
2518         // And we're back to the Running state.
2519         *state = new_state;
2520         event!("vm", "resumed");
2521         Ok(())
2522     }
2523 }
2524 
/// VM-level state that `Vm::snapshot` serializes as the top-level snapshot
/// state, next to the CPU, memory and device manager snapshots it attaches.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    /// VM clock; `Vm::snapshot` copies it from `saved_clock` (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    pub clock: Option<hypervisor::ClockData>,
    /// CPUID entries returned by `arch::generate_common_cpuid` when the
    /// snapshot is taken (KVM on x86_64 only).
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}
2532 
/// Identifier that `Vm` reports from `Snapshottable::id()`.
pub const VM_SNAPSHOT_ID: &str = "vm";
2534 impl Snapshottable for Vm {
2535     fn id(&self) -> String {
2536         VM_SNAPSHOT_ID.to_string()
2537     }
2538 
2539     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2540         event!("vm", "snapshotting");
2541 
2542         #[cfg(feature = "tdx")]
2543         {
2544             if self.config.lock().unwrap().is_tdx_enabled() {
2545                 return Err(MigratableError::Snapshot(anyhow!(
2546                     "Snapshot not possible with TDX VM"
2547                 )));
2548             }
2549         }
2550 
2551         let current_state = self.get_state().unwrap();
2552         if current_state != VmState::Paused {
2553             return Err(MigratableError::Snapshot(anyhow!(
2554                 "Trying to snapshot while VM is running"
2555             )));
2556         }
2557 
2558         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2559         let common_cpuid = {
2560             let amx = self.config.lock().unwrap().cpus.features.amx;
2561             let phys_bits = physical_bits(
2562                 &self.hypervisor,
2563                 self.config.lock().unwrap().cpus.max_phys_bits,
2564             );
2565             arch::generate_common_cpuid(
2566                 &self.hypervisor,
2567                 &arch::CpuidConfig {
2568                     sgx_epc_sections: None,
2569                     phys_bits,
2570                     kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2571                     #[cfg(feature = "tdx")]
2572                     tdx: false,
2573                     amx,
2574                 },
2575             )
2576             .map_err(|e| {
2577                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2578             })?
2579         };
2580 
2581         let vm_snapshot_state = VmSnapshot {
2582             #[cfg(target_arch = "x86_64")]
2583             clock: self.saved_clock,
2584             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2585             common_cpuid,
2586         };
2587 
2588         let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?;
2589 
2590         let (id, snapshot) = {
2591             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2592             (cpu_manager.id(), cpu_manager.snapshot()?)
2593         };
2594         vm_snapshot.add_snapshot(id, snapshot);
2595         let (id, snapshot) = {
2596             let mut memory_manager = self.memory_manager.lock().unwrap();
2597             (memory_manager.id(), memory_manager.snapshot()?)
2598         };
2599         vm_snapshot.add_snapshot(id, snapshot);
2600         let (id, snapshot) = {
2601             let mut device_manager = self.device_manager.lock().unwrap();
2602             (device_manager.id(), device_manager.snapshot()?)
2603         };
2604         vm_snapshot.add_snapshot(id, snapshot);
2605 
2606         event!("vm", "snapshotted");
2607         Ok(vm_snapshot)
2608     }
2609 }
2610 
2611 impl Transportable for Vm {
2612     fn send(
2613         &self,
2614         snapshot: &Snapshot,
2615         destination_url: &str,
2616     ) -> std::result::Result<(), MigratableError> {
2617         let mut snapshot_config_path = url_to_path(destination_url)?;
2618         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2619 
2620         // Create the snapshot config file
2621         let mut snapshot_config_file = OpenOptions::new()
2622             .read(true)
2623             .write(true)
2624             .create_new(true)
2625             .open(snapshot_config_path)
2626             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2627 
2628         // Serialize and write the snapshot config
2629         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2630             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2631 
2632         snapshot_config_file
2633             .write(vm_config.as_bytes())
2634             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2635 
2636         let mut snapshot_state_path = url_to_path(destination_url)?;
2637         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2638 
2639         // Create the snapshot state file
2640         let mut snapshot_state_file = OpenOptions::new()
2641             .read(true)
2642             .write(true)
2643             .create_new(true)
2644             .open(snapshot_state_path)
2645             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2646 
2647         // Serialize and write the snapshot state
2648         let vm_state =
2649             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2650 
2651         snapshot_state_file
2652             .write(&vm_state)
2653             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2654 
2655         // Tell the memory manager to also send/write its own snapshot.
2656         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2657             self.memory_manager
2658                 .lock()
2659                 .unwrap()
2660                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2661         } else {
2662             return Err(MigratableError::Restore(anyhow!(
2663                 "Missing memory manager snapshot"
2664             )));
2665         }
2666 
2667         Ok(())
2668     }
2669 }
2670 
2671 impl Migratable for Vm {
2672     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2673         self.memory_manager.lock().unwrap().start_dirty_log()?;
2674         self.device_manager.lock().unwrap().start_dirty_log()
2675     }
2676 
2677     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2678         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2679         self.device_manager.lock().unwrap().stop_dirty_log()
2680     }
2681 
2682     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2683         Ok(MemoryRangeTable::new_from_tables(vec![
2684             self.memory_manager.lock().unwrap().dirty_log()?,
2685             self.device_manager.lock().unwrap().dirty_log()?,
2686         ]))
2687     }
2688 
2689     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2690         self.memory_manager.lock().unwrap().start_migration()?;
2691         self.device_manager.lock().unwrap().start_migration()
2692     }
2693 
2694     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2695         self.memory_manager.lock().unwrap().complete_migration()?;
2696         self.device_manager.lock().unwrap().complete_migration()
2697     }
2698 }
2699 
/// Debugger support for the VM. Register, memory and debug-setting requests
/// are delegated to the CPU manager; pausing and resuming go through the
/// VM's own `pause()` / `resume()`.
#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Delegates to the CPU manager's `set_guest_debug` for vCPU `cpu_id`,
    /// passing `addrs` and `singlestep` through unchanged.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Puts the VM in `VmState::BreakPoint`, pausing it first when it is
    /// currently running.
    ///
    /// # Errors
    ///
    /// Returns `DebuggableError::Pause` when pausing fails and
    /// `DebuggableError::PoisonedState` when the state lock cannot be taken
    /// for writing.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes the VM when it is in `VmState::BreakPoint`; does nothing in
    /// any other state.
    ///
    /// # Errors
    ///
    /// Returns `DebuggableError::Resume` when resuming fails.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            // A failed resume is reported as a resume failure, not as a
            // pause failure.
            self.resume().map_err(DebuggableError::Resume)?;
        }

        Ok(())
    }

    /// Delegates to the CPU manager's `read_regs` for vCPU `cpu_id`.
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    /// Delegates to the CPU manager's `write_regs` for vCPU `cpu_id`.
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    /// Delegates to the CPU manager's `read_mem`, passing all arguments
    /// through unchanged.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    /// Delegates to the CPU manager's `write_mem`, passing all arguments
    /// through unchanged.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Returns the CPU manager's active vCPU count, or its boot vCPU count
    /// when no vCPU is active.
    fn active_vcpus(&self) -> usize {
        // One lock acquisition serves both queries, so the two values are
        // read from the same CPU manager state.
        let cpu_manager = self.cpu_manager.lock().unwrap();
        match cpu_manager.active_vcpus() {
            // The VM is not booted yet. Report boot_vcpus() instead.
            0 => cpu_manager.boot_vcpus() as usize,
            active => active,
        }
    }
}
2783 
2784 #[cfg(feature = "guest_debug")]
/// Largest value a `u16` can hold, widened to `u32`.
///
/// `get_dump_state` checks the number of guest RAM mappings against this
/// limit before adding it to the 16-bit ELF program header count.
pub const UINT16_MAX: u32 = u16::MAX as u32;
2786 
// Empty impl: `Vm` overrides nothing here, so only what the `Elf64Writable`
// trait itself provides is available on `Vm`.
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}
2789 
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes a coredump of the guest to `destination_url`.
    ///
    /// A running VM is paused for the duration of the dump and resumed
    /// afterwards, whether or not the dump succeeded. A VM that was already
    /// paused stays paused.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` for a TDX VM (with the `tdx`
    /// feature) or when the VM is neither running nor paused,
    /// `GuestDebuggableError::Pause` when pausing fails, the error of the
    /// first failing dump step, or `GuestDebuggableError::Resume` when the
    /// dump succeeded but resuming failed. When both the dump and the resume
    /// fail, the dump error is returned.
    ///
    /// # Panics
    ///
    /// Panics if `get_state()` fails.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        // Only resume afterwards if this call is the one that paused the VM.
        let resume = match self.get_state().unwrap() {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                true
            }
            VmState::Paused => false,
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        };

        let dump_result = self.coredump_write_paused(destination_url);

        // Undo our own pause even when the dump failed; otherwise a failed
        // coredump would leave a previously running guest stopped.
        let resume_result = if resume {
            self.resume().map_err(GuestDebuggableError::Resume)
        } else {
            Ok(())
        };

        // The dump error takes precedence over a resume error.
        dump_result.and(resume_result)
    }
}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Vm {
    /// Writes all coredump sections; expects the VM to be paused already.
    fn coredump_write_paused(
        &mut self,
        destination_url: &str,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let coredump_state = self.get_dump_state(destination_url)?;

        self.write_header(&coredump_state)?;
        self.write_note(&coredump_state)?;
        self.write_loads(&coredump_state)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_elf64_note(&coredump_state)?;
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_vmm_note(&coredump_state)?;

        self.memory_manager
            .lock()
            .unwrap()
            .coredump_iterate_save_mem(&coredump_state)?;

        Ok(())
    }
}
2848 
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts, for the given source `state`, which target states
    /// `valid_transition` accepts and which it rejects.
    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::BreakPoint => {
                // Check the transitions from Breakpoint
                assert!(state.valid_transition(VmState::Created).is_ok());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    // Without this test the `BreakPoint` arm of `test_vm_state_transitions`
    // would never run.
    #[test]
    fn test_vm_breakpoint_transitions() {
        test_vm_state_transitions(VmState::BreakPoint);
    }

    /// Checks the `(address, size, is_ram)`-shaped tuples returned by
    /// `Vm::hob_memory_resources` for several layouts of TDVF sections
    /// relative to the guest RAM ranges.
    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}
3124 
#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    /// Length of each MMIO window used by the test devices; consecutive
    /// devices are placed `LEN` bytes apart.
    const LEN: u64 = 4096;

    /// `create_fdt` succeeds for a guest with a serial, a virtio and an RTC
    /// MMIO device and a default-configured GIC.
    #[test]
    fn test_create_fdt_with_devices() {
        let ram_size = (layout::FDT_MAX_SIZE + 0x1000) as usize;
        let mem = GuestMemoryMmap::from_ranges(&[(layout::RAM_START, ram_size)])
            .expect("Cannot initialize memory");

        let dev_info: HashMap<(DeviceType, String), MmioDeviceInfo> = HashMap::from([
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]);

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");

        let fdt = create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        );
        assert!(fdt.is_ok());
    }
}
3192 
/// Boots a minimal guest directly on the hypervisor and runs it until it
/// halts.
///
/// Per its inline disassembly, the guest code adds `bl` to `al`, converts the
/// sum to an ASCII digit, writes it and a newline to port 0x3f8, then
/// executes `hlt`. The test only checks that the vCPU exits with
/// `VmExit::Reset` without hitting an unexpected exit reason first.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};

    // This example based on https://lwn.net/Articles/658511/
    let guest_code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee,  /* out %al, (%dx) */
        0xf4,  /* hlt */
    ];

    // A single region of guest memory starting at the load address.
    let region_len = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let guest_mem = GuestMemoryMmap::from_ranges(&[(load_addr, region_len)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor, one slot each.
    for (slot, region) in guest_mem.iter().enumerate() {
        let user_region = vm.make_user_memory_region(
            slot as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(user_region)
            .expect("Cannot configure guest memory");
    }
    guest_mem
        .write_slice(&guest_code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Zero the code segment base and selector.
    let mut sregs = vcpu.get_sregs().expect("get sregs failed");
    sregs.cs.base = 0;
    sregs.cs.selector = 0;
    vcpu.set_sregs(&sregs).expect("set sregs failed");

    // Start executing at the load address with the two operands preloaded.
    let mut regs = vcpu.get_regs().expect("get regs failed");
    regs.rip = 0x1000;
    regs.rax = 2;
    regs.rbx = 3;
    regs.rflags = 2;
    vcpu.set_regs(&regs).expect("set regs failed");

    loop {
        let exit = vcpu.run().expect("run failed");
        match exit {
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            VmExit::Ignore => {}
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}
3257