xref: /cloud-hypervisor/vmm/src/vm.rs (revision adb318f4cd0079246b3cb07e01c4e978330445d2)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
20 use crate::coredump::{
21     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
22 };
23 use crate::cpu;
24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair};
25 use crate::device_tree::DeviceTree;
26 #[cfg(feature = "guest_debug")]
27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
28 #[cfg(feature = "igvm")]
29 use crate::igvm::igvm_loader;
30 use crate::memory_manager::{
31     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
32 };
33 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
34 use crate::migration::get_vm_snapshot;
35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
36 use crate::migration::url_to_file;
37 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
38 use crate::GuestMemoryMmap;
39 use crate::{
40     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
41 };
42 use anyhow::anyhow;
43 use arch::get_host_cpu_phys_bits;
44 #[cfg(target_arch = "x86_64")]
45 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
46 #[cfg(feature = "tdx")]
47 use arch::x86_64::tdx::TdvfSection;
48 use arch::EntryPoint;
49 #[cfg(target_arch = "aarch64")]
50 use arch::PciSpaceInfo;
51 use arch::{NumaNode, NumaNodes};
52 #[cfg(target_arch = "aarch64")]
53 use devices::interrupt_controller;
54 use devices::AcpiNotificationFlags;
55 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
56 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
57 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
58 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
59 use hypervisor::{HypervisorVmError, VmOps};
60 use libc::{termios, SIGWINCH};
61 use linux_loader::cmdline::Cmdline;
62 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
63 use linux_loader::elf;
64 #[cfg(target_arch = "x86_64")]
65 use linux_loader::loader::bzimage::BzImage;
66 #[cfg(target_arch = "x86_64")]
67 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
68 #[cfg(target_arch = "aarch64")]
69 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
70 use linux_loader::loader::KernelLoader;
71 use seccompiler::SeccompAction;
72 use serde::{Deserialize, Serialize};
73 use std::cmp;
74 use std::collections::BTreeMap;
75 use std::collections::HashMap;
76 use std::fs::{File, OpenOptions};
77 use std::io::{self, Seek, SeekFrom, Write};
78 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
79 use std::mem::size_of;
80 use std::num::Wrapping;
81 use std::ops::Deref;
82 use std::os::unix::net::UnixStream;
83 use std::sync::{Arc, Mutex, RwLock};
84 use std::time::Instant;
85 use std::{result, str, thread};
86 use thiserror::Error;
87 use tracer::trace_scoped;
88 use vm_device::Bus;
89 #[cfg(feature = "tdx")]
90 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
91 use vm_memory::{
92     Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
93 };
94 use vm_migration::protocol::{Request, Response, Status};
95 use vm_migration::{
96     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
97     SnapshotData, Snapshottable, Transportable,
98 };
99 use vmm_sys_util::eventfd::EventFd;
100 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
101 
102 /// Errors associated with VM management
103 #[derive(Debug, Error)]
104 pub enum Error {
105     #[error("Cannot open kernel file: {0}")]
106     KernelFile(#[source] io::Error),
107 
108     #[error("Cannot open initramfs file: {0}")]
109     InitramfsFile(#[source] io::Error),
110 
111     #[error("Cannot load the kernel into memory: {0}")]
112     KernelLoad(#[source] linux_loader::loader::Error),
113 
114     #[cfg(target_arch = "aarch64")]
115     #[error("Cannot load the UEFI binary into memory: {0:?}")]
116     UefiLoad(arch::aarch64::uefi::Error),
117 
118     #[error("Cannot load the initramfs into memory")]
119     InitramfsLoad,
120 
121     #[error("Cannot load the kernel command line into memory: {0}")]
122     LoadCmdLine(#[source] linux_loader::loader::Error),
123 
124     #[error("Cannot modify the kernel command line: {0}")]
125     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
126 
127     #[error("Cannot create the kernel command line: {0}")]
128     CmdLineCreate(#[source] linux_loader::cmdline::Error),
129 
130     #[error("Cannot configure system: {0}")]
131     ConfigureSystem(#[source] arch::Error),
132 
133     #[cfg(target_arch = "aarch64")]
134     #[error("Cannot enable interrupt controller: {0:?}")]
135     EnableInterruptController(interrupt_controller::Error),
136 
137     #[error("VM state is poisoned")]
138     PoisonedState,
139 
140     #[error("Error from device manager: {0:?}")]
141     DeviceManager(DeviceManagerError),
142 
143     #[error("No device with id {0:?} to remove")]
144     NoDeviceToRemove(String),
145 
146     #[error("Cannot spawn a signal handler thread: {0}")]
147     SignalHandlerSpawn(#[source] io::Error),
148 
149     #[error("Failed to join on threads: {0:?}")]
150     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
151 
152     #[error("VM config is missing")]
153     VmMissingConfig,
154 
155     #[error("VM is not created")]
156     VmNotCreated,
157 
158     #[error("VM is already created")]
159     VmAlreadyCreated,
160 
161     #[error("VM is not running")]
162     VmNotRunning,
163 
164     #[error("Cannot clone EventFd: {0}")]
165     EventFdClone(#[source] io::Error),
166 
167     #[error("Invalid VM state transition: {0:?} to {1:?}")]
168     InvalidStateTransition(VmState, VmState),
169 
170     #[error("Error from CPU manager: {0}")]
171     CpuManager(#[source] cpu::Error),
172 
173     #[error("Cannot pause devices: {0}")]
174     PauseDevices(#[source] MigratableError),
175 
176     #[error("Cannot resume devices: {0}")]
177     ResumeDevices(#[source] MigratableError),
178 
179     #[error("Cannot pause CPUs: {0}")]
180     PauseCpus(#[source] MigratableError),
181 
182     #[error("Cannot resume cpus: {0}")]
183     #[error("Cannot resume CPUs: {0}")]
184 
185     #[error("Cannot pause VM: {0}")]
186     Pause(#[source] MigratableError),
187 
188     #[error("Cannot resume VM: {0}")]
189     Resume(#[source] MigratableError),
190 
191     #[error("Memory manager error: {0:?}")]
192     MemoryManager(MemoryManagerError),
193 
194     #[error("Eventfd write error: {0}")]
195     EventfdError(#[source] std::io::Error),
196 
197     #[error("Cannot snapshot VM: {0}")]
198     Snapshot(#[source] MigratableError),
199 
200     #[error("Cannot restore VM: {0}")]
201     Restore(#[source] MigratableError),
202 
203     #[error("Cannot send VM snapshot: {0}")]
204     SnapshotSend(#[source] MigratableError),
205 
206     #[error("Invalid restore source URL")]
207     InvalidRestoreSourceUrl,
208 
209     #[error("Failed to validate config: {0}")]
210     ConfigValidation(#[source] ValidationError),
211 
212     #[error("Too many virtio-vsock devices")]
213     TooManyVsockDevices,
214 
215     #[error("Failed serializing into JSON: {0}")]
216     SerializeJson(#[source] serde_json::Error),
217 
218     #[error("Invalid NUMA configuration")]
219     InvalidNumaConfig,
220 
221     #[error("Cannot create seccomp filter: {0}")]
222     CreateSeccompFilter(#[source] seccompiler::Error),
223 
224     #[error("Cannot apply seccomp filter: {0}")]
225     ApplySeccompFilter(#[source] seccompiler::Error),
226 
227     #[error("Failed resizing a memory zone")]
228     ResizeZone,
229 
230     #[error("Cannot activate virtio devices: {0:?}")]
231     ActivateVirtioDevices(DeviceManagerError),
232 
233     #[error("Error triggering power button: {0:?}")]
234     PowerButton(DeviceManagerError),
235 
236     #[error("Kernel lacks PVH header")]
237     KernelMissingPvhHeader,
238 
239     #[error("Failed to allocate firmware RAM: {0:?}")]
240     AllocateFirmwareMemory(MemoryManagerError),
241 
242     #[error("Error manipulating firmware file: {0}")]
243     FirmwareFile(#[source] std::io::Error),
244 
245     #[error("Firmware too big")]
246     FirmwareTooLarge,
247 
248     #[error("Failed to copy firmware to memory: {0}")]
249     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
250 
251     #[cfg(feature = "sev_snp")]
252     #[error("Error enabling SEV-SNP VM: {0}")]
253     InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),
254 
255     #[cfg(feature = "tdx")]
256     #[error("Error performing I/O on TDX firmware file: {0}")]
257     LoadTdvf(#[source] std::io::Error),
258 
259     #[cfg(feature = "tdx")]
260     #[error("Error performing I/O on the TDX payload file: {0}")]
261     LoadPayload(#[source] std::io::Error),
262 
263     #[cfg(feature = "tdx")]
264     #[error("Error parsing TDVF: {0}")]
265     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
266 
267     #[cfg(feature = "tdx")]
268     #[error("Error populating TDX HOB: {0}")]
269     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
270 
271     #[cfg(feature = "tdx")]
272     #[error("Error allocating TDVF memory: {0:?}")]
273     AllocatingTdvfMemory(crate::memory_manager::Error),
274 
275     #[cfg(feature = "tdx")]
276     #[error("Error enabling TDX VM: {0}")]
277     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
278 
279     #[cfg(feature = "tdx")]
280     #[error("Error enabling TDX memory region: {0}")]
281     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
282 
283     #[cfg(feature = "tdx")]
284     #[error("Error finalizing TDX VM: {0}")]
285     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
286 
287     #[cfg(feature = "tdx")]
288     #[error("TDX firmware missing")]
289     TdxFirmwareMissing,
290 
291     #[cfg(feature = "tdx")]
292     #[error("Invalid TDX payload type")]
293     InvalidPayloadType,
294 
295     #[cfg(feature = "guest_debug")]
296     #[error("Error debugging VM: {0:?}")]
297     Debug(DebuggableError),
298 
299     #[error("Error spawning kernel loading thread")]
300     KernelLoadThreadSpawn(std::io::Error),
301 
302     #[error("Error joining kernel loading thread")]
303     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
304 
305     #[error("Payload configuration is not bootable")]
306     InvalidPayload,
307 
308     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
309     #[error("Error coredumping VM: {0:?}")]
310     Coredump(GuestDebuggableError),
311 
312     #[cfg(feature = "igvm")]
313     #[error("Cannot open IGVM file: {0}")]
314     IgvmFile(#[source] io::Error),
315 
316     #[cfg(feature = "igvm")]
317     #[error("Cannot load the IGVM file into memory: {0}")]
318     IgvmLoad(#[source] igvm_loader::Error),
319 }
320 pub type Result<T> = result::Result<T, Error>;
321 
322 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
323 pub enum VmState {
324     Created,
325     Running,
326     Shutdown,
327     Paused,
328     BreakPoint,
329 }
330 
331 impl VmState {
332     fn valid_transition(self, new_state: VmState) -> Result<()> {
333         match self {
334             VmState::Created => match new_state {
335                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
336                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
337                     Ok(())
338                 }
339             },
340 
341             VmState::Running => match new_state {
342                 VmState::Created | VmState::Running => {
343                     Err(Error::InvalidStateTransition(self, new_state))
344                 }
345                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
346             },
347 
348             VmState::Shutdown => match new_state {
349                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
350                     Err(Error::InvalidStateTransition(self, new_state))
351                 }
352                 VmState::Running => Ok(()),
353             },
354 
355             VmState::Paused => match new_state {
356                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
357                     Err(Error::InvalidStateTransition(self, new_state))
358                 }
359                 VmState::Running | VmState::Shutdown => Ok(()),
360             },
361             VmState::BreakPoint => match new_state {
362                 VmState::Created | VmState::Running => Ok(()),
363                 _ => Err(Error::InvalidStateTransition(self, new_state)),
364             },
365         }
366     }
367 }
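// Illustrative sketch only (not part of the original source): a minimal test
// module, with a hypothetical name, showing how the state machine above is
// meant to be exercised.
#[cfg(test)]
mod vm_state_transition_sketch {
    use super::*;

    #[test]
    fn paused_vm_can_be_resumed_or_shut_down() {
        assert!(VmState::Paused.valid_transition(VmState::Running).is_ok());
        assert!(VmState::Paused.valid_transition(VmState::Shutdown).is_ok());
    }

    #[test]
    fn shutdown_vm_can_only_go_back_to_running() {
        assert!(VmState::Shutdown.valid_transition(VmState::Running).is_ok());
        assert!(VmState::Shutdown.valid_transition(VmState::Paused).is_err());
    }
}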
368 
369 struct VmOpsHandler {
370     memory: GuestMemoryAtomic<GuestMemoryMmap>,
371     #[cfg(target_arch = "x86_64")]
372     io_bus: Arc<Bus>,
373     mmio_bus: Arc<Bus>,
374 }
375 
376 impl VmOps for VmOpsHandler {
377     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
378         self.memory
379             .memory()
380             .write(buf, GuestAddress(gpa))
381             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
382     }
383 
384     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
385         self.memory
386             .memory()
387             .read(buf, GuestAddress(gpa))
388             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
389     }
390 
391     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
392         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
393             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
394         }
395         Ok(())
396     }
397 
398     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
399         match self.mmio_bus.write(gpa, data) {
400             Err(vm_device::BusError::MissingAddressRange) => {
401                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
402             }
403             Ok(Some(barrier)) => {
404                 info!("Waiting for barrier");
405                 barrier.wait();
406                 info!("Barrier released");
407             }
408             _ => {}
409         };
410         Ok(())
411     }
412 
413     #[cfg(target_arch = "x86_64")]
414     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
415         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
416             info!("Guest PIO read to unregistered address 0x{:x}", port);
417         }
418         Ok(())
419     }
420 
421     #[cfg(target_arch = "x86_64")]
422     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
423         match self.io_bus.write(port, data) {
424             Err(vm_device::BusError::MissingAddressRange) => {
425                 info!("Guest PIO write to unregistered address 0x{:x}", port);
426             }
427             Ok(Some(barrier)) => {
428                 info!("Waiting for barrier");
429                 barrier.wait();
430                 info!("Barrier released");
431             }
432             _ => {}
433         };
434         Ok(())
435     }
436 }
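// Rough usage sketch only (not part of the original source; it glosses over
// the x86_64-only `io_bus` field): the handler simply forwards vCPU exits to
// guest memory and to the VMM buses.
//
//   let memory = GuestMemoryAtomic::new(
//       GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10_0000)]).unwrap(),
//   );
//   let ops = VmOpsHandler { memory, mmio_bus: Arc::new(Bus::new()), /* io_bus on x86_64 */ };
//   ops.guest_mem_write(0x1000, &[0xAA; 4]).unwrap(); // lands in guest RAM
//   ops.mmio_write(0xdead_beef, &[0x00]).unwrap();    // unregistered: logged and ignored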
437 
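/// Number of guest physical address bits to use: the requested maximum is
/// clamped to what the host CPU reports.
///
/// Illustrative example (not part of the original source):
///
/// ```ignore
/// // On a host reporting 46 physical bits, asking for 52 is clamped to 46,
/// // while asking for 40 stays at 40.
/// let bits = physical_bits(&hypervisor, 52);
/// ```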
438 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
439     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
440 
441     cmp::min(host_phys_bits, max_phys_bits)
442 }
443 
444 pub struct Vm {
445     #[cfg(feature = "tdx")]
446     kernel: Option<File>,
447     initramfs: Option<File>,
448     threads: Vec<thread::JoinHandle<()>>,
449     device_manager: Arc<Mutex<DeviceManager>>,
450     config: Arc<Mutex<VmConfig>>,
451     state: RwLock<VmState>,
452     cpu_manager: Arc<Mutex<cpu::CpuManager>>,
453     memory_manager: Arc<Mutex<MemoryManager>>,
454     #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
455     // The hypervisor-abstracted virtual machine.
456     vm: Arc<dyn hypervisor::Vm>,
457     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
458     saved_clock: Option<hypervisor::ClockData>,
459     numa_nodes: NumaNodes,
460     #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
461     hypervisor: Arc<dyn hypervisor::Hypervisor>,
462     stop_on_boot: bool,
463     load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
464 }
465 
466 impl Vm {
467     pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
468 
469     #[allow(clippy::too_many_arguments)]
470     pub fn new_from_memory_manager(
471         config: Arc<Mutex<VmConfig>>,
472         memory_manager: Arc<Mutex<MemoryManager>>,
473         vm: Arc<dyn hypervisor::Vm>,
474         exit_evt: EventFd,
475         reset_evt: EventFd,
476         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
477         seccomp_action: &SeccompAction,
478         hypervisor: Arc<dyn hypervisor::Hypervisor>,
479         activate_evt: EventFd,
480         timestamp: Instant,
481         serial_pty: Option<PtyPair>,
482         console_pty: Option<PtyPair>,
483         debug_console_pty: Option<PtyPair>,
484         console_resize_pipe: Option<File>,
485         original_termios: Arc<Mutex<Option<termios>>>,
486         snapshot: Option<Snapshot>,
487     ) -> Result<Self> {
488         trace_scoped!("Vm::new_from_memory_manager");
489 
490         let boot_id_list = config
491             .lock()
492             .unwrap()
493             .validate()
494             .map_err(Error::ConfigValidation)?;
495 
496         #[cfg(not(feature = "igvm"))]
497         let load_payload_handle = if snapshot.is_none() {
498             Self::load_payload_async(&memory_manager, &config)?
499         } else {
500             None
501         };
502 
503         info!("Booting VM from config: {:?}", &config);
504 
505         // Create NUMA nodes based on NumaConfig.
506         let numa_nodes =
507             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
508 
509         #[cfg(feature = "tdx")]
510         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
511         #[cfg(feature = "sev_snp")]
512         let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
513         #[cfg(feature = "tdx")]
514         let force_iommu = tdx_enabled;
515         #[cfg(feature = "sev_snp")]
516         let force_iommu = sev_snp_enabled;
517         #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
518         let force_iommu = false;
519 
520         #[cfg(feature = "guest_debug")]
521         let stop_on_boot = config.lock().unwrap().gdb;
522         #[cfg(not(feature = "guest_debug"))]
523         let stop_on_boot = false;
524 
525         let memory = memory_manager.lock().unwrap().guest_memory();
526         #[cfg(target_arch = "x86_64")]
527         let io_bus = Arc::new(Bus::new());
528         let mmio_bus = Arc::new(Bus::new());
529 
530         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
531             memory,
532             #[cfg(target_arch = "x86_64")]
533             io_bus: io_bus.clone(),
534             mmio_bus: mmio_bus.clone(),
535         });
536 
537         let cpus_config = { &config.lock().unwrap().cpus.clone() };
538         let cpu_manager = cpu::CpuManager::new(
539             cpus_config,
540             vm.clone(),
541             exit_evt.try_clone().map_err(Error::EventFdClone)?,
542             reset_evt.try_clone().map_err(Error::EventFdClone)?,
543             #[cfg(feature = "guest_debug")]
544             vm_debug_evt,
545             &hypervisor,
546             seccomp_action.clone(),
547             vm_ops,
548             #[cfg(feature = "tdx")]
549             tdx_enabled,
550             &numa_nodes,
551             #[cfg(feature = "sev_snp")]
552             sev_snp_enabled,
553         )
554         .map_err(Error::CpuManager)?;
555 
556         #[cfg(target_arch = "x86_64")]
557         cpu_manager
558             .lock()
559             .unwrap()
560             .populate_cpuid(
561                 &memory_manager,
562                 &hypervisor,
563                 #[cfg(feature = "tdx")]
564                 tdx_enabled,
565             )
566             .map_err(Error::CpuManager)?;
567 
568         // Loading the IGVM file is deferred until this point because the
569         // IGVM parser needs the cpu_manager to retrieve CPUID leaves.
570         // In the regular case we can start loading early, but in the IGVM
571         // case we have to wait until the cpu_manager has been created.
572         // Currently, the Microsoft Hypervisor does not provide a common
573         // hypervisor-specific CPUID, so we need to query each CPUID leaf
574         // through cpu_manager's get_cpuid_values.
575         #[cfg(feature = "igvm")]
576         let load_payload_handle = if snapshot.is_none() {
577             Self::load_payload_async(&memory_manager, &config, &cpu_manager)?
578         } else {
579             None
580         };
581         // The initial TDX configuration must be done before the vCPUs are
582         // created
583         #[cfg(feature = "tdx")]
584         if tdx_enabled {
585             let cpuid = cpu_manager.lock().unwrap().common_cpuid();
586             let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
587             vm.tdx_init(&cpuid, max_vcpus)
588                 .map_err(Error::InitializeTdxVm)?;
589         }
590 
591         cpu_manager
592             .lock()
593             .unwrap()
594             .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
595             .map_err(Error::CpuManager)?;
596 
597         // This initial SEV-SNP configuration must be done immediately after
598         // the vCPUs are created. As part of this initialization we are
599         // transitioning the guest into a secure state.
600         #[cfg(feature = "sev_snp")]
601         if sev_snp_enabled {
602             vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
603         }
604 
605         #[cfg(feature = "tdx")]
606         let dynamic = !tdx_enabled;
607         #[cfg(not(feature = "tdx"))]
608         let dynamic = true;
609 
610         let device_manager = DeviceManager::new(
611             #[cfg(target_arch = "x86_64")]
612             io_bus,
613             mmio_bus,
614             hypervisor.hypervisor_type(),
615             vm.clone(),
616             config.clone(),
617             memory_manager.clone(),
618             cpu_manager.clone(),
619             exit_evt.try_clone().map_err(Error::EventFdClone)?,
620             reset_evt,
621             seccomp_action.clone(),
622             numa_nodes.clone(),
623             &activate_evt,
624             force_iommu,
625             boot_id_list,
626             timestamp,
627             snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
628             dynamic,
629         )
630         .map_err(Error::DeviceManager)?;
631 
632         device_manager
633             .lock()
634             .unwrap()
635             .create_devices(
636                 serial_pty,
637                 console_pty,
638                 debug_console_pty,
639                 console_resize_pipe,
640                 original_termios,
641             )
642             .map_err(Error::DeviceManager)?;
643 
644         #[cfg(feature = "tdx")]
645         let kernel = config
646             .lock()
647             .unwrap()
648             .payload
649             .as_ref()
650             .map(|p| p.kernel.as_ref().map(File::open))
651             .unwrap_or_default()
652             .transpose()
653             .map_err(Error::KernelFile)?;
654 
655         let initramfs = config
656             .lock()
657             .unwrap()
658             .payload
659             .as_ref()
660             .map(|p| p.initramfs.as_ref().map(File::open))
661             .unwrap_or_default()
662             .transpose()
663             .map_err(Error::InitramfsFile)?;
664 
665         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
666         let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
667             let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
668             vm_snapshot.clock
669         } else {
670             None
671         };
672 
673         let vm_state = if snapshot.is_some() {
674             VmState::Paused
675         } else {
676             VmState::Created
677         };
678 
679         Ok(Vm {
680             #[cfg(feature = "tdx")]
681             kernel,
682             initramfs,
683             device_manager,
684             config,
685             threads: Vec::with_capacity(1),
686             state: RwLock::new(vm_state),
687             cpu_manager,
688             memory_manager,
689             vm,
690             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
691             saved_clock,
692             numa_nodes,
693             hypervisor,
694             stop_on_boot,
695             load_payload_handle,
696         })
697     }
698 
699     fn create_numa_nodes(
700         configs: Option<Vec<NumaConfig>>,
701         memory_manager: &Arc<Mutex<MemoryManager>>,
702     ) -> Result<NumaNodes> {
703         let mm = memory_manager.lock().unwrap();
704         let mm_zones = mm.memory_zones();
705         let mut numa_nodes = BTreeMap::new();
706 
707         if let Some(configs) = &configs {
708             for config in configs.iter() {
709                 if numa_nodes.contains_key(&config.guest_numa_id) {
710                     error!("Cannot define the same NUMA node twice");
711                     return Err(Error::InvalidNumaConfig);
712                 }
713 
714                 let mut node = NumaNode::default();
715 
716                 if let Some(memory_zones) = &config.memory_zones {
717                     for memory_zone in memory_zones.iter() {
718                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
719                             node.memory_regions.extend(mm_zone.regions().clone());
720                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
721                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
722                             }
723                             node.memory_zones.push(memory_zone.clone());
724                         } else {
725                             error!("Unknown memory zone '{}'", memory_zone);
726                             return Err(Error::InvalidNumaConfig);
727                         }
728                     }
729                 }
730 
731                 if let Some(cpus) = &config.cpus {
732                     node.cpus.extend(cpus);
733                 }
734 
735                 if let Some(pci_segments) = &config.pci_segments {
736                     node.pci_segments.extend(pci_segments);
737                 }
738 
739                 if let Some(distances) = &config.distances {
740                     for distance in distances.iter() {
741                         let dest = distance.destination;
742                         let dist = distance.distance;
743 
744                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
745                             error!("Unknown destination NUMA node {}", dest);
746                             return Err(Error::InvalidNumaConfig);
747                         }
748 
749                         if node.distances.contains_key(&dest) {
750                             error!("Destination NUMA node {} has already been set", dest);
751                             return Err(Error::InvalidNumaConfig);
752                         }
753 
754                         node.distances.insert(dest, dist);
755                     }
756                 }
757 
758                 #[cfg(target_arch = "x86_64")]
759                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
760                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
761                         let mm_sections = sgx_epc_region.epc_sections();
762                         for sgx_epc_section in sgx_epc_sections.iter() {
763                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
764                                 node.sgx_epc_sections.push(mm_section.clone());
765                             } else {
766                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
767                                 return Err(Error::InvalidNumaConfig);
768                             }
769                         }
770                     } else {
771                         error!("Missing SGX EPC region");
772                         return Err(Error::InvalidNumaConfig);
773                     }
774                 }
775 
776                 numa_nodes.insert(config.guest_numa_id, node);
777             }
778         }
779 
780         Ok(numa_nodes)
781     }
782 
783     #[allow(clippy::too_many_arguments)]
784     pub fn new(
785         vm_config: Arc<Mutex<VmConfig>>,
786         exit_evt: EventFd,
787         reset_evt: EventFd,
788         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
789         seccomp_action: &SeccompAction,
790         hypervisor: Arc<dyn hypervisor::Hypervisor>,
791         activate_evt: EventFd,
792         serial_pty: Option<PtyPair>,
793         console_pty: Option<PtyPair>,
794         debug_console_pty: Option<PtyPair>,
795         console_resize_pipe: Option<File>,
796         original_termios: Arc<Mutex<Option<termios>>>,
797         snapshot: Option<Snapshot>,
798         source_url: Option<&str>,
799         prefault: Option<bool>,
800     ) -> Result<Self> {
801         trace_scoped!("Vm::new");
802 
803         let timestamp = Instant::now();
804 
805         #[cfg(feature = "tdx")]
806         let tdx_enabled = if snapshot.is_some() {
807             false
808         } else {
809             vm_config.lock().unwrap().is_tdx_enabled()
810         };
811 
812         #[cfg(feature = "sev_snp")]
813         let sev_snp_enabled = if snapshot.is_some() {
814             false
815         } else {
816             vm_config.lock().unwrap().is_sev_snp_enabled()
817         };
818 
819         let vm = Self::create_hypervisor_vm(
820             &hypervisor,
821             #[cfg(feature = "tdx")]
822             tdx_enabled,
823             #[cfg(feature = "sev_snp")]
824             sev_snp_enabled,
825         )?;
826 
827         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
828 
829         let memory_manager = if let Some(snapshot) =
830             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
831         {
832             MemoryManager::new_from_snapshot(
833                 &snapshot,
834                 vm.clone(),
835                 &vm_config.lock().unwrap().memory.clone(),
836                 source_url,
837                 prefault.unwrap(),
838                 phys_bits,
839             )
840             .map_err(Error::MemoryManager)?
841         } else {
842             #[cfg(target_arch = "x86_64")]
843             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
844 
845             MemoryManager::new(
846                 vm.clone(),
847                 &vm_config.lock().unwrap().memory.clone(),
848                 None,
849                 phys_bits,
850                 #[cfg(feature = "tdx")]
851                 tdx_enabled,
852                 None,
853                 None,
854                 #[cfg(target_arch = "x86_64")]
855                 sgx_epc_config,
856             )
857             .map_err(Error::MemoryManager)?
858         };
859 
860         Vm::new_from_memory_manager(
861             vm_config,
862             memory_manager,
863             vm,
864             exit_evt,
865             reset_evt,
866             #[cfg(feature = "guest_debug")]
867             vm_debug_evt,
868             seccomp_action,
869             hypervisor,
870             activate_evt,
871             timestamp,
872             serial_pty,
873             console_pty,
874             debug_console_pty,
875             console_resize_pipe,
876             original_termios,
877             snapshot,
878         )
879     }
880 
881     pub fn create_hypervisor_vm(
882         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
883         #[cfg(feature = "tdx")] tdx_enabled: bool,
884         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
885     ) -> Result<Arc<dyn hypervisor::Vm>> {
886         hypervisor.check_required_extensions().unwrap();
887 
888         cfg_if::cfg_if! {
889             if #[cfg(feature = "tdx")] {
890                 // Pass KVM_X86_TDX_VM (1) when tdx_enabled is true,
891                 // otherwise KVM_X86_LEGACY_VM (0): the boolean maps
892                 // directly to the VM type value.
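                // For instance (illustrative): u64::from(true) == 1 selects
                // the TDX VM type, while u64::from(false) == 0 selects the
                // legacy VM type.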
893                 let vm = hypervisor
894                     .create_vm_with_type(u64::from(tdx_enabled))
895                     .unwrap();
896             } else if #[cfg(feature = "sev_snp")] {
897                 // Pass SEV_SNP_ENABLED (1) when sev_snp_enabled is true,
898                 // otherwise SEV_SNP_DISABLED (0): the boolean maps
899                 // directly to the VM type value.
900                 let vm = hypervisor
901                     .create_vm_with_type(u64::from(sev_snp_enabled))
902                     .unwrap();
903             } else {
904                 let vm = hypervisor.create_vm().unwrap();
905             }
906         }
907 
908         #[cfg(target_arch = "x86_64")]
909         {
910             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
911                 .unwrap();
912             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
913             vm.enable_split_irq().unwrap();
914         }
915 
916         Ok(vm)
917     }
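    // Illustrative call sequence only (not from the original source),
    // assuming a build without the "tdx" and "sev_snp" features:
    //
    //   let hypervisor = hypervisor::new().unwrap();
    //   let vm = Vm::create_hypervisor_vm(&hypervisor)?;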
918 
919     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
920         let initramfs = self.initramfs.as_mut().unwrap();
921         let size: usize = initramfs
922             .seek(SeekFrom::End(0))
923             .map_err(|_| Error::InitramfsLoad)?
924             .try_into()
925             .unwrap();
926         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
927 
928         let address =
929             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
930         let address = GuestAddress(address);
931 
932         guest_mem
933             .read_volatile_from(address, initramfs, size)
934             .map_err(|_| Error::InitramfsLoad)?;
935 
936         info!("Initramfs loaded: address = 0x{:x}", address.0);
937         Ok(arch::InitramfsConfig { address, size })
938     }
939 
940     pub fn generate_cmdline(
941         payload: &PayloadConfig,
942         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
943     ) -> Result<Cmdline> {
944         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
945         if let Some(s) = payload.cmdline.as_ref() {
946             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
947         }
948 
949         #[cfg(target_arch = "aarch64")]
950         for entry in device_manager.lock().unwrap().cmdline_additions() {
951             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
952         }
953         Ok(cmdline)
954     }
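    // Illustrative example only (not from the original source): with
    // `payload.cmdline = Some("console=hvc0 root=/dev/vda1".to_string())`,
    // the returned Cmdline contains exactly that string, plus any
    // device-manager additions appended on aarch64.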
955 
956     #[cfg(target_arch = "aarch64")]
957     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
958         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
959         let mem = uefi_flash.memory();
960         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
961             .map_err(Error::UefiLoad)?;
962         Ok(())
963     }
964 
965     #[cfg(target_arch = "aarch64")]
966     fn load_kernel(
967         firmware: Option<File>,
968         kernel: Option<File>,
969         memory_manager: Arc<Mutex<MemoryManager>>,
970     ) -> Result<EntryPoint> {
971         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
972         let mem = guest_memory.memory();
973         let entry_addr = match (firmware, kernel) {
974             (None, Some(mut kernel)) => {
975                 match linux_loader::loader::pe::PE::load(
976                     mem.deref(),
977                     Some(arch::layout::KERNEL_START),
978                     &mut kernel,
979                     None,
980                 ) {
981                     Ok(entry_addr) => entry_addr.kernel_load,
982                     // Try to load the binary as kernel PE file at first.
983                     // Try to load the binary as a PE kernel first.
984                     // If that fails, retry loading it as a UEFI binary.
985                     // As a UEFI binary has no recognizable magic number, it must be the last option tried.
986                         Self::load_firmware(&kernel, memory_manager)?;
987                         arch::layout::UEFI_START
988                     }
989                     Err(e) => {
990                         return Err(Error::KernelLoad(e));
991                     }
992                 }
993             }
994             (Some(firmware), None) => {
995                 Self::load_firmware(&firmware, memory_manager)?;
996                 arch::layout::UEFI_START
997             }
998             _ => return Err(Error::InvalidPayload),
999         };
1000 
1001         Ok(EntryPoint { entry_addr })
1002     }
1003 
1004     #[cfg(feature = "igvm")]
1005     fn load_igvm(
1006         igvm: File,
1007         memory_manager: Arc<Mutex<MemoryManager>>,
1008         cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1009     ) -> Result<EntryPoint> {
1010         let res = igvm_loader::load_igvm(&igvm, memory_manager, cpu_manager.clone(), "")
1011             .map_err(Error::IgvmLoad)?;
1012 
1013         cfg_if::cfg_if! {
1014             if #[cfg(feature = "sev_snp")] {
1015                 let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
1016                     EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
1017                 } else {
1018                     EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
1019                 };
1020             } else {
1021                let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
1022             }
1023         };
1024         Ok(entry_point)
1025     }
1026 
1027     #[cfg(target_arch = "x86_64")]
1028     fn load_kernel(
1029         mut kernel: File,
1030         cmdline: Option<Cmdline>,
1031         memory_manager: Arc<Mutex<MemoryManager>>,
1032     ) -> Result<EntryPoint> {
1033         info!("Loading kernel");
1034 
1035         let mem = {
1036             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1037             guest_memory.memory()
1038         };
1039 
1040         // Try ELF binary with PVH boot.
1041         let entry_addr = linux_loader::loader::elf::Elf::load(
1042             mem.deref(),
1043             None,
1044             &mut kernel,
1045             Some(arch::layout::HIGH_RAM_START),
1046         )
1047         // Try loading kernel as bzImage.
1048         .or_else(|_| {
1049             BzImage::load(
1050                 mem.deref(),
1051                 None,
1052                 &mut kernel,
1053                 Some(arch::layout::HIGH_RAM_START),
1054             )
1055         })
1056         .map_err(Error::KernelLoad)?;
1057 
1058         if let Some(cmdline) = cmdline {
1059             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1060                 .map_err(Error::LoadCmdLine)?;
1061         }
1062 
1063         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1064             // Use the PVH kernel entry point to boot the guest
1065             info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1066             Ok(EntryPoint {
1067                 entry_addr,
1068                 setup_header: None,
1069             })
1070         } else if entry_addr.setup_header.is_some() {
1071             // Use the bzImage 32-bit entry point to boot the guest
1072             info!(
1073                 "bzImage kernel loaded: entry_addr = 0x{:x}",
1074                 entry_addr.kernel_load.0
1075             );
1076             Ok(EntryPoint {
1077                 entry_addr: entry_addr.kernel_load,
1078                 setup_header: entry_addr.setup_header,
1079             })
1080         } else {
1081             Err(Error::KernelMissingPvhHeader)
1082         }
1083     }
1084 
1085     #[cfg(target_arch = "x86_64")]
1086     fn load_payload(
1087         payload: &PayloadConfig,
1088         memory_manager: Arc<Mutex<MemoryManager>>,
1089         #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1090     ) -> Result<EntryPoint> {
1091         trace_scoped!("load_payload");
1092         #[cfg(feature = "igvm")]
1093         if let Some(_igvm_file) = &payload.igvm {
1094             let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
1095             return Self::load_igvm(igvm, memory_manager, cpu_manager);
1096         }
1097         match (
1098             &payload.firmware,
1099             &payload.kernel,
1100             &payload.initramfs,
1101             &payload.cmdline,
1102         ) {
1103             (Some(firmware), None, None, None) => {
1104                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1105                 Self::load_kernel(firmware, None, memory_manager)
1106             }
1107             (None, Some(kernel), _, _) => {
1108                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1109                 let cmdline = Self::generate_cmdline(payload)?;
1110                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1111             }
1112             _ => Err(Error::InvalidPayload),
1113         }
1114     }
1115 
1116     #[cfg(target_arch = "aarch64")]
1117     fn load_payload(
1118         payload: &PayloadConfig,
1119         memory_manager: Arc<Mutex<MemoryManager>>,
1120     ) -> Result<EntryPoint> {
1121         match (&payload.firmware, &payload.kernel) {
1122             (Some(firmware), None) => {
1123                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1124                 Self::load_kernel(Some(firmware), None, memory_manager)
1125             }
1126             (None, Some(kernel)) => {
1127                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1128                 Self::load_kernel(None, Some(kernel), memory_manager)
1129             }
1130             _ => Err(Error::InvalidPayload),
1131         }
1132     }
1133 
1134     fn load_payload_async(
1135         memory_manager: &Arc<Mutex<MemoryManager>>,
1136         config: &Arc<Mutex<VmConfig>>,
1137         #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
1138     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1139         // The kernel is loaded differently when TDX is enabled.
1140         #[cfg(feature = "tdx")]
1141         if config.lock().unwrap().is_tdx_enabled() {
1142             return Ok(None);
1143         }
1144 
1145         config
1146             .lock()
1147             .unwrap()
1148             .payload
1149             .as_ref()
1150             .map(|payload| {
1151                 let memory_manager = memory_manager.clone();
1152                 let payload = payload.clone();
1153                 #[cfg(feature = "igvm")]
1154                 let cpu_manager = cpu_manager.clone();
1155 
1156                 std::thread::Builder::new()
1157                     .name("payload_loader".into())
1158                     .spawn(move || {
1159                         Self::load_payload(
1160                             &payload,
1161                             memory_manager,
1162                             #[cfg(feature = "igvm")]
1163                             cpu_manager,
1164                         )
1165                     })
1166                     .map_err(Error::KernelLoadThreadSpawn)
1167             })
1168             .transpose()
1169     }
1170 
1171     #[cfg(target_arch = "x86_64")]
1172     fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
1173         trace_scoped!("configure_system");
1174         info!("Configuring system");
1175         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1176 
1177         let initramfs_config = match self.initramfs {
1178             Some(_) => Some(self.load_initramfs(&mem)?),
1179             None => None,
1180         };
1181 
1182         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1183         let rsdp_addr = Some(rsdp_addr);
1184         let sgx_epc_region = self
1185             .memory_manager
1186             .lock()
1187             .unwrap()
1188             .sgx_epc_region()
1189             .as_ref()
1190             .cloned();
1191 
1192         let serial_number = self
1193             .config
1194             .lock()
1195             .unwrap()
1196             .platform
1197             .as_ref()
1198             .and_then(|p| p.serial_number.clone());
1199 
1200         let uuid = self
1201             .config
1202             .lock()
1203             .unwrap()
1204             .platform
1205             .as_ref()
1206             .and_then(|p| p.uuid.clone());
1207 
1208         let oem_strings = self
1209             .config
1210             .lock()
1211             .unwrap()
1212             .platform
1213             .as_ref()
1214             .and_then(|p| p.oem_strings.clone());
1215 
1216         let oem_strings = oem_strings
1217             .as_deref()
1218             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1219 
1220         let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1221 
1222         arch::configure_system(
1223             &mem,
1224             arch::layout::CMDLINE_START,
1225             arch::layout::CMDLINE_MAX_SIZE,
1226             &initramfs_config,
1227             boot_vcpus,
1228             entry_addr.setup_header,
1229             rsdp_addr,
1230             sgx_epc_region,
1231             serial_number.as_deref(),
1232             uuid.as_deref(),
1233             oem_strings.as_deref(),
1234             topology,
1235         )
1236         .map_err(Error::ConfigureSystem)?;
1237         Ok(())
1238     }
1239 
1240     #[cfg(target_arch = "aarch64")]
1241     fn configure_system(
1242         &mut self,
1243         _rsdp_addr: GuestAddress,
1244         _entry_addr: EntryPoint,
1245     ) -> Result<()> {
1246         let cmdline = Self::generate_cmdline(
1247             self.config.lock().unwrap().payload.as_ref().unwrap(),
1248             &self.device_manager,
1249         )?;
1250         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1251         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1252         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1253         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1254         let initramfs_config = match self.initramfs {
1255             Some(_) => Some(self.load_initramfs(&mem)?),
1256             None => None,
1257         };
1258 
1259         let device_info = &self
1260             .device_manager
1261             .lock()
1262             .unwrap()
1263             .get_device_info()
1264             .clone();
1265 
1266         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1267             let pci_space = PciSpaceInfo {
1268                 pci_segment_id: pci_segment.id,
1269                 mmio_config_address: pci_segment.mmio_config_address,
1270                 pci_device_space_start: pci_segment.start_of_mem64_area,
1271                 pci_device_space_size: pci_segment.end_of_mem64_area
1272                     - pci_segment.start_of_mem64_area
1273                     + 1,
1274             };
1275             pci_space_info.push(pci_space);
1276         }
1277 
1278         let virtio_iommu_bdf = self
1279             .device_manager
1280             .lock()
1281             .unwrap()
1282             .iommu_attached_devices()
1283             .as_ref()
1284             .map(|(v, _)| *v);
1285 
1286         let vgic = self
1287             .device_manager
1288             .lock()
1289             .unwrap()
1290             .get_interrupt_controller()
1291             .unwrap()
1292             .lock()
1293             .unwrap()
1294             .get_vgic()
1295             .map_err(|_| {
1296                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1297                     arch::aarch64::Error::SetupGic,
1298                 ))
1299             })?;
1300 
1301         // The PMU interrupt is a PPI, so 16 must be added to it to get the real IRQ number.
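        // Illustrative arithmetic only (not from the original source): if
        // AARCH64_PMU_IRQ were PPI 7, the value passed to init_pmu would be
        // 7 + 16 = 23, i.e. the GIC interrupt ID of that PPI.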
1302         let pmu_supported = self
1303             .cpu_manager
1304             .lock()
1305             .unwrap()
1306             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1307             .map_err(|_| {
1308                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1309                     arch::aarch64::Error::VcpuInitPmu,
1310                 ))
1311             })?;
1312 
1313         arch::configure_system(
1314             &mem,
1315             cmdline.as_cstring().unwrap().to_str().unwrap(),
1316             vcpu_mpidrs,
1317             vcpu_topology,
1318             device_info,
1319             &initramfs_config,
1320             &pci_space_info,
1321             virtio_iommu_bdf.map(|bdf| bdf.into()),
1322             &vgic,
1323             &self.numa_nodes,
1324             pmu_supported,
1325         )
1326         .map_err(Error::ConfigureSystem)?;
1327 
1328         Ok(())
1329     }
1330 
1331     pub fn serial_pty(&self) -> Option<PtyPair> {
1332         self.device_manager.lock().unwrap().serial_pty()
1333     }
1334 
1335     pub fn console_pty(&self) -> Option<PtyPair> {
1336         self.device_manager.lock().unwrap().console_pty()
1337     }
1338 
1339     pub fn debug_console_pty(&self) -> Option<PtyPair> {
1340         self.device_manager.lock().unwrap().debug_console_pty()
1341     }
1342 
1343     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1344         self.device_manager.lock().unwrap().console_resize_pipe()
1345     }
1346 
1347     pub fn shutdown(&mut self) -> Result<()> {
1348         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1349         let new_state = VmState::Shutdown;
1350 
1351         state.valid_transition(new_state)?;
1352 
1353         // Wake up the DeviceManager threads so they will get terminated cleanly
1354         self.device_manager
1355             .lock()
1356             .unwrap()
1357             .resume()
1358             .map_err(Error::Resume)?;
1359 
1360         self.cpu_manager
1361             .lock()
1362             .unwrap()
1363             .shutdown()
1364             .map_err(Error::CpuManager)?;
1365 
1366         // Wait for all the threads to finish
1367         for thread in self.threads.drain(..) {
1368             thread.join().map_err(Error::ThreadCleanup)?
1369         }
1370         *state = new_state;
1371 
1372         event!("vm", "shutdown");
1373 
1374         Ok(())
1375     }
1376 
1377     pub fn resize(
1378         &mut self,
1379         desired_vcpus: Option<u8>,
1380         desired_memory: Option<u64>,
1381         desired_balloon: Option<u64>,
1382     ) -> Result<()> {
1383         event!("vm", "resizing");
1384 
1385         if let Some(desired_vcpus) = desired_vcpus {
1386             if self
1387                 .cpu_manager
1388                 .lock()
1389                 .unwrap()
1390                 .resize(desired_vcpus)
1391                 .map_err(Error::CpuManager)?
1392             {
1393                 self.device_manager
1394                     .lock()
1395                     .unwrap()
1396                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1397                     .map_err(Error::DeviceManager)?;
1398             }
1399             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1400         }
1401 
1402         if let Some(desired_memory) = desired_memory {
1403             let new_region = self
1404                 .memory_manager
1405                 .lock()
1406                 .unwrap()
1407                 .resize(desired_memory)
1408                 .map_err(Error::MemoryManager)?;
1409 
1410             let memory_config = &mut self.config.lock().unwrap().memory;
1411 
1412             if let Some(new_region) = &new_region {
1413                 self.device_manager
1414                     .lock()
1415                     .unwrap()
1416                     .update_memory(new_region)
1417                     .map_err(Error::DeviceManager)?;
1418 
1419                 match memory_config.hotplug_method {
1420                     HotplugMethod::Acpi => {
1421                         self.device_manager
1422                             .lock()
1423                             .unwrap()
1424                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1425                             .map_err(Error::DeviceManager)?;
1426                     }
1427                     HotplugMethod::VirtioMem => {}
1428                 }
1429             }
1430 
1431             // We update the VM config regardless of the actual guest resize
1432             // operation result (happened or not), so that if the VM reboots
1433             // it will be running with the last configured memory size.
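            // Worked example (illustrative, not from the original source):
            // with `memory_config.size` = 2 GiB and `desired_memory` = 6 GiB
            // under HotplugMethod::VirtioMem, `hotplugged_size` becomes
            // Some(4 GiB); shrinking back to 2 GiB or below resets it to None.
            // Under HotplugMethod::Acpi the base `size` itself is updated.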
1434             match memory_config.hotplug_method {
1435                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1436                 HotplugMethod::VirtioMem => {
1437                     if desired_memory > memory_config.size {
1438                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1439                     } else {
1440                         memory_config.hotplugged_size = None;
1441                     }
1442                 }
1443             }
1444         }
1445 
1446         if let Some(desired_balloon) = desired_balloon {
1447             self.device_manager
1448                 .lock()
1449                 .unwrap()
1450                 .resize_balloon(desired_balloon)
1451                 .map_err(Error::DeviceManager)?;
1452 
1453             // Update the configuration value for the balloon size to ensure
1454             // a reboot would use the right value.
1455             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1456                 balloon_config.size = desired_balloon;
1457             }
1458         }
1459 
1460         event!("vm", "resized");
1461 
1462         Ok(())
1463     }
1464 
1465     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1466         let memory_config = &mut self.config.lock().unwrap().memory;
1467 
1468         if let Some(zones) = &mut memory_config.zones {
1469             for zone in zones.iter_mut() {
1470                 if zone.id == id {
1471                     if desired_memory >= zone.size {
1472                         let hotplugged_size = desired_memory - zone.size;
1473                         self.memory_manager
1474                             .lock()
1475                             .unwrap()
1476                             .resize_zone(&id, desired_memory - zone.size)
1477                             .map_err(Error::MemoryManager)?;
1478                         // We update the memory zone config regardless of the
1479                         // actual 'resize-zone' operation result (happened or
1480                         // not), so that if the VM reboots it will be running
1481                         // with the last configured memory zone size.
1482                         zone.hotplugged_size = Some(hotplugged_size);
1483 
1484                         return Ok(());
1485                     } else {
1486                         error!(
1487                             "Invalid to request less ({}) than boot RAM ({}) for \
1488                             this memory zone",
1489                             desired_memory, zone.size,
1490                         );
1491                         return Err(Error::ResizeZone);
1492                     }
1493                 }
1494             }
1495         }
1496 
1497         error!("Could not find the memory zone {} for the resize", id);
1498         Err(Error::ResizeZone)
1499     }
1500 
1501     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1502         let pci_device_info = self
1503             .device_manager
1504             .lock()
1505             .unwrap()
1506             .add_device(&mut device_cfg)
1507             .map_err(Error::DeviceManager)?;
1508 
1509         // Update VmConfig by adding the new device. This is important to
1510         // ensure the device would be created in case of a reboot.
1511         {
1512             let mut config = self.config.lock().unwrap();
1513             add_to_config(&mut config.devices, device_cfg);
1514         }
1515 
1516         self.device_manager
1517             .lock()
1518             .unwrap()
1519             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1520             .map_err(Error::DeviceManager)?;
1521 
1522         Ok(pci_device_info)
1523     }
1524 
1525     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1526         let pci_device_info = self
1527             .device_manager
1528             .lock()
1529             .unwrap()
1530             .add_user_device(&mut device_cfg)
1531             .map_err(Error::DeviceManager)?;
1532 
1533         // Update VmConfig by adding the new device. This is important to
1534         // ensure the device would be created in case of a reboot.
1535         {
1536             let mut config = self.config.lock().unwrap();
1537             add_to_config(&mut config.user_devices, device_cfg);
1538         }
1539 
1540         self.device_manager
1541             .lock()
1542             .unwrap()
1543             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1544             .map_err(Error::DeviceManager)?;
1545 
1546         Ok(pci_device_info)
1547     }
1548 
1549     pub fn remove_device(&mut self, id: String) -> Result<()> {
1550         self.device_manager
1551             .lock()
1552             .unwrap()
1553             .remove_device(id.clone())
1554             .map_err(Error::DeviceManager)?;
1555 
1556         // Update VmConfig by removing the device. This is important to
1557         // ensure the device would not be created in case of a reboot.
1558         self.config.lock().unwrap().remove_device(&id);
1559 
1560         self.device_manager
1561             .lock()
1562             .unwrap()
1563             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1564             .map_err(Error::DeviceManager)?;
1565         Ok(())
1566     }
1567 
1568     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1569         let pci_device_info = self
1570             .device_manager
1571             .lock()
1572             .unwrap()
1573             .add_disk(&mut disk_cfg)
1574             .map_err(Error::DeviceManager)?;
1575 
1576         // Update VmConfig by adding the new device. This is important to
1577         // ensure the device would be created in case of a reboot.
1578         {
1579             let mut config = self.config.lock().unwrap();
1580             add_to_config(&mut config.disks, disk_cfg);
1581         }
1582 
1583         self.device_manager
1584             .lock()
1585             .unwrap()
1586             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1587             .map_err(Error::DeviceManager)?;
1588 
1589         Ok(pci_device_info)
1590     }
1591 
1592     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1593         let pci_device_info = self
1594             .device_manager
1595             .lock()
1596             .unwrap()
1597             .add_fs(&mut fs_cfg)
1598             .map_err(Error::DeviceManager)?;
1599 
1600         // Update VmConfig by adding the new device. This is important to
1601         // ensure the device would be created in case of a reboot.
1602         {
1603             let mut config = self.config.lock().unwrap();
1604             add_to_config(&mut config.fs, fs_cfg);
1605         }
1606 
1607         self.device_manager
1608             .lock()
1609             .unwrap()
1610             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1611             .map_err(Error::DeviceManager)?;
1612 
1613         Ok(pci_device_info)
1614     }
1615 
1616     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1617         let pci_device_info = self
1618             .device_manager
1619             .lock()
1620             .unwrap()
1621             .add_pmem(&mut pmem_cfg)
1622             .map_err(Error::DeviceManager)?;
1623 
1624         // Update VmConfig by adding the new device. This is important to
1625         // ensure the device would be created in case of a reboot.
1626         {
1627             let mut config = self.config.lock().unwrap();
1628             add_to_config(&mut config.pmem, pmem_cfg);
1629         }
1630 
1631         self.device_manager
1632             .lock()
1633             .unwrap()
1634             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1635             .map_err(Error::DeviceManager)?;
1636 
1637         Ok(pci_device_info)
1638     }
1639 
1640     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1641         let pci_device_info = self
1642             .device_manager
1643             .lock()
1644             .unwrap()
1645             .add_net(&mut net_cfg)
1646             .map_err(Error::DeviceManager)?;
1647 
1648         // Update VmConfig by adding the new device. This is important to
1649         // ensure the device would be created in case of a reboot.
1650         {
1651             let mut config = self.config.lock().unwrap();
1652             add_to_config(&mut config.net, net_cfg);
1653         }
1654 
1655         self.device_manager
1656             .lock()
1657             .unwrap()
1658             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1659             .map_err(Error::DeviceManager)?;
1660 
1661         Ok(pci_device_info)
1662     }
1663 
1664     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1665         let pci_device_info = self
1666             .device_manager
1667             .lock()
1668             .unwrap()
1669             .add_vdpa(&mut vdpa_cfg)
1670             .map_err(Error::DeviceManager)?;
1671 
1672         // Update VmConfig by adding the new device. This is important to
1673         // ensure the device would be created in case of a reboot.
1674         {
1675             let mut config = self.config.lock().unwrap();
1676             add_to_config(&mut config.vdpa, vdpa_cfg);
1677         }
1678 
1679         self.device_manager
1680             .lock()
1681             .unwrap()
1682             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1683             .map_err(Error::DeviceManager)?;
1684 
1685         Ok(pci_device_info)
1686     }
1687 
1688     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1689         let pci_device_info = self
1690             .device_manager
1691             .lock()
1692             .unwrap()
1693             .add_vsock(&mut vsock_cfg)
1694             .map_err(Error::DeviceManager)?;
1695 
1696         // Update VmConfig by adding the new device. This is important to
1697         // ensure the device would be created in case of a reboot.
1698         {
1699             let mut config = self.config.lock().unwrap();
1700             config.vsock = Some(vsock_cfg);
1701         }
1702 
1703         self.device_manager
1704             .lock()
1705             .unwrap()
1706             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1707             .map_err(Error::DeviceManager)?;
1708 
1709         Ok(pci_device_info)
1710     }
1711 
1712     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1713         Ok(self.device_manager.lock().unwrap().counters())
1714     }
1715 
1716     #[cfg(feature = "tdx")]
1717     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1718         use arch::x86_64::tdx::*;
1719 
1720         let firmware_path = self
1721             .config
1722             .lock()
1723             .unwrap()
1724             .payload
1725             .as_ref()
1726             .unwrap()
1727             .firmware
1728             .clone()
1729             .ok_or(Error::TdxFirmwareMissing)?;
1730         // The TDVF file contains a table of sections as well as code
1731         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1732 
1733         // Parse the TDVF sections; RAM backing them is allocated later in populate_tdx_sections()
1734         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1735     }
1736 
1737     #[cfg(feature = "tdx")]
1738     fn hob_memory_resources(
1739         mut sorted_sections: Vec<TdvfSection>,
1740         guest_memory: &GuestMemoryMmap,
1741     ) -> Vec<(u64, u64, bool)> {
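             // Build the list of (start, size, is_ram) tuples covering guest RAM interleaved
             // with TDVF sections. `sorted_sections` is expected to be sorted by descending
             // address so that pop() returns the lowest-address section first.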
1742         let mut list = Vec::new();
1743 
1744         let mut current_section = sorted_sections.pop();
1745 
1746         // RAM regions interleaved with TDVF sections
1747         let mut next_start_addr = 0;
1748         for region in guest_memory.iter() {
1749             let region_start = region.start_addr().0;
1750             let region_end = region.last_addr().0;
1751             if region_start > next_start_addr {
1752                 next_start_addr = region_start;
1753             }
1754 
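                 // Walk the current region, alternating between TDVF sections and plain
                 // RAM chunks, until the end of the region is reached.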
1755             loop {
1756                 let (start, size, ram) = if let Some(section) = &current_section {
1757                     if section.address <= next_start_addr {
1758                         (section.address, section.size, false)
1759                     } else {
1760                         let last_addr = std::cmp::min(section.address - 1, region_end);
1761                         (next_start_addr, last_addr - next_start_addr + 1, true)
1762                     }
1763                 } else {
1764                     (next_start_addr, region_end - next_start_addr + 1, true)
1765                 };
1766 
1767                 list.push((start, size, ram));
1768 
1769                 if !ram {
1770                     current_section = sorted_sections.pop();
1771                 }
1772 
1773                 next_start_addr = start + size;
1774 
1775                 if region_start > next_start_addr {
1776                     next_start_addr = region_start;
1777                 }
1778 
1779                 if next_start_addr > region_end {
1780                     break;
1781                 }
1782             }
1783         }
1784 
1785         // Once all the interleaved sections have been processed, let's simply
1786         // pull the remaining ones.
1787         if let Some(section) = current_section {
1788             list.push((section.address, section.size, false));
1789         }
1790         while let Some(section) = sorted_sections.pop() {
1791             list.push((section.address, section.size, false));
1792         }
1793 
1794         list
1795     }
1796 
1797     #[cfg(feature = "tdx")]
1798     fn populate_tdx_sections(
1799         &mut self,
1800         sections: &[TdvfSection],
1801         guid_found: bool,
1802     ) -> Result<Option<u64>> {
1803         use arch::x86_64::tdx::*;
1804         // Capture the boot guest memory *before* we start adding TDVF RAM regions
1805         let boot_guest_memory = self
1806             .memory_manager
1807             .lock()
1808             .as_ref()
1809             .unwrap()
1810             .boot_guest_memory();
1811         for section in sections {
1812             // No need to allocate if the section falls within guest RAM ranges
1813             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1814                 info!(
1815                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1816                     section
1817                 );
1818                 continue;
1819             }
1820 
1821             info!("Allocating TDVF Section: {:x?}", section);
1822             self.memory_manager
1823                 .lock()
1824                 .unwrap()
1825                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1826                 .map_err(Error::AllocatingTdvfMemory)?;
1827         }
1828 
1829         // The TDVF file contains a table of sections as well as code
1830         let firmware_path = self
1831             .config
1832             .lock()
1833             .unwrap()
1834             .payload
1835             .as_ref()
1836             .unwrap()
1837             .firmware
1838             .clone()
1839             .ok_or(Error::TdxFirmwareMissing)?;
1840         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1841 
1842         // The guest memory now has all the required regions, so it is safe to
1843         // copy from the TDVF file into it.
1844         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1845         let mem = guest_memory.memory();
1846         let mut payload_info = None;
1847         let mut hob_offset = None;
1848         for section in sections {
1849             info!("Populating TDVF Section: {:x?}", section);
1850             match section.r#type {
1851                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1852                     info!("Copying section to guest memory");
1853                     firmware_file
1854                         .seek(SeekFrom::Start(section.data_offset as u64))
1855                         .map_err(Error::LoadTdvf)?;
1856                     mem.read_volatile_from(
1857                         GuestAddress(section.address),
1858                         &mut firmware_file,
1859                         section.data_size as usize,
1860                     )
1861                     .unwrap();
1862                 }
1863                 TdvfSectionType::TdHob => {
1864                     hob_offset = Some(section.address);
1865                 }
1866                 TdvfSectionType::Payload => {
1867                     info!("Copying payload to guest memory");
1868                     if let Some(payload_file) = self.kernel.as_mut() {
1869                         let payload_size = payload_file
1870                             .seek(SeekFrom::End(0))
1871                             .map_err(Error::LoadPayload)?;
1872 
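                             // 0x1f1 is the offset of the Linux boot protocol setup_header
                             // within the kernel image.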
1873                         payload_file
1874                             .seek(SeekFrom::Start(0x1f1))
1875                             .map_err(Error::LoadPayload)?;
1876 
1877                         let mut payload_header = linux_loader::bootparam::setup_header::default();
1878                         payload_file
1879                             .read_volatile(&mut payload_header.as_bytes())
1880                             .unwrap();
1881 
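                             // 0x5372_6448 is the "HdrS" magic expected in setup_header.header.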
1882                         if payload_header.header != 0x5372_6448 {
1883                             return Err(Error::InvalidPayloadType);
1884                         }
1885 
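                             // Require boot protocol version >= 2.00 with the LOADED_HIGH
                             // flag set, i.e. a bzImage-style payload.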
1886                         if (payload_header.version < 0x0200)
1887                             || ((payload_header.loadflags & 0x1) == 0x0)
1888                         {
1889                             return Err(Error::InvalidPayloadType);
1890                         }
1891 
1892                         payload_file.rewind().map_err(Error::LoadPayload)?;
1893                         mem.read_volatile_from(
1894                             GuestAddress(section.address),
1895                             payload_file,
1896                             payload_size as usize,
1897                         )
1898                         .unwrap();
1899 
1900                         // Create the payload info that will be inserted into
1901                         // the HOB.
1902                         payload_info = Some(PayloadInfo {
1903                             image_type: PayloadImageType::BzImage,
1904                             entry_point: section.address,
1905                         });
1906                     }
1907                 }
1908                 TdvfSectionType::PayloadParam => {
1909                     info!("Copying payload parameters to guest memory");
1910                     let cmdline = Self::generate_cmdline(
1911                         self.config.lock().unwrap().payload.as_ref().unwrap(),
1912                     )?;
1913                     mem.write_slice(
1914                         cmdline.as_cstring().unwrap().as_bytes_with_nul(),
1915                         GuestAddress(section.address),
1916                     )
1917                     .unwrap();
1918                 }
1919                 _ => {}
1920             }
1921         }
1922 
1923         // Generate HOB
1924         let mut hob = TdHob::start(hob_offset.unwrap());
1925 
1926         let mut sorted_sections = sections.to_vec();
1927         sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));
1928 
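             // Sort the sections by descending address so that hob_memory_resources() can
             // pop() them in ascending address order.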
1929         sorted_sections.sort_by_key(|section| section.address);
1930         sorted_sections.reverse();
1931 
1932         for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
1933             hob.add_memory_resource(&mem, start, size, ram, guid_found)
1934                 .map_err(Error::PopulateHob)?;
1935         }
1936 
1937         // MMIO regions
1938         hob.add_mmio_resource(
1939             &mem,
1940             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1941             arch::layout::APIC_START.raw_value()
1942                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1943         )
1944         .map_err(Error::PopulateHob)?;
1945         let start_of_device_area = self
1946             .memory_manager
1947             .lock()
1948             .unwrap()
1949             .start_of_device_area()
1950             .raw_value();
1951         let end_of_device_area = self
1952             .memory_manager
1953             .lock()
1954             .unwrap()
1955             .end_of_device_area()
1956             .raw_value();
1957         hob.add_mmio_resource(
1958             &mem,
1959             start_of_device_area,
1960             end_of_device_area - start_of_device_area,
1961         )
1962         .map_err(Error::PopulateHob)?;
1963 
1964         // Loop over the ACPI tables and copy them to the HOB.
1965 
1966         for acpi_table in crate::acpi::create_acpi_tables_tdx(
1967             &self.device_manager,
1968             &self.cpu_manager,
1969             &self.memory_manager,
1970             &self.numa_nodes,
1971         ) {
1972             hob.add_acpi_table(&mem, acpi_table.as_slice())
1973                 .map_err(Error::PopulateHob)?;
1974         }
1975 
1976         // If a payload info has been created, let's insert it into the HOB.
1977         if let Some(payload_info) = payload_info {
1978             hob.add_payload(&mem, payload_info)
1979                 .map_err(Error::PopulateHob)?;
1980         }
1981 
1982         hob.finish(&mem).map_err(Error::PopulateHob)?;
1983 
1984         Ok(hob_offset)
1985     }
1986 
1987     #[cfg(feature = "tdx")]
1988     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1989         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1990         let mem = guest_memory.memory();
1991 
1992         for section in sections {
1993             self.vm
1994                 .tdx_init_memory_region(
1995                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
1996                     section.address,
1997                     section.size,
1998                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
1999                     section.attributes == 1,
2000                 )
2001                 .map_err(Error::InitializeTdxMemoryRegion)?;
2002         }
2003 
2004         Ok(())
2005     }
2006 
2007     // Creates ACPI tables
2008     // If TDX is enabled this is a no-op since the tables will be
2009     // created and passed when populating the HOB.
2010 
2011     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2012         #[cfg(feature = "tdx")]
2013         if self.config.lock().unwrap().is_tdx_enabled() {
2014             return None;
2015         }
2016         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2017         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2018         let rsdp_addr = crate::acpi::create_acpi_tables(
2019             &mem,
2020             &self.device_manager,
2021             &self.cpu_manager,
2022             &self.memory_manager,
2023             &self.numa_nodes,
2024             tpm_enabled,
2025         );
2026         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2027 
2028         Some(rsdp_addr)
2029     }
2030 
2031     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2032         trace_scoped!("entry_point");
2033 
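             // Join the asynchronous payload loading thread, if any, and take its entry point.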
2034         self.load_payload_handle
2035             .take()
2036             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2037             .transpose()
2038     }
2039 
2040     pub fn boot(&mut self) -> Result<()> {
2041         trace_scoped!("Vm::boot");
2042         info!("Booting VM");
2043         event!("vm", "booting");
2044         let current_state = self.get_state()?;
2045         if current_state == VmState::Paused {
2046             return self.resume().map_err(Error::Resume);
2047         }
2048 
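             // If the VM was asked to stop on boot (e.g. so that a debugger can attach),
             // move to the BreakPoint state instead of Running.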
2049         let new_state = if self.stop_on_boot {
2050             VmState::BreakPoint
2051         } else {
2052             VmState::Running
2053         };
2054         current_state.valid_transition(new_state)?;
2055 
2056         // Do this early to parallelise with loading the kernel
2057         #[cfg(target_arch = "x86_64")]
2058         cfg_if::cfg_if! {
2059             if #[cfg(feature = "sev_snp")] {
2060                 let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
2061                 let rsdp_addr = if sev_snp_enabled {
2062                     // In case of SEV-SNP guest ACPI tables are provided via
2063                     // IGVM. So skip the creation of ACPI tables and set the
2064                     // rsdp addr to None.
2065                     None
2066                 } else {
2067                     self.create_acpi_tables()
2068                 };
2069             } else {
2070                 let rsdp_addr = self.create_acpi_tables();
2071             }
2072         }
2073 
2074         // Load kernel synchronously or if asynchronous then wait for load to
2075         // finish.
2076         let entry_point = self.entry_point()?;
2077 
2078         #[cfg(feature = "tdx")]
2079         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2080 
2081         // Configure the vcpus that have been created
2082         let vcpus = self.cpu_manager.lock().unwrap().vcpus();
2083         for vcpu in vcpus {
2084             let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
2085             let boot_setup = entry_point.map(|e| (e, guest_memory));
2086             self.cpu_manager
2087                 .lock()
2088                 .unwrap()
2089                 .configure_vcpu(vcpu, boot_setup)
2090                 .map_err(Error::CpuManager)?;
2091         }
2092 
2093         #[cfg(feature = "tdx")]
2094         let (sections, guid_found) = if tdx_enabled {
2095             self.extract_tdvf_sections()?
2096         } else {
2097             (Vec::new(), false)
2098         };
2099 
2100         // Configuring the TDX regions requires that the vCPUs are created.
2101         #[cfg(feature = "tdx")]
2102         let hob_address = if tdx_enabled {
2103             // TDX sections are written to memory.
2104             self.populate_tdx_sections(&sections, guid_found)?
2105         } else {
2106             None
2107         };
2108 
2109         // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
2110         // available after they are configured
2111         #[cfg(target_arch = "aarch64")]
2112         let rsdp_addr = self.create_acpi_tables();
2113 
2114         // Configure shared state based on loaded kernel
2115         entry_point
2116             .map(|entry_point| {
2117                 // Safe to unwrap rsdp_addr as we know it can't be None when
2118                 // the entry_point is Some.
2119                 self.configure_system(rsdp_addr.unwrap(), entry_point)
2120             })
2121             .transpose()?;
2122 
2123         #[cfg(target_arch = "x86_64")]
2124         // Note: For x86, always call this function before invoking start_boot_vcpus().
2125         // Otherwise the guest would fail to boot because the userspace mappings
2126         // needed to update the hypervisor about the memory layout haven't been
2127         // created yet. These mappings must be created before we start the vCPU
2128         // threads for the very first time.
2129         self.memory_manager
2130             .lock()
2131             .unwrap()
2132             .allocate_address_space()
2133             .map_err(Error::MemoryManager)?;
2134 
2135         #[cfg(feature = "tdx")]
2136         if let Some(hob_address) = hob_address {
2137             // With the HOB address extracted the vCPUs can have
2138             // their TDX state configured.
2139             self.cpu_manager
2140                 .lock()
2141                 .unwrap()
2142                 .initialize_tdx(hob_address)
2143                 .map_err(Error::CpuManager)?;
2144             // Let the hypervisor know which memory ranges are shared with the
2145             // guest. This prevents the guest from ignoring/discarding memory
2146             // regions provided by the host.
2147             self.init_tdx_memory(&sections)?;
2148             // With TDX memory and CPU state configured TDX setup is complete
2149             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
2150         }
2151 
2152         self.cpu_manager
2153             .lock()
2154             .unwrap()
2155             .start_boot_vcpus(new_state == VmState::BreakPoint)
2156             .map_err(Error::CpuManager)?;
2157 
2158         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
2159         *state = new_state;
2160         event!("vm", "booted");
2161         Ok(())
2162     }
2163 
2164     pub fn restore(&mut self) -> Result<()> {
2165         event!("vm", "restoring");
2166 
2167         #[cfg(target_arch = "x86_64")]
2168         // Note: For x86, always call this function before starting the restored vCPUs.
2169         // Otherwise the guest would fail to boot because the userspace mappings
2170         // needed to update the hypervisor about the memory layout haven't been
2171         // created yet. These mappings must be created before we start the vCPU
2172         // threads for the very first time for the restored VM.
2173         self.memory_manager
2174             .lock()
2175             .unwrap()
2176             .allocate_address_space()
2177             .map_err(Error::MemoryManager)?;
2178 
2179         // Now we can start all vCPUs from here.
2180         self.cpu_manager
2181             .lock()
2182             .unwrap()
2183             .start_restored_vcpus()
2184             .map_err(Error::CpuManager)?;
2185 
2186         event!("vm", "restored");
2187         Ok(())
2188     }
2189 
2190     /// Gets a thread-safe reference counted pointer to the VM configuration.
2191     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2192         Arc::clone(&self.config)
2193     }
2194 
2195     /// Get the VM state. Returns an error if the state is poisoned.
2196     pub fn get_state(&self) -> Result<VmState> {
2197         self.state
2198             .try_read()
2199             .map_err(|_| Error::PoisonedState)
2200             .map(|state| *state)
2201     }
2202 
2203     /// Gets the actual size of the balloon.
2204     pub fn balloon_size(&self) -> u64 {
2205         self.device_manager.lock().unwrap().balloon_size()
2206     }
2207 
2208     pub fn send_memory_fds(
2209         &mut self,
2210         socket: &mut UnixStream,
2211     ) -> std::result::Result<(), MigratableError> {
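             // For each guest memory slot, send a memory fd request followed by the slot
             // number with its file descriptor attached, then wait for the destination to
             // acknowledge it.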
2212         for (slot, fd) in self
2213             .memory_manager
2214             .lock()
2215             .unwrap()
2216             .memory_slot_fds()
2217             .drain()
2218         {
2219             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2220                 .write_to(socket)
2221                 .map_err(|e| {
2222                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2223                 })?;
2224             socket
2225                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2226                 .map_err(|e| {
2227                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2228                 })?;
2229 
2230             let res = Response::read_from(socket)?;
2231             if res.status() != Status::Ok {
2232                 warn!("Error during memory fd migration");
2233                 Request::abandon().write_to(socket)?;
2234                 Response::read_from(socket).ok();
2235                 return Err(MigratableError::MigrateSend(anyhow!(
2236                     "Error during memory fd migration"
2237                 )));
2238             }
2239         }
2240 
2241         Ok(())
2242     }
2243 
2244     pub fn send_memory_regions<F>(
2245         &mut self,
2246         ranges: &MemoryRangeTable,
2247         fd: &mut F,
2248     ) -> std::result::Result<(), MigratableError>
2249     where
2250         F: WriteVolatile,
2251     {
2252         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2253         let mem = guest_memory.memory();
2254 
2255         for range in ranges.regions() {
2256             let mut offset: u64 = 0;
2257             // Here we are manually handling the retry in case we can't write the
2258             // whole region at once, because we can't use the write_all_to()
2259             // implementation from vm-memory::GuestMemory as it does not follow
2260             // the correct behavior. For more info about this issue see:
2261             // https://github.com/rust-vmm/vm-memory/issues/174
2262             loop {
2263                 let bytes_written = mem
2264                     .write_volatile_to(
2265                         GuestAddress(range.gpa + offset),
2266                         fd,
2267                         (range.length - offset) as usize,
2268                     )
2269                     .map_err(|e| {
2270                         MigratableError::MigrateSend(anyhow!(
2271                             "Error transferring memory to socket: {}",
2272                             e
2273                         ))
2274                     })?;
2275                 offset += bytes_written as u64;
2276 
2277                 if offset == range.length {
2278                     break;
2279                 }
2280             }
2281         }
2282 
2283         Ok(())
2284     }
2285 
2286     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2287         self.memory_manager
2288             .lock()
2289             .unwrap()
2290             .memory_range_table(false)
2291     }
2292 
2293     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2294         self.device_manager.lock().unwrap().device_tree()
2295     }
2296 
2297     pub fn activate_virtio_devices(&self) -> Result<()> {
2298         self.device_manager
2299             .lock()
2300             .unwrap()
2301             .activate_virtio_devices()
2302             .map_err(Error::ActivateVirtioDevices)
2303     }
2304 
2305     #[cfg(target_arch = "x86_64")]
2306     pub fn power_button(&self) -> Result<()> {
2307         self.device_manager
2308             .lock()
2309             .unwrap()
2310             .notify_power_button()
2311             .map_err(Error::PowerButton)
2313     }
2314 
2315     #[cfg(target_arch = "aarch64")]
2316     pub fn power_button(&self) -> Result<()> {
2317         self.device_manager
2318             .lock()
2319             .unwrap()
2320             .notify_power_button()
2321             .map_err(Error::PowerButton)
2322     }
2323 
2324     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2325         self.memory_manager.lock().unwrap().snapshot_data()
2326     }
2327 
2328     #[cfg(feature = "guest_debug")]
2329     pub fn debug_request(
2330         &mut self,
2331         gdb_request: &GdbRequestPayload,
2332         cpu_id: usize,
2333     ) -> Result<GdbResponsePayload> {
2334         use GdbRequestPayload::*;
2335         match gdb_request {
2336             SetSingleStep(single_step) => {
2337                 self.set_guest_debug(cpu_id, &[], *single_step)
2338                     .map_err(Error::Debug)?;
2339             }
2340             SetHwBreakPoint(addrs) => {
2341                 self.set_guest_debug(cpu_id, addrs, false)
2342                     .map_err(Error::Debug)?;
2343             }
2344             Pause => {
2345                 self.debug_pause().map_err(Error::Debug)?;
2346             }
2347             Resume => {
2348                 self.debug_resume().map_err(Error::Debug)?;
2349             }
2350             ReadRegs => {
2351                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2352                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2353             }
2354             WriteRegs(regs) => {
2355                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2356             }
2357             ReadMem(vaddr, len) => {
2358                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2359                 let mem = self
2360                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2361                     .map_err(Error::Debug)?;
2362                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2363             }
2364             WriteMem(vaddr, data) => {
2365                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2366                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2367                     .map_err(Error::Debug)?;
2368             }
2369             ActiveVcpus => {
2370                 let active_vcpus = self.active_vcpus();
2371                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2372             }
2373         }
2374         Ok(GdbResponsePayload::CommandComplete)
2375     }
2376 
2377     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2378     fn get_dump_state(
2379         &mut self,
2380         destination_url: &str,
2381     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2382         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2383         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2384         let mut elf_phdr_num = 1;
2385         let elf_sh_info = 0;
2386         let coredump_file_path = url_to_file(destination_url)?;
2387         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2388 
2389         if mapping_num < UINT16_MAX - 2 {
2390             elf_phdr_num += mapping_num as u16;
2391         } else {
2392             panic!("mapping num beyond 65535 not supported");
2393         }
2394         let coredump_file = OpenOptions::new()
2395             .read(true)
2396             .write(true)
2397             .create_new(true)
2398             .open(coredump_file_path)
2399             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2400 
2401         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2402         let mem_data = self
2403             .memory_manager
2404             .lock()
2405             .unwrap()
2406             .coredump_memory_regions(mem_offset);
2407 
2408         Ok(DumpState {
2409             elf_note_size,
2410             elf_phdr_num,
2411             elf_sh_info,
2412             mem_offset,
2413             mem_info: Some(mem_data),
2414             file: Some(coredump_file),
2415         })
2416     }
2417 
2418     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2419     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
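             // The memory contents are written after the ELF header, the note section and
             // all the program headers.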
2420         size_of::<elf::Elf64_Ehdr>() as u64
2421             + note_size as u64
2422             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2423     }
2424 }
2425 
2426 impl Pausable for Vm {
2427     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2428         event!("vm", "pausing");
2429         let mut state = self
2430             .state
2431             .try_write()
2432             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2433         let new_state = VmState::Paused;
2434 
2435         state
2436             .valid_transition(new_state)
2437             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2438 
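             // Save the KVM clock (with its flags cleared) so that the guest time can be
             // restored consistently on resume.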
2439         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2440         {
2441             let mut clock = self
2442                 .vm
2443                 .get_clock()
2444                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2445             clock.reset_flags();
2446             self.saved_clock = Some(clock);
2447         }
2448 
2449         // Before pausing the vCPUs, activate any virtio devices that are still pending
2450         // activation, e.g. requested while the pause (or enclosing migration) was starting.
2451         self.activate_virtio_devices().map_err(|e| {
2452             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2453         })?;
2454 
2455         self.cpu_manager.lock().unwrap().pause()?;
2456         self.device_manager.lock().unwrap().pause()?;
2457 
2458         *state = new_state;
2459 
2460         event!("vm", "paused");
2461         Ok(())
2462     }
2463 
2464     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2465         event!("vm", "resuming");
2466         let mut state = self
2467             .state
2468             .try_write()
2469             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2470         let new_state = VmState::Running;
2471 
2472         state
2473             .valid_transition(new_state)
2474             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2475 
2476         self.cpu_manager.lock().unwrap().resume()?;
2477         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2478         {
2479             if let Some(clock) = &self.saved_clock {
2480                 self.vm.set_clock(clock).map_err(|e| {
2481                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2482                 })?;
2483             }
2484         }
2485         self.device_manager.lock().unwrap().resume()?;
2486 
2487         // And we're back to the Running state.
2488         *state = new_state;
2489         event!("vm", "resumed");
2490         Ok(())
2491     }
2492 }
2493 
2494 #[derive(Serialize, Deserialize)]
2495 pub struct VmSnapshot {
2496     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2497     pub clock: Option<hypervisor::ClockData>,
2498     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2499     pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
2500 }
2501 
2502 pub const VM_SNAPSHOT_ID: &str = "vm";
2503 impl Snapshottable for Vm {
2504     fn id(&self) -> String {
2505         VM_SNAPSHOT_ID.to_string()
2506     }
2507 
2508     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2509         event!("vm", "snapshotting");
2510 
2511         #[cfg(feature = "tdx")]
2512         {
2513             if self.config.lock().unwrap().is_tdx_enabled() {
2514                 return Err(MigratableError::Snapshot(anyhow!(
2515                     "Snapshot not possible with TDX VM"
2516                 )));
2517             }
2518         }
2519 
2520         let current_state = self.get_state().unwrap();
2521         if current_state != VmState::Paused {
2522             return Err(MigratableError::Snapshot(anyhow!(
2523                 "Trying to snapshot while VM is running"
2524             )));
2525         }
2526 
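             // Generate and store the common CPUID so that the restore path can reuse it
             // when configuring the vCPUs.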
2527         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2528         let common_cpuid = {
2529             let amx = self.config.lock().unwrap().cpus.features.amx;
2530             let phys_bits = physical_bits(
2531                 &self.hypervisor,
2532                 self.config.lock().unwrap().cpus.max_phys_bits,
2533             );
2534             arch::generate_common_cpuid(
2535                 &self.hypervisor,
2536                 &arch::CpuidConfig {
2537                     sgx_epc_sections: None,
2538                     phys_bits,
2539                     kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2540                     #[cfg(feature = "tdx")]
2541                     tdx: false,
2542                     amx,
2543                 },
2544             )
2545             .map_err(|e| {
2546                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2547             })?
2548         };
2549 
2550         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2551             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2552             clock: self.saved_clock,
2553             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2554             common_cpuid,
2555         })
2556         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2557 
2558         let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data));
2559 
2560         let (id, snapshot) = {
2561             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2562             (cpu_manager.id(), cpu_manager.snapshot()?)
2563         };
2564         vm_snapshot.add_snapshot(id, snapshot);
2565         let (id, snapshot) = {
2566             let mut memory_manager = self.memory_manager.lock().unwrap();
2567             (memory_manager.id(), memory_manager.snapshot()?)
2568         };
2569         vm_snapshot.add_snapshot(id, snapshot);
2570         let (id, snapshot) = {
2571             let mut device_manager = self.device_manager.lock().unwrap();
2572             (device_manager.id(), device_manager.snapshot()?)
2573         };
2574         vm_snapshot.add_snapshot(id, snapshot);
2575 
2576         event!("vm", "snapshotted");
2577         Ok(vm_snapshot)
2578     }
2579 }
2580 
2581 impl Transportable for Vm {
2582     fn send(
2583         &self,
2584         snapshot: &Snapshot,
2585         destination_url: &str,
2586     ) -> std::result::Result<(), MigratableError> {
2587         let mut snapshot_config_path = url_to_path(destination_url)?;
2588         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2589 
2590         // Create the snapshot config file
2591         let mut snapshot_config_file = OpenOptions::new()
2592             .read(true)
2593             .write(true)
2594             .create_new(true)
2595             .open(snapshot_config_path)
2596             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2597 
2598         // Serialize and write the snapshot config
2599         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2600             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2601 
2602         snapshot_config_file
2603             .write_all(vm_config.as_bytes())
2604             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2605 
2606         let mut snapshot_state_path = url_to_path(destination_url)?;
2607         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2608 
2609         // Create the snapshot state file
2610         let mut snapshot_state_file = OpenOptions::new()
2611             .read(true)
2612             .write(true)
2613             .create_new(true)
2614             .open(snapshot_state_path)
2615             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2616 
2617         // Serialize and write the snapshot state
2618         let vm_state =
2619             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2620 
2621         snapshot_state_file
2622             .write_all(&vm_state)
2623             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2624 
2625         // Tell the memory manager to also send/write its own snapshot.
2626         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2627             self.memory_manager
2628                 .lock()
2629                 .unwrap()
2630                 .send(memory_manager_snapshot, destination_url)?;
2631         } else {
2632             return Err(MigratableError::MigrateSend(anyhow!(
2633                 "Missing memory manager snapshot"
2634             )));
2635         }
2636 
2637         Ok(())
2638     }
2639 }
2640 
2641 impl Migratable for Vm {
2642     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2643         self.memory_manager.lock().unwrap().start_dirty_log()?;
2644         self.device_manager.lock().unwrap().start_dirty_log()
2645     }
2646 
2647     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2648         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2649         self.device_manager.lock().unwrap().stop_dirty_log()
2650     }
2651 
2652     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
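             // Merge the dirty ranges reported by the memory manager and the device
             // manager into a single table.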
2653         Ok(MemoryRangeTable::new_from_tables(vec![
2654             self.memory_manager.lock().unwrap().dirty_log()?,
2655             self.device_manager.lock().unwrap().dirty_log()?,
2656         ]))
2657     }
2658 
2659     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2660         self.memory_manager.lock().unwrap().start_migration()?;
2661         self.device_manager.lock().unwrap().start_migration()
2662     }
2663 
2664     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2665         self.memory_manager.lock().unwrap().complete_migration()?;
2666         self.device_manager.lock().unwrap().complete_migration()
2667     }
2668 }
2669 
2670 #[cfg(feature = "guest_debug")]
2671 impl Debuggable for Vm {
2672     fn set_guest_debug(
2673         &self,
2674         cpu_id: usize,
2675         addrs: &[GuestAddress],
2676         singlestep: bool,
2677     ) -> std::result::Result<(), DebuggableError> {
2678         self.cpu_manager
2679             .lock()
2680             .unwrap()
2681             .set_guest_debug(cpu_id, addrs, singlestep)
2682     }
2683 
2684     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2685         if *self.state.read().unwrap() == VmState::Running {
2686             self.pause().map_err(DebuggableError::Pause)?;
2687         }
2688 
2689         let mut state = self
2690             .state
2691             .try_write()
2692             .map_err(|_| DebuggableError::PoisonedState)?;
2693         *state = VmState::BreakPoint;
2694         Ok(())
2695     }
2696 
2697     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2698         if *self.state.read().unwrap() == VmState::BreakPoint {
2699             self.resume().map_err(DebuggableError::Pause)?;
2700         }
2701 
2702         Ok(())
2703     }
2704 
2705     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2706         self.cpu_manager.lock().unwrap().read_regs(cpu_id)
2707     }
2708 
2709     fn write_regs(
2710         &self,
2711         cpu_id: usize,
2712         regs: &CoreRegs,
2713     ) -> std::result::Result<(), DebuggableError> {
2714         self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
2715     }
2716 
2717     fn read_mem(
2718         &self,
2719         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2720         cpu_id: usize,
2721         vaddr: GuestAddress,
2722         len: usize,
2723     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2724         self.cpu_manager
2725             .lock()
2726             .unwrap()
2727             .read_mem(guest_memory, cpu_id, vaddr, len)
2728     }
2729 
2730     fn write_mem(
2731         &self,
2732         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2733         cpu_id: usize,
2734         vaddr: &GuestAddress,
2735         data: &[u8],
2736     ) -> std::result::Result<(), DebuggableError> {
2737         self.cpu_manager
2738             .lock()
2739             .unwrap()
2740             .write_mem(guest_memory, cpu_id, vaddr, data)
2741     }
2742 
2743     fn active_vcpus(&self) -> usize {
2744         let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
2745         if active_vcpus > 0 {
2746             active_vcpus
2747         } else {
2748             // The VM is not booted yet. Report boot_vcpus() instead.
2749             self.cpu_manager.lock().unwrap().boot_vcpus() as usize
2750         }
2751     }
2752 }
2753 
2754 #[cfg(feature = "guest_debug")]
2755 pub const UINT16_MAX: u32 = 65535;
2756 
2757 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2758 impl Elf64Writable for Vm {}
2759 
2760 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2761 impl GuestDebuggable for Vm {
2762     fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
2763         event!("vm", "coredumping");
2764 
2765         let mut resume = false;
2766 
2767         #[cfg(feature = "tdx")]
2768         {
2769             if let Some(ref platform) = self.config.lock().unwrap().platform {
2770                 if platform.tdx {
2771                     return Err(GuestDebuggableError::Coredump(anyhow!(
2772                         "Coredump not possible with TDX VM"
2773                     )));
2774                 }
2775             }
2776         }
2777 
2778         match self.get_state().unwrap() {
2779             VmState::Running => {
2780                 self.pause().map_err(GuestDebuggableError::Pause)?;
2781                 resume = true;
2782             }
2783             VmState::Paused => {}
2784             _ => {
2785                 return Err(GuestDebuggableError::Coredump(anyhow!(
2786                     "Trying to coredump while VM is not running or paused"
2787                 )));
2788             }
2789         }
2790 
2791         let coredump_state = self.get_dump_state(destination_url)?;
2792 
2793         self.write_header(&coredump_state)?;
2794         self.write_note(&coredump_state)?;
2795         self.write_loads(&coredump_state)?;
2796 
2797         self.cpu_manager
2798             .lock()
2799             .unwrap()
2800             .cpu_write_elf64_note(&coredump_state)?;
2801         self.cpu_manager
2802             .lock()
2803             .unwrap()
2804             .cpu_write_vmm_note(&coredump_state)?;
2805 
2806         self.memory_manager
2807             .lock()
2808             .unwrap()
2809             .coredump_iterate_save_mem(&coredump_state)?;
2810 
2811         if resume {
2812             self.resume().map_err(GuestDebuggableError::Resume)?;
2813         }
2814 
2815         Ok(())
2816     }
2817 }
2818 
2819 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2820 #[cfg(test)]
2821 mod tests {
2822     use super::*;
2823 
2824     fn test_vm_state_transitions(state: VmState) {
2825         match state {
2826             VmState::Created => {
2827                 // Check the transitions from Created
2828                 assert!(state.valid_transition(VmState::Created).is_err());
2829                 assert!(state.valid_transition(VmState::Running).is_ok());
2830                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2831                 assert!(state.valid_transition(VmState::Paused).is_ok());
2832                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2833             }
2834             VmState::Running => {
2835                 // Check the transitions from Running
2836                 assert!(state.valid_transition(VmState::Created).is_err());
2837                 assert!(state.valid_transition(VmState::Running).is_err());
2838                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2839                 assert!(state.valid_transition(VmState::Paused).is_ok());
2840                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2841             }
2842             VmState::Shutdown => {
2843                 // Check the transitions from Shutdown
2844                 assert!(state.valid_transition(VmState::Created).is_err());
2845                 assert!(state.valid_transition(VmState::Running).is_ok());
2846                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2847                 assert!(state.valid_transition(VmState::Paused).is_err());
2848                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2849             }
2850             VmState::Paused => {
2851                 // Check the transitions from Paused
2852                 assert!(state.valid_transition(VmState::Created).is_err());
2853                 assert!(state.valid_transition(VmState::Running).is_ok());
2854                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2855                 assert!(state.valid_transition(VmState::Paused).is_err());
2856                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2857             }
2858             VmState::BreakPoint => {
2859                 // Check the transitions from Breakpoint
2860                 assert!(state.valid_transition(VmState::Created).is_ok());
2861                 assert!(state.valid_transition(VmState::Running).is_ok());
2862                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2863                 assert!(state.valid_transition(VmState::Paused).is_err());
2864                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2865             }
2866         }
2867     }
2868 
2869     #[test]
2870     fn test_vm_created_transitions() {
2871         test_vm_state_transitions(VmState::Created);
2872     }
2873 
2874     #[test]
2875     fn test_vm_running_transitions() {
2876         test_vm_state_transitions(VmState::Running);
2877     }
2878 
2879     #[test]
2880     fn test_vm_shutdown_transitions() {
2881         test_vm_state_transitions(VmState::Shutdown);
2882     }
2883 
2884     #[test]
2885     fn test_vm_paused_transitions() {
2886         test_vm_state_transitions(VmState::Paused);
2887     }
2888 
2889     #[cfg(feature = "tdx")]
2890     #[test]
2891     fn test_hob_memory_resources() {
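        // Each expected entry below is an (address, size, is_ram) triple:
        // `true` marks a piece of guest RAM, `false` a TDVF section, and the
        // result is sorted by address with no overlaps.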
2892         // Case 1: Two TDVF sections in the middle of the RAM
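        // RAM [0, 0x1000_0000) is split around the sections at [0x1000, 0x5000)
        // and [0xc000, 0xd000), leaving RAM pieces of 0x1000, 0x7000 and
        // 0x0fff_3000 bytes, as listed in `expected` below.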
2893         let sections = vec![
2894             TdvfSection {
2895                 address: 0xc000,
2896                 size: 0x1000,
2897                 ..Default::default()
2898             },
2899             TdvfSection {
2900                 address: 0x1000,
2901                 size: 0x4000,
2902                 ..Default::default()
2903             },
2904         ];
2905         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
2906         let expected = vec![
2907             (0, 0x1000, true),
2908             (0x1000, 0x4000, false),
2909             (0x5000, 0x7000, true),
2910             (0xc000, 0x1000, false),
2911             (0xd000, 0x0fff_3000, true),
2912         ];
2913         assert_eq!(
2914             expected,
2915             Vm::hob_memory_resources(
2916                 sections,
2917                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2918             )
2919         );
2920 
2921         // Case 2: Two TDVF sections with no conflict with the RAM
2922         let sections = vec![
2923             TdvfSection {
2924                 address: 0x1000_1000,
2925                 size: 0x1000,
2926                 ..Default::default()
2927             },
2928             TdvfSection {
2929                 address: 0,
2930                 size: 0x1000,
2931                 ..Default::default()
2932             },
2933         ];
2934         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2935         let expected = vec![
2936             (0, 0x1000, false),
2937             (0x1000, 0x1000_0000, true),
2938             (0x1000_1000, 0x1000, false),
2939         ];
2940         assert_eq!(
2941             expected,
2942             Vm::hob_memory_resources(
2943                 sections,
2944                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2945             )
2946         );
2947 
2948         // Case 3: Two TDVF sections with partial conflicts with the RAM
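        // The RAM range [0x1000, 0x1000_1000) is trimmed at both ends by the
        // overlapping sections, leaving a single RAM piece [0x2000, 0x1000_0000)
        // of 0x0fff_e000 bytes.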
2949         let sections = vec![
2950             TdvfSection {
2951                 address: 0x1000_0000,
2952                 size: 0x2000,
2953                 ..Default::default()
2954             },
2955             TdvfSection {
2956                 address: 0,
2957                 size: 0x2000,
2958                 ..Default::default()
2959             },
2960         ];
2961         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2962         let expected = vec![
2963             (0, 0x2000, false),
2964             (0x2000, 0x0fff_e000, true),
2965             (0x1000_0000, 0x2000, false),
2966         ];
2967         assert_eq!(
2968             expected,
2969             Vm::hob_memory_resources(
2970                 sections,
2971                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2972             )
2973         );
2974 
2975         // Case 4: Two TDVF sections with no conflict before the RAM and two
2976         // additional sections with no conflict after the RAM.
2977         let sections = vec![
2978             TdvfSection {
2979                 address: 0x2000_1000,
2980                 size: 0x1000,
2981                 ..Default::default()
2982             },
2983             TdvfSection {
2984                 address: 0x2000_0000,
2985                 size: 0x1000,
2986                 ..Default::default()
2987             },
2988             TdvfSection {
2989                 address: 0x1000,
2990                 size: 0x1000,
2991                 ..Default::default()
2992             },
2993             TdvfSection {
2994                 address: 0,
2995                 size: 0x1000,
2996                 ..Default::default()
2997             },
2998         ];
2999         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
3000         let expected = vec![
3001             (0, 0x1000, false),
3002             (0x1000, 0x1000, false),
3003             (0x4000, 0x1000_0000, true),
3004             (0x2000_0000, 0x1000, false),
3005             (0x2000_1000, 0x1000, false),
3006         ];
3007         assert_eq!(
3008             expected,
3009             Vm::hob_memory_resources(
3010                 sections,
3011                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3012             )
3013         );
3014 
3015         // Case 5: One TDVF section covering the entire RAM
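        // The RAM range [0x1000, 0x1000_1000) is fully contained in the
        // section, so no RAM entry survives in the output.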
3016         let sections = vec![TdvfSection {
3017             address: 0,
3018             size: 0x2000_0000,
3019             ..Default::default()
3020         }];
3021         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3022         let expected = vec![(0, 0x2000_0000, false)];
3023         assert_eq!(
3024             expected,
3025             Vm::hob_memory_resources(
3026                 sections,
3027                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3028             )
3029         );
3030 
3031         // Case 6: Two TDVF sections with no conflict with two RAM regions
3032         let sections = vec![
3033             TdvfSection {
3034                 address: 0x1000_2000,
3035                 size: 0x2000,
3036                 ..Default::default()
3037             },
3038             TdvfSection {
3039                 address: 0,
3040                 size: 0x2000,
3041                 ..Default::default()
3042             },
3043         ];
3044         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3045             (GuestAddress(0x2000), 0x1000_0000),
3046             (GuestAddress(0x1000_4000), 0x1000_0000),
3047         ];
3048         let expected = vec![
3049             (0, 0x2000, false),
3050             (0x2000, 0x1000_0000, true),
3051             (0x1000_2000, 0x2000, false),
3052             (0x1000_4000, 0x1000_0000, true),
3053         ];
3054         assert_eq!(
3055             expected,
3056             Vm::hob_memory_resources(
3057                 sections,
3058                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3059             )
3060         );
3061 
3062         // Case 7: Two TDVF sections with partial conflicts with two RAM regions
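        // The sections trim the RAM regions at their overlaps:
        // [0x1000, 0x1000_1000) becomes [0x4000, 0x1000_0000) (0x0fff_c000
        // bytes) and [0x1000_3000, 0x2000_3000) becomes
        // [0x1000_4000, 0x2000_3000) (0x0fff_f000 bytes).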
3063         let sections = vec![
3064             TdvfSection {
3065                 address: 0x1000_0000,
3066                 size: 0x4000,
3067                 ..Default::default()
3068             },
3069             TdvfSection {
3070                 address: 0,
3071                 size: 0x4000,
3072                 ..Default::default()
3073             },
3074         ];
3075         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3076             (GuestAddress(0x1000), 0x1000_0000),
3077             (GuestAddress(0x1000_3000), 0x1000_0000),
3078         ];
3079         let expected = vec![
3080             (0, 0x4000, false),
3081             (0x4000, 0x0fff_c000, true),
3082             (0x1000_0000, 0x4000, false),
3083             (0x1000_4000, 0x0fff_f000, true),
3084         ];
3085         assert_eq!(
3086             expected,
3087             Vm::hob_memory_resources(
3088                 sections,
3089                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3090             )
3091         );
3092     }
3093 }
3094 
3095 #[cfg(target_arch = "aarch64")]
3096 #[cfg(test)]
3097 mod tests {
3098     use super::*;
3099     use crate::GuestMemoryMmap;
3100     use arch::aarch64::fdt::create_fdt;
3101     use arch::aarch64::layout;
3102     use arch::{DeviceType, MmioDeviceInfo};
3103     use devices::gic::Gic;
3104 
3105     const LEN: u64 = 4096;
3106 
3107     #[test]
3108     fn test_create_fdt_with_devices() {
3109         let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
3110         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
3111 
3112         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
3113             (
3114                 (DeviceType::Serial, DeviceType::Serial.to_string()),
3115                 MmioDeviceInfo {
3116                     addr: 0x00,
3117                     len: LEN,
3118                     irq: 33,
3119                 },
3120             ),
3121             (
3122                 (DeviceType::Virtio(1), "virtio".to_string()),
3123                 MmioDeviceInfo {
3124                     addr: LEN,
3125                     len: LEN,
3126                     irq: 34,
3127                 },
3128             ),
3129             (
3130                 (DeviceType::Rtc, "rtc".to_string()),
3131                 MmioDeviceInfo {
3132                     addr: 2 * LEN,
3133                     len: LEN,
3134                     irq: 35,
3135                 },
3136             ),
3137         ]
3138         .iter()
3139         .cloned()
3140         .collect();
3141 
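        // Building the FDT requires a live vGIC, which in turn needs an actual
        // VM from the hypervisor crate, so one is created here before calling
        // create_fdt with the serial, virtio and RTC MMIO devices declared
        // above.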
3142         let hv = hypervisor::new().unwrap();
3143         let vm = hv.create_vm().unwrap();
3144         let gic = vm
3145             .create_vgic(Gic::create_default_config(1))
3146             .expect("Cannot create gic");
3147         assert!(create_fdt(
3148             &mem,
3149             "console=tty0",
3150             vec![0],
3151             Some((0, 0, 0)),
3152             &dev_info,
3153             &gic,
3154             &None,
3155             &Vec::new(),
3156             &BTreeMap::new(),
3157             None,
3158             true,
3159         )
3160         .is_ok())
3161     }
3162 }
3163 
3164 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
3165 #[test]
3166 pub fn test_vm() {
3167     use hypervisor::VmExit;
3168     use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
3169     // This example is based on https://lwn.net/Articles/658511/
3170     let code = [
3171         0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
3172         0x00, 0xd8, /* add %bl, %al */
3173         0x04, b'0', /* add $'0', %al */
3174         0xee, /* out %al, (%dx) */
3175         0xb0, b'\n', /* mov $'\n', %al */
3176         0xee,  /* out %al, (%dx) */
3177         0xf4,  /* hlt */
3178     ];
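
    // With the register values programmed further down (al = 2, bl = 3), the
    // guest adds them, adds '0', and writes "5\n" to I/O port 0x3f8 before
    // halting.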
3179 
3180     let mem_size = 0x1000;
3181     let load_addr = GuestAddress(0x1000);
3182     let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();
3183 
3184     let hv = hypervisor::new().unwrap();
3185     let vm = hv.create_vm().expect("new VM creation failed");
3186 
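    // Register every guest memory region with the hypervisor, then copy the
    // guest code into that memory at load_addr.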
3187     for (index, region) in mem.iter().enumerate() {
3188         let mem_region = vm.make_user_memory_region(
3189             index as u32,
3190             region.start_addr().raw_value(),
3191             region.len(),
3192             region.as_ptr() as u64,
3193             false,
3194             false,
3195         );
3196 
3197         vm.create_user_memory_region(mem_region)
3198             .expect("Cannot configure guest memory");
3199     }
3200     mem.write_slice(&code, load_addr)
3201         .expect("Writing code to memory failed");
3202 
3203     let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");
3204 
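    // Real-mode setup: a flat code segment (base and selector 0) with RIP at
    // 0x1000, where the code was loaded. Bit 1 of RFLAGS is reserved and must
    // remain set, hence rflags = 2.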
3205     let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
3206     vcpu_sregs.cs.base = 0;
3207     vcpu_sregs.cs.selector = 0;
3208     vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");
3209 
3210     let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
3211     vcpu_regs.rip = 0x1000;
3212     vcpu_regs.rax = 2;
3213     vcpu_regs.rbx = 3;
3214     vcpu_regs.rflags = 2;
3215     vcpu.set_regs(&vcpu_regs).expect("set regs failed");
3216 
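    // Expect two IoOut exits on port 0x3f8 (one per `out` instruction),
    // followed by an exit for the final `hlt`, which is surfaced here as
    // VmExit::Reset and ends the loop.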
3217     loop {
3218         match vcpu.run().expect("run failed") {
3219             VmExit::IoOut(addr, data) => {
3220                 println!(
3221                     "IO out -- addr: {:#x} data [{:?}]",
3222                     addr,
3223                     str::from_utf8(data).unwrap()
3224                 );
3225             }
3226             VmExit::Reset => {
3227                 println!("HLT");
3228                 break;
3229             }
3230             r => panic!("unexpected exit reason: {r:?}"),
3231         }
3232     }
3233 }
3234