xref: /cloud-hypervisor/vmm/src/vm.rs (revision 07d1208dd53a207a65b649b8952780dfd0ca59d9)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
20 use crate::coredump::{
21     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
22 };
23 use crate::cpu;
24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair};
25 use crate::device_tree::DeviceTree;
26 #[cfg(feature = "guest_debug")]
27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
28 use crate::memory_manager::{
29     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
30 };
31 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
32 use crate::migration::get_vm_snapshot;
33 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
34 use crate::migration::url_to_file;
35 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
36 use crate::GuestMemoryMmap;
37 use crate::{
38     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
39 };
40 use anyhow::anyhow;
41 use arch::get_host_cpu_phys_bits;
42 #[cfg(target_arch = "x86_64")]
43 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
44 #[cfg(feature = "tdx")]
45 use arch::x86_64::tdx::TdvfSection;
46 use arch::EntryPoint;
47 #[cfg(target_arch = "aarch64")]
48 use arch::PciSpaceInfo;
49 use arch::{NumaNode, NumaNodes};
50 #[cfg(target_arch = "aarch64")]
51 use devices::interrupt_controller;
52 use devices::AcpiNotificationFlags;
53 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
54 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
55 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
56 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
57 use hypervisor::{HypervisorVmError, VmOps};
58 use libc::{termios, SIGWINCH};
59 use linux_loader::cmdline::Cmdline;
60 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
61 use linux_loader::elf;
62 #[cfg(target_arch = "x86_64")]
63 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
64 #[cfg(target_arch = "aarch64")]
65 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
66 use linux_loader::loader::KernelLoader;
67 use seccompiler::SeccompAction;
68 use serde::{Deserialize, Serialize};
69 use std::cmp;
70 use std::collections::BTreeMap;
71 use std::collections::HashMap;
72 use std::convert::TryInto;
73 use std::fs::{File, OpenOptions};
74 use std::io::{self, Seek, SeekFrom, Write};
75 #[cfg(feature = "tdx")]
76 use std::mem;
77 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
78 use std::mem::size_of;
79 use std::num::Wrapping;
80 use std::ops::Deref;
81 use std::os::unix::net::UnixStream;
82 use std::sync::{Arc, Mutex, RwLock};
83 use std::time::Instant;
84 use std::{result, str, thread};
85 use thiserror::Error;
86 use tracer::trace_scoped;
87 use vm_device::Bus;
88 #[cfg(feature = "tdx")]
89 use vm_memory::{Address, ByteValued, GuestMemory, GuestMemoryRegion};
90 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic};
91 use vm_migration::protocol::{Request, Response, Status};
92 use vm_migration::{
93     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
94     SnapshotData, Snapshottable, Transportable,
95 };
96 use vmm_sys_util::eventfd::EventFd;
97 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
98 
99 /// Errors associated with VM management
100 #[derive(Debug, Error)]
101 pub enum Error {
102     #[error("Cannot open kernel file: {0}")]
103     KernelFile(#[source] io::Error),
104 
105     #[error("Cannot open initramfs file: {0}")]
106     InitramfsFile(#[source] io::Error),
107 
108     #[error("Cannot load the kernel into memory: {0}")]
109     KernelLoad(#[source] linux_loader::loader::Error),
110 
111     #[cfg(target_arch = "aarch64")]
112     #[error("Cannot load the UEFI binary into memory: {0:?}")]
113     UefiLoad(arch::aarch64::uefi::Error),
114 
115     #[error("Cannot load the initramfs into memory")]
116     InitramfsLoad,
117 
118     #[error("Cannot load the kernel command line into memory: {0}")]
119     LoadCmdLine(#[source] linux_loader::loader::Error),
120 
121     #[error("Cannot modify the kernel command line: {0}")]
122     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
123 
124     #[error("Cannot create the kernel command line: {0}")]
125     CmdLineCreate(#[source] linux_loader::cmdline::Error),
126 
127     #[error("Cannot configure system: {0}")]
128     ConfigureSystem(#[source] arch::Error),
129 
130     #[cfg(target_arch = "aarch64")]
131     #[error("Cannot enable interrupt controller: {0:?}")]
132     EnableInterruptController(interrupt_controller::Error),
133 
134     #[error("VM state is poisoned")]
135     PoisonedState,
136 
137     #[error("Error from device manager: {0:?}")]
138     DeviceManager(DeviceManagerError),
139 
140     #[error("No device with id {0:?} to remove")]
141     NoDeviceToRemove(String),
142 
143     #[error("Cannot spawn a signal handler thread: {0}")]
144     SignalHandlerSpawn(#[source] io::Error),
145 
146     #[error("Failed to join on threads: {0:?}")]
147     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
148 
149     #[error("VM config is missing")]
150     VmMissingConfig,
151 
152     #[error("VM is not created")]
153     VmNotCreated,
154 
155     #[error("VM is already created")]
156     VmAlreadyCreated,
157 
158     #[error("VM is not running")]
159     VmNotRunning,
160 
161     #[error("Cannot clone EventFd: {0}")]
162     EventFdClone(#[source] io::Error),
163 
164     #[error("Invalid VM state transition: {0:?} to {1:?}")]
165     InvalidStateTransition(VmState, VmState),
166 
167     #[error("Error from CPU manager: {0}")]
168     CpuManager(#[source] cpu::Error),
169 
170     #[error("Cannot pause devices: {0}")]
171     PauseDevices(#[source] MigratableError),
172 
173     #[error("Cannot resume devices: {0}")]
174     ResumeDevices(#[source] MigratableError),
175 
176     #[error("Cannot pause CPUs: {0}")]
177     PauseCpus(#[source] MigratableError),
178 
179     #[error("Cannot resume CPUs: {0}")]
180     ResumeCpus(#[source] MigratableError),
181 
182     #[error("Cannot pause VM: {0}")]
183     Pause(#[source] MigratableError),
184 
185     #[error("Cannot resume VM: {0}")]
186     Resume(#[source] MigratableError),
187 
188     #[error("Memory manager error: {0:?}")]
189     MemoryManager(MemoryManagerError),
190 
191     #[error("Eventfd write error: {0}")]
192     EventfdError(#[source] std::io::Error),
193 
194     #[error("Cannot snapshot VM: {0}")]
195     Snapshot(#[source] MigratableError),
196 
197     #[error("Cannot restore VM: {0}")]
198     Restore(#[source] MigratableError),
199 
200     #[error("Cannot send VM snapshot: {0}")]
201     SnapshotSend(#[source] MigratableError),
202 
203     #[error("Invalid restore source URL")]
204     InvalidRestoreSourceUrl,
205 
206     #[error("Failed to validate config: {0}")]
207     ConfigValidation(#[source] ValidationError),
208 
209     #[error("Too many virtio-vsock devices")]
210     TooManyVsockDevices,
211 
212     #[error("Failed serializing into JSON: {0}")]
213     SerializeJson(#[source] serde_json::Error),
214 
215     #[error("Invalid NUMA configuration")]
216     InvalidNumaConfig,
217 
218     #[error("Cannot create seccomp filter: {0}")]
219     CreateSeccompFilter(#[source] seccompiler::Error),
220 
221     #[error("Cannot apply seccomp filter: {0}")]
222     ApplySeccompFilter(#[source] seccompiler::Error),
223 
224     #[error("Failed resizing a memory zone")]
225     ResizeZone,
226 
227     #[error("Cannot activate virtio devices: {0:?}")]
228     ActivateVirtioDevices(DeviceManagerError),
229 
230     #[error("Error triggering power button: {0:?}")]
231     PowerButton(DeviceManagerError),
232 
233     #[error("Kernel lacks PVH header")]
234     KernelMissingPvhHeader,
235 
236     #[error("Failed to allocate firmware RAM: {0:?}")]
237     AllocateFirmwareMemory(MemoryManagerError),
238 
239     #[error("Error manipulating firmware file: {0}")]
240     FirmwareFile(#[source] std::io::Error),
241 
242     #[error("Firmware too big")]
243     FirmwareTooLarge,
244 
245     #[error("Failed to copy firmware to memory: {0}")]
246     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
247 
248     #[cfg(feature = "tdx")]
249     #[error("Error performing I/O on TDX firmware file: {0}")]
250     LoadTdvf(#[source] std::io::Error),
251 
252     #[cfg(feature = "tdx")]
253     #[error("Error performing I/O on the TDX payload file: {0}")]
254     LoadPayload(#[source] std::io::Error),
255 
256     #[cfg(feature = "tdx")]
257     #[error("Error parsing TDVF: {0}")]
258     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
259 
260     #[cfg(feature = "tdx")]
261     #[error("Error populating TDX HOB: {0}")]
262     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
263 
264     #[cfg(feature = "tdx")]
265     #[error("Error allocating TDVF memory: {0:?}")]
266     AllocatingTdvfMemory(crate::memory_manager::Error),
267 
268     #[cfg(feature = "tdx")]
269     #[error("Error enabling TDX VM: {0}")]
270     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
271 
272     #[cfg(feature = "tdx")]
273     #[error("Error enabling TDX memory region: {0}")]
274     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
275 
276     #[cfg(feature = "tdx")]
277     #[error("Error finalizing TDX VM: {0}")]
278     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
279 
280     #[cfg(feature = "tdx")]
281     #[error("TDX firmware missing")]
282     TdxFirmwareMissing,
283 
284     #[cfg(feature = "tdx")]
285     #[error("Invalid TDX payload type")]
286     InvalidPayloadType,
287 
288     #[cfg(feature = "guest_debug")]
289     #[error("Error debugging VM: {0:?}")]
290     Debug(DebuggableError),
291 
292     #[error("Error spawning kernel loading thread")]
293     KernelLoadThreadSpawn(std::io::Error),
294 
295     #[error("Error joining kernel loading thread")]
296     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
297 
298     #[error("Payload configuration is not bootable")]
299     InvalidPayload,
300 
301     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
302     #[error("Error coredumping VM: {0:?}")]
303     Coredump(GuestDebuggableError),
304 }
305 pub type Result<T> = result::Result<T, Error>;
306 
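/// Lifecycle state of a [`Vm`]; allowed transitions between states are
/// checked by `VmState::valid_transition()` below.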
307 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
308 pub enum VmState {
309     Created,
310     Running,
311     Shutdown,
312     Paused,
313     BreakPoint,
314 }
315 
316 impl VmState {
317     fn valid_transition(self, new_state: VmState) -> Result<()> {
318         match self {
319             VmState::Created => match new_state {
320                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
321                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
322                     Ok(())
323                 }
324             },
325 
326             VmState::Running => match new_state {
327                 VmState::Created | VmState::Running => {
328                     Err(Error::InvalidStateTransition(self, new_state))
329                 }
330                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
331             },
332 
333             VmState::Shutdown => match new_state {
334                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
335                     Err(Error::InvalidStateTransition(self, new_state))
336                 }
337                 VmState::Running => Ok(()),
338             },
339 
340             VmState::Paused => match new_state {
341                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
342                     Err(Error::InvalidStateTransition(self, new_state))
343                 }
344                 VmState::Running | VmState::Shutdown => Ok(()),
345             },
346             VmState::BreakPoint => match new_state {
347                 VmState::Created | VmState::Running => Ok(()),
348                 _ => Err(Error::InvalidStateTransition(self, new_state)),
349             },
350         }
351     }
352 }
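
// A minimal sketch (not part of the original file) exercising the state
// machine above; the module and test names are illustrative only.
#[cfg(test)]
mod vm_state_transition_sketch {
    use super::*;

    #[test]
    fn created_can_start_but_not_be_recreated() {
        assert!(VmState::Created.valid_transition(VmState::Running).is_ok());
        assert!(VmState::Created.valid_transition(VmState::Created).is_err());
    }

    #[test]
    fn shutdown_only_transitions_back_to_running() {
        assert!(VmState::Shutdown.valid_transition(VmState::Running).is_ok());
        assert!(VmState::Shutdown.valid_transition(VmState::Paused).is_err());
    }
}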
353 
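/// Routes the hypervisor's `VmOps` callbacks (guest memory accesses, MMIO
/// and, on x86_64, PIO) to the guest memory and the VMM's device buses.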
354 struct VmOpsHandler {
355     memory: GuestMemoryAtomic<GuestMemoryMmap>,
356     #[cfg(target_arch = "x86_64")]
357     io_bus: Arc<Bus>,
358     mmio_bus: Arc<Bus>,
359 }
360 
361 impl VmOps for VmOpsHandler {
362     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
363         self.memory
364             .memory()
365             .write(buf, GuestAddress(gpa))
366             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
367     }
368 
369     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
370         self.memory
371             .memory()
372             .read(buf, GuestAddress(gpa))
373             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
374     }
375 
376     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
377         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
378             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
379         }
380         Ok(())
381     }
382 
383     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
384         match self.mmio_bus.write(gpa, data) {
385             Err(vm_device::BusError::MissingAddressRange) => {
386                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
387             }
388             Ok(Some(barrier)) => {
389                 info!("Waiting for barrier");
390                 barrier.wait();
391                 info!("Barrier released");
392             }
393             _ => {}
394         };
395         Ok(())
396     }
397 
398     #[cfg(target_arch = "x86_64")]
399     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
400         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
401             info!("Guest PIO read to unregistered address 0x{:x}", port);
402         }
403         Ok(())
404     }
405 
406     #[cfg(target_arch = "x86_64")]
407     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
408         match self.io_bus.write(port, data) {
409             Err(vm_device::BusError::MissingAddressRange) => {
410                 info!("Guest PIO write to unregistered address 0x{:x}", port);
411             }
412             Ok(Some(barrier)) => {
413                 info!("Waiting for barrier");
414                 barrier.wait();
415                 info!("Barrier released");
416             }
417             _ => {}
418         };
419         Ok(())
420     }
421 }
422 
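/// Returns the number of guest physical address bits to expose: the smaller
/// of the host CPU's physical address width and the configured maximum.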
423 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
424     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
425 
426     cmp::min(host_phys_bits, max_phys_bits)
427 }
428 
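/// A single guest virtual machine: its configuration, the CPU, memory and
/// device managers, the underlying hypervisor VM handle, and the current
/// lifecycle state.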
429 pub struct Vm {
430     #[cfg(feature = "tdx")]
431     kernel: Option<File>,
432     initramfs: Option<File>,
433     threads: Vec<thread::JoinHandle<()>>,
434     device_manager: Arc<Mutex<DeviceManager>>,
435     config: Arc<Mutex<VmConfig>>,
436     state: RwLock<VmState>,
437     cpu_manager: Arc<Mutex<cpu::CpuManager>>,
438     memory_manager: Arc<Mutex<MemoryManager>>,
439     #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
440     // The hypervisor-abstracted virtual machine.
441     vm: Arc<dyn hypervisor::Vm>,
442     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
443     saved_clock: Option<hypervisor::ClockData>,
444     numa_nodes: NumaNodes,
445     #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
446     hypervisor: Arc<dyn hypervisor::Hypervisor>,
447     stop_on_boot: bool,
448     load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
449 }
450 
451 impl Vm {
452     pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
453 
454     #[allow(clippy::too_many_arguments)]
455     pub fn new_from_memory_manager(
456         config: Arc<Mutex<VmConfig>>,
457         memory_manager: Arc<Mutex<MemoryManager>>,
458         vm: Arc<dyn hypervisor::Vm>,
459         exit_evt: EventFd,
460         reset_evt: EventFd,
461         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
462         seccomp_action: &SeccompAction,
463         hypervisor: Arc<dyn hypervisor::Hypervisor>,
464         activate_evt: EventFd,
465         timestamp: Instant,
466         serial_pty: Option<PtyPair>,
467         console_pty: Option<PtyPair>,
468         console_resize_pipe: Option<File>,
469         original_termios: Arc<Mutex<Option<termios>>>,
470         snapshot: Option<Snapshot>,
471     ) -> Result<Self> {
472         trace_scoped!("Vm::new_from_memory_manager");
473 
474         let boot_id_list = config
475             .lock()
476             .unwrap()
477             .validate()
478             .map_err(Error::ConfigValidation)?;
479 
480         let load_payload_handle = if snapshot.is_none() {
481             Self::load_payload_async(&memory_manager, &config)?
482         } else {
483             None
484         };
485 
486         info!("Booting VM from config: {:?}", &config);
487 
488         // Create NUMA nodes based on NumaConfig.
489         let numa_nodes =
490             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
491 
492         #[cfg(feature = "tdx")]
493         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
494         #[cfg(feature = "tdx")]
495         let force_iommu = tdx_enabled;
496         #[cfg(not(feature = "tdx"))]
497         let force_iommu = false;
498 
499         #[cfg(feature = "guest_debug")]
500         let stop_on_boot = config.lock().unwrap().gdb;
501         #[cfg(not(feature = "guest_debug"))]
502         let stop_on_boot = false;
503 
504         let memory = memory_manager.lock().unwrap().guest_memory();
505         #[cfg(target_arch = "x86_64")]
506         let io_bus = Arc::new(Bus::new());
507         let mmio_bus = Arc::new(Bus::new());
508 
509         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
510             memory,
511             #[cfg(target_arch = "x86_64")]
512             io_bus: io_bus.clone(),
513             mmio_bus: mmio_bus.clone(),
514         });
515 
516         let cpus_config = { &config.lock().unwrap().cpus.clone() };
517         let cpu_manager = cpu::CpuManager::new(
518             cpus_config,
519             vm.clone(),
520             exit_evt.try_clone().map_err(Error::EventFdClone)?,
521             reset_evt.try_clone().map_err(Error::EventFdClone)?,
522             #[cfg(feature = "guest_debug")]
523             vm_debug_evt,
524             &hypervisor,
525             seccomp_action.clone(),
526             vm_ops,
527             #[cfg(feature = "tdx")]
528             tdx_enabled,
529             &numa_nodes,
530         )
531         .map_err(Error::CpuManager)?;
532 
533         #[cfg(target_arch = "x86_64")]
534         cpu_manager
535             .lock()
536             .unwrap()
537             .populate_cpuid(
538                 &memory_manager,
539                 &hypervisor,
540                 #[cfg(feature = "tdx")]
541                 tdx_enabled,
542             )
543             .map_err(Error::CpuManager)?;
544 
545         // The initial TDX configuration must be done before the vCPUs are
546         // created
547         #[cfg(feature = "tdx")]
548         if tdx_enabled {
549             let cpuid = cpu_manager.lock().unwrap().common_cpuid();
550             let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
551             vm.tdx_init(&cpuid, max_vcpus)
552                 .map_err(Error::InitializeTdxVm)?;
553         }
554 
555         cpu_manager
556             .lock()
557             .unwrap()
558             .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
559             .map_err(Error::CpuManager)?;
560 
561         #[cfg(feature = "tdx")]
562         let dynamic = !tdx_enabled;
563         #[cfg(not(feature = "tdx"))]
564         let dynamic = true;
565 
566         let device_manager = DeviceManager::new(
567             #[cfg(target_arch = "x86_64")]
568             io_bus,
569             mmio_bus,
570             hypervisor.hypervisor_type(),
571             vm.clone(),
572             config.clone(),
573             memory_manager.clone(),
574             cpu_manager.clone(),
575             exit_evt.try_clone().map_err(Error::EventFdClone)?,
576             reset_evt,
577             seccomp_action.clone(),
578             numa_nodes.clone(),
579             &activate_evt,
580             force_iommu,
581             boot_id_list,
582             timestamp,
583             snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
584             dynamic,
585         )
586         .map_err(Error::DeviceManager)?;
587 
588         device_manager
589             .lock()
590             .unwrap()
591             .create_devices(
592                 serial_pty,
593                 console_pty,
594                 console_resize_pipe,
595                 original_termios,
596             )
597             .map_err(Error::DeviceManager)?;
598 
599         #[cfg(feature = "tdx")]
600         let kernel = config
601             .lock()
602             .unwrap()
603             .payload
604             .as_ref()
605             .map(|p| p.kernel.as_ref().map(File::open))
606             .unwrap_or_default()
607             .transpose()
608             .map_err(Error::KernelFile)?;
609 
610         let initramfs = config
611             .lock()
612             .unwrap()
613             .payload
614             .as_ref()
615             .map(|p| p.initramfs.as_ref().map(File::open))
616             .unwrap_or_default()
617             .transpose()
618             .map_err(Error::InitramfsFile)?;
619 
620         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
621         let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
622             let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
623             vm_snapshot.clock
624         } else {
625             None
626         };
627 
628         let vm_state = if snapshot.is_some() {
629             VmState::Paused
630         } else {
631             VmState::Created
632         };
633 
634         Ok(Vm {
635             #[cfg(feature = "tdx")]
636             kernel,
637             initramfs,
638             device_manager,
639             config,
640             threads: Vec::with_capacity(1),
641             state: RwLock::new(vm_state),
642             cpu_manager,
643             memory_manager,
644             vm,
645             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
646             saved_clock,
647             numa_nodes,
648             hypervisor,
649             stop_on_boot,
650             load_payload_handle,
651         })
652     }
653 
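    /// Builds the guest NUMA nodes from the optional `NumaConfig` list,
    /// validating that node IDs are unique and that referenced memory zones,
    /// distance destinations and SGX EPC sections actually exist.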
654     fn create_numa_nodes(
655         configs: Option<Vec<NumaConfig>>,
656         memory_manager: &Arc<Mutex<MemoryManager>>,
657     ) -> Result<NumaNodes> {
658         let mm = memory_manager.lock().unwrap();
659         let mm_zones = mm.memory_zones();
660         let mut numa_nodes = BTreeMap::new();
661 
662         if let Some(configs) = &configs {
663             for config in configs.iter() {
664                 if numa_nodes.contains_key(&config.guest_numa_id) {
665                     error!("Cannot define the same NUMA node twice");
666                     return Err(Error::InvalidNumaConfig);
667                 }
668 
669                 let mut node = NumaNode::default();
670 
671                 if let Some(memory_zones) = &config.memory_zones {
672                     for memory_zone in memory_zones.iter() {
673                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
674                             node.memory_regions.extend(mm_zone.regions().clone());
675                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
676                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
677                             }
678                             node.memory_zones.push(memory_zone.clone());
679                         } else {
680                             error!("Unknown memory zone '{}'", memory_zone);
681                             return Err(Error::InvalidNumaConfig);
682                         }
683                     }
684                 }
685 
686                 if let Some(cpus) = &config.cpus {
687                     node.cpus.extend(cpus);
688                 }
689 
690                 if let Some(distances) = &config.distances {
691                     for distance in distances.iter() {
692                         let dest = distance.destination;
693                         let dist = distance.distance;
694 
695                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
696                             error!("Unknown destination NUMA node {}", dest);
697                             return Err(Error::InvalidNumaConfig);
698                         }
699 
700                         if node.distances.contains_key(&dest) {
701                             error!("Distance to destination NUMA node {} is already set", dest);
702                             return Err(Error::InvalidNumaConfig);
703                         }
704 
705                         node.distances.insert(dest, dist);
706                     }
707                 }
708 
709                 #[cfg(target_arch = "x86_64")]
710                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
711                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
712                         let mm_sections = sgx_epc_region.epc_sections();
713                         for sgx_epc_section in sgx_epc_sections.iter() {
714                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
715                                 node.sgx_epc_sections.push(mm_section.clone());
716                             } else {
717                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
718                                 return Err(Error::InvalidNumaConfig);
719                             }
720                         }
721                     } else {
722                         error!("Missing SGX EPC region");
723                         return Err(Error::InvalidNumaConfig);
724                     }
725                 }
726 
727                 numa_nodes.insert(config.guest_numa_id, node);
728             }
729         }
730 
731         Ok(numa_nodes)
732     }
733 
734     #[allow(clippy::too_many_arguments)]
735     pub fn new(
736         vm_config: Arc<Mutex<VmConfig>>,
737         exit_evt: EventFd,
738         reset_evt: EventFd,
739         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
740         seccomp_action: &SeccompAction,
741         hypervisor: Arc<dyn hypervisor::Hypervisor>,
742         activate_evt: EventFd,
743         serial_pty: Option<PtyPair>,
744         console_pty: Option<PtyPair>,
745         console_resize_pipe: Option<File>,
746         original_termios: Arc<Mutex<Option<termios>>>,
747         snapshot: Option<Snapshot>,
748         source_url: Option<&str>,
749         prefault: Option<bool>,
750     ) -> Result<Self> {
751         trace_scoped!("Vm::new");
752 
753         let timestamp = Instant::now();
754 
755         #[cfg(feature = "tdx")]
756         let tdx_enabled = if snapshot.is_some() {
757             false
758         } else {
759             vm_config.lock().unwrap().is_tdx_enabled()
760         };
761 
762         #[cfg(feature = "sev_snp")]
763         let sev_snp_enabled = if snapshot.is_some() {
764             false
765         } else {
766             vm_config.lock().unwrap().is_sev_snp_enabled()
767         };
768 
769         let vm = Self::create_hypervisor_vm(
770             &hypervisor,
771             #[cfg(feature = "tdx")]
772             tdx_enabled,
773             #[cfg(feature = "sev_snp")]
774             sev_snp_enabled,
775         )?;
776 
777         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
778 
779         let memory_manager = if let Some(snapshot) =
780             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
781         {
782             MemoryManager::new_from_snapshot(
783                 &snapshot,
784                 vm.clone(),
785                 &vm_config.lock().unwrap().memory.clone(),
786                 source_url,
787                 prefault.unwrap(),
788                 phys_bits,
789             )
790             .map_err(Error::MemoryManager)?
791         } else {
792             #[cfg(target_arch = "x86_64")]
793             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
794 
795             MemoryManager::new(
796                 vm.clone(),
797                 &vm_config.lock().unwrap().memory.clone(),
798                 None,
799                 phys_bits,
800                 #[cfg(feature = "tdx")]
801                 tdx_enabled,
802                 None,
803                 None,
804                 #[cfg(target_arch = "x86_64")]
805                 sgx_epc_config,
806             )
807             .map_err(Error::MemoryManager)?
808         };
809 
810         Vm::new_from_memory_manager(
811             vm_config,
812             memory_manager,
813             vm,
814             exit_evt,
815             reset_evt,
816             #[cfg(feature = "guest_debug")]
817             vm_debug_evt,
818             seccomp_action,
819             hypervisor,
820             activate_evt,
821             timestamp,
822             serial_pty,
823             console_pty,
824             console_resize_pipe,
825             original_termios,
826             snapshot,
827         )
828     }
829 
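    /// Creates the hypervisor VM object, selecting the VM type when TDX or
    /// SEV-SNP support is compiled in, and applies the x86_64-specific setup
    /// (identity map, TSS address, split IRQ chip).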
830     pub fn create_hypervisor_vm(
831         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
832         #[cfg(feature = "tdx")] tdx_enabled: bool,
833         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
834     ) -> Result<Arc<dyn hypervisor::Vm>> {
835         hypervisor.check_required_extensions().unwrap();
836 
837         cfg_if::cfg_if! {
838             if #[cfg(feature = "tdx")] {
839                 let vm = hypervisor
840                     .create_vm_with_type(if tdx_enabled {
841                         1 // KVM_X86_TDX_VM
842                     } else {
843                         0 // KVM_X86_LEGACY_VM
844                     })
845                     .unwrap();
846             } else if #[cfg(feature = "sev_snp")] {
847                 let vm = hypervisor
848                     .create_vm_with_type(if sev_snp_enabled {
849                         1 // SEV_SNP_ENABLED
850                     } else {
851                         0 // SEV_SNP_DISABLED
852                     })
853                     .unwrap();
854             } else {
855                 let vm = hypervisor.create_vm().unwrap();
856             }
857         }
858 
859         #[cfg(target_arch = "x86_64")]
860         {
861             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
862                 .unwrap();
863             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
864             vm.enable_split_irq().unwrap();
865         }
866 
867         Ok(vm)
868     }
869 
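    /// Copies the initramfs file into guest memory and returns its load
    /// address and size.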
870     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
871         let mut initramfs = self.initramfs.as_ref().unwrap();
872         let size: usize = initramfs
873             .seek(SeekFrom::End(0))
874             .map_err(|_| Error::InitramfsLoad)?
875             .try_into()
876             .unwrap();
877         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
878 
879         let address =
880             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
881         let address = GuestAddress(address);
882 
883         guest_mem
884             .read_from(address, &mut initramfs, size)
885             .map_err(|_| Error::InitramfsLoad)?;
886 
887         info!("Initramfs loaded: address = 0x{:x}", address.0);
888         Ok(arch::InitramfsConfig { address, size })
889     }
890 
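    /// Builds the kernel command line from the payload configuration, and on
    /// aarch64 appends the additions supplied by the `DeviceManager`.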
891     pub fn generate_cmdline(
892         payload: &PayloadConfig,
893         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
894     ) -> Result<Cmdline> {
895         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
896         if let Some(s) = payload.cmdline.as_ref() {
897             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
898         }
899 
900         #[cfg(target_arch = "aarch64")]
901         for entry in device_manager.lock().unwrap().cmdline_additions() {
902             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
903         }
904         Ok(cmdline)
905     }
906 
907     #[cfg(target_arch = "aarch64")]
908     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
909         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
910         let mem = uefi_flash.memory();
911         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
912             .map_err(Error::UefiLoad)?;
913         Ok(())
914     }
915 
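    /// Loads the aarch64 payload: a kernel image is loaded as a PE file, with
    /// a fallback to treating it as a UEFI binary; a firmware-only payload is
    /// loaded directly as UEFI. Returns the resulting entry point.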
916     #[cfg(target_arch = "aarch64")]
917     fn load_kernel(
918         firmware: Option<File>,
919         kernel: Option<File>,
920         memory_manager: Arc<Mutex<MemoryManager>>,
921     ) -> Result<EntryPoint> {
922         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
923         let mem = guest_memory.memory();
924         let entry_addr = match (firmware, kernel) {
925             (None, Some(mut kernel)) => {
926                 match linux_loader::loader::pe::PE::load(
927                     mem.deref(),
928                     Some(arch::layout::KERNEL_START),
929                     &mut kernel,
930                     None,
931                 ) {
932                     Ok(entry_addr) => entry_addr.kernel_load,
933                     // Try to load the binary as a kernel PE file first.
934                     // If that fails, retry loading it as a UEFI binary.
935                     // The UEFI binary has no header to validate, so it must be the last option tried.
936                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
937                         Self::load_firmware(&kernel, memory_manager)?;
938                         arch::layout::UEFI_START
939                     }
940                     Err(e) => {
941                         return Err(Error::KernelLoad(e));
942                     }
943                 }
944             }
945             (Some(firmware), None) => {
946                 Self::load_firmware(&firmware, memory_manager)?;
947                 arch::layout::UEFI_START
948             }
949             _ => return Err(Error::InvalidPayload),
950         };
951 
952         Ok(EntryPoint { entry_addr })
953     }
954 
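    /// Loads the ELF image (and optional command line) into guest memory and
    /// returns its PVH entry point; images without a PVH header are rejected
    /// with `KernelMissingPvhHeader`.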
955     #[cfg(target_arch = "x86_64")]
956     fn load_kernel(
957         mut kernel: File,
958         cmdline: Option<Cmdline>,
959         memory_manager: Arc<Mutex<MemoryManager>>,
960     ) -> Result<EntryPoint> {
961         info!("Loading kernel");
962 
963         let mem = {
964             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
965             guest_memory.memory()
966         };
967         let entry_addr = linux_loader::loader::elf::Elf::load(
968             mem.deref(),
969             None,
970             &mut kernel,
971             Some(arch::layout::HIGH_RAM_START),
972         )
973         .map_err(Error::KernelLoad)?;
974 
975         if let Some(cmdline) = cmdline {
976             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
977                 .map_err(Error::LoadCmdLine)?;
978         }
979 
980         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
981             // Use the PVH kernel entry point to boot the guest
982             info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
983             Ok(EntryPoint {
984                 entry_addr: Some(entry_addr),
985             })
986         } else {
987             Err(Error::KernelMissingPvhHeader)
988         }
989     }
990 
991     #[cfg(target_arch = "x86_64")]
992     fn load_payload(
993         payload: &PayloadConfig,
994         memory_manager: Arc<Mutex<MemoryManager>>,
995     ) -> Result<EntryPoint> {
996         trace_scoped!("load_payload");
997         match (
998             &payload.firmware,
999             &payload.kernel,
1000             &payload.initramfs,
1001             &payload.cmdline,
1002         ) {
1003             (Some(firmware), None, None, None) => {
1004                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1005                 Self::load_kernel(firmware, None, memory_manager)
1006             }
1007             (None, Some(kernel), _, _) => {
1008                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1009                 let cmdline = Self::generate_cmdline(payload)?;
1010                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1011             }
1012             _ => Err(Error::InvalidPayload),
1013         }
1014     }
1015 
1016     #[cfg(target_arch = "aarch64")]
1017     fn load_payload(
1018         payload: &PayloadConfig,
1019         memory_manager: Arc<Mutex<MemoryManager>>,
1020     ) -> Result<EntryPoint> {
1021         match (&payload.firmware, &payload.kernel) {
1022             (Some(firmware), None) => {
1023                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1024                 Self::load_kernel(Some(firmware), None, memory_manager)
1025             }
1026             (None, Some(kernel)) => {
1027                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1028                 Self::load_kernel(None, Some(kernel), memory_manager)
1029             }
1030             _ => Err(Error::InvalidPayload),
1031         }
1032     }
1033 
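    /// Loads the payload (kernel or firmware) on a separate thread so it can
    /// proceed in parallel with the rest of the VM construction; returns the
    /// join handle, or `None` when there is no payload or TDX is enabled.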
1034     fn load_payload_async(
1035         memory_manager: &Arc<Mutex<MemoryManager>>,
1036         config: &Arc<Mutex<VmConfig>>,
1037     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1038         // The kernel is loaded differently when TDX is enabled.
1039         #[cfg(feature = "tdx")]
1040         if config.lock().unwrap().is_tdx_enabled() {
1041             return Ok(None);
1042         }
1043 
1044         config
1045             .lock()
1046             .unwrap()
1047             .payload
1048             .as_ref()
1049             .map(|payload| {
1050                 let memory_manager = memory_manager.clone();
1051                 let payload = payload.clone();
1052 
1053                 std::thread::Builder::new()
1054                     .name("payload_loader".into())
1055                     .spawn(move || Self::load_payload(&payload, memory_manager))
1056                     .map_err(Error::KernelLoadThreadSpawn)
1057             })
1058             .transpose()
1059     }
1060 
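    /// Final x86_64 boot configuration: loads the initramfs if present and
    /// passes the boot vCPU count, RSDP address, SGX EPC region and platform
    /// identification strings (serial number, UUID, OEM strings) to
    /// `arch::configure_system()`.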
1061     #[cfg(target_arch = "x86_64")]
1062     fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
1063         trace_scoped!("configure_system");
1064         info!("Configuring system");
1065         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1066 
1067         let initramfs_config = match self.initramfs {
1068             Some(_) => Some(self.load_initramfs(&mem)?),
1069             None => None,
1070         };
1071 
1072         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1073         let rsdp_addr = Some(rsdp_addr);
1074         let sgx_epc_region = self
1075             .memory_manager
1076             .lock()
1077             .unwrap()
1078             .sgx_epc_region()
1079             .as_ref()
1080             .cloned();
1081 
1082         let serial_number = self
1083             .config
1084             .lock()
1085             .unwrap()
1086             .platform
1087             .as_ref()
1088             .and_then(|p| p.serial_number.clone());
1089 
1090         let uuid = self
1091             .config
1092             .lock()
1093             .unwrap()
1094             .platform
1095             .as_ref()
1096             .and_then(|p| p.uuid.clone());
1097 
1098         let oem_strings = self
1099             .config
1100             .lock()
1101             .unwrap()
1102             .platform
1103             .as_ref()
1104             .and_then(|p| p.oem_strings.clone());
1105 
1106         let oem_strings = oem_strings
1107             .as_deref()
1108             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1109 
1110         arch::configure_system(
1111             &mem,
1112             arch::layout::CMDLINE_START,
1113             &initramfs_config,
1114             boot_vcpus,
1115             rsdp_addr,
1116             sgx_epc_region,
1117             serial_number.as_deref(),
1118             uuid.as_deref(),
1119             oem_strings.as_deref(),
1120         )
1121         .map_err(Error::ConfigureSystem)?;
1122         Ok(())
1123     }
1124 
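
    /// Final aarch64 boot configuration: gathers the command line, vCPU
    /// MPIDRs and topology, device and PCI space information, the vGIC, PMU
    /// support and NUMA nodes, and passes them to `arch::configure_system()`.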
1125     #[cfg(target_arch = "aarch64")]
1126     fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
1127         let cmdline = Self::generate_cmdline(
1128             self.config.lock().unwrap().payload.as_ref().unwrap(),
1129             &self.device_manager,
1130         )?;
1131         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1132         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1133         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1134         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1135         let initramfs_config = match self.initramfs {
1136             Some(_) => Some(self.load_initramfs(&mem)?),
1137             None => None,
1138         };
1139 
1140         let device_info = &self
1141             .device_manager
1142             .lock()
1143             .unwrap()
1144             .get_device_info()
1145             .clone();
1146 
1147         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1148             let pci_space = PciSpaceInfo {
1149                 pci_segment_id: pci_segment.id,
1150                 mmio_config_address: pci_segment.mmio_config_address,
1151                 pci_device_space_start: pci_segment.start_of_device_area,
1152                 pci_device_space_size: pci_segment.end_of_device_area
1153                     - pci_segment.start_of_device_area
1154                     + 1,
1155             };
1156             pci_space_info.push(pci_space);
1157         }
1158 
1159         let virtio_iommu_bdf = self
1160             .device_manager
1161             .lock()
1162             .unwrap()
1163             .iommu_attached_devices()
1164             .as_ref()
1165             .map(|(v, _)| *v);
1166 
1167         let vgic = self
1168             .device_manager
1169             .lock()
1170             .unwrap()
1171             .get_interrupt_controller()
1172             .unwrap()
1173             .lock()
1174             .unwrap()
1175             .get_vgic()
1176             .map_err(|_| {
1177                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1178                     arch::aarch64::Error::SetupGic,
1179                 ))
1180             })?;
1181 
1182         // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
1183         // The PMU interrupt is a PPI, so 16 must be added to obtain the real IRQ number.
1184             .cpu_manager
1185             .lock()
1186             .unwrap()
1187             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1188             .map_err(|_| {
1189                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1190                     arch::aarch64::Error::VcpuInitPmu,
1191                 ))
1192             })?;
1193 
1194         arch::configure_system(
1195             &mem,
1196             cmdline.as_cstring().unwrap().to_str().unwrap(),
1197             vcpu_mpidrs,
1198             vcpu_topology,
1199             device_info,
1200             &initramfs_config,
1201             &pci_space_info,
1202             virtio_iommu_bdf.map(|bdf| bdf.into()),
1203             &vgic,
1204             &self.numa_nodes,
1205             pmu_supported,
1206         )
1207         .map_err(Error::ConfigureSystem)?;
1208 
1209         Ok(())
1210     }
1211 
1212     pub fn serial_pty(&self) -> Option<PtyPair> {
1213         self.device_manager.lock().unwrap().serial_pty()
1214     }
1215 
1216     pub fn console_pty(&self) -> Option<PtyPair> {
1217         self.device_manager.lock().unwrap().console_pty()
1218     }
1219 
1220     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1221         self.device_manager.lock().unwrap().console_resize_pipe()
1222     }
1223 
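    /// Shuts the VM down: resumes the device manager so its threads can exit
    /// cleanly, stops the vCPUs and joins the VM's helper threads.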
1224     pub fn shutdown(&mut self) -> Result<()> {
1225         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1226         let new_state = VmState::Shutdown;
1227 
1228         state.valid_transition(new_state)?;
1229 
1230         // Wake up the DeviceManager threads so they will get terminated cleanly
1231         self.device_manager
1232             .lock()
1233             .unwrap()
1234             .resume()
1235             .map_err(Error::Resume)?;
1236 
1237         self.cpu_manager
1238             .lock()
1239             .unwrap()
1240             .shutdown()
1241             .map_err(Error::CpuManager)?;
1242 
1243         // Wait for all the threads to finish
1244         for thread in self.threads.drain(..) {
1245             thread.join().map_err(Error::ThreadCleanup)?
1246         }
1247         *state = new_state;
1248 
1249         event!("vm", "shutdown");
1250 
1251         Ok(())
1252     }
1253 
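    /// Resizes the number of vCPUs, the guest memory and/or the balloon,
    /// notifying the guest through ACPI where needed and updating the stored
    /// `VmConfig` so the new values survive a reboot.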
1254     pub fn resize(
1255         &mut self,
1256         desired_vcpus: Option<u8>,
1257         desired_memory: Option<u64>,
1258         desired_balloon: Option<u64>,
1259     ) -> Result<()> {
1260         event!("vm", "resizing");
1261 
1262         if let Some(desired_vcpus) = desired_vcpus {
1263             if self
1264                 .cpu_manager
1265                 .lock()
1266                 .unwrap()
1267                 .resize(desired_vcpus)
1268                 .map_err(Error::CpuManager)?
1269             {
1270                 self.device_manager
1271                     .lock()
1272                     .unwrap()
1273                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1274                     .map_err(Error::DeviceManager)?;
1275             }
1276             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1277         }
1278 
1279         if let Some(desired_memory) = desired_memory {
1280             let new_region = self
1281                 .memory_manager
1282                 .lock()
1283                 .unwrap()
1284                 .resize(desired_memory)
1285                 .map_err(Error::MemoryManager)?;
1286 
1287             let memory_config = &mut self.config.lock().unwrap().memory;
1288 
1289             if let Some(new_region) = &new_region {
1290                 self.device_manager
1291                     .lock()
1292                     .unwrap()
1293                     .update_memory(new_region)
1294                     .map_err(Error::DeviceManager)?;
1295 
1296                 match memory_config.hotplug_method {
1297                     HotplugMethod::Acpi => {
1298                         self.device_manager
1299                             .lock()
1300                             .unwrap()
1301                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1302                             .map_err(Error::DeviceManager)?;
1303                     }
1304                     HotplugMethod::VirtioMem => {}
1305                 }
1306             }
1307 
1308             // We update the VM config regardless of the actual guest resize
1309             // operation result (happened or not), so that if the VM reboots
1310             // it will be running with the last configured memory size.
1311             match memory_config.hotplug_method {
1312                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1313                 HotplugMethod::VirtioMem => {
1314                     if desired_memory > memory_config.size {
1315                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1316                     } else {
1317                         memory_config.hotplugged_size = None;
1318                     }
1319                 }
1320             }
1321         }
1322 
1323         if let Some(desired_balloon) = desired_balloon {
1324             self.device_manager
1325                 .lock()
1326                 .unwrap()
1327                 .resize_balloon(desired_balloon)
1328                 .map_err(Error::DeviceManager)?;
1329 
1330             // Update the configuration value for the balloon size to ensure
1331             // a reboot will use the right value.
1332             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1333                 balloon_config.size = desired_balloon;
1334             }
1335         }
1336 
1337         event!("vm", "resized");
1338 
1339         Ok(())
1340     }
1341 
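    /// Resizes the memory zone identified by `id`; requesting less than the
    /// zone's boot size is rejected.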
1342     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1343         let memory_config = &mut self.config.lock().unwrap().memory;
1344 
1345         if let Some(zones) = &mut memory_config.zones {
1346             for zone in zones.iter_mut() {
1347                 if zone.id == id {
1348                     if desired_memory >= zone.size {
1349                         let hotplugged_size = desired_memory - zone.size;
1350                         self.memory_manager
1351                             .lock()
1352                             .unwrap()
1353                             .resize_zone(&id, desired_memory - zone.size)
1354                             .map_err(Error::MemoryManager)?;
1355                         // We update the memory zone config regardless of the
1356                         // actual 'resize-zone' operation result (happened or
1357                         // not), so that if the VM reboots it will be running
1358                         // with the last configured memory zone size.
1359                         zone.hotplugged_size = Some(hotplugged_size);
1360 
1361                         return Ok(());
1362                     } else {
1363                         error!(
1364                             "Requested size ({}) is smaller than the boot RAM size ({}) for \
1365                             this memory zone",
1366                             desired_memory, zone.size,
1367                         );
1368                         return Err(Error::ResizeZone);
1369                     }
1370                 }
1371             }
1372         }
1373 
1374         error!("Could not find the memory zone {} for the resize", id);
1375         Err(Error::ResizeZone)
1376     }
1377 
1378     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1379         let pci_device_info = self
1380             .device_manager
1381             .lock()
1382             .unwrap()
1383             .add_device(&mut device_cfg)
1384             .map_err(Error::DeviceManager)?;
1385 
1386         // Update VmConfig by adding the new device. This is important to
1387         // ensure the device will be created in case of a reboot.
1388         {
1389             let mut config = self.config.lock().unwrap();
1390             add_to_config(&mut config.devices, device_cfg);
1391         }
1392 
1393         self.device_manager
1394             .lock()
1395             .unwrap()
1396             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1397             .map_err(Error::DeviceManager)?;
1398 
1399         Ok(pci_device_info)
1400     }
1401 
1402     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1403         let pci_device_info = self
1404             .device_manager
1405             .lock()
1406             .unwrap()
1407             .add_user_device(&mut device_cfg)
1408             .map_err(Error::DeviceManager)?;
1409 
1410         // Update VmConfig by adding the new device. This is important to
1411         // ensure the device will be created in case of a reboot.
1412         {
1413             let mut config = self.config.lock().unwrap();
1414             add_to_config(&mut config.user_devices, device_cfg);
1415         }
1416 
1417         self.device_manager
1418             .lock()
1419             .unwrap()
1420             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1421             .map_err(Error::DeviceManager)?;
1422 
1423         Ok(pci_device_info)
1424     }
1425 
1426     pub fn remove_device(&mut self, id: String) -> Result<()> {
1427         self.device_manager
1428             .lock()
1429             .unwrap()
1430             .remove_device(id.clone())
1431             .map_err(Error::DeviceManager)?;
1432 
1433         // Update VmConfig by removing the device. This is important to
1434         // ensure the device will not be created in case of a reboot.
1435         self.config.lock().unwrap().remove_device(&id);
1436 
1437         self.device_manager
1438             .lock()
1439             .unwrap()
1440             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1441             .map_err(Error::DeviceManager)?;
1442         Ok(())
1443     }
1444 
1445     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1446         let pci_device_info = self
1447             .device_manager
1448             .lock()
1449             .unwrap()
1450             .add_disk(&mut disk_cfg)
1451             .map_err(Error::DeviceManager)?;
1452 
1453         // Update VmConfig by adding the new device. This is important to
1454         // ensure the device will be created in case of a reboot.
1455         {
1456             let mut config = self.config.lock().unwrap();
1457             add_to_config(&mut config.disks, disk_cfg);
1458         }
1459 
1460         self.device_manager
1461             .lock()
1462             .unwrap()
1463             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1464             .map_err(Error::DeviceManager)?;
1465 
1466         Ok(pci_device_info)
1467     }
1468 
1469     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1470         let pci_device_info = self
1471             .device_manager
1472             .lock()
1473             .unwrap()
1474             .add_fs(&mut fs_cfg)
1475             .map_err(Error::DeviceManager)?;
1476 
1477         // Update VmConfig by adding the new device. This is important to
1478         // ensure the device will be created in case of a reboot.
1479         {
1480             let mut config = self.config.lock().unwrap();
1481             add_to_config(&mut config.fs, fs_cfg);
1482         }
1483 
1484         self.device_manager
1485             .lock()
1486             .unwrap()
1487             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1488             .map_err(Error::DeviceManager)?;
1489 
1490         Ok(pci_device_info)
1491     }
1492 
1493     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1494         let pci_device_info = self
1495             .device_manager
1496             .lock()
1497             .unwrap()
1498             .add_pmem(&mut pmem_cfg)
1499             .map_err(Error::DeviceManager)?;
1500 
1501         // Update VmConfig by adding the new device. This is important to
1502         // ensure the device is re-created if the VM is rebooted.
1503         {
1504             let mut config = self.config.lock().unwrap();
1505             add_to_config(&mut config.pmem, pmem_cfg);
1506         }
1507 
1508         self.device_manager
1509             .lock()
1510             .unwrap()
1511             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1512             .map_err(Error::DeviceManager)?;
1513 
1514         Ok(pci_device_info)
1515     }
1516 
1517     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1518         let pci_device_info = self
1519             .device_manager
1520             .lock()
1521             .unwrap()
1522             .add_net(&mut net_cfg)
1523             .map_err(Error::DeviceManager)?;
1524 
1525         // Update VmConfig by adding the new device. This is important to
1526         // ensure the device is re-created if the VM is rebooted.
1527         {
1528             let mut config = self.config.lock().unwrap();
1529             add_to_config(&mut config.net, net_cfg);
1530         }
1531 
1532         self.device_manager
1533             .lock()
1534             .unwrap()
1535             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1536             .map_err(Error::DeviceManager)?;
1537 
1538         Ok(pci_device_info)
1539     }
1540 
1541     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1542         let pci_device_info = self
1543             .device_manager
1544             .lock()
1545             .unwrap()
1546             .add_vdpa(&mut vdpa_cfg)
1547             .map_err(Error::DeviceManager)?;
1548 
1549         // Update VmConfig by adding the new device. This is important to
1550         // ensure the device is re-created if the VM is rebooted.
1551         {
1552             let mut config = self.config.lock().unwrap();
1553             add_to_config(&mut config.vdpa, vdpa_cfg);
1554         }
1555 
1556         self.device_manager
1557             .lock()
1558             .unwrap()
1559             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1560             .map_err(Error::DeviceManager)?;
1561 
1562         Ok(pci_device_info)
1563     }
1564 
1565     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1566         let pci_device_info = self
1567             .device_manager
1568             .lock()
1569             .unwrap()
1570             .add_vsock(&mut vsock_cfg)
1571             .map_err(Error::DeviceManager)?;
1572 
1573         // Update VmConfig by adding the new device. This is important to
1574         // ensure the device is re-created if the VM is rebooted.
1575         {
1576             let mut config = self.config.lock().unwrap();
1577             config.vsock = Some(vsock_cfg);
1578         }
1579 
1580         self.device_manager
1581             .lock()
1582             .unwrap()
1583             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1584             .map_err(Error::DeviceManager)?;
1585 
1586         Ok(pci_device_info)
1587     }
1588 
1589     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1590         Ok(self.device_manager.lock().unwrap().counters())
1591     }
1592 
1593     #[cfg(feature = "tdx")]
1594     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1595         use arch::x86_64::tdx::*;
1596 
1597         let firmware_path = self
1598             .config
1599             .lock()
1600             .unwrap()
1601             .payload
1602             .as_ref()
1603             .unwrap()
1604             .firmware
1605             .clone()
1606             .ok_or(Error::TdxFirmwareMissing)?;
1607         // The TDVF file contains a table of sections as well as code.
1608         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1609 
1610         // Parse the table of sections; RAM backing them is allocated later
1611         // when the sections are populated.
1611         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1612     }
1613 
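    // Builds the flattened list of memory resources described in the TD HOB,
    // as (start, size, is_ram) tuples. `sorted_sections` must be sorted by
    // descending address so that pop() yields the lowest section first. Guest
    // RAM regions are split around the TDVF sections they interleave with
    // (the sections themselves are reported as non-RAM), and any sections
    // located past the last RAM region are appended at the end.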
1614     #[cfg(feature = "tdx")]
1615     fn hob_memory_resources(
1616         mut sorted_sections: Vec<TdvfSection>,
1617         guest_memory: &GuestMemoryMmap,
1618     ) -> Vec<(u64, u64, bool)> {
1619         let mut list = Vec::new();
1620 
1621         let mut current_section = sorted_sections.pop();
1622 
1623         // RAM regions interleaved with TDVF sections
1624         let mut next_start_addr = 0;
1625         for region in guest_memory.iter() {
1626             let region_start = region.start_addr().0;
1627             let region_end = region.last_addr().0;
1628             if region_start > next_start_addr {
1629                 next_start_addr = region_start;
1630             }
1631 
1632             loop {
1633                 let (start, size, ram) = if let Some(section) = &current_section {
1634                     if section.address <= next_start_addr {
1635                         (section.address, section.size, false)
1636                     } else {
1637                         let last_addr = std::cmp::min(section.address - 1, region_end);
1638                         (next_start_addr, last_addr - next_start_addr + 1, true)
1639                     }
1640                 } else {
1641                     (next_start_addr, region_end - next_start_addr + 1, true)
1642                 };
1643 
1644                 list.push((start, size, ram));
1645 
1646                 if !ram {
1647                     current_section = sorted_sections.pop();
1648                 }
1649 
1650                 next_start_addr = start + size;
1651 
1652                 if region_start > next_start_addr {
1653                     next_start_addr = region_start;
1654                 }
1655 
1656                 if next_start_addr > region_end {
1657                     break;
1658                 }
1659             }
1660         }
1661 
1662         // Once all the interleaved sections have been processed, let's simply
1663         // pull the remaining ones.
1664         if let Some(section) = current_section {
1665             list.push((section.address, section.size, false));
1666         }
1667         while let Some(section) = sorted_sections.pop() {
1668             list.push((section.address, section.size, false));
1669         }
1670 
1671         list
1672     }
1673 
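    // Prepares the guest memory for a TDX boot: allocates backing RAM for any
    // TDVF section falling outside of the existing guest RAM, copies the BFV
    // and CFV firmware volumes (and the payload plus its command line, when
    // present) from the TDVF file into guest memory, and builds the TD HOB
    // describing memory, MMIO and ACPI resources. Returns the guest address
    // of the HOB, later passed to the vCPUs through initialize_tdx().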
1674     #[cfg(feature = "tdx")]
1675     fn populate_tdx_sections(
1676         &mut self,
1677         sections: &[TdvfSection],
1678         guid_found: bool,
1679     ) -> Result<Option<u64>> {
1680         use arch::x86_64::tdx::*;
1681         // Get the memory end *before* we start adding TDVF ram regions
1682         let boot_guest_memory = self
1683             .memory_manager
1684             .lock()
1685             .as_ref()
1686             .unwrap()
1687             .boot_guest_memory();
1688         for section in sections {
1689             // No need to allocate if the section falls within guest RAM ranges
1690             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1691                 info!(
1692                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1693                     section
1694                 );
1695                 continue;
1696             }
1697 
1698             info!("Allocating TDVF Section: {:x?}", section);
1699             self.memory_manager
1700                 .lock()
1701                 .unwrap()
1702                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1703                 .map_err(Error::AllocatingTdvfMemory)?;
1704         }
1705 
1706         // The TDVF file contains a table of sections as well as code.
1707         let firmware_path = self
1708             .config
1709             .lock()
1710             .unwrap()
1711             .payload
1712             .as_ref()
1713             .unwrap()
1714             .firmware
1715             .clone()
1716             .ok_or(Error::TdxFirmwareMissing)?;
1717         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1718 
1719         // The guest memory now has all the required regions, so it is safe
1720         // to copy from the TDVF file into it.
1721         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1722         let mem = guest_memory.memory();
1723         let mut payload_info = None;
1724         let mut hob_offset = None;
1725         for section in sections {
1726             info!("Populating TDVF Section: {:x?}", section);
1727             match section.r#type {
1728                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1729                     info!("Copying section to guest memory");
1730                     firmware_file
1731                         .seek(SeekFrom::Start(section.data_offset as u64))
1732                         .map_err(Error::LoadTdvf)?;
1733                     mem.read_from(
1734                         GuestAddress(section.address),
1735                         &mut firmware_file,
1736                         section.data_size as usize,
1737                     )
1738                     .unwrap();
1739                 }
1740                 TdvfSectionType::TdHob => {
1741                     hob_offset = Some(section.address);
1742                 }
1743                 TdvfSectionType::Payload => {
1744                     info!("Copying payload to guest memory");
1745                     if let Some(payload_file) = self.kernel.as_mut() {
1746                         let payload_size = payload_file
1747                             .seek(SeekFrom::End(0))
1748                             .map_err(Error::LoadPayload)?;
1749 
1750                         payload_file
1751                             .seek(SeekFrom::Start(0x1f1))
1752                             .map_err(Error::LoadPayload)?;
1753 
1754                         let mut payload_header = linux_loader::bootparam::setup_header::default();
1755                         payload_header
1756                             .as_bytes()
1757                             .read_from(
1758                                 0,
1759                                 payload_file,
1760                                 mem::size_of::<linux_loader::bootparam::setup_header>(),
1761                             )
1762                             .unwrap();
1763 
1764                         if payload_header.header != 0x5372_6448 {
1765                             return Err(Error::InvalidPayloadType);
1766                         }
1767 
1768                         if (payload_header.version < 0x0200)
1769                             || ((payload_header.loadflags & 0x1) == 0x0)
1770                         {
1771                             return Err(Error::InvalidPayloadType);
1772                         }
1773 
1774                         payload_file.rewind().map_err(Error::LoadPayload)?;
1775                         mem.read_from(
1776                             GuestAddress(section.address),
1777                             payload_file,
1778                             payload_size as usize,
1779                         )
1780                         .unwrap();
1781 
1782                         // Create the payload info that will be inserted into
1783                         // the HOB.
1784                         payload_info = Some(PayloadInfo {
1785                             image_type: PayloadImageType::BzImage,
1786                             entry_point: section.address,
1787                         });
1788                     }
1789                 }
1790                 TdvfSectionType::PayloadParam => {
1791                     info!("Copying payload parameters to guest memory");
1792                     let cmdline = Self::generate_cmdline(
1793                         self.config.lock().unwrap().payload.as_ref().unwrap(),
1794                     )?;
1795                     mem.write_slice(
1796                         cmdline.as_cstring().unwrap().as_bytes_with_nul(),
1797                         GuestAddress(section.address),
1798                     )
1799                     .unwrap();
1800                 }
1801                 _ => {}
1802             }
1803         }
1804 
1805         // Generate HOB
1806         let mut hob = TdHob::start(hob_offset.unwrap());
1807 
1808         let mut sorted_sections = sections.to_vec();
1809         sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));
1810 
1811         sorted_sections.sort_by_key(|section| section.address);
1812         sorted_sections.reverse();
1813 
1814         for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
1815             hob.add_memory_resource(&mem, start, size, ram, guid_found)
1816                 .map_err(Error::PopulateHob)?;
1817         }
1818 
1819         // MMIO regions
1820         hob.add_mmio_resource(
1821             &mem,
1822             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1823             arch::layout::APIC_START.raw_value()
1824                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1825         )
1826         .map_err(Error::PopulateHob)?;
1827         let start_of_device_area = self
1828             .memory_manager
1829             .lock()
1830             .unwrap()
1831             .start_of_device_area()
1832             .raw_value();
1833         let end_of_device_area = self
1834             .memory_manager
1835             .lock()
1836             .unwrap()
1837             .end_of_device_area()
1838             .raw_value();
1839         hob.add_mmio_resource(
1840             &mem,
1841             start_of_device_area,
1842             end_of_device_area - start_of_device_area,
1843         )
1844         .map_err(Error::PopulateHob)?;
1845 
1846         // Loop over the ACPI tables and copy them to the HOB.
1847 
1848         for acpi_table in crate::acpi::create_acpi_tables_tdx(
1849             &self.device_manager,
1850             &self.cpu_manager,
1851             &self.memory_manager,
1852             &self.numa_nodes,
1853         ) {
1854             hob.add_acpi_table(&mem, acpi_table.as_slice())
1855                 .map_err(Error::PopulateHob)?;
1856         }
1857 
1858         // If a payload info has been created, let's insert it into the HOB.
1859         if let Some(payload_info) = payload_info {
1860             hob.add_payload(&mem, payload_info)
1861                 .map_err(Error::PopulateHob)?;
1862         }
1863 
1864         hob.finish(&mem).map_err(Error::PopulateHob)?;
1865 
1866         Ok(hob_offset)
1867     }
1868 
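    // Registers every TDVF section with the hypervisor, mapping its host
    // virtual address into the TD. Sections carrying the
    // TDVF_SECTION_ATTRIBUTES_EXTENDMR attribute are flagged so that their
    // contents get extended into the TD measurement.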
1869     #[cfg(feature = "tdx")]
1870     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1871         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1872         let mem = guest_memory.memory();
1873 
1874         for section in sections {
1875             self.vm
1876                 .tdx_init_memory_region(
1877                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
1878                     section.address,
1879                     section.size,
1880                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
1881                     section.attributes == 1,
1882                 )
1883                 .map_err(Error::InitializeTdxMemoryRegion)?;
1884         }
1885 
1886         Ok(())
1887     }
1888 
1889     // Creates the ACPI tables.
1890     // When TDX is enabled this is a no-op, since the tables are created and
1891     // passed to the guest when populating the HOB.
1893     fn create_acpi_tables(&self) -> Option<GuestAddress> {
1894         #[cfg(feature = "tdx")]
1895         if self.config.lock().unwrap().is_tdx_enabled() {
1896             return None;
1897         }
1898         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
1899         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
1900         let rsdp_addr = crate::acpi::create_acpi_tables(
1901             &mem,
1902             &self.device_manager,
1903             &self.cpu_manager,
1904             &self.memory_manager,
1905             &self.numa_nodes,
1906             tpm_enabled,
1907         );
1908         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
1909 
1910         Some(rsdp_addr)
1911     }
1912 
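    // Returns the kernel entry point, waiting for the asynchronous payload
    // loading thread (if one was spawned) to complete. Returns Ok(None) when
    // there is no payload to load.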
1913     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
1914         trace_scoped!("entry_point");
1915 
1916         self.load_payload_handle
1917             .take()
1918             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
1919             .transpose()
1920     }
1921 
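    // Boots the VM, or resumes it if it was paused. The main phases are:
    // create the ACPI tables, wait for the payload to finish loading,
    // configure the vCPUs with the payload entry point, perform the TDX
    // specific setup when enabled, configure the system, create the userspace
    // address space mappings (x86_64), and finally start the boot vCPUs,
    // moving the state to Running (or BreakPoint when stop_on_boot is set).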
1922     pub fn boot(&mut self) -> Result<()> {
1923         trace_scoped!("Vm::boot");
1924         info!("Booting VM");
1925         event!("vm", "booting");
1926         let current_state = self.get_state()?;
1927         if current_state == VmState::Paused {
1928             return self.resume().map_err(Error::Resume);
1929         }
1930 
1931         let new_state = if self.stop_on_boot {
1932             VmState::BreakPoint
1933         } else {
1934             VmState::Running
1935         };
1936         current_state.valid_transition(new_state)?;
1937 
1938         // Do this early so it overlaps with the kernel being loaded in the
1939         // background.
1939         #[cfg(target_arch = "x86_64")]
1940         let rsdp_addr = self.create_acpi_tables();
1941 
1942         // Load the kernel synchronously, or, if it is being loaded
1943         // asynchronously, wait for the load to finish.
1944         let entry_point = self.entry_point()?;
1945 
1946         #[cfg(feature = "tdx")]
1947         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
1948 
1949         // Configure the vcpus that have been created
1950         let vcpus = self.cpu_manager.lock().unwrap().vcpus();
1951         for vcpu in vcpus {
1952             let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
1953             let boot_setup = entry_point.map(|e| (e, guest_memory));
1954             self.cpu_manager
1955                 .lock()
1956                 .unwrap()
1957                 .configure_vcpu(vcpu, boot_setup)
1958                 .map_err(Error::CpuManager)?;
1959         }
1960 
1961         #[cfg(feature = "tdx")]
1962         let (sections, guid_found) = if tdx_enabled {
1963             self.extract_tdvf_sections()?
1964         } else {
1965             (Vec::new(), false)
1966         };
1967 
1968         // Configuring the TDX regions requires that the vCPUs are created.
1969         #[cfg(feature = "tdx")]
1970         let hob_address = if tdx_enabled {
1971             // TDX sections are written to memory.
1972             self.populate_tdx_sections(&sections, guid_found)?
1973         } else {
1974             None
1975         };
1976 
1977         // On aarch64 the ACPI tables depend on the vCPU MPIDR, which is only
1978         // available after the vCPUs have been configured.
1979         #[cfg(target_arch = "aarch64")]
1980         let rsdp_addr = self.create_acpi_tables();
1981 
1982         // Configure shared state based on loaded kernel
1983         entry_point
1984             .map(|_| {
1985                 // Safe to unwrap rsdp_addr as we know it can't be None when
1986                 // the entry_point is Some.
1987                 self.configure_system(rsdp_addr.unwrap())
1988             })
1989             .transpose()?;
1990 
1991         #[cfg(target_arch = "x86_64")]
1992         // Note: For x86, always call this function before starting the boot vCPUs.
1993         // Otherwise the guest would fail to boot because the userspace mappings
1994         // telling the hypervisor about the memory layout have not been created.
1995         // These mappings must exist before the vCPU threads are started for the
1996         // very first time.
1997         self.memory_manager
1998             .lock()
1999             .unwrap()
2000             .allocate_address_space()
2001             .map_err(Error::MemoryManager)?;
2002 
2003         #[cfg(feature = "tdx")]
2004         if let Some(hob_address) = hob_address {
2005             // With the HOB address extracted the vCPUs can have
2006             // their TDX state configured.
2007             self.cpu_manager
2008                 .lock()
2009                 .unwrap()
2010                 .initialize_tdx(hob_address)
2011                 .map_err(Error::CpuManager)?;
2012             // Let the hypervisor know which memory ranges are shared with the
2013             // guest. This prevents the guest from ignoring/discarding memory
2014             // regions provided by the host.
2015             self.init_tdx_memory(&sections)?;
2016             // With TDX memory and CPU state configured, the TDX setup is complete.
2017             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
2018         }
2019 
2020         self.cpu_manager
2021             .lock()
2022             .unwrap()
2023             .start_boot_vcpus(new_state == VmState::BreakPoint)
2024             .map_err(Error::CpuManager)?;
2025 
2026         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
2027         *state = new_state;
2028         event!("vm", "booted");
2029         Ok(())
2030     }
2031 
2032     pub fn restore(&mut self) -> Result<()> {
2033         event!("vm", "restoring");
2034 
2035         #[cfg(target_arch = "x86_64")]
2036         // Note: For x86, always call this function before starting the boot vCPUs.
2037         // Otherwise the guest would fail to boot because the userspace mappings
2038         // telling the hypervisor about the memory layout have not been created.
2039         // These mappings must exist before the vCPU threads are started for the
2040         // very first time for the restored VM.
2041         self.memory_manager
2042             .lock()
2043             .unwrap()
2044             .allocate_address_space()
2045             .map_err(Error::MemoryManager)?;
2046 
2047         // Now we can start all vCPUs from here.
2048         self.cpu_manager
2049             .lock()
2050             .unwrap()
2051             .start_restored_vcpus()
2052             .map_err(Error::CpuManager)?;
2053 
2054         event!("vm", "restored");
2055         Ok(())
2056     }
2057 
2058     /// Gets a thread-safe reference counted pointer to the VM configuration.
2059     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2060         Arc::clone(&self.config)
2061     }
2062 
2063     /// Get the VM state. Returns an error if the state is poisoned.
2064     pub fn get_state(&self) -> Result<VmState> {
2065         self.state
2066             .try_read()
2067             .map_err(|_| Error::PoisonedState)
2068             .map(|state| *state)
2069     }
2070 
2071     /// Gets the actual size of the balloon.
2072     pub fn balloon_size(&self) -> u64 {
2073         self.device_manager.lock().unwrap().balloon_size()
2074     }
2075 
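    // Sends the guest memory slot file descriptors to the migration
    // destination: for each slot, a memory fd request is written to the
    // socket, followed by the slot number with its file descriptor attached,
    // and the destination is expected to acknowledge every transfer with an
    // Ok response.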
2076     pub fn send_memory_fds(
2077         &mut self,
2078         socket: &mut UnixStream,
2079     ) -> std::result::Result<(), MigratableError> {
2080         for (slot, fd) in self
2081             .memory_manager
2082             .lock()
2083             .unwrap()
2084             .memory_slot_fds()
2085             .drain()
2086         {
2087             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2088                 .write_to(socket)
2089                 .map_err(|e| {
2090                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2091                 })?;
2092             socket
2093                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2094                 .map_err(|e| {
2095                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2096                 })?;
2097 
2098             let res = Response::read_from(socket)?;
2099             if res.status() != Status::Ok {
2100                 warn!("Error during memory fd migration");
2101                 Request::abandon().write_to(socket)?;
2102                 Response::read_from(socket).ok();
2103                 return Err(MigratableError::MigrateSend(anyhow!(
2104                     "Error during memory fd migration"
2105                 )));
2106             }
2107         }
2108 
2109         Ok(())
2110     }
2111 
2112     pub fn send_memory_regions<F>(
2113         &mut self,
2114         ranges: &MemoryRangeTable,
2115         fd: &mut F,
2116     ) -> std::result::Result<(), MigratableError>
2117     where
2118         F: Write,
2119     {
2120         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2121         let mem = guest_memory.memory();
2122 
2123         for range in ranges.regions() {
2124             let mut offset: u64 = 0;
2125             // Here we manually handle the retry in case the whole region can't
2126             // be written at once, because we can't use the write_all_to()
2127             // implementation from vm-memory::GuestMemory as it does not follow
2128             // the correct behavior. For more info about this issue see:
2129             // https://github.com/rust-vmm/vm-memory/issues/174
2130             loop {
2131                 let bytes_written = mem
2132                     .write_to(
2133                         GuestAddress(range.gpa + offset),
2134                         fd,
2135                         (range.length - offset) as usize,
2136                     )
2137                     .map_err(|e| {
2138                         MigratableError::MigrateSend(anyhow!(
2139                             "Error transferring memory to socket: {}",
2140                             e
2141                         ))
2142                     })?;
2143                 offset += bytes_written as u64;
2144 
2145                 if offset == range.length {
2146                     break;
2147                 }
2148             }
2149         }
2150 
2151         Ok(())
2152     }
2153 
2154     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2155         self.memory_manager
2156             .lock()
2157             .unwrap()
2158             .memory_range_table(false)
2159     }
2160 
2161     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2162         self.device_manager.lock().unwrap().device_tree()
2163     }
2164 
2165     pub fn activate_virtio_devices(&self) -> Result<()> {
2166         self.device_manager
2167             .lock()
2168             .unwrap()
2169             .activate_virtio_devices()
2170             .map_err(Error::ActivateVirtioDevices)
2171     }
2172 
2173     #[cfg(target_arch = "x86_64")]
2174     pub fn power_button(&self) -> Result<()> {
2175         self.device_manager
2176             .lock()
2177             .unwrap()
2178             .notify_power_button()
2179             .map_err(Error::PowerButton)
2181     }
2182 
2183     #[cfg(target_arch = "aarch64")]
2184     pub fn power_button(&self) -> Result<()> {
2185         self.device_manager
2186             .lock()
2187             .unwrap()
2188             .notify_power_button()
2189             .map_err(Error::PowerButton)
2190     }
2191 
2192     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2193         self.memory_manager.lock().unwrap().snapshot_data()
2194     }
2195 
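    // Dispatches a single GDB request to the right subsystem. Requests that
    // produce data (ReadRegs, ReadMem, ActiveVcpus) return a dedicated
    // response payload, while control requests (Pause, Resume, SetSingleStep,
    // SetHwBreakPoint, WriteRegs, WriteMem) return CommandComplete once
    // applied.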
2196     #[cfg(feature = "guest_debug")]
2197     pub fn debug_request(
2198         &mut self,
2199         gdb_request: &GdbRequestPayload,
2200         cpu_id: usize,
2201     ) -> Result<GdbResponsePayload> {
2202         use GdbRequestPayload::*;
2203         match gdb_request {
2204             SetSingleStep(single_step) => {
2205                 self.set_guest_debug(cpu_id, &[], *single_step)
2206                     .map_err(Error::Debug)?;
2207             }
2208             SetHwBreakPoint(addrs) => {
2209                 self.set_guest_debug(cpu_id, addrs, false)
2210                     .map_err(Error::Debug)?;
2211             }
2212             Pause => {
2213                 self.debug_pause().map_err(Error::Debug)?;
2214             }
2215             Resume => {
2216                 self.debug_resume().map_err(Error::Debug)?;
2217             }
2218             ReadRegs => {
2219                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2220                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2221             }
2222             WriteRegs(regs) => {
2223                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2224             }
2225             ReadMem(vaddr, len) => {
2226                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2227                 let mem = self
2228                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2229                     .map_err(Error::Debug)?;
2230                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2231             }
2232             WriteMem(vaddr, data) => {
2233                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2234                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2235                     .map_err(Error::Debug)?;
2236             }
2237             ActiveVcpus => {
2238                 let active_vcpus = self.active_vcpus();
2239                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2240             }
2241         }
2242         Ok(GdbResponsePayload::CommandComplete)
2243     }
2244 
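    // Gathers everything needed to write the guest coredump: the ELF note
    // size, the number of program headers (one per guest RAM mapping plus
    // one), the offset at which memory contents start, the memory regions to
    // dump and the destination file created from the URL.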
2245     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2246     fn get_dump_state(
2247         &mut self,
2248         destination_url: &str,
2249     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2250         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2251         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2252         let mut elf_phdr_num = 1;
2253         let elf_sh_info = 0;
2254         let coredump_file_path = url_to_file(destination_url)?;
2255         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2256 
2257         if mapping_num < UINT16_MAX - 2 {
2258             elf_phdr_num += mapping_num as u16;
2259         } else {
2260             panic!("mapping num beyond 65535 not supported");
2261         }
2262         let coredump_file = OpenOptions::new()
2263             .read(true)
2264             .write(true)
2265             .create_new(true)
2266             .open(coredump_file_path)
2267             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2268 
2269         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2270         let mem_data = self
2271             .memory_manager
2272             .lock()
2273             .unwrap()
2274             .coredump_memory_regions(mem_offset);
2275 
2276         Ok(DumpState {
2277             elf_note_size,
2278             elf_phdr_num,
2279             elf_sh_info,
2280             mem_offset,
2281             mem_info: Some(mem_data),
2282             file: Some(coredump_file),
2283         })
2284     }
2285 
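    // Memory contents are laid out after the ELF header, the notes and the
    // program header table, hence the offset computed below.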
2286     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2287     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2288         size_of::<elf::Elf64_Ehdr>() as u64
2289             + note_size as u64
2290             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2291     }
2292 }
2293 
2294 impl Pausable for Vm {
2295     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2296         event!("vm", "pausing");
2297         let mut state = self
2298             .state
2299             .try_write()
2300             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2301         let new_state = VmState::Paused;
2302 
2303         state
2304             .valid_transition(new_state)
2305             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2306 
2307         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2308         {
2309             let mut clock = self
2310                 .vm
2311                 .get_clock()
2312                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2313             clock.reset_flags();
2314             self.saved_clock = Some(clock);
2315         }
2316 
2317         // Before pausing the vCPUs, activate any virtio devices with a pending
2318         // activation, e.g. one requested since the pause (or the migration it is
2319         // part of) was started.
2319         self.activate_virtio_devices().map_err(|e| {
2320             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2321         })?;
2322 
2323         self.cpu_manager.lock().unwrap().pause()?;
2324         self.device_manager.lock().unwrap().pause()?;
2325 
2326         *state = new_state;
2327 
2328         event!("vm", "paused");
2329         Ok(())
2330     }
2331 
2332     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2333         event!("vm", "resuming");
2334         let mut state = self
2335             .state
2336             .try_write()
2337             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2338         let new_state = VmState::Running;
2339 
2340         state
2341             .valid_transition(new_state)
2342             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2343 
2344         self.cpu_manager.lock().unwrap().resume()?;
2345         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2346         {
2347             if let Some(clock) = &self.saved_clock {
2348                 self.vm.set_clock(clock).map_err(|e| {
2349                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2350                 })?;
2351             }
2352         }
2353         self.device_manager.lock().unwrap().resume()?;
2354 
2355         // And we're back to the Running state.
2356         *state = new_state;
2357         event!("vm", "resumed");
2358         Ok(())
2359     }
2360 }
2361 
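// VM-level state stored on top of the per-component (CPU, memory and device
// manager) snapshots: the saved guest clock and the common CPUID in use when
// the snapshot was taken (both KVM/x86_64 only).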
2362 #[derive(Serialize, Deserialize)]
2363 pub struct VmSnapshot {
2364     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2365     pub clock: Option<hypervisor::ClockData>,
2366     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2367     pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
2368 }
2369 
2370 pub const VM_SNAPSHOT_ID: &str = "vm";
2371 impl Snapshottable for Vm {
2372     fn id(&self) -> String {
2373         VM_SNAPSHOT_ID.to_string()
2374     }
2375 
2376     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2377         event!("vm", "snapshotting");
2378 
2379         #[cfg(feature = "tdx")]
2380         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2381 
2382         #[cfg(feature = "tdx")]
2383         {
2384             if tdx_enabled {
2385                 return Err(MigratableError::Snapshot(anyhow!(
2386                     "Snapshot not possible with TDX VM"
2387                 )));
2388             }
2389         }
2390 
2391         let current_state = self.get_state().unwrap();
2392         if current_state != VmState::Paused {
2393             return Err(MigratableError::Snapshot(anyhow!(
2394                 "Trying to snapshot while VM is running"
2395             )));
2396         }
2397 
2398         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2399         let common_cpuid = {
2400             let phys_bits = physical_bits(
2401                 &self.hypervisor,
2402                 self.config.lock().unwrap().cpus.max_phys_bits,
2403             );
2404             arch::generate_common_cpuid(
2405                 &self.hypervisor,
2406                 None,
2407                 None,
2408                 phys_bits,
2409                 self.config.lock().unwrap().cpus.kvm_hyperv,
2410                 #[cfg(feature = "tdx")]
2411                 tdx_enabled,
2412             )
2413             .map_err(|e| {
2414                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2415             })?
2416         };
2417 
2418         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2419             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2420             clock: self.saved_clock,
2421             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2422             common_cpuid,
2423         })
2424         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2425 
2426         let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data));
2427 
2428         let (id, snapshot) = {
2429             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2430             (cpu_manager.id(), cpu_manager.snapshot()?)
2431         };
2432         vm_snapshot.add_snapshot(id, snapshot);
2433         let (id, snapshot) = {
2434             let mut memory_manager = self.memory_manager.lock().unwrap();
2435             (memory_manager.id(), memory_manager.snapshot()?)
2436         };
2437         vm_snapshot.add_snapshot(id, snapshot);
2438         let (id, snapshot) = {
2439             let mut device_manager = self.device_manager.lock().unwrap();
2440             (device_manager.id(), device_manager.snapshot()?)
2441         };
2442         vm_snapshot.add_snapshot(id, snapshot);
2443 
2444         event!("vm", "snapshotted");
2445         Ok(vm_snapshot)
2446     }
2447 }
2448 
2449 impl Transportable for Vm {
2450     fn send(
2451         &self,
2452         snapshot: &Snapshot,
2453         destination_url: &str,
2454     ) -> std::result::Result<(), MigratableError> {
2455         let mut snapshot_config_path = url_to_path(destination_url)?;
2456         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2457 
2458         // Create the snapshot config file
2459         let mut snapshot_config_file = OpenOptions::new()
2460             .read(true)
2461             .write(true)
2462             .create_new(true)
2463             .open(snapshot_config_path)
2464             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2465 
2466         // Serialize and write the snapshot config
2467         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2468             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2469 
2470         snapshot_config_file
2471             .write_all(vm_config.as_bytes())
2472             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2473 
2474         let mut snapshot_state_path = url_to_path(destination_url)?;
2475         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2476 
2477         // Create the snapshot state file
2478         let mut snapshot_state_file = OpenOptions::new()
2479             .read(true)
2480             .write(true)
2481             .create_new(true)
2482             .open(snapshot_state_path)
2483             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2484 
2485         // Serialize and write the snapshot state
2486         let vm_state =
2487             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2488 
2489         snapshot_state_file
2490             .write_all(&vm_state)
2491             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2492 
2493         // Tell the memory manager to also send/write its own snapshot.
2494         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2495             self.memory_manager
2496                 .lock()
2497                 .unwrap()
2498                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2499         } else {
2500             return Err(MigratableError::Restore(anyhow!(
2501                 "Missing memory manager snapshot"
2502             )));
2503         }
2504 
2505         Ok(())
2506     }
2507 }
2508 
2509 impl Migratable for Vm {
2510     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2511         self.memory_manager.lock().unwrap().start_dirty_log()?;
2512         self.device_manager.lock().unwrap().start_dirty_log()
2513     }
2514 
2515     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2516         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2517         self.device_manager.lock().unwrap().stop_dirty_log()
2518     }
2519 
2520     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2521         Ok(MemoryRangeTable::new_from_tables(vec![
2522             self.memory_manager.lock().unwrap().dirty_log()?,
2523             self.device_manager.lock().unwrap().dirty_log()?,
2524         ]))
2525     }
2526 
2527     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2528         self.memory_manager.lock().unwrap().start_migration()?;
2529         self.device_manager.lock().unwrap().start_migration()
2530     }
2531 
2532     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2533         self.memory_manager.lock().unwrap().complete_migration()?;
2534         self.device_manager.lock().unwrap().complete_migration()
2535     }
2536 }
2537 
2538 #[cfg(feature = "guest_debug")]
2539 impl Debuggable for Vm {
2540     fn set_guest_debug(
2541         &self,
2542         cpu_id: usize,
2543         addrs: &[GuestAddress],
2544         singlestep: bool,
2545     ) -> std::result::Result<(), DebuggableError> {
2546         self.cpu_manager
2547             .lock()
2548             .unwrap()
2549             .set_guest_debug(cpu_id, addrs, singlestep)
2550     }
2551 
2552     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2553         if *self.state.read().unwrap() == VmState::Running {
2554             self.pause().map_err(DebuggableError::Pause)?;
2555         }
2556 
2557         let mut state = self
2558             .state
2559             .try_write()
2560             .map_err(|_| DebuggableError::PoisonedState)?;
2561         *state = VmState::BreakPoint;
2562         Ok(())
2563     }
2564 
2565     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2566         if *self.state.read().unwrap() == VmState::BreakPoint {
2567             self.resume().map_err(DebuggableError::Pause)?;
2568         }
2569 
2570         Ok(())
2571     }
2572 
2573     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2574         self.cpu_manager.lock().unwrap().read_regs(cpu_id)
2575     }
2576 
2577     fn write_regs(
2578         &self,
2579         cpu_id: usize,
2580         regs: &CoreRegs,
2581     ) -> std::result::Result<(), DebuggableError> {
2582         self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
2583     }
2584 
2585     fn read_mem(
2586         &self,
2587         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2588         cpu_id: usize,
2589         vaddr: GuestAddress,
2590         len: usize,
2591     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2592         self.cpu_manager
2593             .lock()
2594             .unwrap()
2595             .read_mem(guest_memory, cpu_id, vaddr, len)
2596     }
2597 
2598     fn write_mem(
2599         &self,
2600         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2601         cpu_id: usize,
2602         vaddr: &GuestAddress,
2603         data: &[u8],
2604     ) -> std::result::Result<(), DebuggableError> {
2605         self.cpu_manager
2606             .lock()
2607             .unwrap()
2608             .write_mem(guest_memory, cpu_id, vaddr, data)
2609     }
2610 
2611     fn active_vcpus(&self) -> usize {
2612         let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
2613         if active_vcpus > 0 {
2614             active_vcpus
2615         } else {
2616             // The VM is not booted yet. Report boot_vcpus() instead.
2617             self.cpu_manager.lock().unwrap().boot_vcpus() as usize
2618         }
2619     }
2620 }
2621 
2622 #[cfg(feature = "guest_debug")]
2623 pub const UINT16_MAX: u32 = 65535;
2624 
2625 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2626 impl Elf64Writable for Vm {}
2627 
2628 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2629 impl GuestDebuggable for Vm {
2630     fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
2631         event!("vm", "coredumping");
2632 
2633         let mut resume = false;
2634 
2635         #[cfg(feature = "tdx")]
2636         {
2637             if let Some(ref platform) = self.config.lock().unwrap().platform {
2638                 if platform.tdx {
2639                     return Err(GuestDebuggableError::Coredump(anyhow!(
2640                         "Coredump not possible with TDX VM"
2641                     )));
2642                 }
2643             }
2644         }
2645 
2646         match self.get_state().unwrap() {
2647             VmState::Running => {
2648                 self.pause().map_err(GuestDebuggableError::Pause)?;
2649                 resume = true;
2650             }
2651             VmState::Paused => {}
2652             _ => {
2653                 return Err(GuestDebuggableError::Coredump(anyhow!(
2654                     "Trying to coredump while VM is not running or paused"
2655                 )));
2656             }
2657         }
2658 
2659         let coredump_state = self.get_dump_state(destination_url)?;
2660 
2661         self.write_header(&coredump_state)?;
2662         self.write_note(&coredump_state)?;
2663         self.write_loads(&coredump_state)?;
2664 
2665         self.cpu_manager
2666             .lock()
2667             .unwrap()
2668             .cpu_write_elf64_note(&coredump_state)?;
2669         self.cpu_manager
2670             .lock()
2671             .unwrap()
2672             .cpu_write_vmm_note(&coredump_state)?;
2673 
2674         self.memory_manager
2675             .lock()
2676             .unwrap()
2677             .coredump_iterate_save_mem(&coredump_state)?;
2678 
2679         if resume {
2680             self.resume().map_err(GuestDebuggableError::Resume)?;
2681         }
2682 
2683         Ok(())
2684     }
2685 }
2686 
2687 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2688 #[cfg(test)]
2689 mod tests {
2690     use super::*;
2691 
2692     fn test_vm_state_transitions(state: VmState) {
2693         match state {
2694             VmState::Created => {
2695                 // Check the transitions from Created
2696                 assert!(state.valid_transition(VmState::Created).is_err());
2697                 assert!(state.valid_transition(VmState::Running).is_ok());
2698                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2699                 assert!(state.valid_transition(VmState::Paused).is_ok());
2700                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2701             }
2702             VmState::Running => {
2703                 // Check the transitions from Running
2704                 assert!(state.valid_transition(VmState::Created).is_err());
2705                 assert!(state.valid_transition(VmState::Running).is_err());
2706                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2707                 assert!(state.valid_transition(VmState::Paused).is_ok());
2708                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2709             }
2710             VmState::Shutdown => {
2711                 // Check the transitions from Shutdown
2712                 assert!(state.valid_transition(VmState::Created).is_err());
2713                 assert!(state.valid_transition(VmState::Running).is_ok());
2714                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2715                 assert!(state.valid_transition(VmState::Paused).is_err());
2716                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2717             }
2718             VmState::Paused => {
2719                 // Check the transitions from Paused
2720                 assert!(state.valid_transition(VmState::Created).is_err());
2721                 assert!(state.valid_transition(VmState::Running).is_ok());
2722                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2723                 assert!(state.valid_transition(VmState::Paused).is_err());
2724                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2725             }
2726             VmState::BreakPoint => {
2727                 // Check the transitions from Breakpoint
2728                 assert!(state.valid_transition(VmState::Created).is_ok());
2729                 assert!(state.valid_transition(VmState::Running).is_ok());
2730                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2731                 assert!(state.valid_transition(VmState::Paused).is_err());
2732                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2733             }
2734         }
2735     }
2736 
2737     #[test]
2738     fn test_vm_created_transitions() {
2739         test_vm_state_transitions(VmState::Created);
2740     }
2741 
2742     #[test]
2743     fn test_vm_running_transitions() {
2744         test_vm_state_transitions(VmState::Running);
2745     }
2746 
2747     #[test]
2748     fn test_vm_shutdown_transitions() {
2749         test_vm_state_transitions(VmState::Shutdown);
2750     }
2751 
2752     #[test]
2753     fn test_vm_paused_transitions() {
2754         test_vm_state_transitions(VmState::Paused);
2755     }
2756 
2757     #[cfg(feature = "tdx")]
2758     #[test]
2759     fn test_hob_memory_resources() {
2760         // Case 1: Two TDVF sections in the middle of the RAM
2761         let sections = vec![
2762             TdvfSection {
2763                 address: 0xc000,
2764                 size: 0x1000,
2765                 ..Default::default()
2766             },
2767             TdvfSection {
2768                 address: 0x1000,
2769                 size: 0x4000,
2770                 ..Default::default()
2771             },
2772         ];
2773         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
2774         let expected = vec![
2775             (0, 0x1000, true),
2776             (0x1000, 0x4000, false),
2777             (0x5000, 0x7000, true),
2778             (0xc000, 0x1000, false),
2779             (0xd000, 0x0fff_3000, true),
2780         ];
2781         assert_eq!(
2782             expected,
2783             Vm::hob_memory_resources(
2784                 sections,
2785                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2786             )
2787         );
2788 
2789         // Case 2: Two TDVF sections with no conflict with the RAM
2790         let sections = vec![
2791             TdvfSection {
2792                 address: 0x1000_1000,
2793                 size: 0x1000,
2794                 ..Default::default()
2795             },
2796             TdvfSection {
2797                 address: 0,
2798                 size: 0x1000,
2799                 ..Default::default()
2800             },
2801         ];
2802         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2803         let expected = vec![
2804             (0, 0x1000, false),
2805             (0x1000, 0x1000_0000, true),
2806             (0x1000_1000, 0x1000, false),
2807         ];
2808         assert_eq!(
2809             expected,
2810             Vm::hob_memory_resources(
2811                 sections,
2812                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2813             )
2814         );
2815 
2816         // Case 3: Two TDVF sections with partial conflicts with the RAM
2817         let sections = vec![
2818             TdvfSection {
2819                 address: 0x1000_0000,
2820                 size: 0x2000,
2821                 ..Default::default()
2822             },
2823             TdvfSection {
2824                 address: 0,
2825                 size: 0x2000,
2826                 ..Default::default()
2827             },
2828         ];
2829         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2830         let expected = vec![
2831             (0, 0x2000, false),
2832             (0x2000, 0x0fff_e000, true),
2833             (0x1000_0000, 0x2000, false),
2834         ];
2835         assert_eq!(
2836             expected,
2837             Vm::hob_memory_resources(
2838                 sections,
2839                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2840             )
2841         );
2842 
2843         // Case 4: Two TDVF sections with no conflict before the RAM and two
2844         // more additional sections with no conflict after the RAM.
2845         let sections = vec![
2846             TdvfSection {
2847                 address: 0x2000_1000,
2848                 size: 0x1000,
2849                 ..Default::default()
2850             },
2851             TdvfSection {
2852                 address: 0x2000_0000,
2853                 size: 0x1000,
2854                 ..Default::default()
2855             },
2856             TdvfSection {
2857                 address: 0x1000,
2858                 size: 0x1000,
2859                 ..Default::default()
2860             },
2861             TdvfSection {
2862                 address: 0,
2863                 size: 0x1000,
2864                 ..Default::default()
2865             },
2866         ];
2867         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
2868         let expected = vec![
2869             (0, 0x1000, false),
2870             (0x1000, 0x1000, false),
2871             (0x4000, 0x1000_0000, true),
2872             (0x2000_0000, 0x1000, false),
2873             (0x2000_1000, 0x1000, false),
2874         ];
2875         assert_eq!(
2876             expected,
2877             Vm::hob_memory_resources(
2878                 sections,
2879                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2880             )
2881         );
2882 
2883         // Case 5: One TDVF section overriding the entire RAM
2884         let sections = vec![TdvfSection {
2885             address: 0,
2886             size: 0x2000_0000,
2887             ..Default::default()
2888         }];
2889         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2890         let expected = vec![(0, 0x2000_0000, false)];
2891         assert_eq!(
2892             expected,
2893             Vm::hob_memory_resources(
2894                 sections,
2895                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2896             )
2897         );
2898 
2899         // Case 6: Two TDVF sections with no conflict with 2 RAM regions
2900         let sections = vec![
2901             TdvfSection {
2902                 address: 0x1000_2000,
2903                 size: 0x2000,
2904                 ..Default::default()
2905             },
2906             TdvfSection {
2907                 address: 0,
2908                 size: 0x2000,
2909                 ..Default::default()
2910             },
2911         ];
2912         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
2913             (GuestAddress(0x2000), 0x1000_0000),
2914             (GuestAddress(0x1000_4000), 0x1000_0000),
2915         ];
2916         let expected = vec![
2917             (0, 0x2000, false),
2918             (0x2000, 0x1000_0000, true),
2919             (0x1000_2000, 0x2000, false),
2920             (0x1000_4000, 0x1000_0000, true),
2921         ];
2922         assert_eq!(
2923             expected,
2924             Vm::hob_memory_resources(
2925                 sections,
2926                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2927             )
2928         );
2929 
2930         // Case 7: Two TDVF sections that partially overlap the two RAM regions
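             // The section at [0, 0x4000) truncates the head of the first RAM
             // region [0x1000, 0x1000_1000), while the section at
             // [0x1000_0000, 0x1000_4000) swallows its tail together with the
             // head of the second RAM region [0x1000_3000, 0x2000_3000); the
             // surviving RAM pieces show up as the two `true` entries.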
2931         let sections = vec![
2932             TdvfSection {
2933                 address: 0x1000_0000,
2934                 size: 0x4000,
2935                 ..Default::default()
2936             },
2937             TdvfSection {
2938                 address: 0,
2939                 size: 0x4000,
2940                 ..Default::default()
2941             },
2942         ];
2943         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
2944             (GuestAddress(0x1000), 0x1000_0000),
2945             (GuestAddress(0x1000_3000), 0x1000_0000),
2946         ];
2947         let expected = vec![
2948             (0, 0x4000, false),
2949             (0x4000, 0x0fff_c000, true),
2950             (0x1000_0000, 0x4000, false),
2951             (0x1000_4000, 0x0fff_f000, true),
2952         ];
2953         assert_eq!(
2954             expected,
2955             Vm::hob_memory_resources(
2956                 sections,
2957                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2958             )
2959         );
2960     }
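
         // Illustrative sketch, not part of the original test suite: every
         // `expected` vector above is sorted by start address and free of
         // overlaps. This hypothetical test spells out that invariant against
         // one of the literal vectors (Case 7) so the shape of
         // hob_memory_resources' output is easier to reason about.
         #[test]
         fn test_hob_memory_resources_invariants() {
             let regions: Vec<(u64, u64, bool)> = vec![
                 (0, 0x4000, false),
                 (0x4000, 0x0fff_c000, true),
                 (0x1000_0000, 0x4000, false),
                 (0x1000_4000, 0x0fff_f000, true),
             ];
             for pair in regions.windows(2) {
                 let (start, size, _) = pair[0];
                 let (next_start, _, _) = pair[1];
                 // Each entry must end at or before the start of the next one.
                 assert!(start.checked_add(size).unwrap() <= next_start);
             }
         }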
2961 }
2962 
2963 #[cfg(target_arch = "aarch64")]
2964 #[cfg(test)]
2965 mod tests {
2966     use super::*;
2967     use crate::GuestMemoryMmap;
2968     use arch::aarch64::fdt::create_fdt;
2969     use arch::aarch64::layout;
2970     use arch::{DeviceType, MmioDeviceInfo};
2971     use devices::gic::Gic;
2972 
2973     const LEN: u64 = 4096;
2974 
2975     #[test]
2976     fn test_create_fdt_with_devices() {
2977         let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
2978         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
2979 
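             // Describe three MMIO devices (a serial port, a virtio transport and
             // an RTC) occupying consecutive 4 KiB windows starting at address 0,
             // with interrupt numbers 33-35; the generated FDT is expected to
             // carry a node for each of them.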
2980         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
2981             (
2982                 (DeviceType::Serial, DeviceType::Serial.to_string()),
2983                 MmioDeviceInfo {
2984                     addr: 0x00,
2985                     len: LEN,
2986                     irq: 33,
2987                 },
2988             ),
2989             (
2990                 (DeviceType::Virtio(1), "virtio".to_string()),
2991                 MmioDeviceInfo {
2992                     addr: LEN,
2993                     len: LEN,
2994                     irq: 34,
2995                 },
2996             ),
2997             (
2998                 (DeviceType::Rtc, "rtc".to_string()),
2999                 MmioDeviceInfo {
3000                     addr: 2 * LEN,
3001                     len: LEN,
3002                     irq: 35,
3003                 },
3004             ),
3005         ]
3006         .iter()
3007         .cloned()
3008         .collect();
3009 
3010         let hv = hypervisor::new().unwrap();
3011         let vm = hv.create_vm().unwrap();
3012         let gic = vm
3013             .create_vgic(Gic::create_default_config(1))
3014             .expect("Cannot create gic");
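             // Build the FDT for a single-vCPU guest using the device map above
             // and a default one-vCPU GIC configuration; the test only asserts
             // that FDT creation succeeds and does not inspect the resulting blob.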
3015         assert!(create_fdt(
3016             &mem,
3017             "console=tty0",
3018             vec![0],
3019             Some((0, 0, 0)),
3020             &dev_info,
3021             &gic,
3022             &None,
3023             &Vec::new(),
3024             &BTreeMap::new(),
3025             None,
3026             true,
3027         )
3028         .is_ok());
3029     }
3030 }
3031 
3032 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
3033 #[test]
3034 pub fn test_vm() {
3035     use hypervisor::VmExit;
3036     use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
3037     // This example is based on https://lwn.net/Articles/658511/
3038     let code = [
3039         0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
3040         0x00, 0xd8, /* add %bl, %al */
3041         0x04, b'0', /* add $'0', %al */
3042         0xee, /* out %al, (%dx) */
3043         0xb0, b'\n', /* mov $'\n', %al */
3044         0xee, /* out %al, (%dx) */
3045         0xf4, /* hlt */
3046     ];
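         // With the register values programmed below (rax = 2, rbx = 3), this
         // real-mode snippet computes 2 + 3, converts the sum to ASCII by adding
         // '0', and writes "5\n" to port 0x3f8 before halting; each `out`
         // surfaces as an IoOut exit and the final hlt ends the run loop.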
3047 
3048     let mem_size = 0x1000;
3049     let load_addr = GuestAddress(0x1000);
3050     let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();
3051 
3052     let hv = hypervisor::new().unwrap();
3053     let vm = hv.create_vm().expect("new VM creation failed");
3054 
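         // Register each guest memory region with the hypervisor: slot index,
         // guest physical start address, length and backing host virtual address,
         // with the two trailing flags (read-only mapping and dirty-page logging)
         // left disabled.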
3055     for (index, region) in mem.iter().enumerate() {
3056         let mem_region = vm.make_user_memory_region(
3057             index as u32,
3058             region.start_addr().raw_value(),
3059             region.len(),
3060             region.as_ptr() as u64,
3061             false,
3062             false,
3063         );
3064 
3065         vm.create_user_memory_region(mem_region)
3066             .expect("Cannot configure guest memory");
3067     }
3068     mem.write_slice(&code, load_addr)
3069         .expect("Writing code to memory failed");
3070 
3071     let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");
3072 
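         // Run the vCPU in flat real mode: clear the CS base and selector so the
         // code loaded at guest-physical 0x1000 is directly addressable, point rip
         // at it, preload rax and rbx with the two addends, and set rflags to 2
         // since bit 1 of RFLAGS is reserved and must always be set.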
3073     let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
3074     vcpu_sregs.cs.base = 0;
3075     vcpu_sregs.cs.selector = 0;
3076     vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");
3077 
3078     let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
3079     vcpu_regs.rip = 0x1000;
3080     vcpu_regs.rax = 2;
3081     vcpu_regs.rbx = 3;
3082     vcpu_regs.rflags = 2;
3083     vcpu.set_regs(&vcpu_regs).expect("set regs failed");
3084 
3085     loop {
3086         match vcpu.run().expect("run failed") {
3087             VmExit::IoOut(addr, data) => {
3088                 println!(
3089                     "IO out -- addr: {:#x} data [{:?}]",
3090                     addr,
3091                     str::from_utf8(data).unwrap()
3092                 );
3093             }
3094             VmExit::Reset => {
3095                 println!("HLT");
3096                 break;
3097             }
3098             r => panic!("unexpected exit reason: {r:?}"),
3099         }
3100     }
3101 }
3102
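     // Illustrative sketch, not part of the original file: a hypothetical
     // host-side mirror of the guest arithmetic exercised by test_vm above,
     // kept in plain Rust so the bytes expected on port 0x3f8 ("5\n") are easy
     // to check without running a VM.
     #[test]
     fn test_vm_expected_serial_bytes() {
         // al starts at rax (2) and bl at rbx (3); `add %bl, %al` followed by
         // `add $'0', %al` yields the ASCII digit written to the serial port,
         // then a newline is written.
         let mut al: u8 = 2;
         let bl: u8 = 3;
         al += bl;
         al += b'0';
         assert_eq!([al, b'\n'], *b"5\n");
     }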