xref: /cloud-hypervisor/vmm/src/vm.rs (revision 2571e59438597f53aa4993cd70d6462fe1364ba7)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
20 use crate::coredump::{
21     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
22 };
23 use crate::cpu;
24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair};
25 use crate::device_tree::DeviceTree;
26 #[cfg(feature = "guest_debug")]
27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
28 use crate::memory_manager::{
29     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
30 };
31 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
32 use crate::migration::get_vm_snapshot;
33 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
34 use crate::migration::url_to_file;
35 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
36 use crate::GuestMemoryMmap;
37 use crate::{
38     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
39 };
40 use anyhow::anyhow;
41 use arch::get_host_cpu_phys_bits;
42 #[cfg(target_arch = "x86_64")]
43 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
44 #[cfg(feature = "tdx")]
45 use arch::x86_64::tdx::TdvfSection;
46 use arch::EntryPoint;
47 #[cfg(target_arch = "aarch64")]
48 use arch::PciSpaceInfo;
49 use arch::{NumaNode, NumaNodes};
50 #[cfg(target_arch = "aarch64")]
51 use devices::interrupt_controller;
52 use devices::AcpiNotificationFlags;
53 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
54 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
55 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
56 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
57 use hypervisor::{HypervisorVmError, VmOps};
58 use libc::{termios, SIGWINCH};
59 use linux_loader::cmdline::Cmdline;
60 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
61 use linux_loader::elf;
62 #[cfg(target_arch = "x86_64")]
63 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
64 #[cfg(target_arch = "aarch64")]
65 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
66 use linux_loader::loader::KernelLoader;
67 use seccompiler::SeccompAction;
68 use serde::{Deserialize, Serialize};
69 use std::cmp;
70 use std::collections::BTreeMap;
71 use std::collections::HashMap;
72 use std::convert::TryInto;
73 use std::fs::{File, OpenOptions};
74 use std::io::{self, Seek, SeekFrom, Write};
75 #[cfg(feature = "tdx")]
76 use std::mem;
77 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
78 use std::mem::size_of;
79 use std::num::Wrapping;
80 use std::ops::Deref;
81 use std::os::unix::net::UnixStream;
82 use std::sync::{Arc, Mutex, RwLock};
83 use std::time::Instant;
84 use std::{result, str, thread};
85 use thiserror::Error;
86 use tracer::trace_scoped;
87 use vm_device::Bus;
88 #[cfg(feature = "tdx")]
89 use vm_memory::{Address, ByteValued, GuestMemory, GuestMemoryRegion};
90 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic};
91 use vm_migration::protocol::{Request, Response, Status};
92 use vm_migration::{
93     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
94     SnapshotData, Snapshottable, Transportable,
95 };
96 use vmm_sys_util::eventfd::EventFd;
97 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
98 
99 /// Errors associated with VM management
100 #[derive(Debug, Error)]
101 pub enum Error {
102     #[error("Cannot open kernel file: {0}")]
103     KernelFile(#[source] io::Error),
104 
105     #[error("Cannot open initramfs file: {0}")]
106     InitramfsFile(#[source] io::Error),
107 
108     #[error("Cannot load the kernel into memory: {0}")]
109     KernelLoad(#[source] linux_loader::loader::Error),
110 
111     #[cfg(target_arch = "aarch64")]
112     #[error("Cannot load the UEFI binary into memory: {0:?}")]
113     UefiLoad(arch::aarch64::uefi::Error),
114 
115     #[error("Cannot load the initramfs into memory")]
116     InitramfsLoad,
117 
118     #[error("Cannot load the kernel command line into memory: {0}")]
119     LoadCmdLine(#[source] linux_loader::loader::Error),
120 
121     #[error("Cannot modify the kernel command line: {0}")]
122     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
123 
124     #[error("Cannot create the kernel command line: {0}")]
125     CmdLineCreate(#[source] linux_loader::cmdline::Error),
126 
127     #[error("Cannot configure system: {0}")]
128     ConfigureSystem(#[source] arch::Error),
129 
130     #[cfg(target_arch = "aarch64")]
131     #[error("Cannot enable interrupt controller: {0:?}")]
132     EnableInterruptController(interrupt_controller::Error),
133 
134     #[error("VM state is poisoned")]
135     PoisonedState,
136 
137     #[error("Error from device manager: {0:?}")]
138     DeviceManager(DeviceManagerError),
139 
140     #[error("No device with id {0:?} to remove")]
141     NoDeviceToRemove(String),
142 
143     #[error("Cannot spawn a signal handler thread: {0}")]
144     SignalHandlerSpawn(#[source] io::Error),
145 
146     #[error("Failed to join on threads: {0:?}")]
147     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
148 
149     #[error("VM config is missing")]
150     VmMissingConfig,
151 
152     #[error("VM is not created")]
153     VmNotCreated,
154 
155     #[error("VM is already created")]
156     VmAlreadyCreated,
157 
158     #[error("VM is not running")]
159     VmNotRunning,
160 
161     #[error("Cannot clone EventFd: {0}")]
162     EventFdClone(#[source] io::Error),
163 
164     #[error("Invalid VM state transition: {0:?} to {1:?}")]
165     InvalidStateTransition(VmState, VmState),
166 
167     #[error("Error from CPU manager: {0}")]
168     CpuManager(#[source] cpu::Error),
169 
170     #[error("Cannot pause devices: {0}")]
171     PauseDevices(#[source] MigratableError),
172 
173     #[error("Cannot resume devices: {0}")]
174     ResumeDevices(#[source] MigratableError),
175 
176     #[error("Cannot pause CPUs: {0}")]
177     PauseCpus(#[source] MigratableError),
178 
179     #[error("Cannot resume CPUs: {0}")]
180     ResumeCpus(#[source] MigratableError),
181 
182     #[error("Cannot pause VM: {0}")]
183     Pause(#[source] MigratableError),
184 
185     #[error("Cannot resume VM: {0}")]
186     Resume(#[source] MigratableError),
187 
188     #[error("Memory manager error: {0:?}")]
189     MemoryManager(MemoryManagerError),
190 
191     #[error("Eventfd write error: {0}")]
192     EventfdError(#[source] std::io::Error),
193 
194     #[error("Cannot snapshot VM: {0}")]
195     Snapshot(#[source] MigratableError),
196 
197     #[error("Cannot restore VM: {0}")]
198     Restore(#[source] MigratableError),
199 
200     #[error("Cannot send VM snapshot: {0}")]
201     SnapshotSend(#[source] MigratableError),
202 
203     #[error("Invalid restore source URL")]
204     InvalidRestoreSourceUrl,
205 
206     #[error("Failed to validate config: {0}")]
207     ConfigValidation(#[source] ValidationError),
208 
209     #[error("Too many virtio-vsock devices")]
210     TooManyVsockDevices,
211 
212     #[error("Failed serializing into JSON: {0}")]
213     SerializeJson(#[source] serde_json::Error),
214 
215     #[error("Invalid NUMA configuration")]
216     InvalidNumaConfig,
217 
218     #[error("Cannot create seccomp filter: {0}")]
219     CreateSeccompFilter(#[source] seccompiler::Error),
220 
221     #[error("Cannot apply seccomp filter: {0}")]
222     ApplySeccompFilter(#[source] seccompiler::Error),
223 
224     #[error("Failed resizing a memory zone")]
225     ResizeZone,
226 
227     #[error("Cannot activate virtio devices: {0:?}")]
228     ActivateVirtioDevices(DeviceManagerError),
229 
230     #[error("Error triggering power button: {0:?}")]
231     PowerButton(DeviceManagerError),
232 
233     #[error("Kernel lacks PVH header")]
234     KernelMissingPvhHeader,
235 
236     #[error("Failed to allocate firmware RAM: {0:?}")]
237     AllocateFirmwareMemory(MemoryManagerError),
238 
239     #[error("Error manipulating firmware file: {0}")]
240     FirmwareFile(#[source] std::io::Error),
241 
242     #[error("Firmware too big")]
243     FirmwareTooLarge,
244 
245     #[error("Failed to copy firmware to memory: {0}")]
246     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
247 
248     #[cfg(feature = "tdx")]
249     #[error("Error performing I/O on TDX firmware file: {0}")]
250     LoadTdvf(#[source] std::io::Error),
251 
252     #[cfg(feature = "tdx")]
253     #[error("Error performing I/O on the TDX payload file: {0}")]
254     LoadPayload(#[source] std::io::Error),
255 
256     #[cfg(feature = "tdx")]
257     #[error("Error parsing TDVF: {0}")]
258     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
259 
260     #[cfg(feature = "tdx")]
261     #[error("Error populating TDX HOB: {0}")]
262     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
263 
264     #[cfg(feature = "tdx")]
265     #[error("Error allocating TDVF memory: {0:?}")]
266     AllocatingTdvfMemory(crate::memory_manager::Error),
267 
268     #[cfg(feature = "tdx")]
269     #[error("Error enabling TDX VM: {0}")]
270     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
271 
272     #[cfg(feature = "tdx")]
273     #[error("Error enabling TDX memory region: {0}")]
274     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
275 
276     #[cfg(feature = "tdx")]
277     #[error("Error finalizing TDX VM: {0}")]
278     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
279 
280     #[cfg(feature = "tdx")]
281     #[error("TDX firmware missing")]
282     TdxFirmwareMissing,
283 
284     #[cfg(feature = "tdx")]
285     #[error("Invalid TDX payload type")]
286     InvalidPayloadType,
287 
288     #[cfg(feature = "guest_debug")]
289     #[error("Error debugging VM: {0:?}")]
290     Debug(DebuggableError),
291 
292     #[error("Error spawning kernel loading thread")]
293     KernelLoadThreadSpawn(std::io::Error),
294 
295     #[error("Error joining kernel loading thread")]
296     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
297 
298     #[error("Payload configuration is not bootable")]
299     InvalidPayload,
300 
301     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
302     #[error("Error coredumping VM: {0:?}")]
303     Coredump(GuestDebuggableError),
304 }
305 pub type Result<T> = result::Result<T, Error>;
306 
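/// Lifecycle state of a `Vm`. Transitions between states are validated by
/// `VmState::valid_transition()`.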
307 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
308 pub enum VmState {
309     Created,
310     Running,
311     Shutdown,
312     Paused,
313     BreakPoint,
314 }
315 
316 impl VmState {
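    /// Returns `Ok(())` if a transition from `self` to `new_state` is allowed,
    /// or `Error::InvalidStateTransition` otherwise.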
317     fn valid_transition(self, new_state: VmState) -> Result<()> {
318         match self {
319             VmState::Created => match new_state {
320                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
321                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
322                     Ok(())
323                 }
324             },
325 
326             VmState::Running => match new_state {
327                 VmState::Created | VmState::Running => {
328                     Err(Error::InvalidStateTransition(self, new_state))
329                 }
330                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
331             },
332 
333             VmState::Shutdown => match new_state {
334                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
335                     Err(Error::InvalidStateTransition(self, new_state))
336                 }
337                 VmState::Running => Ok(()),
338             },
339 
340             VmState::Paused => match new_state {
341                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
342                     Err(Error::InvalidStateTransition(self, new_state))
343                 }
344                 VmState::Running | VmState::Shutdown => Ok(()),
345             },
346             VmState::BreakPoint => match new_state {
347                 VmState::Created | VmState::Running => Ok(()),
348                 _ => Err(Error::InvalidStateTransition(self, new_state)),
349             },
350         }
351     }
352 }
353 
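/// Glue between the hypervisor's `VmOps` callbacks and the VMM: guest memory
/// accesses go through the atomic guest memory, while MMIO and (on x86_64)
/// PIO accesses are dispatched to the corresponding device `Bus`.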
354 struct VmOpsHandler {
355     memory: GuestMemoryAtomic<GuestMemoryMmap>,
356     #[cfg(target_arch = "x86_64")]
357     io_bus: Arc<Bus>,
358     mmio_bus: Arc<Bus>,
359 }
360 
361 impl VmOps for VmOpsHandler {
362     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
363         self.memory
364             .memory()
365             .write(buf, GuestAddress(gpa))
366             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
367     }
368 
369     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
370         self.memory
371             .memory()
372             .read(buf, GuestAddress(gpa))
373             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
374     }
375 
376     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
377         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
378             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
379         }
380         Ok(())
381     }
382 
383     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
384         match self.mmio_bus.write(gpa, data) {
385             Err(vm_device::BusError::MissingAddressRange) => {
386                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
387             }
388             Ok(Some(barrier)) => {
389                 info!("Waiting for barrier");
390                 barrier.wait();
391                 info!("Barrier released");
392             }
393             _ => {}
394         };
395         Ok(())
396     }
397 
398     #[cfg(target_arch = "x86_64")]
399     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
400         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
401             info!("Guest PIO read to unregistered address 0x{:x}", port);
402         }
403         Ok(())
404     }
405 
406     #[cfg(target_arch = "x86_64")]
407     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
408         match self.io_bus.write(port, data) {
409             Err(vm_device::BusError::MissingAddressRange) => {
410                 info!("Guest PIO write to unregistered address 0x{:x}", port);
411             }
412             Ok(Some(barrier)) => {
413                 info!("Waiting for barrier");
414                 barrier.wait();
415                 info!("Barrier released");
416             }
417             _ => {}
418         };
419         Ok(())
420     }
421 }
422 
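/// Returns the number of guest physical address bits to use: the host CPU's
/// physical address width, capped at `max_phys_bits`.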
423 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
424     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
425 
426     cmp::min(host_phys_bits, max_phys_bits)
427 }
428 
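/// A fully assembled virtual machine, tying together the device, CPU and
/// memory managers with the underlying hypervisor VM handle.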
429 pub struct Vm {
430     #[cfg(feature = "tdx")]
431     kernel: Option<File>,
432     initramfs: Option<File>,
433     threads: Vec<thread::JoinHandle<()>>,
434     device_manager: Arc<Mutex<DeviceManager>>,
435     config: Arc<Mutex<VmConfig>>,
436     state: RwLock<VmState>,
437     cpu_manager: Arc<Mutex<cpu::CpuManager>>,
438     memory_manager: Arc<Mutex<MemoryManager>>,
439     #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
440     // The hypervisor-abstracted virtual machine.
441     vm: Arc<dyn hypervisor::Vm>,
442     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
443     saved_clock: Option<hypervisor::ClockData>,
444     numa_nodes: NumaNodes,
445     #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
446     hypervisor: Arc<dyn hypervisor::Hypervisor>,
447     stop_on_boot: bool,
448     load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
449 }
450 
451 impl Vm {
452     pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
453 
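    /// Builds a `Vm` on top of an existing `MemoryManager`: validates the
    /// configuration, starts loading the boot payload in the background
    /// (unless restoring), then creates the CPU and device managers,
    /// restoring their state from `snapshot` when one is provided.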
454     #[allow(clippy::too_many_arguments)]
455     pub fn new_from_memory_manager(
456         config: Arc<Mutex<VmConfig>>,
457         memory_manager: Arc<Mutex<MemoryManager>>,
458         vm: Arc<dyn hypervisor::Vm>,
459         exit_evt: EventFd,
460         reset_evt: EventFd,
461         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
462         seccomp_action: &SeccompAction,
463         hypervisor: Arc<dyn hypervisor::Hypervisor>,
464         activate_evt: EventFd,
465         timestamp: Instant,
466         serial_pty: Option<PtyPair>,
467         console_pty: Option<PtyPair>,
468         console_resize_pipe: Option<File>,
469         original_termios: Arc<Mutex<Option<termios>>>,
470         snapshot: Option<Snapshot>,
471     ) -> Result<Self> {
472         trace_scoped!("Vm::new_from_memory_manager");
473 
474         let boot_id_list = config
475             .lock()
476             .unwrap()
477             .validate()
478             .map_err(Error::ConfigValidation)?;
479 
480         let load_payload_handle = if snapshot.is_none() {
481             Self::load_payload_async(&memory_manager, &config)?
482         } else {
483             None
484         };
485 
486         info!("Booting VM from config: {:?}", &config);
487 
488         // Create NUMA nodes based on NumaConfig.
489         let numa_nodes =
490             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
491 
492         #[cfg(feature = "tdx")]
493         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
494         #[cfg(feature = "tdx")]
495         let force_iommu = tdx_enabled;
496         #[cfg(not(feature = "tdx"))]
497         let force_iommu = false;
498 
499         #[cfg(feature = "guest_debug")]
500         let stop_on_boot = config.lock().unwrap().gdb;
501         #[cfg(not(feature = "guest_debug"))]
502         let stop_on_boot = false;
503 
504         let memory = memory_manager.lock().unwrap().guest_memory();
505         #[cfg(target_arch = "x86_64")]
506         let io_bus = Arc::new(Bus::new());
507         let mmio_bus = Arc::new(Bus::new());
508 
509         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
510             memory,
511             #[cfg(target_arch = "x86_64")]
512             io_bus: io_bus.clone(),
513             mmio_bus: mmio_bus.clone(),
514         });
515 
516         let cpus_config = { &config.lock().unwrap().cpus.clone() };
517         let cpu_manager = cpu::CpuManager::new(
518             cpus_config,
519             vm.clone(),
520             exit_evt.try_clone().map_err(Error::EventFdClone)?,
521             reset_evt.try_clone().map_err(Error::EventFdClone)?,
522             #[cfg(feature = "guest_debug")]
523             vm_debug_evt,
524             &hypervisor,
525             seccomp_action.clone(),
526             vm_ops,
527             #[cfg(feature = "tdx")]
528             tdx_enabled,
529             &numa_nodes,
530         )
531         .map_err(Error::CpuManager)?;
532 
533         #[cfg(target_arch = "x86_64")]
534         cpu_manager
535             .lock()
536             .unwrap()
537             .populate_cpuid(
538                 &memory_manager,
539                 &hypervisor,
540                 #[cfg(feature = "tdx")]
541                 tdx_enabled,
542             )
543             .map_err(Error::CpuManager)?;
544 
545         // The initial TDX configuration must be done before the vCPUs are
546         // created
547         #[cfg(feature = "tdx")]
548         if tdx_enabled {
549             let cpuid = cpu_manager.lock().unwrap().common_cpuid();
550             let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
551             vm.tdx_init(&cpuid, max_vcpus)
552                 .map_err(Error::InitializeTdxVm)?;
553         }
554 
555         cpu_manager
556             .lock()
557             .unwrap()
558             .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
559             .map_err(Error::CpuManager)?;
560 
561         #[cfg(feature = "tdx")]
562         let dynamic = !tdx_enabled;
563         #[cfg(not(feature = "tdx"))]
564         let dynamic = true;
565 
566         let device_manager = DeviceManager::new(
567             #[cfg(target_arch = "x86_64")]
568             io_bus,
569             mmio_bus,
570             hypervisor.hypervisor_type(),
571             vm.clone(),
572             config.clone(),
573             memory_manager.clone(),
574             cpu_manager.clone(),
575             exit_evt.try_clone().map_err(Error::EventFdClone)?,
576             reset_evt,
577             seccomp_action.clone(),
578             numa_nodes.clone(),
579             &activate_evt,
580             force_iommu,
581             boot_id_list,
582             timestamp,
583             snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
584             dynamic,
585         )
586         .map_err(Error::DeviceManager)?;
587 
588         device_manager
589             .lock()
590             .unwrap()
591             .create_devices(
592                 serial_pty,
593                 console_pty,
594                 console_resize_pipe,
595                 original_termios,
596             )
597             .map_err(Error::DeviceManager)?;
598 
599         #[cfg(feature = "tdx")]
600         let kernel = config
601             .lock()
602             .unwrap()
603             .payload
604             .as_ref()
605             .map(|p| p.kernel.as_ref().map(File::open))
606             .unwrap_or_default()
607             .transpose()
608             .map_err(Error::KernelFile)?;
609 
610         let initramfs = config
611             .lock()
612             .unwrap()
613             .payload
614             .as_ref()
615             .map(|p| p.initramfs.as_ref().map(File::open))
616             .unwrap_or_default()
617             .transpose()
618             .map_err(Error::InitramfsFile)?;
619 
620         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
621         let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
622             let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
623             vm_snapshot.clock
624         } else {
625             None
626         };
627 
628         let vm_state = if snapshot.is_some() {
629             VmState::Paused
630         } else {
631             VmState::Created
632         };
633 
634         Ok(Vm {
635             #[cfg(feature = "tdx")]
636             kernel,
637             initramfs,
638             device_manager,
639             config,
640             threads: Vec::with_capacity(1),
641             state: RwLock::new(vm_state),
642             cpu_manager,
643             memory_manager,
644             vm,
645             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
646             saved_clock,
647             numa_nodes,
648             hypervisor,
649             stop_on_boot,
650             load_payload_handle,
651         })
652     }
653 
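    /// Builds the guest NUMA topology from the optional `NumaConfig` list,
    /// resolving memory zones, CPUs, distances and (on x86_64) SGX EPC
    /// sections, and rejecting duplicate or unknown references.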
654     fn create_numa_nodes(
655         configs: Option<Vec<NumaConfig>>,
656         memory_manager: &Arc<Mutex<MemoryManager>>,
657     ) -> Result<NumaNodes> {
658         let mm = memory_manager.lock().unwrap();
659         let mm_zones = mm.memory_zones();
660         let mut numa_nodes = BTreeMap::new();
661 
662         if let Some(configs) = &configs {
663             for config in configs.iter() {
664                 if numa_nodes.contains_key(&config.guest_numa_id) {
665                     error!("Can't define the same NUMA node twice");
666                     return Err(Error::InvalidNumaConfig);
667                 }
668 
669                 let mut node = NumaNode::default();
670 
671                 if let Some(memory_zones) = &config.memory_zones {
672                     for memory_zone in memory_zones.iter() {
673                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
674                             node.memory_regions.extend(mm_zone.regions().clone());
675                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
676                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
677                             }
678                             node.memory_zones.push(memory_zone.clone());
679                         } else {
680                             error!("Unknown memory zone '{}'", memory_zone);
681                             return Err(Error::InvalidNumaConfig);
682                         }
683                     }
684                 }
685 
686                 if let Some(cpus) = &config.cpus {
687                     node.cpus.extend(cpus);
688                 }
689 
690                 if let Some(distances) = &config.distances {
691                     for distance in distances.iter() {
692                         let dest = distance.destination;
693                         let dist = distance.distance;
694 
695                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
696                             error!("Unknown destination NUMA node {}", dest);
697                             return Err(Error::InvalidNumaConfig);
698                         }
699 
700                         if node.distances.contains_key(&dest) {
701                             error!("Destination NUMA node {} has already been set", dest);
702                             return Err(Error::InvalidNumaConfig);
703                         }
704 
705                         node.distances.insert(dest, dist);
706                     }
707                 }
708 
709                 #[cfg(target_arch = "x86_64")]
710                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
711                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
712                         let mm_sections = sgx_epc_region.epc_sections();
713                         for sgx_epc_section in sgx_epc_sections.iter() {
714                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
715                                 node.sgx_epc_sections.push(mm_section.clone());
716                             } else {
717                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
718                                 return Err(Error::InvalidNumaConfig);
719                             }
720                         }
721                     } else {
722                         error!("Missing SGX EPC region");
723                         return Err(Error::InvalidNumaConfig);
724                     }
725                 }
726 
727                 numa_nodes.insert(config.guest_numa_id, node);
728             }
729         }
730 
731         Ok(numa_nodes)
732     }
733 
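    /// Creates a complete `Vm` from a `VmConfig`: instantiates the hypervisor
    /// VM, builds the `MemoryManager` (from scratch or from a snapshot) and
    /// then delegates to `new_from_memory_manager()`.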
734     #[allow(clippy::too_many_arguments)]
735     pub fn new(
736         vm_config: Arc<Mutex<VmConfig>>,
737         exit_evt: EventFd,
738         reset_evt: EventFd,
739         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
740         seccomp_action: &SeccompAction,
741         hypervisor: Arc<dyn hypervisor::Hypervisor>,
742         activate_evt: EventFd,
743         serial_pty: Option<PtyPair>,
744         console_pty: Option<PtyPair>,
745         console_resize_pipe: Option<File>,
746         original_termios: Arc<Mutex<Option<termios>>>,
747         snapshot: Option<Snapshot>,
748         source_url: Option<&str>,
749         prefault: Option<bool>,
750     ) -> Result<Self> {
751         trace_scoped!("Vm::new");
752 
753         let timestamp = Instant::now();
754 
755         #[cfg(feature = "tdx")]
756         let tdx_enabled = if snapshot.is_some() {
757             false
758         } else {
759             vm_config.lock().unwrap().is_tdx_enabled()
760         };
761 
762         let vm = Self::create_hypervisor_vm(
763             &hypervisor,
764             #[cfg(feature = "tdx")]
765             tdx_enabled,
766         )?;
767 
768         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
769 
770         let memory_manager = if let Some(snapshot) =
771             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
772         {
773             MemoryManager::new_from_snapshot(
774                 &snapshot,
775                 vm.clone(),
776                 &vm_config.lock().unwrap().memory.clone(),
777                 source_url,
778                 prefault.unwrap(),
779                 phys_bits,
780             )
781             .map_err(Error::MemoryManager)?
782         } else {
783             #[cfg(target_arch = "x86_64")]
784             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
785 
786             MemoryManager::new(
787                 vm.clone(),
788                 &vm_config.lock().unwrap().memory.clone(),
789                 None,
790                 phys_bits,
791                 #[cfg(feature = "tdx")]
792                 tdx_enabled,
793                 None,
794                 None,
795                 #[cfg(target_arch = "x86_64")]
796                 sgx_epc_config,
797             )
798             .map_err(Error::MemoryManager)?
799         };
800 
801         Vm::new_from_memory_manager(
802             vm_config,
803             memory_manager,
804             vm,
805             exit_evt,
806             reset_evt,
807             #[cfg(feature = "guest_debug")]
808             vm_debug_evt,
809             seccomp_action,
810             hypervisor,
811             activate_evt,
812             timestamp,
813             serial_pty,
814             console_pty,
815             console_resize_pipe,
816             original_termios,
817             snapshot,
818         )
819     }
820 
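    /// Creates the underlying hypervisor VM, requesting the TDX VM type when
    /// TDX is enabled, and applies the x86_64-specific setup (identity map
    /// and TSS addresses, split IRQ chip).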
821     pub fn create_hypervisor_vm(
822         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
823         #[cfg(feature = "tdx")] tdx_enabled: bool,
824     ) -> Result<Arc<dyn hypervisor::Vm>> {
825         hypervisor.check_required_extensions().unwrap();
826 
827         // 0 for KVM_X86_LEGACY_VM
828         // 1 for KVM_X86_TDX_VM
829         #[cfg(feature = "tdx")]
830         let vm = hypervisor
831             .create_vm_with_type(u64::from(tdx_enabled))
832             .unwrap();
833         #[cfg(not(feature = "tdx"))]
834         let vm = hypervisor.create_vm().unwrap();
835 
836         #[cfg(target_arch = "x86_64")]
837         {
838             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
839                 .unwrap();
840             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
841             vm.enable_split_irq().unwrap();
842         }
843 
844         Ok(vm)
845     }
846 
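    /// Copies the initramfs file into guest memory at the architecture's
    /// initramfs load address and returns its placement (address and size).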
847     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
848         let mut initramfs = self.initramfs.as_ref().unwrap();
849         let size: usize = initramfs
850             .seek(SeekFrom::End(0))
851             .map_err(|_| Error::InitramfsLoad)?
852             .try_into()
853             .unwrap();
854         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
855 
856         let address =
857             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
858         let address = GuestAddress(address);
859 
860         guest_mem
861             .read_from(address, &mut initramfs, size)
862             .map_err(|_| Error::InitramfsLoad)?;
863 
864         info!("Initramfs loaded: address = 0x{:x}", address.0);
865         Ok(arch::InitramfsConfig { address, size })
866     }
867 
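    /// Assembles the kernel command line from the payload configuration and,
    /// on aarch64, from the additions requested by the device manager.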
868     pub fn generate_cmdline(
869         payload: &PayloadConfig,
870         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
871     ) -> Result<Cmdline> {
872         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
873         if let Some(s) = payload.cmdline.as_ref() {
874             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
875         }
876 
877         #[cfg(target_arch = "aarch64")]
878         for entry in device_manager.lock().unwrap().cmdline_additions() {
879             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
880         }
881         Ok(cmdline)
882     }
883 
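    /// Copies a UEFI firmware image into the dedicated flash region starting
    /// at `UEFI_START`.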
884     #[cfg(target_arch = "aarch64")]
885     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
886         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
887         let mem = uefi_flash.memory();
888         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
889             .map_err(Error::UefiLoad)?;
890         Ok(())
891     }
892 
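    /// aarch64 boot: loads either a PE kernel at `KERNEL_START` or a firmware
    /// image at `UEFI_START`. A kernel that is not a valid PE image is
    /// retried as a UEFI binary.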
893     #[cfg(target_arch = "aarch64")]
894     fn load_kernel(
895         firmware: Option<File>,
896         kernel: Option<File>,
897         memory_manager: Arc<Mutex<MemoryManager>>,
898     ) -> Result<EntryPoint> {
899         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
900         let mem = guest_memory.memory();
901         let entry_addr = match (firmware, kernel) {
902             (None, Some(mut kernel)) => {
903                 match linux_loader::loader::pe::PE::load(
904                     mem.deref(),
905                     Some(arch::layout::KERNEL_START),
906                     &mut kernel,
907                     None,
908                 ) {
909                     Ok(entry_addr) => entry_addr.kernel_load,
910                     // Try to load the binary as a kernel PE file first.
911                     // If that fails, retry loading it as a UEFI binary.
912                     // As the UEFI binary has no identifying format, it must be the last option to try.
913                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
914                         Self::load_firmware(&kernel, memory_manager)?;
915                         arch::layout::UEFI_START
916                     }
917                     Err(e) => {
918                         return Err(Error::KernelLoad(e));
919                     }
920                 }
921             }
922             (Some(firmware), None) => {
923                 Self::load_firmware(&firmware, memory_manager)?;
924                 arch::layout::UEFI_START
925             }
926             _ => return Err(Error::InvalidPayload),
927         };
928 
929         Ok(EntryPoint { entry_addr })
930     }
931 
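    /// x86_64 boot: loads an ELF kernel (and, if provided, the command line)
    /// into guest memory. Only kernels exposing a PVH entry point are accepted.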
932     #[cfg(target_arch = "x86_64")]
933     fn load_kernel(
934         mut kernel: File,
935         cmdline: Option<Cmdline>,
936         memory_manager: Arc<Mutex<MemoryManager>>,
937     ) -> Result<EntryPoint> {
938         info!("Loading kernel");
939 
940         let mem = {
941             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
942             guest_memory.memory()
943         };
944         let entry_addr = linux_loader::loader::elf::Elf::load(
945             mem.deref(),
946             None,
947             &mut kernel,
948             Some(arch::layout::HIGH_RAM_START),
949         )
950         .map_err(Error::KernelLoad)?;
951 
952         if let Some(cmdline) = cmdline {
953             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
954                 .map_err(Error::LoadCmdLine)?;
955         }
956 
957         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
958             // Use the PVH kernel entry point to boot the guest
959             info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
960             Ok(EntryPoint {
961                 entry_addr: Some(entry_addr),
962             })
963         } else {
964             Err(Error::KernelMissingPvhHeader)
965         }
966     }
967 
968     #[cfg(target_arch = "x86_64")]
969     fn load_payload(
970         payload: &PayloadConfig,
971         memory_manager: Arc<Mutex<MemoryManager>>,
972     ) -> Result<EntryPoint> {
973         trace_scoped!("load_payload");
974         match (
975             &payload.firmware,
976             &payload.kernel,
977             &payload.initramfs,
978             &payload.cmdline,
979         ) {
980             (Some(firmware), None, None, None) => {
981                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
982                 Self::load_kernel(firmware, None, memory_manager)
983             }
984             (None, Some(kernel), _, _) => {
985                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
986                 let cmdline = Self::generate_cmdline(payload)?;
987                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
988             }
989             _ => Err(Error::InvalidPayload),
990         }
991     }
992 
993     #[cfg(target_arch = "aarch64")]
994     fn load_payload(
995         payload: &PayloadConfig,
996         memory_manager: Arc<Mutex<MemoryManager>>,
997     ) -> Result<EntryPoint> {
998         match (&payload.firmware, &payload.kernel) {
999             (Some(firmware), None) => {
1000                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1001                 Self::load_kernel(Some(firmware), None, memory_manager)
1002             }
1003             (None, Some(kernel)) => {
1004                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1005                 Self::load_kernel(None, Some(kernel), memory_manager)
1006             }
1007             _ => Err(Error::InvalidPayload),
1008         }
1009     }
1010 
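    /// Spawns a "payload_loader" thread so the kernel/firmware can be loaded
    /// while the rest of the VM is being constructed. Returns `None` when
    /// there is no payload or when TDX is enabled, since the TDX payload is
    /// loaded through a different path.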
1011     fn load_payload_async(
1012         memory_manager: &Arc<Mutex<MemoryManager>>,
1013         config: &Arc<Mutex<VmConfig>>,
1014     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1015         // The kernel is loaded differently when TDX is enabled.
1016         #[cfg(feature = "tdx")]
1017         if config.lock().unwrap().is_tdx_enabled() {
1018             return Ok(None);
1019         }
1020 
1021         config
1022             .lock()
1023             .unwrap()
1024             .payload
1025             .as_ref()
1026             .map(|payload| {
1027                 let memory_manager = memory_manager.clone();
1028                 let payload = payload.clone();
1029 
1030                 std::thread::Builder::new()
1031                     .name("payload_loader".into())
1032                     .spawn(move || Self::load_payload(&payload, memory_manager))
1033                     .map_err(Error::KernelLoadThreadSpawn)
1034             })
1035             .transpose()
1036     }
1037 
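    /// Finalizes the x86_64 boot environment: loads the initramfs if present
    /// and passes the boot parameters (vCPU count, RSDP address, SGX EPC
    /// region, SMBIOS serial number/UUID/OEM strings) to
    /// `arch::configure_system()`.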
1038     #[cfg(target_arch = "x86_64")]
1039     fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
1040         trace_scoped!("configure_system");
1041         info!("Configuring system");
1042         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1043 
1044         let initramfs_config = match self.initramfs {
1045             Some(_) => Some(self.load_initramfs(&mem)?),
1046             None => None,
1047         };
1048 
1049         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1050         let rsdp_addr = Some(rsdp_addr);
1051         let sgx_epc_region = self
1052             .memory_manager
1053             .lock()
1054             .unwrap()
1055             .sgx_epc_region()
1056             .as_ref()
1057             .cloned();
1058 
1059         let serial_number = self
1060             .config
1061             .lock()
1062             .unwrap()
1063             .platform
1064             .as_ref()
1065             .and_then(|p| p.serial_number.clone());
1066 
1067         let uuid = self
1068             .config
1069             .lock()
1070             .unwrap()
1071             .platform
1072             .as_ref()
1073             .and_then(|p| p.uuid.clone());
1074 
1075         let oem_strings = self
1076             .config
1077             .lock()
1078             .unwrap()
1079             .platform
1080             .as_ref()
1081             .and_then(|p| p.oem_strings.clone());
1082 
1083         let oem_strings = oem_strings
1084             .as_deref()
1085             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1086 
1087         arch::configure_system(
1088             &mem,
1089             arch::layout::CMDLINE_START,
1090             &initramfs_config,
1091             boot_vcpus,
1092             rsdp_addr,
1093             sgx_epc_region,
1094             serial_number.as_deref(),
1095             uuid.as_deref(),
1096             oem_strings.as_deref(),
1097         )
1098         .map_err(Error::ConfigureSystem)?;
1099         Ok(())
1100     }
1101 
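    /// Finalizes the aarch64 boot environment: gathers the command line, vCPU
    /// MPIDRs and topology, device info, PCI space layout, vGIC and PMU state,
    /// and hands them to `arch::configure_system()`.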
1102     #[cfg(target_arch = "aarch64")]
1103     fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
1104         let cmdline = Self::generate_cmdline(
1105             self.config.lock().unwrap().payload.as_ref().unwrap(),
1106             &self.device_manager,
1107         )?;
1108         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1109         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1110         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1111         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1112         let initramfs_config = match self.initramfs {
1113             Some(_) => Some(self.load_initramfs(&mem)?),
1114             None => None,
1115         };
1116 
1117         let device_info = &self
1118             .device_manager
1119             .lock()
1120             .unwrap()
1121             .get_device_info()
1122             .clone();
1123 
1124         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1125             let pci_space = PciSpaceInfo {
1126                 pci_segment_id: pci_segment.id,
1127                 mmio_config_address: pci_segment.mmio_config_address,
1128                 pci_device_space_start: pci_segment.start_of_device_area,
1129                 pci_device_space_size: pci_segment.end_of_device_area
1130                     - pci_segment.start_of_device_area
1131                     + 1,
1132             };
1133             pci_space_info.push(pci_space);
1134         }
1135 
1136         let virtio_iommu_bdf = self
1137             .device_manager
1138             .lock()
1139             .unwrap()
1140             .iommu_attached_devices()
1141             .as_ref()
1142             .map(|(v, _)| *v);
1143 
1144         let vgic = self
1145             .device_manager
1146             .lock()
1147             .unwrap()
1148             .get_interrupt_controller()
1149             .unwrap()
1150             .lock()
1151             .unwrap()
1152             .get_vgic()
1153             .map_err(|_| {
1154                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1155                     arch::aarch64::Error::SetupGic,
1156                 ))
1157             })?;
1158 
1159         // The PMU interrupt is a PPI, so 16 must be added to it to get the real IRQ number.
1160         let pmu_supported = self
1161             .cpu_manager
1162             .lock()
1163             .unwrap()
1164             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1165             .map_err(|_| {
1166                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1167                     arch::aarch64::Error::VcpuInitPmu,
1168                 ))
1169             })?;
1170 
1171         arch::configure_system(
1172             &mem,
1173             cmdline.as_cstring().unwrap().to_str().unwrap(),
1174             vcpu_mpidrs,
1175             vcpu_topology,
1176             device_info,
1177             &initramfs_config,
1178             &pci_space_info,
1179             virtio_iommu_bdf.map(|bdf| bdf.into()),
1180             &vgic,
1181             &self.numa_nodes,
1182             pmu_supported,
1183         )
1184         .map_err(Error::ConfigureSystem)?;
1185 
1186         Ok(())
1187     }
1188 
1189     pub fn serial_pty(&self) -> Option<PtyPair> {
1190         self.device_manager.lock().unwrap().serial_pty()
1191     }
1192 
1193     pub fn console_pty(&self) -> Option<PtyPair> {
1194         self.device_manager.lock().unwrap().console_pty()
1195     }
1196 
1197     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1198         self.device_manager.lock().unwrap().console_resize_pipe()
1199     }
1200 
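    /// Shuts the VM down: resumes the device manager so its threads can
    /// terminate cleanly, stops the vCPUs and joins all remaining VM threads.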
1201     pub fn shutdown(&mut self) -> Result<()> {
1202         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1203         let new_state = VmState::Shutdown;
1204 
1205         state.valid_transition(new_state)?;
1206 
1207         // Wake up the DeviceManager threads so they will get terminated cleanly
1208         self.device_manager
1209             .lock()
1210             .unwrap()
1211             .resume()
1212             .map_err(Error::Resume)?;
1213 
1214         self.cpu_manager
1215             .lock()
1216             .unwrap()
1217             .shutdown()
1218             .map_err(Error::CpuManager)?;
1219 
1220         // Wait for all the threads to finish
1221         for thread in self.threads.drain(..) {
1222             thread.join().map_err(Error::ThreadCleanup)?
1223         }
1224         *state = new_state;
1225 
1226         event!("vm", "shutdown");
1227 
1228         Ok(())
1229     }
1230 
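    /// Hotplug entry point for vCPUs, RAM and the balloon: applies each
    /// requested change, notifies the guest over ACPI where needed, and
    /// persists the new values into the `VmConfig` so they survive a reboot.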
1231     pub fn resize(
1232         &mut self,
1233         desired_vcpus: Option<u8>,
1234         desired_memory: Option<u64>,
1235         desired_balloon: Option<u64>,
1236     ) -> Result<()> {
1237         event!("vm", "resizing");
1238 
1239         if let Some(desired_vcpus) = desired_vcpus {
1240             if self
1241                 .cpu_manager
1242                 .lock()
1243                 .unwrap()
1244                 .resize(desired_vcpus)
1245                 .map_err(Error::CpuManager)?
1246             {
1247                 self.device_manager
1248                     .lock()
1249                     .unwrap()
1250                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1251                     .map_err(Error::DeviceManager)?;
1252             }
1253             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1254         }
1255 
1256         if let Some(desired_memory) = desired_memory {
1257             let new_region = self
1258                 .memory_manager
1259                 .lock()
1260                 .unwrap()
1261                 .resize(desired_memory)
1262                 .map_err(Error::MemoryManager)?;
1263 
1264             let memory_config = &mut self.config.lock().unwrap().memory;
1265 
1266             if let Some(new_region) = &new_region {
1267                 self.device_manager
1268                     .lock()
1269                     .unwrap()
1270                     .update_memory(new_region)
1271                     .map_err(Error::DeviceManager)?;
1272 
1273                 match memory_config.hotplug_method {
1274                     HotplugMethod::Acpi => {
1275                         self.device_manager
1276                             .lock()
1277                             .unwrap()
1278                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1279                             .map_err(Error::DeviceManager)?;
1280                     }
1281                     HotplugMethod::VirtioMem => {}
1282                 }
1283             }
1284 
1285             // We update the VM config regardless of the actual guest resize
1286             // operation result (happened or not), so that if the VM reboots
1287             // it will be running with the last configured memory size.
1288             match memory_config.hotplug_method {
1289                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1290                 HotplugMethod::VirtioMem => {
1291                     if desired_memory > memory_config.size {
1292                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1293                     } else {
1294                         memory_config.hotplugged_size = None;
1295                     }
1296                 }
1297             }
1298         }
1299 
1300         if let Some(desired_balloon) = desired_balloon {
1301             self.device_manager
1302                 .lock()
1303                 .unwrap()
1304                 .resize_balloon(desired_balloon)
1305                 .map_err(Error::DeviceManager)?;
1306 
1307             // Update the configuration value for the balloon size to ensure
1308             // a reboot will use the right value.
1309             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1310                 balloon_config.size = desired_balloon;
1311             }
1312         }
1313 
1314         event!("vm", "resized");
1315 
1316         Ok(())
1317     }
1318 
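    /// Resizes the memory zone identified by `id`. Requests below the zone's
    /// boot size are rejected; otherwise the zone's `hotplugged_size` is
    /// updated in the config so the VM keeps the new size across reboots.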
1319     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1320         let memory_config = &mut self.config.lock().unwrap().memory;
1321 
1322         if let Some(zones) = &mut memory_config.zones {
1323             for zone in zones.iter_mut() {
1324                 if zone.id == id {
1325                     if desired_memory >= zone.size {
1326                         let hotplugged_size = desired_memory - zone.size;
1327                         self.memory_manager
1328                             .lock()
1329                             .unwrap()
1330                             .resize_zone(&id, desired_memory - zone.size)
1331                             .map_err(Error::MemoryManager)?;
1332                         // We update the memory zone config regardless of the
1333                         // actual 'resize-zone' operation result (happened or
1334                         // not), so that if the VM reboots it will be running
1335                         // with the last configured memory zone size.
1336                         zone.hotplugged_size = Some(hotplugged_size);
1337 
1338                         return Ok(());
1339                     } else {
1340                         error!(
1341                             "Invalid to request less ({}) than boot RAM ({}) for \
1342                             this memory zone",
1343                             desired_memory, zone.size,
1344                         );
1345                         return Err(Error::ResizeZone);
1346                     }
1347                 }
1348             }
1349         }
1350 
1351         error!("Could not find the memory zone {} for the resize", id);
1352         Err(Error::ResizeZone)
1353     }
1354 
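    /// Hotplugs a PCI device described by `device_cfg`, records it in the
    /// `VmConfig` and notifies the guest through ACPI. The other `add_*`
    /// methods below follow the same pattern for their respective device types.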
1355     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1356         let pci_device_info = self
1357             .device_manager
1358             .lock()
1359             .unwrap()
1360             .add_device(&mut device_cfg)
1361             .map_err(Error::DeviceManager)?;
1362 
1363         // Update VmConfig by adding the new device. This is important to
1364         // ensure the device will be created in case of a reboot.
1365         {
1366             let mut config = self.config.lock().unwrap();
1367             add_to_config(&mut config.devices, device_cfg);
1368         }
1369 
1370         self.device_manager
1371             .lock()
1372             .unwrap()
1373             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1374             .map_err(Error::DeviceManager)?;
1375 
1376         Ok(pci_device_info)
1377     }
1378 
1379     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1380         let pci_device_info = self
1381             .device_manager
1382             .lock()
1383             .unwrap()
1384             .add_user_device(&mut device_cfg)
1385             .map_err(Error::DeviceManager)?;
1386 
1387         // Update VmConfig by adding the new device. This is important to
1388         // ensure the device will be created in case of a reboot.
1389         {
1390             let mut config = self.config.lock().unwrap();
1391             add_to_config(&mut config.user_devices, device_cfg);
1392         }
1393 
1394         self.device_manager
1395             .lock()
1396             .unwrap()
1397             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1398             .map_err(Error::DeviceManager)?;
1399 
1400         Ok(pci_device_info)
1401     }
1402 
1403     pub fn remove_device(&mut self, id: String) -> Result<()> {
1404         self.device_manager
1405             .lock()
1406             .unwrap()
1407             .remove_device(id.clone())
1408             .map_err(Error::DeviceManager)?;
1409 
1410         // Update VmConfig by removing the device. This is important to
1411         // ensure the device will not be created in case of a reboot.
1412         self.config.lock().unwrap().remove_device(&id);
1413 
1414         self.device_manager
1415             .lock()
1416             .unwrap()
1417             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1418             .map_err(Error::DeviceManager)?;
1419         Ok(())
1420     }
1421 
1422     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1423         let pci_device_info = self
1424             .device_manager
1425             .lock()
1426             .unwrap()
1427             .add_disk(&mut disk_cfg)
1428             .map_err(Error::DeviceManager)?;
1429 
1430         // Update VmConfig by adding the new device. This is important to
1431         // ensure the device will be created in case of a reboot.
1432         {
1433             let mut config = self.config.lock().unwrap();
1434             add_to_config(&mut config.disks, disk_cfg);
1435         }
1436 
1437         self.device_manager
1438             .lock()
1439             .unwrap()
1440             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1441             .map_err(Error::DeviceManager)?;
1442 
1443         Ok(pci_device_info)
1444     }
1445 
1446     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1447         let pci_device_info = self
1448             .device_manager
1449             .lock()
1450             .unwrap()
1451             .add_fs(&mut fs_cfg)
1452             .map_err(Error::DeviceManager)?;
1453 
1454         // Update VmConfig by adding the new device. This is important to
1455         // ensure the device will be created in case of a reboot.
1456         {
1457             let mut config = self.config.lock().unwrap();
1458             add_to_config(&mut config.fs, fs_cfg);
1459         }
1460 
1461         self.device_manager
1462             .lock()
1463             .unwrap()
1464             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1465             .map_err(Error::DeviceManager)?;
1466 
1467         Ok(pci_device_info)
1468     }
1469 
1470     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1471         let pci_device_info = self
1472             .device_manager
1473             .lock()
1474             .unwrap()
1475             .add_pmem(&mut pmem_cfg)
1476             .map_err(Error::DeviceManager)?;
1477 
1478         // Update VmConfig by adding the new device. This is important to
1479         // ensure the device will be created in case of a reboot.
1480         {
1481             let mut config = self.config.lock().unwrap();
1482             add_to_config(&mut config.pmem, pmem_cfg);
1483         }
1484 
1485         self.device_manager
1486             .lock()
1487             .unwrap()
1488             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1489             .map_err(Error::DeviceManager)?;
1490 
1491         Ok(pci_device_info)
1492     }
1493 
1494     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1495         let pci_device_info = self
1496             .device_manager
1497             .lock()
1498             .unwrap()
1499             .add_net(&mut net_cfg)
1500             .map_err(Error::DeviceManager)?;
1501 
1502         // Update VmConfig by adding the new device. This is important to
1503         // ensure the device will be created in case of a reboot.
1504         {
1505             let mut config = self.config.lock().unwrap();
1506             add_to_config(&mut config.net, net_cfg);
1507         }
1508 
1509         self.device_manager
1510             .lock()
1511             .unwrap()
1512             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1513             .map_err(Error::DeviceManager)?;
1514 
1515         Ok(pci_device_info)
1516     }
1517 
1518     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1519         let pci_device_info = self
1520             .device_manager
1521             .lock()
1522             .unwrap()
1523             .add_vdpa(&mut vdpa_cfg)
1524             .map_err(Error::DeviceManager)?;
1525 
1526         // Update VmConfig by adding the new device. This is important to
1527         // ensure the device will be created in case of a reboot.
1528         {
1529             let mut config = self.config.lock().unwrap();
1530             add_to_config(&mut config.vdpa, vdpa_cfg);
1531         }
1532 
1533         self.device_manager
1534             .lock()
1535             .unwrap()
1536             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1537             .map_err(Error::DeviceManager)?;
1538 
1539         Ok(pci_device_info)
1540     }
1541 
1542     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1543         let pci_device_info = self
1544             .device_manager
1545             .lock()
1546             .unwrap()
1547             .add_vsock(&mut vsock_cfg)
1548             .map_err(Error::DeviceManager)?;
1549 
1550         // Update VmConfig by adding the new device. This is important to
1551         // ensure the device will be created in case of a reboot.
1552         {
1553             let mut config = self.config.lock().unwrap();
1554             config.vsock = Some(vsock_cfg);
1555         }
1556 
1557         self.device_manager
1558             .lock()
1559             .unwrap()
1560             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1561             .map_err(Error::DeviceManager)?;
1562 
1563         Ok(pci_device_info)
1564     }
1565 
1566     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1567         Ok(self.device_manager.lock().unwrap().counters())
1568     }
1569 
1570     #[cfg(feature = "tdx")]
1571     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1572         use arch::x86_64::tdx::*;
1573 
1574         let firmware_path = self
1575             .config
1576             .lock()
1577             .unwrap()
1578             .payload
1579             .as_ref()
1580             .unwrap()
1581             .firmware
1582             .clone()
1583             .ok_or(Error::TdxFirmwareMissing)?;
1584         // The TDVF file contains a table of sections as well as code
1585         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1586 
1587         // Parse the TDVF sections; the RAM backing them is allocated later when they get populated
1588         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1589     }
1590 
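         // Builds the list of memory resources to describe in the TD HOB by
         // interleaving the guest RAM regions with the TDVF sections. The
         // sections are expected sorted by descending address so they can be
         // consumed with pop(); each entry of the returned list is a
         // (start, size, is_ram) tuple.
         //
         // Illustrative sketch (mirroring case 1 of test_hob_memory_resources):
         // a single RAM region [0, 0x1000_0000) with sections at 0x1000..0x5000
         // and 0xc000..0xd000 yields RAM up to 0x1000, the first section, RAM
         // up to 0xc000, the second section, and then the remaining RAM.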
1591     #[cfg(feature = "tdx")]
1592     fn hob_memory_resources(
1593         mut sorted_sections: Vec<TdvfSection>,
1594         guest_memory: &GuestMemoryMmap,
1595     ) -> Vec<(u64, u64, bool)> {
1596         let mut list = Vec::new();
1597 
1598         let mut current_section = sorted_sections.pop();
1599 
1600         // RAM regions interleaved with TDVF sections
1601         let mut next_start_addr = 0;
1602         for region in guest_memory.iter() {
1603             let region_start = region.start_addr().0;
1604             let region_end = region.last_addr().0;
1605             if region_start > next_start_addr {
1606                 next_start_addr = region_start;
1607             }
1608 
1609             loop {
1610                 let (start, size, ram) = if let Some(section) = &current_section {
1611                     if section.address <= next_start_addr {
1612                         (section.address, section.size, false)
1613                     } else {
1614                         let last_addr = std::cmp::min(section.address - 1, region_end);
1615                         (next_start_addr, last_addr - next_start_addr + 1, true)
1616                     }
1617                 } else {
1618                     (next_start_addr, region_end - next_start_addr + 1, true)
1619                 };
1620 
1621                 list.push((start, size, ram));
1622 
1623                 if !ram {
1624                     current_section = sorted_sections.pop();
1625                 }
1626 
1627                 next_start_addr = start + size;
1628 
1629                 if region_start > next_start_addr {
1630                     next_start_addr = region_start;
1631                 }
1632 
1633                 if next_start_addr > region_end {
1634                     break;
1635                 }
1636             }
1637         }
1638 
1639         // Once all the interleaved sections have been processed, let's simply
1640         // pull the remaining ones.
1641         if let Some(section) = current_section {
1642             list.push((section.address, section.size, false));
1643         }
1644         while let Some(section) = sorted_sections.pop() {
1645             list.push((section.address, section.size, false));
1646         }
1647 
1648         list
1649     }
1650 
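         // Allocates guest RAM for the TDVF sections that fall outside of the
         // configured guest memory, copies the firmware volumes, the optional
         // payload and its command line into the guest, and finally generates
         // the TD HOB describing memory resources, MMIO ranges, ACPI tables
         // and payload information. Returns the guest address of the HOB, if
         // one was found.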
1651     #[cfg(feature = "tdx")]
1652     fn populate_tdx_sections(
1653         &mut self,
1654         sections: &[TdvfSection],
1655         guid_found: bool,
1656     ) -> Result<Option<u64>> {
1657         use arch::x86_64::tdx::*;
1658         // Get the memory end *before* we start adding TDVF ram regions
1659         let boot_guest_memory = self
1660             .memory_manager
1661             .lock()
1662             .as_ref()
1663             .unwrap()
1664             .boot_guest_memory();
1665         for section in sections {
1666             // No need to allocate if the section falls within guest RAM ranges
1667             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1668                 info!(
1669                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1670                     section
1671                 );
1672                 continue;
1673             }
1674 
1675             info!("Allocating TDVF Section: {:x?}", section);
1676             self.memory_manager
1677                 .lock()
1678                 .unwrap()
1679                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1680                 .map_err(Error::AllocatingTdvfMemory)?;
1681         }
1682 
1683         // The TDVF file contains a table of sections as well as code
1684         let firmware_path = self
1685             .config
1686             .lock()
1687             .unwrap()
1688             .payload
1689             .as_ref()
1690             .unwrap()
1691             .firmware
1692             .clone()
1693             .ok_or(Error::TdxFirmwareMissing)?;
1694         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1695 
1696         // The guest memory now has all the required regions, so it is safe to
1697         // copy from the TDVF file into it.
1698         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1699         let mem = guest_memory.memory();
1700         let mut payload_info = None;
1701         let mut hob_offset = None;
1702         for section in sections {
1703             info!("Populating TDVF Section: {:x?}", section);
1704             match section.r#type {
1705                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1706                     info!("Copying section to guest memory");
1707                     firmware_file
1708                         .seek(SeekFrom::Start(section.data_offset as u64))
1709                         .map_err(Error::LoadTdvf)?;
1710                     mem.read_from(
1711                         GuestAddress(section.address),
1712                         &mut firmware_file,
1713                         section.data_size as usize,
1714                     )
1715                     .unwrap();
1716                 }
1717                 TdvfSectionType::TdHob => {
1718                     hob_offset = Some(section.address);
1719                 }
1720                 TdvfSectionType::Payload => {
1721                     info!("Copying payload to guest memory");
1722                     if let Some(payload_file) = self.kernel.as_mut() {
1723                         let payload_size = payload_file
1724                             .seek(SeekFrom::End(0))
1725                             .map_err(Error::LoadPayload)?;
1726 
1727                         payload_file
1728                             .seek(SeekFrom::Start(0x1f1))
1729                             .map_err(Error::LoadPayload)?;
1730 
1731                         let mut payload_header = linux_loader::bootparam::setup_header::default();
1732                         payload_header
1733                             .as_bytes()
1734                             .read_from(
1735                                 0,
1736                                 payload_file,
1737                                 mem::size_of::<linux_loader::bootparam::setup_header>(),
1738                             )
1739                             .unwrap();
1740 
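                             // The setup header read above starts at offset
                             // 0x1f1 of the bzImage; its `header` field must
                             // contain the Linux boot protocol magic "HdrS"
                             // (0x5372_6448).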
1741                         if payload_header.header != 0x5372_6448 {
1742                             return Err(Error::InvalidPayloadType);
1743                         }
1744 
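                             // Require boot protocol version 2.00+ and the
                             // LOADED_HIGH flag (bit 0 of loadflags), i.e. a
                             // kernel whose protected-mode code loads at
                             // 0x100000.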
1745                         if (payload_header.version < 0x0200)
1746                             || ((payload_header.loadflags & 0x1) == 0x0)
1747                         {
1748                             return Err(Error::InvalidPayloadType);
1749                         }
1750 
1751                         payload_file.rewind().map_err(Error::LoadPayload)?;
1752                         mem.read_from(
1753                             GuestAddress(section.address),
1754                             payload_file,
1755                             payload_size as usize,
1756                         )
1757                         .unwrap();
1758 
1759                         // Create the payload info that will be inserted into
1760                         // the HOB.
1761                         payload_info = Some(PayloadInfo {
1762                             image_type: PayloadImageType::BzImage,
1763                             entry_point: section.address,
1764                         });
1765                     }
1766                 }
1767                 TdvfSectionType::PayloadParam => {
1768                     info!("Copying payload parameters to guest memory");
1769                     let cmdline = Self::generate_cmdline(
1770                         self.config.lock().unwrap().payload.as_ref().unwrap(),
1771                     )?;
1772                     mem.write_slice(
1773                         cmdline.as_cstring().unwrap().as_bytes_with_nul(),
1774                         GuestAddress(section.address),
1775                     )
1776                     .unwrap();
1777                 }
1778                 _ => {}
1779             }
1780         }
1781 
1782         // Generate HOB
1783         let mut hob = TdHob::start(hob_offset.unwrap());
1784 
1785         let mut sorted_sections = sections.to_vec();
1786         sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));
1787 
1788         sorted_sections.sort_by_key(|section| section.address);
1789         sorted_sections.reverse();
1790 
1791         for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
1792             hob.add_memory_resource(&mem, start, size, ram, guid_found)
1793                 .map_err(Error::PopulateHob)?;
1794         }
1795 
1796         // MMIO regions
1797         hob.add_mmio_resource(
1798             &mem,
1799             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1800             arch::layout::APIC_START.raw_value()
1801                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1802         )
1803         .map_err(Error::PopulateHob)?;
1804         let start_of_device_area = self
1805             .memory_manager
1806             .lock()
1807             .unwrap()
1808             .start_of_device_area()
1809             .raw_value();
1810         let end_of_device_area = self
1811             .memory_manager
1812             .lock()
1813             .unwrap()
1814             .end_of_device_area()
1815             .raw_value();
1816         hob.add_mmio_resource(
1817             &mem,
1818             start_of_device_area,
1819             end_of_device_area - start_of_device_area,
1820         )
1821         .map_err(Error::PopulateHob)?;
1822 
1823         // Loop over the ACPI tables and copy them to the HOB.
1824 
1825         for acpi_table in crate::acpi::create_acpi_tables_tdx(
1826             &self.device_manager,
1827             &self.cpu_manager,
1828             &self.memory_manager,
1829             &self.numa_nodes,
1830         ) {
1831             hob.add_acpi_table(&mem, acpi_table.as_slice())
1832                 .map_err(Error::PopulateHob)?;
1833         }
1834 
1835         // If a payload info has been created, let's insert it into the HOB.
1836         if let Some(payload_info) = payload_info {
1837             hob.add_payload(&mem, payload_info)
1838                 .map_err(Error::PopulateHob)?;
1839         }
1840 
1841         hob.finish(&mem).map_err(Error::PopulateHob)?;
1842 
1843         Ok(hob_offset)
1844     }
1845 
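         // Registers every TDVF section with the hypervisor so the backing
         // host memory is added to the TD. Sections with attribute 1
         // (TDVF_SECTION_ATTRIBUTES_EXTENDMR) are also extended into the TD
         // measurement.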
1846     #[cfg(feature = "tdx")]
1847     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1848         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1849         let mem = guest_memory.memory();
1850 
1851         for section in sections {
1852             self.vm
1853                 .tdx_init_memory_region(
1854                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
1855                     section.address,
1856                     section.size,
1857                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
1858                     section.attributes == 1,
1859                 )
1860                 .map_err(Error::InitializeTdxMemoryRegion)?;
1861         }
1862 
1863         Ok(())
1864     }
1865 
1866     // Creates ACPI tables
1867     // When TDX is enabled, this is a no-op since the tables will be
1868     // created and passed when populating the HOB.
1869 
1870     fn create_acpi_tables(&self) -> Option<GuestAddress> {
1871         #[cfg(feature = "tdx")]
1872         if self.config.lock().unwrap().is_tdx_enabled() {
1873             return None;
1874         }
1875         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
1876         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
1877         let rsdp_addr = crate::acpi::create_acpi_tables(
1878             &mem,
1879             &self.device_manager,
1880             &self.cpu_manager,
1881             &self.memory_manager,
1882             &self.numa_nodes,
1883             tpm_enabled,
1884         );
1885         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
1886 
1887         Some(rsdp_addr)
1888     }
1889 
1890     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
1891         trace_scoped!("entry_point");
1892 
1893         self.load_payload_handle
1894             .take()
1895             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
1896             .transpose()
1897     }
1898 
1899     pub fn boot(&mut self) -> Result<()> {
1900         trace_scoped!("Vm::boot");
1901         info!("Booting VM");
1902         event!("vm", "booting");
1903         let current_state = self.get_state()?;
1904         if current_state == VmState::Paused {
1905             return self.resume().map_err(Error::Resume);
1906         }
1907 
1908         let new_state = if self.stop_on_boot {
1909             VmState::BreakPoint
1910         } else {
1911             VmState::Running
1912         };
1913         current_state.valid_transition(new_state)?;
1914 
1915         // Done early to parallelise with loading the kernel
1916         #[cfg(target_arch = "x86_64")]
1917         let rsdp_addr = self.create_acpi_tables();
1918 
1919         // Load the kernel synchronously, or if it is being loaded asynchronously,
1920         // wait for the load to finish.
1921         let entry_point = self.entry_point()?;
1922 
1923         #[cfg(feature = "tdx")]
1924         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
1925 
1926         // Configure the vcpus that have been created
1927         let vcpus = self.cpu_manager.lock().unwrap().vcpus();
1928         for vcpu in vcpus {
1929             let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
1930             let boot_setup = entry_point.map(|e| (e, guest_memory));
1931             self.cpu_manager
1932                 .lock()
1933                 .unwrap()
1934                 .configure_vcpu(vcpu, boot_setup)
1935                 .map_err(Error::CpuManager)?;
1936         }
1937 
1938         #[cfg(feature = "tdx")]
1939         let (sections, guid_found) = if tdx_enabled {
1940             self.extract_tdvf_sections()?
1941         } else {
1942             (Vec::new(), false)
1943         };
1944 
1945         // Configuring the TDX regions requires that the vCPUs are created.
1946         #[cfg(feature = "tdx")]
1947         let hob_address = if tdx_enabled {
1948             // TDX sections are written to memory.
1949             self.populate_tdx_sections(&sections, guid_found)?
1950         } else {
1951             None
1952         };
1953 
1954         // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
1955         // available once the vCPUs have been configured
1956         #[cfg(target_arch = "aarch64")]
1957         let rsdp_addr = self.create_acpi_tables();
1958 
1959         // Configure shared state based on loaded kernel
1960         entry_point
1961             .map(|_| {
1962                 // Safe to unwrap rsdp_addr as we know it can't be None when
1963                 // the entry_point is Some.
1964                 self.configure_system(rsdp_addr.unwrap())
1965             })
1966             .transpose()?;
1967 
1968         #[cfg(target_arch = "x86_64")]
1969         // Note: For x86, always call this function before starting the boot vCPUs.
1970         // Otherwise the guest would fail to boot because we haven't created the
1971         // userspace mappings to update the hypervisor about the memory mappings.
1972         // These mappings must be created before we start the vCPU threads for
1973         // the very first time.
1974         self.memory_manager
1975             .lock()
1976             .unwrap()
1977             .allocate_address_space()
1978             .map_err(Error::MemoryManager)?;
1979 
1980         #[cfg(feature = "tdx")]
1981         if let Some(hob_address) = hob_address {
1982             // With the HOB address extracted the vCPUs can have
1983             // their TDX state configured.
1984             self.cpu_manager
1985                 .lock()
1986                 .unwrap()
1987                 .initialize_tdx(hob_address)
1988                 .map_err(Error::CpuManager)?;
1989             // Let the hypervisor know which memory ranges are shared with the
1990             // guest. This prevents the guest from ignoring/discarding memory
1991             // regions provided by the host.
1992             self.init_tdx_memory(&sections)?;
1993             // With TDX memory and CPU state configured TDX setup is complete
1994             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
1995         }
1996 
1997         self.cpu_manager
1998             .lock()
1999             .unwrap()
2000             .start_boot_vcpus(new_state == VmState::BreakPoint)
2001             .map_err(Error::CpuManager)?;
2002 
2003         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
2004         *state = new_state;
2005         event!("vm", "booted");
2006         Ok(())
2007     }
2008 
2009     pub fn restore(&mut self) -> Result<()> {
2010         event!("vm", "restoring");
2011 
2012         #[cfg(target_arch = "x86_64")]
2013         // Note: For x86, always call this function before starting the boot vCPUs.
2014         // Otherwise the guest would fail to boot because we haven't created the
2015         // userspace mappings to update the hypervisor about the memory mappings.
2016         // These mappings must be created before we start the vCPU threads for
2017         // the very first time for the restored VM.
2018         self.memory_manager
2019             .lock()
2020             .unwrap()
2021             .allocate_address_space()
2022             .map_err(Error::MemoryManager)?;
2023 
2024         // Now we can start all vCPUs from here.
2025         self.cpu_manager
2026             .lock()
2027             .unwrap()
2028             .start_restored_vcpus()
2029             .map_err(Error::CpuManager)?;
2030 
2031         event!("vm", "restored");
2032         Ok(())
2033     }
2034 
2035     /// Gets a thread-safe reference counted pointer to the VM configuration.
2036     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2037         Arc::clone(&self.config)
2038     }
2039 
2040     /// Get the VM state. Returns an error if the state is poisoned.
2041     pub fn get_state(&self) -> Result<VmState> {
2042         self.state
2043             .try_read()
2044             .map_err(|_| Error::PoisonedState)
2045             .map(|state| *state)
2046     }
2047 
2048     /// Gets the actual size of the balloon.
2049     pub fn balloon_size(&self) -> u64 {
2050         self.device_manager.lock().unwrap().balloon_size()
2051     }
2052 
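         /// Sends the guest memory slot file descriptors to the migration
         /// destination: for each slot, a memory fd request is written to the
         /// socket, the slot id is then sent together with its fd over the
         /// same socket, and the peer's acknowledgement is checked before
         /// moving on to the next slot.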
2053     pub fn send_memory_fds(
2054         &mut self,
2055         socket: &mut UnixStream,
2056     ) -> std::result::Result<(), MigratableError> {
2057         for (slot, fd) in self
2058             .memory_manager
2059             .lock()
2060             .unwrap()
2061             .memory_slot_fds()
2062             .drain()
2063         {
2064             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2065                 .write_to(socket)
2066                 .map_err(|e| {
2067                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2068                 })?;
2069             socket
2070                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2071                 .map_err(|e| {
2072                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2073                 })?;
2074 
2075             let res = Response::read_from(socket)?;
2076             if res.status() != Status::Ok {
2077                 warn!("Error during memory fd migration");
2078                 Request::abandon().write_to(socket)?;
2079                 Response::read_from(socket).ok();
2080                 return Err(MigratableError::MigrateSend(anyhow!(
2081                     "Error during memory fd migration"
2082                 )));
2083             }
2084         }
2085 
2086         Ok(())
2087     }
2088 
2089     pub fn send_memory_regions<F>(
2090         &mut self,
2091         ranges: &MemoryRangeTable,
2092         fd: &mut F,
2093     ) -> std::result::Result<(), MigratableError>
2094     where
2095         F: Write,
2096     {
2097         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2098         let mem = guest_memory.memory();
2099 
2100         for range in ranges.regions() {
2101             let mut offset: u64 = 0;
2102             // Here we are manually handling the retry in case we can't write the
2103             // whole region at once, because we can't use the write_all_to()
2104             // implementation from vm-memory::GuestMemory as it does not follow
2105             // the correct behavior. For more info about this issue see:
2106             // https://github.com/rust-vmm/vm-memory/issues/174
2107             loop {
2108                 let bytes_written = mem
2109                     .write_to(
2110                         GuestAddress(range.gpa + offset),
2111                         fd,
2112                         (range.length - offset) as usize,
2113                     )
2114                     .map_err(|e| {
2115                         MigratableError::MigrateSend(anyhow!(
2116                             "Error transferring memory to socket: {}",
2117                             e
2118                         ))
2119                     })?;
2120                 offset += bytes_written as u64;
2121 
2122                 if offset == range.length {
2123                     break;
2124                 }
2125             }
2126         }
2127 
2128         Ok(())
2129     }
2130 
2131     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2132         self.memory_manager
2133             .lock()
2134             .unwrap()
2135             .memory_range_table(false)
2136     }
2137 
2138     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2139         self.device_manager.lock().unwrap().device_tree()
2140     }
2141 
2142     pub fn activate_virtio_devices(&self) -> Result<()> {
2143         self.device_manager
2144             .lock()
2145             .unwrap()
2146             .activate_virtio_devices()
2147             .map_err(Error::ActivateVirtioDevices)
2148     }
2149 
2150     #[cfg(target_arch = "x86_64")]
2151     pub fn power_button(&self) -> Result<()> {
2152         self
2153             .device_manager
2154             .lock()
2155             .unwrap()
2156             .notify_power_button()
2157             .map_err(Error::PowerButton)
2158     }
2159 
2160     #[cfg(target_arch = "aarch64")]
2161     pub fn power_button(&self) -> Result<()> {
2162         self.device_manager
2163             .lock()
2164             .unwrap()
2165             .notify_power_button()
2166             .map_err(Error::PowerButton)
2167     }
2168 
2169     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2170         self.memory_manager.lock().unwrap().snapshot_data()
2171     }
2172 
2173     #[cfg(feature = "guest_debug")]
2174     pub fn debug_request(
2175         &mut self,
2176         gdb_request: &GdbRequestPayload,
2177         cpu_id: usize,
2178     ) -> Result<GdbResponsePayload> {
2179         use GdbRequestPayload::*;
2180         match gdb_request {
2181             SetSingleStep(single_step) => {
2182                 self.set_guest_debug(cpu_id, &[], *single_step)
2183                     .map_err(Error::Debug)?;
2184             }
2185             SetHwBreakPoint(addrs) => {
2186                 self.set_guest_debug(cpu_id, addrs, false)
2187                     .map_err(Error::Debug)?;
2188             }
2189             Pause => {
2190                 self.debug_pause().map_err(Error::Debug)?;
2191             }
2192             Resume => {
2193                 self.debug_resume().map_err(Error::Debug)?;
2194             }
2195             ReadRegs => {
2196                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2197                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2198             }
2199             WriteRegs(regs) => {
2200                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2201             }
2202             ReadMem(vaddr, len) => {
2203                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2204                 let mem = self
2205                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2206                     .map_err(Error::Debug)?;
2207                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2208             }
2209             WriteMem(vaddr, data) => {
2210                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2211                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2212                     .map_err(Error::Debug)?;
2213             }
2214             ActiveVcpus => {
2215                 let active_vcpus = self.active_vcpus();
2216                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2217             }
2218         }
2219         Ok(GdbResponsePayload::CommandComplete)
2220     }
2221 
2222     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2223     fn get_dump_state(
2224         &mut self,
2225         destination_url: &str,
2226     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2227         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2228         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
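             // Start with one program header for the note segment; one more
             // is added below for each guest RAM mapping.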
2229         let mut elf_phdr_num = 1;
2230         let elf_sh_info = 0;
2231         let coredump_file_path = url_to_file(destination_url)?;
2232         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2233 
2234         if mapping_num < UINT16_MAX - 2 {
2235             elf_phdr_num += mapping_num as u16;
2236         } else {
2237             panic!("mapping num beyond 65535 not supported");
2238         }
2239         let coredump_file = OpenOptions::new()
2240             .read(true)
2241             .write(true)
2242             .create_new(true)
2243             .open(coredump_file_path)
2244             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2245 
2246         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2247         let mem_data = self
2248             .memory_manager
2249             .lock()
2250             .unwrap()
2251             .coredump_memory_regions(mem_offset);
2252 
2253         Ok(DumpState {
2254             elf_note_size,
2255             elf_phdr_num,
2256             elf_sh_info,
2257             mem_offset,
2258             mem_info: Some(mem_data),
2259             file: Some(coredump_file),
2260         })
2261     }
2262 
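         // Offset in the coredump file at which the guest memory contents
         // start: the ELF header, the program header table and the note data
         // are laid out before the memory (PT_LOAD) contents.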
2263     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2264     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2265         size_of::<elf::Elf64_Ehdr>() as u64
2266             + note_size as u64
2267             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2268     }
2269 }
2270 
2271 impl Pausable for Vm {
2272     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2273         event!("vm", "pausing");
2274         let mut state = self
2275             .state
2276             .try_write()
2277             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2278         let new_state = VmState::Paused;
2279 
2280         state
2281             .valid_transition(new_state)
2282             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2283 
2284         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2285         {
2286             let mut clock = self
2287                 .vm
2288                 .get_clock()
2289                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2290             clock.reset_flags();
2291             self.saved_clock = Some(clock);
2292         }
2293 
2294         // Before pausing the vCPUs, activate any virtio devices that requested
2295         // activation since the pause (or e.g. a migration it is part of) was started.
2296         self.activate_virtio_devices().map_err(|e| {
2297             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2298         })?;
2299 
2300         self.cpu_manager.lock().unwrap().pause()?;
2301         self.device_manager.lock().unwrap().pause()?;
2302 
2303         *state = new_state;
2304 
2305         event!("vm", "paused");
2306         Ok(())
2307     }
2308 
2309     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2310         event!("vm", "resuming");
2311         let mut state = self
2312             .state
2313             .try_write()
2314             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2315         let new_state = VmState::Running;
2316 
2317         state
2318             .valid_transition(new_state)
2319             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2320 
2321         self.cpu_manager.lock().unwrap().resume()?;
2322         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2323         {
2324             if let Some(clock) = &self.saved_clock {
2325                 self.vm.set_clock(clock).map_err(|e| {
2326                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2327                 })?;
2328             }
2329         }
2330         self.device_manager.lock().unwrap().resume()?;
2331 
2332         // And we're back to the Running state.
2333         *state = new_state;
2334         event!("vm", "resumed");
2335         Ok(())
2336     }
2337 }
2338 
2339 #[derive(Serialize, Deserialize)]
2340 pub struct VmSnapshot {
2341     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2342     pub clock: Option<hypervisor::ClockData>,
2343     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2344     pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
2345 }
2346 
2347 pub const VM_SNAPSHOT_ID: &str = "vm";
2348 impl Snapshottable for Vm {
2349     fn id(&self) -> String {
2350         VM_SNAPSHOT_ID.to_string()
2351     }
2352 
2353     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2354         event!("vm", "snapshotting");
2355 
2356         #[cfg(feature = "tdx")]
2357         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2358 
2359         #[cfg(feature = "tdx")]
2360         {
2361             if tdx_enabled {
2362                 return Err(MigratableError::Snapshot(anyhow!(
2363                     "Snapshot not possible with TDX VM"
2364                 )));
2365             }
2366         }
2367 
2368         let current_state = self.get_state().unwrap();
2369         if current_state != VmState::Paused {
2370             return Err(MigratableError::Snapshot(anyhow!(
2371                 "Trying to snapshot while VM is running"
2372             )));
2373         }
2374 
2375         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2376         let common_cpuid = {
2377             let phys_bits = physical_bits(
2378                 &self.hypervisor,
2379                 self.config.lock().unwrap().cpus.max_phys_bits,
2380             );
2381             arch::generate_common_cpuid(
2382                 &self.hypervisor,
2383                 None,
2384                 None,
2385                 phys_bits,
2386                 self.config.lock().unwrap().cpus.kvm_hyperv,
2387                 #[cfg(feature = "tdx")]
2388                 tdx_enabled,
2389             )
2390             .map_err(|e| {
2391                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2392             })?
2393         };
2394 
2395         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2396             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2397             clock: self.saved_clock,
2398             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2399             common_cpuid,
2400         })
2401         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2402 
2403         let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data));
2404 
2405         let (id, snapshot) = {
2406             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2407             (cpu_manager.id(), cpu_manager.snapshot()?)
2408         };
2409         vm_snapshot.add_snapshot(id, snapshot);
2410         let (id, snapshot) = {
2411             let mut memory_manager = self.memory_manager.lock().unwrap();
2412             (memory_manager.id(), memory_manager.snapshot()?)
2413         };
2414         vm_snapshot.add_snapshot(id, snapshot);
2415         let (id, snapshot) = {
2416             let mut device_manager = self.device_manager.lock().unwrap();
2417             (device_manager.id(), device_manager.snapshot()?)
2418         };
2419         vm_snapshot.add_snapshot(id, snapshot);
2420 
2421         event!("vm", "snapshotted");
2422         Ok(vm_snapshot)
2423     }
2424 }
2425 
2426 impl Transportable for Vm {
2427     fn send(
2428         &self,
2429         snapshot: &Snapshot,
2430         destination_url: &str,
2431     ) -> std::result::Result<(), MigratableError> {
2432         let mut snapshot_config_path = url_to_path(destination_url)?;
2433         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2434 
2435         // Create the snapshot config file
2436         let mut snapshot_config_file = OpenOptions::new()
2437             .read(true)
2438             .write(true)
2439             .create_new(true)
2440             .open(snapshot_config_path)
2441             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2442 
2443         // Serialize and write the snapshot config
2444         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2445             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2446 
2447         snapshot_config_file
2448             .write_all(vm_config.as_bytes())
2449             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2450 
2451         let mut snapshot_state_path = url_to_path(destination_url)?;
2452         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2453 
2454         // Create the snapshot state file
2455         let mut snapshot_state_file = OpenOptions::new()
2456             .read(true)
2457             .write(true)
2458             .create_new(true)
2459             .open(snapshot_state_path)
2460             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2461 
2462         // Serialize and write the snapshot state
2463         let vm_state =
2464             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2465 
2466         snapshot_state_file
2467             .write_all(&vm_state)
2468             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2469 
2470         // Tell the memory manager to also send/write its own snapshot.
2471         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2472             self.memory_manager
2473                 .lock()
2474                 .unwrap()
2475                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2476         } else {
2477             return Err(MigratableError::MigrateSend(anyhow!(
2478                 "Missing memory manager snapshot"
2479             )));
2480         }
2481 
2482         Ok(())
2483     }
2484 }
2485 
2486 impl Migratable for Vm {
2487     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2488         self.memory_manager.lock().unwrap().start_dirty_log()?;
2489         self.device_manager.lock().unwrap().start_dirty_log()
2490     }
2491 
2492     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2493         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2494         self.device_manager.lock().unwrap().stop_dirty_log()
2495     }
2496 
2497     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2498         Ok(MemoryRangeTable::new_from_tables(vec![
2499             self.memory_manager.lock().unwrap().dirty_log()?,
2500             self.device_manager.lock().unwrap().dirty_log()?,
2501         ]))
2502     }
2503 
2504     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2505         self.memory_manager.lock().unwrap().start_migration()?;
2506         self.device_manager.lock().unwrap().start_migration()
2507     }
2508 
2509     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2510         self.memory_manager.lock().unwrap().complete_migration()?;
2511         self.device_manager.lock().unwrap().complete_migration()
2512     }
2513 }
2514 
2515 #[cfg(feature = "guest_debug")]
2516 impl Debuggable for Vm {
2517     fn set_guest_debug(
2518         &self,
2519         cpu_id: usize,
2520         addrs: &[GuestAddress],
2521         singlestep: bool,
2522     ) -> std::result::Result<(), DebuggableError> {
2523         self.cpu_manager
2524             .lock()
2525             .unwrap()
2526             .set_guest_debug(cpu_id, addrs, singlestep)
2527     }
2528 
2529     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2530         if *self.state.read().unwrap() == VmState::Running {
2531             self.pause().map_err(DebuggableError::Pause)?;
2532         }
2533 
2534         let mut state = self
2535             .state
2536             .try_write()
2537             .map_err(|_| DebuggableError::PoisonedState)?;
2538         *state = VmState::BreakPoint;
2539         Ok(())
2540     }
2541 
2542     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2543         if *self.state.read().unwrap() == VmState::BreakPoint {
2544             self.resume().map_err(DebuggableError::Pause)?;
2545         }
2546 
2547         Ok(())
2548     }
2549 
2550     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2551         self.cpu_manager.lock().unwrap().read_regs(cpu_id)
2552     }
2553 
2554     fn write_regs(
2555         &self,
2556         cpu_id: usize,
2557         regs: &CoreRegs,
2558     ) -> std::result::Result<(), DebuggableError> {
2559         self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
2560     }
2561 
2562     fn read_mem(
2563         &self,
2564         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2565         cpu_id: usize,
2566         vaddr: GuestAddress,
2567         len: usize,
2568     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2569         self.cpu_manager
2570             .lock()
2571             .unwrap()
2572             .read_mem(guest_memory, cpu_id, vaddr, len)
2573     }
2574 
2575     fn write_mem(
2576         &self,
2577         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2578         cpu_id: usize,
2579         vaddr: &GuestAddress,
2580         data: &[u8],
2581     ) -> std::result::Result<(), DebuggableError> {
2582         self.cpu_manager
2583             .lock()
2584             .unwrap()
2585             .write_mem(guest_memory, cpu_id, vaddr, data)
2586     }
2587 
2588     fn active_vcpus(&self) -> usize {
2589         let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
2590         if active_vcpus > 0 {
2591             active_vcpus
2592         } else {
2593             // The VM is not booted yet. Report boot_vcpus() instead.
2594             self.cpu_manager.lock().unwrap().boot_vcpus() as usize
2595         }
2596     }
2597 }
2598 
2599 #[cfg(feature = "guest_debug")]
2600 pub const UINT16_MAX: u32 = 65535;
2601 
2602 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2603 impl Elf64Writable for Vm {}
2604 
2605 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2606 impl GuestDebuggable for Vm {
2607     fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
2608         event!("vm", "coredumping");
2609 
2610         let mut resume = false;
2611 
2612         #[cfg(feature = "tdx")]
2613         {
2614             if let Some(ref platform) = self.config.lock().unwrap().platform {
2615                 if platform.tdx {
2616                     return Err(GuestDebuggableError::Coredump(anyhow!(
2617                         "Coredump not possible with TDX VM"
2618                     )));
2619                 }
2620             }
2621         }
2622 
2623         match self.get_state().unwrap() {
2624             VmState::Running => {
2625                 self.pause().map_err(GuestDebuggableError::Pause)?;
2626                 resume = true;
2627             }
2628             VmState::Paused => {}
2629             _ => {
2630                 return Err(GuestDebuggableError::Coredump(anyhow!(
2631                     "Trying to coredump while VM is not running or paused"
2632                 )));
2633             }
2634         }
2635 
2636         let coredump_state = self.get_dump_state(destination_url)?;
2637 
2638         self.write_header(&coredump_state)?;
2639         self.write_note(&coredump_state)?;
2640         self.write_loads(&coredump_state)?;
2641 
2642         self.cpu_manager
2643             .lock()
2644             .unwrap()
2645             .cpu_write_elf64_note(&coredump_state)?;
2646         self.cpu_manager
2647             .lock()
2648             .unwrap()
2649             .cpu_write_vmm_note(&coredump_state)?;
2650 
2651         self.memory_manager
2652             .lock()
2653             .unwrap()
2654             .coredump_iterate_save_mem(&coredump_state)?;
2655 
2656         if resume {
2657             self.resume().map_err(GuestDebuggableError::Resume)?;
2658         }
2659 
2660         Ok(())
2661     }
2662 }
2663 
2664 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2665 #[cfg(test)]
2666 mod tests {
2667     use super::*;
2668 
2669     fn test_vm_state_transitions(state: VmState) {
2670         match state {
2671             VmState::Created => {
2672                 // Check the transitions from Created
2673                 assert!(state.valid_transition(VmState::Created).is_err());
2674                 assert!(state.valid_transition(VmState::Running).is_ok());
2675                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2676                 assert!(state.valid_transition(VmState::Paused).is_ok());
2677                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2678             }
2679             VmState::Running => {
2680                 // Check the transitions from Running
2681                 assert!(state.valid_transition(VmState::Created).is_err());
2682                 assert!(state.valid_transition(VmState::Running).is_err());
2683                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2684                 assert!(state.valid_transition(VmState::Paused).is_ok());
2685                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2686             }
2687             VmState::Shutdown => {
2688                 // Check the transitions from Shutdown
2689                 assert!(state.valid_transition(VmState::Created).is_err());
2690                 assert!(state.valid_transition(VmState::Running).is_ok());
2691                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2692                 assert!(state.valid_transition(VmState::Paused).is_err());
2693                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2694             }
2695             VmState::Paused => {
2696                 // Check the transitions from Paused
2697                 assert!(state.valid_transition(VmState::Created).is_err());
2698                 assert!(state.valid_transition(VmState::Running).is_ok());
2699                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2700                 assert!(state.valid_transition(VmState::Paused).is_err());
2701                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2702             }
2703             VmState::BreakPoint => {
2704                 // Check the transitions from Breakpoint
2705                 assert!(state.valid_transition(VmState::Created).is_ok());
2706                 assert!(state.valid_transition(VmState::Running).is_ok());
2707                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2708                 assert!(state.valid_transition(VmState::Paused).is_err());
2709                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2710             }
2711         }
2712     }
2713 
2714     #[test]
2715     fn test_vm_created_transitions() {
2716         test_vm_state_transitions(VmState::Created);
2717     }
2718 
2719     #[test]
2720     fn test_vm_running_transitions() {
2721         test_vm_state_transitions(VmState::Running);
2722     }
2723 
2724     #[test]
2725     fn test_vm_shutdown_transitions() {
2726         test_vm_state_transitions(VmState::Shutdown);
2727     }
2728 
2729     #[test]
2730     fn test_vm_paused_transitions() {
2731         test_vm_state_transitions(VmState::Paused);
2732     }
2733 
2734     #[cfg(feature = "tdx")]
2735     #[test]
2736     fn test_hob_memory_resources() {
2737         // Case 1: Two TDVF sections in the middle of the RAM
2738         let sections = vec![
2739             TdvfSection {
2740                 address: 0xc000,
2741                 size: 0x1000,
2742                 ..Default::default()
2743             },
2744             TdvfSection {
2745                 address: 0x1000,
2746                 size: 0x4000,
2747                 ..Default::default()
2748             },
2749         ];
2750         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
2751         let expected = vec![
2752             (0, 0x1000, true),
2753             (0x1000, 0x4000, false),
2754             (0x5000, 0x7000, true),
2755             (0xc000, 0x1000, false),
2756             (0xd000, 0x0fff_3000, true),
2757         ];
2758         assert_eq!(
2759             expected,
2760             Vm::hob_memory_resources(
2761                 sections,
2762                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2763             )
2764         );
2765 
2766         // Case 2: Two TDVF sections with no conflict with the RAM
2767         let sections = vec![
2768             TdvfSection {
2769                 address: 0x1000_1000,
2770                 size: 0x1000,
2771                 ..Default::default()
2772             },
2773             TdvfSection {
2774                 address: 0,
2775                 size: 0x1000,
2776                 ..Default::default()
2777             },
2778         ];
2779         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2780         let expected = vec![
2781             (0, 0x1000, false),
2782             (0x1000, 0x1000_0000, true),
2783             (0x1000_1000, 0x1000, false),
2784         ];
2785         assert_eq!(
2786             expected,
2787             Vm::hob_memory_resources(
2788                 sections,
2789                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2790             )
2791         );
2792 
2793         // Case 3: Two TDVF sections with partial conflicts with the RAM
2794         let sections = vec![
2795             TdvfSection {
2796                 address: 0x1000_0000,
2797                 size: 0x2000,
2798                 ..Default::default()
2799             },
2800             TdvfSection {
2801                 address: 0,
2802                 size: 0x2000,
2803                 ..Default::default()
2804             },
2805         ];
2806         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2807         let expected = vec![
2808             (0, 0x2000, false),
2809             (0x2000, 0x0fff_e000, true),
2810             (0x1000_0000, 0x2000, false),
2811         ];
2812         assert_eq!(
2813             expected,
2814             Vm::hob_memory_resources(
2815                 sections,
2816                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2817             )
2818         );
2819 
2820         // Case 4: Two TDVF sections with no conflict before the RAM and two
2821         // more additional sections with no conflict after the RAM.
2822         let sections = vec![
2823             TdvfSection {
2824                 address: 0x2000_1000,
2825                 size: 0x1000,
2826                 ..Default::default()
2827             },
2828             TdvfSection {
2829                 address: 0x2000_0000,
2830                 size: 0x1000,
2831                 ..Default::default()
2832             },
2833             TdvfSection {
2834                 address: 0x1000,
2835                 size: 0x1000,
2836                 ..Default::default()
2837             },
2838             TdvfSection {
2839                 address: 0,
2840                 size: 0x1000,
2841                 ..Default::default()
2842             },
2843         ];
2844         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
2845         let expected = vec![
2846             (0, 0x1000, false),
2847             (0x1000, 0x1000, false),
2848             (0x4000, 0x1000_0000, true),
2849             (0x2000_0000, 0x1000, false),
2850             (0x2000_1000, 0x1000, false),
2851         ];
2852         assert_eq!(
2853             expected,
2854             Vm::hob_memory_resources(
2855                 sections,
2856                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2857             )
2858         );
2859 
2860         // Case 5: One TDVF section overriding the entire RAM
2861         let sections = vec![TdvfSection {
2862             address: 0,
2863             size: 0x2000_0000,
2864             ..Default::default()
2865         }];
2866         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2867         let expected = vec![(0, 0x2000_0000, false)];
2868         assert_eq!(
2869             expected,
2870             Vm::hob_memory_resources(
2871                 sections,
2872                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2873             )
2874         );
2875 
2876         // Case 6: Two TDVF sections with no conflict with 2 RAM regions
2877         let sections = vec![
2878             TdvfSection {
2879                 address: 0x1000_2000,
2880                 size: 0x2000,
2881                 ..Default::default()
2882             },
2883             TdvfSection {
2884                 address: 0,
2885                 size: 0x2000,
2886                 ..Default::default()
2887             },
2888         ];
2889         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
2890             (GuestAddress(0x2000), 0x1000_0000),
2891             (GuestAddress(0x1000_4000), 0x1000_0000),
2892         ];
2893         let expected = vec![
2894             (0, 0x2000, false),
2895             (0x2000, 0x1000_0000, true),
2896             (0x1000_2000, 0x2000, false),
2897             (0x1000_4000, 0x1000_0000, true),
2898         ];
2899         assert_eq!(
2900             expected,
2901             Vm::hob_memory_resources(
2902                 sections,
2903                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2904             )
2905         );
2906 
2907         // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
2908         let sections = vec![
2909             TdvfSection {
2910                 address: 0x1000_0000,
2911                 size: 0x4000,
2912                 ..Default::default()
2913             },
2914             TdvfSection {
2915                 address: 0,
2916                 size: 0x4000,
2917                 ..Default::default()
2918             },
2919         ];
2920         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
2921             (GuestAddress(0x1000), 0x1000_0000),
2922             (GuestAddress(0x1000_3000), 0x1000_0000),
2923         ];
2924         let expected = vec![
2925             (0, 0x4000, false),
2926             (0x4000, 0x0fff_c000, true),
2927             (0x1000_0000, 0x4000, false),
2928             (0x1000_4000, 0x0fff_f000, true),
2929         ];
2930         assert_eq!(
2931             expected,
2932             Vm::hob_memory_resources(
2933                 sections,
2934                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2935             )
2936         );
2937     }
2938 }
2939 
2940 #[cfg(target_arch = "aarch64")]
2941 #[cfg(test)]
2942 mod tests {
2943     use super::*;
2944     use crate::GuestMemoryMmap;
2945     use arch::aarch64::fdt::create_fdt;
2946     use arch::aarch64::layout;
2947     use arch::{DeviceType, MmioDeviceInfo};
2948     use devices::gic::Gic;
2949 
2950     const LEN: u64 = 4096;
2951 
2952     #[test]
2953     fn test_create_fdt_with_devices() {
2954         let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
2955         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
2956 
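        // Describe three MMIO devices (serial, virtio, RTC) in consecutive 4 KiB
        // regions with distinct interrupt lines, as they would appear in the FDT.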
2957         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
2958             (
2959                 (DeviceType::Serial, DeviceType::Serial.to_string()),
2960                 MmioDeviceInfo {
2961                     addr: 0x00,
2962                     len: LEN,
2963                     irq: 33,
2964                 },
2965             ),
2966             (
2967                 (DeviceType::Virtio(1), "virtio".to_string()),
2968                 MmioDeviceInfo {
2969                     addr: LEN,
2970                     len: LEN,
2971                     irq: 34,
2972                 },
2973             ),
2974             (
2975                 (DeviceType::Rtc, "rtc".to_string()),
2976                 MmioDeviceInfo {
2977                     addr: 2 * LEN,
2978                     len: LEN,
2979                     irq: 35,
2980                 },
2981             ),
2982         ]
2983         .iter()
2984         .cloned()
2985         .collect();
2986 
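        // The FDT needs an interrupt controller node, so create a VM with a
        // default single-vCPU vGIC.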
2987         let hv = hypervisor::new().unwrap();
2988         let vm = hv.create_vm().unwrap();
2989         let gic = vm
2990             .create_vgic(Gic::create_default_config(1))
2991             .expect("Cannot create gic");
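        // Generating the FDT for a single vCPU with the devices above should succeed.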
2992         assert!(create_fdt(
2993             &mem,
2994             "console=tty0",
2995             vec![0],
2996             Some((0, 0, 0)),
2997             &dev_info,
2998             &gic,
2999             &None,
3000             &Vec::new(),
3001             &BTreeMap::new(),
3002             None,
3003             true,
3004         )
3005         .is_ok())
3006     }
3007 }
3008 
3009 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
3010 #[test]
3011 pub fn test_vm() {
3012     use hypervisor::VmExit;
3013     use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
3014     // This example is based on https://lwn.net/Articles/658511/
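    // The guest code writes the sum of %al and %bl as an ASCII digit, followed by
    // a newline, to the serial port at 0x3f8, then halts.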
3015     let code = [
3016         0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
3017         0x00, 0xd8, /* add %bl, %al */
3018         0x04, b'0', /* add $'0', %al */
3019         0xee, /* out %al, (%dx) */
3020         0xb0, b'\n', /* mov $'\n', %al */
3021         0xee, /* out %al, (%dx) */
3022         0xf4, /* hlt */
3023     ];
3024 
3025     let mem_size = 0x1000;
3026     let load_addr = GuestAddress(0x1000);
3027     let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();
3028 
3029     let hv = hypervisor::new().unwrap();
3030     let vm = hv.create_vm().expect("new VM creation failed");
3031 
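    // Map each guest memory region into the VM so the vCPU can execute from it.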
3032     for (index, region) in mem.iter().enumerate() {
3033         let mem_region = vm.make_user_memory_region(
3034             index as u32,
3035             region.start_addr().raw_value(),
3036             region.len(),
3037             region.as_ptr() as u64,
3038             false,
3039             false,
3040         );
3041 
3042         vm.create_user_memory_region(mem_region)
3043             .expect("Cannot configure guest memory");
3044     }
3045     mem.write_slice(&code, load_addr)
3046         .expect("Writing code to memory failed");
3047 
3048     let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");
3049 
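    // The vCPU starts in real mode; clear the CS base and selector so RIP
    // addresses guest physical memory directly.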
3050     let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
3051     vcpu_sregs.cs.base = 0;
3052     vcpu_sregs.cs.selector = 0;
3053     vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");
3054 
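    // Point RIP at the loaded code and seed RAX/RBX with the operands added by
    // the guest (2 + 3); bit 1 of RFLAGS is reserved and must be set.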
3055     let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
3056     vcpu_regs.rip = 0x1000;
3057     vcpu_regs.rax = 2;
3058     vcpu_regs.rbx = 3;
3059     vcpu_regs.rflags = 2;
3060     vcpu.set_regs(&vcpu_regs).expect("set regs failed");
3061 
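    // Run the vCPU until the guest halts, echoing anything it writes to the I/O port.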
3062     loop {
3063         match vcpu.run().expect("run failed") {
3064             VmExit::IoOut(addr, data) => {
3065                 println!(
3066                     "IO out -- addr: {:#x} data [{:?}]",
3067                     addr,
3068                     str::from_utf8(data).unwrap()
3069                 );
3070             }
3071             VmExit::Reset => {
3072                 println!("HLT");
3073                 break;
3074             }
3075             r => panic!("unexpected exit reason: {r:?}"),
3076         }
3077     }
3078 }
3079