xref: /cloud-hypervisor/vmm/src/vm.rs (revision eea9bcea38e0c5649f444c829f3a4f9c22aa486c)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 #[cfg(feature = "guest_debug")]
20 use crate::coredump::{
21     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
22 };
23 use crate::cpu;
24 use crate::device_manager::{Console, DeviceManager, DeviceManagerError, PtyPair};
25 use crate::device_tree::DeviceTree;
26 #[cfg(feature = "guest_debug")]
27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
28 use crate::memory_manager::{
29     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
30 };
31 #[cfg(feature = "guest_debug")]
32 use crate::migration::url_to_file;
33 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
34 use crate::seccomp_filters::{get_seccomp_filter, Thread};
35 use crate::GuestMemoryMmap;
36 use crate::{
37     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
38 };
39 use anyhow::anyhow;
40 use arch::get_host_cpu_phys_bits;
41 #[cfg(target_arch = "x86_64")]
42 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
43 #[cfg(feature = "tdx")]
44 use arch::x86_64::tdx::TdvfSection;
45 use arch::EntryPoint;
46 #[cfg(target_arch = "aarch64")]
47 use arch::PciSpaceInfo;
48 use arch::{NumaNode, NumaNodes};
49 #[cfg(target_arch = "aarch64")]
50 use devices::gic::{Gic, GIC_V3_ITS_SNAPSHOT_ID};
51 #[cfg(target_arch = "aarch64")]
52 use devices::interrupt_controller::{self, InterruptController};
53 use devices::AcpiNotificationFlags;
54 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
55 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
56 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
57 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
58 use hypervisor::{HypervisorVmError, VmOps};
59 use linux_loader::cmdline::Cmdline;
60 #[cfg(feature = "guest_debug")]
61 use linux_loader::elf;
62 #[cfg(target_arch = "x86_64")]
63 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
64 #[cfg(target_arch = "aarch64")]
65 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
66 use linux_loader::loader::KernelLoader;
67 use seccompiler::{apply_filter, SeccompAction};
68 use serde::{Deserialize, Serialize};
69 use signal_hook::{consts::SIGWINCH, iterator::backend::Handle, iterator::Signals};
70 use std::cmp;
71 use std::collections::BTreeMap;
72 use std::collections::HashMap;
73 use std::convert::TryInto;
74 use std::fs::{File, OpenOptions};
75 use std::io::{self, Read, Write};
76 use std::io::{Seek, SeekFrom};
77 #[cfg(feature = "tdx")]
78 use std::mem;
79 #[cfg(feature = "guest_debug")]
80 use std::mem::size_of;
81 use std::num::Wrapping;
82 use std::ops::Deref;
83 use std::os::unix::net::UnixStream;
84 use std::panic::AssertUnwindSafe;
85 use std::sync::{Arc, Mutex, RwLock};
86 use std::time::Instant;
87 use std::{result, str, thread};
88 use thiserror::Error;
89 use tracer::trace_scoped;
90 use vm_device::Bus;
91 #[cfg(target_arch = "x86_64")]
92 use vm_device::BusDevice;
93 #[cfg(target_arch = "x86_64")]
94 use vm_memory::Address;
95 #[cfg(feature = "tdx")]
96 use vm_memory::{ByteValued, GuestMemory, GuestMemoryRegion};
97 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic};
98 use vm_migration::protocol::{Request, Response, Status};
99 use vm_migration::{
100     protocol::MemoryRangeTable, Migratable, MigratableError, Pausable, Snapshot,
101     SnapshotDataSection, Snapshottable, Transportable,
102 };
103 use vmm_sys_util::eventfd::EventFd;
104 use vmm_sys_util::signal::unblock_signal;
105 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
106 use vmm_sys_util::terminal::Terminal;
107 
108 /// Errors associated with VM management
109 #[derive(Debug, Error)]
110 pub enum Error {
111     #[error("Cannot open kernel file: {0}")]
112     KernelFile(#[source] io::Error),
113 
114     #[error("Cannot open initramfs file: {0}")]
115     InitramfsFile(#[source] io::Error),
116 
117     #[error("Cannot load the kernel into memory: {0}")]
118     KernelLoad(#[source] linux_loader::loader::Error),
119 
120     #[cfg(target_arch = "aarch64")]
121     #[error("Cannot load the UEFI binary into memory: {0:?}")]
122     UefiLoad(arch::aarch64::uefi::Error),
123 
124     #[error("Cannot load the initramfs into memory")]
125     InitramfsLoad,
126 
127     #[error("Cannot load the kernel command line into memory: {0}")]
128     LoadCmdLine(#[source] linux_loader::loader::Error),
129 
130     #[error("Cannot modify the kernel command line: {0}")]
131     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
132 
133     #[error("Cannot configure system: {0}")]
134     ConfigureSystem(#[source] arch::Error),
135 
136     #[cfg(target_arch = "aarch64")]
137     #[error("Cannot enable interrupt controller: {0:?}")]
138     EnableInterruptController(interrupt_controller::Error),
139 
140     #[error("VM state is poisoned")]
141     PoisonedState,
142 
143     #[error("Error from device manager: {0:?}")]
144     DeviceManager(DeviceManagerError),
145 
146     #[error("Cannot set the terminal in raw mode: {0}")]
147     SetTerminalRaw(#[source] vmm_sys_util::errno::Error),
148 
149     #[error("Cannot set the terminal in canonical mode: {0}")]
150     SetTerminalCanon(#[source] vmm_sys_util::errno::Error),
151 
152     #[error("Cannot spawn a signal handler thread: {0}")]
153     SignalHandlerSpawn(#[source] io::Error),
154 
155     #[error("Failed to join on threads: {0:?}")]
156     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
157 
158     #[error("VM config is missing")]
159     VmMissingConfig,
160 
161     #[error("VM is not created")]
162     VmNotCreated,
163 
164     #[error("VM is already created")]
165     VmAlreadyCreated,
166 
167     #[error("VM is not running")]
168     VmNotRunning,
169 
170     #[error("Cannot clone EventFd: {0}")]
171     EventFdClone(#[source] io::Error),
172 
173     #[error("Invalid VM state transition: {0:?} to {1:?}")]
174     InvalidStateTransition(VmState, VmState),
175 
176     #[error("Error from CPU manager: {0}")]
177     CpuManager(#[source] cpu::Error),
178 
179     #[error("Cannot pause devices: {0}")]
180     PauseDevices(#[source] MigratableError),
181 
182     #[error("Cannot resume devices: {0}")]
183     ResumeDevices(#[source] MigratableError),
184 
185     #[error("Cannot pause CPUs: {0}")]
186     PauseCpus(#[source] MigratableError),
187 
188     #[error("Cannot resume CPUs: {0}")]
189     ResumeCpus(#[source] MigratableError),
190 
191     #[error("Cannot pause VM: {0}")]
192     Pause(#[source] MigratableError),
193 
194     #[error("Cannot resume VM: {0}")]
195     Resume(#[source] MigratableError),
196 
197     #[error("Memory manager error: {0:?}")]
198     MemoryManager(MemoryManagerError),
199 
200     #[error("Eventfd write error: {0}")]
201     EventfdError(#[source] std::io::Error),
202 
203     #[error("Cannot snapshot VM: {0}")]
204     Snapshot(#[source] MigratableError),
205 
206     #[error("Cannot restore VM: {0}")]
207     Restore(#[source] MigratableError),
208 
209     #[error("Cannot send VM snapshot: {0}")]
210     SnapshotSend(#[source] MigratableError),
211 
212     #[error("Invalid restore source URL")]
213     InvalidRestoreSourceUrl,
214 
215     #[error("Failed to validate config: {0}")]
216     ConfigValidation(#[source] ValidationError),
217 
218     #[error("Too many virtio-vsock devices")]
219     TooManyVsockDevices,
220 
221     #[error("Failed serializing into JSON: {0}")]
222     SerializeJson(#[source] serde_json::Error),
223 
224     #[error("Invalid NUMA configuration")]
225     InvalidNumaConfig,
226 
227     #[error("Cannot create seccomp filter: {0}")]
228     CreateSeccompFilter(#[source] seccompiler::Error),
229 
230     #[error("Cannot apply seccomp filter: {0}")]
231     ApplySeccompFilter(#[source] seccompiler::Error),
232 
233     #[error("Failed resizing a memory zone")]
234     ResizeZone,
235 
236     #[error("Cannot activate virtio devices: {0:?}")]
237     ActivateVirtioDevices(DeviceManagerError),
238 
239     #[error("Error triggering power button: {0:?}")]
240     PowerButton(DeviceManagerError),
241 
242     #[error("Kernel lacks PVH header")]
243     KernelMissingPvhHeader,
244 
245     #[error("Failed to allocate firmware RAM: {0:?}")]
246     AllocateFirmwareMemory(MemoryManagerError),
247 
248     #[error("Error manipulating firmware file: {0}")]
249     FirmwareFile(#[source] std::io::Error),
250 
251     #[error("Firmware too big")]
252     FirmwareTooLarge,
253 
254     #[error("Failed to copy firmware to memory: {0}")]
255     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
256 
257     #[cfg(feature = "tdx")]
258     #[error("Error performing I/O on TDX firmware file: {0}")]
259     LoadTdvf(#[source] std::io::Error),
260 
261     #[cfg(feature = "tdx")]
262     #[error("Error performing I/O on the TDX payload file: {0}")]
263     LoadPayload(#[source] std::io::Error),
264 
265     #[cfg(feature = "tdx")]
266     #[error("Error parsing TDVF: {0}")]
267     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
268 
269     #[cfg(feature = "tdx")]
270     #[error("Error populating TDX HOB: {0}")]
271     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
272 
273     #[cfg(feature = "tdx")]
274     #[error("Error allocating TDVF memory: {0:?}")]
275     AllocatingTdvfMemory(crate::memory_manager::Error),
276 
277     #[cfg(feature = "tdx")]
278     #[error("Error enabling TDX VM: {0}")]
279     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
280 
281     #[cfg(feature = "tdx")]
282     #[error("Error enabling TDX memory region: {0}")]
283     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
284 
285     #[cfg(feature = "tdx")]
286     #[error("Error finalizing TDX VM: {0}")]
287     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
288 
289     #[cfg(feature = "tdx")]
290     #[error("TDX firmware missing")]
291     TdxFirmwareMissing,
292 
293     #[cfg(feature = "tdx")]
294     #[error("Invalid TDX payload type")]
295     InvalidPayloadType,
296 
297     #[cfg(feature = "guest_debug")]
298     #[error("Error debugging VM: {0:?}")]
299     Debug(DebuggableError),
300 
301     #[error("Error spawning kernel loading thread")]
302     KernelLoadThreadSpawn(std::io::Error),
303 
304     #[error("Error joining kernel loading thread")]
305     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
306 
307     #[error("Payload configuration is not bootable")]
308     InvalidPayload,
309 
310     #[cfg(feature = "guest_debug")]
311     #[error("Error coredumping VM: {0:?}")]
312     Coredump(GuestDebuggableError),
313 }
314 pub type Result<T> = result::Result<T, Error>;
315 
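/// Lifecycle state of a `Vm`. `valid_transition()` below encodes the allowed
/// state changes; for example, a freshly created VM may start running or be
/// paused, whereas a running VM cannot go back to `Created`.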
316 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
317 pub enum VmState {
318     Created,
319     Running,
320     Shutdown,
321     Paused,
322     BreakPoint,
323 }
324 
325 impl VmState {
326     fn valid_transition(self, new_state: VmState) -> Result<()> {
327         match self {
328             VmState::Created => match new_state {
329                 VmState::Created | VmState::Shutdown => {
330                     Err(Error::InvalidStateTransition(self, new_state))
331                 }
332                 VmState::Running | VmState::Paused | VmState::BreakPoint => Ok(()),
333             },
334 
335             VmState::Running => match new_state {
336                 VmState::Created | VmState::Running => {
337                     Err(Error::InvalidStateTransition(self, new_state))
338                 }
339                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
340             },
341 
342             VmState::Shutdown => match new_state {
343                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
344                     Err(Error::InvalidStateTransition(self, new_state))
345                 }
346                 VmState::Running => Ok(()),
347             },
348 
349             VmState::Paused => match new_state {
350                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
351                     Err(Error::InvalidStateTransition(self, new_state))
352                 }
353                 VmState::Running | VmState::Shutdown => Ok(()),
354             },
355             VmState::BreakPoint => match new_state {
356                 VmState::Created | VmState::Running => Ok(()),
357                 _ => Err(Error::InvalidStateTransition(self, new_state)),
358             },
359         }
360     }
361 }
362 
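/// `VmOps` implementation handed to the vCPU threads: guest memory accesses go
/// through the guest address space, while MMIO (and, on x86_64, PIO and PCI
/// config I/O) exits are routed to the corresponding device buses.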
363 struct VmOpsHandler {
364     memory: GuestMemoryAtomic<GuestMemoryMmap>,
365     #[cfg(target_arch = "x86_64")]
366     io_bus: Arc<Bus>,
367     mmio_bus: Arc<Bus>,
368     #[cfg(target_arch = "x86_64")]
369     pci_config_io: Arc<Mutex<dyn BusDevice>>,
370 }
371 
372 impl VmOps for VmOpsHandler {
373     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
374         self.memory
375             .memory()
376             .write(buf, GuestAddress(gpa))
377             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
378     }
379 
380     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
381         self.memory
382             .memory()
383             .read(buf, GuestAddress(gpa))
384             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
385     }
386 
387     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
388         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
389             warn!("Guest MMIO read to unregistered address 0x{:x}", gpa);
390         }
391         Ok(())
392     }
393 
394     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
395         match self.mmio_bus.write(gpa, data) {
396             Err(vm_device::BusError::MissingAddressRange) => {
397                 warn!("Guest MMIO write to unregistered address 0x{:x}", gpa);
398             }
399             Ok(Some(barrier)) => {
400                 info!("Waiting for barrier");
401                 barrier.wait();
402                 info!("Barrier released");
403             }
404             _ => {}
405         };
406         Ok(())
407     }
408 
409     #[cfg(target_arch = "x86_64")]
410     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
411         use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE};
412 
413         if (PCI_CONFIG_IO_PORT..(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE)).contains(&port) {
414             self.pci_config_io.lock().unwrap().read(
415                 PCI_CONFIG_IO_PORT,
416                 port - PCI_CONFIG_IO_PORT,
417                 data,
418             );
419             return Ok(());
420         }
421 
422         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
423             warn!("Guest PIO read to unregistered address 0x{:x}", port);
424         }
425         Ok(())
426     }
427 
428     #[cfg(target_arch = "x86_64")]
429     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
430         use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE};
431 
432         if (PCI_CONFIG_IO_PORT..(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE)).contains(&port) {
433             self.pci_config_io.lock().unwrap().write(
434                 PCI_CONFIG_IO_PORT,
435                 port - PCI_CONFIG_IO_PORT,
436                 data,
437             );
438             return Ok(());
439         }
440 
441         match self.io_bus.write(port, data) {
442             Err(vm_device::BusError::MissingAddressRange) => {
443                 warn!("Guest PIO write to unregistered address 0x{:x}", port);
444             }
445             Ok(Some(barrier)) => {
446                 info!("Waiting for barrier");
447                 barrier.wait();
448                 info!("Barrier released");
449             }
450             _ => {}
451         };
452         Ok(())
453     }
454 }
455 
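/// Returns the number of guest physical address bits to use: the smaller of
/// the host CPU's physical address width and the requested `max_phys_bits`.
/// For example (illustrative values), a host reporting 46 physical bits caps
/// a request for 48 bits at 46, while a request for 40 bits stays at 40.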
456 pub fn physical_bits(max_phys_bits: u8) -> u8 {
457     let host_phys_bits = get_host_cpu_phys_bits();
458 
459     cmp::min(host_phys_bits, max_phys_bits)
460 }
461 
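/// A single guest virtual machine: it ties the device, CPU and memory managers
/// together on top of the hypervisor-provided `hypervisor::Vm` handle, and
/// tracks the lifecycle state and the threads owned by the VM.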
462 pub struct Vm {
463     #[cfg(feature = "tdx")]
464     kernel: Option<File>,
465     initramfs: Option<File>,
466     threads: Vec<thread::JoinHandle<()>>,
467     device_manager: Arc<Mutex<DeviceManager>>,
468     config: Arc<Mutex<VmConfig>>,
469     on_tty: bool,
470     signals: Option<Handle>,
471     state: RwLock<VmState>,
472     cpu_manager: Arc<Mutex<cpu::CpuManager>>,
473     memory_manager: Arc<Mutex<MemoryManager>>,
474     #[cfg_attr(not(feature = "kvm"), allow(dead_code))]
475     // The hypervisor-abstracted virtual machine.
476     vm: Arc<dyn hypervisor::Vm>,
477     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
478     saved_clock: Option<hypervisor::ClockData>,
479     numa_nodes: NumaNodes,
480     seccomp_action: SeccompAction,
481     exit_evt: EventFd,
482     hypervisor: Arc<dyn hypervisor::Hypervisor>,
483     stop_on_boot: bool,
484     load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
485 }
486 
487 impl Vm {
488     pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
489 
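    /// Common construction path shared by `new()`, `new_from_snapshot()` and
    /// `new_from_migration()`: validates the configuration and builds the
    /// device and CPU managers on top of an existing `MemoryManager`.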
490     #[allow(clippy::too_many_arguments)]
491     fn new_from_memory_manager(
492         config: Arc<Mutex<VmConfig>>,
493         memory_manager: Arc<Mutex<MemoryManager>>,
494         vm: Arc<dyn hypervisor::Vm>,
495         exit_evt: EventFd,
496         reset_evt: EventFd,
497         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
498         seccomp_action: &SeccompAction,
499         hypervisor: Arc<dyn hypervisor::Hypervisor>,
500         activate_evt: EventFd,
501         restoring: bool,
502         timestamp: Instant,
503     ) -> Result<Self> {
504         trace_scoped!("Vm::new_from_memory_manager");
505 
506         let boot_id_list = config
507             .lock()
508             .unwrap()
509             .validate()
510             .map_err(Error::ConfigValidation)?;
511 
512         let load_payload_handle = if !restoring {
513             Self::load_payload_async(&memory_manager, &config)?
514         } else {
515             None
516         };
517 
518         info!("Booting VM from config: {:?}", &config);
519 
520         // Create NUMA nodes based on NumaConfig.
521         let numa_nodes =
522             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
523 
524         #[cfg(feature = "tdx")]
525         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
526         #[cfg(feature = "tdx")]
527         let force_iommu = tdx_enabled;
528         #[cfg(not(feature = "tdx"))]
529         let force_iommu = false;
530 
531         #[cfg(feature = "guest_debug")]
532         let stop_on_boot = config.lock().unwrap().gdb;
533         #[cfg(not(feature = "guest_debug"))]
534         let stop_on_boot = false;
535 
536         let device_manager = DeviceManager::new(
537             hypervisor.hypervisor_type(),
538             vm.clone(),
539             config.clone(),
540             memory_manager.clone(),
541             &exit_evt,
542             &reset_evt,
543             seccomp_action.clone(),
544             numa_nodes.clone(),
545             &activate_evt,
546             force_iommu,
547             restoring,
548             boot_id_list,
549             timestamp,
550         )
551         .map_err(Error::DeviceManager)?;
552 
553         let memory = memory_manager.lock().unwrap().guest_memory();
554         #[cfg(target_arch = "x86_64")]
555         let io_bus = Arc::clone(device_manager.lock().unwrap().io_bus());
556         let mmio_bus = Arc::clone(device_manager.lock().unwrap().mmio_bus());
557 
558         #[cfg(target_arch = "x86_64")]
559         let pci_config_io =
560             device_manager.lock().unwrap().pci_config_io() as Arc<Mutex<dyn BusDevice>>;
561         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
562             memory,
563             #[cfg(target_arch = "x86_64")]
564             io_bus,
565             mmio_bus,
566             #[cfg(target_arch = "x86_64")]
567             pci_config_io,
568         });
569 
570         let exit_evt_clone = exit_evt.try_clone().map_err(Error::EventFdClone)?;
571         let cpus_config = { &config.lock().unwrap().cpus.clone() };
572         let cpu_manager = cpu::CpuManager::new(
573             cpus_config,
574             &device_manager,
575             &memory_manager,
576             vm.clone(),
577             exit_evt_clone,
578             reset_evt,
579             #[cfg(feature = "guest_debug")]
580             vm_debug_evt,
581             hypervisor.clone(),
582             seccomp_action.clone(),
583             vm_ops,
584             #[cfg(feature = "tdx")]
585             tdx_enabled,
586             &numa_nodes,
587         )
588         .map_err(Error::CpuManager)?;
589 
590         let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO as i32) } != 0;
591 
592         #[cfg(feature = "tdx")]
593         let kernel = config
594             .lock()
595             .unwrap()
596             .payload
597             .as_ref()
598             .map(|p| p.kernel.as_ref().map(File::open))
599             .unwrap_or_default()
600             .transpose()
601             .map_err(Error::KernelFile)?;
602 
603         let initramfs = config
604             .lock()
605             .unwrap()
606             .payload
607             .as_ref()
608             .map(|p| p.initramfs.as_ref().map(File::open))
609             .unwrap_or_default()
610             .transpose()
611             .map_err(Error::InitramfsFile)?;
612 
613         Ok(Vm {
614             #[cfg(feature = "tdx")]
615             kernel,
616             initramfs,
617             device_manager,
618             config,
619             on_tty,
620             threads: Vec::with_capacity(1),
621             signals: None,
622             state: RwLock::new(VmState::Created),
623             cpu_manager,
624             memory_manager,
625             vm,
626             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
627             saved_clock: None,
628             numa_nodes,
629             seccomp_action: seccomp_action.clone(),
630             exit_evt,
631             hypervisor,
632             stop_on_boot,
633             load_payload_handle,
634         })
635     }
636 
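    /// Builds the guest NUMA topology from the optional `NumaConfig` list,
    /// resolving memory zone, CPU, distance and (on x86_64) SGX EPC section
    /// references against the memory manager.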
637     fn create_numa_nodes(
638         configs: Option<Vec<NumaConfig>>,
639         memory_manager: &Arc<Mutex<MemoryManager>>,
640     ) -> Result<NumaNodes> {
641         let mm = memory_manager.lock().unwrap();
642         let mm_zones = mm.memory_zones();
643         let mut numa_nodes = BTreeMap::new();
644 
645         if let Some(configs) = &configs {
646             for config in configs.iter() {
647                 if numa_nodes.contains_key(&config.guest_numa_id) {
648                     error!("Can't define the same NUMA node twice");
649                     return Err(Error::InvalidNumaConfig);
650                 }
651 
652                 let mut node = NumaNode::default();
653 
654                 if let Some(memory_zones) = &config.memory_zones {
655                     for memory_zone in memory_zones.iter() {
656                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
657                             node.memory_regions.extend(mm_zone.regions().clone());
658                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
659                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
660                             }
661                             node.memory_zones.push(memory_zone.clone());
662                         } else {
663                             error!("Unknown memory zone '{}'", memory_zone);
664                             return Err(Error::InvalidNumaConfig);
665                         }
666                     }
667                 }
668 
669                 if let Some(cpus) = &config.cpus {
670                     node.cpus.extend(cpus);
671                 }
672 
673                 if let Some(distances) = &config.distances {
674                     for distance in distances.iter() {
675                         let dest = distance.destination;
676                         let dist = distance.distance;
677 
678                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
679                             error!("Unknown destination NUMA node {}", dest);
680                             return Err(Error::InvalidNumaConfig);
681                         }
682 
683                         if node.distances.contains_key(&dest) {
684                             error!("Destination NUMA node {} has already been set", dest);
685                             return Err(Error::InvalidNumaConfig);
686                         }
687 
688                         node.distances.insert(dest, dist);
689                     }
690                 }
691 
692                 #[cfg(target_arch = "x86_64")]
693                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
694                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
695                         let mm_sections = sgx_epc_region.epc_sections();
696                         for sgx_epc_section in sgx_epc_sections.iter() {
697                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
698                                 node.sgx_epc_sections.push(mm_section.clone());
699                             } else {
700                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
701                                 return Err(Error::InvalidNumaConfig);
702                             }
703                         }
704                     } else {
705                         error!("Missing SGX EPC region");
706                         return Err(Error::InvalidNumaConfig);
707                     }
708                 }
709 
710                 numa_nodes.insert(config.guest_numa_id, node);
711             }
712         }
713 
714         Ok(numa_nodes)
715     }
716 
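    /// Creates a brand new VM from `config`: sets up the hypervisor VM (a TDX
    /// VM when the `tdx` feature is enabled and requested), the memory
    /// manager, and then all devices through the regular boot path.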
717     #[allow(clippy::too_many_arguments)]
718     pub fn new(
719         config: Arc<Mutex<VmConfig>>,
720         exit_evt: EventFd,
721         reset_evt: EventFd,
722         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
723         seccomp_action: &SeccompAction,
724         hypervisor: Arc<dyn hypervisor::Hypervisor>,
725         activate_evt: EventFd,
726         serial_pty: Option<PtyPair>,
727         console_pty: Option<PtyPair>,
728         console_resize_pipe: Option<File>,
729     ) -> Result<Self> {
730         trace_scoped!("Vm::new");
731 
732         let timestamp = Instant::now();
733 
734         #[cfg(feature = "tdx")]
735         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
736         hypervisor.check_required_extensions().unwrap();
737         #[cfg(feature = "tdx")]
738         let vm = hypervisor
739             .create_vm_with_type(if tdx_enabled {
740                 2 // KVM_X86_TDX_VM
741             } else {
742                 0 // KVM_X86_LEGACY_VM
743             })
744             .unwrap();
745         #[cfg(not(feature = "tdx"))]
746         let vm = hypervisor.create_vm().unwrap();
747 
748         #[cfg(target_arch = "x86_64")]
749         {
750             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
751                 .unwrap();
752             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
753             vm.enable_split_irq().unwrap();
754         }
755 
756         let phys_bits = physical_bits(config.lock().unwrap().cpus.max_phys_bits);
757 
758         #[cfg(target_arch = "x86_64")]
759         let sgx_epc_config = config.lock().unwrap().sgx_epc.clone();
760 
761         let memory_manager = MemoryManager::new(
762             vm.clone(),
763             &config.lock().unwrap().memory.clone(),
764             None,
765             phys_bits,
766             #[cfg(feature = "tdx")]
767             tdx_enabled,
768             None,
769             None,
770             #[cfg(target_arch = "x86_64")]
771             sgx_epc_config,
772         )
773         .map_err(Error::MemoryManager)?;
774 
775         let new_vm = Vm::new_from_memory_manager(
776             config,
777             memory_manager,
778             vm,
779             exit_evt,
780             reset_evt,
781             #[cfg(feature = "guest_debug")]
782             vm_debug_evt,
783             seccomp_action,
784             hypervisor,
785             activate_evt,
786             false,
787             timestamp,
788         )?;
789 
790         // The device manager must create the devices from here as it is part
791         // of the regular code path creating everything from scratch.
792         new_vm
793             .device_manager
794             .lock()
795             .unwrap()
796             .create_devices(serial_pty, console_pty, console_resize_pipe)
797             .map_err(Error::DeviceManager)?;
798         Ok(new_vm)
799     }
800 
801     #[allow(clippy::too_many_arguments)]
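    /// Restores a VM from a local snapshot: the memory manager is rebuilt
    /// from its snapshot data before the common construction path runs in
    /// restore mode.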
802     pub fn new_from_snapshot(
803         snapshot: &Snapshot,
804         vm_config: Arc<Mutex<VmConfig>>,
805         exit_evt: EventFd,
806         reset_evt: EventFd,
807         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
808         source_url: Option<&str>,
809         prefault: bool,
810         seccomp_action: &SeccompAction,
811         hypervisor: Arc<dyn hypervisor::Hypervisor>,
812         activate_evt: EventFd,
813     ) -> Result<Self> {
814         let timestamp = Instant::now();
815 
816         hypervisor.check_required_extensions().unwrap();
817         let vm = hypervisor.create_vm().unwrap();
818 
819         #[cfg(target_arch = "x86_64")]
820         {
821             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
822                 .unwrap();
823             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
824             vm.enable_split_irq().unwrap();
825         }
826 
827         let memory_manager = if let Some(memory_manager_snapshot) =
828             snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
829         {
830             let phys_bits = physical_bits(vm_config.lock().unwrap().cpus.max_phys_bits);
831             MemoryManager::new_from_snapshot(
832                 memory_manager_snapshot,
833                 vm.clone(),
834                 &vm_config.lock().unwrap().memory.clone(),
835                 source_url,
836                 prefault,
837                 phys_bits,
838             )
839             .map_err(Error::MemoryManager)?
840         } else {
841             return Err(Error::Restore(MigratableError::Restore(anyhow!(
842                 "Missing memory manager snapshot"
843             ))));
844         };
845 
846         Vm::new_from_memory_manager(
847             vm_config,
848             memory_manager,
849             vm,
850             exit_evt,
851             reset_evt,
852             #[cfg(feature = "guest_debug")]
853             vm_debug_evt,
854             seccomp_action,
855             hypervisor,
856             activate_evt,
857             true,
858             timestamp,
859         )
860     }
861 
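    /// Creates the destination VM for a live migration, reusing the memory
    /// manager state (and optionally existing memory files) received from
    /// the source.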
862     #[allow(clippy::too_many_arguments)]
863     pub fn new_from_migration(
864         config: Arc<Mutex<VmConfig>>,
865         exit_evt: EventFd,
866         reset_evt: EventFd,
867         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
868         seccomp_action: &SeccompAction,
869         hypervisor: Arc<dyn hypervisor::Hypervisor>,
870         activate_evt: EventFd,
871         memory_manager_data: &MemoryManagerSnapshotData,
872         existing_memory_files: Option<HashMap<u32, File>>,
873     ) -> Result<Self> {
874         let timestamp = Instant::now();
875 
876         hypervisor.check_required_extensions().unwrap();
877         let vm = hypervisor.create_vm().unwrap();
878 
879         #[cfg(target_arch = "x86_64")]
880         {
881             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
882                 .unwrap();
883             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
884             vm.enable_split_irq().unwrap();
885         }
886 
887         let phys_bits = physical_bits(config.lock().unwrap().cpus.max_phys_bits);
888 
889         let memory_manager = MemoryManager::new(
890             vm.clone(),
891             &config.lock().unwrap().memory.clone(),
892             None,
893             phys_bits,
894             #[cfg(feature = "tdx")]
895             false,
896             Some(memory_manager_data),
897             existing_memory_files,
898             #[cfg(target_arch = "x86_64")]
899             None,
900         )
901         .map_err(Error::MemoryManager)?;
902 
903         Vm::new_from_memory_manager(
904             config,
905             memory_manager,
906             vm,
907             exit_evt,
908             reset_evt,
909             #[cfg(feature = "guest_debug")]
910             vm_debug_evt,
911             seccomp_action,
912             hypervisor,
913             activate_evt,
914             true,
915             timestamp,
916         )
917     }
918 
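    /// Copies the initramfs file into guest memory and returns the load
    /// address and size chosen by `arch::initramfs_load_addr()`.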
919     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
920         let mut initramfs = self.initramfs.as_ref().unwrap();
921         let size: usize = initramfs
922             .seek(SeekFrom::End(0))
923             .map_err(|_| Error::InitramfsLoad)?
924             .try_into()
925             .unwrap();
926         initramfs
927             .seek(SeekFrom::Start(0))
928             .map_err(|_| Error::InitramfsLoad)?;
929 
930         let address =
931             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
932         let address = GuestAddress(address);
933 
934         guest_mem
935             .read_from(address, &mut initramfs, size)
936             .map_err(|_| Error::InitramfsLoad)?;
937 
938         info!("Initramfs loaded: address = 0x{:x}", address.0);
939         Ok(arch::InitramfsConfig { address, size })
940     }
941 
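    /// Builds the kernel command line from the payload configuration,
    /// appending (on aarch64) any additions requested by the device manager.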
942     fn generate_cmdline(
943         payload: &PayloadConfig,
944         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
945     ) -> Result<Cmdline> {
946         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE);
947         if let Some(s) = payload.cmdline.as_ref() {
948             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
949         }
950 
951         #[cfg(target_arch = "aarch64")]
952         for entry in device_manager.lock().unwrap().cmdline_additions() {
953             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
954         }
955         Ok(cmdline)
956     }
957 
958     #[cfg(target_arch = "aarch64")]
959     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
960         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
961         let mem = uefi_flash.memory();
962         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
963             .map_err(Error::UefiLoad)?;
964         Ok(())
965     }
966 
967     #[cfg(target_arch = "aarch64")]
968     fn load_kernel(
969         firmware: Option<File>,
970         kernel: Option<File>,
971         memory_manager: Arc<Mutex<MemoryManager>>,
972     ) -> Result<EntryPoint> {
973         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
974         let mem = guest_memory.memory();
975         let entry_addr = match (firmware, kernel) {
976             (None, Some(mut kernel)) => {
977                 match linux_loader::loader::pe::PE::load(
978                     mem.deref(),
979                     Some(arch::layout::KERNEL_START),
980                     &mut kernel,
981                     None,
982                 ) {
983                     Ok(entry_addr) => entry_addr.kernel_load,
984                     // Try to load the binary as a kernel PE file first.
985                     // If that fails, retry loading it as a UEFI binary.
986                     // As the UEFI binary is formatless, it must be the last option to try.
987                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
988                         Self::load_firmware(&kernel, memory_manager)?;
989                         arch::layout::UEFI_START
990                     }
991                     Err(e) => {
992                         return Err(Error::KernelLoad(e));
993                     }
994                 }
995             }
996             (Some(firmware), None) => {
997                 Self::load_firmware(&firmware, memory_manager)?;
998                 arch::layout::UEFI_START
999             }
1000             _ => return Err(Error::InvalidPayload),
1001         };
1002 
1003         Ok(EntryPoint { entry_addr })
1004     }
1005 
1006     #[cfg(target_arch = "x86_64")]
1007     fn load_legacy_firmware(
1008         mut firmware: File,
1009         memory_manager: &Arc<Mutex<MemoryManager>>,
1010     ) -> Result<EntryPoint> {
1011         warn!("Loading of legacy (non-PVH) firmware is deprecated and will be removed in a future version.");
1012 
1013         // Not an ELF header - assume raw binary data / firmware
1014         let size = firmware
1015             .seek(SeekFrom::End(0))
1016             .map_err(Error::FirmwareFile)?;
1017 
1018         // OVMF firmware is at most 4MiB, so use that as the size limit.
1019         if size > 4 << 20 {
1020             return Err(Error::FirmwareTooLarge);
1021         }
1022 
1023         // Load the firmware so that it ends at the 4GiB boundary.
1024         let load_address = GuestAddress(4 << 30)
1025             .checked_sub(size)
1026             .ok_or(Error::FirmwareTooLarge)?;
1027 
1028         info!(
1029             "Loading RAW firmware at 0x{:x} (size: {})",
1030             load_address.raw_value(),
1031             size
1032         );
1033 
1034         memory_manager
1035             .lock()
1036             .unwrap()
1037             .add_ram_region(load_address, size as usize)
1038             .map_err(Error::AllocateFirmwareMemory)?;
1039 
1040         firmware
1041             .seek(SeekFrom::Start(0))
1042             .map_err(Error::FirmwareFile)?;
1043         memory_manager
1044             .lock()
1045             .unwrap()
1046             .guest_memory()
1047             .memory()
1048             .read_exact_from(load_address, &mut firmware, size as usize)
1049             .map_err(Error::FirmwareLoad)?;
1050 
1051         Ok(EntryPoint { entry_addr: None })
1052     }
1053 
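    /// Loads an ELF kernel (or, as a fallback, a raw firmware image) into
    /// guest memory. An ELF kernel must expose a PVH entry point, otherwise
    /// loading fails with `KernelMissingPvhHeader`.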
1054     #[cfg(target_arch = "x86_64")]
1055     fn load_kernel(
1056         mut kernel: File,
1057         cmdline: Option<Cmdline>,
1058         memory_manager: Arc<Mutex<MemoryManager>>,
1059     ) -> Result<EntryPoint> {
1060         use linux_loader::loader::{elf::Error::InvalidElfMagicNumber, Error::Elf};
1061         info!("Loading kernel");
1062 
1063         let mem = {
1064             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1065             guest_memory.memory()
1066         };
1067         let entry_addr = match linux_loader::loader::elf::Elf::load(
1068             mem.deref(),
1069             None,
1070             &mut kernel,
1071             Some(arch::layout::HIGH_RAM_START),
1072         ) {
1073             Ok(entry_addr) => entry_addr,
1074             Err(e) => match e {
1075                 Elf(InvalidElfMagicNumber) => {
1076                     return Self::load_legacy_firmware(kernel, &memory_manager)
1077                 }
1078                 _ => {
1079                     return Err(Error::KernelLoad(e));
1080                 }
1081             },
1082         };
1083 
1084         if let Some(cmdline) = cmdline {
1085             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1086                 .map_err(Error::LoadCmdLine)?;
1087         }
1088 
1089         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1090             // Use the PVH kernel entry point to boot the guest
1091             info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1092             Ok(EntryPoint {
1093                 entry_addr: Some(entry_addr),
1094             })
1095         } else {
1096             Err(Error::KernelMissingPvhHeader)
1097         }
1098     }
1099 
1100     #[cfg(target_arch = "x86_64")]
1101     fn load_payload(
1102         payload: &PayloadConfig,
1103         memory_manager: Arc<Mutex<MemoryManager>>,
1104     ) -> Result<EntryPoint> {
1105         trace_scoped!("load_payload");
1106         match (
1107             &payload.firmware,
1108             &payload.kernel,
1109             &payload.initramfs,
1110             &payload.cmdline,
1111         ) {
1112             (Some(firmware), None, None, None) => {
1113                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1114                 Self::load_kernel(firmware, None, memory_manager)
1115             }
1116             (None, Some(kernel), _, _) => {
1117                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1118                 let cmdline = Self::generate_cmdline(payload)?;
1119                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1120             }
1121             _ => Err(Error::InvalidPayload),
1122         }
1123     }
1124 
1125     #[cfg(target_arch = "aarch64")]
1126     fn load_payload(
1127         payload: &PayloadConfig,
1128         memory_manager: Arc<Mutex<MemoryManager>>,
1129     ) -> Result<EntryPoint> {
1130         match (&payload.firmware, &payload.kernel) {
1131             (Some(firmware), None) => {
1132                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1133                 Self::load_kernel(Some(firmware), None, memory_manager)
1134             }
1135             (None, Some(kernel)) => {
1136                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1137                 Self::load_kernel(None, Some(kernel), memory_manager)
1138             }
1139             _ => Err(Error::InvalidPayload),
1140         }
1141     }
1142 
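    /// Spawns a background thread that loads the boot payload while the rest
    /// of the VM is being constructed. Returns `None` when there is no
    /// payload to load or when TDX is enabled (TDX uses its own loading path).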
1143     fn load_payload_async(
1144         memory_manager: &Arc<Mutex<MemoryManager>>,
1145         config: &Arc<Mutex<VmConfig>>,
1146     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1147         // The kernel is loaded differently when TDX is enabled.
1148         #[cfg(feature = "tdx")]
1149         if config.lock().unwrap().is_tdx_enabled() {
1150             return Ok(None);
1151         }
1152 
1153         config
1154             .lock()
1155             .unwrap()
1156             .payload
1157             .as_ref()
1158             .map(|payload| {
1159                 let memory_manager = memory_manager.clone();
1160                 let payload = payload.clone();
1161 
1162                 std::thread::Builder::new()
1163                     .name("payload_loader".into())
1164                     .spawn(move || Self::load_payload(&payload, memory_manager))
1165                     .map_err(Error::KernelLoadThreadSpawn)
1166             })
1167             .transpose()
1168     }
1169 
1170     #[cfg(target_arch = "x86_64")]
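    /// Final boot-time system configuration: loads the initramfs if present
    /// and hands the command line location, ACPI RSDP address, SGX EPC region
    /// and platform identifiers (serial number, UUID, OEM strings) over to
    /// `arch::configure_system()`.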
1171     fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
1172         trace_scoped!("configure_system");
1173         info!("Configuring system");
1174         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1175 
1176         let initramfs_config = match self.initramfs {
1177             Some(_) => Some(self.load_initramfs(&mem)?),
1178             None => None,
1179         };
1180 
1181         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1182         let rsdp_addr = Some(rsdp_addr);
1183         let sgx_epc_region = self
1184             .memory_manager
1185             .lock()
1186             .unwrap()
1187             .sgx_epc_region()
1188             .as_ref()
1189             .cloned();
1190 
1191         let serial_number = self
1192             .config
1193             .lock()
1194             .unwrap()
1195             .platform
1196             .as_ref()
1197             .and_then(|p| p.serial_number.clone());
1198 
1199         let uuid = self
1200             .config
1201             .lock()
1202             .unwrap()
1203             .platform
1204             .as_ref()
1205             .and_then(|p| p.uuid.clone());
1206 
1207         let oem_strings = self
1208             .config
1209             .lock()
1210             .unwrap()
1211             .platform
1212             .as_ref()
1213             .and_then(|p| p.oem_strings.clone());
1214 
1215         let oem_strings = oem_strings
1216             .as_deref()
1217             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1218 
1219         arch::configure_system(
1220             &mem,
1221             arch::layout::CMDLINE_START,
1222             &initramfs_config,
1223             boot_vcpus,
1224             rsdp_addr,
1225             sgx_epc_region,
1226             serial_number.as_deref(),
1227             uuid.as_deref(),
1228             oem_strings.as_deref(),
1229         )
1230         .map_err(Error::ConfigureSystem)?;
1231         Ok(())
1232     }
1233 
1234     #[cfg(target_arch = "aarch64")]
1235     fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
1236         let cmdline = Self::generate_cmdline(
1237             self.config.lock().unwrap().payload.as_ref().unwrap(),
1238             &self.device_manager,
1239         )?;
1240         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1241         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1242         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1243         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1244         let initramfs_config = match self.initramfs {
1245             Some(_) => Some(self.load_initramfs(&mem)?),
1246             None => None,
1247         };
1248 
1249         let device_info = &self
1250             .device_manager
1251             .lock()
1252             .unwrap()
1253             .get_device_info()
1254             .clone();
1255 
1256         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1257             let pci_space = PciSpaceInfo {
1258                 pci_segment_id: pci_segment.id,
1259                 mmio_config_address: pci_segment.mmio_config_address,
1260                 pci_device_space_start: pci_segment.start_of_device_area,
1261                 pci_device_space_size: pci_segment.end_of_device_area
1262                     - pci_segment.start_of_device_area
1263                     + 1,
1264             };
1265             pci_space_info.push(pci_space);
1266         }
1267 
1268         let virtio_iommu_bdf = self
1269             .device_manager
1270             .lock()
1271             .unwrap()
1272             .iommu_attached_devices()
1273             .as_ref()
1274             .map(|(v, _)| *v);
1275 
1276         let vcpu_count = self.cpu_manager.lock().unwrap().boot_vcpus() as u64;
1277         let vgic = self
1278             .device_manager
1279             .lock()
1280             .unwrap()
1281             .get_interrupt_controller()
1282             .unwrap()
1283             .lock()
1284             .unwrap()
1285             .create_vgic(
1286                 &self.memory_manager.lock().as_ref().unwrap().vm,
1287                 Gic::create_default_config(vcpu_count),
1288             )
1289             .map_err(|_| {
1290                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1291                     arch::aarch64::Error::SetupGic,
1292                 ))
1293             })?;
1294 
1295         // The PMU interrupt is a PPI, so 16 must be added to get the real IRQ number.
1296         let pmu_supported = self
1297             .cpu_manager
1298             .lock()
1299             .unwrap()
1300             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1301             .map_err(|_| {
1302                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1303                     arch::aarch64::Error::VcpuInitPmu,
1304                 ))
1305             })?;
1306 
1307         arch::configure_system(
1308             &mem,
1309             cmdline.as_cstring().unwrap().to_str().unwrap(),
1310             vcpu_mpidrs,
1311             vcpu_topology,
1312             device_info,
1313             &initramfs_config,
1314             &pci_space_info,
1315             virtio_iommu_bdf.map(|bdf| bdf.into()),
1316             &vgic,
1317             &self.numa_nodes,
1318             pmu_supported,
1319         )
1320         .map_err(Error::ConfigureSystem)?;
1321 
1322         // Activate the GIC device
1323         self.device_manager
1324             .lock()
1325             .unwrap()
1326             .get_interrupt_controller()
1327             .unwrap()
1328             .lock()
1329             .unwrap()
1330             .enable()
1331             .map_err(Error::EnableInterruptController)?;
1332 
1333         Ok(())
1334     }
1335 
1336     pub fn serial_pty(&self) -> Option<PtyPair> {
1337         self.device_manager.lock().unwrap().serial_pty()
1338     }
1339 
1340     pub fn console_pty(&self) -> Option<PtyPair> {
1341         self.device_manager.lock().unwrap().console_pty()
1342     }
1343 
1344     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1345         self.device_manager.lock().unwrap().console_resize_pipe()
1346     }
1347 
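    /// Shuts the VM down: restores the terminal to canonical mode (when on a
    /// TTY), stops the signal-handler thread, resumes the devices so their
    /// threads can exit cleanly, shuts down the vCPUs and joins the remaining
    /// VM threads.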
1348     pub fn shutdown(&mut self) -> Result<()> {
1349         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1350         let new_state = VmState::Shutdown;
1351 
1352         state.valid_transition(new_state)?;
1353 
1354         if self.on_tty {
1355             // Don't forget to set the terminal back to canonical mode
1356             // before exiting.
1357             io::stdin()
1358                 .lock()
1359                 .set_canon_mode()
1360                 .map_err(Error::SetTerminalCanon)?;
1361         }
1362 
1363         // Trigger the termination of the signal_handler thread
1364         if let Some(signals) = self.signals.take() {
1365             signals.close();
1366         }
1367 
1368         // Wake up the DeviceManager threads so they will get terminated cleanly
1369         self.device_manager
1370             .lock()
1371             .unwrap()
1372             .resume()
1373             .map_err(Error::Resume)?;
1374 
1375         self.cpu_manager
1376             .lock()
1377             .unwrap()
1378             .shutdown()
1379             .map_err(Error::CpuManager)?;
1380 
1381         // Wait for all the threads to finish
1382         for thread in self.threads.drain(..) {
1383             thread.join().map_err(Error::ThreadCleanup)?
1384         }
1385         *state = new_state;
1386 
1387         event!("vm", "shutdown");
1388 
1389         Ok(())
1390     }
1391 
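    /// Hotplug-resizes the VM: adjusts the number of vCPUs, the amount of
    /// guest RAM (through ACPI or virtio-mem depending on the configured
    /// hotplug method) and/or the balloon size, and updates `VmConfig` so a
    /// reboot keeps the new values.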
1392     pub fn resize(
1393         &mut self,
1394         desired_vcpus: Option<u8>,
1395         desired_memory: Option<u64>,
1396         desired_balloon: Option<u64>,
1397     ) -> Result<()> {
1398         event!("vm", "resizing");
1399 
1400         if let Some(desired_vcpus) = desired_vcpus {
1401             if self
1402                 .cpu_manager
1403                 .lock()
1404                 .unwrap()
1405                 .resize(desired_vcpus)
1406                 .map_err(Error::CpuManager)?
1407             {
1408                 self.device_manager
1409                     .lock()
1410                     .unwrap()
1411                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1412                     .map_err(Error::DeviceManager)?;
1413             }
1414             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1415         }
1416 
1417         if let Some(desired_memory) = desired_memory {
1418             let new_region = self
1419                 .memory_manager
1420                 .lock()
1421                 .unwrap()
1422                 .resize(desired_memory)
1423                 .map_err(Error::MemoryManager)?;
1424 
1425             let mut memory_config = &mut self.config.lock().unwrap().memory;
1426 
1427             if let Some(new_region) = &new_region {
1428                 self.device_manager
1429                     .lock()
1430                     .unwrap()
1431                     .update_memory(new_region)
1432                     .map_err(Error::DeviceManager)?;
1433 
1434                 match memory_config.hotplug_method {
1435                     HotplugMethod::Acpi => {
1436                         self.device_manager
1437                             .lock()
1438                             .unwrap()
1439                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1440                             .map_err(Error::DeviceManager)?;
1441                     }
1442                     HotplugMethod::VirtioMem => {}
1443                 }
1444             }
1445 
1446             // We update the VM config regardless of the actual guest resize
1447             // operation result (happened or not), so that if the VM reboots
1448             // it will be running with the last configured memory size.
1449             match memory_config.hotplug_method {
1450                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1451                 HotplugMethod::VirtioMem => {
1452                     if desired_memory > memory_config.size {
1453                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1454                     } else {
1455                         memory_config.hotplugged_size = None;
1456                     }
1457                 }
1458             }
1459         }
1460 
1461         if let Some(desired_balloon) = desired_balloon {
1462             self.device_manager
1463                 .lock()
1464                 .unwrap()
1465                 .resize_balloon(desired_balloon)
1466                 .map_err(Error::DeviceManager)?;
1467 
1468             // Update the configuration value for the balloon size to ensure
1469             // a reboot would use the right value.
1470             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1471                 balloon_config.size = desired_balloon;
1472             }
1473         }
1474 
1475         event!("vm", "resized");
1476 
1477         Ok(())
1478     }
1479 
1480     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1481         let memory_config = &mut self.config.lock().unwrap().memory;
1482 
1483         if let Some(zones) = &mut memory_config.zones {
1484             for zone in zones.iter_mut() {
1485                 if zone.id == id {
1486                     if desired_memory >= zone.size {
1487                         let hotplugged_size = desired_memory - zone.size;
1488                         self.memory_manager
1489                             .lock()
1490                             .unwrap()
1491                             .resize_zone(&id, desired_memory - zone.size)
1492                             .map_err(Error::MemoryManager)?;
1493                         // We update the memory zone config regardless of the
1494                         // actual 'resize-zone' operation result (happened or
1495                         // not), so that if the VM reboots it will be running
1496                         // with the last configured memory zone size.
1497                         zone.hotplugged_size = Some(hotplugged_size);
1498 
1499                         return Ok(());
1500                     } else {
1501                         error!(
1502                             "Cannot request less memory ({}) than the boot RAM ({}) of \
1503                             this memory zone",
1504                             desired_memory, zone.size,
1505                         );
1506                         return Err(Error::ResizeZone);
1507                     }
1508                 }
1509             }
1510         }
1511 
1512         error!("Could not find the memory zone {} for the resize", id);
1513         Err(Error::ResizeZone)
1514     }
1515 
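    /// Hotplugs a VFIO device: it is added through the DeviceManager, recorded
    /// in the VmConfig so that it is re-created on reboot, and the guest is
    /// notified with an ACPI PCI hotplug event.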
1516     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1517         let pci_device_info = self
1518             .device_manager
1519             .lock()
1520             .unwrap()
1521             .add_device(&mut device_cfg)
1522             .map_err(Error::DeviceManager)?;
1523 
1524         // Update VmConfig by adding the new device. This is important to
1525         // ensure the device would be created in case of a reboot.
1526         {
1527             let mut config = self.config.lock().unwrap();
1528             add_to_config(&mut config.devices, device_cfg);
1529         }
1530 
1531         self.device_manager
1532             .lock()
1533             .unwrap()
1534             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1535             .map_err(Error::DeviceManager)?;
1536 
1537         Ok(pci_device_info)
1538     }
1539 
1540     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1541         let pci_device_info = self
1542             .device_manager
1543             .lock()
1544             .unwrap()
1545             .add_user_device(&mut device_cfg)
1546             .map_err(Error::DeviceManager)?;
1547 
1548         // Update VmConfig by adding the new device. This is important to
1549         // ensure the device would be created in case of a reboot.
1550         {
1551             let mut config = self.config.lock().unwrap();
1552             add_to_config(&mut config.user_devices, device_cfg);
1553         }
1554 
1555         self.device_manager
1556             .lock()
1557             .unwrap()
1558             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1559             .map_err(Error::DeviceManager)?;
1560 
1561         Ok(pci_device_info)
1562     }
1563 
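    /// Hot-unplugs the device identified by `id` and drops it from every
    /// section of the VmConfig it may appear in, so it is not re-created on
    /// reboot. The guest is notified with an ACPI PCI hotplug event.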
1564     pub fn remove_device(&mut self, id: String) -> Result<()> {
1565         self.device_manager
1566             .lock()
1567             .unwrap()
1568             .remove_device(id.clone())
1569             .map_err(Error::DeviceManager)?;
1570 
1571         // Update VmConfig by removing the device. This is important to
1572         // ensure the device would not be created in case of a reboot.
1573         let mut config = self.config.lock().unwrap();
1574 
1575         // Remove if VFIO device
1576         if let Some(devices) = config.devices.as_mut() {
1577             devices.retain(|dev| dev.id.as_ref() != Some(&id));
1578         }
1579 
1580         // Remove if VFIO user device
1581         if let Some(user_devices) = config.user_devices.as_mut() {
1582             user_devices.retain(|dev| dev.id.as_ref() != Some(&id));
1583         }
1584 
1585         // Remove if disk device
1586         if let Some(disks) = config.disks.as_mut() {
1587             disks.retain(|dev| dev.id.as_ref() != Some(&id));
1588         }
1589 
1590         // Remove if fs device
1591         if let Some(fs) = config.fs.as_mut() {
1592             fs.retain(|dev| dev.id.as_ref() != Some(&id));
1593         }
1594 
1595         // Remove if net device
1596         if let Some(net) = config.net.as_mut() {
1597             net.retain(|dev| dev.id.as_ref() != Some(&id));
1598         }
1599 
1600         // Remove if pmem device
1601         if let Some(pmem) = config.pmem.as_mut() {
1602             pmem.retain(|dev| dev.id.as_ref() != Some(&id));
1603         }
1604 
1605         // Remove if vDPA device
1606         if let Some(vdpa) = config.vdpa.as_mut() {
1607             vdpa.retain(|dev| dev.id.as_ref() != Some(&id));
1608         }
1609 
1610         // Remove if vsock device
1611         if let Some(vsock) = config.vsock.as_ref() {
1612             if vsock.id.as_ref() == Some(&id) {
1613                 config.vsock = None;
1614             }
1615         }
1616 
1617         self.device_manager
1618             .lock()
1619             .unwrap()
1620             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1621             .map_err(Error::DeviceManager)?;
1622         Ok(())
1623     }
1624 
1625     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1626         let pci_device_info = self
1627             .device_manager
1628             .lock()
1629             .unwrap()
1630             .add_disk(&mut disk_cfg)
1631             .map_err(Error::DeviceManager)?;
1632 
1633         // Update VmConfig by adding the new device. This is important to
1634         // ensure the device would be created in case of a reboot.
1635         {
1636             let mut config = self.config.lock().unwrap();
1637             add_to_config(&mut config.disks, disk_cfg);
1638         }
1639 
1640         self.device_manager
1641             .lock()
1642             .unwrap()
1643             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1644             .map_err(Error::DeviceManager)?;
1645 
1646         Ok(pci_device_info)
1647     }
1648 
1649     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1650         let pci_device_info = self
1651             .device_manager
1652             .lock()
1653             .unwrap()
1654             .add_fs(&mut fs_cfg)
1655             .map_err(Error::DeviceManager)?;
1656 
1657         // Update VmConfig by adding the new device. This is important to
1658         // ensure the device would be created in case of a reboot.
1659         {
1660             let mut config = self.config.lock().unwrap();
1661             add_to_config(&mut config.fs, fs_cfg);
1662         }
1663 
1664         self.device_manager
1665             .lock()
1666             .unwrap()
1667             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1668             .map_err(Error::DeviceManager)?;
1669 
1670         Ok(pci_device_info)
1671     }
1672 
1673     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1674         let pci_device_info = self
1675             .device_manager
1676             .lock()
1677             .unwrap()
1678             .add_pmem(&mut pmem_cfg)
1679             .map_err(Error::DeviceManager)?;
1680 
1681         // Update VmConfig by adding the new device. This is important to
1682         // ensure the device would be created in case of a reboot.
1683         {
1684             let mut config = self.config.lock().unwrap();
1685             add_to_config(&mut config.pmem, pmem_cfg);
1686         }
1687 
1688         self.device_manager
1689             .lock()
1690             .unwrap()
1691             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1692             .map_err(Error::DeviceManager)?;
1693 
1694         Ok(pci_device_info)
1695     }
1696 
1697     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1698         let pci_device_info = self
1699             .device_manager
1700             .lock()
1701             .unwrap()
1702             .add_net(&mut net_cfg)
1703             .map_err(Error::DeviceManager)?;
1704 
1705         // Update VmConfig by adding the new device. This is important to
1706         // ensure the device would be created in case of a reboot.
1707         {
1708             let mut config = self.config.lock().unwrap();
1709             add_to_config(&mut config.net, net_cfg);
1710         }
1711 
1712         self.device_manager
1713             .lock()
1714             .unwrap()
1715             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1716             .map_err(Error::DeviceManager)?;
1717 
1718         Ok(pci_device_info)
1719     }
1720 
1721     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1722         let pci_device_info = self
1723             .device_manager
1724             .lock()
1725             .unwrap()
1726             .add_vdpa(&mut vdpa_cfg)
1727             .map_err(Error::DeviceManager)?;
1728 
1729         // Update VmConfig by adding the new device. This is important to
1730         // ensure the device would be created in case of a reboot.
1731         {
1732             let mut config = self.config.lock().unwrap();
1733             add_to_config(&mut config.vdpa, vdpa_cfg);
1734         }
1735 
1736         self.device_manager
1737             .lock()
1738             .unwrap()
1739             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1740             .map_err(Error::DeviceManager)?;
1741 
1742         Ok(pci_device_info)
1743     }
1744 
1745     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1746         let pci_device_info = self
1747             .device_manager
1748             .lock()
1749             .unwrap()
1750             .add_vsock(&mut vsock_cfg)
1751             .map_err(Error::DeviceManager)?;
1752 
1753         // Update VmConfig by adding the new device. This is important to
1754         // ensure the device would be created in case of a reboot.
1755         {
1756             let mut config = self.config.lock().unwrap();
1757             config.vsock = Some(vsock_cfg);
1758         }
1759 
1760         self.device_manager
1761             .lock()
1762             .unwrap()
1763             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1764             .map_err(Error::DeviceManager)?;
1765 
1766         Ok(pci_device_info)
1767     }
1768 
1769     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1770         Ok(self.device_manager.lock().unwrap().counters())
1771     }
1772 
1773     fn signal_handler(mut signals: Signals, console_input_clone: Arc<Console>) {
1774         for sig in &Vm::HANDLED_SIGNALS {
1775             unblock_signal(*sig).unwrap();
1776         }
1777 
1778         for signal in signals.forever() {
1779             if signal == SIGWINCH {
1780                 console_input_clone.update_console_size();
1781             }
1782         }
1783     }
1784 
1785     #[cfg(feature = "tdx")]
1786     fn init_tdx(&mut self) -> Result<()> {
1787         let cpuid = self.cpu_manager.lock().unwrap().common_cpuid();
1788         let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32;
1789         self.vm
1790             .tdx_init(&cpuid, max_vcpus)
1791             .map_err(Error::InitializeTdxVm)?;
1792         Ok(())
1793     }
1794 
1795     #[cfg(feature = "tdx")]
1796     fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> {
1797         use arch::x86_64::tdx::*;
1798 
1799         let firmware_path = self
1800             .config
1801             .lock()
1802             .unwrap()
1803             .payload
1804             .as_ref()
1805             .unwrap()
1806             .firmware
1807             .clone()
1808             .ok_or(Error::TdxFirmwareMissing)?;
1809         // The TDVF file contains a table of sections as well as code
1810         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1811 
1812         // For all the sections allocate some RAM backing them
1813         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1814     }
1815 
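    // Builds the (start, size, is_ram) list of memory resources to report in
    // the TD HOB. The TDVF sections (passed sorted by descending address and
    // popped from the end of the vector, i.e. processed in ascending order)
    // are interleaved with the guest RAM regions: ranges covered by a section
    // are reported as non-RAM, the gaps in between as RAM.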
1816     #[cfg(feature = "tdx")]
1817     fn hob_memory_resources(
1818         mut sorted_sections: Vec<TdvfSection>,
1819         guest_memory: &GuestMemoryMmap,
1820     ) -> Vec<(u64, u64, bool)> {
1821         let mut list = Vec::new();
1822 
1823         let mut current_section = sorted_sections.pop();
1824 
1825         // RAM regions interleaved with TDVF sections
1826         let mut next_start_addr = 0;
1827         for region in guest_memory.iter() {
1828             let region_start = region.start_addr().0;
1829             let region_end = region.last_addr().0;
1830             if region_start > next_start_addr {
1831                 next_start_addr = region_start;
1832             }
1833 
1834             loop {
1835                 let (start, size, ram) = if let Some(section) = &current_section {
1836                     if section.address <= next_start_addr {
1837                         (section.address, section.size, false)
1838                     } else {
1839                         let last_addr = std::cmp::min(section.address - 1, region_end);
1840                         (next_start_addr, last_addr - next_start_addr + 1, true)
1841                     }
1842                 } else {
1843                     (next_start_addr, region_end - next_start_addr + 1, true)
1844                 };
1845 
1846                 list.push((start, size, ram));
1847 
1848                 if !ram {
1849                     current_section = sorted_sections.pop();
1850                 }
1851 
1852                 next_start_addr = start + size;
1853 
1854                 if region_start > next_start_addr {
1855                     next_start_addr = region_start;
1856                 }
1857 
1858                 if next_start_addr > region_end {
1859                     break;
1860                 }
1861             }
1862         }
1863 
1864         // Once all the interleaved sections have been processed, let's simply
1865         // pull the remaining ones.
1866         if let Some(section) = current_section {
1867             list.push((section.address, section.size, false));
1868         }
1869         while let Some(section) = sorted_sections.pop() {
1870             list.push((section.address, section.size, false));
1871         }
1872 
1873         list
1874     }
1875 
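    // Copies the TDVF firmware volumes, the optional payload (bzImage) and its
    // command line into guest memory, then builds the TD HOB describing memory
    // resources, MMIO ranges and ACPI tables. Returns the guest address of the
    // TD HOB section, if the firmware provides one.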
1876     #[cfg(feature = "tdx")]
1877     fn populate_tdx_sections(&mut self, sections: &[TdvfSection]) -> Result<Option<u64>> {
1878         use arch::x86_64::tdx::*;
1879         // Get the memory end *before* we start adding TDVF ram regions
1880         let boot_guest_memory = self
1881             .memory_manager
1882             .lock()
1883             .as_ref()
1884             .unwrap()
1885             .boot_guest_memory();
1886         for section in sections {
1887             // No need to allocate if the section falls within guest RAM ranges
1888             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1889                 info!(
1890                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1891                     section
1892                 );
1893                 continue;
1894             }
1895 
1896             info!("Allocating TDVF Section: {:x?}", section);
1897             self.memory_manager
1898                 .lock()
1899                 .unwrap()
1900                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1901                 .map_err(Error::AllocatingTdvfMemory)?;
1902         }
1903 
1904         // The TDVF file contains a table of sections as well as code
1905         let firmware_path = self
1906             .config
1907             .lock()
1908             .unwrap()
1909             .payload
1910             .as_ref()
1911             .unwrap()
1912             .firmware
1913             .clone()
1914             .ok_or(Error::TdxFirmwareMissing)?;
1915         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1916 
1917         // The guest memory at this point now has all the required regions so it
1918         // is safe to copy from the TDVF file into it.
1919         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1920         let mem = guest_memory.memory();
1921         let mut payload_info = None;
1922         let mut hob_offset = None;
1923         for section in sections {
1924             info!("Populating TDVF Section: {:x?}", section);
1925             match section.r#type {
1926                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1927                     info!("Copying section to guest memory");
1928                     firmware_file
1929                         .seek(SeekFrom::Start(section.data_offset as u64))
1930                         .map_err(Error::LoadTdvf)?;
1931                     mem.read_from(
1932                         GuestAddress(section.address),
1933                         &mut firmware_file,
1934                         section.data_size as usize,
1935                     )
1936                     .unwrap();
1937                 }
1938                 TdvfSectionType::TdHob => {
1939                     hob_offset = Some(section.address);
1940                 }
1941                 TdvfSectionType::Payload => {
1942                     info!("Copying payload to guest memory");
1943                     if let Some(payload_file) = self.kernel.as_mut() {
1944                         let payload_size = payload_file
1945                             .seek(SeekFrom::End(0))
1946                             .map_err(Error::LoadPayload)?;
1947 
1948                         payload_file
1949                             .seek(SeekFrom::Start(0x1f1))
1950                             .map_err(Error::LoadPayload)?;
1951 
1952                         let mut payload_header = linux_loader::bootparam::setup_header::default();
1953                         payload_header
1954                             .as_bytes()
1955                             .read_from(
1956                                 0,
1957                                 payload_file,
1958                                 mem::size_of::<linux_loader::bootparam::setup_header>(),
1959                             )
1960                             .unwrap();
1961 
1962                         if payload_header.header != 0x5372_6448 {
1963                             return Err(Error::InvalidPayloadType);
1964                         }
1965 
1966                         if (payload_header.version < 0x0200)
1967                             || ((payload_header.loadflags & 0x1) == 0x0)
1968                         {
1969                             return Err(Error::InvalidPayloadType);
1970                         }
1971 
1972                         payload_file
1973                             .seek(SeekFrom::Start(0))
1974                             .map_err(Error::LoadPayload)?;
1975                         mem.read_from(
1976                             GuestAddress(section.address),
1977                             payload_file,
1978                             payload_size as usize,
1979                         )
1980                         .unwrap();
1981 
1982                         // Create the payload info that will be inserted into
1983                         // the HOB.
1984                         payload_info = Some(PayloadInfo {
1985                             image_type: PayloadImageType::BzImage,
1986                             entry_point: section.address,
1987                         });
1988                     }
1989                 }
1990                 TdvfSectionType::PayloadParam => {
1991                     info!("Copying payload parameters to guest memory");
1992                     let cmdline = Self::generate_cmdline(
1993                         self.config.lock().unwrap().payload.as_ref().unwrap(),
1994                     )?;
1995                     mem.write_slice(
1996                         cmdline.as_cstring().unwrap().as_bytes_with_nul(),
1997                         GuestAddress(section.address),
1998                     )
1999                     .unwrap();
2000                 }
2001                 _ => {}
2002             }
2003         }
2004 
2005         // Generate HOB
2006         let mut hob = TdHob::start(hob_offset.unwrap());
2007 
2008         let mut sorted_sections = sections.to_vec();
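        // Only the TempMem sections are reported as non-RAM ranges interleaved
        // with guest RAM; they are sorted by descending address so that
        // hob_memory_resources() can pop them in ascending order.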
2009         sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));
2010 
2011         sorted_sections.sort_by_key(|section| section.address);
2012         sorted_sections.reverse();
2013 
2014         for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
2015             hob.add_memory_resource(&mem, start, size, ram)
2016                 .map_err(Error::PopulateHob)?;
2017         }
2018 
2019         // MMIO regions
2020         hob.add_mmio_resource(
2021             &mem,
2022             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
2023             arch::layout::APIC_START.raw_value()
2024                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
2025         )
2026         .map_err(Error::PopulateHob)?;
2027         let start_of_device_area = self
2028             .memory_manager
2029             .lock()
2030             .unwrap()
2031             .start_of_device_area()
2032             .raw_value();
2033         let end_of_device_area = self
2034             .memory_manager
2035             .lock()
2036             .unwrap()
2037             .end_of_device_area()
2038             .raw_value();
2039         hob.add_mmio_resource(
2040             &mem,
2041             start_of_device_area,
2042             end_of_device_area - start_of_device_area,
2043         )
2044         .map_err(Error::PopulateHob)?;
2045 
2046         // Loop over the ACPI tables and copy them to the HOB.
2047 
2048         for acpi_table in crate::acpi::create_acpi_tables_tdx(
2049             &self.device_manager,
2050             &self.cpu_manager,
2051             &self.memory_manager,
2052             &self.numa_nodes,
2053         ) {
2054             hob.add_acpi_table(&mem, acpi_table.as_slice())
2055                 .map_err(Error::PopulateHob)?;
2056         }
2057 
2058         // If a payload info has been created, let's insert it into the HOB.
2059         if let Some(payload_info) = payload_info {
2060             hob.add_payload(&mem, payload_info)
2061                 .map_err(Error::PopulateHob)?;
2062         }
2063 
2064         hob.finish(&mem).map_err(Error::PopulateHob)?;
2065 
2066         Ok(hob_offset)
2067     }
2068 
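    // Registers each TDVF section with the hypervisor so that the backing host
    // memory is added to the TD, passing along whether the section has the
    // TDVF_SECTION_ATTRIBUTES_EXTENDMR attribute set.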
2069     #[cfg(feature = "tdx")]
2070     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
2071         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2072         let mem = guest_memory.memory();
2073 
2074         for section in sections {
2075             self.vm
2076                 .tdx_init_memory_region(
2077                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
2078                     section.address,
2079                     section.size,
2080                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
2081                     section.attributes == 1,
2082                 )
2083                 .map_err(Error::InitializeTdxMemoryRegion)?;
2084         }
2085 
2086         Ok(())
2087     }
2088 
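    // Spawns a dedicated thread, with its own seccomp filter applied, that
    // waits for the signals in Vm::HANDLED_SIGNALS; on SIGWINCH the new
    // terminal size is propagated to the guest console.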
2089     fn setup_signal_handler(&mut self) -> Result<()> {
2090         let console = self.device_manager.lock().unwrap().console().clone();
2091         let signals = Signals::new(Vm::HANDLED_SIGNALS);
2092         match signals {
2093             Ok(signals) => {
2094                 self.signals = Some(signals.handle());
2095                 let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?;
2096                 let signal_handler_seccomp_filter = get_seccomp_filter(
2097                     &self.seccomp_action,
2098                     Thread::SignalHandler,
2099                     self.hypervisor.hypervisor_type(),
2100                 )
2101                 .map_err(Error::CreateSeccompFilter)?;
2102                 self.threads.push(
2103                     thread::Builder::new()
2104                         .name("vm_signal_handler".to_string())
2105                         .spawn(move || {
2106                             if !signal_handler_seccomp_filter.is_empty() {
2107                                 if let Err(e) = apply_filter(&signal_handler_seccomp_filter)
2108                                     .map_err(Error::ApplySeccompFilter)
2109                                 {
2110                                     error!("Error applying seccomp filter: {:?}", e);
2111                                     exit_evt.write(1).ok();
2112                                     return;
2113                                 }
2114                             }
2115                             std::panic::catch_unwind(AssertUnwindSafe(|| {
2116                                 Vm::signal_handler(signals, console);
2117                             }))
2118                             .map_err(|_| {
2119                                 error!("signal_handler thread panicked");
2120                                 exit_evt.write(1).ok()
2121                             })
2122                             .ok();
2123                         })
2124                         .map_err(Error::SignalHandlerSpawn)?,
2125                 );
2126             }
2127             Err(e) => error!("Failed to register the handled signals: {}", e),
2128         }
2129         Ok(())
2130     }
2131 
2132     fn setup_tty(&self) -> Result<()> {
2133         if self.on_tty {
2134             io::stdin()
2135                 .lock()
2136                 .set_raw_mode()
2137                 .map_err(Error::SetTerminalRaw)?;
2138         }
2139 
2140         Ok(())
2141     }
2142 
2143     // Creates the ACPI tables.
2144     // When TDX is enabled, this is a no-op since the tables will be created
2145     // and passed to the guest when populating the HOB.
2147     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2148         #[cfg(feature = "tdx")]
2149         if self.config.lock().unwrap().is_tdx_enabled() {
2150             return None;
2151         }
2152 
2153         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2154 
2155         let rsdp_addr = crate::acpi::create_acpi_tables(
2156             &mem,
2157             &self.device_manager,
2158             &self.cpu_manager,
2159             &self.memory_manager,
2160             &self.numa_nodes,
2161         );
2162         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2163 
2164         Some(rsdp_addr)
2165     }
2166 
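    // Returns the payload entry point, joining the asynchronous kernel loading
    // thread if one was spawned; returns None when there is no pending load.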
2167     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2168         trace_scoped!("entry_point");
2169 
2170         self.load_payload_handle
2171             .take()
2172             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2173             .transpose()
2174     }
2175 
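    /// Boots the VM (or resumes it if it was paused): ACPI tables are created,
    /// the payload entry point is resolved, vCPUs are created and configured
    /// (including the TDX-specific setup when enabled) and finally started,
    /// moving the VM to the Running (or BreakPoint) state.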
2176     pub fn boot(&mut self) -> Result<()> {
2177         trace_scoped!("Vm::boot");
2178         info!("Booting VM");
2179         event!("vm", "booting");
2180         let current_state = self.get_state()?;
2181         if current_state == VmState::Paused {
2182             return self.resume().map_err(Error::Resume);
2183         }
2184 
2185         let new_state = if self.stop_on_boot {
2186             VmState::BreakPoint
2187         } else {
2188             VmState::Running
2189         };
2190         current_state.valid_transition(new_state)?;
2191 
2192         // Do this early to parallelise with loading the kernel
2193         #[cfg(target_arch = "x86_64")]
2194         let rsdp_addr = self.create_acpi_tables();
2195 
2196         self.setup_signal_handler()?;
2197         self.setup_tty()?;
2198 
2199         // Load the kernel synchronously, or, if it is being loaded
2200         // asynchronously, wait for the load to finish.
2201         let entry_point = self.entry_point()?;
2202 
2203         #[cfg(feature = "tdx")]
2204         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2205 
2206         // The initial TDX configuration must be done before the vCPUs are
2207         // created
2208         #[cfg(feature = "tdx")]
2209         if tdx_enabled {
2210             self.init_tdx()?;
2211         }
2212 
2213         // Create and configure vcpus
2214         self.cpu_manager
2215             .lock()
2216             .unwrap()
2217             .create_boot_vcpus(entry_point)
2218             .map_err(Error::CpuManager)?;
2219 
2220         #[cfg(feature = "tdx")]
2221         let sections = if tdx_enabled {
2222             self.extract_tdvf_sections()?
2223         } else {
2224             Vec::new()
2225         };
2226 
2227         // Configuring the TDX regions requires that the vCPUs are created.
2228         #[cfg(feature = "tdx")]
2229         let hob_address = if tdx_enabled {
2230             // TDX sections are written to memory.
2231             self.populate_tdx_sections(&sections)?
2232         } else {
2233             None
2234         };
2235 
2236         // On aarch64 the ACPI tables depend on the vCPU MPIDR, which is only
2237         // available after the vCPUs have been configured.
2238         #[cfg(target_arch = "aarch64")]
2239         let rsdp_addr = self.create_acpi_tables();
2240 
2241         // Configure shared state based on loaded kernel
2242         entry_point
2243             .map(|_| {
2244                 // Safe to unwrap rsdp_addr as we know it can't be None when
2245                 // the entry_point is Some.
2246                 self.configure_system(rsdp_addr.unwrap())
2247             })
2248             .transpose()?;
2249 
2250         #[cfg(feature = "tdx")]
2251         if let Some(hob_address) = hob_address {
2252             // With the HOB address extracted the vCPUs can have
2253             // their TDX state configured.
2254             self.cpu_manager
2255                 .lock()
2256                 .unwrap()
2257                 .initialize_tdx(hob_address)
2258                 .map_err(Error::CpuManager)?;
2259             // Let the hypervisor know which memory ranges are shared with the
2260             // guest. This prevents the guest from ignoring/discarding memory
2261             // regions provided by the host.
2262             self.init_tdx_memory(&sections)?;
2263             // With TDX memory and CPU state configured TDX setup is complete
2264             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
2265         }
2266 
2267         self.cpu_manager
2268             .lock()
2269             .unwrap()
2270             .start_boot_vcpus(new_state == VmState::BreakPoint)
2271             .map_err(Error::CpuManager)?;
2272 
2273         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
2274         *state = new_state;
2275         event!("vm", "booted");
2276         Ok(())
2277     }
2278 
2279     /// Gets a thread-safe reference counted pointer to the VM configuration.
2280     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2281         Arc::clone(&self.config)
2282     }
2283 
2284     /// Get the VM state. Returns an error if the state is poisoned.
2285     pub fn get_state(&self) -> Result<VmState> {
2286         self.state
2287             .try_read()
2288             .map_err(|_| Error::PoisonedState)
2289             .map(|state| *state)
2290     }
2291 
2292     /// Load saved clock from snapshot
2293     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2294     pub fn load_clock_from_snapshot(
2295         &mut self,
2296         snapshot: &Snapshot,
2297     ) -> Result<Option<hypervisor::ClockData>> {
2298         use crate::migration::get_vm_snapshot;
2299         let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
2300         self.saved_clock = vm_snapshot.clock;
2301         Ok(self.saved_clock)
2302     }
2303 
2304     #[cfg(target_arch = "aarch64")]
2305     /// Add the vGIC section to the VM snapshot.
2306     fn add_vgic_snapshot_section(
2307         &self,
2308         vm_snapshot: &mut Snapshot,
2309     ) -> std::result::Result<(), MigratableError> {
2310         let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
2311         self.device_manager
2312             .lock()
2313             .unwrap()
2314             .get_interrupt_controller()
2315             .unwrap()
2316             .lock()
2317             .unwrap()
2318             .set_gicr_typers(&saved_vcpu_states);
2319 
2320         vm_snapshot.add_snapshot(
2321             self.device_manager
2322                 .lock()
2323                 .unwrap()
2324                 .get_interrupt_controller()
2325                 .unwrap()
2326                 .lock()
2327                 .unwrap()
2328                 .snapshot()?,
2329         );
2330 
2331         Ok(())
2332     }
2333 
2334     #[cfg(target_arch = "aarch64")]
2335     /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing.
2336     fn restore_vgic_and_enable_interrupt(
2337         &self,
2338         vm_snapshot: &Snapshot,
2339     ) -> std::result::Result<(), MigratableError> {
2340         let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
2341         // The number of vCPUs is the same as the number of saved vCPU states.
2342         let vcpu_numbers = saved_vcpu_states.len();
2343 
2344         // Creating a GIC device here, as the GIC will not be created when
2345         // restoring the device manager. Note that currently only the bare GICv3
2346         // without ITS is supported.
2347         let vcpu_count = vcpu_numbers.try_into().unwrap();
2348         self.device_manager
2349             .lock()
2350             .unwrap()
2351             .get_interrupt_controller()
2352             .unwrap()
2353             .lock()
2354             .unwrap()
2355             .create_vgic(&self.vm, Gic::create_default_config(vcpu_count))
2356             .map_err(|e| MigratableError::Restore(anyhow!("Could not create GIC: {:#?}", e)))?;
2357 
2358         // The PMU interrupt is a PPI, so 16 must be added to it to get the real IRQ number.
2359         self.cpu_manager
2360             .lock()
2361             .unwrap()
2362             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
2363             .map_err(|e| MigratableError::Restore(anyhow!("Error init PMU: {:?}", e)))?;
2364 
2365         // Here we prepare the GICR_TYPER registers from the restored vCPU states.
2366         self.device_manager
2367             .lock()
2368             .unwrap()
2369             .get_interrupt_controller()
2370             .unwrap()
2371             .lock()
2372             .unwrap()
2373             .set_gicr_typers(&saved_vcpu_states);
2374 
2375         // Restore GIC states.
2376         if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) {
2377             self.device_manager
2378                 .lock()
2379                 .unwrap()
2380                 .get_interrupt_controller()
2381                 .unwrap()
2382                 .lock()
2383                 .unwrap()
2384                 .restore(*gicv3_its_snapshot.clone())?;
2385         } else {
2386             return Err(MigratableError::Restore(anyhow!(
2387                 "Missing GicV3Its snapshot"
2388             )));
2389         }
2390 
2391         // Activate gic device
2392         self.device_manager
2393             .lock()
2394             .unwrap()
2395             .get_interrupt_controller()
2396             .unwrap()
2397             .lock()
2398             .unwrap()
2399             .enable()
2400             .map_err(|e| {
2401                 MigratableError::Restore(anyhow!(
2402                     "Could not enable interrupt controller routing: {:#?}",
2403                     e
2404                 ))
2405             })?;
2406 
2407         Ok(())
2408     }
2409 
2410     /// Gets the actual size of the balloon.
2411     pub fn balloon_size(&self) -> u64 {
2412         self.device_manager.lock().unwrap().balloon_size()
2413     }
2414 
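    /// Reads the memory ranges described by `ranges` from `fd` directly into
    /// guest memory, typically on the receiving side of a live migration.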
2415     pub fn receive_memory_regions<F>(
2416         &mut self,
2417         ranges: &MemoryRangeTable,
2418         fd: &mut F,
2419     ) -> std::result::Result<(), MigratableError>
2420     where
2421         F: Read,
2422     {
2423         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2424         let mem = guest_memory.memory();
2425 
2426         for range in ranges.regions() {
2427             let mut offset: u64 = 0;
2428             // Here we manually handle the retry in case we can't read the
2429             // whole region at once, because we can't use the read_exact_from()
2430             // implementation from vm-memory::GuestMemory as it does not follow
2431             // the correct behavior. For more info about this issue
2432             // see: https://github.com/rust-vmm/vm-memory/issues/174
2433             loop {
2434                 let bytes_read = mem
2435                     .read_from(
2436                         GuestAddress(range.gpa + offset),
2437                         fd,
2438                         (range.length - offset) as usize,
2439                     )
2440                     .map_err(|e| {
2441                         MigratableError::MigrateReceive(anyhow!(
2442                             "Error receiving memory from socket: {}",
2443                             e
2444                         ))
2445                     })?;
2446                 offset += bytes_read as u64;
2447 
2448                 if offset == range.length {
2449                     break;
2450                 }
2451             }
2452         }
2453 
2454         Ok(())
2455     }
2456 
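    /// Sends the file descriptor backing each guest memory slot over the
    /// migration socket (one memory_fd request per slot, each acknowledged by
    /// the destination), allowing a local destination to map the same memory.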
2457     pub fn send_memory_fds(
2458         &mut self,
2459         socket: &mut UnixStream,
2460     ) -> std::result::Result<(), MigratableError> {
2461         for (slot, fd) in self
2462             .memory_manager
2463             .lock()
2464             .unwrap()
2465             .memory_slot_fds()
2466             .drain()
2467         {
2468             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2469                 .write_to(socket)
2470                 .map_err(|e| {
2471                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2472                 })?;
2473             socket
2474                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2475                 .map_err(|e| {
2476                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2477                 })?;
2478 
2479             let res = Response::read_from(socket)?;
2480             if res.status() != Status::Ok {
2481                 warn!("Error during memory fd migration");
2482                 Request::abandon().write_to(socket)?;
2483                 Response::read_from(socket).ok();
2484                 return Err(MigratableError::MigrateSend(anyhow!(
2485                     "Error during memory fd migration"
2486                 )));
2487             }
2488         }
2489 
2490         Ok(())
2491     }
2492 
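    /// Writes the memory ranges described by `ranges` from guest memory to
    /// `fd`, the sending counterpart of receive_memory_regions().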
2493     pub fn send_memory_regions<F>(
2494         &mut self,
2495         ranges: &MemoryRangeTable,
2496         fd: &mut F,
2497     ) -> std::result::Result<(), MigratableError>
2498     where
2499         F: Write,
2500     {
2501         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2502         let mem = guest_memory.memory();
2503 
2504         for range in ranges.regions() {
2505             let mut offset: u64 = 0;
2506             // Here we manually handle the retry in case we can't write the
2507             // whole region at once, because we can't use the write_all_to()
2508             // implementation from vm-memory::GuestMemory as it does not follow
2509             // the correct behavior. For more info about this issue
2510             // see: https://github.com/rust-vmm/vm-memory/issues/174
2511             loop {
2512                 let bytes_written = mem
2513                     .write_to(
2514                         GuestAddress(range.gpa + offset),
2515                         fd,
2516                         (range.length - offset) as usize,
2517                     )
2518                     .map_err(|e| {
2519                         MigratableError::MigrateSend(anyhow!(
2520                             "Error transferring memory to socket: {}",
2521                             e
2522                         ))
2523                     })?;
2524                 offset += bytes_written as u64;
2525 
2526                 if offset == range.length {
2527                     break;
2528                 }
2529             }
2530         }
2531 
2532         Ok(())
2533     }
2534 
2535     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2536         self.memory_manager
2537             .lock()
2538             .unwrap()
2539             .memory_range_table(false)
2540     }
2541 
2542     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2543         self.device_manager.lock().unwrap().device_tree()
2544     }
2545 
2546     pub fn activate_virtio_devices(&self) -> Result<()> {
2547         self.device_manager
2548             .lock()
2549             .unwrap()
2550             .activate_virtio_devices()
2551             .map_err(Error::ActivateVirtioDevices)
2552     }
2553 
2554     #[cfg(target_arch = "x86_64")]
2555     pub fn power_button(&self) -> Result<()> {
2556         self.device_manager
2557             .lock()
2558             .unwrap()
2559             .notify_power_button()
2560             .map_err(Error::PowerButton)
2562     }
2563 
2564     #[cfg(target_arch = "aarch64")]
2565     pub fn power_button(&self) -> Result<()> {
2566         self.device_manager
2567             .lock()
2568             .unwrap()
2569             .notify_power_button()
2570             .map_err(Error::PowerButton)
2571     }
2572 
2573     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2574         self.memory_manager.lock().unwrap().snapshot_data()
2575     }
2576 
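    // Dispatches a single GDB request to the VM and its vCPUs. Requests that
    // produce data (register reads, memory reads, the active vCPU count)
    // return a dedicated payload; all others return CommandComplete.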
2577     #[cfg(feature = "guest_debug")]
2578     pub fn debug_request(
2579         &mut self,
2580         gdb_request: &GdbRequestPayload,
2581         cpu_id: usize,
2582     ) -> Result<GdbResponsePayload> {
2583         use GdbRequestPayload::*;
2584         match gdb_request {
2585             SetSingleStep(single_step) => {
2586                 self.set_guest_debug(cpu_id, &[], *single_step)
2587                     .map_err(Error::Debug)?;
2588             }
2589             SetHwBreakPoint(addrs) => {
2590                 self.set_guest_debug(cpu_id, addrs, false)
2591                     .map_err(Error::Debug)?;
2592             }
2593             Pause => {
2594                 self.debug_pause().map_err(Error::Debug)?;
2595             }
2596             Resume => {
2597                 self.debug_resume().map_err(Error::Debug)?;
2598             }
2599             ReadRegs => {
2600                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2601                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2602             }
2603             WriteRegs(regs) => {
2604                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2605             }
2606             ReadMem(vaddr, len) => {
2607                 let mem = self.read_mem(cpu_id, *vaddr, *len).map_err(Error::Debug)?;
2608                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2609             }
2610             WriteMem(vaddr, data) => {
2611                 self.write_mem(cpu_id, vaddr, data).map_err(Error::Debug)?;
2612             }
2613             ActiveVcpus => {
2614                 let active_vcpus = self.active_vcpus();
2615                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2616             }
2617         }
2618         Ok(GdbResponsePayload::CommandComplete)
2619     }
2620 
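    // Prepares everything needed to write a guest core dump: the destination
    // file is created and the ELF layout (program header count, note size and
    // the offset at which memory contents start) is computed from the number
    // of guest RAM mappings.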
2621     #[cfg(feature = "guest_debug")]
2622     fn get_dump_state(
2623         &mut self,
2624         destination_url: &str,
2625     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2626         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2627         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2628         let mut elf_phdr_num = 1;
2629         let elf_sh_info = 0;
2630         let coredump_file_path = url_to_file(destination_url)?;
2631         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2632 
2633         if mapping_num < UINT16_MAX - 2 {
2634             elf_phdr_num += mapping_num as u16;
2635         } else {
2636             panic!("mapping num beyond 65535 not supported");
2637         }
2638         let coredump_file = OpenOptions::new()
2639             .read(true)
2640             .write(true)
2641             .create_new(true)
2642             .open(coredump_file_path)
2643             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2644 
2645         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2646         let mem_data = self
2647             .memory_manager
2648             .lock()
2649             .unwrap()
2650             .coredump_memory_regions(mem_offset);
2651 
2652         Ok(DumpState {
2653             elf_note_size,
2654             elf_phdr_num,
2655             elf_sh_info,
2656             mem_offset,
2657             mem_info: Some(mem_data),
2658             file: Some(coredump_file),
2659         })
2660     }
2661 
2662     #[cfg(feature = "guest_debug")]
2663     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2664         size_of::<elf::Elf64_Ehdr>() as u64
2665             + note_size as u64
2666             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2667     }
2668 }
2669 
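// Pausing stops the vCPUs and devices (saving the KVM clock on x86_64), while
// resuming restores the clock and restarts them; both transitions are checked
// against the VM state machine.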
2670 impl Pausable for Vm {
2671     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2672         event!("vm", "pausing");
2673         let mut state = self
2674             .state
2675             .try_write()
2676             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2677         let new_state = VmState::Paused;
2678 
2679         state
2680             .valid_transition(new_state)
2681             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2682 
2683         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2684         {
2685             let mut clock = self
2686                 .vm
2687                 .get_clock()
2688                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2689             clock.reset_flags();
2690             self.saved_clock = Some(clock);
2691         }
2692 
2693         // Before pausing the vCPUs, activate any virtio devices whose activation
2694         // became pending since the pause (or e.g. the migration it is part of) started.
2695         self.activate_virtio_devices().map_err(|e| {
2696             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2697         })?;
2698 
2699         self.cpu_manager.lock().unwrap().pause()?;
2700         self.device_manager.lock().unwrap().pause()?;
2701 
2702         *state = new_state;
2703 
2704         event!("vm", "paused");
2705         Ok(())
2706     }
2707 
2708     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2709         event!("vm", "resuming");
2710         let mut state = self
2711             .state
2712             .try_write()
2713             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2714         let new_state = VmState::Running;
2715 
2716         state
2717             .valid_transition(new_state)
2718             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2719 
2720         self.cpu_manager.lock().unwrap().resume()?;
2721         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2722         {
2723             if let Some(clock) = &self.saved_clock {
2724                 self.vm.set_clock(clock).map_err(|e| {
2725                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2726                 })?;
2727             }
2728         }
2729         self.device_manager.lock().unwrap().resume()?;
2730 
2731         // And we're back to the Running state.
2732         *state = new_state;
2733         event!("vm", "resumed");
2734         Ok(())
2735     }
2736 }
2737 
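/// VM-level snapshot payload stored next to the per-component snapshots: the
/// saved guest clock and the common CPUID generated for this VM (both
/// x86_64/KVM only).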
2738 #[derive(Serialize, Deserialize)]
2739 pub struct VmSnapshot {
2740     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2741     pub clock: Option<hypervisor::ClockData>,
2742     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2743     pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
2744 }
2745 
2746 pub const VM_SNAPSHOT_ID: &str = "vm";
2747 impl Snapshottable for Vm {
2748     fn id(&self) -> String {
2749         VM_SNAPSHOT_ID.to_string()
2750     }
2751 
2752     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2753         event!("vm", "snapshotting");
2754 
2755         #[cfg(feature = "tdx")]
2756         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2757 
2758         #[cfg(feature = "tdx")]
2759         {
2760             if tdx_enabled {
2761                 return Err(MigratableError::Snapshot(anyhow!(
2762                     "Snapshot not possible with TDX VM"
2763                 )));
2764             }
2765         }
2766 
2767         let current_state = self.get_state().unwrap();
2768         if current_state != VmState::Paused {
2769             return Err(MigratableError::Snapshot(anyhow!(
2770                 "Trying to snapshot while VM is running"
2771             )));
2772         }
2773 
2774         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2775         let common_cpuid = {
2776             let phys_bits = physical_bits(self.config.lock().unwrap().cpus.max_phys_bits);
2777             arch::generate_common_cpuid(
2778                 self.hypervisor.clone(),
2779                 None,
2780                 None,
2781                 phys_bits,
2782                 self.config.lock().unwrap().cpus.kvm_hyperv,
2783                 #[cfg(feature = "tdx")]
2784                 tdx_enabled,
2785             )
2786             .map_err(|e| {
2787                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2788             })?
2789         };
2790 
2791         let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID);
2792         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2793             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2794             clock: self.saved_clock,
2795             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2796             common_cpuid,
2797         })
2798         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2799 
2800         vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?);
2801         vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?);
2802 
2803         #[cfg(target_arch = "aarch64")]
2804         self.add_vgic_snapshot_section(&mut vm_snapshot)
2805             .map_err(|e| MigratableError::Snapshot(e.into()))?;
2806 
2807         vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?);
2808         vm_snapshot.add_data_section(SnapshotDataSection {
2809             id: format!("{}-section", VM_SNAPSHOT_ID),
2810             snapshot: vm_snapshot_data,
2811         });
2812 
2813         event!("vm", "snapshotted");
2814         Ok(vm_snapshot)
2815     }
2816 
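    // Restore order matters: the memory manager comes first, then the device
    // manager, then the CPU manager and (on aarch64) the vGIC; restoring the
    // device states and starting the restored vCPUs happen last.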
2817     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
2818         event!("vm", "restoring");
2819 
2820         let current_state = self
2821             .get_state()
2822             .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?;
2823         let new_state = VmState::Paused;
2824         current_state.valid_transition(new_state).map_err(|e| {
2825             MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e))
2826         })?;
2827 
2828         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2829         self.load_clock_from_snapshot(&snapshot)
2830             .map_err(|e| MigratableError::Restore(anyhow!("Error restoring clock: {:?}", e)))?;
2831 
2832         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2833             self.memory_manager
2834                 .lock()
2835                 .unwrap()
2836                 .restore(*memory_manager_snapshot.clone())?;
2837         } else {
2838             return Err(MigratableError::Restore(anyhow!(
2839                 "Missing memory manager snapshot"
2840             )));
2841         }
2842 
2843         if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
2844             self.device_manager
2845                 .lock()
2846                 .unwrap()
2847                 .restore(*device_manager_snapshot.clone())?;
2848         } else {
2849             return Err(MigratableError::Restore(anyhow!(
2850                 "Missing device manager snapshot"
2851             )));
2852         }
2853 
2854         if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) {
2855             self.cpu_manager
2856                 .lock()
2857                 .unwrap()
2858                 .restore(*cpu_manager_snapshot.clone())?;
2859         } else {
2860             return Err(MigratableError::Restore(anyhow!(
2861                 "Missing CPU manager snapshot"
2862             )));
2863         }
2864 
2865         #[cfg(target_arch = "aarch64")]
2866         self.restore_vgic_and_enable_interrupt(&snapshot)?;
2867 
2868         if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
2869             self.device_manager
2870                 .lock()
2871                 .unwrap()
2872                 .restore_devices(*device_manager_snapshot.clone())?;
2873         } else {
2874             return Err(MigratableError::Restore(anyhow!(
2875                 "Missing device manager snapshot"
2876             )));
2877         }
2878 
2879         // Now we can start all vCPUs from here.
2880         self.cpu_manager
2881             .lock()
2882             .unwrap()
2883             .start_restored_vcpus()
2884             .map_err(|e| {
2885                 MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e))
2886             })?;
2887 
2888         self.setup_signal_handler().map_err(|e| {
2889             MigratableError::Restore(anyhow!("Could not setup signal handler: {:#?}", e))
2890         })?;
2891         self.setup_tty()
2892             .map_err(|e| MigratableError::Restore(anyhow!("Could not setup tty: {:#?}", e)))?;
2893 
2894         let mut state = self
2895             .state
2896             .try_write()
2897             .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?;
2898         *state = new_state;
2899 
2900         event!("vm", "restored");
2901         Ok(())
2902     }
2903 }
2904 
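// Sending a snapshot writes the VM configuration and state as JSON files into
// the destination directory and asks the memory manager to write the guest
// memory content itself.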
2905 impl Transportable for Vm {
2906     fn send(
2907         &self,
2908         snapshot: &Snapshot,
2909         destination_url: &str,
2910     ) -> std::result::Result<(), MigratableError> {
2911         let mut snapshot_config_path = url_to_path(destination_url)?;
2912         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2913 
2914         // Create the snapshot config file
2915         let mut snapshot_config_file = OpenOptions::new()
2916             .read(true)
2917             .write(true)
2918             .create_new(true)
2919             .open(snapshot_config_path)
2920             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2921 
2922         // Serialize and write the snapshot config
2923         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2924             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2925 
2926         snapshot_config_file
2927             .write_all(vm_config.as_bytes())
2928             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2929 
2930         let mut snapshot_state_path = url_to_path(destination_url)?;
2931         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2932 
2933         // Create the snapshot state file
2934         let mut snapshot_state_file = OpenOptions::new()
2935             .read(true)
2936             .write(true)
2937             .create_new(true)
2938             .open(snapshot_state_path)
2939             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2940 
2941         // Serialize and write the snapshot state
2942         let vm_state =
2943             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2944 
2945         snapshot_state_file
2946             .write_all(&vm_state)
2947             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2948 
2949         // Tell the memory manager to also send/write its own snapshot.
2950         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2951             self.memory_manager
2952                 .lock()
2953                 .unwrap()
2954                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2955         } else {
2956             return Err(MigratableError::MigrateSend(anyhow!(
2957                 "Missing memory manager snapshot"
2958             )));
2959         }
2960 
2961         Ok(())
2962     }
2963 }
2964 
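// Live migration hooks: dirty page logging and the start/complete
// notifications simply fan out to the memory manager and the device manager,
// with their dirty ranges merged into a single MemoryRangeTable.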
2965 impl Migratable for Vm {
2966     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2967         self.memory_manager.lock().unwrap().start_dirty_log()?;
2968         self.device_manager.lock().unwrap().start_dirty_log()
2969     }
2970 
2971     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2972         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2973         self.device_manager.lock().unwrap().stop_dirty_log()
2974     }
2975 
2976     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2977         Ok(MemoryRangeTable::new_from_tables(vec![
2978             self.memory_manager.lock().unwrap().dirty_log()?,
2979             self.device_manager.lock().unwrap().dirty_log()?,
2980         ]))
2981     }
2982 
2983     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2984         self.memory_manager.lock().unwrap().start_migration()?;
2985         self.device_manager.lock().unwrap().start_migration()
2986     }
2987 
2988     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2989         self.memory_manager.lock().unwrap().complete_migration()?;
2990         self.device_manager.lock().unwrap().complete_migration()
2991     }
2992 }
2993 
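// GDB stub support (guest_debug): register and memory accesses are forwarded
// to the CPU manager, while debug_pause()/debug_resume() move the VM in and
// out of the BreakPoint state through the regular pause()/resume() paths.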
2994 #[cfg(feature = "guest_debug")]
2995 impl Debuggable for Vm {
2996     fn set_guest_debug(
2997         &self,
2998         cpu_id: usize,
2999         addrs: &[GuestAddress],
3000         singlestep: bool,
3001     ) -> std::result::Result<(), DebuggableError> {
3002         self.cpu_manager
3003             .lock()
3004             .unwrap()
3005             .set_guest_debug(cpu_id, addrs, singlestep)
3006     }
3007 
3008     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
3009         if *self.state.read().unwrap() == VmState::Running {
3010             self.pause().map_err(DebuggableError::Pause)?;
3011         }
3012 
3013         let mut state = self
3014             .state
3015             .try_write()
3016             .map_err(|_| DebuggableError::PoisonedState)?;
3017         *state = VmState::BreakPoint;
3018         Ok(())
3019     }
3020 
3021     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
3022         if *self.state.read().unwrap() == VmState::BreakPoint {
3023             self.resume().map_err(DebuggableError::Resume)?;
3024         }
3025 
3026         Ok(())
3027     }
3028 
3029     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
3030         self.cpu_manager.lock().unwrap().read_regs(cpu_id)
3031     }
3032 
3033     fn write_regs(
3034         &self,
3035         cpu_id: usize,
3036         regs: &CoreRegs,
3037     ) -> std::result::Result<(), DebuggableError> {
3038         self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
3039     }
3040 
3041     fn read_mem(
3042         &self,
3043         cpu_id: usize,
3044         vaddr: GuestAddress,
3045         len: usize,
3046     ) -> std::result::Result<Vec<u8>, DebuggableError> {
3047         self.cpu_manager
3048             .lock()
3049             .unwrap()
3050             .read_mem(cpu_id, vaddr, len)
3051     }
3052 
3053     fn write_mem(
3054         &self,
3055         cpu_id: usize,
3056         vaddr: &GuestAddress,
3057         data: &[u8],
3058     ) -> std::result::Result<(), DebuggableError> {
3059         self.cpu_manager
3060             .lock()
3061             .unwrap()
3062             .write_mem(cpu_id, vaddr, data)
3063     }
3064 
3065     fn active_vcpus(&self) -> usize {
3066         let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
3067         if active_vcpus > 0 {
3068             active_vcpus
3069         } else {
3070             // The VM is not booted yet. Report boot_vcpus() instead.
3071             self.cpu_manager.lock().unwrap().boot_vcpus() as usize
3072         }
3073     }
3074 }
3075 
3076 #[cfg(feature = "guest_debug")]
3077 pub const UINT16_MAX: u32 = 65535;
3078 
3079 #[cfg(feature = "guest_debug")]
3080 impl Elf64Writable for Vm {}
3081 
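// Guest core dump support (guest_debug): the VM must be paused and must not
// be a TDX guest. The ELF header, notes and program headers are written
// first, then the per-vCPU notes, and finally the guest memory content.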
3082 #[cfg(feature = "guest_debug")]
3083 impl GuestDebuggable for Vm {
3084     fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
3085         event!("vm", "coredumping");
3086 
3087         #[cfg(feature = "tdx")]
3088         {
3089             if self.config.lock().unwrap().tdx.is_some() {
3090                 return Err(GuestDebuggableError::Coredump(anyhow!(
3091                     "Coredump not possible with TDX VM"
3092                 )));
3093             }
3094         }
3095 
3096         let current_state = self.get_state().unwrap();
3097         if current_state != VmState::Paused {
3098             return Err(GuestDebuggableError::Coredump(anyhow!(
3099                 "Trying to coredump while VM is running"
3100             )));
3101         }
3102 
3103         let coredump_state = self.get_dump_state(destination_url)?;
3104 
3105         self.write_header(&coredump_state)?;
3106         self.write_note(&coredump_state)?;
3107         self.write_loads(&coredump_state)?;
3108 
3109         self.cpu_manager
3110             .lock()
3111             .unwrap()
3112             .cpu_write_elf64_note(&coredump_state)?;
3113         self.cpu_manager
3114             .lock()
3115             .unwrap()
3116             .cpu_write_vmm_note(&coredump_state)?;
3117 
3118         self.memory_manager
3119             .lock()
3120             .unwrap()
3121             .coredump_iterate_save_mem(&coredump_state)
3122     }
3123 }
3124 
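// Unit tests covering the VM state machine and, with the "tdx" feature, the
// conversion of TDVF sections plus guest RAM ranges into HOB memory
// resources.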
3125 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
3126 #[cfg(test)]
3127 mod tests {
3128     use super::*;
3129 
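    // Exercises every possible transition out of the given state against the
    // rules enforced by VmState::valid_transition().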
3130     fn test_vm_state_transitions(state: VmState) {
3131         match state {
3132             VmState::Created => {
3133                 // Check the transitions from Created
3134                 assert!(state.valid_transition(VmState::Created).is_err());
3135                 assert!(state.valid_transition(VmState::Running).is_ok());
3136                 assert!(state.valid_transition(VmState::Shutdown).is_err());
3137                 assert!(state.valid_transition(VmState::Paused).is_ok());
3138                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
3139             }
3140             VmState::Running => {
3141                 // Check the transitions from Running
3142                 assert!(state.valid_transition(VmState::Created).is_err());
3143                 assert!(state.valid_transition(VmState::Running).is_err());
3144                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
3145                 assert!(state.valid_transition(VmState::Paused).is_ok());
3146                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
3147             }
3148             VmState::Shutdown => {
3149                 // Check the transitions from Shutdown
3150                 assert!(state.valid_transition(VmState::Created).is_err());
3151                 assert!(state.valid_transition(VmState::Running).is_ok());
3152                 assert!(state.valid_transition(VmState::Shutdown).is_err());
3153                 assert!(state.valid_transition(VmState::Paused).is_err());
3154                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
3155             }
3156             VmState::Paused => {
3157                 // Check the transitions from Paused
3158                 assert!(state.valid_transition(VmState::Created).is_err());
3159                 assert!(state.valid_transition(VmState::Running).is_ok());
3160                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
3161                 assert!(state.valid_transition(VmState::Paused).is_err());
3162                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
3163             }
3164             VmState::BreakPoint => {
3165                 // Check the transitions from BreakPoint
3166                 assert!(state.valid_transition(VmState::Created).is_ok());
3167                 assert!(state.valid_transition(VmState::Running).is_ok());
3168                 assert!(state.valid_transition(VmState::Shutdown).is_err());
3169                 assert!(state.valid_transition(VmState::Paused).is_err());
3170                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
3171             }
3172         }
3173     }
3174 
3175     #[test]
3176     fn test_vm_created_transitions() {
3177         test_vm_state_transitions(VmState::Created);
3178     }
3179 
3180     #[test]
3181     fn test_vm_running_transitions() {
3182         test_vm_state_transitions(VmState::Running);
3183     }
3184 
3185     #[test]
3186     fn test_vm_shutdown_transitions() {
3187         test_vm_state_transitions(VmState::Shutdown);
3188     }
3189 
3190     #[test]
3191     fn test_vm_paused_transitions() {
3192         test_vm_state_transitions(VmState::Paused);
3193     }
3194 
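    // Each expected entry is (start, size, is_ram): TDVF sections appear as
    // "false" holes punched into the guest RAM ranges, which appear as "true".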
3195     #[cfg(feature = "tdx")]
3196     #[test]
3197     fn test_hob_memory_resources() {
3198         // Case 1: Two TDVF sections in the middle of the RAM
3199         let sections = vec![
3200             TdvfSection {
3201                 address: 0xc000,
3202                 size: 0x1000,
3203                 ..Default::default()
3204             },
3205             TdvfSection {
3206                 address: 0x1000,
3207                 size: 0x4000,
3208                 ..Default::default()
3209             },
3210         ];
3211         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
3212         let expected = vec![
3213             (0, 0x1000, true),
3214             (0x1000, 0x4000, false),
3215             (0x5000, 0x7000, true),
3216             (0xc000, 0x1000, false),
3217             (0xd000, 0x0fff_3000, true),
3218         ];
3219         assert_eq!(
3220             expected,
3221             Vm::hob_memory_resources(
3222                 sections,
3223                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3224             )
3225         );
3226 
3227         // Case 2: Two TDVF sections with no conflict with the RAM
3228         let sections = vec![
3229             TdvfSection {
3230                 address: 0x1000_1000,
3231                 size: 0x1000,
3232                 ..Default::default()
3233             },
3234             TdvfSection {
3235                 address: 0,
3236                 size: 0x1000,
3237                 ..Default::default()
3238             },
3239         ];
3240         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3241         let expected = vec![
3242             (0, 0x1000, false),
3243             (0x1000, 0x1000_0000, true),
3244             (0x1000_1000, 0x1000, false),
3245         ];
3246         assert_eq!(
3247             expected,
3248             Vm::hob_memory_resources(
3249                 sections,
3250                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3251             )
3252         );
3253 
3254         // Case 3: Two TDVF sections with partial conflicts with the RAM
3255         let sections = vec![
3256             TdvfSection {
3257                 address: 0x1000_0000,
3258                 size: 0x2000,
3259                 ..Default::default()
3260             },
3261             TdvfSection {
3262                 address: 0,
3263                 size: 0x2000,
3264                 ..Default::default()
3265             },
3266         ];
3267         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3268         let expected = vec![
3269             (0, 0x2000, false),
3270             (0x2000, 0x0fff_e000, true),
3271             (0x1000_0000, 0x2000, false),
3272         ];
3273         assert_eq!(
3274             expected,
3275             Vm::hob_memory_resources(
3276                 sections,
3277                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3278             )
3279         );
3280 
3281         // Case 4: Two TDVF sections with no conflict before the RAM and two
3282         // more sections with no conflict after the RAM.
3283         let sections = vec![
3284             TdvfSection {
3285                 address: 0x2000_1000,
3286                 size: 0x1000,
3287                 ..Default::default()
3288             },
3289             TdvfSection {
3290                 address: 0x2000_0000,
3291                 size: 0x1000,
3292                 ..Default::default()
3293             },
3294             TdvfSection {
3295                 address: 0x1000,
3296                 size: 0x1000,
3297                 ..Default::default()
3298             },
3299             TdvfSection {
3300                 address: 0,
3301                 size: 0x1000,
3302                 ..Default::default()
3303             },
3304         ];
3305         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
3306         let expected = vec![
3307             (0, 0x1000, false),
3308             (0x1000, 0x1000, false),
3309             (0x4000, 0x1000_0000, true),
3310             (0x2000_0000, 0x1000, false),
3311             (0x2000_1000, 0x1000, false),
3312         ];
3313         assert_eq!(
3314             expected,
3315             Vm::hob_memory_resources(
3316                 sections,
3317                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3318             )
3319         );
3320 
3321         // Case 5: One TDVF section overriding the entire RAM
3322         let sections = vec![TdvfSection {
3323             address: 0,
3324             size: 0x2000_0000,
3325             ..Default::default()
3326         }];
3327         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3328         let expected = vec![(0, 0x2000_0000, false)];
3329         assert_eq!(
3330             expected,
3331             Vm::hob_memory_resources(
3332                 sections,
3333                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3334             )
3335         );
3336 
3337         // Case 6: Two TDVF sections with no conflict with 2 RAM regions
3338         let sections = vec![
3339             TdvfSection {
3340                 address: 0x1000_2000,
3341                 size: 0x2000,
3342                 ..Default::default()
3343             },
3344             TdvfSection {
3345                 address: 0,
3346                 size: 0x2000,
3347                 ..Default::default()
3348             },
3349         ];
3350         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3351             (GuestAddress(0x2000), 0x1000_0000),
3352             (GuestAddress(0x1000_4000), 0x1000_0000),
3353         ];
3354         let expected = vec![
3355             (0, 0x2000, false),
3356             (0x2000, 0x1000_0000, true),
3357             (0x1000_2000, 0x2000, false),
3358             (0x1000_4000, 0x1000_0000, true),
3359         ];
3360         assert_eq!(
3361             expected,
3362             Vm::hob_memory_resources(
3363                 sections,
3364                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3365             )
3366         );
3367 
3368         // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
3369         let sections = vec![
3370             TdvfSection {
3371                 address: 0x1000_0000,
3372                 size: 0x4000,
3373                 ..Default::default()
3374             },
3375             TdvfSection {
3376                 address: 0,
3377                 size: 0x4000,
3378                 ..Default::default()
3379             },
3380         ];
3381         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3382             (GuestAddress(0x1000), 0x1000_0000),
3383             (GuestAddress(0x1000_3000), 0x1000_0000),
3384         ];
3385         let expected = vec![
3386             (0, 0x4000, false),
3387             (0x4000, 0x0fff_c000, true),
3388             (0x1000_0000, 0x4000, false),
3389             (0x1000_4000, 0x0fff_f000, true),
3390         ];
3391         assert_eq!(
3392             expected,
3393             Vm::hob_memory_resources(
3394                 sections,
3395                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3396             )
3397         );
3398     }
3399 }
3400 
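// On aarch64, check that a flattened device tree can be built for a guest
// with a serial device, a virtio-mmio device and an RTC, using a freshly
// created vGIC.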
3401 #[cfg(target_arch = "aarch64")]
3402 #[cfg(test)]
3403 mod tests {
3404     use super::*;
3405     use crate::GuestMemoryMmap;
3406     use arch::aarch64::fdt::create_fdt;
3407     use arch::aarch64::layout;
3408     use arch::{DeviceType, MmioDeviceInfo};
3409 
3410     const LEN: u64 = 4096;
3411 
3412     #[test]
3413     fn test_create_fdt_with_devices() {
3414         let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
3415         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
3416 
3417         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
3418             (
3419                 (DeviceType::Serial, DeviceType::Serial.to_string()),
3420                 MmioDeviceInfo {
3421                     addr: 0x00,
3422                     len: LEN,
3423                     irq: 33,
3424                 },
3425             ),
3426             (
3427                 (DeviceType::Virtio(1), "virtio".to_string()),
3428                 MmioDeviceInfo {
3429                     addr: LEN,
3430                     len: LEN,
3431                     irq: 34,
3432                 },
3433             ),
3434             (
3435                 (DeviceType::Rtc, "rtc".to_string()),
3436                 MmioDeviceInfo {
3437                     addr: 2 * LEN,
3438                     len: LEN,
3439                     irq: 35,
3440                 },
3441             ),
3442         ]
3443         .iter()
3444         .cloned()
3445         .collect();
3446 
3447         let hv = hypervisor::new().unwrap();
3448         let vm = hv.create_vm().unwrap();
3449         let gic = vm
3450             .create_vgic(Gic::create_default_config(1))
3451             .expect("Cannot create gic");
3452         assert!(create_fdt(
3453             &mem,
3454             "console=tty0",
3455             vec![0],
3456             Some((0, 0, 0)),
3457             &dev_info,
3458             &gic,
3459             &None,
3460             &Vec::new(),
3461             &BTreeMap::new(),
3462             None,
3463             true,
3464         )
3465         .is_ok())
3466     }
3467 }
3468 
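// Smoke test against the hypervisor crate: map a small guest memory region,
// load a few bytes of real-mode code that writes "5\n" to I/O port 0x3f8,
// then run a single vCPU until it halts.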
3469 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
3470 #[test]
3471 pub fn test_vm() {
3472     use hypervisor::VmExit;
3473     use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
3474     // This example is based on https://lwn.net/Articles/658511/
3475     let code = [
3476         0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
3477         0x00, 0xd8, /* add %bl, %al */
3478         0x04, b'0', /* add $'0', %al */
3479         0xee, /* out %al, (%dx) */
3480         0xb0, b'\n', /* mov $'\n', %al */
3481         0xee,  /* out %al, (%dx) */
3482         0xf4,  /* hlt */
3483     ];
3484 
3485     let mem_size = 0x1000;
3486     let load_addr = GuestAddress(0x1000);
3487     let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();
3488 
3489     let hv = hypervisor::new().unwrap();
3490     let vm = hv.create_vm().expect("new VM creation failed");
3491 
3492     for (index, region) in mem.iter().enumerate() {
3493         let mem_region = vm.make_user_memory_region(
3494             index as u32,
3495             region.start_addr().raw_value(),
3496             region.len() as u64,
3497             region.as_ptr() as u64,
3498             false,
3499             false,
3500         );
3501 
3502         vm.create_user_memory_region(mem_region)
3503             .expect("Cannot configure guest memory");
3504     }
3505     mem.write_slice(&code, load_addr)
3506         .expect("Writing code to memory failed");
3507 
3508     let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");
3509 
3510     let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
3511     vcpu_sregs.cs.base = 0;
3512     vcpu_sregs.cs.selector = 0;
3513     vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");
3514 
3515     let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
3516     vcpu_regs.rip = 0x1000;
3517     vcpu_regs.rax = 2;
3518     vcpu_regs.rbx = 3;
3519     vcpu_regs.rflags = 2;
3520     vcpu.set_regs(&vcpu_regs).expect("set regs failed");
3521 
3522     loop {
3523         match vcpu.run().expect("run failed") {
3524             VmExit::IoOut(addr, data) => {
3525                 println!(
3526                     "IO out -- addr: {:#x} data [{:?}]",
3527                     addr,
3528                     str::from_utf8(data).unwrap()
3529                 );
3530             }
3531             VmExit::Reset => {
3532                 println!("HLT");
3533                 break;
3534             }
3535             r => panic!("unexpected exit reason: {:?}", r),
3536         }
3537     }
3538 }
3539