xref: /cloud-hypervisor/vmm/src/vm.rs (revision 5e52729453cb62edbe4fb3a4aa24f8cca31e667e)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
20 use crate::coredump::{
21     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
22 };
23 use crate::cpu;
24 use crate::device_manager::{Console, DeviceManager, DeviceManagerError, PtyPair};
25 use crate::device_tree::DeviceTree;
26 #[cfg(feature = "guest_debug")]
27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
28 use crate::memory_manager::{
29     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
30 };
31 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
32 use crate::migration::get_vm_snapshot;
33 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
34 use crate::migration::url_to_file;
35 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
36 use crate::seccomp_filters::{get_seccomp_filter, Thread};
37 use crate::GuestMemoryMmap;
38 use crate::{
39     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
40 };
41 use anyhow::anyhow;
42 use arch::get_host_cpu_phys_bits;
43 #[cfg(target_arch = "x86_64")]
44 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
45 #[cfg(feature = "tdx")]
46 use arch::x86_64::tdx::TdvfSection;
47 use arch::EntryPoint;
48 #[cfg(target_arch = "aarch64")]
49 use arch::PciSpaceInfo;
50 use arch::{NumaNode, NumaNodes};
51 #[cfg(target_arch = "aarch64")]
52 use devices::interrupt_controller;
53 use devices::AcpiNotificationFlags;
54 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
55 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
56 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
57 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
58 use hypervisor::{HypervisorVmError, VmOps};
59 use linux_loader::cmdline::Cmdline;
60 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
61 use linux_loader::elf;
62 #[cfg(target_arch = "x86_64")]
63 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
64 #[cfg(target_arch = "aarch64")]
65 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
66 use linux_loader::loader::KernelLoader;
67 use seccompiler::{apply_filter, SeccompAction};
68 use serde::{Deserialize, Serialize};
69 use signal_hook::{consts::SIGWINCH, iterator::backend::Handle, iterator::Signals};
70 use std::cmp;
71 use std::collections::BTreeMap;
72 use std::collections::HashMap;
73 use std::convert::TryInto;
74 use std::fs::{File, OpenOptions};
75 use std::io::{self, Seek, SeekFrom, Write};
76 #[cfg(feature = "tdx")]
77 use std::mem;
78 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
79 use std::mem::size_of;
80 use std::num::Wrapping;
81 use std::ops::Deref;
82 use std::os::unix::net::UnixStream;
83 use std::panic::AssertUnwindSafe;
84 use std::sync::{Arc, Mutex, RwLock};
85 use std::time::Instant;
86 use std::{result, str, thread};
87 use thiserror::Error;
88 use tracer::trace_scoped;
89 use vm_device::Bus;
90 #[cfg(feature = "tdx")]
91 use vm_memory::{Address, ByteValued, GuestMemory, GuestMemoryRegion};
92 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic};
93 use vm_migration::protocol::{Request, Response, Status};
94 use vm_migration::{
95     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
96     SnapshotData, Snapshottable, Transportable,
97 };
98 use vmm_sys_util::eventfd::EventFd;
99 use vmm_sys_util::signal::unblock_signal;
100 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
101 use vmm_sys_util::terminal::Terminal;
102 
103 /// Errors associated with VM management
104 #[derive(Debug, Error)]
105 pub enum Error {
106     #[error("Cannot open kernel file: {0}")]
107     KernelFile(#[source] io::Error),
108 
109     #[error("Cannot open initramfs file: {0}")]
110     InitramfsFile(#[source] io::Error),
111 
112     #[error("Cannot load the kernel into memory: {0}")]
113     KernelLoad(#[source] linux_loader::loader::Error),
114 
115     #[cfg(target_arch = "aarch64")]
116     #[error("Cannot load the UEFI binary in memory: {0:?}")]
117     UefiLoad(arch::aarch64::uefi::Error),
118 
119     #[error("Cannot load the initramfs into memory")]
120     InitramfsLoad,
121 
122     #[error("Cannot load the kernel command line in memory: {0}")]
123     LoadCmdLine(#[source] linux_loader::loader::Error),
124 
125     #[error("Cannot modify the kernel command line: {0}")]
126     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
127 
128     #[error("Cannot create the kernel command line: {0}")]
129     CmdLineCreate(#[source] linux_loader::cmdline::Error),
130 
131     #[error("Cannot configure system: {0}")]
132     ConfigureSystem(#[source] arch::Error),
133 
134     #[cfg(target_arch = "aarch64")]
135     #[error("Cannot enable interrupt controller: {0:?}")]
136     EnableInterruptController(interrupt_controller::Error),
137 
138     #[error("VM state is poisoned")]
139     PoisonedState,
140 
141     #[error("Error from device manager: {0:?}")]
142     DeviceManager(DeviceManagerError),
143 
144     #[error("Cannot setup terminal in raw mode: {0}")]
145     SetTerminalRaw(#[source] vmm_sys_util::errno::Error),
146 
147     #[error("Cannot setup terminal in canonical mode.: {0}")]
148     SetTerminalCanon(#[source] vmm_sys_util::errno::Error),
149 
150     #[error("Cannot spawn a signal handler thread: {0}")]
151     SignalHandlerSpawn(#[source] io::Error),
152 
153     #[error("Failed to join on threads: {0:?}")]
154     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
155 
156     #[error("VM config is missing")]
157     VmMissingConfig,
158 
159     #[error("VM is not created")]
160     VmNotCreated,
161 
162     #[error("VM is already created")]
163     VmAlreadyCreated,
164 
165     #[error("VM is not running")]
166     VmNotRunning,
167 
168     #[error("Cannot clone EventFd: {0}")]
169     EventFdClone(#[source] io::Error),
170 
171     #[error("invalid VM state transition: {0:?} to {1:?}")]
172     InvalidStateTransition(VmState, VmState),
173 
174     #[error("Error from CPU manager: {0}")]
175     CpuManager(#[source] cpu::Error),
176 
177     #[error("Cannot pause devices: {0}")]
178     PauseDevices(#[source] MigratableError),
179 
180     #[error("Cannot resume devices: {0}")]
181     ResumeDevices(#[source] MigratableError),
182 
183     #[error("Cannot pause CPUs: {0}")]
184     PauseCpus(#[source] MigratableError),
185 
186     #[error("Cannot resume cpus: {0}")]
187     ResumeCpus(#[source] MigratableError),
188 
189     #[error("Cannot pause VM: {0}")]
190     Pause(#[source] MigratableError),
191 
192     #[error("Cannot resume VM: {0}")]
193     Resume(#[source] MigratableError),
194 
195     #[error("Memory manager error: {0:?}")]
196     MemoryManager(MemoryManagerError),
197 
198     #[error("Eventfd write error: {0}")]
199     EventfdError(#[source] std::io::Error),
200 
201     #[error("Cannot snapshot VM: {0}")]
202     Snapshot(#[source] MigratableError),
203 
204     #[error("Cannot restore VM: {0}")]
205     Restore(#[source] MigratableError),
206 
207     #[error("Cannot send VM snapshot: {0}")]
208     SnapshotSend(#[source] MigratableError),
209 
210     #[error("Invalid restore source URL")]
211     InvalidRestoreSourceUrl,
212 
213     #[error("Failed to validate config: {0}")]
214     ConfigValidation(#[source] ValidationError),
215 
216     #[error("Too many virtio-vsock devices")]
217     TooManyVsockDevices,
218 
219     #[error("Failed serializing into JSON: {0}")]
220     SerializeJson(#[source] serde_json::Error),
221 
222     #[error("Invalid NUMA configuration")]
223     InvalidNumaConfig,
224 
225     #[error("Cannot create seccomp filter: {0}")]
226     CreateSeccompFilter(#[source] seccompiler::Error),
227 
228     #[error("Cannot apply seccomp filter: {0}")]
229     ApplySeccompFilter(#[source] seccompiler::Error),
230 
231     #[error("Failed resizing a memory zone")]
232     ResizeZone,
233 
234     #[error("Cannot activate virtio devices: {0:?}")]
235     ActivateVirtioDevices(DeviceManagerError),
236 
237     #[error("Error triggering power button: {0:?}")]
238     PowerButton(DeviceManagerError),
239 
240     #[error("Kernel lacks PVH header")]
241     KernelMissingPvhHeader,
242 
243     #[error("Failed to allocate firmware RAM: {0:?}")]
244     AllocateFirmwareMemory(MemoryManagerError),
245 
246     #[error("Error manipulating firmware file: {0}")]
247     FirmwareFile(#[source] std::io::Error),
248 
249     #[error("Firmware too big")]
250     FirmwareTooLarge,
251 
252     #[error("Failed to copy firmware to memory: {0}")]
253     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
254 
255     #[cfg(feature = "tdx")]
256     #[error("Error performing I/O on TDX firmware file: {0}")]
257     LoadTdvf(#[source] std::io::Error),
258 
259     #[cfg(feature = "tdx")]
260     #[error("Error performing I/O on the TDX payload file: {0}")]
261     LoadPayload(#[source] std::io::Error),
262 
263     #[cfg(feature = "tdx")]
264     #[error("Error parsing TDVF: {0}")]
265     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
266 
267     #[cfg(feature = "tdx")]
268     #[error("Error populating TDX HOB: {0}")]
269     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
270 
271     #[cfg(feature = "tdx")]
272     #[error("Error allocating TDVF memory: {0:?}")]
273     AllocatingTdvfMemory(crate::memory_manager::Error),
274 
275     #[cfg(feature = "tdx")]
276     #[error("Error enabling TDX VM: {0}")]
277     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
278 
279     #[cfg(feature = "tdx")]
280     #[error("Error enabling TDX memory region: {0}")]
281     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
282 
283     #[cfg(feature = "tdx")]
284     #[error("Error finalizing TDX VM: {0}")]
285     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
286 
287     #[cfg(feature = "tdx")]
288     #[error("TDX firmware missing")]
289     TdxFirmwareMissing,
290 
291     #[cfg(feature = "tdx")]
292     #[error("Invalid TDX payload type")]
293     InvalidPayloadType,
294 
295     #[cfg(feature = "guest_debug")]
296     #[error("Error debugging VM: {0:?}")]
297     Debug(DebuggableError),
298 
299     #[error("Error spawning kernel loading thread")]
300     KernelLoadThreadSpawn(std::io::Error),
301 
302     #[error("Error joining kernel loading thread")]
303     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
304 
305     #[error("Payload configuration is not bootable")]
306     InvalidPayload,
307 
308     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
309     #[error("Error coredumping VM: {0:?}")]
310     Coredump(GuestDebuggableError),
311 }
312 pub type Result<T> = result::Result<T, Error>;
313 
314 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
315 pub enum VmState {
316     Created,
317     Running,
318     Shutdown,
319     Paused,
320     BreakPoint,
321 }
322 
323 impl VmState {
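    /// Checks whether the transition from `self` to `new_state` is allowed,
    /// returning `Error::InvalidStateTransition` when it is not. An
    /// illustrative sketch (not a doctest, as this helper is private):
    ///
    /// ```ignore
    /// assert!(VmState::Created.valid_transition(VmState::Running).is_ok());
    /// assert!(VmState::Shutdown.valid_transition(VmState::Paused).is_err());
    /// ```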
324     fn valid_transition(self, new_state: VmState) -> Result<()> {
325         match self {
326             VmState::Created => match new_state {
327                 VmState::Created | VmState::Shutdown => {
328                     Err(Error::InvalidStateTransition(self, new_state))
329                 }
330                 VmState::Running | VmState::Paused | VmState::BreakPoint => Ok(()),
331             },
332 
333             VmState::Running => match new_state {
334                 VmState::Created | VmState::Running => {
335                     Err(Error::InvalidStateTransition(self, new_state))
336                 }
337                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
338             },
339 
340             VmState::Shutdown => match new_state {
341                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
342                     Err(Error::InvalidStateTransition(self, new_state))
343                 }
344                 VmState::Running => Ok(()),
345             },
346 
347             VmState::Paused => match new_state {
348                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
349                     Err(Error::InvalidStateTransition(self, new_state))
350                 }
351                 VmState::Running | VmState::Shutdown => Ok(()),
352             },
353             VmState::BreakPoint => match new_state {
354                 VmState::Created | VmState::Running => Ok(()),
355                 _ => Err(Error::InvalidStateTransition(self, new_state)),
356             },
357         }
358     }
359 }
360 
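/// Dispatches guest accesses coming from the vCPU threads: guest memory
/// reads/writes go to the guest address space, while MMIO (and, on x86_64,
/// PIO) accesses are routed to the corresponding device bus.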
361 struct VmOpsHandler {
362     memory: GuestMemoryAtomic<GuestMemoryMmap>,
363     #[cfg(target_arch = "x86_64")]
364     io_bus: Arc<Bus>,
365     mmio_bus: Arc<Bus>,
366 }
367 
368 impl VmOps for VmOpsHandler {
369     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
370         self.memory
371             .memory()
372             .write(buf, GuestAddress(gpa))
373             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
374     }
375 
376     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
377         self.memory
378             .memory()
379             .read(buf, GuestAddress(gpa))
380             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
381     }
382 
383     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
384         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
385             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
386         }
387         Ok(())
388     }
389 
390     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
391         match self.mmio_bus.write(gpa, data) {
392             Err(vm_device::BusError::MissingAddressRange) => {
393                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
394             }
395             Ok(Some(barrier)) => {
396                 info!("Waiting for barrier");
397                 barrier.wait();
398                 info!("Barrier released");
399             }
400             _ => {}
401         };
402         Ok(())
403     }
404 
405     #[cfg(target_arch = "x86_64")]
406     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
407         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
408             info!("Guest PIO read to unregistered address 0x{:x}", port);
409         }
410         Ok(())
411     }
412 
413     #[cfg(target_arch = "x86_64")]
414     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
415         match self.io_bus.write(port, data) {
416             Err(vm_device::BusError::MissingAddressRange) => {
417                 info!("Guest PIO write to unregistered address 0x{:x}", port);
418             }
419             Ok(Some(barrier)) => {
420                 info!("Waiting for barrier");
421                 barrier.wait();
422                 info!("Barrier released");
423             }
424             _ => {}
425         };
426         Ok(())
427     }
428 }
429 
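/// Returns the number of physical address bits the guest should use: the
/// requested maximum, clamped to what the host CPU supports.
///
/// A minimal illustration (the actual host value depends on the CPU):
///
/// ```ignore
/// // Assuming a host CPU with 40 physical address bits:
/// assert_eq!(physical_bits(46), 40); // clamped to the host limit
/// assert_eq!(physical_bits(36), 36); // request below the host limit
/// ```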
430 pub fn physical_bits(max_phys_bits: u8) -> u8 {
431     let host_phys_bits = get_host_cpu_phys_bits();
432 
433     cmp::min(host_phys_bits, max_phys_bits)
434 }
435 
436 pub struct Vm {
437     #[cfg(feature = "tdx")]
438     kernel: Option<File>,
439     initramfs: Option<File>,
440     threads: Vec<thread::JoinHandle<()>>,
441     device_manager: Arc<Mutex<DeviceManager>>,
442     config: Arc<Mutex<VmConfig>>,
443     on_tty: bool,
444     signals: Option<Handle>,
445     state: RwLock<VmState>,
446     cpu_manager: Arc<Mutex<cpu::CpuManager>>,
447     memory_manager: Arc<Mutex<MemoryManager>>,
448     #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
449     // The hypervisor abstracted virtual machine.
450     vm: Arc<dyn hypervisor::Vm>,
451     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
452     saved_clock: Option<hypervisor::ClockData>,
453     numa_nodes: NumaNodes,
454     seccomp_action: SeccompAction,
455     exit_evt: EventFd,
456     hypervisor: Arc<dyn hypervisor::Hypervisor>,
457     stop_on_boot: bool,
458     load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
459 }
460 
461 impl Vm {
462     pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
463 
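    /// Builds a `Vm` on top of an existing `MemoryManager`: validates the
    /// configuration, kicks off asynchronous payload loading (unless
    /// restoring from a snapshot), creates the NUMA nodes, the `CpuManager`
    /// and the `DeviceManager`, and opens the initramfs (and, with TDX, the
    /// kernel) file handles.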
464     #[allow(clippy::too_many_arguments)]
465     pub fn new_from_memory_manager(
466         config: Arc<Mutex<VmConfig>>,
467         memory_manager: Arc<Mutex<MemoryManager>>,
468         vm: Arc<dyn hypervisor::Vm>,
469         exit_evt: EventFd,
470         reset_evt: EventFd,
471         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
472         seccomp_action: &SeccompAction,
473         hypervisor: Arc<dyn hypervisor::Hypervisor>,
474         activate_evt: EventFd,
475         timestamp: Instant,
476         serial_pty: Option<PtyPair>,
477         console_pty: Option<PtyPair>,
478         console_resize_pipe: Option<File>,
479         snapshot: Option<Snapshot>,
480     ) -> Result<Self> {
481         trace_scoped!("Vm::new_from_memory_manager");
482 
483         let boot_id_list = config
484             .lock()
485             .unwrap()
486             .validate()
487             .map_err(Error::ConfigValidation)?;
488 
489         let load_payload_handle = if snapshot.is_none() {
490             Self::load_payload_async(&memory_manager, &config)?
491         } else {
492             None
493         };
494 
495         info!("Booting VM from config: {:?}", &config);
496 
497         // Create NUMA nodes based on NumaConfig.
498         let numa_nodes =
499             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
500 
501         #[cfg(feature = "tdx")]
502         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
503         #[cfg(feature = "tdx")]
504         let force_iommu = tdx_enabled;
505         #[cfg(not(feature = "tdx"))]
506         let force_iommu = false;
507 
508         #[cfg(feature = "guest_debug")]
509         let stop_on_boot = config.lock().unwrap().gdb;
510         #[cfg(not(feature = "guest_debug"))]
511         let stop_on_boot = false;
512 
513         let memory = memory_manager.lock().unwrap().guest_memory();
514         #[cfg(target_arch = "x86_64")]
515         let io_bus = Arc::new(Bus::new());
516         let mmio_bus = Arc::new(Bus::new());
517 
518         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
519             memory,
520             #[cfg(target_arch = "x86_64")]
521             io_bus: io_bus.clone(),
522             mmio_bus: mmio_bus.clone(),
523         });
524 
525         let cpus_config = { &config.lock().unwrap().cpus.clone() };
526         let cpu_manager = cpu::CpuManager::new(
527             cpus_config,
528             vm.clone(),
529             exit_evt.try_clone().map_err(Error::EventFdClone)?,
530             reset_evt.try_clone().map_err(Error::EventFdClone)?,
531             #[cfg(feature = "guest_debug")]
532             vm_debug_evt,
533             &hypervisor,
534             seccomp_action.clone(),
535             vm_ops,
536             #[cfg(feature = "tdx")]
537             tdx_enabled,
538             &numa_nodes,
539         )
540         .map_err(Error::CpuManager)?;
541 
542         #[cfg(target_arch = "x86_64")]
543         cpu_manager
544             .lock()
545             .unwrap()
546             .populate_cpuid(
547                 &memory_manager,
548                 &hypervisor,
549                 #[cfg(feature = "tdx")]
550                 tdx_enabled,
551             )
552             .map_err(Error::CpuManager)?;
553 
554         // The initial TDX configuration must be done before the vCPUs are
555         // created
556         #[cfg(feature = "tdx")]
557         if tdx_enabled {
558             let cpuid = cpu_manager.lock().unwrap().common_cpuid();
559             let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
560             vm.tdx_init(&cpuid, max_vcpus)
561                 .map_err(Error::InitializeTdxVm)?;
562         }
563 
564         cpu_manager
565             .lock()
566             .unwrap()
567             .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
568             .map_err(Error::CpuManager)?;
569 
570         #[cfg(feature = "tdx")]
571         let dynamic = !tdx_enabled;
572         #[cfg(not(feature = "tdx"))]
573         let dynamic = true;
574 
575         let device_manager = DeviceManager::new(
576             #[cfg(target_arch = "x86_64")]
577             io_bus,
578             mmio_bus,
579             hypervisor.hypervisor_type(),
580             vm.clone(),
581             config.clone(),
582             memory_manager.clone(),
583             cpu_manager.clone(),
584             exit_evt.try_clone().map_err(Error::EventFdClone)?,
585             reset_evt,
586             seccomp_action.clone(),
587             numa_nodes.clone(),
588             &activate_evt,
589             force_iommu,
590             boot_id_list,
591             timestamp,
592             snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
593             dynamic,
594         )
595         .map_err(Error::DeviceManager)?;
596 
597         device_manager
598             .lock()
599             .unwrap()
600             .create_devices(serial_pty, console_pty, console_resize_pipe)
601             .map_err(Error::DeviceManager)?;
602 
603         // SAFETY: trivially safe
604         let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO) } != 0;
605 
606         #[cfg(feature = "tdx")]
607         let kernel = config
608             .lock()
609             .unwrap()
610             .payload
611             .as_ref()
612             .map(|p| p.kernel.as_ref().map(File::open))
613             .unwrap_or_default()
614             .transpose()
615             .map_err(Error::KernelFile)?;
616 
617         let initramfs = config
618             .lock()
619             .unwrap()
620             .payload
621             .as_ref()
622             .map(|p| p.initramfs.as_ref().map(File::open))
623             .unwrap_or_default()
624             .transpose()
625             .map_err(Error::InitramfsFile)?;
626 
627         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
628         let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
629             let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
630             vm_snapshot.clock
631         } else {
632             None
633         };
634 
635         let vm_state = if snapshot.is_some() {
636             VmState::Paused
637         } else {
638             VmState::Created
639         };
640 
641         Ok(Vm {
642             #[cfg(feature = "tdx")]
643             kernel,
644             initramfs,
645             device_manager,
646             config,
647             on_tty,
648             threads: Vec::with_capacity(1),
649             signals: None,
650             state: RwLock::new(vm_state),
651             cpu_manager,
652             memory_manager,
653             vm,
654             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
655             saved_clock,
656             numa_nodes,
657             seccomp_action: seccomp_action.clone(),
658             exit_evt,
659             hypervisor,
660             stop_on_boot,
661             load_payload_handle,
662         })
663     }
664 
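    /// Turns the optional list of `NumaConfig` entries into the `NumaNodes`
    /// map used by the rest of the VMM, resolving memory zone names, guest
    /// CPU lists, inter-node distances and (on x86_64) SGX EPC sections, and
    /// rejecting duplicate nodes or references to unknown zones.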
665     fn create_numa_nodes(
666         configs: Option<Vec<NumaConfig>>,
667         memory_manager: &Arc<Mutex<MemoryManager>>,
668     ) -> Result<NumaNodes> {
669         let mm = memory_manager.lock().unwrap();
670         let mm_zones = mm.memory_zones();
671         let mut numa_nodes = BTreeMap::new();
672 
673         if let Some(configs) = &configs {
674             for config in configs.iter() {
675                 if numa_nodes.contains_key(&config.guest_numa_id) {
676                     error!("Can't define twice the same NUMA node");
677                     return Err(Error::InvalidNumaConfig);
678                 }
679 
680                 let mut node = NumaNode::default();
681 
682                 if let Some(memory_zones) = &config.memory_zones {
683                     for memory_zone in memory_zones.iter() {
684                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
685                             node.memory_regions.extend(mm_zone.regions().clone());
686                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
687                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
688                             }
689                             node.memory_zones.push(memory_zone.clone());
690                         } else {
691                             error!("Unknown memory zone '{}'", memory_zone);
692                             return Err(Error::InvalidNumaConfig);
693                         }
694                     }
695                 }
696 
697                 if let Some(cpus) = &config.cpus {
698                     node.cpus.extend(cpus);
699                 }
700 
701                 if let Some(distances) = &config.distances {
702                     for distance in distances.iter() {
703                         let dest = distance.destination;
704                         let dist = distance.distance;
705 
706                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
707                             error!("Unknown destination NUMA node {}", dest);
708                             return Err(Error::InvalidNumaConfig);
709                         }
710 
711                         if node.distances.contains_key(&dest) {
712                             error!("Destination NUMA node {} has been already set", dest);
713                             return Err(Error::InvalidNumaConfig);
714                         }
715 
716                         node.distances.insert(dest, dist);
717                     }
718                 }
719 
720                 #[cfg(target_arch = "x86_64")]
721                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
722                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
723                         let mm_sections = sgx_epc_region.epc_sections();
724                         for sgx_epc_section in sgx_epc_sections.iter() {
725                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
726                                 node.sgx_epc_sections.push(mm_section.clone());
727                             } else {
728                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
729                                 return Err(Error::InvalidNumaConfig);
730                             }
731                         }
732                     } else {
733                         error!("Missing SGX EPC region");
734                         return Err(Error::InvalidNumaConfig);
735                     }
736                 }
737 
738                 numa_nodes.insert(config.guest_numa_id, node);
739             }
740         }
741 
742         Ok(numa_nodes)
743     }
744 
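    /// Creates a `Vm` from scratch or restores one from a snapshot: creates
    /// the hypervisor VM, builds the `MemoryManager` (from the snapshot's
    /// memory state when one is provided), then delegates the rest to
    /// `new_from_memory_manager`.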
745     #[allow(clippy::too_many_arguments)]
746     pub fn new(
747         vm_config: Arc<Mutex<VmConfig>>,
748         exit_evt: EventFd,
749         reset_evt: EventFd,
750         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
751         seccomp_action: &SeccompAction,
752         hypervisor: Arc<dyn hypervisor::Hypervisor>,
753         activate_evt: EventFd,
754         serial_pty: Option<PtyPair>,
755         console_pty: Option<PtyPair>,
756         console_resize_pipe: Option<File>,
757         snapshot: Option<Snapshot>,
758         source_url: Option<&str>,
759         prefault: Option<bool>,
760     ) -> Result<Self> {
761         trace_scoped!("Vm::new");
762 
763         let timestamp = Instant::now();
764 
765         #[cfg(feature = "tdx")]
766         let tdx_enabled = if snapshot.is_some() {
767             false
768         } else {
769             vm_config.lock().unwrap().is_tdx_enabled()
770         };
771 
772         let vm = Self::create_hypervisor_vm(
773             &hypervisor,
774             #[cfg(feature = "tdx")]
775             tdx_enabled,
776         )?;
777 
778         let phys_bits = physical_bits(vm_config.lock().unwrap().cpus.max_phys_bits);
779 
780         let memory_manager = if let Some(snapshot) =
781             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
782         {
783             MemoryManager::new_from_snapshot(
784                 &snapshot,
785                 vm.clone(),
786                 &vm_config.lock().unwrap().memory.clone(),
787                 source_url,
788                 prefault.unwrap(),
789                 phys_bits,
790             )
791             .map_err(Error::MemoryManager)?
792         } else {
793             #[cfg(target_arch = "x86_64")]
794             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
795 
796             MemoryManager::new(
797                 vm.clone(),
798                 &vm_config.lock().unwrap().memory.clone(),
799                 None,
800                 phys_bits,
801                 #[cfg(feature = "tdx")]
802                 tdx_enabled,
803                 None,
804                 None,
805                 #[cfg(target_arch = "x86_64")]
806                 sgx_epc_config,
807             )
808             .map_err(Error::MemoryManager)?
809         };
810 
811         Vm::new_from_memory_manager(
812             vm_config,
813             memory_manager,
814             vm,
815             exit_evt,
816             reset_evt,
817             #[cfg(feature = "guest_debug")]
818             vm_debug_evt,
819             seccomp_action,
820             hypervisor,
821             activate_evt,
822             timestamp,
823             serial_pty,
824             console_pty,
825             console_resize_pipe,
826             snapshot,
827         )
828     }
829 
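    /// Creates the hypervisor-level VM object, selecting the TDX VM type
    /// when requested, then applies the x86_64-specific setup (identity map
    /// and TSS addresses, split IRQ chip).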
830     pub fn create_hypervisor_vm(
831         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
832         #[cfg(feature = "tdx")] tdx_enabled: bool,
833     ) -> Result<Arc<dyn hypervisor::Vm>> {
834         hypervisor.check_required_extensions().unwrap();
835 
836         #[cfg(feature = "tdx")]
837         let vm = hypervisor
838             .create_vm_with_type(if tdx_enabled {
839                 2 // KVM_X86_TDX_VM
840             } else {
841                 0 // KVM_X86_LEGACY_VM
842             })
843             .unwrap();
844         #[cfg(not(feature = "tdx"))]
845         let vm = hypervisor.create_vm().unwrap();
846 
847         #[cfg(target_arch = "x86_64")]
848         {
849             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
850                 .unwrap();
851             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
852             vm.enable_split_irq().unwrap();
853         }
854 
855         Ok(vm)
856     }
857 
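    /// Copies the initramfs file into guest memory at the address chosen by
    /// `arch::initramfs_load_addr` and returns where and how much was loaded.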
858     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
859         let mut initramfs = self.initramfs.as_ref().unwrap();
860         let size: usize = initramfs
861             .seek(SeekFrom::End(0))
862             .map_err(|_| Error::InitramfsLoad)?
863             .try_into()
864             .unwrap();
865         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
866 
867         let address =
868             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
869         let address = GuestAddress(address);
870 
871         guest_mem
872             .read_from(address, &mut initramfs, size)
873             .map_err(|_| Error::InitramfsLoad)?;
874 
875         info!("Initramfs loaded: address = 0x{:x}", address.0);
876         Ok(arch::InitramfsConfig { address, size })
877     }
878 
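    /// Assembles the kernel command line from the payload configuration,
    /// appending (on aarch64) any extra entries contributed by the devices.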
879     pub fn generate_cmdline(
880         payload: &PayloadConfig,
881         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
882     ) -> Result<Cmdline> {
883         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
884         if let Some(s) = payload.cmdline.as_ref() {
885             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
886         }
887 
888         #[cfg(target_arch = "aarch64")]
889         for entry in device_manager.lock().unwrap().cmdline_additions() {
890             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
891         }
892         Ok(cmdline)
893     }
894 
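    /// Copies the UEFI firmware image into the dedicated UEFI flash region.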
895     #[cfg(target_arch = "aarch64")]
896     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
897         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
898         let mem = uefi_flash.memory();
899         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
900             .map_err(Error::UefiLoad)?;
901         Ok(())
902     }
903 
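    /// Loads the aarch64 payload: a kernel image (tried as a PE binary
    /// first, falling back to raw UEFI firmware) or a standalone firmware,
    /// and returns the resulting guest entry point.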
904     #[cfg(target_arch = "aarch64")]
905     fn load_kernel(
906         firmware: Option<File>,
907         kernel: Option<File>,
908         memory_manager: Arc<Mutex<MemoryManager>>,
909     ) -> Result<EntryPoint> {
910         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
911         let mem = guest_memory.memory();
912         let entry_addr = match (firmware, kernel) {
913             (None, Some(mut kernel)) => {
914                 match linux_loader::loader::pe::PE::load(
915                     mem.deref(),
916                     Some(arch::layout::KERNEL_START),
917                     &mut kernel,
918                     None,
919                 ) {
920                     Ok(entry_addr) => entry_addr.kernel_load,
921                     // Try to load the binary as a kernel PE file first.
922                     // If that fails, retry loading it as a UEFI binary.
923                     // As the UEFI binary is formatless, it must be the last option tried.
924                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
925                         Self::load_firmware(&kernel, memory_manager)?;
926                         arch::layout::UEFI_START
927                     }
928                     Err(e) => {
929                         return Err(Error::KernelLoad(e));
930                     }
931                 }
932             }
933             (Some(firmware), None) => {
934                 Self::load_firmware(&firmware, memory_manager)?;
935                 arch::layout::UEFI_START
936             }
937             _ => return Err(Error::InvalidPayload),
938         };
939 
940         Ok(EntryPoint { entry_addr })
941     }
942 
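    /// Loads an ELF kernel (and, when provided, its command line) into guest
    /// memory. Only PVH-capable kernels are accepted; anything else fails
    /// with `Error::KernelMissingPvhHeader`.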
943     #[cfg(target_arch = "x86_64")]
944     fn load_kernel(
945         mut kernel: File,
946         cmdline: Option<Cmdline>,
947         memory_manager: Arc<Mutex<MemoryManager>>,
948     ) -> Result<EntryPoint> {
949         info!("Loading kernel");
950 
951         let mem = {
952             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
953             guest_memory.memory()
954         };
955         let entry_addr = linux_loader::loader::elf::Elf::load(
956             mem.deref(),
957             None,
958             &mut kernel,
959             Some(arch::layout::HIGH_RAM_START),
960         )
961         .map_err(Error::KernelLoad)?;
962 
963         if let Some(cmdline) = cmdline {
964             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
965                 .map_err(Error::LoadCmdLine)?;
966         }
967 
968         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
969             // Use the PVH kernel entry point to boot the guest
970             info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
971             Ok(EntryPoint {
972                 entry_addr: Some(entry_addr),
973             })
974         } else {
975             Err(Error::KernelMissingPvhHeader)
976         }
977     }
978 
979     #[cfg(target_arch = "x86_64")]
980     fn load_payload(
981         payload: &PayloadConfig,
982         memory_manager: Arc<Mutex<MemoryManager>>,
983     ) -> Result<EntryPoint> {
984         trace_scoped!("load_payload");
985         match (
986             &payload.firmware,
987             &payload.kernel,
988             &payload.initramfs,
989             &payload.cmdline,
990         ) {
991             (Some(firmware), None, None, None) => {
992                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
993                 Self::load_kernel(firmware, None, memory_manager)
994             }
995             (None, Some(kernel), _, _) => {
996                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
997                 let cmdline = Self::generate_cmdline(payload)?;
998                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
999             }
1000             _ => Err(Error::InvalidPayload),
1001         }
1002     }
1003 
1004     #[cfg(target_arch = "aarch64")]
1005     fn load_payload(
1006         payload: &PayloadConfig,
1007         memory_manager: Arc<Mutex<MemoryManager>>,
1008     ) -> Result<EntryPoint> {
1009         match (&payload.firmware, &payload.kernel) {
1010             (Some(firmware), None) => {
1011                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1012                 Self::load_kernel(Some(firmware), None, memory_manager)
1013             }
1014             (None, Some(kernel)) => {
1015                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1016                 Self::load_kernel(None, Some(kernel), memory_manager)
1017             }
1018             _ => Err(Error::InvalidPayload),
1019         }
1020     }
1021 
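    /// Spawns a "payload_loader" thread that loads the configured payload
    /// into guest memory in the background; the returned handle is joined
    /// later to obtain the boot `EntryPoint`. Returns `None` for TDX guests,
    /// whose payload goes through the TDVF path instead.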
1022     fn load_payload_async(
1023         memory_manager: &Arc<Mutex<MemoryManager>>,
1024         config: &Arc<Mutex<VmConfig>>,
1025     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1026         // The kernel is loaded differently when TDX is enabled
1027         #[cfg(feature = "tdx")]
1028         if config.lock().unwrap().is_tdx_enabled() {
1029             return Ok(None);
1030         }
1031 
1032         config
1033             .lock()
1034             .unwrap()
1035             .payload
1036             .as_ref()
1037             .map(|payload| {
1038                 let memory_manager = memory_manager.clone();
1039                 let payload = payload.clone();
1040 
1041                 std::thread::Builder::new()
1042                     .name("payload_loader".into())
1043                     .spawn(move || Self::load_payload(&payload, memory_manager))
1044                     .map_err(Error::KernelLoadThreadSpawn)
1045             })
1046             .transpose()
1047     }
1048 
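    /// Performs the final boot-time system configuration: loads the
    /// initramfs when one was provided and writes the boot metadata (vCPU
    /// count, RSDP address, SGX EPC region, SMBIOS data) into guest memory.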
1049     #[cfg(target_arch = "x86_64")]
1050     fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
1051         trace_scoped!("configure_system");
1052         info!("Configuring system");
1053         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1054 
1055         let initramfs_config = match self.initramfs {
1056             Some(_) => Some(self.load_initramfs(&mem)?),
1057             None => None,
1058         };
1059 
1060         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1061         let rsdp_addr = Some(rsdp_addr);
1062         let sgx_epc_region = self
1063             .memory_manager
1064             .lock()
1065             .unwrap()
1066             .sgx_epc_region()
1067             .as_ref()
1068             .cloned();
1069 
1070         let serial_number = self
1071             .config
1072             .lock()
1073             .unwrap()
1074             .platform
1075             .as_ref()
1076             .and_then(|p| p.serial_number.clone());
1077 
1078         let uuid = self
1079             .config
1080             .lock()
1081             .unwrap()
1082             .platform
1083             .as_ref()
1084             .and_then(|p| p.uuid.clone());
1085 
1086         let oem_strings = self
1087             .config
1088             .lock()
1089             .unwrap()
1090             .platform
1091             .as_ref()
1092             .and_then(|p| p.oem_strings.clone());
1093 
1094         let oem_strings = oem_strings
1095             .as_deref()
1096             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1097 
1098         arch::configure_system(
1099             &mem,
1100             arch::layout::CMDLINE_START,
1101             &initramfs_config,
1102             boot_vcpus,
1103             rsdp_addr,
1104             sgx_epc_region,
1105             serial_number.as_deref(),
1106             uuid.as_deref(),
1107             oem_strings.as_deref(),
1108         )
1109         .map_err(Error::ConfigureSystem)?;
1110         Ok(())
1111     }
1112 
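    /// aarch64 counterpart of the above: loads the initramfs, gathers PCI
    /// segment, vGIC and PMU information, and hands everything to
    /// `arch::configure_system` to build the device tree.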
1113     #[cfg(target_arch = "aarch64")]
1114     fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
1115         let cmdline = Self::generate_cmdline(
1116             self.config.lock().unwrap().payload.as_ref().unwrap(),
1117             &self.device_manager,
1118         )?;
1119         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1120         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1121         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1122         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1123         let initramfs_config = match self.initramfs {
1124             Some(_) => Some(self.load_initramfs(&mem)?),
1125             None => None,
1126         };
1127 
1128         let device_info = &self
1129             .device_manager
1130             .lock()
1131             .unwrap()
1132             .get_device_info()
1133             .clone();
1134 
1135         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1136             let pci_space = PciSpaceInfo {
1137                 pci_segment_id: pci_segment.id,
1138                 mmio_config_address: pci_segment.mmio_config_address,
1139                 pci_device_space_start: pci_segment.start_of_device_area,
1140                 pci_device_space_size: pci_segment.end_of_device_area
1141                     - pci_segment.start_of_device_area
1142                     + 1,
1143             };
1144             pci_space_info.push(pci_space);
1145         }
1146 
1147         let virtio_iommu_bdf = self
1148             .device_manager
1149             .lock()
1150             .unwrap()
1151             .iommu_attached_devices()
1152             .as_ref()
1153             .map(|(v, _)| *v);
1154 
1155         let vgic = self
1156             .device_manager
1157             .lock()
1158             .unwrap()
1159             .get_interrupt_controller()
1160             .unwrap()
1161             .lock()
1162             .unwrap()
1163             .get_vgic()
1164             .map_err(|_| {
1165                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1166                     arch::aarch64::Error::SetupGic,
1167                 ))
1168             })?;
1169 
1170         // The PMU interrupt is a PPI, so 16 must be added to get the real IRQ number.
1171         let pmu_supported = self
1172             .cpu_manager
1173             .lock()
1174             .unwrap()
1175             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1176             .map_err(|_| {
1177                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1178                     arch::aarch64::Error::VcpuInitPmu,
1179                 ))
1180             })?;
1181 
1182         arch::configure_system(
1183             &mem,
1184             cmdline.as_cstring().unwrap().to_str().unwrap(),
1185             vcpu_mpidrs,
1186             vcpu_topology,
1187             device_info,
1188             &initramfs_config,
1189             &pci_space_info,
1190             virtio_iommu_bdf.map(|bdf| bdf.into()),
1191             &vgic,
1192             &self.numa_nodes,
1193             pmu_supported,
1194         )
1195         .map_err(Error::ConfigureSystem)?;
1196 
1197         Ok(())
1198     }
1199 
1200     pub fn serial_pty(&self) -> Option<PtyPair> {
1201         self.device_manager.lock().unwrap().serial_pty()
1202     }
1203 
1204     pub fn console_pty(&self) -> Option<PtyPair> {
1205         self.device_manager.lock().unwrap().console_pty()
1206     }
1207 
1208     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1209         self.device_manager.lock().unwrap().console_resize_pipe()
1210     }
1211 
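    /// Shuts the VM down: restores the terminal to canonical mode, stops the
    /// signal-handler thread, resumes the `DeviceManager` threads so they
    /// can exit, stops the vCPUs and joins all remaining threads.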
1212     pub fn shutdown(&mut self) -> Result<()> {
1213         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1214         let new_state = VmState::Shutdown;
1215 
1216         state.valid_transition(new_state)?;
1217 
1218         if self.on_tty {
1219             // Don't forget to set the terminal back to canonical mode
1220             // before exiting.
1221             io::stdin()
1222                 .lock()
1223                 .set_canon_mode()
1224                 .map_err(Error::SetTerminalCanon)?;
1225         }
1226 
1227         // Trigger the termination of the signal_handler thread
1228         if let Some(signals) = self.signals.take() {
1229             signals.close();
1230         }
1231 
1232         // Wake up the DeviceManager threads so they can terminate cleanly
1233         self.device_manager
1234             .lock()
1235             .unwrap()
1236             .resume()
1237             .map_err(Error::Resume)?;
1238 
1239         self.cpu_manager
1240             .lock()
1241             .unwrap()
1242             .shutdown()
1243             .map_err(Error::CpuManager)?;
1244 
1245         // Wait for all the threads to finish
1246         for thread in self.threads.drain(..) {
1247             thread.join().map_err(Error::ThreadCleanup)?
1248         }
1249         *state = new_state;
1250 
1251         event!("vm", "shutdown");
1252 
1253         Ok(())
1254     }
1255 
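    /// Hotplugs (or unplugs) vCPUs, memory and/or the balloon. Every change
    /// is also written back to the stored `VmConfig` so that a subsequent
    /// reboot starts from the new values.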
1256     pub fn resize(
1257         &mut self,
1258         desired_vcpus: Option<u8>,
1259         desired_memory: Option<u64>,
1260         desired_balloon: Option<u64>,
1261     ) -> Result<()> {
1262         event!("vm", "resizing");
1263 
1264         if let Some(desired_vcpus) = desired_vcpus {
1265             if self
1266                 .cpu_manager
1267                 .lock()
1268                 .unwrap()
1269                 .resize(desired_vcpus)
1270                 .map_err(Error::CpuManager)?
1271             {
1272                 self.device_manager
1273                     .lock()
1274                     .unwrap()
1275                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1276                     .map_err(Error::DeviceManager)?;
1277             }
1278             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1279         }
1280 
1281         if let Some(desired_memory) = desired_memory {
1282             let new_region = self
1283                 .memory_manager
1284                 .lock()
1285                 .unwrap()
1286                 .resize(desired_memory)
1287                 .map_err(Error::MemoryManager)?;
1288 
1289             let mut memory_config = &mut self.config.lock().unwrap().memory;
1290 
1291             if let Some(new_region) = &new_region {
1292                 self.device_manager
1293                     .lock()
1294                     .unwrap()
1295                     .update_memory(new_region)
1296                     .map_err(Error::DeviceManager)?;
1297 
1298                 match memory_config.hotplug_method {
1299                     HotplugMethod::Acpi => {
1300                         self.device_manager
1301                             .lock()
1302                             .unwrap()
1303                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1304                             .map_err(Error::DeviceManager)?;
1305                     }
1306                     HotplugMethod::VirtioMem => {}
1307                 }
1308             }
1309 
1310             // We update the VM config regardless of whether the guest resize
1311             // operation actually happened, so that if the VM reboots it will
1312             // be running with the last configured memory size.
1313             match memory_config.hotplug_method {
1314                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1315                 HotplugMethod::VirtioMem => {
1316                     if desired_memory > memory_config.size {
1317                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1318                     } else {
1319                         memory_config.hotplugged_size = None;
1320                     }
1321                 }
1322             }
1323         }
1324 
1325         if let Some(desired_balloon) = desired_balloon {
1326             self.device_manager
1327                 .lock()
1328                 .unwrap()
1329                 .resize_balloon(desired_balloon)
1330                 .map_err(Error::DeviceManager)?;
1331 
1332             // Update the configuration value for the balloon size to ensure
1333             // a reboot would use the right value.
1334             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1335                 balloon_config.size = desired_balloon;
1336             }
1337         }
1338 
1339         event!("vm", "resized");
1340 
1341         Ok(())
1342     }
1343 
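    /// Resizes the memory zone identified by `id`. Shrinking a zone below
    /// its boot size is rejected with `Error::ResizeZone`.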
1344     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1345         let memory_config = &mut self.config.lock().unwrap().memory;
1346 
1347         if let Some(zones) = &mut memory_config.zones {
1348             for zone in zones.iter_mut() {
1349                 if zone.id == id {
1350                     if desired_memory >= zone.size {
1351                         let hotplugged_size = desired_memory - zone.size;
1352                         self.memory_manager
1353                             .lock()
1354                             .unwrap()
1355                             .resize_zone(&id, desired_memory - zone.size)
1356                             .map_err(Error::MemoryManager)?;
1357                         // We update the memory zone config regardless of
1358                         // whether the 'resize-zone' operation actually
1359                         // happened, so that if the VM reboots it will be
1360                         // running with the last configured memory zone size.
1361                         zone.hotplugged_size = Some(hotplugged_size);
1362 
1363                         return Ok(());
1364                     } else {
1365                         error!(
1366                             "Invalid to ask less ({}) than boot RAM ({}) for \
1367                             this memory zone",
1368                             desired_memory, zone.size,
1369                         );
1370                         return Err(Error::ResizeZone);
1371                     }
1372                 }
1373             }
1374         }
1375 
1376         error!("Could not find the memory zone {} for the resize", id);
1377         Err(Error::ResizeZone)
1378     }
1379 
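    /// Hotplugs a VFIO device. Like the other `add_*` helpers below, this
    /// asks the `DeviceManager` to create the device, records it in the
    /// `VmConfig` so it survives a reboot, and raises an ACPI notification
    /// so the guest rescans its PCI buses.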
1380     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1381         let pci_device_info = self
1382             .device_manager
1383             .lock()
1384             .unwrap()
1385             .add_device(&mut device_cfg)
1386             .map_err(Error::DeviceManager)?;
1387 
1388         // Update VmConfig by adding the new device. This is important to
1389         // ensure the device will be re-created in case of a reboot.
1390         {
1391             let mut config = self.config.lock().unwrap();
1392             add_to_config(&mut config.devices, device_cfg);
1393         }
1394 
1395         self.device_manager
1396             .lock()
1397             .unwrap()
1398             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1399             .map_err(Error::DeviceManager)?;
1400 
1401         Ok(pci_device_info)
1402     }
1403 
1404     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1405         let pci_device_info = self
1406             .device_manager
1407             .lock()
1408             .unwrap()
1409             .add_user_device(&mut device_cfg)
1410             .map_err(Error::DeviceManager)?;
1411 
1412         // Update VmConfig by adding the new device. This is important to
1413         // ensure the device will be re-created in case of a reboot.
1414         {
1415             let mut config = self.config.lock().unwrap();
1416             add_to_config(&mut config.user_devices, device_cfg);
1417         }
1418 
1419         self.device_manager
1420             .lock()
1421             .unwrap()
1422             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1423             .map_err(Error::DeviceManager)?;
1424 
1425         Ok(pci_device_info)
1426     }
1427 
1428     pub fn remove_device(&mut self, id: String) -> Result<()> {
1429         self.device_manager
1430             .lock()
1431             .unwrap()
1432             .remove_device(id.clone())
1433             .map_err(Error::DeviceManager)?;
1434 
1435         // Update VmConfig by removing the device. This is important to
1436         // ensure the device will not be re-created in case of a reboot.
1437         let mut config = self.config.lock().unwrap();
1438 
1439         // Remove if VFIO device
1440         if let Some(devices) = config.devices.as_mut() {
1441             devices.retain(|dev| dev.id.as_ref() != Some(&id));
1442         }
1443 
1444         // Remove if VFIO user device
1445         if let Some(user_devices) = config.user_devices.as_mut() {
1446             user_devices.retain(|dev| dev.id.as_ref() != Some(&id));
1447         }
1448 
1449         // Remove if disk device
1450         if let Some(disks) = config.disks.as_mut() {
1451             disks.retain(|dev| dev.id.as_ref() != Some(&id));
1452         }
1453 
1454         // Remove if fs device
1455         if let Some(fs) = config.fs.as_mut() {
1456             fs.retain(|dev| dev.id.as_ref() != Some(&id));
1457         }
1458 
1459         // Remove if net device
1460         if let Some(net) = config.net.as_mut() {
1461             net.retain(|dev| dev.id.as_ref() != Some(&id));
1462         }
1463 
1464         // Remove if pmem device
1465         if let Some(pmem) = config.pmem.as_mut() {
1466             pmem.retain(|dev| dev.id.as_ref() != Some(&id));
1467         }
1468 
1469         // Remove if vDPA device
1470         if let Some(vdpa) = config.vdpa.as_mut() {
1471             vdpa.retain(|dev| dev.id.as_ref() != Some(&id));
1472         }
1473 
1474         // Remove if vsock device
1475         if let Some(vsock) = config.vsock.as_ref() {
1476             if vsock.id.as_ref() == Some(&id) {
1477                 config.vsock = None;
1478             }
1479         }
1480 
1481         self.device_manager
1482             .lock()
1483             .unwrap()
1484             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1485             .map_err(Error::DeviceManager)?;
1486         Ok(())
1487     }
1488 
1489     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1490         let pci_device_info = self
1491             .device_manager
1492             .lock()
1493             .unwrap()
1494             .add_disk(&mut disk_cfg)
1495             .map_err(Error::DeviceManager)?;
1496 
1497         // Update VmConfig by adding the new device. This is important to
1498         // ensure the device will be re-created in case of a reboot.
1499         {
1500             let mut config = self.config.lock().unwrap();
1501             add_to_config(&mut config.disks, disk_cfg);
1502         }
1503 
1504         self.device_manager
1505             .lock()
1506             .unwrap()
1507             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1508             .map_err(Error::DeviceManager)?;
1509 
1510         Ok(pci_device_info)
1511     }
1512 
1513     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1514         let pci_device_info = self
1515             .device_manager
1516             .lock()
1517             .unwrap()
1518             .add_fs(&mut fs_cfg)
1519             .map_err(Error::DeviceManager)?;
1520 
1521         // Update VmConfig by adding the new device. This is important to
1522         // ensure the device will be re-created in case of a reboot.
1523         {
1524             let mut config = self.config.lock().unwrap();
1525             add_to_config(&mut config.fs, fs_cfg);
1526         }
1527 
1528         self.device_manager
1529             .lock()
1530             .unwrap()
1531             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1532             .map_err(Error::DeviceManager)?;
1533 
1534         Ok(pci_device_info)
1535     }
1536 
1537     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1538         let pci_device_info = self
1539             .device_manager
1540             .lock()
1541             .unwrap()
1542             .add_pmem(&mut pmem_cfg)
1543             .map_err(Error::DeviceManager)?;
1544 
1545         // Update VmConfig by adding the new device. This is important to
1546         // ensure the device will be re-created in case of a reboot.
1547         {
1548             let mut config = self.config.lock().unwrap();
1549             add_to_config(&mut config.pmem, pmem_cfg);
1550         }
1551 
1552         self.device_manager
1553             .lock()
1554             .unwrap()
1555             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1556             .map_err(Error::DeviceManager)?;
1557 
1558         Ok(pci_device_info)
1559     }
1560 
1561     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1562         let pci_device_info = self
1563             .device_manager
1564             .lock()
1565             .unwrap()
1566             .add_net(&mut net_cfg)
1567             .map_err(Error::DeviceManager)?;
1568 
1569         // Update VmConfig by adding the new device. This is important to
1570         // ensure the device will be re-created in case of a reboot.
1571         {
1572             let mut config = self.config.lock().unwrap();
1573             add_to_config(&mut config.net, net_cfg);
1574         }
1575 
1576         self.device_manager
1577             .lock()
1578             .unwrap()
1579             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1580             .map_err(Error::DeviceManager)?;
1581 
1582         Ok(pci_device_info)
1583     }
1584 
1585     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1586         let pci_device_info = self
1587             .device_manager
1588             .lock()
1589             .unwrap()
1590             .add_vdpa(&mut vdpa_cfg)
1591             .map_err(Error::DeviceManager)?;
1592 
1593         // Update VmConfig by adding the new device. This is important to
1594         // ensure the device will be re-created in case of a reboot.
1595         {
1596             let mut config = self.config.lock().unwrap();
1597             add_to_config(&mut config.vdpa, vdpa_cfg);
1598         }
1599 
1600         self.device_manager
1601             .lock()
1602             .unwrap()
1603             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1604             .map_err(Error::DeviceManager)?;
1605 
1606         Ok(pci_device_info)
1607     }
1608 
1609     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1610         let pci_device_info = self
1611             .device_manager
1612             .lock()
1613             .unwrap()
1614             .add_vsock(&mut vsock_cfg)
1615             .map_err(Error::DeviceManager)?;
1616 
1617         // Update VmConfig by adding the new device. This is important to
1618         // ensure the device will be re-created in case of a reboot.
1619         {
1620             let mut config = self.config.lock().unwrap();
1621             config.vsock = Some(vsock_cfg);
1622         }
1623 
1624         self.device_manager
1625             .lock()
1626             .unwrap()
1627             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1628             .map_err(Error::DeviceManager)?;
1629 
1630         Ok(pci_device_info)
1631     }
1632 
1633     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1634         Ok(self.device_manager.lock().unwrap().counters())
1635     }
1636 
1637     fn signal_handler(mut signals: Signals, console_input_clone: Arc<Console>) {
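        // Unblock the handled signals on this thread so that the iterator
        // returned by `signals.forever()` can actually receive them.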
1638         for sig in &Vm::HANDLED_SIGNALS {
1639             unblock_signal(*sig).unwrap();
1640         }
1641 
1642         for signal in signals.forever() {
1643             if signal == SIGWINCH {
1644                 console_input_clone.update_console_size();
1645             }
1646         }
1647     }
1648 
1649     #[cfg(feature = "tdx")]
1650     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1651         use arch::x86_64::tdx::*;
1652 
1653         let firmware_path = self
1654             .config
1655             .lock()
1656             .unwrap()
1657             .payload
1658             .as_ref()
1659             .unwrap()
1660             .firmware
1661             .clone()
1662             .ok_or(Error::TdxFirmwareMissing)?;
1663         // The TDVF file contains a table of sections as well as code
1664         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1665 
1666         // Parse the TDVF sections; RAM backing them is allocated later in populate_tdx_sections()
1667         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1668     }
1669 
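    // Builds the list of (start, size, is_ram) resources describing how guest
    // RAM interleaves with the TDVF sections. `sorted_sections` must be sorted
    // by descending address so that pop() yields the lowest section first.
    //
    // Illustrative example: one RAM region covering [0x0000, 0xffff] and one
    // section at 0x1000 of size 0x4000 produce:
    //   (0x0000, 0x1000, true), (0x1000, 0x4000, false), (0x5000, 0xb000, true)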
1670     #[cfg(feature = "tdx")]
1671     fn hob_memory_resources(
1672         mut sorted_sections: Vec<TdvfSection>,
1673         guest_memory: &GuestMemoryMmap,
1674     ) -> Vec<(u64, u64, bool)> {
1675         let mut list = Vec::new();
1676 
1677         let mut current_section = sorted_sections.pop();
1678 
1679         // RAM regions interleaved with TDVF sections
1680         let mut next_start_addr = 0;
1681         for region in guest_memory.iter() {
1682             let region_start = region.start_addr().0;
1683             let region_end = region.last_addr().0;
1684             if region_start > next_start_addr {
1685                 next_start_addr = region_start;
1686             }
1687 
1688             loop {
1689                 let (start, size, ram) = if let Some(section) = &current_section {
1690                     if section.address <= next_start_addr {
1691                         (section.address, section.size, false)
1692                     } else {
1693                         let last_addr = std::cmp::min(section.address - 1, region_end);
1694                         (next_start_addr, last_addr - next_start_addr + 1, true)
1695                     }
1696                 } else {
1697                     (next_start_addr, region_end - next_start_addr + 1, true)
1698                 };
1699 
1700                 list.push((start, size, ram));
1701 
1702                 if !ram {
1703                     current_section = sorted_sections.pop();
1704                 }
1705 
1706                 next_start_addr = start + size;
1707 
1708                 if region_start > next_start_addr {
1709                     next_start_addr = region_start;
1710                 }
1711 
1712                 if next_start_addr > region_end {
1713                     break;
1714                 }
1715             }
1716         }
1717 
1718         // Once all the interleaved sections have been processed, let's simply
1719         // pull the remaining ones.
1720         if let Some(section) = current_section {
1721             list.push((section.address, section.size, false));
1722         }
1723         while let Some(section) = sorted_sections.pop() {
1724             list.push((section.address, section.size, false));
1725         }
1726 
1727         list
1728     }
1729 
1730     #[cfg(feature = "tdx")]
1731     fn populate_tdx_sections(
1732         &mut self,
1733         sections: &[TdvfSection],
1734         guid_found: bool,
1735     ) -> Result<Option<u64>> {
1736         use arch::x86_64::tdx::*;
1737         // Get the memory end *before* we start adding TDVF ram regions
1738         let boot_guest_memory = self
1739             .memory_manager
1740             .lock()
1741             .as_ref()
1742             .unwrap()
1743             .boot_guest_memory();
1744         for section in sections {
1745             // No need to allocate if the section falls within guest RAM ranges
1746             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1747                 info!(
1748                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1749                     section
1750                 );
1751                 continue;
1752             }
1753 
1754             info!("Allocating TDVF Section: {:x?}", section);
1755             self.memory_manager
1756                 .lock()
1757                 .unwrap()
1758                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1759                 .map_err(Error::AllocatingTdvfMemory)?;
1760         }
1761 
1762         // The TDVF file contains a table of sections as well as code
1763         let firmware_path = self
1764             .config
1765             .lock()
1766             .unwrap()
1767             .payload
1768             .as_ref()
1769             .unwrap()
1770             .firmware
1771             .clone()
1772             .ok_or(Error::TdxFirmwareMissing)?;
1773         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1774 
1775         // The guest memory now has all the required regions, so it is safe
1776         // to copy from the TDVF file into it.
1777         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1778         let mem = guest_memory.memory();
1779         let mut payload_info = None;
1780         let mut hob_offset = None;
1781         for section in sections {
1782             info!("Populating TDVF Section: {:x?}", section);
1783             match section.r#type {
1784                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1785                     info!("Copying section to guest memory");
1786                     firmware_file
1787                         .seek(SeekFrom::Start(section.data_offset as u64))
1788                         .map_err(Error::LoadTdvf)?;
1789                     mem.read_from(
1790                         GuestAddress(section.address),
1791                         &mut firmware_file,
1792                         section.data_size as usize,
1793                     )
1794                     .unwrap();
1795                 }
1796                 TdvfSectionType::TdHob => {
1797                     hob_offset = Some(section.address);
1798                 }
1799                 TdvfSectionType::Payload => {
1800                     info!("Copying payload to guest memory");
1801                     if let Some(payload_file) = self.kernel.as_mut() {
1802                         let payload_size = payload_file
1803                             .seek(SeekFrom::End(0))
1804                             .map_err(Error::LoadPayload)?;
1805 
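                        // 0x1f1 is the offset of the Linux boot protocol's
                        // setup_header within a bzImage (see the kernel's
                        // Documentation/x86/boot.rst).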
1806                         payload_file
1807                             .seek(SeekFrom::Start(0x1f1))
1808                             .map_err(Error::LoadPayload)?;
1809 
1810                         let mut payload_header = linux_loader::bootparam::setup_header::default();
1811                         payload_header
1812                             .as_bytes()
1813                             .read_from(
1814                                 0,
1815                                 payload_file,
1816                                 mem::size_of::<linux_loader::bootparam::setup_header>(),
1817                             )
1818                             .unwrap();
1819 
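                        // The header field must contain the "HdrS" magic
                        // (0x5372_6448 when read as little-endian), otherwise
                        // this is not a Linux boot image.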
1820                         if payload_header.header != 0x5372_6448 {
1821                             return Err(Error::InvalidPayloadType);
1822                         }
1823 
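                        // Require boot protocol version >= 2.00 and the
                        // LOADED_HIGH flag (bit 0 of loadflags), i.e. a
                        // bzImage rather than a legacy zImage.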
1824                         if (payload_header.version < 0x0200)
1825                             || ((payload_header.loadflags & 0x1) == 0x0)
1826                         {
1827                             return Err(Error::InvalidPayloadType);
1828                         }
1829 
1830                         payload_file.rewind().map_err(Error::LoadPayload)?;
1831                         mem.read_from(
1832                             GuestAddress(section.address),
1833                             payload_file,
1834                             payload_size as usize,
1835                         )
1836                         .unwrap();
1837 
1838                         // Create the payload info that will be inserted into
1839                         // the HOB.
1840                         payload_info = Some(PayloadInfo {
1841                             image_type: PayloadImageType::BzImage,
1842                             entry_point: section.address,
1843                         });
1844                     }
1845                 }
1846                 TdvfSectionType::PayloadParam => {
1847                     info!("Copying payload parameters to guest memory");
1848                     let cmdline = Self::generate_cmdline(
1849                         self.config.lock().unwrap().payload.as_ref().unwrap(),
1850                     )?;
1851                     mem.write_slice(
1852                         cmdline.as_cstring().unwrap().as_bytes_with_nul(),
1853                         GuestAddress(section.address),
1854                     )
1855                     .unwrap();
1856                 }
1857                 _ => {}
1858             }
1859         }
1860 
1861         // Generate HOB
1862         let mut hob = TdHob::start(hob_offset.unwrap());
1863 
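        // Keep only the TempMem sections, sorted by descending address so that
        // hob_memory_resources() can pop() them lowest-address first.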
1864         let mut sorted_sections = sections.to_vec();
1865         sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));
1866 
1867         sorted_sections.sort_by_key(|section| section.address);
1868         sorted_sections.reverse();
1869 
1870         for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
1871             hob.add_memory_resource(&mem, start, size, ram, guid_found)
1872                 .map_err(Error::PopulateHob)?;
1873         }
1874 
1875         // MMIO regions
1876         hob.add_mmio_resource(
1877             &mem,
1878             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1879             arch::layout::APIC_START.raw_value()
1880                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1881         )
1882         .map_err(Error::PopulateHob)?;
1883         let start_of_device_area = self
1884             .memory_manager
1885             .lock()
1886             .unwrap()
1887             .start_of_device_area()
1888             .raw_value();
1889         let end_of_device_area = self
1890             .memory_manager
1891             .lock()
1892             .unwrap()
1893             .end_of_device_area()
1894             .raw_value();
1895         hob.add_mmio_resource(
1896             &mem,
1897             start_of_device_area,
1898             end_of_device_area - start_of_device_area,
1899         )
1900         .map_err(Error::PopulateHob)?;
1901 
1902         // Loop over the ACPI tables and copy them to the HOB.
1903 
1904         for acpi_table in crate::acpi::create_acpi_tables_tdx(
1905             &self.device_manager,
1906             &self.cpu_manager,
1907             &self.memory_manager,
1908             &self.numa_nodes,
1909         ) {
1910             hob.add_acpi_table(&mem, acpi_table.as_slice())
1911                 .map_err(Error::PopulateHob)?;
1912         }
1913 
1914         // If a payload info has been created, let's insert it into the HOB.
1915         if let Some(payload_info) = payload_info {
1916             hob.add_payload(&mem, payload_info)
1917                 .map_err(Error::PopulateHob)?;
1918         }
1919 
1920         hob.finish(&mem).map_err(Error::PopulateHob)?;
1921 
1922         Ok(hob_offset)
1923     }
1924 
1925     #[cfg(feature = "tdx")]
1926     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1927         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1928         let mem = guest_memory.memory();
1929 
1930         for section in sections {
1931             self.vm
1932                 .tdx_init_memory_region(
1933                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
1934                     section.address,
1935                     section.size,
1936                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
1937                     section.attributes == 1,
1938                 )
1939                 .map_err(Error::InitializeTdxMemoryRegion)?;
1940         }
1941 
1942         Ok(())
1943     }
1944 
1945     fn setup_signal_handler(&mut self) -> Result<()> {
1946         let console = self.device_manager.lock().unwrap().console().clone();
1947         let signals = Signals::new(Vm::HANDLED_SIGNALS);
1948         match signals {
1949             Ok(signals) => {
1950                 self.signals = Some(signals.handle());
1951                 let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?;
1952                 let signal_handler_seccomp_filter = get_seccomp_filter(
1953                     &self.seccomp_action,
1954                     Thread::SignalHandler,
1955                     self.hypervisor.hypervisor_type(),
1956                 )
1957                 .map_err(Error::CreateSeccompFilter)?;
1958                 self.threads.push(
1959                     thread::Builder::new()
1960                         .name("vm_signal_handler".to_string())
1961                         .spawn(move || {
1962                             if !signal_handler_seccomp_filter.is_empty() {
1963                                 if let Err(e) = apply_filter(&signal_handler_seccomp_filter)
1964                                     .map_err(Error::ApplySeccompFilter)
1965                                 {
1966                                     error!("Error applying seccomp filter: {:?}", e);
1967                                     exit_evt.write(1).ok();
1968                                     return;
1969                                 }
1970                             }
1971                             std::panic::catch_unwind(AssertUnwindSafe(|| {
1972                                 Vm::signal_handler(signals, console);
1973                             }))
1974                             .map_err(|_| {
1975                                 error!("signal_handler thread panicked");
1976                                 exit_evt.write(1).ok()
1977                             })
1978                             .ok();
1979                         })
1980                         .map_err(Error::SignalHandlerSpawn)?,
1981                 );
1982             }
1983             Err(e) => error!("Failed to register signal handlers: {}", e),
1984         }
1985         Ok(())
1986     }
1987 
1988     fn setup_tty(&self) -> Result<()> {
1989         if self.on_tty {
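            // Put the host terminal into raw mode so that input bytes are
            // forwarded to the guest console unmodified (no local echo or
            // line buffering).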
1990             io::stdin()
1991                 .lock()
1992                 .set_raw_mode()
1993                 .map_err(Error::SetTerminalRaw)?;
1994         }
1995 
1996         Ok(())
1997     }
1998 
1999     // Creates the ACPI tables.
2000     // When TDX is in use this is a no-op, since the tables are created and
2001     // passed to the guest when populating the HOB.
2003     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2004         #[cfg(feature = "tdx")]
2005         if self.config.lock().unwrap().is_tdx_enabled() {
2006             return None;
2007         }
2008         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2009         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2010         let rsdp_addr = crate::acpi::create_acpi_tables(
2011             &mem,
2012             &self.device_manager,
2013             &self.cpu_manager,
2014             &self.memory_manager,
2015             &self.numa_nodes,
2016             tpm_enabled,
2017         );
2018         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2019 
2020         Some(rsdp_addr)
2021     }
2022 
2023     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2024         trace_scoped!("entry_point");
2025 
2026         self.load_payload_handle
2027             .take()
2028             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2029             .transpose()
2030     }
2031 
2032     pub fn boot(&mut self) -> Result<()> {
2033         trace_scoped!("Vm::boot");
2034         info!("Booting VM");
2035         event!("vm", "booting");
2036         let current_state = self.get_state()?;
2037         if current_state == VmState::Paused {
2038             return self.resume().map_err(Error::Resume);
2039         }
2040 
2041         let new_state = if self.stop_on_boot {
2042             VmState::BreakPoint
2043         } else {
2044             VmState::Running
2045         };
2046         current_state.valid_transition(new_state)?;
2047 
2048         // Do this early so it can run in parallel with loading the kernel
2049         #[cfg(target_arch = "x86_64")]
2050         let rsdp_addr = self.create_acpi_tables();
2051 
2052         self.setup_signal_handler()?;
2053         self.setup_tty()?;
2054 
2055         // Load the kernel synchronously, or wait for an asynchronous load
2056         // to finish.
2057         let entry_point = self.entry_point()?;
2058 
2059         #[cfg(feature = "tdx")]
2060         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2061 
2062         // Configure the vcpus that have been created
2063         let vcpus = self.cpu_manager.lock().unwrap().vcpus();
2064         for vcpu in vcpus {
2065             let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
2066             let boot_setup = entry_point.map(|e| (e, guest_memory));
2067             self.cpu_manager
2068                 .lock()
2069                 .unwrap()
2070                 .configure_vcpu(vcpu, boot_setup)
2071                 .map_err(Error::CpuManager)?;
2072         }
2073 
2074         #[cfg(feature = "tdx")]
2075         let (sections, guid_found) = if tdx_enabled {
2076             self.extract_tdvf_sections()?
2077         } else {
2078             (Vec::new(), false)
2079         };
2080 
2081         // Configuring the TDX regions requires that the vCPUs are created.
2082         #[cfg(feature = "tdx")]
2083         let hob_address = if tdx_enabled {
2084             // TDX sections are written to memory.
2085             self.populate_tdx_sections(&sections, guid_found)?
2086         } else {
2087             None
2088         };
2089 
2090         // On aarch64 the ACPI tables depend on the vCPU MPIDR, which is only
2091         // available once the vCPUs have been configured
2092         #[cfg(target_arch = "aarch64")]
2093         let rsdp_addr = self.create_acpi_tables();
2094 
2095         // Configure shared state based on loaded kernel
2096         entry_point
2097             .map(|_| {
2098                 // Safe to unwrap rsdp_addr as we know it can't be None when
2099                 // the entry_point is Some.
2100                 self.configure_system(rsdp_addr.unwrap())
2101             })
2102             .transpose()?;
2103 
2104         #[cfg(feature = "tdx")]
2105         if let Some(hob_address) = hob_address {
2106             // With the HOB address extracted the vCPUs can have
2107             // their TDX state configured.
2108             self.cpu_manager
2109                 .lock()
2110                 .unwrap()
2111                 .initialize_tdx(hob_address)
2112                 .map_err(Error::CpuManager)?;
2113             // Let the hypervisor know which memory ranges are shared with the
2114             // guest. This prevents the guest from ignoring/discarding memory
2115             // regions provided by the host.
2116             self.init_tdx_memory(&sections)?;
2117             // With TDX memory and CPU state configured TDX setup is complete
2118             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
2119         }
2120 
2121         self.cpu_manager
2122             .lock()
2123             .unwrap()
2124             .start_boot_vcpus(new_state == VmState::BreakPoint)
2125             .map_err(Error::CpuManager)?;
2126 
2127         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
2128         *state = new_state;
2129         event!("vm", "booted");
2130         Ok(())
2131     }
2132 
2133     pub fn restore(&mut self) -> Result<()> {
2134         event!("vm", "restoring");
2135 
2136         // Now we can start all vCPUs from here.
2137         self.cpu_manager
2138             .lock()
2139             .unwrap()
2140             .start_restored_vcpus()
2141             .map_err(Error::CpuManager)?;
2142 
2143         self.setup_signal_handler()?;
2144         self.setup_tty()?;
2145 
2146         event!("vm", "restored");
2147         Ok(())
2148     }
2149 
2150     /// Gets a thread-safe reference counted pointer to the VM configuration.
2151     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2152         Arc::clone(&self.config)
2153     }
2154 
2155     /// Get the VM state. Returns an error if the state is poisoned.
2156     pub fn get_state(&self) -> Result<VmState> {
2157         self.state
2158             .try_read()
2159             .map_err(|_| Error::PoisonedState)
2160             .map(|state| *state)
2161     }
2162 
2163     /// Gets the actual size of the balloon.
2164     pub fn balloon_size(&self) -> u64 {
2165         self.device_manager.lock().unwrap().balloon_size()
2166     }
2167 
2168     pub fn send_memory_fds(
2169         &mut self,
2170         socket: &mut UnixStream,
2171     ) -> std::result::Result<(), MigratableError> {
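        // For each guest memory slot: announce a MemoryFd request, send the
        // slot number with its backing file descriptor attached to the UNIX
        // socket message, then wait for the destination to acknowledge.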
2172         for (slot, fd) in self
2173             .memory_manager
2174             .lock()
2175             .unwrap()
2176             .memory_slot_fds()
2177             .drain()
2178         {
2179             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2180                 .write_to(socket)
2181                 .map_err(|e| {
2182                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2183                 })?;
2184             socket
2185                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2186                 .map_err(|e| {
2187                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2188                 })?;
2189 
2190             let res = Response::read_from(socket)?;
2191             if res.status() != Status::Ok {
2192                 warn!("Error during memory fd migration");
2193                 Request::abandon().write_to(socket)?;
2194                 Response::read_from(socket).ok();
2195                 return Err(MigratableError::MigrateSend(anyhow!(
2196                     "Error during memory fd migration"
2197                 )));
2198             }
2199         }
2200 
2201         Ok(())
2202     }
2203 
2204     pub fn send_memory_regions<F>(
2205         &mut self,
2206         ranges: &MemoryRangeTable,
2207         fd: &mut F,
2208     ) -> std::result::Result<(), MigratableError>
2209     where
2210         F: Write,
2211     {
2212         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2213         let mem = guest_memory.memory();
2214 
2215         for range in ranges.regions() {
2216             let mut offset: u64 = 0;
2217             // Here we are manually handling the retry in case we can't write
2218             // the whole region at once: we can't use vm-memory's
2219             // GuestMemory::write_all_to() implementation as it does not follow
2220             // the correct behavior. For more info about this issue see:
2221             // https://github.com/rust-vmm/vm-memory/issues/174
2222             loop {
2223                 let bytes_written = mem
2224                     .write_to(
2225                         GuestAddress(range.gpa + offset),
2226                         fd,
2227                         (range.length - offset) as usize,
2228                     )
2229                     .map_err(|e| {
2230                         MigratableError::MigrateSend(anyhow!(
2231                             "Error transferring memory to socket: {}",
2232                             e
2233                         ))
2234                     })?;
2235                 offset += bytes_written as u64;
2236 
2237                 if offset == range.length {
2238                     break;
2239                 }
2240             }
2241         }
2242 
2243         Ok(())
2244     }
2245 
2246     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2247         self.memory_manager
2248             .lock()
2249             .unwrap()
2250             .memory_range_table(false)
2251     }
2252 
2253     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2254         self.device_manager.lock().unwrap().device_tree()
2255     }
2256 
2257     pub fn activate_virtio_devices(&self) -> Result<()> {
2258         self.device_manager
2259             .lock()
2260             .unwrap()
2261             .activate_virtio_devices()
2262             .map_err(Error::ActivateVirtioDevices)
2263     }
2264 
2265     #[cfg(target_arch = "x86_64")]
2266     pub fn power_button(&self) -> Result<()> {
2267         self.device_manager
2268             .lock()
2269             .unwrap()
2270             .notify_power_button()
2271             .map_err(Error::PowerButton)
2273     }
2274 
2275     #[cfg(target_arch = "aarch64")]
2276     pub fn power_button(&self) -> Result<()> {
2277         self.device_manager
2278             .lock()
2279             .unwrap()
2280             .notify_power_button()
2281             .map_err(Error::PowerButton)
2282     }
2283 
2284     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2285         self.memory_manager.lock().unwrap().snapshot_data()
2286     }
2287 
2288     #[cfg(feature = "guest_debug")]
2289     pub fn debug_request(
2290         &mut self,
2291         gdb_request: &GdbRequestPayload,
2292         cpu_id: usize,
2293     ) -> Result<GdbResponsePayload> {
2294         use GdbRequestPayload::*;
2295         match gdb_request {
2296             SetSingleStep(single_step) => {
2297                 self.set_guest_debug(cpu_id, &[], *single_step)
2298                     .map_err(Error::Debug)?;
2299             }
2300             SetHwBreakPoint(addrs) => {
2301                 self.set_guest_debug(cpu_id, addrs, false)
2302                     .map_err(Error::Debug)?;
2303             }
2304             Pause => {
2305                 self.debug_pause().map_err(Error::Debug)?;
2306             }
2307             Resume => {
2308                 self.debug_resume().map_err(Error::Debug)?;
2309             }
2310             ReadRegs => {
2311                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2312                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2313             }
2314             WriteRegs(regs) => {
2315                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2316             }
2317             ReadMem(vaddr, len) => {
2318                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2319                 let mem = self
2320                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2321                     .map_err(Error::Debug)?;
2322                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2323             }
2324             WriteMem(vaddr, data) => {
2325                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2326                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2327                     .map_err(Error::Debug)?;
2328             }
2329             ActiveVcpus => {
2330                 let active_vcpus = self.active_vcpus();
2331                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2332             }
2333         }
2334         Ok(GdbResponsePayload::CommandComplete)
2335     }
2336 
2337     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2338     fn get_dump_state(
2339         &mut self,
2340         destination_url: &str,
2341     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2342         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2343         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2344         let mut elf_phdr_num = 1;
2345         let elf_sh_info = 0;
2346         let coredump_file_path = url_to_file(destination_url)?;
2347         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2348 
2349         if mapping_num < UINT16_MAX - 2 {
2350             elf_phdr_num += mapping_num as u16;
2351         } else {
2352             panic!("mapping num beyond 65535 not supported");
2353         }
2354         let coredump_file = OpenOptions::new()
2355             .read(true)
2356             .write(true)
2357             .create_new(true)
2358             .open(coredump_file_path)
2359             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2360 
2361         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2362         let mem_data = self
2363             .memory_manager
2364             .lock()
2365             .unwrap()
2366             .coredump_memory_regions(mem_offset);
2367 
2368         Ok(DumpState {
2369             elf_note_size,
2370             elf_phdr_num,
2371             elf_sh_info,
2372             mem_offset,
2373             mem_info: Some(mem_data),
2374             file: Some(coredump_file),
2375         })
2376     }
2377 
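    // Guest memory contents are placed after the ELF header, the note section
    // and all the program headers, hence this offset.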
2378     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2379     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2380         size_of::<elf::Elf64_Ehdr>() as u64
2381             + note_size as u64
2382             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2383     }
2384 }
2385 
2386 impl Pausable for Vm {
2387     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2388         event!("vm", "pausing");
2389         let mut state = self
2390             .state
2391             .try_write()
2392             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2393         let new_state = VmState::Paused;
2394 
2395         state
2396             .valid_transition(new_state)
2397             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2398 
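        // Save the KVM clock state so that the guest's notion of time can be
        // restored when the VM is resumed.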
2399         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2400         {
2401             let mut clock = self
2402                 .vm
2403                 .get_clock()
2404                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2405             clock.reset_flags();
2406             self.saved_clock = Some(clock);
2407         }
2408 
2409         // Before pausing the vCPUs, activate any virtio devices still pending
2410         // activation, e.g. ones requested since the pause (or the migration it
2411         // is part of) was started.
2411         self.activate_virtio_devices().map_err(|e| {
2412             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2413         })?;
2414 
2415         self.cpu_manager.lock().unwrap().pause()?;
2416         self.device_manager.lock().unwrap().pause()?;
2417 
2418         *state = new_state;
2419 
2420         event!("vm", "paused");
2421         Ok(())
2422     }
2423 
2424     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2425         event!("vm", "resuming");
2426         let mut state = self
2427             .state
2428             .try_write()
2429             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2430         let new_state = VmState::Running;
2431 
2432         state
2433             .valid_transition(new_state)
2434             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2435 
2436         self.cpu_manager.lock().unwrap().resume()?;
2437         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2438         {
2439             if let Some(clock) = &self.saved_clock {
2440                 self.vm.set_clock(clock).map_err(|e| {
2441                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2442                 })?;
2443             }
2444         }
2445         self.device_manager.lock().unwrap().resume()?;
2446 
2447         // And we're back to the Running state.
2448         *state = new_state;
2449         event!("vm", "resumed");
2450         Ok(())
2451     }
2452 }
2453 
2454 #[derive(Serialize, Deserialize)]
2455 pub struct VmSnapshot {
2456     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2457     pub clock: Option<hypervisor::ClockData>,
2458     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2459     pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
2460 }
2461 
2462 pub const VM_SNAPSHOT_ID: &str = "vm";
2463 impl Snapshottable for Vm {
2464     fn id(&self) -> String {
2465         VM_SNAPSHOT_ID.to_string()
2466     }
2467 
2468     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2469         event!("vm", "snapshotting");
2470 
2471         #[cfg(feature = "tdx")]
2472         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2473 
2474         #[cfg(feature = "tdx")]
2475         {
2476             if tdx_enabled {
2477                 return Err(MigratableError::Snapshot(anyhow!(
2478                     "Snapshot not possible with TDX VM"
2479                 )));
2480             }
2481         }
2482 
2483         let current_state = self.get_state().unwrap();
2484         if current_state != VmState::Paused {
2485             return Err(MigratableError::Snapshot(anyhow!(
2486                 "Trying to snapshot while VM is running"
2487             )));
2488         }
2489 
2490         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2491         let common_cpuid = {
2492             let phys_bits = physical_bits(self.config.lock().unwrap().cpus.max_phys_bits);
2493             arch::generate_common_cpuid(
2494                 &self.hypervisor,
2495                 None,
2496                 None,
2497                 phys_bits,
2498                 self.config.lock().unwrap().cpus.kvm_hyperv,
2499                 #[cfg(feature = "tdx")]
2500                 tdx_enabled,
2501             )
2502             .map_err(|e| {
2503                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2504             })?
2505         };
2506 
2507         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2508             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2509             clock: self.saved_clock,
2510             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2511             common_cpuid,
2512         })
2513         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2514 
2515         let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data));
2516 
2517         let (id, snapshot) = {
2518             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2519             (cpu_manager.id(), cpu_manager.snapshot()?)
2520         };
2521         vm_snapshot.add_snapshot(id, snapshot);
2522         let (id, snapshot) = {
2523             let mut memory_manager = self.memory_manager.lock().unwrap();
2524             (memory_manager.id(), memory_manager.snapshot()?)
2525         };
2526         vm_snapshot.add_snapshot(id, snapshot);
2527         let (id, snapshot) = {
2528             let mut device_manager = self.device_manager.lock().unwrap();
2529             (device_manager.id(), device_manager.snapshot()?)
2530         };
2531         vm_snapshot.add_snapshot(id, snapshot);
2532 
2533         event!("vm", "snapshotted");
2534         Ok(vm_snapshot)
2535     }
2536 }
2537 
2538 impl Transportable for Vm {
2539     fn send(
2540         &self,
2541         snapshot: &Snapshot,
2542         destination_url: &str,
2543     ) -> std::result::Result<(), MigratableError> {
2544         let mut snapshot_config_path = url_to_path(destination_url)?;
2545         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2546 
2547         // Create the snapshot config file
2548         let mut snapshot_config_file = OpenOptions::new()
2549             .read(true)
2550             .write(true)
2551             .create_new(true)
2552             .open(snapshot_config_path)
2553             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2554 
2555         // Serialize and write the snapshot config
2556         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2557             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2558 
2559         snapshot_config_file
2560             .write_all(vm_config.as_bytes())
2561             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2562 
2563         let mut snapshot_state_path = url_to_path(destination_url)?;
2564         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2565 
2566         // Create the snapshot state file
2567         let mut snapshot_state_file = OpenOptions::new()
2568             .read(true)
2569             .write(true)
2570             .create_new(true)
2571             .open(snapshot_state_path)
2572             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2573 
2574         // Serialize and write the snapshot state
2575         let vm_state =
2576             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2577 
2578         snapshot_state_file
2579             .write_all(&vm_state)
2580             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2581 
2582         // Tell the memory manager to also send/write its own snapshot.
2583         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2584             self.memory_manager
2585                 .lock()
2586                 .unwrap()
2587                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2588         } else {
2589             return Err(MigratableError::MigrateSend(anyhow!(
2590                 "Missing memory manager snapshot"
2591             )));
2592         }
2593 
2594         Ok(())
2595     }
2596 }
2597 
2598 impl Migratable for Vm {
2599     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2600         self.memory_manager.lock().unwrap().start_dirty_log()?;
2601         self.device_manager.lock().unwrap().start_dirty_log()
2602     }
2603 
2604     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2605         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2606         self.device_manager.lock().unwrap().stop_dirty_log()
2607     }
2608 
2609     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
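        // Merge the dirty ranges tracked for guest RAM with those reported by
        // the devices (e.g. VFIO or vhost-user backends).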
2610         Ok(MemoryRangeTable::new_from_tables(vec![
2611             self.memory_manager.lock().unwrap().dirty_log()?,
2612             self.device_manager.lock().unwrap().dirty_log()?,
2613         ]))
2614     }
2615 
2616     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2617         self.memory_manager.lock().unwrap().start_migration()?;
2618         self.device_manager.lock().unwrap().start_migration()
2619     }
2620 
2621     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2622         self.memory_manager.lock().unwrap().complete_migration()?;
2623         self.device_manager.lock().unwrap().complete_migration()
2624     }
2625 }
2626 
2627 #[cfg(feature = "guest_debug")]
2628 impl Debuggable for Vm {
2629     fn set_guest_debug(
2630         &self,
2631         cpu_id: usize,
2632         addrs: &[GuestAddress],
2633         singlestep: bool,
2634     ) -> std::result::Result<(), DebuggableError> {
2635         self.cpu_manager
2636             .lock()
2637             .unwrap()
2638             .set_guest_debug(cpu_id, addrs, singlestep)
2639     }
2640 
2641     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2642         if *self.state.read().unwrap() == VmState::Running {
2643             self.pause().map_err(DebuggableError::Pause)?;
2644         }
2645 
2646         let mut state = self
2647             .state
2648             .try_write()
2649             .map_err(|_| DebuggableError::PoisonedState)?;
2650         *state = VmState::BreakPoint;
2651         Ok(())
2652     }
2653 
2654     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2655         if *self.state.read().unwrap() == VmState::BreakPoint {
2656             self.resume().map_err(DebuggableError::Pause)?;
2657         }
2658 
2659         Ok(())
2660     }
2661 
2662     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2663         self.cpu_manager.lock().unwrap().read_regs(cpu_id)
2664     }
2665 
2666     fn write_regs(
2667         &self,
2668         cpu_id: usize,
2669         regs: &CoreRegs,
2670     ) -> std::result::Result<(), DebuggableError> {
2671         self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
2672     }
2673 
2674     fn read_mem(
2675         &self,
2676         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2677         cpu_id: usize,
2678         vaddr: GuestAddress,
2679         len: usize,
2680     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2681         self.cpu_manager
2682             .lock()
2683             .unwrap()
2684             .read_mem(guest_memory, cpu_id, vaddr, len)
2685     }
2686 
2687     fn write_mem(
2688         &self,
2689         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2690         cpu_id: usize,
2691         vaddr: &GuestAddress,
2692         data: &[u8],
2693     ) -> std::result::Result<(), DebuggableError> {
2694         self.cpu_manager
2695             .lock()
2696             .unwrap()
2697             .write_mem(guest_memory, cpu_id, vaddr, data)
2698     }
2699 
2700     fn active_vcpus(&self) -> usize {
2701         let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
2702         if active_vcpus > 0 {
2703             active_vcpus
2704         } else {
2705             // The VM is not booted yet. Report boot_vcpus() instead.
2706             self.cpu_manager.lock().unwrap().boot_vcpus() as usize
2707         }
2708     }
2709 }
2710 
2711 #[cfg(feature = "guest_debug")]
2712 pub const UINT16_MAX: u32 = 65535;
2713 
2714 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2715 impl Elf64Writable for Vm {}
2716 
2717 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2718 impl GuestDebuggable for Vm {
2719     fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
2720         event!("vm", "coredumping");
2721 
2722         #[cfg(feature = "tdx")]
2723         {
2724             if let Some(ref platform) = self.config.lock().unwrap().platform {
2725                 if platform.tdx {
2726                     return Err(GuestDebuggableError::Coredump(anyhow!(
2727                         "Coredump not possible with TDX VM"
2728                     )));
2729                 }
2730             }
2731         }
2732 
2733         let current_state = self.get_state().unwrap();
2734         if current_state != VmState::Paused {
2735             return Err(GuestDebuggableError::Coredump(anyhow!(
2736                 "Trying to coredump while VM is running"
2737             )));
2738         }
2739 
2740         let coredump_state = self.get_dump_state(destination_url)?;
2741 
2742         self.write_header(&coredump_state)?;
2743         self.write_note(&coredump_state)?;
2744         self.write_loads(&coredump_state)?;
2745 
2746         self.cpu_manager
2747             .lock()
2748             .unwrap()
2749             .cpu_write_elf64_note(&coredump_state)?;
2750         self.cpu_manager
2751             .lock()
2752             .unwrap()
2753             .cpu_write_vmm_note(&coredump_state)?;
2754 
2755         self.memory_manager
2756             .lock()
2757             .unwrap()
2758             .coredump_iterate_save_mem(&coredump_state)
2759     }
2760 }
2761 
2762 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2763 #[cfg(test)]
2764 mod tests {
2765     use super::*;
2766 
2767     fn test_vm_state_transitions(state: VmState) {
2768         match state {
2769             VmState::Created => {
2770                 // Check the transitions from Created
2771                 assert!(state.valid_transition(VmState::Created).is_err());
2772                 assert!(state.valid_transition(VmState::Running).is_ok());
2773                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2774                 assert!(state.valid_transition(VmState::Paused).is_ok());
2775                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2776             }
2777             VmState::Running => {
2778                 // Check the transitions from Running
2779                 assert!(state.valid_transition(VmState::Created).is_err());
2780                 assert!(state.valid_transition(VmState::Running).is_err());
2781                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2782                 assert!(state.valid_transition(VmState::Paused).is_ok());
2783                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2784             }
2785             VmState::Shutdown => {
2786                 // Check the transitions from Shutdown
2787                 assert!(state.valid_transition(VmState::Created).is_err());
2788                 assert!(state.valid_transition(VmState::Running).is_ok());
2789                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2790                 assert!(state.valid_transition(VmState::Paused).is_err());
2791                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2792             }
2793             VmState::Paused => {
2794                 // Check the transitions from Paused
2795                 assert!(state.valid_transition(VmState::Created).is_err());
2796                 assert!(state.valid_transition(VmState::Running).is_ok());
2797                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2798                 assert!(state.valid_transition(VmState::Paused).is_err());
2799                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2800             }
2801             VmState::BreakPoint => {
2802                 // Check the transitions from Breakpoint
2803                 assert!(state.valid_transition(VmState::Created).is_ok());
2804                 assert!(state.valid_transition(VmState::Running).is_ok());
2805                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2806                 assert!(state.valid_transition(VmState::Paused).is_err());
2807                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2808             }
2809         }
2810     }
2811 
2812     #[test]
2813     fn test_vm_created_transitions() {
2814         test_vm_state_transitions(VmState::Created);
2815     }
2816 
2817     #[test]
2818     fn test_vm_running_transitions() {
2819         test_vm_state_transitions(VmState::Running);
2820     }
2821 
2822     #[test]
2823     fn test_vm_shutdown_transitions() {
2824         test_vm_state_transitions(VmState::Shutdown);
2825     }
2826 
2827     #[test]
2828     fn test_vm_paused_transitions() {
2829         test_vm_state_transitions(VmState::Paused);
2830     }
2831 
2832     #[cfg(feature = "tdx")]
2833     #[test]
2834     fn test_hob_memory_resources() {
2835         // Case 1: Two TDVF sections in the middle of the RAM
2836         let sections = vec![
2837             TdvfSection {
2838                 address: 0xc000,
2839                 size: 0x1000,
2840                 ..Default::default()
2841             },
2842             TdvfSection {
2843                 address: 0x1000,
2844                 size: 0x4000,
2845                 ..Default::default()
2846             },
2847         ];
2848         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
2849         let expected = vec![
2850             (0, 0x1000, true),
2851             (0x1000, 0x4000, false),
2852             (0x5000, 0x7000, true),
2853             (0xc000, 0x1000, false),
2854             (0xd000, 0x0fff_3000, true),
2855         ];
2856         assert_eq!(
2857             expected,
2858             Vm::hob_memory_resources(
2859                 sections,
2860                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2861             )
2862         );
2863 
2864         // Case 2: Two TDVF sections with no conflict with the RAM
2865         let sections = vec![
2866             TdvfSection {
2867                 address: 0x1000_1000,
2868                 size: 0x1000,
2869                 ..Default::default()
2870             },
2871             TdvfSection {
2872                 address: 0,
2873                 size: 0x1000,
2874                 ..Default::default()
2875             },
2876         ];
2877         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2878         let expected = vec![
2879             (0, 0x1000, false),
2880             (0x1000, 0x1000_0000, true),
2881             (0x1000_1000, 0x1000, false),
2882         ];
2883         assert_eq!(
2884             expected,
2885             Vm::hob_memory_resources(
2886                 sections,
2887                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2888             )
2889         );
2890 
2891         // Case 3: Two TDVF sections with partial conflicts with the RAM
2892         let sections = vec![
2893             TdvfSection {
2894                 address: 0x1000_0000,
2895                 size: 0x2000,
2896                 ..Default::default()
2897             },
2898             TdvfSection {
2899                 address: 0,
2900                 size: 0x2000,
2901                 ..Default::default()
2902             },
2903         ];
2904         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2905         let expected = vec![
2906             (0, 0x2000, false),
2907             (0x2000, 0x0fff_e000, true),
2908             (0x1000_0000, 0x2000, false),
2909         ];
2910         assert_eq!(
2911             expected,
2912             Vm::hob_memory_resources(
2913                 sections,
2914                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2915             )
2916         );
2917 
2918         // Case 4: Two TDVF sections with no conflict before the RAM and two
2919         // more additional sections with no conflict after the RAM.
2920         let sections = vec![
2921             TdvfSection {
2922                 address: 0x2000_1000,
2923                 size: 0x1000,
2924                 ..Default::default()
2925             },
2926             TdvfSection {
2927                 address: 0x2000_0000,
2928                 size: 0x1000,
2929                 ..Default::default()
2930             },
2931             TdvfSection {
2932                 address: 0x1000,
2933                 size: 0x1000,
2934                 ..Default::default()
2935             },
2936             TdvfSection {
2937                 address: 0,
2938                 size: 0x1000,
2939                 ..Default::default()
2940             },
2941         ];
2942         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
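             // The sections are passed in descending address order but are
             // reported sorted; the unassigned hole [0x2000, 0x4000) between
             // the low sections and the RAM is simply omitted.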
2943         let expected = vec![
2944             (0, 0x1000, false),
2945             (0x1000, 0x1000, false),
2946             (0x4000, 0x1000_0000, true),
2947             (0x2000_0000, 0x1000, false),
2948             (0x2000_1000, 0x1000, false),
2949         ];
2950         assert_eq!(
2951             expected,
2952             Vm::hob_memory_resources(
2953                 sections,
2954                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2955             )
2956         );
2957 
2958         // Case 5: One TDVF section covering the entire RAM
2959         let sections = vec![TdvfSection {
2960             address: 0,
2961             size: 0x2000_0000,
2962             ..Default::default()
2963         }];
2964         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
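             // The section fully covers the RAM ([0x1000, 0x1000_1000)), so
             // a single non-RAM resource remains, with no `true` entry at all.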
2965         let expected = vec![(0, 0x2000_0000, false)];
2966         assert_eq!(
2967             expected,
2968             Vm::hob_memory_resources(
2969                 sections,
2970                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2971             )
2972         );
2973 
2974         // Case 6: Two TDVF sections that do not conflict with two RAM regions
2975         let sections = vec![
2976             TdvfSection {
2977                 address: 0x1000_2000,
2978                 size: 0x2000,
2979                 ..Default::default()
2980             },
2981             TdvfSection {
2982                 address: 0,
2983                 size: 0x2000,
2984                 ..Default::default()
2985             },
2986         ];
2987         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
2988             (GuestAddress(0x2000), 0x1000_0000),
2989             (GuestAddress(0x1000_4000), 0x1000_0000),
2990         ];
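             // One section ([0, 0x2000)) abuts the start of the first RAM
             // region; the other exactly fills the gap
             // [0x1000_2000, 0x1000_4000) between the two regions.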
2991         let expected = vec![
2992             (0, 0x2000, false),
2993             (0x2000, 0x1000_0000, true),
2994             (0x1000_2000, 0x2000, false),
2995             (0x1000_4000, 0x1000_0000, true),
2996         ];
2997         assert_eq!(
2998             expected,
2999             Vm::hob_memory_resources(
3000                 sections,
3001                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3002             )
3003         );
3004 
3005         // Case 7: Two TDVF sections partially overlapping two RAM regions
3006         let sections = vec![
3007             TdvfSection {
3008                 address: 0x1000_0000,
3009                 size: 0x4000,
3010                 ..Default::default()
3011             },
3012             TdvfSection {
3013                 address: 0,
3014                 size: 0x4000,
3015                 ..Default::default()
3016             },
3017         ];
3018         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3019             (GuestAddress(0x1000), 0x1000_0000),
3020             (GuestAddress(0x1000_3000), 0x1000_0000),
3021         ];
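             // The low section clips the head of the first RAM region, while
             // the section at 0x1000_0000 clips its tail plus the head of the
             // second region, leaving 0x0fff_c000 and 0x0fff_f000 bytes of RAM.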
3022         let expected = vec![
3023             (0, 0x4000, false),
3024             (0x4000, 0x0fff_c000, true),
3025             (0x1000_0000, 0x4000, false),
3026             (0x1000_4000, 0x0fff_f000, true),
3027         ];
3028         assert_eq!(
3029             expected,
3030             Vm::hob_memory_resources(
3031                 sections,
3032                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3033             )
3034         );
3035     }
3036 }
3037 
3038 #[cfg(target_arch = "aarch64")]
3039 #[cfg(test)]
3040 mod tests {
3041     use super::*;
3042     use crate::GuestMemoryMmap;
3043     use arch::aarch64::fdt::create_fdt;
3044     use arch::aarch64::layout;
3045     use arch::{DeviceType, MmioDeviceInfo};
3046     use devices::gic::Gic;
3047 
3048     const LEN: u64 = 4096;
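         // One 4 KiB page, used as both the size and the stride of the fake
         // MMIO regions below.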
3049 
3050     #[test]
3051     fn test_create_fdt_with_devices() {
3052         let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
3053         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
3054 
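             // Three devices at consecutive 4 KiB MMIO slots with IRQs 33-35:
             // a serial console, a virtio transport and an RTC.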
3055         let dev_info: HashMap<(DeviceType, String), MmioDeviceInfo> = [
3056             (
3057                 (DeviceType::Serial, DeviceType::Serial.to_string()),
3058                 MmioDeviceInfo {
3059                     addr: 0x00,
3060                     len: LEN,
3061                     irq: 33,
3062                 },
3063             ),
3064             (
3065                 (DeviceType::Virtio(1), "virtio".to_string()),
3066                 MmioDeviceInfo {
3067                     addr: LEN,
3068                     len: LEN,
3069                     irq: 34,
3070                 },
3071             ),
3072             (
3073                 (DeviceType::Rtc, "rtc".to_string()),
3074                 MmioDeviceInfo {
3075                     addr: 2 * LEN,
3076                     len: LEN,
3077                     irq: 35,
3078                 },
3079             ),
3080         ]
3081         .iter()
3082         .cloned()
3083         .collect();
3084 
3085         let hv = hypervisor::new().unwrap();
3086         let vm = hv.create_vm().unwrap();
3087         let gic = vm
3088             .create_vgic(Gic::create_default_config(1))
3089             .expect("Cannot create gic");
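             // The FDT has to describe an interrupt controller, so build a
             // vGIC first (default configuration, presumably for one vCPU).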
3090         assert!(create_fdt(
3091             &mem,
3092             "console=tty0",
3093             vec![0],
3094             Some((0, 0, 0)),
3095             &dev_info,
3096             &gic,
3097             &None,
3098             &Vec::new(),
3099             &BTreeMap::new(),
3100             None,
3101             true,
3102         )
3103         .is_ok())
3104     }
3105 }
3106 
3107 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
3108 #[test]
3109 pub fn test_vm() {
3110     use hypervisor::VmExit;
3111     use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
3112     // This example is based on https://lwn.net/Articles/658511/
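         // With AL=2 and BL=3 set below, the guest adds them, converts the
         // sum to ASCII ('0' + 5), writes "5\n" to the serial port at 0x3f8
         // one byte at a time, and halts.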
3113     let code = [
3114         0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
3115         0x00, 0xd8, /* add %bl, %al */
3116         0x04, b'0', /* add $'0', %al */
3117         0xee, /* out %al, (%dx) */
3118         0xb0, b'\n', /* mov $'\n', %al */
3119         0xee,  /* out %al, (%dx) */
3120         0xf4,  /* hlt */
3121     ];
3122 
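         // A single 4 KiB page of guest RAM at GPA 0x1000; the code is
         // loaded at its base, where RIP will point.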
3123     let mem_size = 0x1000;
3124     let load_addr = GuestAddress(0x1000);
3125     let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();
3126 
3127     let hv = hypervisor::new().unwrap();
3128     let vm = hv.create_vm().expect("new VM creation failed");
3129 
3130     for (index, region) in mem.iter().enumerate() {
3131         let mem_region = vm.make_user_memory_region(
3132             index as u32,
3133             region.start_addr().raw_value(),
3134             region.len(),
3135             region.as_ptr() as u64,
3136             false,
3137             false,
3138         );
3139 
3140         vm.create_user_memory_region(mem_region)
3141             .expect("Cannot configure guest memory");
3142     }
3143     mem.write_slice(&code, load_addr)
3144         .expect("Writing code to memory failed");
3145 
3146     let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");
3147 
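         // The vCPU starts in real mode; zeroing the CS base and selector
         // gives a flat mapping so that RIP = 0x1000 lands on the code.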
3148     let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
3149     vcpu_sregs.cs.base = 0;
3150     vcpu_sregs.cs.selector = 0;
3151     vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");
3152 
3153     let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
3154     vcpu_regs.rip = 0x1000;
3155     vcpu_regs.rax = 2;
3156     vcpu_regs.rbx = 3;
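         // Bit 1 of RFLAGS is reserved and must always be set.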
3157     vcpu_regs.rflags = 2;
3158     vcpu.set_regs(&vcpu_regs).expect("set regs failed");
3159 
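         // Run until the guest halts: each `out` triggers an IoOut exit,
         // and the final `hlt` surfaces as VmExit::Reset with this
         // hypervisor crate.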
3160     loop {
3161         match vcpu.run().expect("run failed") {
3162             VmExit::IoOut(addr, data) => {
3163                 println!(
3164                     "IO out -- addr: {:#x} data [{:?}]",
3165                     addr,
3166                     str::from_utf8(data).unwrap()
3167                 );
3168             }
3169             VmExit::Reset => {
3170                 println!("HLT");
3171                 break;
3172             }
3173             r => panic!("unexpected exit reason: {r:?}"),
3174         }
3175     }
3176 }
3177