xref: /cloud-hypervisor/vmm/src/vm.rs (revision 72452707eeb4dc17439d609e194b8de108ad623b)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use std::collections::{BTreeMap, HashMap};
15 use std::fs::{File, OpenOptions};
16 use std::io::{self, Seek, SeekFrom, Write};
17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
18 use std::mem::size_of;
19 use std::num::Wrapping;
20 use std::ops::Deref;
21 use std::os::unix::net::UnixStream;
22 use std::sync::{Arc, Mutex, RwLock};
23 use std::time::Instant;
24 use std::{cmp, result, str, thread};
25 
26 use anyhow::anyhow;
27 #[cfg(target_arch = "x86_64")]
28 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
29 #[cfg(feature = "tdx")]
30 use arch::x86_64::tdx::TdvfSection;
31 #[cfg(target_arch = "aarch64")]
32 use arch::PciSpaceInfo;
33 use arch::{get_host_cpu_phys_bits, EntryPoint, NumaNode, NumaNodes};
34 #[cfg(target_arch = "aarch64")]
35 use devices::interrupt_controller;
36 use devices::AcpiNotificationFlags;
37 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
38 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
39 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
40 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
41 use hypervisor::{HypervisorVmError, VmOps};
42 use libc::{termios, SIGWINCH};
43 use linux_loader::cmdline::Cmdline;
44 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
45 use linux_loader::elf;
46 #[cfg(target_arch = "x86_64")]
47 use linux_loader::loader::bzimage::BzImage;
48 #[cfg(target_arch = "x86_64")]
49 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
50 #[cfg(target_arch = "aarch64")]
51 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
52 use linux_loader::loader::KernelLoader;
53 use seccompiler::SeccompAction;
54 use serde::{Deserialize, Serialize};
55 use thiserror::Error;
56 use tracer::trace_scoped;
57 use vm_device::Bus;
58 #[cfg(feature = "tdx")]
59 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
60 use vm_memory::{
61     Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
62 };
63 use vm_migration::protocol::{MemoryRangeTable, Request, Response};
64 use vm_migration::{
65     snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable,
66 };
67 use vmm_sys_util::eventfd::EventFd;
68 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
69 
70 use crate::config::{add_to_config, ValidationError};
71 use crate::console_devices::{ConsoleDeviceError, ConsoleInfo};
72 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
73 use crate::coredump::{
74     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
75 };
76 use crate::device_manager::{DeviceManager, DeviceManagerError};
77 use crate::device_tree::DeviceTree;
78 #[cfg(feature = "guest_debug")]
79 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
80 #[cfg(feature = "igvm")]
81 use crate::igvm::igvm_loader;
82 use crate::landlock::LandlockError;
83 use crate::memory_manager::{
84     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
85 };
86 #[cfg(target_arch = "x86_64")]
87 use crate::migration::get_vm_snapshot;
88 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
89 use crate::migration::url_to_file;
90 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
91 use crate::vm_config::{
92     DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, NumaConfig, PayloadConfig,
93     PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig,
94 };
95 use crate::{
96     cpu, GuestMemoryMmap, PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID,
97     MEMORY_MANAGER_SNAPSHOT_ID,
98 };
99 
100 /// Errors associated with VM management
101 #[derive(Debug, Error)]
102 pub enum Error {
103     #[error("Cannot open kernel file: {0}")]
104     KernelFile(#[source] io::Error),
105 
106     #[error("Cannot open initramfs file: {0}")]
107     InitramfsFile(#[source] io::Error),
108 
109     #[error("Cannot load the kernel into memory: {0}")]
110     KernelLoad(#[source] linux_loader::loader::Error),
111 
112     #[cfg(target_arch = "aarch64")]
113     #[error("Cannot load the UEFI binary in memory: {0:?}")]
114     UefiLoad(arch::aarch64::uefi::Error),
115 
116     #[error("Cannot load the initramfs into memory")]
117     InitramfsLoad,
118 
119     #[error("Cannot load the kernel command line in memory: {0}")]
120     LoadCmdLine(#[source] linux_loader::loader::Error),
121 
122     #[error("Failed to apply landlock config during vm_create: {0}")]
123     ApplyLandlock(#[source] LandlockError),
124 
125     #[error("Cannot modify the kernel command line: {0}")]
126     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
127 
128     #[error("Cannot create the kernel command line: {0}")]
129     CmdLineCreate(#[source] linux_loader::cmdline::Error),
130 
131     #[error("Cannot configure system: {0}")]
132     ConfigureSystem(#[source] arch::Error),
133 
134     #[cfg(target_arch = "aarch64")]
135     #[error("Cannot enable interrupt controller: {0:?}")]
136     EnableInterruptController(interrupt_controller::Error),
137 
138     #[error("VM state is poisoned")]
139     PoisonedState,
140 
141     #[error("Error from device manager: {0:?}")]
142     DeviceManager(DeviceManagerError),
143 
144     #[error("No device with id {0:?} to remove")]
145     NoDeviceToRemove(String),
146 
147     #[error("Cannot spawn a signal handler thread: {0}")]
148     SignalHandlerSpawn(#[source] io::Error),
149 
150     #[error("Failed to join on threads: {0:?}")]
151     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
152 
153     #[error("VM config is missing")]
154     VmMissingConfig,
155 
156     #[error("VM is not created")]
157     VmNotCreated,
158 
159     #[error("VM is already created")]
160     VmAlreadyCreated,
161 
162     #[error("VM is not running")]
163     VmNotRunning,
164 
165     #[error("Cannot clone EventFd: {0}")]
166     EventFdClone(#[source] io::Error),
167 
168     #[error("invalid VM state transition: {0:?} to {1:?}")]
169     InvalidStateTransition(VmState, VmState),
170 
171     #[error("Error from CPU manager: {0}")]
172     CpuManager(#[source] cpu::Error),
173 
174     #[error("Cannot pause devices: {0}")]
175     PauseDevices(#[source] MigratableError),
176 
177     #[error("Cannot resume devices: {0}")]
178     ResumeDevices(#[source] MigratableError),
179 
180     #[error("Cannot pause CPUs: {0}")]
181     PauseCpus(#[source] MigratableError),
182 
183     #[error("Cannot resume cpus: {0}")]
184     ResumeCpus(#[source] MigratableError),
185 
186     #[error("Cannot pause VM: {0}")]
187     Pause(#[source] MigratableError),
188 
189     #[error("Cannot resume VM: {0}")]
190     Resume(#[source] MigratableError),
191 
192     #[error("Memory manager error: {0:?}")]
193     MemoryManager(MemoryManagerError),
194 
195     #[error("Eventfd write error: {0}")]
196     EventfdError(#[source] std::io::Error),
197 
198     #[error("Cannot snapshot VM: {0}")]
199     Snapshot(#[source] MigratableError),
200 
201     #[error("Cannot restore VM: {0}")]
202     Restore(#[source] MigratableError),
203 
204     #[error("Cannot send VM snapshot: {0}")]
205     SnapshotSend(#[source] MigratableError),
206 
207     #[error("Invalid restore source URL")]
208     InvalidRestoreSourceUrl,
209 
210     #[error("Failed to validate config: {0}")]
211     ConfigValidation(#[source] ValidationError),
212 
213     #[error("Too many virtio-vsock devices")]
214     TooManyVsockDevices,
215 
216     #[error("Failed serializing into JSON: {0}")]
217     SerializeJson(#[source] serde_json::Error),
218 
219     #[error("Invalid NUMA configuration")]
220     InvalidNumaConfig,
221 
222     #[error("Cannot create seccomp filter: {0}")]
223     CreateSeccompFilter(#[source] seccompiler::Error),
224 
225     #[error("Cannot apply seccomp filter: {0}")]
226     ApplySeccompFilter(#[source] seccompiler::Error),
227 
228     #[error("Failed resizing a memory zone")]
229     ResizeZone,
230 
231     #[error("Cannot activate virtio devices: {0:?}")]
232     ActivateVirtioDevices(DeviceManagerError),
233 
234     #[error("Error triggering power button: {0:?}")]
235     PowerButton(DeviceManagerError),
236 
237     #[error("Kernel lacks PVH header")]
238     KernelMissingPvhHeader,
239 
240     #[error("Failed to allocate firmware RAM: {0:?}")]
241     AllocateFirmwareMemory(MemoryManagerError),
242 
243     #[error("Error manipulating firmware file: {0}")]
244     FirmwareFile(#[source] std::io::Error),
245 
246     #[error("Firmware too big")]
247     FirmwareTooLarge,
248 
249     #[error("Failed to copy firmware to memory: {0}")]
250     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
251 
252     #[cfg(feature = "sev_snp")]
253     #[error("Error enabling SEV-SNP VM: {0}")]
254     InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),
255 
256     #[cfg(feature = "tdx")]
257     #[error("Error performing I/O on TDX firmware file: {0}")]
258     LoadTdvf(#[source] std::io::Error),
259 
260     #[cfg(feature = "tdx")]
261     #[error("Error performing I/O on the TDX payload file: {0}")]
262     LoadPayload(#[source] std::io::Error),
263 
264     #[cfg(feature = "tdx")]
265     #[error("Error parsing TDVF: {0}")]
266     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
267 
268     #[cfg(feature = "tdx")]
269     #[error("Error populating TDX HOB: {0}")]
270     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
271 
272     #[cfg(feature = "tdx")]
273     #[error("Error allocating TDVF memory: {0:?}")]
274     AllocatingTdvfMemory(crate::memory_manager::Error),
275 
276     #[cfg(feature = "tdx")]
277     #[error("Error enabling TDX VM: {0}")]
278     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
279 
280     #[cfg(feature = "tdx")]
281     #[error("Error enabling TDX memory region: {0}")]
282     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
283 
284     #[cfg(feature = "tdx")]
285     #[error("Error finalizing TDX VM: {0}")]
286     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
287 
288     #[cfg(feature = "tdx")]
289     #[error("TDX firmware missing")]
290     TdxFirmwareMissing,
291 
292     #[cfg(feature = "tdx")]
293     #[error("Invalid TDX payload type")]
294     InvalidPayloadType,
295 
296     #[cfg(feature = "guest_debug")]
297     #[error("Error debugging VM: {0:?}")]
298     Debug(DebuggableError),
299 
300     #[error("Error spawning kernel loading thread")]
301     KernelLoadThreadSpawn(std::io::Error),
302 
303     #[error("Error joining kernel loading thread")]
304     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
305 
306     #[error("Payload configuration is not bootable")]
307     InvalidPayload,
308 
309     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
310     #[error("Error coredumping VM: {0:?}")]
311     Coredump(GuestDebuggableError),
312 
313     #[cfg(feature = "igvm")]
314     #[error("Cannot open igvm file: {0}")]
315     IgvmFile(#[source] io::Error),
316 
317     #[cfg(feature = "igvm")]
318     #[error("Cannot load the igvm into memory: {0}")]
319     IgvmLoad(#[source] igvm_loader::Error),
320 
321     #[error("Error injecting NMI")]
322     ErrorNmi,
323 
324     #[error("Error resuming the VM: {0}")]
325     ResumeVm(#[source] hypervisor::HypervisorVmError),
326 
327     #[error("Error creating console devices")]
328     CreateConsoleDevices(ConsoleDeviceError),
329 }
/// Result alias used by the VM management code, with [`Error`] as the error type.
pub type Result<T> = result::Result<T, Error>;
331 
/// Lifecycle state of a VM.
///
/// Which moves between states are legal is decided by
/// `VmState::valid_transition`.
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    /// Initial state of a `Vm` built without a snapshot.
    Created,
    /// NOTE(review): presumably the guest vCPUs are executing — confirm.
    Running,
    /// The only state reachable from here is `Running`.
    Shutdown,
    /// Also the initial state of a `Vm` built from a snapshot.
    Paused,
    /// NOTE(review): presumably the guest is halted by the debugger — confirm.
    BreakPoint,
}
340 
341 impl VmState {
342     fn valid_transition(self, new_state: VmState) -> Result<()> {
343         match self {
344             VmState::Created => match new_state {
345                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
346                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
347                     Ok(())
348                 }
349             },
350 
351             VmState::Running => match new_state {
352                 VmState::Created | VmState::Running => {
353                     Err(Error::InvalidStateTransition(self, new_state))
354                 }
355                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
356             },
357 
358             VmState::Shutdown => match new_state {
359                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
360                     Err(Error::InvalidStateTransition(self, new_state))
361                 }
362                 VmState::Running => Ok(()),
363             },
364 
365             VmState::Paused => match new_state {
366                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
367                     Err(Error::InvalidStateTransition(self, new_state))
368                 }
369                 VmState::Running | VmState::Shutdown => Ok(()),
370             },
371             VmState::BreakPoint => match new_state {
372                 VmState::Created | VmState::Running => Ok(()),
373                 _ => Err(Error::InvalidStateTransition(self, new_state)),
374             },
375         }
376     }
377 }
378 
/// State backing the `VmOps` callbacks: guest memory plus the buses that
/// guest MMIO (and, on x86_64, port I/O) accesses are dispatched to.
struct VmOpsHandler {
    // Guest memory used by the `guest_mem_read` / `guest_mem_write` callbacks.
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    // Port I/O bus used by `pio_read` / `pio_write`; x86_64 only.
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    // MMIO bus used by `mmio_read` / `mmio_write`.
    mmio_bus: Arc<Bus>,
}
385 
386 impl VmOps for VmOpsHandler {
387     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
388         self.memory
389             .memory()
390             .write(buf, GuestAddress(gpa))
391             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
392     }
393 
394     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
395         self.memory
396             .memory()
397             .read(buf, GuestAddress(gpa))
398             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
399     }
400 
401     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
402         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
403             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
404         }
405         Ok(())
406     }
407 
408     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
409         match self.mmio_bus.write(gpa, data) {
410             Err(vm_device::BusError::MissingAddressRange) => {
411                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
412             }
413             Ok(Some(barrier)) => {
414                 info!("Waiting for barrier");
415                 barrier.wait();
416                 info!("Barrier released");
417             }
418             _ => {}
419         };
420         Ok(())
421     }
422 
423     #[cfg(target_arch = "x86_64")]
424     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
425         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
426             info!("Guest PIO read to unregistered address 0x{:x}", port);
427         }
428         Ok(())
429     }
430 
431     #[cfg(target_arch = "x86_64")]
432     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
433         match self.io_bus.write(port, data) {
434             Err(vm_device::BusError::MissingAddressRange) => {
435                 info!("Guest PIO write to unregistered address 0x{:x}", port);
436             }
437             Ok(Some(barrier)) => {
438                 info!("Waiting for barrier");
439                 barrier.wait();
440                 info!("Barrier released");
441             }
442             _ => {}
443         };
444         Ok(())
445     }
446 }
447 
448 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
449     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
450 
451     cmp::min(host_phys_bits, max_phys_bits)
452 }
453 
/// A virtual machine together with the managers it is built from.
pub struct Vm {
    // Kernel file opened from the payload configuration, if any (`tdx` builds only).
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    // Initramfs file opened from the payload configuration, if any.
    initramfs: Option<File>,
    // Join handles of threads owned by this VM.
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    // Current lifecycle state; starts as `Paused` when built from a snapshot,
    // `Created` otherwise.
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    // Clock data taken from the VM snapshot when restoring, `None` otherwise.
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
    // NUMA topology derived from the `numa` section of the VM configuration.
    numa_nodes: NumaNodes,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    // Mirrors the `gdb` config flag on `guest_debug` builds; always `false` otherwise.
    stop_on_boot: bool,
    // Handle of the payload-loading thread; `None` when built from a snapshot.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}
475 
476 impl Vm {
    /// Signals associated with the VM; currently only `SIGWINCH`.
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
478 
479     #[allow(clippy::too_many_arguments)]
480     pub fn new_from_memory_manager(
481         config: Arc<Mutex<VmConfig>>,
482         memory_manager: Arc<Mutex<MemoryManager>>,
483         vm: Arc<dyn hypervisor::Vm>,
484         exit_evt: EventFd,
485         reset_evt: EventFd,
486         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
487         seccomp_action: &SeccompAction,
488         hypervisor: Arc<dyn hypervisor::Hypervisor>,
489         activate_evt: EventFd,
490         timestamp: Instant,
491         console_info: Option<ConsoleInfo>,
492         console_resize_pipe: Option<Arc<File>>,
493         original_termios: Arc<Mutex<Option<termios>>>,
494         snapshot: Option<Snapshot>,
495     ) -> Result<Self> {
496         trace_scoped!("Vm::new_from_memory_manager");
497 
498         let boot_id_list = config
499             .lock()
500             .unwrap()
501             .validate()
502             .map_err(Error::ConfigValidation)?;
503 
504         #[cfg(not(feature = "igvm"))]
505         let load_payload_handle = if snapshot.is_none() {
506             Self::load_payload_async(&memory_manager, &config)?
507         } else {
508             None
509         };
510 
511         info!("Booting VM from config: {:?}", &config);
512 
513         // Create NUMA nodes based on NumaConfig.
514         let numa_nodes =
515             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
516 
517         #[cfg(feature = "tdx")]
518         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
519         #[cfg(feature = "sev_snp")]
520         let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
521         #[cfg(feature = "tdx")]
522         let force_iommu = tdx_enabled;
523         #[cfg(feature = "sev_snp")]
524         let force_iommu = sev_snp_enabled;
525         #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
526         let force_iommu = false;
527 
528         #[cfg(feature = "guest_debug")]
529         let stop_on_boot = config.lock().unwrap().gdb;
530         #[cfg(not(feature = "guest_debug"))]
531         let stop_on_boot = false;
532 
533         let memory = memory_manager.lock().unwrap().guest_memory();
534         let io_bus = Arc::new(Bus::new());
535         let mmio_bus = Arc::new(Bus::new());
536 
537         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
538             memory,
539             #[cfg(target_arch = "x86_64")]
540             io_bus: io_bus.clone(),
541             mmio_bus: mmio_bus.clone(),
542         });
543 
544         let cpus_config = { &config.lock().unwrap().cpus.clone() };
545         let cpu_manager = cpu::CpuManager::new(
546             cpus_config,
547             vm.clone(),
548             exit_evt.try_clone().map_err(Error::EventFdClone)?,
549             reset_evt.try_clone().map_err(Error::EventFdClone)?,
550             #[cfg(feature = "guest_debug")]
551             vm_debug_evt,
552             &hypervisor,
553             seccomp_action.clone(),
554             vm_ops,
555             #[cfg(feature = "tdx")]
556             tdx_enabled,
557             &numa_nodes,
558             #[cfg(feature = "sev_snp")]
559             sev_snp_enabled,
560         )
561         .map_err(Error::CpuManager)?;
562 
563         #[cfg(target_arch = "x86_64")]
564         cpu_manager
565             .lock()
566             .unwrap()
567             .populate_cpuid(
568                 &memory_manager,
569                 &hypervisor,
570                 #[cfg(feature = "tdx")]
571                 tdx_enabled,
572             )
573             .map_err(Error::CpuManager)?;
574 
575         // Loading the igvm file is pushed down here because
576         // igvm parser needs cpu_manager to retrieve cpuid leaf.
577         // For the regular case, we can start loading early, but for
578         // igvm case we have to wait until cpu_manager is created.
579         // Currently, Microsoft Hypervisor does not provide any
580         // Hypervisor specific common cpuid, we need to call get_cpuid_values
581         // per cpuid through cpu_manager.
582         #[cfg(feature = "igvm")]
583         let load_payload_handle = if snapshot.is_none() {
584             Self::load_payload_async(
585                 &memory_manager,
586                 &config,
587                 &cpu_manager,
588                 #[cfg(feature = "sev_snp")]
589                 sev_snp_enabled,
590             )?
591         } else {
592             None
593         };
594         // The initial TDX configuration must be done before the vCPUs are
595         // created
596         #[cfg(feature = "tdx")]
597         if tdx_enabled {
598             let cpuid = cpu_manager.lock().unwrap().common_cpuid();
599             let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
600             vm.tdx_init(&cpuid, max_vcpus)
601                 .map_err(Error::InitializeTdxVm)?;
602         }
603 
604         cpu_manager
605             .lock()
606             .unwrap()
607             .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
608             .map_err(Error::CpuManager)?;
609 
610         // This initial SEV-SNP configuration must be done immediately after
611         // vCPUs are created. As part of this initialization we are
612         // transitioning the guest into secure state.
613         #[cfg(feature = "sev_snp")]
614         if sev_snp_enabled {
615             vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
616         }
617 
618         #[cfg(feature = "tdx")]
619         let dynamic = !tdx_enabled;
620         #[cfg(not(feature = "tdx"))]
621         let dynamic = true;
622 
623         let device_manager = DeviceManager::new(
624             io_bus,
625             mmio_bus,
626             vm.clone(),
627             config.clone(),
628             memory_manager.clone(),
629             cpu_manager.clone(),
630             exit_evt.try_clone().map_err(Error::EventFdClone)?,
631             reset_evt,
632             seccomp_action.clone(),
633             numa_nodes.clone(),
634             &activate_evt,
635             force_iommu,
636             boot_id_list,
637             timestamp,
638             snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
639             dynamic,
640         )
641         .map_err(Error::DeviceManager)?;
642 
643         device_manager
644             .lock()
645             .unwrap()
646             .create_devices(console_info, console_resize_pipe, original_termios)
647             .map_err(Error::DeviceManager)?;
648 
649         #[cfg(feature = "tdx")]
650         let kernel = config
651             .lock()
652             .unwrap()
653             .payload
654             .as_ref()
655             .map(|p| p.kernel.as_ref().map(File::open))
656             .unwrap_or_default()
657             .transpose()
658             .map_err(Error::KernelFile)?;
659 
660         let initramfs = config
661             .lock()
662             .unwrap()
663             .payload
664             .as_ref()
665             .map(|p| p.initramfs.as_ref().map(File::open))
666             .unwrap_or_default()
667             .transpose()
668             .map_err(Error::InitramfsFile)?;
669 
670         #[cfg(target_arch = "x86_64")]
671         let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
672             let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
673             vm_snapshot.clock
674         } else {
675             None
676         };
677 
678         let vm_state = if snapshot.is_some() {
679             VmState::Paused
680         } else {
681             VmState::Created
682         };
683 
684         Ok(Vm {
685             #[cfg(feature = "tdx")]
686             kernel,
687             initramfs,
688             device_manager,
689             config,
690             threads: Vec::with_capacity(1),
691             state: RwLock::new(vm_state),
692             cpu_manager,
693             memory_manager,
694             vm,
695             #[cfg(target_arch = "x86_64")]
696             saved_clock,
697             numa_nodes,
698             hypervisor,
699             stop_on_boot,
700             load_payload_handle,
701         })
702     }
703 
704     fn create_numa_nodes(
705         configs: Option<Vec<NumaConfig>>,
706         memory_manager: &Arc<Mutex<MemoryManager>>,
707     ) -> Result<NumaNodes> {
708         let mm = memory_manager.lock().unwrap();
709         let mm_zones = mm.memory_zones();
710         let mut numa_nodes = BTreeMap::new();
711 
712         if let Some(configs) = &configs {
713             for config in configs.iter() {
714                 if numa_nodes.contains_key(&config.guest_numa_id) {
715                     error!("Can't define twice the same NUMA node");
716                     return Err(Error::InvalidNumaConfig);
717                 }
718 
719                 let mut node = NumaNode::default();
720 
721                 if let Some(memory_zones) = &config.memory_zones {
722                     for memory_zone in memory_zones.iter() {
723                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
724                             node.memory_regions.extend(mm_zone.regions().clone());
725                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
726                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
727                             }
728                             node.memory_zones.push(memory_zone.clone());
729                         } else {
730                             error!("Unknown memory zone '{}'", memory_zone);
731                             return Err(Error::InvalidNumaConfig);
732                         }
733                     }
734                 }
735 
736                 if let Some(cpus) = &config.cpus {
737                     node.cpus.extend(cpus);
738                 }
739 
740                 if let Some(pci_segments) = &config.pci_segments {
741                     node.pci_segments.extend(pci_segments);
742                 }
743 
744                 if let Some(distances) = &config.distances {
745                     for distance in distances.iter() {
746                         let dest = distance.destination;
747                         let dist = distance.distance;
748 
749                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
750                             error!("Unknown destination NUMA node {}", dest);
751                             return Err(Error::InvalidNumaConfig);
752                         }
753 
754                         if node.distances.contains_key(&dest) {
755                             error!("Destination NUMA node {} has been already set", dest);
756                             return Err(Error::InvalidNumaConfig);
757                         }
758 
759                         node.distances.insert(dest, dist);
760                     }
761                 }
762 
763                 #[cfg(target_arch = "x86_64")]
764                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
765                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
766                         let mm_sections = sgx_epc_region.epc_sections();
767                         for sgx_epc_section in sgx_epc_sections.iter() {
768                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
769                                 node.sgx_epc_sections.push(mm_section.clone());
770                             } else {
771                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
772                                 return Err(Error::InvalidNumaConfig);
773                             }
774                         }
775                     } else {
776                         error!("Missing SGX EPC region");
777                         return Err(Error::InvalidNumaConfig);
778                     }
779                 }
780 
781                 numa_nodes.insert(config.guest_numa_id, node);
782             }
783         }
784 
785         Ok(numa_nodes)
786     }
787 
788     #[allow(clippy::too_many_arguments)]
789     pub fn new(
790         vm_config: Arc<Mutex<VmConfig>>,
791         exit_evt: EventFd,
792         reset_evt: EventFd,
793         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
794         seccomp_action: &SeccompAction,
795         hypervisor: Arc<dyn hypervisor::Hypervisor>,
796         activate_evt: EventFd,
797         console_info: Option<ConsoleInfo>,
798         console_resize_pipe: Option<Arc<File>>,
799         original_termios: Arc<Mutex<Option<termios>>>,
800         snapshot: Option<Snapshot>,
801         source_url: Option<&str>,
802         prefault: Option<bool>,
803     ) -> Result<Self> {
804         trace_scoped!("Vm::new");
805 
806         let timestamp = Instant::now();
807 
808         #[cfg(feature = "tdx")]
809         let tdx_enabled = if snapshot.is_some() {
810             false
811         } else {
812             vm_config.lock().unwrap().is_tdx_enabled()
813         };
814 
815         #[cfg(feature = "sev_snp")]
816         let sev_snp_enabled = if snapshot.is_some() {
817             false
818         } else {
819             vm_config.lock().unwrap().is_sev_snp_enabled()
820         };
821 
822         let vm = Self::create_hypervisor_vm(
823             &hypervisor,
824             #[cfg(feature = "tdx")]
825             tdx_enabled,
826             #[cfg(feature = "sev_snp")]
827             sev_snp_enabled,
828             #[cfg(feature = "sev_snp")]
829             vm_config.lock().unwrap().memory.total_size(),
830         )?;
831 
832         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
833 
834         let memory_manager = if let Some(snapshot) =
835             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
836         {
837             MemoryManager::new_from_snapshot(
838                 &snapshot,
839                 vm.clone(),
840                 &vm_config.lock().unwrap().memory.clone(),
841                 source_url,
842                 prefault.unwrap(),
843                 phys_bits,
844             )
845             .map_err(Error::MemoryManager)?
846         } else {
847             #[cfg(target_arch = "x86_64")]
848             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
849 
850             MemoryManager::new(
851                 vm.clone(),
852                 &vm_config.lock().unwrap().memory.clone(),
853                 None,
854                 phys_bits,
855                 #[cfg(feature = "tdx")]
856                 tdx_enabled,
857                 None,
858                 None,
859                 #[cfg(target_arch = "x86_64")]
860                 sgx_epc_config,
861             )
862             .map_err(Error::MemoryManager)?
863         };
864 
865         Vm::new_from_memory_manager(
866             vm_config,
867             memory_manager,
868             vm,
869             exit_evt,
870             reset_evt,
871             #[cfg(feature = "guest_debug")]
872             vm_debug_evt,
873             seccomp_action,
874             hypervisor,
875             activate_evt,
876             timestamp,
877             console_info,
878             console_resize_pipe,
879             original_termios,
880             snapshot,
881         )
882     }
883 
884     pub fn create_hypervisor_vm(
885         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
886         #[cfg(feature = "tdx")] tdx_enabled: bool,
887         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
888         #[cfg(feature = "sev_snp")] mem_size: u64,
889     ) -> Result<Arc<dyn hypervisor::Vm>> {
890         hypervisor.check_required_extensions().unwrap();
891 
892         cfg_if::cfg_if! {
893             if #[cfg(feature = "tdx")] {
894                 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true
895                 // Otherwise KVM_X86_LEGACY_VM: 0
896                 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM
897                 let vm = hypervisor
898                     .create_vm_with_type(u64::from(tdx_enabled))
899                     .unwrap();
900             } else if #[cfg(feature = "sev_snp")] {
901                 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true
902                 // Otherwise SEV_SNP_DISABLED: 0
903                 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false
904                 let vm = hypervisor
905                     .create_vm_with_type_and_memory(u64::from(sev_snp_enabled), mem_size)
906                     .unwrap();
907             } else {
908                 let vm = hypervisor.create_vm().unwrap();
909             }
910         }
911 
912         #[cfg(target_arch = "x86_64")]
913         {
914             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
915                 .unwrap();
916             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
917             vm.enable_split_irq().unwrap();
918         }
919 
920         Ok(vm)
921     }
922 
923     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
924         let initramfs = self.initramfs.as_mut().unwrap();
925         let size: usize = initramfs
926             .seek(SeekFrom::End(0))
927             .map_err(|_| Error::InitramfsLoad)?
928             .try_into()
929             .unwrap();
930         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
931 
932         let address =
933             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
934         let address = GuestAddress(address);
935 
936         guest_mem
937             .read_volatile_from(address, initramfs, size)
938             .map_err(|_| Error::InitramfsLoad)?;
939 
940         info!("Initramfs loaded: address = 0x{:x}", address.0);
941         Ok(arch::InitramfsConfig { address, size })
942     }
943 
944     pub fn generate_cmdline(
945         payload: &PayloadConfig,
946         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
947     ) -> Result<Cmdline> {
948         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
949         if let Some(s) = payload.cmdline.as_ref() {
950             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
951         }
952 
953         #[cfg(target_arch = "aarch64")]
954         for entry in device_manager.lock().unwrap().cmdline_additions() {
955             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
956         }
957         Ok(cmdline)
958     }
959 
960     #[cfg(target_arch = "aarch64")]
961     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
962         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
963         let mem = uefi_flash.memory();
964         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
965             .map_err(Error::UefiLoad)?;
966         Ok(())
967     }
968 
969     #[cfg(target_arch = "aarch64")]
970     fn load_kernel(
971         firmware: Option<File>,
972         kernel: Option<File>,
973         memory_manager: Arc<Mutex<MemoryManager>>,
974     ) -> Result<EntryPoint> {
975         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
976         let mem = guest_memory.memory();
977         let entry_addr = match (firmware, kernel) {
978             (None, Some(mut kernel)) => {
979                 match linux_loader::loader::pe::PE::load(
980                     mem.deref(),
981                     Some(arch::layout::KERNEL_START),
982                     &mut kernel,
983                     None,
984                 ) {
985                     Ok(entry_addr) => entry_addr.kernel_load,
986                     // Try to load the binary as kernel PE file at first.
987                     // If failed, retry to load it as UEFI binary.
988                     // As the UEFI binary is formatless, it must be the last option to try.
989                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
990                         Self::load_firmware(&kernel, memory_manager)?;
991                         arch::layout::UEFI_START
992                     }
993                     Err(e) => {
994                         return Err(Error::KernelLoad(e));
995                     }
996                 }
997             }
998             (Some(firmware), None) => {
999                 Self::load_firmware(&firmware, memory_manager)?;
1000                 arch::layout::UEFI_START
1001             }
1002             _ => return Err(Error::InvalidPayload),
1003         };
1004 
1005         Ok(EntryPoint { entry_addr })
1006     }
1007 
1008     #[cfg(feature = "igvm")]
1009     fn load_igvm(
1010         igvm: File,
1011         memory_manager: Arc<Mutex<MemoryManager>>,
1012         cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1013         #[cfg(feature = "sev_snp")] host_data: &Option<String>,
1014     ) -> Result<EntryPoint> {
1015         let res = igvm_loader::load_igvm(
1016             &igvm,
1017             memory_manager,
1018             cpu_manager.clone(),
1019             "",
1020             #[cfg(feature = "sev_snp")]
1021             host_data,
1022         )
1023         .map_err(Error::IgvmLoad)?;
1024 
1025         cfg_if::cfg_if! {
1026             if #[cfg(feature = "sev_snp")] {
1027                 let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
1028                     EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
1029                 } else {
1030                     EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
1031                 };
1032             } else {
1033                let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
1034             }
1035         };
1036         Ok(entry_point)
1037     }
1038 
1039     #[cfg(target_arch = "x86_64")]
1040     fn load_kernel(
1041         mut kernel: File,
1042         cmdline: Option<Cmdline>,
1043         memory_manager: Arc<Mutex<MemoryManager>>,
1044     ) -> Result<EntryPoint> {
1045         info!("Loading kernel");
1046 
1047         let mem = {
1048             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1049             guest_memory.memory()
1050         };
1051 
1052         // Try ELF binary with PVH boot.
1053         let entry_addr = linux_loader::loader::elf::Elf::load(
1054             mem.deref(),
1055             None,
1056             &mut kernel,
1057             Some(arch::layout::HIGH_RAM_START),
1058         )
1059         // Try loading kernel as bzImage.
1060         .or_else(|_| {
1061             BzImage::load(
1062                 mem.deref(),
1063                 None,
1064                 &mut kernel,
1065                 Some(arch::layout::HIGH_RAM_START),
1066             )
1067         })
1068         .map_err(Error::KernelLoad)?;
1069 
1070         if let Some(cmdline) = cmdline {
1071             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1072                 .map_err(Error::LoadCmdLine)?;
1073         }
1074 
1075         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1076             // Use the PVH kernel entry point to boot the guest
1077             info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1078             Ok(EntryPoint {
1079                 entry_addr,
1080                 setup_header: None,
1081             })
1082         } else if entry_addr.setup_header.is_some() {
1083             // Use the bzImage 32bit entry point to boot the guest
1084             info!(
1085                 "bzImage kernel loaded: entry_addr = 0x{:x}",
1086                 entry_addr.kernel_load.0
1087             );
1088             Ok(EntryPoint {
1089                 entry_addr: entry_addr.kernel_load,
1090                 setup_header: entry_addr.setup_header,
1091             })
1092         } else {
1093             Err(Error::KernelMissingPvhHeader)
1094         }
1095     }
1096 
1097     #[cfg(target_arch = "x86_64")]
1098     fn load_payload(
1099         payload: &PayloadConfig,
1100         memory_manager: Arc<Mutex<MemoryManager>>,
1101         #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1102         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1103     ) -> Result<EntryPoint> {
1104         trace_scoped!("load_payload");
1105         #[cfg(feature = "igvm")]
1106         {
1107             if let Some(_igvm_file) = &payload.igvm {
1108                 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
1109                 #[cfg(feature = "sev_snp")]
1110                 if sev_snp_enabled {
1111                     return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
1112                 }
1113                 #[cfg(not(feature = "sev_snp"))]
1114                 return Self::load_igvm(igvm, memory_manager, cpu_manager);
1115             }
1116         }
1117         match (
1118             &payload.firmware,
1119             &payload.kernel,
1120             &payload.initramfs,
1121             &payload.cmdline,
1122         ) {
1123             (Some(firmware), None, None, None) => {
1124                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1125                 Self::load_kernel(firmware, None, memory_manager)
1126             }
1127             (None, Some(kernel), _, _) => {
1128                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1129                 let cmdline = Self::generate_cmdline(payload)?;
1130                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1131             }
1132             _ => Err(Error::InvalidPayload),
1133         }
1134     }
1135 
1136     #[cfg(target_arch = "aarch64")]
1137     fn load_payload(
1138         payload: &PayloadConfig,
1139         memory_manager: Arc<Mutex<MemoryManager>>,
1140     ) -> Result<EntryPoint> {
1141         match (&payload.firmware, &payload.kernel) {
1142             (Some(firmware), None) => {
1143                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1144                 Self::load_kernel(Some(firmware), None, memory_manager)
1145             }
1146             (None, Some(kernel)) => {
1147                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1148                 Self::load_kernel(None, Some(kernel), memory_manager)
1149             }
1150             _ => Err(Error::InvalidPayload),
1151         }
1152     }
1153 
1154     fn load_payload_async(
1155         memory_manager: &Arc<Mutex<MemoryManager>>,
1156         config: &Arc<Mutex<VmConfig>>,
1157         #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
1158         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1159     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1160         // Kernel with TDX is loaded in a different manner
1161         #[cfg(feature = "tdx")]
1162         if config.lock().unwrap().is_tdx_enabled() {
1163             return Ok(None);
1164         }
1165 
1166         config
1167             .lock()
1168             .unwrap()
1169             .payload
1170             .as_ref()
1171             .map(|payload| {
1172                 let memory_manager = memory_manager.clone();
1173                 let payload = payload.clone();
1174                 #[cfg(feature = "igvm")]
1175                 let cpu_manager = cpu_manager.clone();
1176 
1177                 std::thread::Builder::new()
1178                     .name("payload_loader".into())
1179                     .spawn(move || {
1180                         Self::load_payload(
1181                             &payload,
1182                             memory_manager,
1183                             #[cfg(feature = "igvm")]
1184                             cpu_manager,
1185                             #[cfg(feature = "sev_snp")]
1186                             sev_snp_enabled,
1187                         )
1188                     })
1189                     .map_err(Error::KernelLoadThreadSpawn)
1190             })
1191             .transpose()
1192     }
1193 
1194     #[cfg(target_arch = "x86_64")]
1195     fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
1196         trace_scoped!("configure_system");
1197         info!("Configuring system");
1198         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1199 
1200         let initramfs_config = match self.initramfs {
1201             Some(_) => Some(self.load_initramfs(&mem)?),
1202             None => None,
1203         };
1204 
1205         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1206         let rsdp_addr = Some(rsdp_addr);
1207         let sgx_epc_region = self
1208             .memory_manager
1209             .lock()
1210             .unwrap()
1211             .sgx_epc_region()
1212             .as_ref()
1213             .cloned();
1214 
1215         let serial_number = self
1216             .config
1217             .lock()
1218             .unwrap()
1219             .platform
1220             .as_ref()
1221             .and_then(|p| p.serial_number.clone());
1222 
1223         let uuid = self
1224             .config
1225             .lock()
1226             .unwrap()
1227             .platform
1228             .as_ref()
1229             .and_then(|p| p.uuid.clone());
1230 
1231         let oem_strings = self
1232             .config
1233             .lock()
1234             .unwrap()
1235             .platform
1236             .as_ref()
1237             .and_then(|p| p.oem_strings.clone());
1238 
1239         let oem_strings = oem_strings
1240             .as_deref()
1241             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1242 
1243         let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1244 
1245         arch::configure_system(
1246             &mem,
1247             arch::layout::CMDLINE_START,
1248             arch::layout::CMDLINE_MAX_SIZE,
1249             &initramfs_config,
1250             boot_vcpus,
1251             entry_addr.setup_header,
1252             rsdp_addr,
1253             sgx_epc_region,
1254             serial_number.as_deref(),
1255             uuid.as_deref(),
1256             oem_strings.as_deref(),
1257             topology,
1258         )
1259         .map_err(Error::ConfigureSystem)?;
1260         Ok(())
1261     }
1262 
1263     #[cfg(target_arch = "aarch64")]
1264     fn configure_system(
1265         &mut self,
1266         _rsdp_addr: GuestAddress,
1267         _entry_addr: EntryPoint,
1268     ) -> Result<()> {
1269         let cmdline = Self::generate_cmdline(
1270             self.config.lock().unwrap().payload.as_ref().unwrap(),
1271             &self.device_manager,
1272         )?;
1273         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1274         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1275         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1276         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1277         let initramfs_config = match self.initramfs {
1278             Some(_) => Some(self.load_initramfs(&mem)?),
1279             None => None,
1280         };
1281 
1282         let device_info = &self
1283             .device_manager
1284             .lock()
1285             .unwrap()
1286             .get_device_info()
1287             .clone();
1288 
1289         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1290             let pci_space = PciSpaceInfo {
1291                 pci_segment_id: pci_segment.id,
1292                 mmio_config_address: pci_segment.mmio_config_address,
1293                 pci_device_space_start: pci_segment.start_of_mem64_area,
1294                 pci_device_space_size: pci_segment.end_of_mem64_area
1295                     - pci_segment.start_of_mem64_area
1296                     + 1,
1297             };
1298             pci_space_info.push(pci_space);
1299         }
1300 
1301         let virtio_iommu_bdf = self
1302             .device_manager
1303             .lock()
1304             .unwrap()
1305             .iommu_attached_devices()
1306             .as_ref()
1307             .map(|(v, _)| *v);
1308 
1309         let vgic = self
1310             .device_manager
1311             .lock()
1312             .unwrap()
1313             .get_interrupt_controller()
1314             .unwrap()
1315             .lock()
1316             .unwrap()
1317             .get_vgic()
1318             .map_err(|_| {
1319                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1320                     arch::aarch64::Error::SetupGic,
1321                 ))
1322             })?;
1323 
1324         // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
1325         let pmu_supported = self
1326             .cpu_manager
1327             .lock()
1328             .unwrap()
1329             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1330             .map_err(|_| {
1331                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1332                     arch::aarch64::Error::VcpuInitPmu,
1333                 ))
1334             })?;
1335 
1336         arch::configure_system(
1337             &mem,
1338             cmdline.as_cstring().unwrap().to_str().unwrap(),
1339             vcpu_mpidrs,
1340             vcpu_topology,
1341             device_info,
1342             &initramfs_config,
1343             &pci_space_info,
1344             virtio_iommu_bdf.map(|bdf| bdf.into()),
1345             &vgic,
1346             &self.numa_nodes,
1347             pmu_supported,
1348         )
1349         .map_err(Error::ConfigureSystem)?;
1350 
1351         Ok(())
1352     }
1353 
1354     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1355         self.device_manager.lock().unwrap().console_resize_pipe()
1356     }
1357 
1358     pub fn shutdown(&mut self) -> Result<()> {
1359         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1360         let new_state = VmState::Shutdown;
1361 
1362         state.valid_transition(new_state)?;
1363 
1364         // Wake up the DeviceManager threads so they will get terminated cleanly
1365         self.device_manager
1366             .lock()
1367             .unwrap()
1368             .resume()
1369             .map_err(Error::Resume)?;
1370 
1371         self.cpu_manager
1372             .lock()
1373             .unwrap()
1374             .shutdown()
1375             .map_err(Error::CpuManager)?;
1376 
1377         // Wait for all the threads to finish
1378         for thread in self.threads.drain(..) {
1379             thread.join().map_err(Error::ThreadCleanup)?
1380         }
1381         *state = new_state;
1382 
1383         Ok(())
1384     }
1385 
1386     pub fn resize(
1387         &mut self,
1388         desired_vcpus: Option<u8>,
1389         desired_memory: Option<u64>,
1390         desired_balloon: Option<u64>,
1391     ) -> Result<()> {
1392         event!("vm", "resizing");
1393 
1394         if let Some(desired_vcpus) = desired_vcpus {
1395             if self
1396                 .cpu_manager
1397                 .lock()
1398                 .unwrap()
1399                 .resize(desired_vcpus)
1400                 .map_err(Error::CpuManager)?
1401             {
1402                 self.device_manager
1403                     .lock()
1404                     .unwrap()
1405                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1406                     .map_err(Error::DeviceManager)?;
1407             }
1408             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1409         }
1410 
1411         if let Some(desired_memory) = desired_memory {
1412             let new_region = self
1413                 .memory_manager
1414                 .lock()
1415                 .unwrap()
1416                 .resize(desired_memory)
1417                 .map_err(Error::MemoryManager)?;
1418 
1419             let memory_config = &mut self.config.lock().unwrap().memory;
1420 
1421             if let Some(new_region) = &new_region {
1422                 self.device_manager
1423                     .lock()
1424                     .unwrap()
1425                     .update_memory(new_region)
1426                     .map_err(Error::DeviceManager)?;
1427 
1428                 match memory_config.hotplug_method {
1429                     HotplugMethod::Acpi => {
1430                         self.device_manager
1431                             .lock()
1432                             .unwrap()
1433                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1434                             .map_err(Error::DeviceManager)?;
1435                     }
1436                     HotplugMethod::VirtioMem => {}
1437                 }
1438             }
1439 
1440             // We update the VM config regardless of the actual guest resize
1441             // operation result (happened or not), so that if the VM reboots
1442             // it will be running with the last configure memory size.
1443             match memory_config.hotplug_method {
1444                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1445                 HotplugMethod::VirtioMem => {
1446                     if desired_memory > memory_config.size {
1447                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1448                     } else {
1449                         memory_config.hotplugged_size = None;
1450                     }
1451                 }
1452             }
1453         }
1454 
1455         if let Some(desired_balloon) = desired_balloon {
1456             self.device_manager
1457                 .lock()
1458                 .unwrap()
1459                 .resize_balloon(desired_balloon)
1460                 .map_err(Error::DeviceManager)?;
1461 
1462             // Update the configuration value for the balloon size to ensure
1463             // a reboot would use the right value.
1464             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1465                 balloon_config.size = desired_balloon;
1466             }
1467         }
1468 
1469         event!("vm", "resized");
1470 
1471         Ok(())
1472     }
1473 
1474     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1475         let memory_config = &mut self.config.lock().unwrap().memory;
1476 
1477         if let Some(zones) = &mut memory_config.zones {
1478             for zone in zones.iter_mut() {
1479                 if zone.id == id {
1480                     if desired_memory >= zone.size {
1481                         let hotplugged_size = desired_memory - zone.size;
1482                         self.memory_manager
1483                             .lock()
1484                             .unwrap()
1485                             .resize_zone(&id, desired_memory - zone.size)
1486                             .map_err(Error::MemoryManager)?;
1487                         // We update the memory zone config regardless of the
1488                         // actual 'resize-zone' operation result (happened or
1489                         // not), so that if the VM reboots it will be running
1490                         // with the last configured memory zone size.
1491                         zone.hotplugged_size = Some(hotplugged_size);
1492 
1493                         return Ok(());
1494                     } else {
1495                         error!(
1496                             "Invalid to ask less ({}) than boot RAM ({}) for \
1497                             this memory zone",
1498                             desired_memory, zone.size,
1499                         );
1500                         return Err(Error::ResizeZone);
1501                     }
1502                 }
1503             }
1504         }
1505 
1506         error!("Could not find the memory zone {} for the resize", id);
1507         Err(Error::ResizeZone)
1508     }
1509 
1510     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1511         let pci_device_info = self
1512             .device_manager
1513             .lock()
1514             .unwrap()
1515             .add_device(&mut device_cfg)
1516             .map_err(Error::DeviceManager)?;
1517 
1518         // Update VmConfig by adding the new device. This is important to
1519         // ensure the device would be created in case of a reboot.
1520         {
1521             let mut config = self.config.lock().unwrap();
1522             add_to_config(&mut config.devices, device_cfg);
1523         }
1524 
1525         self.device_manager
1526             .lock()
1527             .unwrap()
1528             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1529             .map_err(Error::DeviceManager)?;
1530 
1531         Ok(pci_device_info)
1532     }
1533 
1534     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1535         let pci_device_info = self
1536             .device_manager
1537             .lock()
1538             .unwrap()
1539             .add_user_device(&mut device_cfg)
1540             .map_err(Error::DeviceManager)?;
1541 
1542         // Update VmConfig by adding the new device. This is important to
1543         // ensure the device would be created in case of a reboot.
1544         {
1545             let mut config = self.config.lock().unwrap();
1546             add_to_config(&mut config.user_devices, device_cfg);
1547         }
1548 
1549         self.device_manager
1550             .lock()
1551             .unwrap()
1552             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1553             .map_err(Error::DeviceManager)?;
1554 
1555         Ok(pci_device_info)
1556     }
1557 
1558     pub fn remove_device(&mut self, id: String) -> Result<()> {
1559         self.device_manager
1560             .lock()
1561             .unwrap()
1562             .remove_device(id.clone())
1563             .map_err(Error::DeviceManager)?;
1564 
1565         // Update VmConfig by removing the device. This is important to
1566         // ensure the device would not be created in case of a reboot.
1567         self.config.lock().unwrap().remove_device(&id);
1568 
1569         self.device_manager
1570             .lock()
1571             .unwrap()
1572             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1573             .map_err(Error::DeviceManager)?;
1574         Ok(())
1575     }
1576 
1577     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1578         let pci_device_info = self
1579             .device_manager
1580             .lock()
1581             .unwrap()
1582             .add_disk(&mut disk_cfg)
1583             .map_err(Error::DeviceManager)?;
1584 
1585         // Update VmConfig by adding the new device. This is important to
1586         // ensure the device would be created in case of a reboot.
1587         {
1588             let mut config = self.config.lock().unwrap();
1589             add_to_config(&mut config.disks, disk_cfg);
1590         }
1591 
1592         self.device_manager
1593             .lock()
1594             .unwrap()
1595             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1596             .map_err(Error::DeviceManager)?;
1597 
1598         Ok(pci_device_info)
1599     }
1600 
1601     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1602         let pci_device_info = self
1603             .device_manager
1604             .lock()
1605             .unwrap()
1606             .add_fs(&mut fs_cfg)
1607             .map_err(Error::DeviceManager)?;
1608 
1609         // Update VmConfig by adding the new device. This is important to
1610         // ensure the device would be created in case of a reboot.
1611         {
1612             let mut config = self.config.lock().unwrap();
1613             add_to_config(&mut config.fs, fs_cfg);
1614         }
1615 
1616         self.device_manager
1617             .lock()
1618             .unwrap()
1619             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1620             .map_err(Error::DeviceManager)?;
1621 
1622         Ok(pci_device_info)
1623     }
1624 
1625     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1626         let pci_device_info = self
1627             .device_manager
1628             .lock()
1629             .unwrap()
1630             .add_pmem(&mut pmem_cfg)
1631             .map_err(Error::DeviceManager)?;
1632 
1633         // Update VmConfig by adding the new device. This is important to
1634         // ensure the device would be created in case of a reboot.
1635         {
1636             let mut config = self.config.lock().unwrap();
1637             add_to_config(&mut config.pmem, pmem_cfg);
1638         }
1639 
1640         self.device_manager
1641             .lock()
1642             .unwrap()
1643             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1644             .map_err(Error::DeviceManager)?;
1645 
1646         Ok(pci_device_info)
1647     }
1648 
1649     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1650         let pci_device_info = self
1651             .device_manager
1652             .lock()
1653             .unwrap()
1654             .add_net(&mut net_cfg)
1655             .map_err(Error::DeviceManager)?;
1656 
1657         // Update VmConfig by adding the new device. This is important to
1658         // ensure the device would be created in case of a reboot.
1659         {
1660             let mut config = self.config.lock().unwrap();
1661             add_to_config(&mut config.net, net_cfg);
1662         }
1663 
1664         self.device_manager
1665             .lock()
1666             .unwrap()
1667             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1668             .map_err(Error::DeviceManager)?;
1669 
1670         Ok(pci_device_info)
1671     }
1672 
1673     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1674         let pci_device_info = self
1675             .device_manager
1676             .lock()
1677             .unwrap()
1678             .add_vdpa(&mut vdpa_cfg)
1679             .map_err(Error::DeviceManager)?;
1680 
1681         // Update VmConfig by adding the new device. This is important to
1682         // ensure the device would be created in case of a reboot.
1683         {
1684             let mut config = self.config.lock().unwrap();
1685             add_to_config(&mut config.vdpa, vdpa_cfg);
1686         }
1687 
1688         self.device_manager
1689             .lock()
1690             .unwrap()
1691             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1692             .map_err(Error::DeviceManager)?;
1693 
1694         Ok(pci_device_info)
1695     }
1696 
1697     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1698         let pci_device_info = self
1699             .device_manager
1700             .lock()
1701             .unwrap()
1702             .add_vsock(&mut vsock_cfg)
1703             .map_err(Error::DeviceManager)?;
1704 
1705         // Update VmConfig by adding the new device. This is important to
1706         // ensure the device would be created in case of a reboot.
1707         {
1708             let mut config = self.config.lock().unwrap();
1709             config.vsock = Some(vsock_cfg);
1710         }
1711 
1712         self.device_manager
1713             .lock()
1714             .unwrap()
1715             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1716             .map_err(Error::DeviceManager)?;
1717 
1718         Ok(pci_device_info)
1719     }
1720 
1721     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1722         Ok(self.device_manager.lock().unwrap().counters())
1723     }
1724 
1725     #[cfg(feature = "tdx")]
1726     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1727         use arch::x86_64::tdx::*;
1728 
1729         let firmware_path = self
1730             .config
1731             .lock()
1732             .unwrap()
1733             .payload
1734             .as_ref()
1735             .unwrap()
1736             .firmware
1737             .clone()
1738             .ok_or(Error::TdxFirmwareMissing)?;
1739         // The TDVF file contains a table of section as well as code
1740         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1741 
1742         // For all the sections allocate some RAM backing them
1743         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1744     }
1745 
    /// Computes the list of memory resources to describe in the TD HOB.
    ///
    /// Every region of `guest_memory` is walked in iteration order and
    /// interleaved with the entries of `sorted_sections`. Each element of the
    /// returned vector is a `(start, size, ram)` tuple:
    ///
    /// * `ram == true`: a span carved out of a guest memory region,
    /// * `ram == false`: a span copied verbatim from a section
    ///   (`section.address`, `section.size`).
    ///
    /// Sections are consumed with `pop()`, i.e. from the back of the vector.
    /// The caller in this file sorts them by address and then reverses the
    /// vector, so the section with the lowest address is consumed first.
    /// Sections left over once all regions have been walked are appended as
    /// non-RAM entries.
    #[cfg(feature = "tdx")]
    fn hob_memory_resources(
        mut sorted_sections: Vec<TdvfSection>,
        guest_memory: &GuestMemoryMmap,
    ) -> Vec<(u64, u64, bool)> {
        let mut list = Vec::new();

        // The section currently waiting to be emitted (None once exhausted).
        let mut current_section = sorted_sections.pop();

        // RAM regions interleaved with TDVF sections
        let mut next_start_addr = 0;
        for region in guest_memory.iter() {
            let region_start = region.start_addr().0;
            let region_end = region.last_addr().0;
            // Never start before the beginning of the current region.
            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            loop {
                let (start, size, ram) = if let Some(section) = &current_section {
                    if section.address <= next_start_addr {
                        // The pending section starts at or before the cursor:
                        // emit the section itself.
                        (section.address, section.size, false)
                    } else {
                        // Emit RAM from the cursor up to whichever comes
                        // first: the byte before the pending section or the
                        // end of the region.
                        let last_addr = std::cmp::min(section.address - 1, region_end);
                        (next_start_addr, last_addr - next_start_addr + 1, true)
                    }
                } else {
                    // No section left: the rest of the region is RAM.
                    (next_start_addr, region_end - next_start_addr + 1, true)
                };

                list.push((start, size, ram));

                // A section entry was emitted, move on to the next section.
                if !ram {
                    current_section = sorted_sections.pop();
                }

                next_start_addr = start + size;

                // A section located before this region may leave the cursor
                // below the region start; clamp it back up.
                if region_start > next_start_addr {
                    next_start_addr = region_start;
                }

                // The whole region has been covered.
                if next_start_addr > region_end {
                    break;
                }
            }
        }

        // Once all the interleaved sections have been processed, let's simply
        // pull the remaining ones.
        if let Some(section) = current_section {
            list.push((section.address, section.size, false));
        }
        while let Some(section) = sorted_sections.pop() {
            list.push((section.address, section.size, false));
        }

        list
    }
1805 
    /// Backs the TDVF sections with guest memory, fills them in and builds
    /// the TD HOB.
    ///
    /// The work happens in three phases:
    ///
    /// 1. For every section whose address is not already inside the boot
    ///    guest memory, a RAM region of `section.size` bytes is added.
    /// 2. Each section is populated according to its type: `Bfv`/`Cfv` are
    ///    copied from the firmware file, `Payload` is copied from
    ///    `self.kernel` (if any) after validating its setup header,
    ///    `PayloadParam` receives the generated command line, and the
    ///    address of the `TdHob` section is remembered.
    /// 3. The HOB is written at the `TdHob` address: memory resources, two
    ///    MMIO resources, the ACPI tables and, when a payload was copied,
    ///    the payload info.
    ///
    /// Returns the address of the `TdHob` section wrapped in `Some`.
    ///
    /// # Errors
    ///
    /// Returns the matching `Error` variant when allocating section memory,
    /// opening or seeking the firmware or payload file, validating the
    /// payload header, generating the command line or writing the HOB fails.
    ///
    /// # Panics
    ///
    /// Panics if the configuration has no payload, if `sections` contains no
    /// `TdHob` entry, or if one of the guest memory copies, the payload
    /// header read or the command line conversion (all `unwrap()`ed) fails.
    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(
        &mut self,
        sections: &[TdvfSection],
        guid_found: bool,
    ) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of section as well as code
        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        // Set when a payload gets copied; inserted into the HOB further down.
        let mut payload_info = None;
        // Address of the TdHob section; the HOB is built there.
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    mem.read_volatile_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    if let Some(payload_file) = self.kernel.as_mut() {
                        // Seeking to the end yields the size of the payload.
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        // The setup header is read from file offset 0x1f1.
                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_file
                            .read_volatile(&mut payload_header.as_bytes())
                            .unwrap();

                        // 0x5372_6448 is "HdrS" when read as little-endian
                        // bytes (presumably the Linux boot protocol
                        // signature; see the x86 boot protocol docs).
                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Reject headers older than version 0x0200 or with
                        // bit 0 of `loadflags` cleared.
                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Copy the whole file, starting from its beginning.
                        payload_file.rewind().map_err(Error::LoadPayload)?;
                        mem.read_volatile_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    info!("Copying payload parameters to guest memory");
                    let cmdline = Self::generate_cmdline(
                        self.config.lock().unwrap().payload.as_ref().unwrap(),
                    )?;
                    mem.write_slice(
                        cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                        GuestAddress(section.address),
                    )
                    .unwrap();
                }
                _ => {}
            }
        }

        // Generate HOB
        let mut hob = TdHob::start(hob_offset.unwrap());

        // Only the TempMem sections take part in the memory resource list.
        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        // Sort by ascending address, then reverse: hob_memory_resources()
        // pops from the back and therefore sees the lowest address first.
        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();

        for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
            hob.add_memory_resource(&mem, start, size, ram, guid_found)
                .map_err(Error::PopulateHob)?;
        }

        // MMIO regions
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.

        for acpi_table in crate::acpi::create_acpi_tables_tdx(
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        ) {
            hob.add_acpi_table(&mem, acpi_table.as_slice())
                .map_err(Error::PopulateHob)?;
        }

        // If a payload info has been created, let's insert it into the HOB.
        if let Some(payload_info) = payload_info {
            hob.add_payload(&mem, payload_info)
                .map_err(Error::PopulateHob)?;
        }

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }
1995 
1996     #[cfg(feature = "tdx")]
1997     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1998         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1999         let mem = guest_memory.memory();
2000 
2001         for section in sections {
2002             self.vm
2003                 .tdx_init_memory_region(
2004                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
2005                     section.address,
2006                     section.size,
2007                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
2008                     section.attributes == 1,
2009                 )
2010                 .map_err(Error::InitializeTdxMemoryRegion)?;
2011         }
2012 
2013         Ok(())
2014     }
2015 
2016     // Creates ACPI tables
2017     // In case of TDX being used, this is a no-op since the tables will be
2018     // created and passed when populating the HOB.
2019 
2020     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2021         #[cfg(feature = "tdx")]
2022         if self.config.lock().unwrap().is_tdx_enabled() {
2023             return None;
2024         }
2025         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2026         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2027         let rsdp_addr = crate::acpi::create_acpi_tables(
2028             &mem,
2029             &self.device_manager,
2030             &self.cpu_manager,
2031             &self.memory_manager,
2032             &self.numa_nodes,
2033             tpm_enabled,
2034         );
2035         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2036 
2037         Some(rsdp_addr)
2038     }
2039 
2040     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2041         trace_scoped!("entry_point");
2042 
2043         self.load_payload_handle
2044             .take()
2045             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2046             .transpose()
2047     }
2048 
    /// Boots the VM, or resumes it when it is currently paused.
    ///
    /// For a VM that is not paused the following happens, in order:
    ///
    /// 1. The target state is chosen (`BreakPoint` when `stop_on_boot` is
    ///    set, `Running` otherwise) and the transition is validated.
    /// 2. On x86_64 the ACPI tables are created (skipped for SEV-SNP guests).
    /// 3. The payload load result is collected through `entry_point()`.
    /// 4. Every vCPU returned by the CPU manager is configured.
    /// 5. With the `tdx` feature and TDX enabled, the TDVF sections are
    ///    parsed and populated and the HOB address is retrieved.
    /// 6. On aarch64 the ACPI tables are created at this point.
    /// 7. If an entry point exists, `configure_system` is called.
    /// 8. On x86_64 the address space is allocated; for TDX the vCPUs and
    ///    memory regions are initialised and TDX is finalised.
    /// 9. The hypervisor VM is resumed if the state was `Created`, the boot
    ///    vCPUs are started and the new state is stored.
    ///
    /// # Errors
    ///
    /// Returns the error of the first step that fails; the VM state is only
    /// updated once every step has succeeded.
    ///
    /// # Panics
    ///
    /// Panics if an entry point is available but no RSDP address was
    /// produced, or if one of the internal mutexes is poisoned.
    pub fn boot(&mut self) -> Result<()> {
        trace_scoped!("Vm::boot");
        let current_state = self.get_state()?;
        // Booting a paused VM simply resumes it.
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        let new_state = if self.stop_on_boot {
            VmState::BreakPoint
        } else {
            VmState::Running
        };
        current_state.valid_transition(new_state)?;

        // Do earlier to parallelise with loading kernel
        #[cfg(target_arch = "x86_64")]
        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
                let rsdp_addr = if sev_snp_enabled {
                    // In case of SEV-SNP guest ACPI tables are provided via
                    // IGVM. So skip the creation of ACPI tables and set the
                    // rsdp addr to None.
                    None
                } else {
                    self.create_acpi_tables()
                };
            } else {
                let rsdp_addr = self.create_acpi_tables();
            }
        }

        // Load kernel synchronously or if asynchronous then wait for load to
        // finish.
        let entry_point = self.entry_point()?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        // Configure the vcpus that have been created
        let vcpus = self.cpu_manager.lock().unwrap().vcpus();
        for vcpu in vcpus {
            let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
            // Boot setup is only passed along when an entry point exists.
            let boot_setup = entry_point.map(|e| (e, guest_memory));
            self.cpu_manager
                .lock()
                .unwrap()
                .configure_vcpu(vcpu, boot_setup)
                .map_err(Error::CpuManager)?;
        }

        #[cfg(feature = "tdx")]
        let (sections, guid_found) = if tdx_enabled {
            self.extract_tdvf_sections()?
        } else {
            (Vec::new(), false)
        };

        // Configuring the TDX regions requires that the vCPUs are created.
        #[cfg(feature = "tdx")]
        let hob_address = if tdx_enabled {
            // TDX sections are written to memory.
            self.populate_tdx_sections(&sections, guid_found)?
        } else {
            None
        };

        // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
        // available after they are configured
        #[cfg(target_arch = "aarch64")]
        let rsdp_addr = self.create_acpi_tables();

        // Configure shared state based on loaded kernel
        entry_point
            .map(|entry_point| {
                // Safe to unwrap rsdp_addr as we know it can't be None when
                // the entry_point is Some.
                self.configure_system(rsdp_addr.unwrap(), entry_point)
            })
            .transpose()?;

        #[cfg(target_arch = "x86_64")]
        // Note: For x86, always call this function before invoking start boot vcpus.
        // Otherwise guest would fail to boot because we haven't created the
        // userspace mappings to update the hypervisor about the memory mappings.
        // These mappings must be created before we start the vCPU threads for
        // the very first time.
        self.memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            // Let the hypervisor know which memory ranges are shared with the
            // guest. This prevents the guest from ignoring/discarding memory
            // regions provided by the host.
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured TDX setup is complete
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        // Resume the vm for MSHV
        if current_state == VmState::Created {
            self.vm.resume().map_err(Error::ResumeVm)?;
        }

        // The vCPUs are started in a stopped state when booting into a
        // breakpoint.
        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus(new_state == VmState::BreakPoint)
            .map_err(Error::CpuManager)?;

        // Everything succeeded: publish the new state. Any try_write failure
        // is reported as a poisoned state.
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        Ok(())
    }
2174 
2175     pub fn restore(&mut self) -> Result<()> {
2176         event!("vm", "restoring");
2177 
2178         #[cfg(target_arch = "x86_64")]
2179         // Note: For x86, always call this function before invoking start boot vcpus.
2180         // Otherwise guest would fail to boot because we haven't created the
2181         // userspace mappings to update the hypervisor about the memory mappings.
2182         // These mappings must be created before we start the vCPU threads for
2183         // the very first time for the restored VM.
2184         self.memory_manager
2185             .lock()
2186             .unwrap()
2187             .allocate_address_space()
2188             .map_err(Error::MemoryManager)?;
2189 
2190         // Now we can start all vCPUs from here.
2191         self.cpu_manager
2192             .lock()
2193             .unwrap()
2194             .start_restored_vcpus()
2195             .map_err(Error::CpuManager)?;
2196 
2197         event!("vm", "restored");
2198         Ok(())
2199     }
2200 
2201     /// Gets a thread-safe reference counted pointer to the VM configuration.
2202     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2203         Arc::clone(&self.config)
2204     }
2205 
2206     /// Get the VM state. Returns an error if the state is poisoned.
2207     pub fn get_state(&self) -> Result<VmState> {
2208         self.state
2209             .try_read()
2210             .map_err(|_| Error::PoisonedState)
2211             .map(|state| *state)
2212     }
2213 
2214     /// Gets the actual size of the balloon.
2215     pub fn balloon_size(&self) -> u64 {
2216         self.device_manager.lock().unwrap().balloon_size()
2217     }
2218 
2219     pub fn send_memory_fds(
2220         &mut self,
2221         socket: &mut UnixStream,
2222     ) -> std::result::Result<(), MigratableError> {
2223         for (slot, fd) in self
2224             .memory_manager
2225             .lock()
2226             .unwrap()
2227             .memory_slot_fds()
2228             .drain()
2229         {
2230             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2231                 .write_to(socket)
2232                 .map_err(|e| {
2233                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2234                 })?;
2235             socket
2236                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2237                 .map_err(|e| {
2238                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2239                 })?;
2240 
2241             Response::read_from(socket)?.ok_or_abandon(
2242                 socket,
2243                 MigratableError::MigrateSend(anyhow!("Error during memory fd migration")),
2244             )?;
2245         }
2246 
2247         Ok(())
2248     }
2249 
2250     pub fn send_memory_regions<F>(
2251         &mut self,
2252         ranges: &MemoryRangeTable,
2253         fd: &mut F,
2254     ) -> std::result::Result<(), MigratableError>
2255     where
2256         F: WriteVolatile,
2257     {
2258         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2259         let mem = guest_memory.memory();
2260 
2261         for range in ranges.regions() {
2262             let mut offset: u64 = 0;
2263             // Here we are manually handling the retry in case we can't the
2264             // whole region at once because we can't use the implementation
2265             // from vm-memory::GuestMemory of write_all_to() as it is not
2266             // following the correct behavior. For more info about this issue
2267             // see: https://github.com/rust-vmm/vm-memory/issues/174
2268             loop {
2269                 let bytes_written = mem
2270                     .write_volatile_to(
2271                         GuestAddress(range.gpa + offset),
2272                         fd,
2273                         (range.length - offset) as usize,
2274                     )
2275                     .map_err(|e| {
2276                         MigratableError::MigrateSend(anyhow!(
2277                             "Error transferring memory to socket: {}",
2278                             e
2279                         ))
2280                     })?;
2281                 offset += bytes_written as u64;
2282 
2283                 if offset == range.length {
2284                     break;
2285                 }
2286             }
2287         }
2288 
2289         Ok(())
2290     }
2291 
2292     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2293         self.memory_manager
2294             .lock()
2295             .unwrap()
2296             .memory_range_table(false)
2297     }
2298 
2299     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2300         self.device_manager.lock().unwrap().device_tree()
2301     }
2302 
2303     pub fn activate_virtio_devices(&self) -> Result<()> {
2304         self.device_manager
2305             .lock()
2306             .unwrap()
2307             .activate_virtio_devices()
2308             .map_err(Error::ActivateVirtioDevices)
2309     }
2310 
2311     #[cfg(target_arch = "x86_64")]
2312     pub fn power_button(&self) -> Result<()> {
2313         return self
2314             .device_manager
2315             .lock()
2316             .unwrap()
2317             .notify_power_button()
2318             .map_err(Error::PowerButton);
2319     }
2320 
2321     #[cfg(target_arch = "aarch64")]
2322     pub fn power_button(&self) -> Result<()> {
2323         self.device_manager
2324             .lock()
2325             .unwrap()
2326             .notify_power_button()
2327             .map_err(Error::PowerButton)
2328     }
2329 
2330     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2331         self.memory_manager.lock().unwrap().snapshot_data()
2332     }
2333 
2334     #[cfg(feature = "guest_debug")]
2335     pub fn debug_request(
2336         &mut self,
2337         gdb_request: &GdbRequestPayload,
2338         cpu_id: usize,
2339     ) -> Result<GdbResponsePayload> {
2340         use GdbRequestPayload::*;
2341         match gdb_request {
2342             SetSingleStep(single_step) => {
2343                 self.set_guest_debug(cpu_id, &[], *single_step)
2344                     .map_err(Error::Debug)?;
2345             }
2346             SetHwBreakPoint(addrs) => {
2347                 self.set_guest_debug(cpu_id, addrs, false)
2348                     .map_err(Error::Debug)?;
2349             }
2350             Pause => {
2351                 self.debug_pause().map_err(Error::Debug)?;
2352             }
2353             Resume => {
2354                 self.debug_resume().map_err(Error::Debug)?;
2355             }
2356             ReadRegs => {
2357                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2358                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2359             }
2360             WriteRegs(regs) => {
2361                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2362             }
2363             ReadMem(vaddr, len) => {
2364                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2365                 let mem = self
2366                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2367                     .map_err(Error::Debug)?;
2368                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2369             }
2370             WriteMem(vaddr, data) => {
2371                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2372                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2373                     .map_err(Error::Debug)?;
2374             }
2375             ActiveVcpus => {
2376                 let active_vcpus = self.active_vcpus();
2377                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2378             }
2379         }
2380         Ok(GdbResponsePayload::CommandComplete)
2381     }
2382 
2383     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2384     fn get_dump_state(
2385         &mut self,
2386         destination_url: &str,
2387     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2388         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2389         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2390         let mut elf_phdr_num = 1;
2391         let elf_sh_info = 0;
2392         let coredump_file_path = url_to_file(destination_url)?;
2393         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2394 
2395         if mapping_num < UINT16_MAX - 2 {
2396             elf_phdr_num += mapping_num as u16;
2397         } else {
2398             panic!("mapping num beyond 65535 not supported");
2399         }
2400         let coredump_file = OpenOptions::new()
2401             .read(true)
2402             .write(true)
2403             .create_new(true)
2404             .open(coredump_file_path)
2405             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2406 
2407         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2408         let mem_data = self
2409             .memory_manager
2410             .lock()
2411             .unwrap()
2412             .coredump_memory_regions(mem_offset);
2413 
2414         Ok(DumpState {
2415             elf_note_size,
2416             elf_phdr_num,
2417             elf_sh_info,
2418             mem_offset,
2419             mem_info: Some(mem_data),
2420             file: Some(coredump_file),
2421         })
2422     }
2423 
2424     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2425     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2426         size_of::<elf::Elf64_Ehdr>() as u64
2427             + note_size as u64
2428             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2429     }
2430 
2431     pub fn nmi(&self) -> Result<()> {
2432         return self
2433             .cpu_manager
2434             .lock()
2435             .unwrap()
2436             .nmi()
2437             .map_err(|_| Error::ErrorNmi);
2438     }
2439 }
2440 
2441 impl Pausable for Vm {
    /// Pauses the VM.
    ///
    /// The state write lock is taken with `try_write` and held until the end
    /// of the function. After validating the transition to
    /// `VmState::Paused`, the VM clock is saved with its flags reset
    /// (x86_64 only), pending virtio devices are activated, and the vCPUs,
    /// the devices and finally the hypervisor VM are paused, in that order.
    /// The new state is only stored once all of this has succeeded.
    ///
    /// # Errors
    ///
    /// Returns `MigratableError::Pause` if the state lock cannot be taken,
    /// the transition is invalid, the clock cannot be read, the virtio
    /// activation fails or the hypervisor VM cannot be paused. Errors from
    /// pausing the CPU manager or the device manager are propagated as is.
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "pausing");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        // Keep a copy of the VM clock in `saved_clock`, with its flags reset.
        #[cfg(target_arch = "x86_64")]
        {
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
            clock.reset_flags();
            self.saved_clock = Some(clock);
        }

        // Before pausing the vCPUs activate any pending virtio devices that might
        // need activation between starting the pause (or e.g. a migration it's part of)
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        // Pause order: vCPUs first, then devices, then the hypervisor VM.
        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        self.vm
            .pause()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?;

        // Everything is paused: publish the new state.
        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }
2482 
2483     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2484         event!("vm", "resuming");
2485         let current_state = self.get_state().unwrap();
2486         let mut state = self
2487             .state
2488             .try_write()
2489             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2490         let new_state = VmState::Running;
2491 
2492         state
2493             .valid_transition(new_state)
2494             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2495 
2496         self.cpu_manager.lock().unwrap().resume()?;
2497         #[cfg(target_arch = "x86_64")]
2498         {
2499             if let Some(clock) = &self.saved_clock {
2500                 self.vm.set_clock(clock).map_err(|e| {
2501                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2502                 })?;
2503             }
2504         }
2505 
2506         if current_state == VmState::Paused {
2507             self.vm
2508                 .resume()
2509                 .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?;
2510         }
2511 
2512         self.device_manager.lock().unwrap().resume()?;
2513 
2514         // And we're back to the Running state.
2515         *state = new_state;
2516         event!("vm", "resumed");
2517         Ok(())
2518     }
2519 }
2520 
/// VM-level state stored at the root of a VM snapshot; filled in by `Vm::snapshot()`.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    /// Copy of `Vm::saved_clock` at snapshot time (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    pub clock: Option<hypervisor::ClockData>,
    /// CPUID entries returned by `arch::generate_common_cpuid` at snapshot time
    /// (KVM on x86_64 only).
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}
2528 
/// Snapshot identifier reported by `Vm`'s `Snapshottable::id()` implementation.
pub const VM_SNAPSHOT_ID: &str = "vm";
2530 impl Snapshottable for Vm {
2531     fn id(&self) -> String {
2532         VM_SNAPSHOT_ID.to_string()
2533     }
2534 
2535     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2536         event!("vm", "snapshotting");
2537 
2538         #[cfg(feature = "tdx")]
2539         {
2540             if self.config.lock().unwrap().is_tdx_enabled() {
2541                 return Err(MigratableError::Snapshot(anyhow!(
2542                     "Snapshot not possible with TDX VM"
2543                 )));
2544             }
2545         }
2546 
2547         let current_state = self.get_state().unwrap();
2548         if current_state != VmState::Paused {
2549             return Err(MigratableError::Snapshot(anyhow!(
2550                 "Trying to snapshot while VM is running"
2551             )));
2552         }
2553 
2554         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2555         let common_cpuid = {
2556             let amx = self.config.lock().unwrap().cpus.features.amx;
2557             let phys_bits = physical_bits(
2558                 &self.hypervisor,
2559                 self.config.lock().unwrap().cpus.max_phys_bits,
2560             );
2561             arch::generate_common_cpuid(
2562                 &self.hypervisor,
2563                 &arch::CpuidConfig {
2564                     sgx_epc_sections: None,
2565                     phys_bits,
2566                     kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2567                     #[cfg(feature = "tdx")]
2568                     tdx: false,
2569                     amx,
2570                 },
2571             )
2572             .map_err(|e| {
2573                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2574             })?
2575         };
2576 
2577         let vm_snapshot_state = VmSnapshot {
2578             #[cfg(target_arch = "x86_64")]
2579             clock: self.saved_clock,
2580             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2581             common_cpuid,
2582         };
2583 
2584         let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?;
2585 
2586         let (id, snapshot) = {
2587             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2588             (cpu_manager.id(), cpu_manager.snapshot()?)
2589         };
2590         vm_snapshot.add_snapshot(id, snapshot);
2591         let (id, snapshot) = {
2592             let mut memory_manager = self.memory_manager.lock().unwrap();
2593             (memory_manager.id(), memory_manager.snapshot()?)
2594         };
2595         vm_snapshot.add_snapshot(id, snapshot);
2596         let (id, snapshot) = {
2597             let mut device_manager = self.device_manager.lock().unwrap();
2598             (device_manager.id(), device_manager.snapshot()?)
2599         };
2600         vm_snapshot.add_snapshot(id, snapshot);
2601 
2602         event!("vm", "snapshotted");
2603         Ok(vm_snapshot)
2604     }
2605 }
2606 
2607 impl Transportable for Vm {
2608     fn send(
2609         &self,
2610         snapshot: &Snapshot,
2611         destination_url: &str,
2612     ) -> std::result::Result<(), MigratableError> {
2613         let mut snapshot_config_path = url_to_path(destination_url)?;
2614         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2615 
2616         // Create the snapshot config file
2617         let mut snapshot_config_file = OpenOptions::new()
2618             .read(true)
2619             .write(true)
2620             .create_new(true)
2621             .open(snapshot_config_path)
2622             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2623 
2624         // Serialize and write the snapshot config
2625         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2626             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2627 
2628         snapshot_config_file
2629             .write(vm_config.as_bytes())
2630             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2631 
2632         let mut snapshot_state_path = url_to_path(destination_url)?;
2633         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2634 
2635         // Create the snapshot state file
2636         let mut snapshot_state_file = OpenOptions::new()
2637             .read(true)
2638             .write(true)
2639             .create_new(true)
2640             .open(snapshot_state_path)
2641             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2642 
2643         // Serialize and write the snapshot state
2644         let vm_state =
2645             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2646 
2647         snapshot_state_file
2648             .write(&vm_state)
2649             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2650 
2651         // Tell the memory manager to also send/write its own snapshot.
2652         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2653             self.memory_manager
2654                 .lock()
2655                 .unwrap()
2656                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2657         } else {
2658             return Err(MigratableError::Restore(anyhow!(
2659                 "Missing memory manager snapshot"
2660             )));
2661         }
2662 
2663         Ok(())
2664     }
2665 }
2666 
2667 impl Migratable for Vm {
2668     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2669         self.memory_manager.lock().unwrap().start_dirty_log()?;
2670         self.device_manager.lock().unwrap().start_dirty_log()
2671     }
2672 
2673     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2674         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2675         self.device_manager.lock().unwrap().stop_dirty_log()
2676     }
2677 
2678     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2679         Ok(MemoryRangeTable::new_from_tables(vec![
2680             self.memory_manager.lock().unwrap().dirty_log()?,
2681             self.device_manager.lock().unwrap().dirty_log()?,
2682         ]))
2683     }
2684 
2685     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2686         self.memory_manager.lock().unwrap().start_migration()?;
2687         self.device_manager.lock().unwrap().start_migration()
2688     }
2689 
2690     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2691         self.memory_manager.lock().unwrap().complete_migration()?;
2692         self.device_manager.lock().unwrap().complete_migration()
2693     }
2694 }
2695 
#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Delegates the guest debug setup for vCPU `cpu_id` to the CPU manager.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM if it is running, then marks it as `VmState::BreakPoint`.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        // The read guard is released at the end of this statement, before `pause()`
        // takes the state write lock.
        let is_running = *self.state.read().unwrap() == VmState::Running;
        if is_running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        *self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)? = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes the VM when it is stopped at a breakpoint; does nothing otherwise.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        let at_breakpoint = *self.state.read().unwrap() == VmState::BreakPoint;
        if !at_breakpoint {
            return Ok(());
        }

        // NOTE(review): a resume failure is reported through the `Pause` variant; confirm
        // whether `DebuggableError` offers a dedicated variant for resume failures.
        self.resume().map_err(DebuggableError::Pause)
    }

    /// Reads the core registers of vCPU `cpu_id` through the CPU manager.
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    /// Writes `regs` to vCPU `cpu_id` through the CPU manager.
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    /// Reads `len` bytes at `vaddr` for vCPU `cpu_id` through the CPU manager.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    /// Writes `data` at `vaddr` for vCPU `cpu_id` through the CPU manager.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Returns the number of active vCPUs, or the number of boot vCPUs when none is
    /// active yet.
    fn active_vcpus(&self) -> usize {
        // Bind the count first so the CPU manager lock is released before it is taken
        // again below.
        let running = self.cpu_manager.lock().unwrap().active_vcpus();
        match running {
            // The VM is not booted yet. Report boot_vcpus() instead.
            0 => self.cpu_manager.lock().unwrap().boot_vcpus() as usize,
            count => count,
        }
    }
}
2779 
2780 #[cfg(feature = "guest_debug")]
/// Largest value representable in 16 bits (65535), widened to `u32`.
pub const UINT16_MAX: u32 = u16::MAX as u32;
2782 
// Empty impl: `Vm` overrides nothing, so every `Elf64Writable` method it exposes is the
// trait's provided default.
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}
2785 
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes a coredump of the guest to the file described by `destination_url`.
    ///
    /// A running VM is paused for the duration of the dump and resumed afterwards, even
    /// when the dump itself fails. A VM that was already paused stays paused.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` for a TDX VM (with the `tdx` feature),
    /// when the VM state cannot be read, or when the VM is neither running nor paused.
    /// Returns `GuestDebuggableError::Pause` / `GuestDebuggableError::Resume` when the
    /// automatic pause or resume fails. Errors raised while writing the dump are
    /// propagated as returned and take precedence over a resume failure.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        // Report a failed state lookup as an error instead of panicking.
        let current_state = self.get_state().map_err(|e| {
            GuestDebuggableError::Coredump(anyhow!("Could not get VM state: {:?}", e))
        })?;

        // `resume` records whether this call paused the VM and must undo it.
        let resume = match current_state {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                true
            }
            VmState::Paused => false,
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        };

        // Run every fallible dump step inside a closure so that an early `?` exit cannot
        // skip the resume below and leave the guest paused.
        let dump_result = (|| -> std::result::Result<(), GuestDebuggableError> {
            let coredump_state = self.get_dump_state(destination_url)?;

            self.write_header(&coredump_state)?;
            self.write_note(&coredump_state)?;
            self.write_loads(&coredump_state)?;

            self.cpu_manager
                .lock()
                .unwrap()
                .cpu_write_elf64_note(&coredump_state)?;
            self.cpu_manager
                .lock()
                .unwrap()
                .cpu_write_vmm_note(&coredump_state)?;

            self.memory_manager
                .lock()
                .unwrap()
                .coredump_iterate_save_mem(&coredump_state)?;

            Ok(())
        })();

        if !resume {
            return dump_result;
        }

        let resume_result = self.resume().map_err(GuestDebuggableError::Resume);
        // A dump failure is the primary error; otherwise report the resume outcome.
        dump_result.and(resume_result)
    }
}
2844 
2845 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2846 #[cfg(test)]
2847 mod tests {
2848     use super::*;
2849 
2850     fn test_vm_state_transitions(state: VmState) {
2851         match state {
2852             VmState::Created => {
2853                 // Check the transitions from Created
2854                 state.valid_transition(VmState::Created).unwrap_err();
2855                 state.valid_transition(VmState::Running).unwrap();
2856                 state.valid_transition(VmState::Shutdown).unwrap();
2857                 state.valid_transition(VmState::Paused).unwrap();
2858                 state.valid_transition(VmState::BreakPoint).unwrap();
2859             }
2860             VmState::Running => {
2861                 // Check the transitions from Running
2862                 state.valid_transition(VmState::Created).unwrap_err();
2863                 state.valid_transition(VmState::Running).unwrap_err();
2864                 state.valid_transition(VmState::Shutdown).unwrap();
2865                 state.valid_transition(VmState::Paused).unwrap();
2866                 state.valid_transition(VmState::BreakPoint).unwrap();
2867             }
2868             VmState::Shutdown => {
2869                 // Check the transitions from Shutdown
2870                 state.valid_transition(VmState::Created).unwrap_err();
2871                 state.valid_transition(VmState::Running).unwrap();
2872                 state.valid_transition(VmState::Shutdown).unwrap_err();
2873                 state.valid_transition(VmState::Paused).unwrap_err();
2874                 state.valid_transition(VmState::BreakPoint).unwrap_err();
2875             }
2876             VmState::Paused => {
2877                 // Check the transitions from Paused
2878                 state.valid_transition(VmState::Created).unwrap_err();
2879                 state.valid_transition(VmState::Running).unwrap();
2880                 state.valid_transition(VmState::Shutdown).unwrap();
2881                 state.valid_transition(VmState::Paused).unwrap_err();
2882                 state.valid_transition(VmState::BreakPoint).unwrap_err();
2883             }
2884             VmState::BreakPoint => {
2885                 // Check the transitions from Breakpoint
2886                 state.valid_transition(VmState::Created).unwrap();
2887                 state.valid_transition(VmState::Running).unwrap();
2888                 state.valid_transition(VmState::Shutdown).unwrap_err();
2889                 state.valid_transition(VmState::Paused).unwrap_err();
2890                 state.valid_transition(VmState::BreakPoint).unwrap_err();
2891             }
2892         }
2893     }
2894 
    /// Runs the transition checks starting from `VmState::Created`.
    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }
2899 
    /// Runs the transition checks starting from `VmState::Running`.
    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }
2904 
    /// Runs the transition checks starting from `VmState::Shutdown`.
    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }
2909 
    /// Runs the transition checks starting from `VmState::Paused`.
    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }
2914 
    /// Exercises `Vm::hob_memory_resources` with seven layouts of TDVF sections and guest
    /// RAM ranges and compares the result with a hand-written list of tuples.
    ///
    /// In every case the sections are listed in descending address order. Each expected
    /// tuple looks like `(start, size, is_ram)`: the boolean is `false` for entries that
    /// match a TDVF section and `true` for the remaining RAM pieces.
    /// NOTE(review): confirm the tuple meaning against `hob_memory_resources` itself.
    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
3119 }
3120 
3121 #[cfg(target_arch = "aarch64")]
3122 #[cfg(test)]
3123 mod tests {
3124     use arch::aarch64::fdt::create_fdt;
3125     use arch::aarch64::layout;
3126     use arch::{DeviceType, MmioDeviceInfo};
3127     use devices::gic::Gic;
3128 
3129     use super::*;
3130 
    // Length of each MMIO device window declared in the test below; also used as the
    // spacing between the devices' base addresses.
    const LEN: u64 = 4096;
3132 
3133     #[test]
3134     fn test_create_fdt_with_devices() {
3135         let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
3136         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
3137 
3138         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
3139             (
3140                 (DeviceType::Serial, DeviceType::Serial.to_string()),
3141                 MmioDeviceInfo {
3142                     addr: 0x00,
3143                     len: LEN,
3144                     irq: 33,
3145                 },
3146             ),
3147             (
3148                 (DeviceType::Virtio(1), "virtio".to_string()),
3149                 MmioDeviceInfo {
3150                     addr: LEN,
3151                     len: LEN,
3152                     irq: 34,
3153                 },
3154             ),
3155             (
3156                 (DeviceType::Rtc, "rtc".to_string()),
3157                 MmioDeviceInfo {
3158                     addr: 2 * LEN,
3159                     len: LEN,
3160                     irq: 35,
3161                 },
3162             ),
3163         ]
3164         .iter()
3165         .cloned()
3166         .collect();
3167 
3168         let hv = hypervisor::new().unwrap();
3169         let vm = hv.create_vm().unwrap();
3170         let gic = vm
3171             .create_vgic(Gic::create_default_config(1))
3172             .expect("Cannot create gic");
3173         create_fdt(
3174             &mem,
3175             "console=tty0",
3176             vec![0],
3177             Some((0, 0, 0)),
3178             &dev_info,
3179             &gic,
3180             &None,
3181             &Vec::new(),
3182             &BTreeMap::new(),
3183             None,
3184             true,
3185         )
3186         .unwrap();
3187     }
3188 }
3189 
/// Boots a tiny guest program on a single vCPU and runs it until the vCPU reports
/// `VmExit::Reset`.
///
/// The guest code adds two registers, writes the resulting digit and a newline to port
/// 0x3f8 and then halts. Any exit other than `Reset` or `Ignore` fails the test.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};

    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee,  /* out %al, (%dx) */
        0xf4,  /* hlt */
    ];

    // A single guest memory range holding the code at its start.
    let load_addr = GuestAddress(0x1000);
    let mem_size = 0x1000;
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor VM.
    for (slot, region) in mem.iter().enumerate() {
        let user_region = vm.make_user_memory_region(
            slot as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(user_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Code segment based at 0 so that the instruction pointer is the load address.
    let mut sregs = vcpu.get_sregs().expect("get sregs failed");
    sregs.cs.base = 0;
    sregs.cs.selector = 0;
    vcpu.set_sregs(&sregs).expect("set sregs failed");

    let mut regs = vcpu.get_regs().expect("get regs failed");
    regs.set_rip(0x1000);
    regs.set_rax(2);
    regs.set_rbx(3);
    regs.set_rflags(2);
    vcpu.set_regs(&regs).expect("set regs failed");

    loop {
        let exit = vcpu.run().expect("run failed");
        match exit {
            VmExit::Ignore => {}
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}
3254