xref: /cloud-hypervisor/vmm/src/vm.rs (revision 0310c5726f4d1e560ec39e0a6cd4298bbfe8dc07)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
20 use crate::coredump::{
21     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
22 };
23 use crate::cpu;
24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair};
25 use crate::device_tree::DeviceTree;
26 #[cfg(feature = "guest_debug")]
27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
28 #[cfg(feature = "igvm")]
29 use crate::igvm::igvm_loader;
30 use crate::memory_manager::{
31     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
32 };
33 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
34 use crate::migration::get_vm_snapshot;
35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
36 use crate::migration::url_to_file;
37 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
38 use crate::GuestMemoryMmap;
39 use crate::{
40     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
41 };
42 use anyhow::anyhow;
43 use arch::get_host_cpu_phys_bits;
44 #[cfg(target_arch = "x86_64")]
45 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
46 #[cfg(feature = "tdx")]
47 use arch::x86_64::tdx::TdvfSection;
48 use arch::EntryPoint;
49 #[cfg(target_arch = "aarch64")]
50 use arch::PciSpaceInfo;
51 use arch::{NumaNode, NumaNodes};
52 #[cfg(target_arch = "aarch64")]
53 use devices::interrupt_controller;
54 use devices::AcpiNotificationFlags;
55 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
56 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
57 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
58 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
59 use hypervisor::{HypervisorVmError, VmOps};
60 use libc::{termios, SIGWINCH};
61 use linux_loader::cmdline::Cmdline;
62 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
63 use linux_loader::elf;
64 #[cfg(target_arch = "x86_64")]
65 use linux_loader::loader::bzimage::BzImage;
66 #[cfg(target_arch = "x86_64")]
67 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
68 #[cfg(target_arch = "aarch64")]
69 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
70 use linux_loader::loader::KernelLoader;
71 use seccompiler::SeccompAction;
72 use serde::{Deserialize, Serialize};
73 use std::cmp;
74 use std::collections::BTreeMap;
75 use std::collections::HashMap;
76 use std::fs::{File, OpenOptions};
77 use std::io::{self, Seek, SeekFrom, Write};
78 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
79 use std::mem::size_of;
80 use std::num::Wrapping;
81 use std::ops::Deref;
82 use std::os::unix::net::UnixStream;
83 use std::sync::{Arc, Mutex, RwLock};
84 use std::time::Instant;
85 use std::{result, str, thread};
86 use thiserror::Error;
87 use tracer::trace_scoped;
88 use vm_device::Bus;
89 #[cfg(feature = "tdx")]
90 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
91 use vm_memory::{
92     Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
93 };
94 use vm_migration::protocol::{Request, Response, Status};
95 use vm_migration::{
96     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
97     SnapshotData, Snapshottable, Transportable,
98 };
99 use vmm_sys_util::eventfd::EventFd;
100 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
101 
102 /// Errors associated with VM management
103 #[derive(Debug, Error)]
104 pub enum Error {
105     #[error("Cannot open kernel file: {0}")]
106     KernelFile(#[source] io::Error),
107 
108     #[error("Cannot open initramfs file: {0}")]
109     InitramfsFile(#[source] io::Error),
110 
111     #[error("Cannot load the kernel into memory: {0}")]
112     KernelLoad(#[source] linux_loader::loader::Error),
113 
114     #[cfg(target_arch = "aarch64")]
115     #[error("Cannot load the UEFI binary into memory: {0:?}")]
116     UefiLoad(arch::aarch64::uefi::Error),
117 
118     #[error("Cannot load the initramfs into memory")]
119     InitramfsLoad,
120 
121     #[error("Cannot load the kernel command line into memory: {0}")]
122     LoadCmdLine(#[source] linux_loader::loader::Error),
123 
124     #[error("Cannot modify the kernel command line: {0}")]
125     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
126 
127     #[error("Cannot create the kernel command line: {0}")]
128     CmdLineCreate(#[source] linux_loader::cmdline::Error),
129 
130     #[error("Cannot configure system: {0}")]
131     ConfigureSystem(#[source] arch::Error),
132 
133     #[cfg(target_arch = "aarch64")]
134     #[error("Cannot enable interrupt controller: {0:?}")]
135     EnableInterruptController(interrupt_controller::Error),
136 
137     #[error("VM state is poisoned")]
138     PoisonedState,
139 
140     #[error("Error from device manager: {0:?}")]
141     DeviceManager(DeviceManagerError),
142 
143     #[error("No device with id {0:?} to remove")]
144     NoDeviceToRemove(String),
145 
146     #[error("Cannot spawn a signal handler thread: {0}")]
147     SignalHandlerSpawn(#[source] io::Error),
148 
149     #[error("Failed to join on threads: {0:?}")]
150     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
151 
152     #[error("VM config is missing")]
153     VmMissingConfig,
154 
155     #[error("VM is not created")]
156     VmNotCreated,
157 
158     #[error("VM is already created")]
159     VmAlreadyCreated,
160 
161     #[error("VM is not running")]
162     VmNotRunning,
163 
164     #[error("Cannot clone EventFd: {0}")]
165     EventFdClone(#[source] io::Error),
166 
167     #[error("Invalid VM state transition: {0:?} to {1:?}")]
168     InvalidStateTransition(VmState, VmState),
169 
170     #[error("Error from CPU manager: {0}")]
171     CpuManager(#[source] cpu::Error),
172 
173     #[error("Cannot pause devices: {0}")]
174     PauseDevices(#[source] MigratableError),
175 
176     #[error("Cannot resume devices: {0}")]
177     ResumeDevices(#[source] MigratableError),
178 
179     #[error("Cannot pause CPUs: {0}")]
180     PauseCpus(#[source] MigratableError),
181 
182     #[error("Cannot resume CPUs: {0}")]
183     ResumeCpus(#[source] MigratableError),
184 
185     #[error("Cannot pause VM: {0}")]
186     Pause(#[source] MigratableError),
187 
188     #[error("Cannot resume VM: {0}")]
189     Resume(#[source] MigratableError),
190 
191     #[error("Memory manager error: {0:?}")]
192     MemoryManager(MemoryManagerError),
193 
194     #[error("Eventfd write error: {0}")]
195     EventfdError(#[source] std::io::Error),
196 
197     #[error("Cannot snapshot VM: {0}")]
198     Snapshot(#[source] MigratableError),
199 
200     #[error("Cannot restore VM: {0}")]
201     Restore(#[source] MigratableError),
202 
203     #[error("Cannot send VM snapshot: {0}")]
204     SnapshotSend(#[source] MigratableError),
205 
206     #[error("Invalid restore source URL")]
207     InvalidRestoreSourceUrl,
208 
209     #[error("Failed to validate config: {0}")]
210     ConfigValidation(#[source] ValidationError),
211 
212     #[error("Too many virtio-vsock devices")]
213     TooManyVsockDevices,
214 
215     #[error("Failed serializing into JSON: {0}")]
216     SerializeJson(#[source] serde_json::Error),
217 
218     #[error("Invalid NUMA configuration")]
219     InvalidNumaConfig,
220 
221     #[error("Cannot create seccomp filter: {0}")]
222     CreateSeccompFilter(#[source] seccompiler::Error),
223 
224     #[error("Cannot apply seccomp filter: {0}")]
225     ApplySeccompFilter(#[source] seccompiler::Error),
226 
227     #[error("Failed resizing a memory zone")]
228     ResizeZone,
229 
230     #[error("Cannot activate virtio devices: {0:?}")]
231     ActivateVirtioDevices(DeviceManagerError),
232 
233     #[error("Error triggering power button: {0:?}")]
234     PowerButton(DeviceManagerError),
235 
236     #[error("Kernel lacks PVH header")]
237     KernelMissingPvhHeader,
238 
239     #[error("Failed to allocate firmware RAM: {0:?}")]
240     AllocateFirmwareMemory(MemoryManagerError),
241 
242     #[error("Error manipulating firmware file: {0}")]
243     FirmwareFile(#[source] std::io::Error),
244 
245     #[error("Firmware too big")]
246     FirmwareTooLarge,
247 
248     #[error("Failed to copy firmware to memory: {0}")]
249     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
250 
251     #[cfg(feature = "sev_snp")]
252     #[error("Error enabling SEV-SNP VM: {0}")]
253     InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),
254 
255     #[cfg(feature = "tdx")]
256     #[error("Error performing I/O on TDX firmware file: {0}")]
257     LoadTdvf(#[source] std::io::Error),
258 
259     #[cfg(feature = "tdx")]
260     #[error("Error performing I/O on the TDX payload file: {0}")]
261     LoadPayload(#[source] std::io::Error),
262 
263     #[cfg(feature = "tdx")]
264     #[error("Error parsing TDVF: {0}")]
265     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
266 
267     #[cfg(feature = "tdx")]
268     #[error("Error populating TDX HOB: {0}")]
269     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
270 
271     #[cfg(feature = "tdx")]
272     #[error("Error allocating TDVF memory: {0:?}")]
273     AllocatingTdvfMemory(crate::memory_manager::Error),
274 
275     #[cfg(feature = "tdx")]
276     #[error("Error enabling TDX VM: {0}")]
277     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
278 
279     #[cfg(feature = "tdx")]
280     #[error("Error enabling TDX memory region: {0}")]
281     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
282 
283     #[cfg(feature = "tdx")]
284     #[error("Error finalizing TDX VM: {0}")]
285     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
286 
287     #[cfg(feature = "tdx")]
288     #[error("TDX firmware missing")]
289     TdxFirmwareMissing,
290 
291     #[cfg(feature = "tdx")]
292     #[error("Invalid TDX payload type")]
293     InvalidPayloadType,
294 
295     #[cfg(feature = "guest_debug")]
296     #[error("Error debugging VM: {0:?}")]
297     Debug(DebuggableError),
298 
299     #[error("Error spawning kernel loading thread")]
300     KernelLoadThreadSpawn(std::io::Error),
301 
302     #[error("Error joining kernel loading thread")]
303     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
304 
305     #[error("Payload configuration is not bootable")]
306     InvalidPayload,
307 
308     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
309     #[error("Error coredumping VM: {0:?}")]
310     Coredump(GuestDebuggableError),
311 
312     #[cfg(feature = "igvm")]
313     #[error("Cannot open igvm file: {0}")]
314     IgvmFile(#[source] io::Error),
315 
316     #[cfg(feature = "igvm")]
317     #[error("Cannot load the igvm into memory: {0}")]
318     IgvmLoad(#[source] igvm_loader::Error),
319 }
320 pub type Result<T> = result::Result<T, Error>;
321 
322 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
323 pub enum VmState {
324     Created,
325     Running,
326     Shutdown,
327     Paused,
328     BreakPoint,
329 }
330 
331 impl VmState {
332     fn valid_transition(self, new_state: VmState) -> Result<()> {
333         match self {
334             VmState::Created => match new_state {
335                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
336                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
337                     Ok(())
338                 }
339             },
340 
341             VmState::Running => match new_state {
342                 VmState::Created | VmState::Running => {
343                     Err(Error::InvalidStateTransition(self, new_state))
344                 }
345                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
346             },
347 
348             VmState::Shutdown => match new_state {
349                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
350                     Err(Error::InvalidStateTransition(self, new_state))
351                 }
352                 VmState::Running => Ok(()),
353             },
354 
355             VmState::Paused => match new_state {
356                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
357                     Err(Error::InvalidStateTransition(self, new_state))
358                 }
359                 VmState::Running | VmState::Shutdown => Ok(()),
360             },
361             VmState::BreakPoint => match new_state {
362                 VmState::Created | VmState::Running => Ok(()),
363                 _ => Err(Error::InvalidStateTransition(self, new_state)),
364             },
365         }
366     }
367 }
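// Illustrative transitions per valid_transition() above: Created -> Running
// and Paused -> Running are accepted, while Running -> Created or
// Shutdown -> Paused return Error::InvalidStateTransition.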
368 
369 struct VmOpsHandler {
370     memory: GuestMemoryAtomic<GuestMemoryMmap>,
371     #[cfg(target_arch = "x86_64")]
372     io_bus: Arc<Bus>,
373     mmio_bus: Arc<Bus>,
374 }
375 
376 impl VmOps for VmOpsHandler {
377     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
378         self.memory
379             .memory()
380             .write(buf, GuestAddress(gpa))
381             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
382     }
383 
384     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
385         self.memory
386             .memory()
387             .read(buf, GuestAddress(gpa))
388             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
389     }
390 
391     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
392         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
393             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
394         }
395         Ok(())
396     }
397 
398     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
399         match self.mmio_bus.write(gpa, data) {
400             Err(vm_device::BusError::MissingAddressRange) => {
401                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
402             }
403             Ok(Some(barrier)) => {
404                 info!("Waiting for barrier");
405                 barrier.wait();
406                 info!("Barrier released");
407             }
408             _ => {}
409         };
410         Ok(())
411     }
412 
413     #[cfg(target_arch = "x86_64")]
414     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
415         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
416             info!("Guest PIO read to unregistered address 0x{:x}", port);
417         }
418         Ok(())
419     }
420 
421     #[cfg(target_arch = "x86_64")]
422     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
423         match self.io_bus.write(port, data) {
424             Err(vm_device::BusError::MissingAddressRange) => {
425                 info!("Guest PIO write to unregistered address 0x{:x}", port);
426             }
427             Ok(Some(barrier)) => {
428                 info!("Waiting for barrier");
429                 barrier.wait();
430                 info!("Barrier released");
431             }
432             _ => {}
433         };
434         Ok(())
435     }
436 }
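// The VmOpsHandler above is wrapped into an Arc<dyn VmOps> and handed to the
// CPU manager (see new_from_memory_manager), which uses it to service guest
// memory, MMIO and PIO accesses triggered by vCPU exits.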
437 
438 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
439     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
440 
441     cmp::min(host_phys_bits, max_phys_bits)
442 }
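// For example, with a host reporting 46 physical address bits and a requested
// max_phys_bits of 52, physical_bits() returns 46.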
443 
444 pub struct Vm {
445     #[cfg(feature = "tdx")]
446     kernel: Option<File>,
447     initramfs: Option<File>,
448     threads: Vec<thread::JoinHandle<()>>,
449     device_manager: Arc<Mutex<DeviceManager>>,
450     config: Arc<Mutex<VmConfig>>,
451     state: RwLock<VmState>,
452     cpu_manager: Arc<Mutex<cpu::CpuManager>>,
453     memory_manager: Arc<Mutex<MemoryManager>>,
454     #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
455     // The hypervisor-abstracted virtual machine.
456     vm: Arc<dyn hypervisor::Vm>,
457     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
458     saved_clock: Option<hypervisor::ClockData>,
459     numa_nodes: NumaNodes,
460     #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
461     hypervisor: Arc<dyn hypervisor::Hypervisor>,
462     stop_on_boot: bool,
463     load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
464 }
465 
466 impl Vm {
467     pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
468 
469     #[allow(clippy::too_many_arguments)]
470     pub fn new_from_memory_manager(
471         config: Arc<Mutex<VmConfig>>,
472         memory_manager: Arc<Mutex<MemoryManager>>,
473         vm: Arc<dyn hypervisor::Vm>,
474         exit_evt: EventFd,
475         reset_evt: EventFd,
476         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
477         seccomp_action: &SeccompAction,
478         hypervisor: Arc<dyn hypervisor::Hypervisor>,
479         activate_evt: EventFd,
480         timestamp: Instant,
481         serial_pty: Option<PtyPair>,
482         console_pty: Option<PtyPair>,
483         debug_console_pty: Option<PtyPair>,
484         console_resize_pipe: Option<File>,
485         original_termios: Arc<Mutex<Option<termios>>>,
486         snapshot: Option<Snapshot>,
487     ) -> Result<Self> {
488         trace_scoped!("Vm::new_from_memory_manager");
489 
490         let boot_id_list = config
491             .lock()
492             .unwrap()
493             .validate()
494             .map_err(Error::ConfigValidation)?;
495 
496         #[cfg(not(feature = "igvm"))]
497         let load_payload_handle = if snapshot.is_none() {
498             Self::load_payload_async(&memory_manager, &config)?
499         } else {
500             None
501         };
502 
503         info!("Booting VM from config: {:?}", &config);
504 
505         // Create NUMA nodes based on NumaConfig.
506         let numa_nodes =
507             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
508 
509         #[cfg(feature = "tdx")]
510         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
511         #[cfg(feature = "sev_snp")]
512         let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
513         #[cfg(feature = "tdx")]
514         let force_iommu = tdx_enabled;
515         #[cfg(feature = "sev_snp")]
516         let force_iommu = sev_snp_enabled;
517         #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
518         let force_iommu = false;
519 
520         #[cfg(feature = "guest_debug")]
521         let stop_on_boot = config.lock().unwrap().gdb;
522         #[cfg(not(feature = "guest_debug"))]
523         let stop_on_boot = false;
524 
525         let memory = memory_manager.lock().unwrap().guest_memory();
526         #[cfg(target_arch = "x86_64")]
527         let io_bus = Arc::new(Bus::new());
528         let mmio_bus = Arc::new(Bus::new());
529 
530         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
531             memory,
532             #[cfg(target_arch = "x86_64")]
533             io_bus: io_bus.clone(),
534             mmio_bus: mmio_bus.clone(),
535         });
536 
537         let cpus_config = { &config.lock().unwrap().cpus.clone() };
538         let cpu_manager = cpu::CpuManager::new(
539             cpus_config,
540             vm.clone(),
541             exit_evt.try_clone().map_err(Error::EventFdClone)?,
542             reset_evt.try_clone().map_err(Error::EventFdClone)?,
543             #[cfg(feature = "guest_debug")]
544             vm_debug_evt,
545             &hypervisor,
546             seccomp_action.clone(),
547             vm_ops,
548             #[cfg(feature = "tdx")]
549             tdx_enabled,
550             &numa_nodes,
551             #[cfg(feature = "sev_snp")]
552             sev_snp_enabled,
553         )
554         .map_err(Error::CpuManager)?;
555 
556         #[cfg(target_arch = "x86_64")]
557         cpu_manager
558             .lock()
559             .unwrap()
560             .populate_cpuid(
561                 &memory_manager,
562                 &hypervisor,
563                 #[cfg(feature = "tdx")]
564                 tdx_enabled,
565             )
566             .map_err(Error::CpuManager)?;
567 
568         // Loading the IGVM file is deferred to this point because the
569         // IGVM parser needs the cpu_manager to retrieve CPUID leaves.
570         // In the regular case loading can start early, but in the IGVM
571         // case it has to wait until the cpu_manager has been created.
572         // Currently, Microsoft Hypervisor does not provide a
573         // hypervisor-specific common CPUID, so get_cpuid_values must be
574         // called per CPUID leaf through the cpu_manager.
575         #[cfg(feature = "igvm")]
576         let load_payload_handle = if snapshot.is_none() {
577             Self::load_payload_async(
578                 &memory_manager,
579                 &config,
580                 &cpu_manager,
581                 #[cfg(feature = "sev_snp")]
582                 sev_snp_enabled,
583             )?
584         } else {
585             None
586         };
587         // The initial TDX configuration must be done before the vCPUs are
588         // created
589         #[cfg(feature = "tdx")]
590         if tdx_enabled {
591             let cpuid = cpu_manager.lock().unwrap().common_cpuid();
592             let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
593             vm.tdx_init(&cpuid, max_vcpus)
594                 .map_err(Error::InitializeTdxVm)?;
595         }
596 
597         cpu_manager
598             .lock()
599             .unwrap()
600             .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
601             .map_err(Error::CpuManager)?;
602 
603         // This initial SEV-SNP configuration must be done immediately after
604         // the vCPUs are created. As part of this initialization we
605         // transition the guest into a secure state.
606         #[cfg(feature = "sev_snp")]
607         if sev_snp_enabled {
608             vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
609         }
610 
611         #[cfg(feature = "tdx")]
612         let dynamic = !tdx_enabled;
613         #[cfg(not(feature = "tdx"))]
614         let dynamic = true;
615 
616         let device_manager = DeviceManager::new(
617             #[cfg(target_arch = "x86_64")]
618             io_bus,
619             mmio_bus,
620             hypervisor.hypervisor_type(),
621             vm.clone(),
622             config.clone(),
623             memory_manager.clone(),
624             cpu_manager.clone(),
625             exit_evt.try_clone().map_err(Error::EventFdClone)?,
626             reset_evt,
627             seccomp_action.clone(),
628             numa_nodes.clone(),
629             &activate_evt,
630             force_iommu,
631             boot_id_list,
632             timestamp,
633             snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
634             dynamic,
635         )
636         .map_err(Error::DeviceManager)?;
637 
638         device_manager
639             .lock()
640             .unwrap()
641             .create_devices(
642                 serial_pty,
643                 console_pty,
644                 debug_console_pty,
645                 console_resize_pipe,
646                 original_termios,
647             )
648             .map_err(Error::DeviceManager)?;
649 
650         #[cfg(feature = "tdx")]
651         let kernel = config
652             .lock()
653             .unwrap()
654             .payload
655             .as_ref()
656             .map(|p| p.kernel.as_ref().map(File::open))
657             .unwrap_or_default()
658             .transpose()
659             .map_err(Error::KernelFile)?;
660 
661         let initramfs = config
662             .lock()
663             .unwrap()
664             .payload
665             .as_ref()
666             .map(|p| p.initramfs.as_ref().map(File::open))
667             .unwrap_or_default()
668             .transpose()
669             .map_err(Error::InitramfsFile)?;
670 
671         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
672         let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
673             let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
674             vm_snapshot.clock
675         } else {
676             None
677         };
678 
679         let vm_state = if snapshot.is_some() {
680             VmState::Paused
681         } else {
682             VmState::Created
683         };
684 
685         Ok(Vm {
686             #[cfg(feature = "tdx")]
687             kernel,
688             initramfs,
689             device_manager,
690             config,
691             threads: Vec::with_capacity(1),
692             state: RwLock::new(vm_state),
693             cpu_manager,
694             memory_manager,
695             vm,
696             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
697             saved_clock,
698             numa_nodes,
699             hypervisor,
700             stop_on_boot,
701             load_payload_handle,
702         })
703     }
704 
705     fn create_numa_nodes(
706         configs: Option<Vec<NumaConfig>>,
707         memory_manager: &Arc<Mutex<MemoryManager>>,
708     ) -> Result<NumaNodes> {
709         let mm = memory_manager.lock().unwrap();
710         let mm_zones = mm.memory_zones();
711         let mut numa_nodes = BTreeMap::new();
712 
713         if let Some(configs) = &configs {
714             for config in configs.iter() {
715                 if numa_nodes.contains_key(&config.guest_numa_id) {
716                     error!("Cannot define the same NUMA node twice");
717                     return Err(Error::InvalidNumaConfig);
718                 }
719 
720                 let mut node = NumaNode::default();
721 
722                 if let Some(memory_zones) = &config.memory_zones {
723                     for memory_zone in memory_zones.iter() {
724                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
725                             node.memory_regions.extend(mm_zone.regions().clone());
726                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
727                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
728                             }
729                             node.memory_zones.push(memory_zone.clone());
730                         } else {
731                             error!("Unknown memory zone '{}'", memory_zone);
732                             return Err(Error::InvalidNumaConfig);
733                         }
734                     }
735                 }
736 
737                 if let Some(cpus) = &config.cpus {
738                     node.cpus.extend(cpus);
739                 }
740 
741                 if let Some(pci_segments) = &config.pci_segments {
742                     node.pci_segments.extend(pci_segments);
743                 }
744 
745                 if let Some(distances) = &config.distances {
746                     for distance in distances.iter() {
747                         let dest = distance.destination;
748                         let dist = distance.distance;
749 
750                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
751                             error!("Unknown destination NUMA node {}", dest);
752                             return Err(Error::InvalidNumaConfig);
753                         }
754 
755                         if node.distances.contains_key(&dest) {
756                             error!("Destination NUMA node {} has already been set", dest);
757                             return Err(Error::InvalidNumaConfig);
758                         }
759 
760                         node.distances.insert(dest, dist);
761                     }
762                 }
763 
764                 #[cfg(target_arch = "x86_64")]
765                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
766                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
767                         let mm_sections = sgx_epc_region.epc_sections();
768                         for sgx_epc_section in sgx_epc_sections.iter() {
769                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
770                                 node.sgx_epc_sections.push(mm_section.clone());
771                             } else {
772                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
773                                 return Err(Error::InvalidNumaConfig);
774                             }
775                         }
776                     } else {
777                         error!("Missing SGX EPC region");
778                         return Err(Error::InvalidNumaConfig);
779                     }
780                 }
781 
782                 numa_nodes.insert(config.guest_numa_id, node);
783             }
784         }
785 
786         Ok(numa_nodes)
787     }
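    // Each NumaNode produced above aggregates the memory regions, hotplug
    // regions, CPUs, PCI segments, distances and (on x86_64) SGX EPC sections
    // declared for its guest_numa_id.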
788 
789     #[allow(clippy::too_many_arguments)]
790     pub fn new(
791         vm_config: Arc<Mutex<VmConfig>>,
792         exit_evt: EventFd,
793         reset_evt: EventFd,
794         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
795         seccomp_action: &SeccompAction,
796         hypervisor: Arc<dyn hypervisor::Hypervisor>,
797         activate_evt: EventFd,
798         serial_pty: Option<PtyPair>,
799         console_pty: Option<PtyPair>,
800         debug_console_pty: Option<PtyPair>,
801         console_resize_pipe: Option<File>,
802         original_termios: Arc<Mutex<Option<termios>>>,
803         snapshot: Option<Snapshot>,
804         source_url: Option<&str>,
805         prefault: Option<bool>,
806     ) -> Result<Self> {
807         trace_scoped!("Vm::new");
808 
809         let timestamp = Instant::now();
810 
811         #[cfg(feature = "tdx")]
812         let tdx_enabled = if snapshot.is_some() {
813             false
814         } else {
815             vm_config.lock().unwrap().is_tdx_enabled()
816         };
817 
818         #[cfg(feature = "sev_snp")]
819         let sev_snp_enabled = if snapshot.is_some() {
820             false
821         } else {
822             vm_config.lock().unwrap().is_sev_snp_enabled()
823         };
824 
825         let vm = Self::create_hypervisor_vm(
826             &hypervisor,
827             #[cfg(feature = "tdx")]
828             tdx_enabled,
829             #[cfg(feature = "sev_snp")]
830             sev_snp_enabled,
831         )?;
832 
833         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
834 
835         let memory_manager = if let Some(snapshot) =
836             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
837         {
838             MemoryManager::new_from_snapshot(
839                 &snapshot,
840                 vm.clone(),
841                 &vm_config.lock().unwrap().memory.clone(),
842                 source_url,
843                 prefault.unwrap(),
844                 phys_bits,
845             )
846             .map_err(Error::MemoryManager)?
847         } else {
848             #[cfg(target_arch = "x86_64")]
849             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
850 
851             MemoryManager::new(
852                 vm.clone(),
853                 &vm_config.lock().unwrap().memory.clone(),
854                 None,
855                 phys_bits,
856                 #[cfg(feature = "tdx")]
857                 tdx_enabled,
858                 None,
859                 None,
860                 #[cfg(target_arch = "x86_64")]
861                 sgx_epc_config,
862             )
863             .map_err(Error::MemoryManager)?
864         };
865 
866         Vm::new_from_memory_manager(
867             vm_config,
868             memory_manager,
869             vm,
870             exit_evt,
871             reset_evt,
872             #[cfg(feature = "guest_debug")]
873             vm_debug_evt,
874             seccomp_action,
875             hypervisor,
876             activate_evt,
877             timestamp,
878             serial_pty,
879             console_pty,
880             debug_console_pty,
881             console_resize_pipe,
882             original_termios,
883             snapshot,
884         )
885     }
886 
887     pub fn create_hypervisor_vm(
888         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
889         #[cfg(feature = "tdx")] tdx_enabled: bool,
890         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
891     ) -> Result<Arc<dyn hypervisor::Vm>> {
892         hypervisor.check_required_extensions().unwrap();
893 
894         cfg_if::cfg_if! {
895             if #[cfg(feature = "tdx")] {
896                 // The VM type is derived from tdx_enabled:
897                 // KVM_X86_TDX_VM (1) when true, otherwise
898                 // KVM_X86_LEGACY_VM (0).
899                 let vm = hypervisor
900                     .create_vm_with_type(u64::from(tdx_enabled))
901                     .unwrap();
902             } else if #[cfg(feature = "sev_snp")] {
903                 // The VM type is derived from sev_snp_enabled:
904                 // SEV_SNP_ENABLED (1) when true, otherwise
905                 // SEV_SNP_DISABLED (0).
906                 let vm = hypervisor
907                     .create_vm_with_type(u64::from(sev_snp_enabled))
908                     .unwrap();
909             } else {
910                 let vm = hypervisor.create_vm().unwrap();
911             }
912         }
913 
914         #[cfg(target_arch = "x86_64")]
915         {
916             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
917                 .unwrap();
918             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
919             vm.enable_split_irq().unwrap();
920         }
921 
922         Ok(vm)
923     }
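    // The x86_64-specific setup above (identity map address, TSS address,
    // split IRQ chip) is per-VM state that is expected to be in place before
    // any vCPU is created.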
924 
925     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
926         let initramfs = self.initramfs.as_mut().unwrap();
927         let size: usize = initramfs
928             .seek(SeekFrom::End(0))
929             .map_err(|_| Error::InitramfsLoad)?
930             .try_into()
931             .unwrap();
932         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
933 
934         let address =
935             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
936         let address = GuestAddress(address);
937 
938         guest_mem
939             .read_volatile_from(address, initramfs, size)
940             .map_err(|_| Error::InitramfsLoad)?;
941 
942         info!("Initramfs loaded: address = 0x{:x}", address.0);
943         Ok(arch::InitramfsConfig { address, size })
944     }
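    // The InitramfsConfig returned above (load address and size) is later
    // passed to arch::configure_system() so the guest can locate the initramfs.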
945 
946     pub fn generate_cmdline(
947         payload: &PayloadConfig,
948         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
949     ) -> Result<Cmdline> {
950         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
951         if let Some(s) = payload.cmdline.as_ref() {
952             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
953         }
954 
955         #[cfg(target_arch = "aarch64")]
956         for entry in device_manager.lock().unwrap().cmdline_additions() {
957             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
958         }
959         Ok(cmdline)
960     }
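    // On x86_64 the generated command line is written to guest memory at
    // arch::layout::CMDLINE_START (see load_kernel); on aarch64 it is passed
    // to arch::configure_system() as part of system configuration.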
961 
962     #[cfg(target_arch = "aarch64")]
963     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
964         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
965         let mem = uefi_flash.memory();
966         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
967             .map_err(Error::UefiLoad)?;
968         Ok(())
969     }
970 
971     #[cfg(target_arch = "aarch64")]
972     fn load_kernel(
973         firmware: Option<File>,
974         kernel: Option<File>,
975         memory_manager: Arc<Mutex<MemoryManager>>,
976     ) -> Result<EntryPoint> {
977         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
978         let mem = guest_memory.memory();
979         let entry_addr = match (firmware, kernel) {
980             (None, Some(mut kernel)) => {
981                 match linux_loader::loader::pe::PE::load(
982                     mem.deref(),
983                     Some(arch::layout::KERNEL_START),
984                     &mut kernel,
985                     None,
986                 ) {
987                     Ok(entry_addr) => entry_addr.kernel_load,
988                     // The binary is first loaded as a PE kernel image.
989                     // If that fails, it is retried as a UEFI binary.
990                     // Since a UEFI binary has no recognizable format, it must be the last option tried.
991                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
992                         Self::load_firmware(&kernel, memory_manager)?;
993                         arch::layout::UEFI_START
994                     }
995                     Err(e) => {
996                         return Err(Error::KernelLoad(e));
997                     }
998                 }
999             }
1000             (Some(firmware), None) => {
1001                 Self::load_firmware(&firmware, memory_manager)?;
1002                 arch::layout::UEFI_START
1003             }
1004             _ => return Err(Error::InvalidPayload),
1005         };
1006 
1007         Ok(EntryPoint { entry_addr })
1008     }
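    // On aarch64 the payload is either a PE kernel loaded at
    // arch::layout::KERNEL_START or a UEFI image loaded at
    // arch::layout::UEFI_START, as resolved by the match above.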
1009 
1010     #[cfg(feature = "igvm")]
1011     fn load_igvm(
1012         igvm: File,
1013         memory_manager: Arc<Mutex<MemoryManager>>,
1014         cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1015         #[cfg(feature = "sev_snp")] host_data: &Option<String>,
1016     ) -> Result<EntryPoint> {
1017         let res = igvm_loader::load_igvm(
1018             &igvm,
1019             memory_manager,
1020             cpu_manager.clone(),
1021             "",
1022             #[cfg(feature = "sev_snp")]
1023             host_data,
1024         )
1025         .map_err(Error::IgvmLoad)?;
1026 
1027         cfg_if::cfg_if! {
1028             if #[cfg(feature = "sev_snp")] {
1029                 let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
1030                     EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
1031                 } else {
1032                     EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
1033                 };
1034             } else {
1035                let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
1036             }
1037         };
1038         Ok(entry_point)
1039     }
1040 
1041     #[cfg(target_arch = "x86_64")]
1042     fn load_kernel(
1043         mut kernel: File,
1044         cmdline: Option<Cmdline>,
1045         memory_manager: Arc<Mutex<MemoryManager>>,
1046     ) -> Result<EntryPoint> {
1047         info!("Loading kernel");
1048 
1049         let mem = {
1050             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1051             guest_memory.memory()
1052         };
1053 
1054         // Try ELF binary with PVH boot.
1055         let entry_addr = linux_loader::loader::elf::Elf::load(
1056             mem.deref(),
1057             None,
1058             &mut kernel,
1059             Some(arch::layout::HIGH_RAM_START),
1060         )
1061         // Try loading kernel as bzImage.
1062         .or_else(|_| {
1063             BzImage::load(
1064                 mem.deref(),
1065                 None,
1066                 &mut kernel,
1067                 Some(arch::layout::HIGH_RAM_START),
1068             )
1069         })
1070         .map_err(Error::KernelLoad)?;
1071 
1072         if let Some(cmdline) = cmdline {
1073             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1074                 .map_err(Error::LoadCmdLine)?;
1075         }
1076 
1077         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1078             // Use the PVH kernel entry point to boot the guest
1079             info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1080             Ok(EntryPoint {
1081                 entry_addr,
1082                 setup_header: None,
1083             })
1084         } else if entry_addr.setup_header.is_some() {
1085             // Use the bzImage 32bit entry point to boot the guest
1086             // Use the bzImage 32-bit entry point to boot the guest
1087                 "bzImage kernel loaded: entry_addr = 0x{:x}",
1088                 entry_addr.kernel_load.0
1089             );
1090             Ok(EntryPoint {
1091                 entry_addr: entry_addr.kernel_load,
1092                 setup_header: entry_addr.setup_header,
1093             })
1094         } else {
1095             Err(Error::KernelMissingPvhHeader)
1096         }
1097     }
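    // A PVH entry point is preferred when present; otherwise the bzImage
    // setup_header path is used, and kernels providing neither are rejected
    // with KernelMissingPvhHeader.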
1098 
1099     #[cfg(target_arch = "x86_64")]
1100     fn load_payload(
1101         payload: &PayloadConfig,
1102         memory_manager: Arc<Mutex<MemoryManager>>,
1103         #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1104         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1105     ) -> Result<EntryPoint> {
1106         trace_scoped!("load_payload");
1107         #[cfg(feature = "igvm")]
1108         {
1109             if let Some(_igvm_file) = &payload.igvm {
1110                 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
1111                 #[cfg(feature = "sev_snp")]
1112                 if sev_snp_enabled {
1113                     return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
1114                 }
1115                 #[cfg(not(feature = "sev_snp"))]
1116                 return Self::load_igvm(igvm, memory_manager, cpu_manager);
1117             }
1118         }
1119         match (
1120             &payload.firmware,
1121             &payload.kernel,
1122             &payload.initramfs,
1123             &payload.cmdline,
1124         ) {
1125             (Some(firmware), None, None, None) => {
1126                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1127                 Self::load_kernel(firmware, None, memory_manager)
1128             }
1129             (None, Some(kernel), _, _) => {
1130                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1131                 let cmdline = Self::generate_cmdline(payload)?;
1132                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1133             }
1134             _ => Err(Error::InvalidPayload),
1135         }
1136     }
1137 
1138     #[cfg(target_arch = "aarch64")]
1139     fn load_payload(
1140         payload: &PayloadConfig,
1141         memory_manager: Arc<Mutex<MemoryManager>>,
1142     ) -> Result<EntryPoint> {
1143         match (&payload.firmware, &payload.kernel) {
1144             (Some(firmware), None) => {
1145                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1146                 Self::load_kernel(Some(firmware), None, memory_manager)
1147             }
1148             (None, Some(kernel)) => {
1149                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1150                 Self::load_kernel(None, Some(kernel), memory_manager)
1151             }
1152             _ => Err(Error::InvalidPayload),
1153         }
1154     }
1155 
1156     fn load_payload_async(
1157         memory_manager: &Arc<Mutex<MemoryManager>>,
1158         config: &Arc<Mutex<VmConfig>>,
1159         #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
1160         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1161     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1162         // With TDX enabled, the kernel is loaded through a different path.
1163         #[cfg(feature = "tdx")]
1164         if config.lock().unwrap().is_tdx_enabled() {
1165             return Ok(None);
1166         }
1167 
1168         config
1169             .lock()
1170             .unwrap()
1171             .payload
1172             .as_ref()
1173             .map(|payload| {
1174                 let memory_manager = memory_manager.clone();
1175                 let payload = payload.clone();
1176                 #[cfg(feature = "igvm")]
1177                 let cpu_manager = cpu_manager.clone();
1178 
1179                 std::thread::Builder::new()
1180                     .name("payload_loader".into())
1181                     .spawn(move || {
1182                         Self::load_payload(
1183                             &payload,
1184                             memory_manager,
1185                             #[cfg(feature = "igvm")]
1186                             cpu_manager,
1187                             #[cfg(feature = "sev_snp")]
1188                             sev_snp_enabled,
1189                         )
1190                     })
1191                     .map_err(Error::KernelLoadThreadSpawn)
1192             })
1193             .transpose()
1194     }
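    // The JoinHandle returned above is stored in Vm::load_payload_handle so
    // the kernel or firmware can be loaded in parallel with the rest of VM
    // construction.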
1195 
1196     #[cfg(target_arch = "x86_64")]
1197     fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
1198         trace_scoped!("configure_system");
1199         info!("Configuring system");
1200         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1201 
1202         let initramfs_config = match self.initramfs {
1203             Some(_) => Some(self.load_initramfs(&mem)?),
1204             None => None,
1205         };
1206 
1207         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1208         let rsdp_addr = Some(rsdp_addr);
1209         let sgx_epc_region = self
1210             .memory_manager
1211             .lock()
1212             .unwrap()
1213             .sgx_epc_region()
1214             .as_ref()
1215             .cloned();
1216 
1217         let serial_number = self
1218             .config
1219             .lock()
1220             .unwrap()
1221             .platform
1222             .as_ref()
1223             .and_then(|p| p.serial_number.clone());
1224 
1225         let uuid = self
1226             .config
1227             .lock()
1228             .unwrap()
1229             .platform
1230             .as_ref()
1231             .and_then(|p| p.uuid.clone());
1232 
1233         let oem_strings = self
1234             .config
1235             .lock()
1236             .unwrap()
1237             .platform
1238             .as_ref()
1239             .and_then(|p| p.oem_strings.clone());
1240 
1241         let oem_strings = oem_strings
1242             .as_deref()
1243             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1244 
1245         let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1246 
1247         arch::configure_system(
1248             &mem,
1249             arch::layout::CMDLINE_START,
1250             arch::layout::CMDLINE_MAX_SIZE,
1251             &initramfs_config,
1252             boot_vcpus,
1253             entry_addr.setup_header,
1254             rsdp_addr,
1255             sgx_epc_region,
1256             serial_number.as_deref(),
1257             uuid.as_deref(),
1258             oem_strings.as_deref(),
1259             topology,
1260         )
1261         .map_err(Error::ConfigureSystem)?;
1262         Ok(())
1263     }
1264 
1265     #[cfg(target_arch = "aarch64")]
1266     fn configure_system(
1267         &mut self,
1268         _rsdp_addr: GuestAddress,
1269         _entry_addr: EntryPoint,
1270     ) -> Result<()> {
1271         let cmdline = Self::generate_cmdline(
1272             self.config.lock().unwrap().payload.as_ref().unwrap(),
1273             &self.device_manager,
1274         )?;
1275         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1276         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1277         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1278         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1279         let initramfs_config = match self.initramfs {
1280             Some(_) => Some(self.load_initramfs(&mem)?),
1281             None => None,
1282         };
1283 
1284         let device_info = &self
1285             .device_manager
1286             .lock()
1287             .unwrap()
1288             .get_device_info()
1289             .clone();
1290 
1291         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1292             let pci_space = PciSpaceInfo {
1293                 pci_segment_id: pci_segment.id,
1294                 mmio_config_address: pci_segment.mmio_config_address,
1295                 pci_device_space_start: pci_segment.start_of_mem64_area,
1296                 pci_device_space_size: pci_segment.end_of_mem64_area
1297                     - pci_segment.start_of_mem64_area
1298                     + 1,
1299             };
1300             pci_space_info.push(pci_space);
1301         }
1302 
1303         let virtio_iommu_bdf = self
1304             .device_manager
1305             .lock()
1306             .unwrap()
1307             .iommu_attached_devices()
1308             .as_ref()
1309             .map(|(v, _)| *v);
1310 
1311         let vgic = self
1312             .device_manager
1313             .lock()
1314             .unwrap()
1315             .get_interrupt_controller()
1316             .unwrap()
1317             .lock()
1318             .unwrap()
1319             .get_vgic()
1320             .map_err(|_| {
1321                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1322                     arch::aarch64::Error::SetupGic,
1323                 ))
1324             })?;
1325 
1326         // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
1327         // The PMU interrupt is a PPI, so 16 must be added to get the real IRQ number.
1328             .cpu_manager
1329             .lock()
1330             .unwrap()
1331             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1332             .map_err(|_| {
1333                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1334                     arch::aarch64::Error::VcpuInitPmu,
1335                 ))
1336             })?;
1337 
1338         arch::configure_system(
1339             &mem,
1340             cmdline.as_cstring().unwrap().to_str().unwrap(),
1341             vcpu_mpidrs,
1342             vcpu_topology,
1343             device_info,
1344             &initramfs_config,
1345             &pci_space_info,
1346             virtio_iommu_bdf.map(|bdf| bdf.into()),
1347             &vgic,
1348             &self.numa_nodes,
1349             pmu_supported,
1350         )
1351         .map_err(Error::ConfigureSystem)?;
1352 
1353         Ok(())
1354     }
1355 
1356     pub fn serial_pty(&self) -> Option<PtyPair> {
1357         self.device_manager.lock().unwrap().serial_pty()
1358     }
1359 
1360     pub fn console_pty(&self) -> Option<PtyPair> {
1361         self.device_manager.lock().unwrap().console_pty()
1362     }
1363 
1364     pub fn debug_console_pty(&self) -> Option<PtyPair> {
1365         self.device_manager.lock().unwrap().debug_console_pty()
1366     }
1367 
1368     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1369         self.device_manager.lock().unwrap().console_resize_pipe()
1370     }
1371 
1372     pub fn shutdown(&mut self) -> Result<()> {
1373         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1374         let new_state = VmState::Shutdown;
1375 
1376         state.valid_transition(new_state)?;
1377 
1378         // Wake up the DeviceManager threads so they will get terminated cleanly
1379         self.device_manager
1380             .lock()
1381             .unwrap()
1382             .resume()
1383             .map_err(Error::Resume)?;
1384 
1385         self.cpu_manager
1386             .lock()
1387             .unwrap()
1388             .shutdown()
1389             .map_err(Error::CpuManager)?;
1390 
1391         // Wait for all the threads to finish
1392         for thread in self.threads.drain(..) {
1393             thread.join().map_err(Error::ThreadCleanup)?
1394         }
1395         *state = new_state;
1396 
1397         event!("vm", "shutdown");
1398 
1399         Ok(())
1400     }
1401 
1402     pub fn resize(
1403         &mut self,
1404         desired_vcpus: Option<u8>,
1405         desired_memory: Option<u64>,
1406         desired_balloon: Option<u64>,
1407     ) -> Result<()> {
1408         event!("vm", "resizing");
1409 
1410         if let Some(desired_vcpus) = desired_vcpus {
1411             if self
1412                 .cpu_manager
1413                 .lock()
1414                 .unwrap()
1415                 .resize(desired_vcpus)
1416                 .map_err(Error::CpuManager)?
1417             {
1418                 self.device_manager
1419                     .lock()
1420                     .unwrap()
1421                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1422                     .map_err(Error::DeviceManager)?;
1423             }
1424             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1425         }
1426 
1427         if let Some(desired_memory) = desired_memory {
1428             let new_region = self
1429                 .memory_manager
1430                 .lock()
1431                 .unwrap()
1432                 .resize(desired_memory)
1433                 .map_err(Error::MemoryManager)?;
1434 
1435             let memory_config = &mut self.config.lock().unwrap().memory;
1436 
1437             if let Some(new_region) = &new_region {
1438                 self.device_manager
1439                     .lock()
1440                     .unwrap()
1441                     .update_memory(new_region)
1442                     .map_err(Error::DeviceManager)?;
1443 
1444                 match memory_config.hotplug_method {
1445                     HotplugMethod::Acpi => {
1446                         self.device_manager
1447                             .lock()
1448                             .unwrap()
1449                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1450                             .map_err(Error::DeviceManager)?;
1451                     }
1452                     HotplugMethod::VirtioMem => {}
1453                 }
1454             }
1455 
1456             // We update the VM config regardless of whether the guest
1457             // resize operation actually happened, so that if the VM reboots
1458             // it will be running with the last configured memory size.
1459             match memory_config.hotplug_method {
1460                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1461                 HotplugMethod::VirtioMem => {
1462                     if desired_memory > memory_config.size {
1463                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1464                     } else {
1465                         memory_config.hotplugged_size = None;
1466                     }
1467                 }
1468             }
1469         }
1470 
1471         if let Some(desired_balloon) = desired_balloon {
1472             self.device_manager
1473                 .lock()
1474                 .unwrap()
1475                 .resize_balloon(desired_balloon)
1476                 .map_err(Error::DeviceManager)?;
1477 
1478             // Update the configuration value for the balloon size to ensure
1479             // a reboot would use the right value.
1480             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1481                 balloon_config.size = desired_balloon;
1482             }
1483         }
1484 
1485         event!("vm", "resized");
1486 
1487         Ok(())
1488     }
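
    // A minimal usage sketch of the resize API above (not part of the original
    // source), assuming a caller that already holds a `&mut Vm`. The vCPU
    // count and the sizes are illustrative values.
    #[allow(dead_code)]
    fn example_resize_usage(vm: &mut Vm) -> Result<()> {
        // Hotplug up to 4 vCPUs and grow guest RAM to 2 GiB; leave the
        // balloon untouched.
        vm.resize(Some(4), Some(2 << 30), None)?;

        // Later, reclaim memory from the guest by asking the balloon device
        // to grow to 1 GiB.
        vm.resize(None, None, Some(1 << 30))
    }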
1489 
1490     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1491         let memory_config = &mut self.config.lock().unwrap().memory;
1492 
1493         if let Some(zones) = &mut memory_config.zones {
1494             for zone in zones.iter_mut() {
1495                 if zone.id == id {
1496                     if desired_memory >= zone.size {
1497                         let hotplugged_size = desired_memory - zone.size;
1498                         self.memory_manager
1499                             .lock()
1500                             .unwrap()
1501                             .resize_zone(&id, desired_memory - zone.size)
1502                             .map_err(Error::MemoryManager)?;
1503                         // We update the memory zone config regardless of the
1504                         // actual 'resize-zone' operation result (happened or
1505                         // not), so that if the VM reboots it will be running
1506                         // with the last configured memory zone size.
1507                         zone.hotplugged_size = Some(hotplugged_size);
1508 
1509                         return Ok(());
1510                     } else {
1511                         error!(
1512                             "Requested size ({}) is smaller than the boot RAM \
1513                             size ({}) of this memory zone",
1514                             desired_memory, zone.size,
1515                         );
1516                         return Err(Error::ResizeZone);
1517                     }
1518                 }
1519             }
1520         }
1521 
1522         error!("Could not find the memory zone {} for the resize", id);
1523         Err(Error::ResizeZone)
1524     }
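
    // A minimal usage sketch (not part of the original source), assuming a
    // memory zone with id "mem0" was defined at boot. The 2 GiB target is
    // illustrative and, as enforced above, must not be below the zone's boot
    // size.
    #[allow(dead_code)]
    fn example_resize_zone_usage(vm: &mut Vm) -> Result<()> {
        vm.resize_zone("mem0".to_string(), 2 << 30)
    }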
1525 
1526     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1527         let pci_device_info = self
1528             .device_manager
1529             .lock()
1530             .unwrap()
1531             .add_device(&mut device_cfg)
1532             .map_err(Error::DeviceManager)?;
1533 
1534         // Update VmConfig by adding the new device. This is important to
1535         // ensure the device will be re-created if the VM reboots.
1536         {
1537             let mut config = self.config.lock().unwrap();
1538             add_to_config(&mut config.devices, device_cfg);
1539         }
1540 
1541         self.device_manager
1542             .lock()
1543             .unwrap()
1544             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1545             .map_err(Error::DeviceManager)?;
1546 
1547         Ok(pci_device_info)
1548     }
1549 
1550     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1551         let pci_device_info = self
1552             .device_manager
1553             .lock()
1554             .unwrap()
1555             .add_user_device(&mut device_cfg)
1556             .map_err(Error::DeviceManager)?;
1557 
1558         // Update VmConfig by adding the new device. This is important to
1559         // ensure the device will be re-created if the VM reboots.
1560         {
1561             let mut config = self.config.lock().unwrap();
1562             add_to_config(&mut config.user_devices, device_cfg);
1563         }
1564 
1565         self.device_manager
1566             .lock()
1567             .unwrap()
1568             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1569             .map_err(Error::DeviceManager)?;
1570 
1571         Ok(pci_device_info)
1572     }
1573 
1574     pub fn remove_device(&mut self, id: String) -> Result<()> {
1575         self.device_manager
1576             .lock()
1577             .unwrap()
1578             .remove_device(id.clone())
1579             .map_err(Error::DeviceManager)?;
1580 
1581         // Update VmConfig by removing the device. This is important to
1582         // ensure the device will not be re-created if the VM reboots.
1583         self.config.lock().unwrap().remove_device(&id);
1584 
1585         self.device_manager
1586             .lock()
1587             .unwrap()
1588             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1589             .map_err(Error::DeviceManager)?;
1590         Ok(())
1591     }
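
    // A minimal hot-add / hot-remove sketch (not part of the original source),
    // assuming the `DeviceConfig` was built by the caller, e.g. from the HTTP
    // API. The returned `PciDeviceInfo` carries the id used for the removal.
    #[allow(dead_code)]
    fn example_device_hotplug_roundtrip(vm: &mut Vm, cfg: DeviceConfig) -> Result<()> {
        let info = vm.add_device(cfg)?;
        // ... the guest uses the device for a while ...
        vm.remove_device(info.id)
    }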
1592 
1593     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1594         let pci_device_info = self
1595             .device_manager
1596             .lock()
1597             .unwrap()
1598             .add_disk(&mut disk_cfg)
1599             .map_err(Error::DeviceManager)?;
1600 
1601         // Update VmConfig by adding the new device. This is important to
1602         // ensure the device will be re-created if the VM reboots.
1603         {
1604             let mut config = self.config.lock().unwrap();
1605             add_to_config(&mut config.disks, disk_cfg);
1606         }
1607 
1608         self.device_manager
1609             .lock()
1610             .unwrap()
1611             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1612             .map_err(Error::DeviceManager)?;
1613 
1614         Ok(pci_device_info)
1615     }
1616 
1617     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1618         let pci_device_info = self
1619             .device_manager
1620             .lock()
1621             .unwrap()
1622             .add_fs(&mut fs_cfg)
1623             .map_err(Error::DeviceManager)?;
1624 
1625         // Update VmConfig by adding the new device. This is important to
1626         // ensure the device will be re-created if the VM reboots.
1627         {
1628             let mut config = self.config.lock().unwrap();
1629             add_to_config(&mut config.fs, fs_cfg);
1630         }
1631 
1632         self.device_manager
1633             .lock()
1634             .unwrap()
1635             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1636             .map_err(Error::DeviceManager)?;
1637 
1638         Ok(pci_device_info)
1639     }
1640 
1641     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1642         let pci_device_info = self
1643             .device_manager
1644             .lock()
1645             .unwrap()
1646             .add_pmem(&mut pmem_cfg)
1647             .map_err(Error::DeviceManager)?;
1648 
1649         // Update VmConfig by adding the new device. This is important to
1650         // ensure the device will be re-created if the VM reboots.
1651         {
1652             let mut config = self.config.lock().unwrap();
1653             add_to_config(&mut config.pmem, pmem_cfg);
1654         }
1655 
1656         self.device_manager
1657             .lock()
1658             .unwrap()
1659             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1660             .map_err(Error::DeviceManager)?;
1661 
1662         Ok(pci_device_info)
1663     }
1664 
1665     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1666         let pci_device_info = self
1667             .device_manager
1668             .lock()
1669             .unwrap()
1670             .add_net(&mut net_cfg)
1671             .map_err(Error::DeviceManager)?;
1672 
1673         // Update VmConfig by adding the new device. This is important to
1674         // ensure the device will be re-created if the VM reboots.
1675         {
1676             let mut config = self.config.lock().unwrap();
1677             add_to_config(&mut config.net, net_cfg);
1678         }
1679 
1680         self.device_manager
1681             .lock()
1682             .unwrap()
1683             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1684             .map_err(Error::DeviceManager)?;
1685 
1686         Ok(pci_device_info)
1687     }
1688 
1689     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1690         let pci_device_info = self
1691             .device_manager
1692             .lock()
1693             .unwrap()
1694             .add_vdpa(&mut vdpa_cfg)
1695             .map_err(Error::DeviceManager)?;
1696 
1697         // Update VmConfig by adding the new device. This is important to
1698         // ensure the device will be re-created if the VM reboots.
1699         {
1700             let mut config = self.config.lock().unwrap();
1701             add_to_config(&mut config.vdpa, vdpa_cfg);
1702         }
1703 
1704         self.device_manager
1705             .lock()
1706             .unwrap()
1707             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1708             .map_err(Error::DeviceManager)?;
1709 
1710         Ok(pci_device_info)
1711     }
1712 
1713     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1714         let pci_device_info = self
1715             .device_manager
1716             .lock()
1717             .unwrap()
1718             .add_vsock(&mut vsock_cfg)
1719             .map_err(Error::DeviceManager)?;
1720 
1721         // Update VmConfig by adding the new device. This is important to
1722         // ensure the device will be re-created if the VM reboots.
1723         {
1724             let mut config = self.config.lock().unwrap();
1725             config.vsock = Some(vsock_cfg);
1726         }
1727 
1728         self.device_manager
1729             .lock()
1730             .unwrap()
1731             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1732             .map_err(Error::DeviceManager)?;
1733 
1734         Ok(pci_device_info)
1735     }
1736 
1737     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1738         Ok(self.device_manager.lock().unwrap().counters())
1739     }
1740 
1741     #[cfg(feature = "tdx")]
1742     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1743         use arch::x86_64::tdx::*;
1744 
1745         let firmware_path = self
1746             .config
1747             .lock()
1748             .unwrap()
1749             .payload
1750             .as_ref()
1751             .unwrap()
1752             .firmware
1753             .clone()
1754             .ok_or(Error::TdxFirmwareMissing)?;
1755         // The TDVF file contains a table of sections as well as code
1756         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1757 
1758         // For all the sections allocate some RAM backing them
1759         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1760     }
1761 
1762     #[cfg(feature = "tdx")]
1763     fn hob_memory_resources(
1764         mut sorted_sections: Vec<TdvfSection>,
1765         guest_memory: &GuestMemoryMmap,
1766     ) -> Vec<(u64, u64, bool)> {
1767         let mut list = Vec::new();
1768 
1769         let mut current_section = sorted_sections.pop();
1770 
1771         // RAM regions interleaved with TDVF sections
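        //
        // Worked example (mirroring the first unit test at the bottom of this
        // file): with guest RAM covering [0x0, 0x1000_0000) and TDVF sections
        // at [0x1000, 0x5000) and [0xc000, 0xd000), the resulting list is:
        //   (0x0,    0x1000,      true)   RAM
        //   (0x1000, 0x4000,      false)  TDVF section
        //   (0x5000, 0x7000,      true)   RAM
        //   (0xc000, 0x1000,      false)  TDVF section
        //   (0xd000, 0x0fff_3000, true)   RAM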
1772         let mut next_start_addr = 0;
1773         for region in guest_memory.iter() {
1774             let region_start = region.start_addr().0;
1775             let region_end = region.last_addr().0;
1776             if region_start > next_start_addr {
1777                 next_start_addr = region_start;
1778             }
1779 
1780             loop {
1781                 let (start, size, ram) = if let Some(section) = &current_section {
1782                     if section.address <= next_start_addr {
1783                         (section.address, section.size, false)
1784                     } else {
1785                         let last_addr = std::cmp::min(section.address - 1, region_end);
1786                         (next_start_addr, last_addr - next_start_addr + 1, true)
1787                     }
1788                 } else {
1789                     (next_start_addr, region_end - next_start_addr + 1, true)
1790                 };
1791 
1792                 list.push((start, size, ram));
1793 
1794                 if !ram {
1795                     current_section = sorted_sections.pop();
1796                 }
1797 
1798                 next_start_addr = start + size;
1799 
1800                 if region_start > next_start_addr {
1801                     next_start_addr = region_start;
1802                 }
1803 
1804                 if next_start_addr > region_end {
1805                     break;
1806                 }
1807             }
1808         }
1809 
1810         // Once all the interleaved sections have been processed, let's simply
1811         // pull the remaining ones.
1812         if let Some(section) = current_section {
1813             list.push((section.address, section.size, false));
1814         }
1815         while let Some(section) = sorted_sections.pop() {
1816             list.push((section.address, section.size, false));
1817         }
1818 
1819         list
1820     }
1821 
1822     #[cfg(feature = "tdx")]
1823     fn populate_tdx_sections(
1824         &mut self,
1825         sections: &[TdvfSection],
1826         guid_found: bool,
1827     ) -> Result<Option<u64>> {
1828         use arch::x86_64::tdx::*;
1829         // Get the memory end *before* we start adding TDVF ram regions
1830         let boot_guest_memory = self
1831             .memory_manager
1832             .lock()
1833             .as_ref()
1834             .unwrap()
1835             .boot_guest_memory();
1836         for section in sections {
1837             // No need to allocate if the section falls within guest RAM ranges
1838             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1839                 info!(
1840                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1841                     section
1842                 );
1843                 continue;
1844             }
1845 
1846             info!("Allocating TDVF Section: {:x?}", section);
1847             self.memory_manager
1848                 .lock()
1849                 .unwrap()
1850                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1851                 .map_err(Error::AllocatingTdvfMemory)?;
1852         }
1853 
1854         // The TDVF file contains a table of sections as well as code
1855         let firmware_path = self
1856             .config
1857             .lock()
1858             .unwrap()
1859             .payload
1860             .as_ref()
1861             .unwrap()
1862             .firmware
1863             .clone()
1864             .ok_or(Error::TdxFirmwareMissing)?;
1865         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1866 
1867         // The guest memory now has all the required regions, so it is safe
1868         // to copy from the TDVF file into it.
1869         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1870         let mem = guest_memory.memory();
1871         let mut payload_info = None;
1872         let mut hob_offset = None;
1873         for section in sections {
1874             info!("Populating TDVF Section: {:x?}", section);
1875             match section.r#type {
1876                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1877                     info!("Copying section to guest memory");
1878                     firmware_file
1879                         .seek(SeekFrom::Start(section.data_offset as u64))
1880                         .map_err(Error::LoadTdvf)?;
1881                     mem.read_volatile_from(
1882                         GuestAddress(section.address),
1883                         &mut firmware_file,
1884                         section.data_size as usize,
1885                     )
1886                     .unwrap();
1887                 }
1888                 TdvfSectionType::TdHob => {
1889                     hob_offset = Some(section.address);
1890                 }
1891                 TdvfSectionType::Payload => {
1892                     info!("Copying payload to guest memory");
1893                     if let Some(payload_file) = self.kernel.as_mut() {
1894                         let payload_size = payload_file
1895                             .seek(SeekFrom::End(0))
1896                             .map_err(Error::LoadPayload)?;
1897 
1898                         payload_file
1899                             .seek(SeekFrom::Start(0x1f1))
1900                             .map_err(Error::LoadPayload)?;
1901 
1902                         let mut payload_header = linux_loader::bootparam::setup_header::default();
1903                         payload_file
1904                             .read_volatile(&mut payload_header.as_bytes())
1905                             .unwrap();
1906 
1907                         if payload_header.header != 0x5372_6448 {
1908                             return Err(Error::InvalidPayloadType);
1909                         }
1910 
1911                         if (payload_header.version < 0x0200)
1912                             || ((payload_header.loadflags & 0x1) == 0x0)
1913                         {
1914                             return Err(Error::InvalidPayloadType);
1915                         }
1916 
1917                         payload_file.rewind().map_err(Error::LoadPayload)?;
1918                         mem.read_volatile_from(
1919                             GuestAddress(section.address),
1920                             payload_file,
1921                             payload_size as usize,
1922                         )
1923                         .unwrap();
1924 
1925                         // Create the payload info that will be inserted into
1926                         // the HOB.
1927                         payload_info = Some(PayloadInfo {
1928                             image_type: PayloadImageType::BzImage,
1929                             entry_point: section.address,
1930                         });
1931                     }
1932                 }
1933                 TdvfSectionType::PayloadParam => {
1934                     info!("Copying payload parameters to guest memory");
1935                     let cmdline = Self::generate_cmdline(
1936                         self.config.lock().unwrap().payload.as_ref().unwrap(),
1937                     )?;
1938                     mem.write_slice(
1939                         cmdline.as_cstring().unwrap().as_bytes_with_nul(),
1940                         GuestAddress(section.address),
1941                     )
1942                     .unwrap();
1943                 }
1944                 _ => {}
1945             }
1946         }
1947 
1948         // Generate HOB
1949         let mut hob = TdHob::start(hob_offset.unwrap());
1950 
1951         let mut sorted_sections = sections.to_vec();
1952         sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));
1953 
1954         sorted_sections.sort_by_key(|section| section.address);
1955         sorted_sections.reverse();
1956 
1957         for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
1958             hob.add_memory_resource(&mem, start, size, ram, guid_found)
1959                 .map_err(Error::PopulateHob)?;
1960         }
1961 
1962         // MMIO regions
1963         hob.add_mmio_resource(
1964             &mem,
1965             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1966             arch::layout::APIC_START.raw_value()
1967                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1968         )
1969         .map_err(Error::PopulateHob)?;
1970         let start_of_device_area = self
1971             .memory_manager
1972             .lock()
1973             .unwrap()
1974             .start_of_device_area()
1975             .raw_value();
1976         let end_of_device_area = self
1977             .memory_manager
1978             .lock()
1979             .unwrap()
1980             .end_of_device_area()
1981             .raw_value();
1982         hob.add_mmio_resource(
1983             &mem,
1984             start_of_device_area,
1985             end_of_device_area - start_of_device_area,
1986         )
1987         .map_err(Error::PopulateHob)?;
1988 
1989         // Loop over the ACPI tables and copy them to the HOB.
1990 
1991         for acpi_table in crate::acpi::create_acpi_tables_tdx(
1992             &self.device_manager,
1993             &self.cpu_manager,
1994             &self.memory_manager,
1995             &self.numa_nodes,
1996         ) {
1997             hob.add_acpi_table(&mem, acpi_table.as_slice())
1998                 .map_err(Error::PopulateHob)?;
1999         }
2000 
2001         // If a payload info has been created, let's insert it into the HOB.
2002         if let Some(payload_info) = payload_info {
2003             hob.add_payload(&mem, payload_info)
2004                 .map_err(Error::PopulateHob)?;
2005         }
2006 
2007         hob.finish(&mem).map_err(Error::PopulateHob)?;
2008 
2009         Ok(hob_offset)
2010     }
2011 
2012     #[cfg(feature = "tdx")]
2013     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
2014         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2015         let mem = guest_memory.memory();
2016 
2017         for section in sections {
2018             self.vm
2019                 .tdx_init_memory_region(
2020                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
2021                     section.address,
2022                     section.size,
2023                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
2024                     section.attributes == 1,
2025                 )
2026                 .map_err(Error::InitializeTdxMemoryRegion)?;
2027         }
2028 
2029         Ok(())
2030     }
2031 
2032     // Creates the ACPI tables.
2033     // When TDX is enabled this is a no-op, since the tables are created
2034     // and passed to the guest when populating the HOB.
2035 
2036     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2037         #[cfg(feature = "tdx")]
2038         if self.config.lock().unwrap().is_tdx_enabled() {
2039             return None;
2040         }
2041         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2042         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2043         let rsdp_addr = crate::acpi::create_acpi_tables(
2044             &mem,
2045             &self.device_manager,
2046             &self.cpu_manager,
2047             &self.memory_manager,
2048             &self.numa_nodes,
2049             tpm_enabled,
2050         );
2051         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2052 
2053         Some(rsdp_addr)
2054     }
2055 
2056     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2057         trace_scoped!("entry_point");
2058 
2059         self.load_payload_handle
2060             .take()
2061             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2062             .transpose()
2063     }
2064 
2065     pub fn boot(&mut self) -> Result<()> {
2066         trace_scoped!("Vm::boot");
2067         info!("Booting VM");
2068         event!("vm", "booting");
2069         let current_state = self.get_state()?;
2070         if current_state == VmState::Paused {
2071             return self.resume().map_err(Error::Resume);
2072         }
2073 
2074         let new_state = if self.stop_on_boot {
2075             VmState::BreakPoint
2076         } else {
2077             VmState::Running
2078         };
2079         current_state.valid_transition(new_state)?;
2080 
2081         // Done early to parallelise with loading the kernel
2082         #[cfg(target_arch = "x86_64")]
2083         cfg_if::cfg_if! {
2084             if #[cfg(feature = "sev_snp")] {
2085                 let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
2086                 let rsdp_addr = if sev_snp_enabled {
2087                     // In case of SEV-SNP guest ACPI tables are provided via
2088                     // IGVM. So skip the creation of ACPI tables and set the
2089                     // rsdp addr to None.
2090                     None
2091                 } else {
2092                     self.create_acpi_tables()
2093                 };
2094             } else {
2095                 let rsdp_addr = self.create_acpi_tables();
2096             }
2097         }
2098 
2099         // Load the kernel synchronously, or, if it was loaded asynchronously,
2100         // wait for the load to finish.
2101         let entry_point = self.entry_point()?;
2102 
2103         #[cfg(feature = "tdx")]
2104         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2105 
2106         // Configure the vcpus that have been created
2107         let vcpus = self.cpu_manager.lock().unwrap().vcpus();
2108         for vcpu in vcpus {
2109             let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
2110             let boot_setup = entry_point.map(|e| (e, guest_memory));
2111             self.cpu_manager
2112                 .lock()
2113                 .unwrap()
2114                 .configure_vcpu(vcpu, boot_setup)
2115                 .map_err(Error::CpuManager)?;
2116         }
2117 
2118         #[cfg(feature = "tdx")]
2119         let (sections, guid_found) = if tdx_enabled {
2120             self.extract_tdvf_sections()?
2121         } else {
2122             (Vec::new(), false)
2123         };
2124 
2125         // Configuring the TDX regions requires that the vCPUs are created.
2126         #[cfg(feature = "tdx")]
2127         let hob_address = if tdx_enabled {
2128             // TDX sections are written to memory.
2129             self.populate_tdx_sections(&sections, guid_found)?
2130         } else {
2131             None
2132         };
2133 
2134         // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
2135         // available after they are configured
2136         #[cfg(target_arch = "aarch64")]
2137         let rsdp_addr = self.create_acpi_tables();
2138 
2139         // Configure shared state based on loaded kernel
2140         entry_point
2141             .map(|entry_point| {
2142                 // Safe to unwrap rsdp_addr as we know it can't be None when
2143                 // the entry_point is Some.
2144                 self.configure_system(rsdp_addr.unwrap(), entry_point)
2145             })
2146             .transpose()?;
2147 
2148         #[cfg(target_arch = "x86_64")]
2149         // Note: For x86, always call this function before starting the boot vCPUs.
2150         // Otherwise the guest would fail to boot because we haven't created the
2151         // userspace mappings needed to tell the hypervisor about the guest memory.
2152         // These mappings must be created before we start the vCPU threads for
2153         // the very first time.
2154         self.memory_manager
2155             .lock()
2156             .unwrap()
2157             .allocate_address_space()
2158             .map_err(Error::MemoryManager)?;
2159 
2160         #[cfg(feature = "tdx")]
2161         if let Some(hob_address) = hob_address {
2162             // With the HOB address extracted the vCPUs can have
2163             // their TDX state configured.
2164             self.cpu_manager
2165                 .lock()
2166                 .unwrap()
2167                 .initialize_tdx(hob_address)
2168                 .map_err(Error::CpuManager)?;
2169             // Let the hypervisor know which memory ranges are shared with the
2170             // guest. This prevents the guest from ignoring/discarding memory
2171             // regions provided by the host.
2172             self.init_tdx_memory(&sections)?;
2173             // With TDX memory and CPU state configured TDX setup is complete
2174             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
2175         }
2176 
2177         self.cpu_manager
2178             .lock()
2179             .unwrap()
2180             .start_boot_vcpus(new_state == VmState::BreakPoint)
2181             .map_err(Error::CpuManager)?;
2182 
2183         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
2184         *state = new_state;
2185         event!("vm", "booted");
2186         Ok(())
2187     }
2188 
2189     pub fn restore(&mut self) -> Result<()> {
2190         event!("vm", "restoring");
2191 
2192         #[cfg(target_arch = "x86_64")]
2193         // Note: For x86, always call this function before starting the boot vCPUs.
2194         // Otherwise the guest would fail to boot because we haven't created the
2195         // userspace mappings needed to tell the hypervisor about the guest memory.
2196         // These mappings must be created before we start the vCPU threads for
2197         // the very first time for the restored VM.
2198         self.memory_manager
2199             .lock()
2200             .unwrap()
2201             .allocate_address_space()
2202             .map_err(Error::MemoryManager)?;
2203 
2204         // Now we can start all vCPUs from here.
2205         self.cpu_manager
2206             .lock()
2207             .unwrap()
2208             .start_restored_vcpus()
2209             .map_err(Error::CpuManager)?;
2210 
2211         event!("vm", "restored");
2212         Ok(())
2213     }
2214 
2215     /// Gets a thread-safe reference counted pointer to the VM configuration.
2216     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2217         Arc::clone(&self.config)
2218     }
2219 
2220     /// Get the VM state. Returns an error if the state is poisoned.
2221     pub fn get_state(&self) -> Result<VmState> {
2222         self.state
2223             .try_read()
2224             .map_err(|_| Error::PoisonedState)
2225             .map(|state| *state)
2226     }
2227 
2228     /// Gets the actual size of the balloon.
2229     pub fn balloon_size(&self) -> u64 {
2230         self.device_manager.lock().unwrap().balloon_size()
2231     }
2232 
2233     pub fn send_memory_fds(
2234         &mut self,
2235         socket: &mut UnixStream,
2236     ) -> std::result::Result<(), MigratableError> {
2237         for (slot, fd) in self
2238             .memory_manager
2239             .lock()
2240             .unwrap()
2241             .memory_slot_fds()
2242             .drain()
2243         {
2244             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2245                 .write_to(socket)
2246                 .map_err(|e| {
2247                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2248                 })?;
2249             socket
2250                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2251                 .map_err(|e| {
2252                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2253                 })?;
2254 
2255             let res = Response::read_from(socket)?;
2256             if res.status() != Status::Ok {
2257                 warn!("Error during memory fd migration");
2258                 Request::abandon().write_to(socket)?;
2259                 Response::read_from(socket).ok();
2260                 return Err(MigratableError::MigrateSend(anyhow!(
2261                     "Error during memory fd migration"
2262                 )));
2263             }
2264         }
2265 
2266         Ok(())
2267     }
2268 
2269     pub fn send_memory_regions<F>(
2270         &mut self,
2271         ranges: &MemoryRangeTable,
2272         fd: &mut F,
2273     ) -> std::result::Result<(), MigratableError>
2274     where
2275         F: WriteVolatile,
2276     {
2277         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2278         let mem = guest_memory.memory();
2279 
2280         for range in ranges.regions() {
2281             let mut offset: u64 = 0;
2282             // Here we are manually handling the retry in case we can't write the
2283             // whole region at once, because we can't use the write_all_to()
2284             // implementation from vm-memory::GuestMemory as it does not follow
2285             // the correct behavior. For more info about this issue
2286             // see: https://github.com/rust-vmm/vm-memory/issues/174
2287             loop {
2288                 let bytes_written = mem
2289                     .write_volatile_to(
2290                         GuestAddress(range.gpa + offset),
2291                         fd,
2292                         (range.length - offset) as usize,
2293                     )
2294                     .map_err(|e| {
2295                         MigratableError::MigrateSend(anyhow!(
2296                             "Error transferring memory to socket: {}",
2297                             e
2298                         ))
2299                     })?;
2300                 offset += bytes_written as u64;
2301 
2302                 if offset == range.length {
2303                     break;
2304                 }
2305             }
2306         }
2307 
2308         Ok(())
2309     }
2310 
2311     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2312         self.memory_manager
2313             .lock()
2314             .unwrap()
2315             .memory_range_table(false)
2316     }
2317 
2318     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2319         self.device_manager.lock().unwrap().device_tree()
2320     }
2321 
2322     pub fn activate_virtio_devices(&self) -> Result<()> {
2323         self.device_manager
2324             .lock()
2325             .unwrap()
2326             .activate_virtio_devices()
2327             .map_err(Error::ActivateVirtioDevices)
2328     }
2329 
2330     #[cfg(target_arch = "x86_64")]
2331     pub fn power_button(&self) -> Result<()> {
2332         self
2333             .device_manager
2334             .lock()
2335             .unwrap()
2336             .notify_power_button()
2337             .map_err(Error::PowerButton)
2338     }
2339 
2340     #[cfg(target_arch = "aarch64")]
2341     pub fn power_button(&self) -> Result<()> {
2342         self.device_manager
2343             .lock()
2344             .unwrap()
2345             .notify_power_button()
2346             .map_err(Error::PowerButton)
2347     }
2348 
2349     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2350         self.memory_manager.lock().unwrap().snapshot_data()
2351     }
2352 
2353     #[cfg(feature = "guest_debug")]
2354     pub fn debug_request(
2355         &mut self,
2356         gdb_request: &GdbRequestPayload,
2357         cpu_id: usize,
2358     ) -> Result<GdbResponsePayload> {
2359         use GdbRequestPayload::*;
2360         match gdb_request {
2361             SetSingleStep(single_step) => {
2362                 self.set_guest_debug(cpu_id, &[], *single_step)
2363                     .map_err(Error::Debug)?;
2364             }
2365             SetHwBreakPoint(addrs) => {
2366                 self.set_guest_debug(cpu_id, addrs, false)
2367                     .map_err(Error::Debug)?;
2368             }
2369             Pause => {
2370                 self.debug_pause().map_err(Error::Debug)?;
2371             }
2372             Resume => {
2373                 self.debug_resume().map_err(Error::Debug)?;
2374             }
2375             ReadRegs => {
2376                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2377                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2378             }
2379             WriteRegs(regs) => {
2380                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2381             }
2382             ReadMem(vaddr, len) => {
2383                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2384                 let mem = self
2385                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2386                     .map_err(Error::Debug)?;
2387                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2388             }
2389             WriteMem(vaddr, data) => {
2390                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2391                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2392                     .map_err(Error::Debug)?;
2393             }
2394             ActiveVcpus => {
2395                 let active_vcpus = self.active_vcpus();
2396                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2397             }
2398         }
2399         Ok(GdbResponsePayload::CommandComplete)
2400     }
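
    // A minimal sketch (not part of the original source), assuming a gdbstub
    // event loop drives these requests: read the registers of vCPU 0 through
    // the dispatcher above.
    #[cfg(feature = "guest_debug")]
    #[allow(dead_code)]
    fn example_read_regs_request(vm: &mut Vm) -> Result<GdbResponsePayload> {
        vm.debug_request(&GdbRequestPayload::ReadRegs, 0)
    }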
2401 
2402     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2403     fn get_dump_state(
2404         &mut self,
2405         destination_url: &str,
2406     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2407         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2408         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2409         let mut elf_phdr_num = 1;
2410         let elf_sh_info = 0;
2411         let coredump_file_path = url_to_file(destination_url)?;
2412         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2413 
2414         if mapping_num < UINT16_MAX - 2 {
2415             elf_phdr_num += mapping_num as u16;
2416         } else {
2417             panic!("mapping num beyond 65535 not supported");
2418         }
2419         let coredump_file = OpenOptions::new()
2420             .read(true)
2421             .write(true)
2422             .create_new(true)
2423             .open(coredump_file_path)
2424             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2425 
2426         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2427         let mem_data = self
2428             .memory_manager
2429             .lock()
2430             .unwrap()
2431             .coredump_memory_regions(mem_offset);
2432 
2433         Ok(DumpState {
2434             elf_note_size,
2435             elf_phdr_num,
2436             elf_sh_info,
2437             mem_offset,
2438             mem_info: Some(mem_data),
2439             file: Some(coredump_file),
2440         })
2441     }
2442 
2443     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2444     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2445         size_of::<elf::Elf64_Ehdr>() as u64
2446             + note_size as u64
2447             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2448     }
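
    // Worked example: with three guest RAM mappings, get_dump_state() above
    // uses phdr_num = 1 + 3 = 4, so the memory payload starts at
    // 64 bytes (Elf64_Ehdr) + note_size + 4 * 56 bytes (Elf64_Phdr).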
2449 }
2450 
2451 impl Pausable for Vm {
2452     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2453         event!("vm", "pausing");
2454         let mut state = self
2455             .state
2456             .try_write()
2457             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2458         let new_state = VmState::Paused;
2459 
2460         state
2461             .valid_transition(new_state)
2462             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2463 
2464         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2465         {
2466             let mut clock = self
2467                 .vm
2468                 .get_clock()
2469                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2470             clock.reset_flags();
2471             self.saved_clock = Some(clock);
2472         }
2473 
2474         // Before pausing the vCPUs, activate any pending virtio devices that might
2475         // need activation during the pause (or e.g. the migration it is part of).
2476         self.activate_virtio_devices().map_err(|e| {
2477             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2478         })?;
2479 
2480         self.cpu_manager.lock().unwrap().pause()?;
2481         self.device_manager.lock().unwrap().pause()?;
2482 
2483         *state = new_state;
2484 
2485         event!("vm", "paused");
2486         Ok(())
2487     }
2488 
2489     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2490         event!("vm", "resuming");
2491         let mut state = self
2492             .state
2493             .try_write()
2494             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2495         let new_state = VmState::Running;
2496 
2497         state
2498             .valid_transition(new_state)
2499             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2500 
2501         self.cpu_manager.lock().unwrap().resume()?;
2502         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2503         {
2504             if let Some(clock) = &self.saved_clock {
2505                 self.vm.set_clock(clock).map_err(|e| {
2506                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2507                 })?;
2508             }
2509         }
2510         self.device_manager.lock().unwrap().resume()?;
2511 
2512         // And we're back to the Running state.
2513         *state = new_state;
2514         event!("vm", "resumed");
2515         Ok(())
2516     }
2517 }
2518 
2519 #[derive(Serialize, Deserialize)]
2520 pub struct VmSnapshot {
2521     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2522     pub clock: Option<hypervisor::ClockData>,
2523     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2524     pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
2525 }
2526 
2527 pub const VM_SNAPSHOT_ID: &str = "vm";
2528 impl Snapshottable for Vm {
2529     fn id(&self) -> String {
2530         VM_SNAPSHOT_ID.to_string()
2531     }
2532 
2533     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2534         event!("vm", "snapshotting");
2535 
2536         #[cfg(feature = "tdx")]
2537         {
2538             if self.config.lock().unwrap().is_tdx_enabled() {
2539                 return Err(MigratableError::Snapshot(anyhow!(
2540                     "Snapshot not possible with TDX VM"
2541                 )));
2542             }
2543         }
2544 
2545         let current_state = self.get_state().unwrap();
2546         if current_state != VmState::Paused {
2547             return Err(MigratableError::Snapshot(anyhow!(
2548                 "Trying to snapshot while VM is running"
2549             )));
2550         }
2551 
2552         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2553         let common_cpuid = {
2554             let amx = self.config.lock().unwrap().cpus.features.amx;
2555             let phys_bits = physical_bits(
2556                 &self.hypervisor,
2557                 self.config.lock().unwrap().cpus.max_phys_bits,
2558             );
2559             arch::generate_common_cpuid(
2560                 &self.hypervisor,
2561                 &arch::CpuidConfig {
2562                     sgx_epc_sections: None,
2563                     phys_bits,
2564                     kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2565                     #[cfg(feature = "tdx")]
2566                     tdx: false,
2567                     amx,
2568                 },
2569             )
2570             .map_err(|e| {
2571                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2572             })?
2573         };
2574 
2575         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2576             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2577             clock: self.saved_clock,
2578             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2579             common_cpuid,
2580         })
2581         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2582 
2583         let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data));
2584 
2585         let (id, snapshot) = {
2586             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2587             (cpu_manager.id(), cpu_manager.snapshot()?)
2588         };
2589         vm_snapshot.add_snapshot(id, snapshot);
2590         let (id, snapshot) = {
2591             let mut memory_manager = self.memory_manager.lock().unwrap();
2592             (memory_manager.id(), memory_manager.snapshot()?)
2593         };
2594         vm_snapshot.add_snapshot(id, snapshot);
2595         let (id, snapshot) = {
2596             let mut device_manager = self.device_manager.lock().unwrap();
2597             (device_manager.id(), device_manager.snapshot()?)
2598         };
2599         vm_snapshot.add_snapshot(id, snapshot);
2600 
2601         event!("vm", "snapshotted");
2602         Ok(vm_snapshot)
2603     }
2604 }
2605 
2606 impl Transportable for Vm {
2607     fn send(
2608         &self,
2609         snapshot: &Snapshot,
2610         destination_url: &str,
2611     ) -> std::result::Result<(), MigratableError> {
2612         let mut snapshot_config_path = url_to_path(destination_url)?;
2613         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2614 
2615         // Create the snapshot config file
2616         let mut snapshot_config_file = OpenOptions::new()
2617             .read(true)
2618             .write(true)
2619             .create_new(true)
2620             .open(snapshot_config_path)
2621             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2622 
2623         // Serialize and write the snapshot config
2624         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2625             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2626 
2627         snapshot_config_file
2628             .write(vm_config.as_bytes())
2629             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2630 
2631         let mut snapshot_state_path = url_to_path(destination_url)?;
2632         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2633 
2634         // Create the snapshot state file
2635         let mut snapshot_state_file = OpenOptions::new()
2636             .read(true)
2637             .write(true)
2638             .create_new(true)
2639             .open(snapshot_state_path)
2640             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2641 
2642         // Serialize and write the snapshot state
2643         let vm_state =
2644             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2645 
2646         snapshot_state_file
2647             .write(&vm_state)
2648             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2649 
2650         // Tell the memory manager to also send/write its own snapshot.
2651         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2652             self.memory_manager
2653                 .lock()
2654                 .unwrap()
2655                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2656         } else {
2657             return Err(MigratableError::Restore(anyhow!(
2658                 "Missing memory manager snapshot"
2659             )));
2660         }
2661 
2662         Ok(())
2663     }
2664 }
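
// A minimal sketch (not part of the original source) of the pause / snapshot /
// send / resume sequence the snapshot path relies on. The destination URL is
// an illustrative value.
#[allow(dead_code)]
fn example_snapshot_to_disk(vm: &mut Vm) -> std::result::Result<(), MigratableError> {
    // Snapshotting requires the VM to be paused first.
    vm.pause()?;
    let snapshot = vm.snapshot()?;
    // Writes the config, the VM state and the memory manager data under the
    // given directory URL.
    vm.send(&snapshot, "file:///var/lib/cloud-hypervisor/snapshot")?;
    vm.resume()
}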
2665 
2666 impl Migratable for Vm {
2667     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2668         self.memory_manager.lock().unwrap().start_dirty_log()?;
2669         self.device_manager.lock().unwrap().start_dirty_log()
2670     }
2671 
2672     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2673         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2674         self.device_manager.lock().unwrap().stop_dirty_log()
2675     }
2676 
2677     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2678         Ok(MemoryRangeTable::new_from_tables(vec![
2679             self.memory_manager.lock().unwrap().dirty_log()?,
2680             self.device_manager.lock().unwrap().dirty_log()?,
2681         ]))
2682     }
2683 
2684     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2685         self.memory_manager.lock().unwrap().start_migration()?;
2686         self.device_manager.lock().unwrap().start_migration()
2687     }
2688 
2689     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2690         self.memory_manager.lock().unwrap().complete_migration()?;
2691         self.device_manager.lock().unwrap().complete_migration()
2692     }
2693 }
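
// A minimal sketch (not part of the original source) of a pre-copy loop built
// on the `Migratable` methods above. The convergence check is simplified; a
// real driver bounds the number of rounds and the remaining dirty memory.
#[allow(dead_code)]
fn example_precopy_rounds<F: WriteVolatile>(
    vm: &mut Vm,
    dest: &mut F,
) -> std::result::Result<(), MigratableError> {
    vm.start_dirty_log()?;
    loop {
        // Fetch the ranges dirtied since the previous round and push them to
        // the destination.
        let dirty = vm.dirty_log()?;
        if dirty.regions().is_empty() {
            break;
        }
        vm.send_memory_regions(&dirty, &mut *dest)?;
    }
    vm.stop_dirty_log()
}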
2694 
2695 #[cfg(feature = "guest_debug")]
2696 impl Debuggable for Vm {
2697     fn set_guest_debug(
2698         &self,
2699         cpu_id: usize,
2700         addrs: &[GuestAddress],
2701         singlestep: bool,
2702     ) -> std::result::Result<(), DebuggableError> {
2703         self.cpu_manager
2704             .lock()
2705             .unwrap()
2706             .set_guest_debug(cpu_id, addrs, singlestep)
2707     }
2708 
2709     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2710         if *self.state.read().unwrap() == VmState::Running {
2711             self.pause().map_err(DebuggableError::Pause)?;
2712         }
2713 
2714         let mut state = self
2715             .state
2716             .try_write()
2717             .map_err(|_| DebuggableError::PoisonedState)?;
2718         *state = VmState::BreakPoint;
2719         Ok(())
2720     }
2721 
2722     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2723         if *self.state.read().unwrap() == VmState::BreakPoint {
2724             self.resume().map_err(DebuggableError::Pause)?;
2725         }
2726 
2727         Ok(())
2728     }
2729 
2730     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2731         self.cpu_manager.lock().unwrap().read_regs(cpu_id)
2732     }
2733 
2734     fn write_regs(
2735         &self,
2736         cpu_id: usize,
2737         regs: &CoreRegs,
2738     ) -> std::result::Result<(), DebuggableError> {
2739         self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
2740     }
2741 
2742     fn read_mem(
2743         &self,
2744         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2745         cpu_id: usize,
2746         vaddr: GuestAddress,
2747         len: usize,
2748     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2749         self.cpu_manager
2750             .lock()
2751             .unwrap()
2752             .read_mem(guest_memory, cpu_id, vaddr, len)
2753     }
2754 
2755     fn write_mem(
2756         &self,
2757         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2758         cpu_id: usize,
2759         vaddr: &GuestAddress,
2760         data: &[u8],
2761     ) -> std::result::Result<(), DebuggableError> {
2762         self.cpu_manager
2763             .lock()
2764             .unwrap()
2765             .write_mem(guest_memory, cpu_id, vaddr, data)
2766     }
2767 
2768     fn active_vcpus(&self) -> usize {
2769         let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
2770         if active_vcpus > 0 {
2771             active_vcpus
2772         } else {
2773             // The VM is not booted yet. Report boot_vcpus() instead.
2774             self.cpu_manager.lock().unwrap().boot_vcpus() as usize
2775         }
2776     }
2777 }
2778 
2779 #[cfg(feature = "guest_debug")]
2780 pub const UINT16_MAX: u32 = 65535;
2781 
2782 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2783 impl Elf64Writable for Vm {}
2784 
2785 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2786 impl GuestDebuggable for Vm {
2787     fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
2788         event!("vm", "coredumping");
2789 
2790         let mut resume = false;
2791 
2792         #[cfg(feature = "tdx")]
2793         {
2794             if let Some(ref platform) = self.config.lock().unwrap().platform {
2795                 if platform.tdx {
2796                     return Err(GuestDebuggableError::Coredump(anyhow!(
2797                         "Coredump not possible with TDX VM"
2798                     )));
2799                 }
2800             }
2801         }
2802 
2803         match self.get_state().unwrap() {
2804             VmState::Running => {
2805                 self.pause().map_err(GuestDebuggableError::Pause)?;
2806                 resume = true;
2807             }
2808             VmState::Paused => {}
2809             _ => {
2810                 return Err(GuestDebuggableError::Coredump(anyhow!(
2811                     "Trying to coredump while VM is not running or paused"
2812                 )));
2813             }
2814         }
2815 
2816         let coredump_state = self.get_dump_state(destination_url)?;
2817 
2818         self.write_header(&coredump_state)?;
2819         self.write_note(&coredump_state)?;
2820         self.write_loads(&coredump_state)?;
2821 
2822         self.cpu_manager
2823             .lock()
2824             .unwrap()
2825             .cpu_write_elf64_note(&coredump_state)?;
2826         self.cpu_manager
2827             .lock()
2828             .unwrap()
2829             .cpu_write_vmm_note(&coredump_state)?;
2830 
2831         self.memory_manager
2832             .lock()
2833             .unwrap()
2834             .coredump_iterate_save_mem(&coredump_state)?;
2835 
2836         if resume {
2837             self.resume().map_err(GuestDebuggableError::Resume)?;
2838         }
2839 
2840         Ok(())
2841     }
2842 }
2843 
2844 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2845 #[cfg(test)]
2846 mod tests {
2847     use super::*;
2848 
2849     fn test_vm_state_transitions(state: VmState) {
2850         match state {
2851             VmState::Created => {
2852                 // Check the transitions from Created
2853                 assert!(state.valid_transition(VmState::Created).is_err());
2854                 assert!(state.valid_transition(VmState::Running).is_ok());
2855                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2856                 assert!(state.valid_transition(VmState::Paused).is_ok());
2857                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2858             }
2859             VmState::Running => {
2860                 // Check the transitions from Running
2861                 assert!(state.valid_transition(VmState::Created).is_err());
2862                 assert!(state.valid_transition(VmState::Running).is_err());
2863                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2864                 assert!(state.valid_transition(VmState::Paused).is_ok());
2865                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2866             }
2867             VmState::Shutdown => {
2868                 // Check the transitions from Shutdown
2869                 assert!(state.valid_transition(VmState::Created).is_err());
2870                 assert!(state.valid_transition(VmState::Running).is_ok());
2871                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2872                 assert!(state.valid_transition(VmState::Paused).is_err());
2873                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2874             }
2875             VmState::Paused => {
2876                 // Check the transitions from Paused
2877                 assert!(state.valid_transition(VmState::Created).is_err());
2878                 assert!(state.valid_transition(VmState::Running).is_ok());
2879                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2880                 assert!(state.valid_transition(VmState::Paused).is_err());
2881                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2882             }
2883             VmState::BreakPoint => {
2884                 // Check the transitions from Breakpoint
2885                 assert!(state.valid_transition(VmState::Created).is_ok());
2886                 assert!(state.valid_transition(VmState::Running).is_ok());
2887                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2888                 assert!(state.valid_transition(VmState::Paused).is_err());
2889                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2890             }
2891         }
2892     }
2893 
2894     #[test]
2895     fn test_vm_created_transitions() {
2896         test_vm_state_transitions(VmState::Created);
2897     }
2898 
2899     #[test]
2900     fn test_vm_running_transitions() {
2901         test_vm_state_transitions(VmState::Running);
2902     }
2903 
2904     #[test]
2905     fn test_vm_shutdown_transitions() {
2906         test_vm_state_transitions(VmState::Shutdown);
2907     }
2908 
2909     #[test]
2910     fn test_vm_paused_transitions() {
2911         test_vm_state_transitions(VmState::Paused);
2912     }
2913 
2914     #[cfg(feature = "tdx")]
2915     #[test]
2916     fn test_hob_memory_resources() {
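             // Each expected tuple is (start, size, is_ram): TDVF sections are
             // reported as non-RAM holes (false) while the remaining guest
             // memory ranges are reported as RAM (true).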
2917         // Case 1: Two TDVF sections in the middle of the RAM
2918         let sections = vec![
2919             TdvfSection {
2920                 address: 0xc000,
2921                 size: 0x1000,
2922                 ..Default::default()
2923             },
2924             TdvfSection {
2925                 address: 0x1000,
2926                 size: 0x4000,
2927                 ..Default::default()
2928             },
2929         ];
2930         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
2931         let expected = vec![
2932             (0, 0x1000, true),
2933             (0x1000, 0x4000, false),
2934             (0x5000, 0x7000, true),
2935             (0xc000, 0x1000, false),
2936             (0xd000, 0x0fff_3000, true),
2937         ];
2938         assert_eq!(
2939             expected,
2940             Vm::hob_memory_resources(
2941                 sections,
2942                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2943             )
2944         );
2945 
2946         // Case 2: Two TDVF sections with no conflict with the RAM
2947         let sections = vec![
2948             TdvfSection {
2949                 address: 0x1000_1000,
2950                 size: 0x1000,
2951                 ..Default::default()
2952             },
2953             TdvfSection {
2954                 address: 0,
2955                 size: 0x1000,
2956                 ..Default::default()
2957             },
2958         ];
2959         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2960         let expected = vec![
2961             (0, 0x1000, false),
2962             (0x1000, 0x1000_0000, true),
2963             (0x1000_1000, 0x1000, false),
2964         ];
2965         assert_eq!(
2966             expected,
2967             Vm::hob_memory_resources(
2968                 sections,
2969                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2970             )
2971         );
2972 
2973         // Case 3: Two TDVF sections with partial conflicts with the RAM
2974         let sections = vec![
2975             TdvfSection {
2976                 address: 0x1000_0000,
2977                 size: 0x2000,
2978                 ..Default::default()
2979             },
2980             TdvfSection {
2981                 address: 0,
2982                 size: 0x2000,
2983                 ..Default::default()
2984             },
2985         ];
2986         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2987         let expected = vec![
2988             (0, 0x2000, false),
2989             (0x2000, 0x0fff_e000, true),
2990             (0x1000_0000, 0x2000, false),
2991         ];
2992         assert_eq!(
2993             expected,
2994             Vm::hob_memory_resources(
2995                 sections,
2996                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2997             )
2998         );
2999 
3000         // Case 4: Two TDVF sections with no conflict before the RAM and two
3001         // additional sections with no conflict after the RAM.
3002         let sections = vec![
3003             TdvfSection {
3004                 address: 0x2000_1000,
3005                 size: 0x1000,
3006                 ..Default::default()
3007             },
3008             TdvfSection {
3009                 address: 0x2000_0000,
3010                 size: 0x1000,
3011                 ..Default::default()
3012             },
3013             TdvfSection {
3014                 address: 0x1000,
3015                 size: 0x1000,
3016                 ..Default::default()
3017             },
3018             TdvfSection {
3019                 address: 0,
3020                 size: 0x1000,
3021                 ..Default::default()
3022             },
3023         ];
3024         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
3025         let expected = vec![
3026             (0, 0x1000, false),
3027             (0x1000, 0x1000, false),
3028             (0x4000, 0x1000_0000, true),
3029             (0x2000_0000, 0x1000, false),
3030             (0x2000_1000, 0x1000, false),
3031         ];
3032         assert_eq!(
3033             expected,
3034             Vm::hob_memory_resources(
3035                 sections,
3036                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3037             )
3038         );
3039 
3040         // Case 5: One TDVF section covering the entire RAM
3041         let sections = vec![TdvfSection {
3042             address: 0,
3043             size: 0x2000_0000,
3044             ..Default::default()
3045         }];
3046         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3047         let expected = vec![(0, 0x2000_0000, false)];
3048         assert_eq!(
3049             expected,
3050             Vm::hob_memory_resources(
3051                 sections,
3052                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3053             )
3054         );
3055 
3056         // Case 6: Two TDVF sections with no conflict with 2 RAM regions
3057         let sections = vec![
3058             TdvfSection {
3059                 address: 0x1000_2000,
3060                 size: 0x2000,
3061                 ..Default::default()
3062             },
3063             TdvfSection {
3064                 address: 0,
3065                 size: 0x2000,
3066                 ..Default::default()
3067             },
3068         ];
3069         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3070             (GuestAddress(0x2000), 0x1000_0000),
3071             (GuestAddress(0x1000_4000), 0x1000_0000),
3072         ];
3073         let expected = vec![
3074             (0, 0x2000, false),
3075             (0x2000, 0x1000_0000, true),
3076             (0x1000_2000, 0x2000, false),
3077             (0x1000_4000, 0x1000_0000, true),
3078         ];
3079         assert_eq!(
3080             expected,
3081             Vm::hob_memory_resources(
3082                 sections,
3083                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3084             )
3085         );
3086 
3087         // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
3088         let sections = vec![
3089             TdvfSection {
3090                 address: 0x1000_0000,
3091                 size: 0x4000,
3092                 ..Default::default()
3093             },
3094             TdvfSection {
3095                 address: 0,
3096                 size: 0x4000,
3097                 ..Default::default()
3098             },
3099         ];
3100         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3101             (GuestAddress(0x1000), 0x1000_0000),
3102             (GuestAddress(0x1000_3000), 0x1000_0000),
3103         ];
3104         let expected = vec![
3105             (0, 0x4000, false),
3106             (0x4000, 0x0fff_c000, true),
3107             (0x1000_0000, 0x4000, false),
3108             (0x1000_4000, 0x0fff_f000, true),
3109         ];
3110         assert_eq!(
3111             expected,
3112             Vm::hob_memory_resources(
3113                 sections,
3114                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3115             )
3116         );
3117     }
3118 }
3119 
3120 #[cfg(target_arch = "aarch64")]
3121 #[cfg(test)]
3122 mod tests {
3123     use super::*;
3124     use crate::GuestMemoryMmap;
3125     use arch::aarch64::fdt::create_fdt;
3126     use arch::aarch64::layout;
3127     use arch::{DeviceType, MmioDeviceInfo};
3128     use devices::gic::Gic;
3129 
3130     const LEN: u64 = 4096;
3131 
3132     #[test]
3133     fn test_create_fdt_with_devices() {
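             // Build a minimal FDT for a single-vCPU guest with a vGIC and three
             // MMIO devices (serial, virtio and RTC); the test only checks that
             // FDT generation succeeds.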
3134         let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
3135         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
3136 
3137         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
3138             (
3139                 (DeviceType::Serial, DeviceType::Serial.to_string()),
3140                 MmioDeviceInfo {
3141                     addr: 0x00,
3142                     len: LEN,
3143                     irq: 33,
3144                 },
3145             ),
3146             (
3147                 (DeviceType::Virtio(1), "virtio".to_string()),
3148                 MmioDeviceInfo {
3149                     addr: LEN,
3150                     len: LEN,
3151                     irq: 34,
3152                 },
3153             ),
3154             (
3155                 (DeviceType::Rtc, "rtc".to_string()),
3156                 MmioDeviceInfo {
3157                     addr: 2 * LEN,
3158                     len: LEN,
3159                     irq: 35,
3160                 },
3161             ),
3162         ]
3163         .iter()
3164         .cloned()
3165         .collect();
3166 
3167         let hv = hypervisor::new().unwrap();
3168         let vm = hv.create_vm().unwrap();
3169         let gic = vm
3170             .create_vgic(Gic::create_default_config(1))
3171             .expect("Cannot create gic");
3172         assert!(create_fdt(
3173             &mem,
3174             "console=tty0",
3175             vec![0],
3176             Some((0, 0, 0)),
3177             &dev_info,
3178             &gic,
3179             &None,
3180             &Vec::new(),
3181             &BTreeMap::new(),
3182             None,
3183             true,
3184         )
3185         .is_ok())
3186     }
3187 }
3188 
3189 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
3190 #[test]
3191 pub fn test_vm() {
3192     use hypervisor::VmExit;
3193     use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
3194     // This example is based on https://lwn.net/Articles/658511/
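     // With the initial registers set below (al = 2, bl = 3), the guest adds
     // the two values, converts the sum to ASCII ('5'), writes it and a newline
     // to I/O port 0x3f8, then halts.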
3195     let code = [
3196         0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
3197         0x00, 0xd8, /* add %bl, %al */
3198         0x04, b'0', /* add $'0', %al */
3199         0xee, /* out %al, (%dx) */
3200         0xb0, b'\n', /* mov $'\n', %al */
3201         0xee,  /* out %al, (%dx) */
3202         0xf4,  /* hlt */
3203     ];
3204 
3205     let mem_size = 0x1000;
3206     let load_addr = GuestAddress(0x1000);
3207     let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();
3208 
3209     let hv = hypervisor::new().unwrap();
3210     let vm = hv.create_vm().expect("new VM creation failed");
3211 
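     // Register each guest memory region with the hypervisor; the trailing
     // flags leave the region writable and without dirty-page logging.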
3212     for (index, region) in mem.iter().enumerate() {
3213         let mem_region = vm.make_user_memory_region(
3214             index as u32,
3215             region.start_addr().raw_value(),
3216             region.len(),
3217             region.as_ptr() as u64,
3218             false,
3219             false,
3220         );
3221 
3222         vm.create_user_memory_region(mem_region)
3223             .expect("Cannot configure guest memory");
3224     }
3225     mem.write_slice(&code, load_addr)
3226         .expect("Writing code to memory failed");
3227 
3228     let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");
3229 
3230     let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
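     // Run the vCPU in real mode with a flat, zero-based code segment so that
     // RIP = 0x1000 points at the code loaded at guest physical address 0x1000.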
3231     vcpu_sregs.cs.base = 0;
3232     vcpu_sregs.cs.selector = 0;
3233     vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");
3234 
3235     let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
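     // Seed the registers consumed by the guest code: RAX and RBX are the two
     // addends, and bit 1 of RFLAGS is reserved and must remain set.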
3236     vcpu_regs.rip = 0x1000;
3237     vcpu_regs.rax = 2;
3238     vcpu_regs.rbx = 3;
3239     vcpu_regs.rflags = 2;
3240     vcpu.set_regs(&vcpu_regs).expect("set regs failed");
3241 
3242     loop {
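     // Drive the vCPU until it halts: the guest's port writes surface as
     // `IoOut` exits, and the final `hlt` surfaces here as a `Reset` exit,
     // which ends the test.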
3243         match vcpu.run().expect("run failed") {
3244             VmExit::IoOut(addr, data) => {
3245                 println!(
3246                     "IO out -- addr: {:#x} data [{:?}]",
3247                     addr,
3248                     str::from_utf8(data).unwrap()
3249                 );
3250             }
3251             VmExit::Reset => {
3252                 println!("HLT");
3253                 break;
3254             }
3255             r => panic!("unexpected exit reason: {r:?}"),
3256         }
3257     }
3258 }
3259