xref: /cloud-hypervisor/vmm/src/vm.rs (revision d10f20eb718023742143fa847a37f3d6114ead52)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
20 use crate::coredump::{
21     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
22 };
23 use crate::cpu;
24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair};
25 use crate::device_tree::DeviceTree;
26 #[cfg(feature = "guest_debug")]
27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
28 #[cfg(feature = "igvm")]
29 use crate::igvm::igvm_loader;
30 use crate::memory_manager::{
31     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
32 };
33 #[cfg(target_arch = "x86_64")]
34 use crate::migration::get_vm_snapshot;
35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
36 use crate::migration::url_to_file;
37 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
38 use crate::GuestMemoryMmap;
39 use crate::{
40     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
41 };
42 use anyhow::anyhow;
43 use arch::get_host_cpu_phys_bits;
44 #[cfg(target_arch = "x86_64")]
45 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
46 #[cfg(feature = "tdx")]
47 use arch::x86_64::tdx::TdvfSection;
48 use arch::EntryPoint;
49 #[cfg(target_arch = "aarch64")]
50 use arch::PciSpaceInfo;
51 use arch::{NumaNode, NumaNodes};
52 #[cfg(target_arch = "aarch64")]
53 use devices::interrupt_controller;
54 use devices::AcpiNotificationFlags;
55 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
56 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
57 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
58 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
59 use hypervisor::{HypervisorVmError, VmOps};
60 use libc::{termios, SIGWINCH};
61 use linux_loader::cmdline::Cmdline;
62 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
63 use linux_loader::elf;
64 #[cfg(target_arch = "x86_64")]
65 use linux_loader::loader::bzimage::BzImage;
66 #[cfg(target_arch = "x86_64")]
67 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
68 #[cfg(target_arch = "aarch64")]
69 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
70 use linux_loader::loader::KernelLoader;
71 use seccompiler::SeccompAction;
72 use serde::{Deserialize, Serialize};
73 use std::cmp;
74 use std::collections::BTreeMap;
75 use std::collections::HashMap;
76 use std::fs::{File, OpenOptions};
77 use std::io::{self, Seek, SeekFrom, Write};
78 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
79 use std::mem::size_of;
80 use std::num::Wrapping;
81 use std::ops::Deref;
82 use std::os::unix::net::UnixStream;
83 use std::sync::{Arc, Mutex, RwLock};
84 use std::time::Instant;
85 use std::{result, str, thread};
86 use thiserror::Error;
87 use tracer::trace_scoped;
88 use vm_device::Bus;
89 #[cfg(feature = "tdx")]
90 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
91 use vm_memory::{
92     Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
93 };
94 use vm_migration::protocol::{Request, Response, Status};
95 use vm_migration::{
96     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
97     Snapshottable, Transportable,
98 };
99 use vmm_sys_util::eventfd::EventFd;
100 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
101 
102 /// Errors associated with VM management
103 #[derive(Debug, Error)]
104 pub enum Error {
105     #[error("Cannot open kernel file: {0}")]
106     KernelFile(#[source] io::Error),
107 
108     #[error("Cannot open initramfs file: {0}")]
109     InitramfsFile(#[source] io::Error),
110 
111     #[error("Cannot load the kernel into memory: {0}")]
112     KernelLoad(#[source] linux_loader::loader::Error),
113 
114     #[cfg(target_arch = "aarch64")]
115     #[error("Cannot load the UEFI binary in memory: {0:?}")]
116     UefiLoad(arch::aarch64::uefi::Error),
117 
118     #[error("Cannot load the initramfs into memory")]
119     InitramfsLoad,
120 
121     #[error("Cannot load the kernel command line in memory: {0}")]
122     LoadCmdLine(#[source] linux_loader::loader::Error),
123 
124     #[error("Cannot modify the kernel command line: {0}")]
125     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
126 
127     #[error("Cannot create the kernel command line: {0}")]
128     CmdLineCreate(#[source] linux_loader::cmdline::Error),
129 
130     #[error("Cannot configure system: {0}")]
131     ConfigureSystem(#[source] arch::Error),
132 
133     #[cfg(target_arch = "aarch64")]
134     #[error("Cannot enable interrupt controller: {0:?}")]
135     EnableInterruptController(interrupt_controller::Error),
136 
137     #[error("VM state is poisoned")]
138     PoisonedState,
139 
140     #[error("Error from device manager: {0:?}")]
141     DeviceManager(DeviceManagerError),
142 
143     #[error("No device with id {0:?} to remove")]
144     NoDeviceToRemove(String),
145 
146     #[error("Cannot spawn a signal handler thread: {0}")]
147     SignalHandlerSpawn(#[source] io::Error),
148 
149     #[error("Failed to join on threads: {0:?}")]
150     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
151 
152     #[error("VM config is missing")]
153     VmMissingConfig,
154 
155     #[error("VM is not created")]
156     VmNotCreated,
157 
158     #[error("VM is already created")]
159     VmAlreadyCreated,
160 
161     #[error("VM is not running")]
162     VmNotRunning,
163 
164     #[error("Cannot clone EventFd: {0}")]
165     EventFdClone(#[source] io::Error),
166 
167     #[error("invalid VM state transition: {0:?} to {1:?}")]
168     InvalidStateTransition(VmState, VmState),
169 
170     #[error("Error from CPU manager: {0}")]
171     CpuManager(#[source] cpu::Error),
172 
173     #[error("Cannot pause devices: {0}")]
174     PauseDevices(#[source] MigratableError),
175 
176     #[error("Cannot resume devices: {0}")]
177     ResumeDevices(#[source] MigratableError),
178 
179     #[error("Cannot pause CPUs: {0}")]
180     PauseCpus(#[source] MigratableError),
181 
182     #[error("Cannot resume cpus: {0}")]
183     ResumeCpus(#[source] MigratableError),
184 
185     #[error("Cannot pause VM: {0}")]
186     Pause(#[source] MigratableError),
187 
188     #[error("Cannot resume VM: {0}")]
189     Resume(#[source] MigratableError),
190 
191     #[error("Memory manager error: {0:?}")]
192     MemoryManager(MemoryManagerError),
193 
194     #[error("Eventfd write error: {0}")]
195     EventfdError(#[source] std::io::Error),
196 
197     #[error("Cannot snapshot VM: {0}")]
198     Snapshot(#[source] MigratableError),
199 
200     #[error("Cannot restore VM: {0}")]
201     Restore(#[source] MigratableError),
202 
203     #[error("Cannot send VM snapshot: {0}")]
204     SnapshotSend(#[source] MigratableError),
205 
206     #[error("Invalid restore source URL")]
207     InvalidRestoreSourceUrl,
208 
209     #[error("Failed to validate config: {0}")]
210     ConfigValidation(#[source] ValidationError),
211 
212     #[error("Too many virtio-vsock devices")]
213     TooManyVsockDevices,
214 
215     #[error("Failed serializing into JSON: {0}")]
216     SerializeJson(#[source] serde_json::Error),
217 
218     #[error("Invalid NUMA configuration")]
219     InvalidNumaConfig,
220 
221     #[error("Cannot create seccomp filter: {0}")]
222     CreateSeccompFilter(#[source] seccompiler::Error),
223 
224     #[error("Cannot apply seccomp filter: {0}")]
225     ApplySeccompFilter(#[source] seccompiler::Error),
226 
227     #[error("Failed resizing a memory zone")]
228     ResizeZone,
229 
230     #[error("Cannot activate virtio devices: {0:?}")]
231     ActivateVirtioDevices(DeviceManagerError),
232 
233     #[error("Error triggering power button: {0:?}")]
234     PowerButton(DeviceManagerError),
235 
236     #[error("Kernel lacks PVH header")]
237     KernelMissingPvhHeader,
238 
239     #[error("Failed to allocate firmware RAM: {0:?}")]
240     AllocateFirmwareMemory(MemoryManagerError),
241 
242     #[error("Error manipulating firmware file: {0}")]
243     FirmwareFile(#[source] std::io::Error),
244 
245     #[error("Firmware too big")]
246     FirmwareTooLarge,
247 
248     #[error("Failed to copy firmware to memory: {0}")]
249     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
250 
251     #[cfg(feature = "sev_snp")]
252     #[error("Error enabling SEV-SNP VM: {0}")]
253     InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),
254 
255     #[cfg(feature = "tdx")]
256     #[error("Error performing I/O on TDX firmware file: {0}")]
257     LoadTdvf(#[source] std::io::Error),
258 
259     #[cfg(feature = "tdx")]
260     #[error("Error performing I/O on the TDX payload file: {0}")]
261     LoadPayload(#[source] std::io::Error),
262 
263     #[cfg(feature = "tdx")]
264     #[error("Error parsing TDVF: {0}")]
265     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
266 
267     #[cfg(feature = "tdx")]
268     #[error("Error populating TDX HOB: {0}")]
269     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
270 
271     #[cfg(feature = "tdx")]
272     #[error("Error allocating TDVF memory: {0:?}")]
273     AllocatingTdvfMemory(crate::memory_manager::Error),
274 
275     #[cfg(feature = "tdx")]
276     #[error("Error enabling TDX VM: {0}")]
277     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
278 
279     #[cfg(feature = "tdx")]
280     #[error("Error enabling TDX memory region: {0}")]
281     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
282 
283     #[cfg(feature = "tdx")]
284     #[error("Error finalizing TDX VM: {0}")]
285     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
286 
287     #[cfg(feature = "tdx")]
288     #[error("TDX firmware missing")]
289     TdxFirmwareMissing,
290 
291     #[cfg(feature = "tdx")]
292     #[error("Invalid TDX payload type")]
293     InvalidPayloadType,
294 
295     #[cfg(feature = "guest_debug")]
296     #[error("Error debugging VM: {0:?}")]
297     Debug(DebuggableError),
298 
299     #[error("Error spawning kernel loading thread")]
300     KernelLoadThreadSpawn(std::io::Error),
301 
302     #[error("Error joining kernel loading thread")]
303     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
304 
305     #[error("Payload configuration is not bootable")]
306     InvalidPayload,
307 
308     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
309     #[error("Error coredumping VM: {0:?}")]
310     Coredump(GuestDebuggableError),
311 
312     #[cfg(feature = "igvm")]
313     #[error("Cannot open igvm file: {0}")]
314     IgvmFile(#[source] io::Error),
315 
316     #[cfg(feature = "igvm")]
317     #[error("Cannot load the igvm into memory: {0}")]
318     IgvmLoad(#[source] igvm_loader::Error),
319 
320     #[error("Error injecting NMI")]
321     ErrorNmi,
322 }
/// Result alias whose error type is this module's [`Error`].
pub type Result<T> = result::Result<T, Error>;
324 
/// Lifecycle state of a [`Vm`].
///
/// The set of permitted moves between states is defined by
/// `VmState::valid_transition`.
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    // Initial state of a VM that was not built from a snapshot.
    Created,
    Running,
    Shutdown,
    // Also the initial state of a VM that was built from a snapshot.
    Paused,
    // NOTE(review): presumably entered when the guest is stopped by the
    // debugger (guest_debug feature) — confirm against the callers.
    BreakPoint,
}
333 
334 impl VmState {
335     fn valid_transition(self, new_state: VmState) -> Result<()> {
336         match self {
337             VmState::Created => match new_state {
338                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
339                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
340                     Ok(())
341                 }
342             },
343 
344             VmState::Running => match new_state {
345                 VmState::Created | VmState::Running => {
346                     Err(Error::InvalidStateTransition(self, new_state))
347                 }
348                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
349             },
350 
351             VmState::Shutdown => match new_state {
352                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
353                     Err(Error::InvalidStateTransition(self, new_state))
354                 }
355                 VmState::Running => Ok(()),
356             },
357 
358             VmState::Paused => match new_state {
359                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
360                     Err(Error::InvalidStateTransition(self, new_state))
361                 }
362                 VmState::Running | VmState::Shutdown => Ok(()),
363             },
364             VmState::BreakPoint => match new_state {
365                 VmState::Created | VmState::Running => Ok(()),
366                 _ => Err(Error::InvalidStateTransition(self, new_state)),
367             },
368         }
369     }
370 }
371 
/// Implementation of the hypervisor's `VmOps` callbacks for this VMM.
///
/// It forwards guest memory accesses to the guest memory object and MMIO/PIO
/// accesses to the corresponding device buses.
struct VmOpsHandler {
    // Guest memory used to serve `guest_mem_read` / `guest_mem_write`.
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    // Port I/O bus; only present on x86_64.
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    // Memory-mapped I/O bus.
    mmio_bus: Arc<Bus>,
}
378 
379 impl VmOps for VmOpsHandler {
380     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
381         self.memory
382             .memory()
383             .write(buf, GuestAddress(gpa))
384             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
385     }
386 
387     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
388         self.memory
389             .memory()
390             .read(buf, GuestAddress(gpa))
391             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
392     }
393 
394     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
395         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
396             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
397         }
398         Ok(())
399     }
400 
401     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
402         match self.mmio_bus.write(gpa, data) {
403             Err(vm_device::BusError::MissingAddressRange) => {
404                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
405             }
406             Ok(Some(barrier)) => {
407                 info!("Waiting for barrier");
408                 barrier.wait();
409                 info!("Barrier released");
410             }
411             _ => {}
412         };
413         Ok(())
414     }
415 
416     #[cfg(target_arch = "x86_64")]
417     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
418         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
419             info!("Guest PIO read to unregistered address 0x{:x}", port);
420         }
421         Ok(())
422     }
423 
424     #[cfg(target_arch = "x86_64")]
425     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
426         match self.io_bus.write(port, data) {
427             Err(vm_device::BusError::MissingAddressRange) => {
428                 info!("Guest PIO write to unregistered address 0x{:x}", port);
429             }
430             Ok(Some(barrier)) => {
431                 info!("Waiting for barrier");
432                 barrier.wait();
433                 info!("Barrier released");
434             }
435             _ => {}
436         };
437         Ok(())
438     }
439 }
440 
441 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
442     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
443 
444     cmp::min(host_phys_bits, max_phys_bits)
445 }
446 
/// A virtual machine together with the managers (CPU, memory, devices) that
/// make it up.
pub struct Vm {
    // Kernel file opened from the payload configuration, if one was given
    // (TDX builds only).
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    // Initramfs file opened from the payload configuration, if one was given.
    initramfs: Option<File>,
    // Join handles of threads owned by the VM.
    // NOTE(review): presumably the signal handler thread — confirm.
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    // Current lifecycle state; starts as `Created`, or `Paused` when the VM
    // was built from a snapshot.
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    // Clock data taken from the VM snapshot, if any (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
    // NUMA topology built from the NUMA section of the configuration.
    numa_nodes: NumaNodes,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    // Set from the `gdb` configuration flag when the `guest_debug` feature is
    // enabled; always `false` otherwise.
    stop_on_boot: bool,
    // Handle of the asynchronous payload loading thread, which yields the
    // guest entry point. `None` when the VM was built from a snapshot.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}
468 
469 impl Vm {
    /// Signals declared as handled by the VM; currently only `SIGWINCH`.
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
471 
    /// Builds a [`Vm`] on top of an already created hypervisor VM and memory
    /// manager.
    ///
    /// The steps are, in order: validate the configuration, start loading the
    /// payload (fresh boot only), build the NUMA nodes, create the CPU
    /// manager and the boot vCPUs, create the device manager and its devices,
    /// and open the kernel (TDX builds only) and initramfs files referenced
    /// by the payload configuration.
    ///
    /// When `snapshot` is `Some`, payload loading is skipped, the CPU and
    /// device managers are handed their respective sub-snapshots, and the
    /// resulting VM starts in `VmState::Paused` rather than
    /// `VmState::Created`.
    ///
    /// # Errors
    ///
    /// Returns, among others, `Error::ConfigValidation`, `Error::CpuManager`,
    /// `Error::EventFdClone`, `Error::DeviceManager`, `Error::KernelFile`,
    /// `Error::InitramfsFile` and `Error::Restore`, as well as any error
    /// produced while starting the payload load or creating the NUMA nodes.
    ///
    /// # Panics
    ///
    /// Panics if one of the mutexes locked here is poisoned.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        timestamp: Instant,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        debug_console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new_from_memory_manager");

        // Validation yields the boot id list that is later handed to the
        // device manager.
        let boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        // Without IGVM the payload load can start right away; it is only
        // needed for a fresh boot, not when restoring from a snapshot.
        #[cfg(not(feature = "igvm"))]
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(&memory_manager, &config)?
        } else {
            None
        };

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        // `force_iommu` follows the TDX or SEV-SNP setting, depending on
        // which feature is compiled in, and is `false` when neither is.
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
        #[cfg(feature = "tdx")]
        let force_iommu = tdx_enabled;
        #[cfg(feature = "sev_snp")]
        let force_iommu = sev_snp_enabled;
        #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
        let force_iommu = false;

        #[cfg(feature = "guest_debug")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "guest_debug"))]
        let stop_on_boot = false;

        // The buses are shared between the VmOps handler (used by the CPU
        // manager) and the device manager created further down.
        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::new(Bus::new());
        let mmio_bus = Arc::new(Bus::new());

        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus: io_bus.clone(),
            mmio_bus: mmio_bus.clone(),
        });

        // Work on a clone of the CPU configuration rather than on the locked
        // config itself.
        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            vm.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt.try_clone().map_err(Error::EventFdClone)?,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            &hypervisor,
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        )
        .map_err(Error::CpuManager)?;

        #[cfg(target_arch = "x86_64")]
        cpu_manager
            .lock()
            .unwrap()
            .populate_cpuid(
                &memory_manager,
                &hypervisor,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CpuManager)?;

        // Loading the igvm file is pushed down here because
        // igvm parser needs cpu_manager to retrieve cpuid leaf.
        // For the regular case, we can start loading early, but for
        // igvm case we have to wait until cpu_manager is created.
        // Currently, Microsoft Hypervisor does not provide any
        // Hypervisor specific common cpuid, we need to call get_cpuid_values
        // per cpuid through cpu_manager.
        #[cfg(feature = "igvm")]
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(
                &memory_manager,
                &config,
                &cpu_manager,
                #[cfg(feature = "sev_snp")]
                sev_snp_enabled,
            )?
        } else {
            None
        };
        // The initial TDX configuration must be done before the vCPUs are
        // created
        #[cfg(feature = "tdx")]
        if tdx_enabled {
            let cpuid = cpu_manager.lock().unwrap().common_cpuid();
            let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
            vm.tdx_init(&cpuid, max_vcpus)
                .map_err(Error::InitializeTdxVm)?;
        }

        // The CPU manager receives its own sub-snapshot, if there is one.
        cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
            .map_err(Error::CpuManager)?;

        // This initial SEV-SNP configuration must be done immediately after
        // vCPUs are created. As part of this initialization we are
        // transitioning the guest into secure state.
        #[cfg(feature = "sev_snp")]
        if sev_snp_enabled {
            vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
        }

        // `dynamic` is switched off for TDX guests and on in every other
        // case; it is forwarded as-is to the device manager.
        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        let device_manager = DeviceManager::new(
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            hypervisor.hypervisor_type(),
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            cpu_manager.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            boot_id_list,
            timestamp,
            snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
            dynamic,
        )
        .map_err(Error::DeviceManager)?;

        device_manager
            .lock()
            .unwrap()
            .create_devices(
                serial_pty,
                console_pty,
                debug_console_pty,
                console_resize_pipe,
                original_termios,
            )
            .map_err(Error::DeviceManager)?;

        // Open the kernel file named by the payload configuration, if any.
        // A missing payload or kernel path yields `None`; a failed open is an
        // error.
        #[cfg(feature = "tdx")]
        let kernel = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.kernel.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::KernelFile)?;

        // Same handling for the initramfs file.
        let initramfs = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.initramfs.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::InitramfsFile)?;

        // When restoring, carry over the clock stored in the VM snapshot.
        #[cfg(target_arch = "x86_64")]
        let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
            let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
            vm_snapshot.clock
        } else {
            None
        };

        // A restored VM starts out paused; a fresh one starts out created.
        let vm_state = if snapshot.is_some() {
            VmState::Paused
        } else {
            VmState::Created
        };

        Ok(Vm {
            #[cfg(feature = "tdx")]
            kernel,
            initramfs,
            device_manager,
            config,
            threads: Vec::with_capacity(1),
            state: RwLock::new(vm_state),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(target_arch = "x86_64")]
            saved_clock,
            numa_nodes,
            hypervisor,
            stop_on_boot,
            load_payload_handle,
        })
    }
707 
708     fn create_numa_nodes(
709         configs: Option<Vec<NumaConfig>>,
710         memory_manager: &Arc<Mutex<MemoryManager>>,
711     ) -> Result<NumaNodes> {
712         let mm = memory_manager.lock().unwrap();
713         let mm_zones = mm.memory_zones();
714         let mut numa_nodes = BTreeMap::new();
715 
716         if let Some(configs) = &configs {
717             for config in configs.iter() {
718                 if numa_nodes.contains_key(&config.guest_numa_id) {
719                     error!("Can't define twice the same NUMA node");
720                     return Err(Error::InvalidNumaConfig);
721                 }
722 
723                 let mut node = NumaNode::default();
724 
725                 if let Some(memory_zones) = &config.memory_zones {
726                     for memory_zone in memory_zones.iter() {
727                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
728                             node.memory_regions.extend(mm_zone.regions().clone());
729                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
730                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
731                             }
732                             node.memory_zones.push(memory_zone.clone());
733                         } else {
734                             error!("Unknown memory zone '{}'", memory_zone);
735                             return Err(Error::InvalidNumaConfig);
736                         }
737                     }
738                 }
739 
740                 if let Some(cpus) = &config.cpus {
741                     node.cpus.extend(cpus);
742                 }
743 
744                 if let Some(pci_segments) = &config.pci_segments {
745                     node.pci_segments.extend(pci_segments);
746                 }
747 
748                 if let Some(distances) = &config.distances {
749                     for distance in distances.iter() {
750                         let dest = distance.destination;
751                         let dist = distance.distance;
752 
753                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
754                             error!("Unknown destination NUMA node {}", dest);
755                             return Err(Error::InvalidNumaConfig);
756                         }
757 
758                         if node.distances.contains_key(&dest) {
759                             error!("Destination NUMA node {} has been already set", dest);
760                             return Err(Error::InvalidNumaConfig);
761                         }
762 
763                         node.distances.insert(dest, dist);
764                     }
765                 }
766 
767                 #[cfg(target_arch = "x86_64")]
768                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
769                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
770                         let mm_sections = sgx_epc_region.epc_sections();
771                         for sgx_epc_section in sgx_epc_sections.iter() {
772                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
773                                 node.sgx_epc_sections.push(mm_section.clone());
774                             } else {
775                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
776                                 return Err(Error::InvalidNumaConfig);
777                             }
778                         }
779                     } else {
780                         error!("Missing SGX EPC region");
781                         return Err(Error::InvalidNumaConfig);
782                     }
783                 }
784 
785                 numa_nodes.insert(config.guest_numa_id, node);
786             }
787         }
788 
789         Ok(numa_nodes)
790     }
791 
792     #[allow(clippy::too_many_arguments)]
793     pub fn new(
794         vm_config: Arc<Mutex<VmConfig>>,
795         exit_evt: EventFd,
796         reset_evt: EventFd,
797         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
798         seccomp_action: &SeccompAction,
799         hypervisor: Arc<dyn hypervisor::Hypervisor>,
800         activate_evt: EventFd,
801         serial_pty: Option<PtyPair>,
802         console_pty: Option<PtyPair>,
803         debug_console_pty: Option<PtyPair>,
804         console_resize_pipe: Option<File>,
805         original_termios: Arc<Mutex<Option<termios>>>,
806         snapshot: Option<Snapshot>,
807         source_url: Option<&str>,
808         prefault: Option<bool>,
809     ) -> Result<Self> {
810         trace_scoped!("Vm::new");
811 
812         let timestamp = Instant::now();
813 
814         #[cfg(feature = "tdx")]
815         let tdx_enabled = if snapshot.is_some() {
816             false
817         } else {
818             vm_config.lock().unwrap().is_tdx_enabled()
819         };
820 
821         #[cfg(feature = "sev_snp")]
822         let sev_snp_enabled = if snapshot.is_some() {
823             false
824         } else {
825             vm_config.lock().unwrap().is_sev_snp_enabled()
826         };
827 
828         let vm = Self::create_hypervisor_vm(
829             &hypervisor,
830             #[cfg(feature = "tdx")]
831             tdx_enabled,
832             #[cfg(feature = "sev_snp")]
833             sev_snp_enabled,
834         )?;
835 
836         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
837 
838         let memory_manager = if let Some(snapshot) =
839             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
840         {
841             MemoryManager::new_from_snapshot(
842                 &snapshot,
843                 vm.clone(),
844                 &vm_config.lock().unwrap().memory.clone(),
845                 source_url,
846                 prefault.unwrap(),
847                 phys_bits,
848             )
849             .map_err(Error::MemoryManager)?
850         } else {
851             #[cfg(target_arch = "x86_64")]
852             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
853 
854             MemoryManager::new(
855                 vm.clone(),
856                 &vm_config.lock().unwrap().memory.clone(),
857                 None,
858                 phys_bits,
859                 #[cfg(feature = "tdx")]
860                 tdx_enabled,
861                 None,
862                 None,
863                 #[cfg(target_arch = "x86_64")]
864                 sgx_epc_config,
865             )
866             .map_err(Error::MemoryManager)?
867         };
868 
869         Vm::new_from_memory_manager(
870             vm_config,
871             memory_manager,
872             vm,
873             exit_evt,
874             reset_evt,
875             #[cfg(feature = "guest_debug")]
876             vm_debug_evt,
877             seccomp_action,
878             hypervisor,
879             activate_evt,
880             timestamp,
881             serial_pty,
882             console_pty,
883             debug_console_pty,
884             console_resize_pipe,
885             original_termios,
886             snapshot,
887         )
888     }
889 
890     pub fn create_hypervisor_vm(
891         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
892         #[cfg(feature = "tdx")] tdx_enabled: bool,
893         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
894     ) -> Result<Arc<dyn hypervisor::Vm>> {
895         hypervisor.check_required_extensions().unwrap();
896 
897         cfg_if::cfg_if! {
898             if #[cfg(feature = "tdx")] {
899                 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true
900                 // Otherwise KVM_X86_LEGACY_VM: 0
901                 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM
902                 let vm = hypervisor
903                     .create_vm_with_type(u64::from(tdx_enabled))
904                     .unwrap();
905             } else if #[cfg(feature = "sev_snp")] {
906                 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true
907                 // Otherwise SEV_SNP_DISABLED: 0
908                 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false
909                 let vm = hypervisor
910                     .create_vm_with_type(u64::from(sev_snp_enabled))
911                     .unwrap();
912             } else {
913                 let vm = hypervisor.create_vm().unwrap();
914             }
915         }
916 
917         #[cfg(target_arch = "x86_64")]
918         {
919             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
920                 .unwrap();
921             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
922             vm.enable_split_irq().unwrap();
923         }
924 
925         Ok(vm)
926     }
927 
928     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
929         let initramfs = self.initramfs.as_mut().unwrap();
930         let size: usize = initramfs
931             .seek(SeekFrom::End(0))
932             .map_err(|_| Error::InitramfsLoad)?
933             .try_into()
934             .unwrap();
935         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
936 
937         let address =
938             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
939         let address = GuestAddress(address);
940 
941         guest_mem
942             .read_volatile_from(address, initramfs, size)
943             .map_err(|_| Error::InitramfsLoad)?;
944 
945         info!("Initramfs loaded: address = 0x{:x}", address.0);
946         Ok(arch::InitramfsConfig { address, size })
947     }
948 
949     pub fn generate_cmdline(
950         payload: &PayloadConfig,
951         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
952     ) -> Result<Cmdline> {
953         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
954         if let Some(s) = payload.cmdline.as_ref() {
955             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
956         }
957 
958         #[cfg(target_arch = "aarch64")]
959         for entry in device_manager.lock().unwrap().cmdline_additions() {
960             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
961         }
962         Ok(cmdline)
963     }
964 
/// Loads the UEFI `firmware` image into the memory manager's UEFI flash
/// region at `arch::layout::UEFI_START`.
///
/// # Errors
///
/// Returns `Error::UefiLoad` if `arch::aarch64::uefi::load_uefi` fails.
///
/// # Panics
///
/// Panics if the memory manager mutex is poisoned.
#[cfg(target_arch = "aarch64")]
fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
    let flash_region = memory_manager.lock().unwrap().uefi_flash();
    let flash_memory = flash_region.memory();

    arch::aarch64::uefi::load_uefi(
        flash_memory.deref(),
        arch::layout::UEFI_START,
        &mut firmware,
    )
    .map_err(Error::UefiLoad)?;

    Ok(())
}
973 
/// Loads either a kernel or a firmware image into guest memory and returns
/// the resulting entry point.
///
/// Exactly one of `firmware` and `kernel` must be provided:
/// * a kernel is first loaded as a PE image at `arch::layout::KERNEL_START`;
///   if the loader rejects its magic number, the same file is loaded as a
///   UEFI binary instead;
/// * a firmware is always loaded as a UEFI binary.
///
/// # Errors
///
/// Returns `Error::InvalidPayload` for any other combination of arguments,
/// `Error::KernelLoad` for PE loading failures other than an invalid magic
/// number, and propagates errors from `load_firmware`.
#[cfg(target_arch = "aarch64")]
fn load_kernel(
    firmware: Option<File>,
    kernel: Option<File>,
    memory_manager: Arc<Mutex<MemoryManager>>,
) -> Result<EntryPoint> {
    let guest_memory = memory_manager.lock().unwrap().guest_memory();
    let mem = guest_memory.memory();

    let entry_addr = match (firmware, kernel) {
        (None, Some(mut kernel)) => {
            // Try to load the binary as a kernel PE file first. If that
            // fails on the magic number, retry it as a UEFI binary. As the
            // UEFI binary is formatless, it must be the last option tried.
            let pe_result = linux_loader::loader::pe::PE::load(
                mem.deref(),
                Some(arch::layout::KERNEL_START),
                &mut kernel,
                None,
            );

            match pe_result {
                Ok(loaded) => loaded.kernel_load,
                Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                    Self::load_firmware(&kernel, memory_manager)?;
                    arch::layout::UEFI_START
                }
                Err(e) => return Err(Error::KernelLoad(e)),
            }
        }
        (Some(firmware), None) => {
            Self::load_firmware(&firmware, memory_manager)?;
            arch::layout::UEFI_START
        }
        _ => return Err(Error::InvalidPayload),
    };

    Ok(EntryPoint { entry_addr })
}
1012 
/// Loads an IGVM file through `igvm_loader::load_igvm` and returns the
/// entry point taken from the loader result.
///
/// With the `sev_snp` feature, the entry address is `vmsa_gpa` when the CPU
/// manager reports SEV-SNP as enabled and `vmsa.rip` otherwise. Without the
/// feature it is always `vmsa.rip`. The returned `setup_header` is `None`.
///
/// # Errors
///
/// Returns `Error::IgvmLoad` if the loader fails.
#[cfg(feature = "igvm")]
fn load_igvm(
    igvm: File,
    memory_manager: Arc<Mutex<MemoryManager>>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    #[cfg(feature = "sev_snp")] host_data: &Option<String>,
) -> Result<EntryPoint> {
    let res = igvm_loader::load_igvm(
        &igvm,
        memory_manager,
        cpu_manager.clone(),
        "",
        #[cfg(feature = "sev_snp")]
        host_data,
    )
    .map_err(Error::IgvmLoad)?;

    // Only the raw address differs between configurations; the entry point
    // itself is assembled once below.
    cfg_if::cfg_if! {
        if #[cfg(feature = "sev_snp")] {
            let entry_gpa = if cpu_manager.lock().unwrap().sev_snp_enabled() {
                res.vmsa_gpa
            } else {
                res.vmsa.rip
            };
        } else {
            let entry_gpa = res.vmsa.rip;
        }
    };

    Ok(EntryPoint {
        entry_addr: vm_memory::GuestAddress(entry_gpa),
        setup_header: None,
    })
}
1043 
1044     #[cfg(target_arch = "x86_64")]
1045     fn load_kernel(
1046         mut kernel: File,
1047         cmdline: Option<Cmdline>,
1048         memory_manager: Arc<Mutex<MemoryManager>>,
1049     ) -> Result<EntryPoint> {
1050         info!("Loading kernel");
1051 
1052         let mem = {
1053             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1054             guest_memory.memory()
1055         };
1056 
1057         // Try ELF binary with PVH boot.
1058         let entry_addr = linux_loader::loader::elf::Elf::load(
1059             mem.deref(),
1060             None,
1061             &mut kernel,
1062             Some(arch::layout::HIGH_RAM_START),
1063         )
1064         // Try loading kernel as bzImage.
1065         .or_else(|_| {
1066             BzImage::load(
1067                 mem.deref(),
1068                 None,
1069                 &mut kernel,
1070                 Some(arch::layout::HIGH_RAM_START),
1071             )
1072         })
1073         .map_err(Error::KernelLoad)?;
1074 
1075         if let Some(cmdline) = cmdline {
1076             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1077                 .map_err(Error::LoadCmdLine)?;
1078         }
1079 
1080         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1081             // Use the PVH kernel entry point to boot the guest
1082             info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1083             Ok(EntryPoint {
1084                 entry_addr,
1085                 setup_header: None,
1086             })
1087         } else if entry_addr.setup_header.is_some() {
1088             // Use the bzImage 32bit entry point to boot the guest
1089             info!(
1090                 "bzImage kernel loaded: entry_addr = 0x{:x}",
1091                 entry_addr.kernel_load.0
1092             );
1093             Ok(EntryPoint {
1094                 entry_addr: entry_addr.kernel_load,
1095                 setup_header: entry_addr.setup_header,
1096             })
1097         } else {
1098             Err(Error::KernelMissingPvhHeader)
1099         }
1100     }
1101 
1102     #[cfg(target_arch = "x86_64")]
1103     fn load_payload(
1104         payload: &PayloadConfig,
1105         memory_manager: Arc<Mutex<MemoryManager>>,
1106         #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1107         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1108     ) -> Result<EntryPoint> {
1109         trace_scoped!("load_payload");
1110         #[cfg(feature = "igvm")]
1111         {
1112             if let Some(_igvm_file) = &payload.igvm {
1113                 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
1114                 #[cfg(feature = "sev_snp")]
1115                 if sev_snp_enabled {
1116                     return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
1117                 }
1118                 #[cfg(not(feature = "sev_snp"))]
1119                 return Self::load_igvm(igvm, memory_manager, cpu_manager);
1120             }
1121         }
1122         match (
1123             &payload.firmware,
1124             &payload.kernel,
1125             &payload.initramfs,
1126             &payload.cmdline,
1127         ) {
1128             (Some(firmware), None, None, None) => {
1129                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1130                 Self::load_kernel(firmware, None, memory_manager)
1131             }
1132             (None, Some(kernel), _, _) => {
1133                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1134                 let cmdline = Self::generate_cmdline(payload)?;
1135                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1136             }
1137             _ => Err(Error::InvalidPayload),
1138         }
1139     }
1140 
/// Opens the firmware or kernel file named in `payload` and loads it
/// through `load_kernel`.
///
/// Exactly one of `payload.firmware` and `payload.kernel` must be set.
///
/// # Errors
///
/// Returns `Error::FirmwareFile` or `Error::KernelFile` when the file cannot
/// be opened, `Error::InvalidPayload` when both or neither are set, and
/// propagates errors from `load_kernel`.
#[cfg(target_arch = "aarch64")]
fn load_payload(
    payload: &PayloadConfig,
    memory_manager: Arc<Mutex<MemoryManager>>,
) -> Result<EntryPoint> {
    // Open whichever file is configured, then make a single loader call.
    let (firmware, kernel) = match (payload.firmware.as_ref(), payload.kernel.as_ref()) {
        (Some(firmware_path), None) => {
            let firmware = File::open(firmware_path).map_err(Error::FirmwareFile)?;
            (Some(firmware), None)
        }
        (None, Some(kernel_path)) => {
            let kernel = File::open(kernel_path).map_err(Error::KernelFile)?;
            (None, Some(kernel))
        }
        _ => return Err(Error::InvalidPayload),
    };

    Self::load_kernel(firmware, kernel, memory_manager)
}
1158 
1159     fn load_payload_async(
1160         memory_manager: &Arc<Mutex<MemoryManager>>,
1161         config: &Arc<Mutex<VmConfig>>,
1162         #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
1163         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
1164     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1165         // Kernel with TDX is loaded in a different manner
1166         #[cfg(feature = "tdx")]
1167         if config.lock().unwrap().is_tdx_enabled() {
1168             return Ok(None);
1169         }
1170 
1171         config
1172             .lock()
1173             .unwrap()
1174             .payload
1175             .as_ref()
1176             .map(|payload| {
1177                 let memory_manager = memory_manager.clone();
1178                 let payload = payload.clone();
1179                 #[cfg(feature = "igvm")]
1180                 let cpu_manager = cpu_manager.clone();
1181 
1182                 std::thread::Builder::new()
1183                     .name("payload_loader".into())
1184                     .spawn(move || {
1185                         Self::load_payload(
1186                             &payload,
1187                             memory_manager,
1188                             #[cfg(feature = "igvm")]
1189                             cpu_manager,
1190                             #[cfg(feature = "sev_snp")]
1191                             sev_snp_enabled,
1192                         )
1193                     })
1194                     .map_err(Error::KernelLoadThreadSpawn)
1195             })
1196             .transpose()
1197     }
1198 
1199     #[cfg(target_arch = "x86_64")]
1200     fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
1201         trace_scoped!("configure_system");
1202         info!("Configuring system");
1203         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1204 
1205         let initramfs_config = match self.initramfs {
1206             Some(_) => Some(self.load_initramfs(&mem)?),
1207             None => None,
1208         };
1209 
1210         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1211         let rsdp_addr = Some(rsdp_addr);
1212         let sgx_epc_region = self
1213             .memory_manager
1214             .lock()
1215             .unwrap()
1216             .sgx_epc_region()
1217             .as_ref()
1218             .cloned();
1219 
1220         let serial_number = self
1221             .config
1222             .lock()
1223             .unwrap()
1224             .platform
1225             .as_ref()
1226             .and_then(|p| p.serial_number.clone());
1227 
1228         let uuid = self
1229             .config
1230             .lock()
1231             .unwrap()
1232             .platform
1233             .as_ref()
1234             .and_then(|p| p.uuid.clone());
1235 
1236         let oem_strings = self
1237             .config
1238             .lock()
1239             .unwrap()
1240             .platform
1241             .as_ref()
1242             .and_then(|p| p.oem_strings.clone());
1243 
1244         let oem_strings = oem_strings
1245             .as_deref()
1246             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1247 
1248         let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1249 
1250         arch::configure_system(
1251             &mem,
1252             arch::layout::CMDLINE_START,
1253             arch::layout::CMDLINE_MAX_SIZE,
1254             &initramfs_config,
1255             boot_vcpus,
1256             entry_addr.setup_header,
1257             rsdp_addr,
1258             sgx_epc_region,
1259             serial_number.as_deref(),
1260             uuid.as_deref(),
1261             oem_strings.as_deref(),
1262             topology,
1263         )
1264         .map_err(Error::ConfigureSystem)?;
1265         Ok(())
1266     }
1267 
/// Gathers the boot-time parameters and passes them to
/// `arch::configure_system`.
///
/// The inputs are the generated kernel command line, the vCPU MPIDRs and
/// topology, the device information and PCI segment layout from the device
/// manager, the optional initramfs, the virtio-iommu BDF when one is
/// reported, the vGIC, the NUMA nodes and whether the PMU could be
/// initialized. Both arguments of this function are unused.
///
/// # Errors
///
/// Returns `Error::ConfigureSystem` if the vGIC cannot be obtained, the PMU
/// cannot be initialized or `arch::configure_system` fails, and propagates
/// errors from `generate_cmdline` and `load_initramfs`.
///
/// # Panics
///
/// Panics if a mutex is poisoned, if the configuration has no payload, if
/// the device manager has no interrupt controller, or if the command line
/// cannot be converted to a C string or to UTF-8.
#[cfg(target_arch = "aarch64")]
fn configure_system(
    &mut self,
    _rsdp_addr: GuestAddress,
    _entry_addr: EntryPoint,
) -> Result<()> {
    let cmdline = Self::generate_cmdline(
        self.config.lock().unwrap().payload.as_ref().unwrap(),
        &self.device_manager,
    )?;
    let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
    let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
    let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

    let initramfs_config = if self.initramfs.is_some() {
        Some(self.load_initramfs(&mem)?)
    } else {
        None
    };

    let device_info = self
        .device_manager
        .lock()
        .unwrap()
        .get_device_info()
        .clone();

    // One entry per PCI segment; the size covers the inclusive range from
    // the start to the end of the segment's 64-bit memory area.
    let pci_space_info: Vec<PciSpaceInfo> = self
        .device_manager
        .lock()
        .unwrap()
        .pci_segments()
        .iter()
        .map(|segment| PciSpaceInfo {
            pci_segment_id: segment.id,
            mmio_config_address: segment.mmio_config_address,
            pci_device_space_start: segment.start_of_mem64_area,
            pci_device_space_size: segment.end_of_mem64_area - segment.start_of_mem64_area + 1,
        })
        .collect();

    let virtio_iommu_bdf = self
        .device_manager
        .lock()
        .unwrap()
        .iommu_attached_devices()
        .as_ref()
        .map(|(bdf, _)| *bdf);

    let vgic = self
        .device_manager
        .lock()
        .unwrap()
        .get_interrupt_controller()
        .unwrap()
        .lock()
        .unwrap()
        .get_vgic()
        .map_err(|_| {
            Error::ConfigureSystem(arch::Error::PlatformSpecific(
                arch::aarch64::Error::SetupGic,
            ))
        })?;

    // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
    let pmu_irq = arch::aarch64::fdt::AARCH64_PMU_IRQ + 16;
    let pmu_supported = self
        .cpu_manager
        .lock()
        .unwrap()
        .init_pmu(pmu_irq)
        .map_err(|_| {
            Error::ConfigureSystem(arch::Error::PlatformSpecific(
                arch::aarch64::Error::VcpuInitPmu,
            ))
        })?;

    let cmdline_cstring = cmdline.as_cstring().unwrap();
    let cmdline_str = cmdline_cstring.to_str().unwrap();

    arch::configure_system(
        &mem,
        cmdline_str,
        vcpu_mpidrs,
        vcpu_topology,
        &device_info,
        &initramfs_config,
        &pci_space_info,
        virtio_iommu_bdf.map(|bdf| bdf.into()),
        &vgic,
        &self.numa_nodes,
        pmu_supported,
    )
    .map_err(Error::ConfigureSystem)?;

    Ok(())
}
1358 
1359     pub fn serial_pty(&self) -> Option<PtyPair> {
1360         self.device_manager.lock().unwrap().serial_pty()
1361     }
1362 
1363     pub fn console_pty(&self) -> Option<PtyPair> {
1364         self.device_manager.lock().unwrap().console_pty()
1365     }
1366 
1367     pub fn debug_console_pty(&self) -> Option<PtyPair> {
1368         self.device_manager.lock().unwrap().debug_console_pty()
1369     }
1370 
1371     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1372         self.device_manager.lock().unwrap().console_resize_pipe()
1373     }
1374 
1375     pub fn shutdown(&mut self) -> Result<()> {
1376         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1377         let new_state = VmState::Shutdown;
1378 
1379         state.valid_transition(new_state)?;
1380 
1381         // Wake up the DeviceManager threads so they will get terminated cleanly
1382         self.device_manager
1383             .lock()
1384             .unwrap()
1385             .resume()
1386             .map_err(Error::Resume)?;
1387 
1388         self.cpu_manager
1389             .lock()
1390             .unwrap()
1391             .shutdown()
1392             .map_err(Error::CpuManager)?;
1393 
1394         // Wait for all the threads to finish
1395         for thread in self.threads.drain(..) {
1396             thread.join().map_err(Error::ThreadCleanup)?
1397         }
1398         *state = new_state;
1399 
1400         Ok(())
1401     }
1402 
1403     pub fn resize(
1404         &mut self,
1405         desired_vcpus: Option<u8>,
1406         desired_memory: Option<u64>,
1407         desired_balloon: Option<u64>,
1408     ) -> Result<()> {
1409         event!("vm", "resizing");
1410 
1411         if let Some(desired_vcpus) = desired_vcpus {
1412             if self
1413                 .cpu_manager
1414                 .lock()
1415                 .unwrap()
1416                 .resize(desired_vcpus)
1417                 .map_err(Error::CpuManager)?
1418             {
1419                 self.device_manager
1420                     .lock()
1421                     .unwrap()
1422                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1423                     .map_err(Error::DeviceManager)?;
1424             }
1425             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1426         }
1427 
1428         if let Some(desired_memory) = desired_memory {
1429             let new_region = self
1430                 .memory_manager
1431                 .lock()
1432                 .unwrap()
1433                 .resize(desired_memory)
1434                 .map_err(Error::MemoryManager)?;
1435 
1436             let memory_config = &mut self.config.lock().unwrap().memory;
1437 
1438             if let Some(new_region) = &new_region {
1439                 self.device_manager
1440                     .lock()
1441                     .unwrap()
1442                     .update_memory(new_region)
1443                     .map_err(Error::DeviceManager)?;
1444 
1445                 match memory_config.hotplug_method {
1446                     HotplugMethod::Acpi => {
1447                         self.device_manager
1448                             .lock()
1449                             .unwrap()
1450                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1451                             .map_err(Error::DeviceManager)?;
1452                     }
1453                     HotplugMethod::VirtioMem => {}
1454                 }
1455             }
1456 
1457             // We update the VM config regardless of the actual guest resize
1458             // operation result (happened or not), so that if the VM reboots
1459             // it will be running with the last configure memory size.
1460             match memory_config.hotplug_method {
1461                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1462                 HotplugMethod::VirtioMem => {
1463                     if desired_memory > memory_config.size {
1464                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1465                     } else {
1466                         memory_config.hotplugged_size = None;
1467                     }
1468                 }
1469             }
1470         }
1471 
1472         if let Some(desired_balloon) = desired_balloon {
1473             self.device_manager
1474                 .lock()
1475                 .unwrap()
1476                 .resize_balloon(desired_balloon)
1477                 .map_err(Error::DeviceManager)?;
1478 
1479             // Update the configuration value for the balloon size to ensure
1480             // a reboot would use the right value.
1481             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1482                 balloon_config.size = desired_balloon;
1483             }
1484         }
1485 
1486         event!("vm", "resized");
1487 
1488         Ok(())
1489     }
1490 
1491     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1492         let memory_config = &mut self.config.lock().unwrap().memory;
1493 
1494         if let Some(zones) = &mut memory_config.zones {
1495             for zone in zones.iter_mut() {
1496                 if zone.id == id {
1497                     if desired_memory >= zone.size {
1498                         let hotplugged_size = desired_memory - zone.size;
1499                         self.memory_manager
1500                             .lock()
1501                             .unwrap()
1502                             .resize_zone(&id, desired_memory - zone.size)
1503                             .map_err(Error::MemoryManager)?;
1504                         // We update the memory zone config regardless of the
1505                         // actual 'resize-zone' operation result (happened or
1506                         // not), so that if the VM reboots it will be running
1507                         // with the last configured memory zone size.
1508                         zone.hotplugged_size = Some(hotplugged_size);
1509 
1510                         return Ok(());
1511                     } else {
1512                         error!(
1513                             "Invalid to ask less ({}) than boot RAM ({}) for \
1514                             this memory zone",
1515                             desired_memory, zone.size,
1516                         );
1517                         return Err(Error::ResizeZone);
1518                     }
1519                 }
1520             }
1521         }
1522 
1523         error!("Could not find the memory zone {} for the resize", id);
1524         Err(Error::ResizeZone)
1525     }
1526 
1527     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1528         let pci_device_info = self
1529             .device_manager
1530             .lock()
1531             .unwrap()
1532             .add_device(&mut device_cfg)
1533             .map_err(Error::DeviceManager)?;
1534 
1535         // Update VmConfig by adding the new device. This is important to
1536         // ensure the device would be created in case of a reboot.
1537         {
1538             let mut config = self.config.lock().unwrap();
1539             add_to_config(&mut config.devices, device_cfg);
1540         }
1541 
1542         self.device_manager
1543             .lock()
1544             .unwrap()
1545             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1546             .map_err(Error::DeviceManager)?;
1547 
1548         Ok(pci_device_info)
1549     }
1550 
1551     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1552         let pci_device_info = self
1553             .device_manager
1554             .lock()
1555             .unwrap()
1556             .add_user_device(&mut device_cfg)
1557             .map_err(Error::DeviceManager)?;
1558 
1559         // Update VmConfig by adding the new device. This is important to
1560         // ensure the device would be created in case of a reboot.
1561         {
1562             let mut config = self.config.lock().unwrap();
1563             add_to_config(&mut config.user_devices, device_cfg);
1564         }
1565 
1566         self.device_manager
1567             .lock()
1568             .unwrap()
1569             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1570             .map_err(Error::DeviceManager)?;
1571 
1572         Ok(pci_device_info)
1573     }
1574 
1575     pub fn remove_device(&mut self, id: String) -> Result<()> {
1576         self.device_manager
1577             .lock()
1578             .unwrap()
1579             .remove_device(id.clone())
1580             .map_err(Error::DeviceManager)?;
1581 
1582         // Update VmConfig by removing the device. This is important to
1583         // ensure the device would not be created in case of a reboot.
1584         self.config.lock().unwrap().remove_device(&id);
1585 
1586         self.device_manager
1587             .lock()
1588             .unwrap()
1589             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1590             .map_err(Error::DeviceManager)?;
1591         Ok(())
1592     }
1593 
1594     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1595         let pci_device_info = self
1596             .device_manager
1597             .lock()
1598             .unwrap()
1599             .add_disk(&mut disk_cfg)
1600             .map_err(Error::DeviceManager)?;
1601 
1602         // Update VmConfig by adding the new device. This is important to
1603         // ensure the device would be created in case of a reboot.
1604         {
1605             let mut config = self.config.lock().unwrap();
1606             add_to_config(&mut config.disks, disk_cfg);
1607         }
1608 
1609         self.device_manager
1610             .lock()
1611             .unwrap()
1612             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1613             .map_err(Error::DeviceManager)?;
1614 
1615         Ok(pci_device_info)
1616     }
1617 
1618     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1619         let pci_device_info = self
1620             .device_manager
1621             .lock()
1622             .unwrap()
1623             .add_fs(&mut fs_cfg)
1624             .map_err(Error::DeviceManager)?;
1625 
1626         // Update VmConfig by adding the new device. This is important to
1627         // ensure the device would be created in case of a reboot.
1628         {
1629             let mut config = self.config.lock().unwrap();
1630             add_to_config(&mut config.fs, fs_cfg);
1631         }
1632 
1633         self.device_manager
1634             .lock()
1635             .unwrap()
1636             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1637             .map_err(Error::DeviceManager)?;
1638 
1639         Ok(pci_device_info)
1640     }
1641 
1642     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1643         let pci_device_info = self
1644             .device_manager
1645             .lock()
1646             .unwrap()
1647             .add_pmem(&mut pmem_cfg)
1648             .map_err(Error::DeviceManager)?;
1649 
1650         // Update VmConfig by adding the new device. This is important to
1651         // ensure the device would be created in case of a reboot.
1652         {
1653             let mut config = self.config.lock().unwrap();
1654             add_to_config(&mut config.pmem, pmem_cfg);
1655         }
1656 
1657         self.device_manager
1658             .lock()
1659             .unwrap()
1660             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1661             .map_err(Error::DeviceManager)?;
1662 
1663         Ok(pci_device_info)
1664     }
1665 
1666     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1667         let pci_device_info = self
1668             .device_manager
1669             .lock()
1670             .unwrap()
1671             .add_net(&mut net_cfg)
1672             .map_err(Error::DeviceManager)?;
1673 
1674         // Update VmConfig by adding the new device. This is important to
1675         // ensure the device would be created in case of a reboot.
1676         {
1677             let mut config = self.config.lock().unwrap();
1678             add_to_config(&mut config.net, net_cfg);
1679         }
1680 
1681         self.device_manager
1682             .lock()
1683             .unwrap()
1684             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1685             .map_err(Error::DeviceManager)?;
1686 
1687         Ok(pci_device_info)
1688     }
1689 
1690     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1691         let pci_device_info = self
1692             .device_manager
1693             .lock()
1694             .unwrap()
1695             .add_vdpa(&mut vdpa_cfg)
1696             .map_err(Error::DeviceManager)?;
1697 
1698         // Update VmConfig by adding the new device. This is important to
1699         // ensure the device would be created in case of a reboot.
1700         {
1701             let mut config = self.config.lock().unwrap();
1702             add_to_config(&mut config.vdpa, vdpa_cfg);
1703         }
1704 
1705         self.device_manager
1706             .lock()
1707             .unwrap()
1708             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1709             .map_err(Error::DeviceManager)?;
1710 
1711         Ok(pci_device_info)
1712     }
1713 
1714     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1715         let pci_device_info = self
1716             .device_manager
1717             .lock()
1718             .unwrap()
1719             .add_vsock(&mut vsock_cfg)
1720             .map_err(Error::DeviceManager)?;
1721 
1722         // Update VmConfig by adding the new device. This is important to
1723         // ensure the device would be created in case of a reboot.
1724         {
1725             let mut config = self.config.lock().unwrap();
1726             config.vsock = Some(vsock_cfg);
1727         }
1728 
1729         self.device_manager
1730             .lock()
1731             .unwrap()
1732             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1733             .map_err(Error::DeviceManager)?;
1734 
1735         Ok(pci_device_info)
1736     }
1737 
1738     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1739         Ok(self.device_manager.lock().unwrap().counters())
1740     }
1741 
1742     #[cfg(feature = "tdx")]
1743     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1744         use arch::x86_64::tdx::*;
1745 
1746         let firmware_path = self
1747             .config
1748             .lock()
1749             .unwrap()
1750             .payload
1751             .as_ref()
1752             .unwrap()
1753             .firmware
1754             .clone()
1755             .ok_or(Error::TdxFirmwareMissing)?;
1756         // The TDVF file contains a table of section as well as code
1757         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1758 
1759         // For all the sections allocate some RAM backing them
1760         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1761     }
1762 
1763     #[cfg(feature = "tdx")]
1764     fn hob_memory_resources(
1765         mut sorted_sections: Vec<TdvfSection>,
1766         guest_memory: &GuestMemoryMmap,
1767     ) -> Vec<(u64, u64, bool)> {
1768         let mut list = Vec::new();
1769 
1770         let mut current_section = sorted_sections.pop();
1771 
1772         // RAM regions interleaved with TDVF sections
1773         let mut next_start_addr = 0;
1774         for region in guest_memory.iter() {
1775             let region_start = region.start_addr().0;
1776             let region_end = region.last_addr().0;
1777             if region_start > next_start_addr {
1778                 next_start_addr = region_start;
1779             }
1780 
1781             loop {
1782                 let (start, size, ram) = if let Some(section) = &current_section {
1783                     if section.address <= next_start_addr {
1784                         (section.address, section.size, false)
1785                     } else {
1786                         let last_addr = std::cmp::min(section.address - 1, region_end);
1787                         (next_start_addr, last_addr - next_start_addr + 1, true)
1788                     }
1789                 } else {
1790                     (next_start_addr, region_end - next_start_addr + 1, true)
1791                 };
1792 
1793                 list.push((start, size, ram));
1794 
1795                 if !ram {
1796                     current_section = sorted_sections.pop();
1797                 }
1798 
1799                 next_start_addr = start + size;
1800 
1801                 if region_start > next_start_addr {
1802                     next_start_addr = region_start;
1803                 }
1804 
1805                 if next_start_addr > region_end {
1806                     break;
1807                 }
1808             }
1809         }
1810 
1811         // Once all the interleaved sections have been processed, let's simply
1812         // pull the remaining ones.
1813         if let Some(section) = current_section {
1814             list.push((section.address, section.size, false));
1815         }
1816         while let Some(section) = sorted_sections.pop() {
1817             list.push((section.address, section.size, false));
1818         }
1819 
1820         list
1821     }
1822 
1823     #[cfg(feature = "tdx")]
1824     fn populate_tdx_sections(
1825         &mut self,
1826         sections: &[TdvfSection],
1827         guid_found: bool,
1828     ) -> Result<Option<u64>> {
1829         use arch::x86_64::tdx::*;
1830         // Get the memory end *before* we start adding TDVF ram regions
1831         let boot_guest_memory = self
1832             .memory_manager
1833             .lock()
1834             .as_ref()
1835             .unwrap()
1836             .boot_guest_memory();
1837         for section in sections {
1838             // No need to allocate if the section falls within guest RAM ranges
1839             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1840                 info!(
1841                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1842                     section
1843                 );
1844                 continue;
1845             }
1846 
1847             info!("Allocating TDVF Section: {:x?}", section);
1848             self.memory_manager
1849                 .lock()
1850                 .unwrap()
1851                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1852                 .map_err(Error::AllocatingTdvfMemory)?;
1853         }
1854 
1855         // The TDVF file contains a table of section as well as code
1856         let firmware_path = self
1857             .config
1858             .lock()
1859             .unwrap()
1860             .payload
1861             .as_ref()
1862             .unwrap()
1863             .firmware
1864             .clone()
1865             .ok_or(Error::TdxFirmwareMissing)?;
1866         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1867 
1868         // The guest memory at this point now has all the required regions so it
1869         // is safe to copy from the TDVF file into it.
1870         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1871         let mem = guest_memory.memory();
1872         let mut payload_info = None;
1873         let mut hob_offset = None;
1874         for section in sections {
1875             info!("Populating TDVF Section: {:x?}", section);
1876             match section.r#type {
1877                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1878                     info!("Copying section to guest memory");
1879                     firmware_file
1880                         .seek(SeekFrom::Start(section.data_offset as u64))
1881                         .map_err(Error::LoadTdvf)?;
1882                     mem.read_volatile_from(
1883                         GuestAddress(section.address),
1884                         &mut firmware_file,
1885                         section.data_size as usize,
1886                     )
1887                     .unwrap();
1888                 }
1889                 TdvfSectionType::TdHob => {
1890                     hob_offset = Some(section.address);
1891                 }
1892                 TdvfSectionType::Payload => {
1893                     info!("Copying payload to guest memory");
1894                     if let Some(payload_file) = self.kernel.as_mut() {
1895                         let payload_size = payload_file
1896                             .seek(SeekFrom::End(0))
1897                             .map_err(Error::LoadPayload)?;
1898 
1899                         payload_file
1900                             .seek(SeekFrom::Start(0x1f1))
1901                             .map_err(Error::LoadPayload)?;
1902 
1903                         let mut payload_header = linux_loader::bootparam::setup_header::default();
1904                         payload_file
1905                             .read_volatile(&mut payload_header.as_bytes())
1906                             .unwrap();
1907 
1908                         if payload_header.header != 0x5372_6448 {
1909                             return Err(Error::InvalidPayloadType);
1910                         }
1911 
1912                         if (payload_header.version < 0x0200)
1913                             || ((payload_header.loadflags & 0x1) == 0x0)
1914                         {
1915                             return Err(Error::InvalidPayloadType);
1916                         }
1917 
1918                         payload_file.rewind().map_err(Error::LoadPayload)?;
1919                         mem.read_volatile_from(
1920                             GuestAddress(section.address),
1921                             payload_file,
1922                             payload_size as usize,
1923                         )
1924                         .unwrap();
1925 
1926                         // Create the payload info that will be inserted into
1927                         // the HOB.
1928                         payload_info = Some(PayloadInfo {
1929                             image_type: PayloadImageType::BzImage,
1930                             entry_point: section.address,
1931                         });
1932                     }
1933                 }
1934                 TdvfSectionType::PayloadParam => {
1935                     info!("Copying payload parameters to guest memory");
1936                     let cmdline = Self::generate_cmdline(
1937                         self.config.lock().unwrap().payload.as_ref().unwrap(),
1938                     )?;
1939                     mem.write_slice(
1940                         cmdline.as_cstring().unwrap().as_bytes_with_nul(),
1941                         GuestAddress(section.address),
1942                     )
1943                     .unwrap();
1944                 }
1945                 _ => {}
1946             }
1947         }
1948 
1949         // Generate HOB
1950         let mut hob = TdHob::start(hob_offset.unwrap());
1951 
1952         let mut sorted_sections = sections.to_vec();
1953         sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));
1954 
1955         sorted_sections.sort_by_key(|section| section.address);
1956         sorted_sections.reverse();
1957 
1958         for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
1959             hob.add_memory_resource(&mem, start, size, ram, guid_found)
1960                 .map_err(Error::PopulateHob)?;
1961         }
1962 
1963         // MMIO regions
1964         hob.add_mmio_resource(
1965             &mem,
1966             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1967             arch::layout::APIC_START.raw_value()
1968                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1969         )
1970         .map_err(Error::PopulateHob)?;
1971         let start_of_device_area = self
1972             .memory_manager
1973             .lock()
1974             .unwrap()
1975             .start_of_device_area()
1976             .raw_value();
1977         let end_of_device_area = self
1978             .memory_manager
1979             .lock()
1980             .unwrap()
1981             .end_of_device_area()
1982             .raw_value();
1983         hob.add_mmio_resource(
1984             &mem,
1985             start_of_device_area,
1986             end_of_device_area - start_of_device_area,
1987         )
1988         .map_err(Error::PopulateHob)?;
1989 
1990         // Loop over the ACPI tables and copy them to the HOB.
1991 
1992         for acpi_table in crate::acpi::create_acpi_tables_tdx(
1993             &self.device_manager,
1994             &self.cpu_manager,
1995             &self.memory_manager,
1996             &self.numa_nodes,
1997         ) {
1998             hob.add_acpi_table(&mem, acpi_table.as_slice())
1999                 .map_err(Error::PopulateHob)?;
2000         }
2001 
2002         // If a payload info has been created, let's insert it into the HOB.
2003         if let Some(payload_info) = payload_info {
2004             hob.add_payload(&mem, payload_info)
2005                 .map_err(Error::PopulateHob)?;
2006         }
2007 
2008         hob.finish(&mem).map_err(Error::PopulateHob)?;
2009 
2010         Ok(hob_offset)
2011     }
2012 
2013     #[cfg(feature = "tdx")]
2014     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
2015         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2016         let mem = guest_memory.memory();
2017 
2018         for section in sections {
2019             self.vm
2020                 .tdx_init_memory_region(
2021                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
2022                     section.address,
2023                     section.size,
2024                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
2025                     section.attributes == 1,
2026                 )
2027                 .map_err(Error::InitializeTdxMemoryRegion)?;
2028         }
2029 
2030         Ok(())
2031     }
2032 
2033     // Creates ACPI tables
2034     // In case of TDX being used, this is a no-op since the tables will be
2035     // created and passed when populating the HOB.
2036 
2037     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2038         #[cfg(feature = "tdx")]
2039         if self.config.lock().unwrap().is_tdx_enabled() {
2040             return None;
2041         }
2042         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2043         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2044         let rsdp_addr = crate::acpi::create_acpi_tables(
2045             &mem,
2046             &self.device_manager,
2047             &self.cpu_manager,
2048             &self.memory_manager,
2049             &self.numa_nodes,
2050             tpm_enabled,
2051         );
2052         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2053 
2054         Some(rsdp_addr)
2055     }
2056 
2057     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2058         trace_scoped!("entry_point");
2059 
2060         self.load_payload_handle
2061             .take()
2062             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2063             .transpose()
2064     }
2065 
    /// Boots the VM, or resumes it if it is currently paused.
    ///
    /// For a fresh boot the sequence is: validate the state transition,
    /// create the ACPI tables (early on x86_64, after vCPU configuration on
    /// aarch64), wait for the payload to be loaded, configure the vCPUs,
    /// optionally set up TDX, configure the system, and finally start the boot
    /// vCPUs. The VM state is only updated once the vCPUs have been started.
    ///
    /// # Errors
    ///
    /// Returns an error if the state is poisoned, the transition is invalid,
    /// or any of the steps above fails.
    pub fn boot(&mut self) -> Result<()> {
        trace_scoped!("Vm::boot");
        let current_state = self.get_state()?;
        // Booting a paused VM simply resumes it.
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        // With `stop_on_boot` set, the vCPUs are started in the BreakPoint
        // state instead of Running.
        let new_state = if self.stop_on_boot {
            VmState::BreakPoint
        } else {
            VmState::Running
        };
        current_state.valid_transition(new_state)?;

        // Do earlier to parallelise with loading kernel
        #[cfg(target_arch = "x86_64")]
        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
                let rsdp_addr = if sev_snp_enabled {
                    // In case of SEV-SNP guest ACPI tables are provided via
                    // IGVM. So skip the creation of ACPI tables and set the
                    // rsdp addr to None.
                    None
                } else {
                    self.create_acpi_tables()
                };
            } else {
                let rsdp_addr = self.create_acpi_tables();
            }
        }

        // Load kernel synchronously or if asynchronous then wait for load to
        // finish.
        let entry_point = self.entry_point()?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        // Configure the vcpus that have been created
        let vcpus = self.cpu_manager.lock().unwrap().vcpus();
        for vcpu in vcpus {
            let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
            // Boot setup is only passed along when a payload entry point exists.
            let boot_setup = entry_point.map(|e| (e, guest_memory));
            self.cpu_manager
                .lock()
                .unwrap()
                .configure_vcpu(vcpu, boot_setup)
                .map_err(Error::CpuManager)?;
        }

        #[cfg(feature = "tdx")]
        let (sections, guid_found) = if tdx_enabled {
            self.extract_tdvf_sections()?
        } else {
            (Vec::new(), false)
        };

        // Configuring the TDX regions requires that the vCPUs are created.
        #[cfg(feature = "tdx")]
        let hob_address = if tdx_enabled {
            // TDX sections are written to memory.
            self.populate_tdx_sections(&sections, guid_found)?
        } else {
            None
        };

        // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
        // available after they are configured
        #[cfg(target_arch = "aarch64")]
        let rsdp_addr = self.create_acpi_tables();

        // Configure shared state based on loaded kernel
        entry_point
            .map(|entry_point| {
                // Safe to unwrap rsdp_addr as we know it can't be None when
                // the entry_point is Some.
                self.configure_system(rsdp_addr.unwrap(), entry_point)
            })
            .transpose()?;

        #[cfg(target_arch = "x86_64")]
        // Note: For x86, always call this function before invoking start boot vcpus.
        // Otherwise guest would fail to boot because we haven't created the
        // userspace mappings to update the hypervisor about the memory mappings.
        // These mappings must be created before we start the vCPU threads for
        // the very first time.
        self.memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            // Let the hypervisor know which memory ranges are shared with the
            // guest. This prevents the guest from ignoring/discarding memory
            // regions provided by the host.
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured TDX setup is complete
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus(new_state == VmState::BreakPoint)
            .map_err(Error::CpuManager)?;

        // Only record the new state once the vCPUs have been started.
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        Ok(())
    }
2186 
2187     pub fn restore(&mut self) -> Result<()> {
2188         event!("vm", "restoring");
2189 
2190         #[cfg(target_arch = "x86_64")]
2191         // Note: For x86, always call this function before invoking start boot vcpus.
2192         // Otherwise guest would fail to boot because we haven't created the
2193         // userspace mappings to update the hypervisor about the memory mappings.
2194         // These mappings must be created before we start the vCPU threads for
2195         // the very first time for the restored VM.
2196         self.memory_manager
2197             .lock()
2198             .unwrap()
2199             .allocate_address_space()
2200             .map_err(Error::MemoryManager)?;
2201 
2202         // Now we can start all vCPUs from here.
2203         self.cpu_manager
2204             .lock()
2205             .unwrap()
2206             .start_restored_vcpus()
2207             .map_err(Error::CpuManager)?;
2208 
2209         event!("vm", "restored");
2210         Ok(())
2211     }
2212 
2213     /// Gets a thread-safe reference counted pointer to the VM configuration.
2214     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2215         Arc::clone(&self.config)
2216     }
2217 
2218     /// Get the VM state. Returns an error if the state is poisoned.
2219     pub fn get_state(&self) -> Result<VmState> {
2220         self.state
2221             .try_read()
2222             .map_err(|_| Error::PoisonedState)
2223             .map(|state| *state)
2224     }
2225 
2226     /// Gets the actual size of the balloon.
2227     pub fn balloon_size(&self) -> u64 {
2228         self.device_manager.lock().unwrap().balloon_size()
2229     }
2230 
2231     pub fn send_memory_fds(
2232         &mut self,
2233         socket: &mut UnixStream,
2234     ) -> std::result::Result<(), MigratableError> {
2235         for (slot, fd) in self
2236             .memory_manager
2237             .lock()
2238             .unwrap()
2239             .memory_slot_fds()
2240             .drain()
2241         {
2242             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2243                 .write_to(socket)
2244                 .map_err(|e| {
2245                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2246                 })?;
2247             socket
2248                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2249                 .map_err(|e| {
2250                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2251                 })?;
2252 
2253             let res = Response::read_from(socket)?;
2254             if res.status() != Status::Ok {
2255                 warn!("Error during memory fd migration");
2256                 Request::abandon().write_to(socket)?;
2257                 Response::read_from(socket).ok();
2258                 return Err(MigratableError::MigrateSend(anyhow!(
2259                     "Error during memory fd migration"
2260                 )));
2261             }
2262         }
2263 
2264         Ok(())
2265     }
2266 
2267     pub fn send_memory_regions<F>(
2268         &mut self,
2269         ranges: &MemoryRangeTable,
2270         fd: &mut F,
2271     ) -> std::result::Result<(), MigratableError>
2272     where
2273         F: WriteVolatile,
2274     {
2275         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2276         let mem = guest_memory.memory();
2277 
2278         for range in ranges.regions() {
2279             let mut offset: u64 = 0;
2280             // Here we are manually handling the retry in case we can't the
2281             // whole region at once because we can't use the implementation
2282             // from vm-memory::GuestMemory of write_all_to() as it is not
2283             // following the correct behavior. For more info about this issue
2284             // see: https://github.com/rust-vmm/vm-memory/issues/174
2285             loop {
2286                 let bytes_written = mem
2287                     .write_volatile_to(
2288                         GuestAddress(range.gpa + offset),
2289                         fd,
2290                         (range.length - offset) as usize,
2291                     )
2292                     .map_err(|e| {
2293                         MigratableError::MigrateSend(anyhow!(
2294                             "Error transferring memory to socket: {}",
2295                             e
2296                         ))
2297                     })?;
2298                 offset += bytes_written as u64;
2299 
2300                 if offset == range.length {
2301                     break;
2302                 }
2303             }
2304         }
2305 
2306         Ok(())
2307     }
2308 
2309     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2310         self.memory_manager
2311             .lock()
2312             .unwrap()
2313             .memory_range_table(false)
2314     }
2315 
2316     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2317         self.device_manager.lock().unwrap().device_tree()
2318     }
2319 
2320     pub fn activate_virtio_devices(&self) -> Result<()> {
2321         self.device_manager
2322             .lock()
2323             .unwrap()
2324             .activate_virtio_devices()
2325             .map_err(Error::ActivateVirtioDevices)
2326     }
2327 
2328     #[cfg(target_arch = "x86_64")]
2329     pub fn power_button(&self) -> Result<()> {
2330         return self
2331             .device_manager
2332             .lock()
2333             .unwrap()
2334             .notify_power_button()
2335             .map_err(Error::PowerButton);
2336     }
2337 
2338     #[cfg(target_arch = "aarch64")]
2339     pub fn power_button(&self) -> Result<()> {
2340         self.device_manager
2341             .lock()
2342             .unwrap()
2343             .notify_power_button()
2344             .map_err(Error::PowerButton)
2345     }
2346 
2347     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2348         self.memory_manager.lock().unwrap().snapshot_data()
2349     }
2350 
2351     #[cfg(feature = "guest_debug")]
2352     pub fn debug_request(
2353         &mut self,
2354         gdb_request: &GdbRequestPayload,
2355         cpu_id: usize,
2356     ) -> Result<GdbResponsePayload> {
2357         use GdbRequestPayload::*;
2358         match gdb_request {
2359             SetSingleStep(single_step) => {
2360                 self.set_guest_debug(cpu_id, &[], *single_step)
2361                     .map_err(Error::Debug)?;
2362             }
2363             SetHwBreakPoint(addrs) => {
2364                 self.set_guest_debug(cpu_id, addrs, false)
2365                     .map_err(Error::Debug)?;
2366             }
2367             Pause => {
2368                 self.debug_pause().map_err(Error::Debug)?;
2369             }
2370             Resume => {
2371                 self.debug_resume().map_err(Error::Debug)?;
2372             }
2373             ReadRegs => {
2374                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2375                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2376             }
2377             WriteRegs(regs) => {
2378                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2379             }
2380             ReadMem(vaddr, len) => {
2381                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2382                 let mem = self
2383                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2384                     .map_err(Error::Debug)?;
2385                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2386             }
2387             WriteMem(vaddr, data) => {
2388                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2389                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2390                     .map_err(Error::Debug)?;
2391             }
2392             ActiveVcpus => {
2393                 let active_vcpus = self.active_vcpus();
2394                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2395             }
2396         }
2397         Ok(GdbResponsePayload::CommandComplete)
2398     }
2399 
2400     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2401     fn get_dump_state(
2402         &mut self,
2403         destination_url: &str,
2404     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2405         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2406         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2407         let mut elf_phdr_num = 1;
2408         let elf_sh_info = 0;
2409         let coredump_file_path = url_to_file(destination_url)?;
2410         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2411 
2412         if mapping_num < UINT16_MAX - 2 {
2413             elf_phdr_num += mapping_num as u16;
2414         } else {
2415             panic!("mapping num beyond 65535 not supported");
2416         }
2417         let coredump_file = OpenOptions::new()
2418             .read(true)
2419             .write(true)
2420             .create_new(true)
2421             .open(coredump_file_path)
2422             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2423 
2424         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2425         let mem_data = self
2426             .memory_manager
2427             .lock()
2428             .unwrap()
2429             .coredump_memory_regions(mem_offset);
2430 
2431         Ok(DumpState {
2432             elf_note_size,
2433             elf_phdr_num,
2434             elf_sh_info,
2435             mem_offset,
2436             mem_info: Some(mem_data),
2437             file: Some(coredump_file),
2438         })
2439     }
2440 
2441     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2442     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2443         size_of::<elf::Elf64_Ehdr>() as u64
2444             + note_size as u64
2445             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2446     }
2447 
2448     pub fn nmi(&self) -> Result<()> {
2449         return self
2450             .cpu_manager
2451             .lock()
2452             .unwrap()
2453             .nmi()
2454             .map_err(|_| Error::ErrorNmi);
2455     }
2456 }
2457 
2458 impl Pausable for Vm {
2459     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2460         event!("vm", "pausing");
2461         let mut state = self
2462             .state
2463             .try_write()
2464             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2465         let new_state = VmState::Paused;
2466 
2467         state
2468             .valid_transition(new_state)
2469             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2470 
2471         #[cfg(target_arch = "x86_64")]
2472         {
2473             let mut clock = self
2474                 .vm
2475                 .get_clock()
2476                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2477             clock.reset_flags();
2478             self.saved_clock = Some(clock);
2479         }
2480 
2481         // Before pausing the vCPUs activate any pending virtio devices that might
2482         // need activation between starting the pause (or e.g. a migration it's part of)
2483         self.activate_virtio_devices().map_err(|e| {
2484             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2485         })?;
2486 
2487         self.cpu_manager.lock().unwrap().pause()?;
2488         self.device_manager.lock().unwrap().pause()?;
2489 
2490         *state = new_state;
2491 
2492         event!("vm", "paused");
2493         Ok(())
2494     }
2495 
2496     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2497         event!("vm", "resuming");
2498         let mut state = self
2499             .state
2500             .try_write()
2501             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2502         let new_state = VmState::Running;
2503 
2504         state
2505             .valid_transition(new_state)
2506             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2507 
2508         self.cpu_manager.lock().unwrap().resume()?;
2509         #[cfg(target_arch = "x86_64")]
2510         {
2511             if let Some(clock) = &self.saved_clock {
2512                 self.vm.set_clock(clock).map_err(|e| {
2513                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2514                 })?;
2515             }
2516         }
2517         self.device_manager.lock().unwrap().resume()?;
2518 
2519         // And we're back to the Running state.
2520         *state = new_state;
2521         event!("vm", "resumed");
2522         Ok(())
2523     }
2524 }
2525 
/// VM-level state stored at the root of a VM snapshot.
///
/// Built by `Vm::snapshot()` and handed to `Snapshot::new_from_state`.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    // Copy of `Vm::saved_clock`, which `pause()` fills in on x86_64.
    #[cfg(target_arch = "x86_64")]
    pub clock: Option<hypervisor::ClockData>,
    // CPUID entries produced by `arch::generate_common_cpuid` at snapshot time.
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}
2533 
/// Identifier returned by `Vm`'s `Snapshottable::id()` implementation.
pub const VM_SNAPSHOT_ID: &str = "vm";
2535 impl Snapshottable for Vm {
2536     fn id(&self) -> String {
2537         VM_SNAPSHOT_ID.to_string()
2538     }
2539 
2540     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2541         event!("vm", "snapshotting");
2542 
2543         #[cfg(feature = "tdx")]
2544         {
2545             if self.config.lock().unwrap().is_tdx_enabled() {
2546                 return Err(MigratableError::Snapshot(anyhow!(
2547                     "Snapshot not possible with TDX VM"
2548                 )));
2549             }
2550         }
2551 
2552         let current_state = self.get_state().unwrap();
2553         if current_state != VmState::Paused {
2554             return Err(MigratableError::Snapshot(anyhow!(
2555                 "Trying to snapshot while VM is running"
2556             )));
2557         }
2558 
2559         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2560         let common_cpuid = {
2561             let amx = self.config.lock().unwrap().cpus.features.amx;
2562             let phys_bits = physical_bits(
2563                 &self.hypervisor,
2564                 self.config.lock().unwrap().cpus.max_phys_bits,
2565             );
2566             arch::generate_common_cpuid(
2567                 &self.hypervisor,
2568                 &arch::CpuidConfig {
2569                     sgx_epc_sections: None,
2570                     phys_bits,
2571                     kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2572                     #[cfg(feature = "tdx")]
2573                     tdx: false,
2574                     amx,
2575                 },
2576             )
2577             .map_err(|e| {
2578                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2579             })?
2580         };
2581 
2582         let vm_snapshot_state = VmSnapshot {
2583             #[cfg(target_arch = "x86_64")]
2584             clock: self.saved_clock,
2585             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2586             common_cpuid,
2587         };
2588 
2589         let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?;
2590 
2591         let (id, snapshot) = {
2592             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2593             (cpu_manager.id(), cpu_manager.snapshot()?)
2594         };
2595         vm_snapshot.add_snapshot(id, snapshot);
2596         let (id, snapshot) = {
2597             let mut memory_manager = self.memory_manager.lock().unwrap();
2598             (memory_manager.id(), memory_manager.snapshot()?)
2599         };
2600         vm_snapshot.add_snapshot(id, snapshot);
2601         let (id, snapshot) = {
2602             let mut device_manager = self.device_manager.lock().unwrap();
2603             (device_manager.id(), device_manager.snapshot()?)
2604         };
2605         vm_snapshot.add_snapshot(id, snapshot);
2606 
2607         event!("vm", "snapshotted");
2608         Ok(vm_snapshot)
2609     }
2610 }
2611 
2612 impl Transportable for Vm {
2613     fn send(
2614         &self,
2615         snapshot: &Snapshot,
2616         destination_url: &str,
2617     ) -> std::result::Result<(), MigratableError> {
2618         let mut snapshot_config_path = url_to_path(destination_url)?;
2619         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2620 
2621         // Create the snapshot config file
2622         let mut snapshot_config_file = OpenOptions::new()
2623             .read(true)
2624             .write(true)
2625             .create_new(true)
2626             .open(snapshot_config_path)
2627             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2628 
2629         // Serialize and write the snapshot config
2630         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2631             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2632 
2633         snapshot_config_file
2634             .write(vm_config.as_bytes())
2635             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2636 
2637         let mut snapshot_state_path = url_to_path(destination_url)?;
2638         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2639 
2640         // Create the snapshot state file
2641         let mut snapshot_state_file = OpenOptions::new()
2642             .read(true)
2643             .write(true)
2644             .create_new(true)
2645             .open(snapshot_state_path)
2646             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2647 
2648         // Serialize and write the snapshot state
2649         let vm_state =
2650             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2651 
2652         snapshot_state_file
2653             .write(&vm_state)
2654             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2655 
2656         // Tell the memory manager to also send/write its own snapshot.
2657         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2658             self.memory_manager
2659                 .lock()
2660                 .unwrap()
2661                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2662         } else {
2663             return Err(MigratableError::Restore(anyhow!(
2664                 "Missing memory manager snapshot"
2665             )));
2666         }
2667 
2668         Ok(())
2669     }
2670 }
2671 
2672 impl Migratable for Vm {
2673     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2674         self.memory_manager.lock().unwrap().start_dirty_log()?;
2675         self.device_manager.lock().unwrap().start_dirty_log()
2676     }
2677 
2678     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2679         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2680         self.device_manager.lock().unwrap().stop_dirty_log()
2681     }
2682 
2683     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2684         Ok(MemoryRangeTable::new_from_tables(vec![
2685             self.memory_manager.lock().unwrap().dirty_log()?,
2686             self.device_manager.lock().unwrap().dirty_log()?,
2687         ]))
2688     }
2689 
2690     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2691         self.memory_manager.lock().unwrap().start_migration()?;
2692         self.device_manager.lock().unwrap().start_migration()
2693     }
2694 
2695     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2696         self.memory_manager.lock().unwrap().complete_migration()?;
2697         self.device_manager.lock().unwrap().complete_migration()
2698     }
2699 }
2700 
/// Debugger support for the VM.
///
/// Register, memory and breakpoint operations are delegated to the CPU
/// manager; pausing and resuming go through the VM's own `pause()` and
/// `resume()`.
#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Forwards the breakpoint addresses and single-step setting for vCPU
    /// `cpu_id` to the CPU manager.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM if it is running and marks it as `BreakPoint`.
    ///
    /// The state is overwritten directly, without going through
    /// `valid_transition`.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes the VM if it is currently in the `BreakPoint` state; does
    /// nothing otherwise.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            // A failure here is a resume failure, so report it as such.
            self.resume().map_err(DebuggableError::Resume)?;
        }

        Ok(())
    }

    /// Reads the core registers of vCPU `cpu_id` through the CPU manager.
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    /// Writes `regs` to vCPU `cpu_id` through the CPU manager.
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    /// Reads `len` bytes of guest memory at `vaddr` through the CPU manager,
    /// on behalf of vCPU `cpu_id`.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    /// Writes `data` to guest memory at `vaddr` through the CPU manager, on
    /// behalf of vCPU `cpu_id`.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Returns the number of active vCPUs, falling back to the configured
    /// number of boot vCPUs when none is active yet.
    fn active_vcpus(&self) -> usize {
        let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            self.cpu_manager.lock().unwrap().boot_vcpus() as usize
        }
    }
}
2784 
/// Largest value that fits in 16 bits, widened to `u32` so it can be compared
/// directly against `u32` counts.
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = u16::MAX as u32;
2787 
// No methods are overridden here, so `Vm` uses whatever default
// implementations the `Elf64Writable` trait provides.
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}
2790 
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes a coredump of the guest to the file designated by
    /// `destination_url`.
    ///
    /// A running VM is paused for the duration of the dump and resumed
    /// afterwards, whether or not the dump succeeded; a VM that was already
    /// paused is left paused. The dump consists of the ELF header, the note,
    /// the load headers, the per-CPU ELF and VMM notes, and finally the guest
    /// memory contents.
    ///
    /// # Errors
    ///
    /// Fails if the VM is a TDX VM, if it is neither running nor paused, or
    /// if pausing, dumping or resuming fails. When both the dump and the
    /// resume fail, the dump error is the one returned.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            if self.config.lock().unwrap().is_tdx_enabled() {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Coredump not possible with TDX VM"
                )));
            }
        }

        // Remember whether this call paused the VM, so that it only undoes
        // what it did itself.
        let resume = match self.get_state().unwrap() {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                true
            }
            VmState::Paused => false,
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        };

        // Run the whole dump as one fallible unit so that a failure at any
        // step still reaches the resume below instead of leaving the guest
        // paused.
        let dump_result = (|| -> std::result::Result<(), GuestDebuggableError> {
            let coredump_state = self.get_dump_state(destination_url)?;

            self.write_header(&coredump_state)?;
            self.write_note(&coredump_state)?;
            self.write_loads(&coredump_state)?;

            self.cpu_manager
                .lock()
                .unwrap()
                .cpu_write_elf64_note(&coredump_state)?;
            self.cpu_manager
                .lock()
                .unwrap()
                .cpu_write_vmm_note(&coredump_state)?;

            self.memory_manager
                .lock()
                .unwrap()
                .coredump_iterate_save_mem(&coredump_state)?;

            Ok(())
        })();

        let resume_result = if resume {
            self.resume().map_err(GuestDebuggableError::Resume)
        } else {
            Ok(())
        };

        // The dump error, if any, takes precedence over a resume error.
        dump_result.and(resume_result)
    }
}
2849 
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts, for the given starting `state`, which target states
    /// `valid_transition` accepts and which it rejects.
    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::BreakPoint => {
                // Check the transitions from Breakpoint
                assert!(state.valid_transition(VmState::Created).is_ok());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    // Without this test the BreakPoint arm of the helper is never exercised.
    #[test]
    fn test_vm_breakpoint_transitions() {
        test_vm_state_transitions(VmState::BreakPoint);
    }

    /// Checks `Vm::hob_memory_resources` against several layouts of TDVF
    /// sections and guest RAM regions. Each expected entry is a
    /// `(start, size, flag)` tuple; in every case below the flag is `true`
    /// for ranges coming from RAM and `false` for ranges coming from a TDVF
    /// section.
    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}
3125 
#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    // Size of each MMIO window used by the test devices.
    const LEN: u64 = 4096;

    /// Creating an FDT for a guest with a serial device, one virtio device
    /// and an RTC must succeed.
    #[test]
    fn test_create_fdt_with_devices() {
        let ram_size = (layout::FDT_MAX_SIZE + 0x1000) as usize;
        let mem = GuestMemoryMmap::from_ranges(&[(layout::RAM_START, ram_size)])
            .expect("Cannot initialize memory");

        // Three devices laid out back to back, each with its own interrupt.
        let serial = MmioDeviceInfo {
            addr: 0x00,
            len: LEN,
            irq: 33,
        };
        let virtio = MmioDeviceInfo {
            addr: LEN,
            len: LEN,
            irq: 34,
        };
        let rtc = MmioDeviceInfo {
            addr: 2 * LEN,
            len: LEN,
            irq: 35,
        };
        let dev_info: HashMap<(DeviceType, String), MmioDeviceInfo> = HashMap::from([
            ((DeviceType::Serial, DeviceType::Serial.to_string()), serial),
            ((DeviceType::Virtio(1), "virtio".to_string()), virtio),
            ((DeviceType::Rtc, "rtc".to_string()), rtc),
        ]);

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");

        let fdt = create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        );
        assert!(fdt.is_ok());
    }
}
3193 
/// Boots a tiny guest program on a single vCPU and runs it until the guest
/// halts.
///
/// The program adds two registers, writes the resulting digit and a newline
/// to port 0x3f8, then executes `hlt`. Any exit other than `Reset` or
/// `Ignore` fails the test.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};

    // Guest address at which the program is loaded and starts executing.
    const ENTRY_POINT: u64 = 0x1000;
    // Size of the single guest RAM region.
    const MEM_SIZE: usize = 0x1000;

    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee,  /* out %al, (%dx) */
        0xf4,  /* hlt */
    ];

    let load_addr = GuestAddress(ENTRY_POINT);
    let guest_mem = GuestMemoryMmap::from_ranges(&[(load_addr, MEM_SIZE)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every region of the guest memory with the hypervisor.
    for (slot, region) in guest_mem.iter().enumerate() {
        let user_region = vm.make_user_memory_region(
            slot as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(user_region)
            .expect("Cannot configure guest memory");
    }
    guest_mem
        .write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Zero the code segment base and selector.
    let mut sregs = vcpu.get_sregs().expect("get sregs failed");
    sregs.cs.base = 0;
    sregs.cs.selector = 0;
    vcpu.set_sregs(&sregs).expect("set sregs failed");

    // Start at the program with the two operands preloaded in rax and rbx.
    let mut regs = vcpu.get_regs().expect("get regs failed");
    regs.rip = ENTRY_POINT;
    regs.rax = 2;
    regs.rbx = 3;
    regs.rflags = 2;
    vcpu.set_regs(&regs).expect("set regs failed");

    loop {
        let exit = vcpu.run().expect("run failed");
        match exit {
            VmExit::Ignore => {}
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}
3258