xref: /cloud-hypervisor/vmm/src/vm.rs (revision 3ce0fef7fd546467398c914dbc74d8542e45cf6f)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
20 use crate::coredump::{
21     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
22 };
23 use crate::cpu;
24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair};
25 use crate::device_tree::DeviceTree;
26 #[cfg(feature = "guest_debug")]
27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
28 #[cfg(feature = "igvm")]
29 use crate::igvm::igvm_loader;
30 use crate::memory_manager::{
31     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
32 };
33 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
34 use crate::migration::get_vm_snapshot;
35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
36 use crate::migration::url_to_file;
37 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
38 use crate::GuestMemoryMmap;
39 use crate::{
40     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
41 };
42 use anyhow::anyhow;
43 use arch::get_host_cpu_phys_bits;
44 #[cfg(target_arch = "x86_64")]
45 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
46 #[cfg(feature = "tdx")]
47 use arch::x86_64::tdx::TdvfSection;
48 use arch::EntryPoint;
49 #[cfg(target_arch = "aarch64")]
50 use arch::PciSpaceInfo;
51 use arch::{NumaNode, NumaNodes};
52 #[cfg(target_arch = "aarch64")]
53 use devices::interrupt_controller;
54 use devices::AcpiNotificationFlags;
55 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
56 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
57 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
58 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
59 use hypervisor::{HypervisorVmError, VmOps};
60 use libc::{termios, SIGWINCH};
61 use linux_loader::cmdline::Cmdline;
62 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
63 use linux_loader::elf;
64 #[cfg(target_arch = "x86_64")]
65 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
66 #[cfg(target_arch = "aarch64")]
67 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
68 use linux_loader::loader::KernelLoader;
69 use seccompiler::SeccompAction;
70 use serde::{Deserialize, Serialize};
71 use std::cmp;
72 use std::collections::BTreeMap;
73 use std::collections::HashMap;
74 use std::convert::TryInto;
75 use std::fs::{File, OpenOptions};
76 use std::io::{self, Seek, SeekFrom, Write};
77 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
78 use std::mem::size_of;
79 use std::num::Wrapping;
80 use std::ops::Deref;
81 use std::os::unix::net::UnixStream;
82 use std::sync::{Arc, Mutex, RwLock};
83 use std::time::Instant;
84 use std::{result, str, thread};
85 use thiserror::Error;
86 use tracer::trace_scoped;
87 use vm_device::Bus;
88 #[cfg(feature = "tdx")]
89 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
90 use vm_memory::{
91     Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
92 };
93 use vm_migration::protocol::{Request, Response, Status};
94 use vm_migration::{
95     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
96     SnapshotData, Snapshottable, Transportable,
97 };
98 use vmm_sys_util::eventfd::EventFd;
99 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
100 
101 /// Errors associated with VM management
102 #[derive(Debug, Error)]
103 pub enum Error {
104     #[error("Cannot open kernel file: {0}")]
105     KernelFile(#[source] io::Error),
106 
107     #[error("Cannot open initramfs file: {0}")]
108     InitramfsFile(#[source] io::Error),
109 
110     #[error("Cannot load the kernel into memory: {0}")]
111     KernelLoad(#[source] linux_loader::loader::Error),
112 
113     #[cfg(target_arch = "aarch64")]
114     #[error("Cannot load the UEFI binary in memory: {0:?}")]
115     UefiLoad(arch::aarch64::uefi::Error),
116 
117     #[error("Cannot load the initramfs into memory")]
118     InitramfsLoad,
119 
120     #[error("Cannot load the kernel command line in memory: {0}")]
121     LoadCmdLine(#[source] linux_loader::loader::Error),
122 
123     #[error("Cannot modify the kernel command line: {0}")]
124     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
125 
126     #[error("Cannot create the kernel command line: {0}")]
127     CmdLineCreate(#[source] linux_loader::cmdline::Error),
128 
129     #[error("Cannot configure system: {0}")]
130     ConfigureSystem(#[source] arch::Error),
131 
132     #[cfg(target_arch = "aarch64")]
133     #[error("Cannot enable interrupt controller: {0:?}")]
134     EnableInterruptController(interrupt_controller::Error),
135 
136     #[error("VM state is poisoned")]
137     PoisonedState,
138 
139     #[error("Error from device manager: {0:?}")]
140     DeviceManager(DeviceManagerError),
141 
142     #[error("No device with id {0:?} to remove")]
143     NoDeviceToRemove(String),
144 
145     #[error("Cannot spawn a signal handler thread: {0}")]
146     SignalHandlerSpawn(#[source] io::Error),
147 
148     #[error("Failed to join on threads: {0:?}")]
149     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
150 
151     #[error("VM config is missing")]
152     VmMissingConfig,
153 
154     #[error("VM is not created")]
155     VmNotCreated,
156 
157     #[error("VM is already created")]
158     VmAlreadyCreated,
159 
160     #[error("VM is not running")]
161     VmNotRunning,
162 
163     #[error("Cannot clone EventFd: {0}")]
164     EventFdClone(#[source] io::Error),
165 
166     #[error("invalid VM state transition: {0:?} to {1:?}")]
167     InvalidStateTransition(VmState, VmState),
168 
169     #[error("Error from CPU manager: {0}")]
170     CpuManager(#[source] cpu::Error),
171 
172     #[error("Cannot pause devices: {0}")]
173     PauseDevices(#[source] MigratableError),
174 
175     #[error("Cannot resume devices: {0}")]
176     ResumeDevices(#[source] MigratableError),
177 
178     #[error("Cannot pause CPUs: {0}")]
179     PauseCpus(#[source] MigratableError),
180 
181     #[error("Cannot resume cpus: {0}")]
182     ResumeCpus(#[source] MigratableError),
183 
184     #[error("Cannot pause VM: {0}")]
185     Pause(#[source] MigratableError),
186 
187     #[error("Cannot resume VM: {0}")]
188     Resume(#[source] MigratableError),
189 
190     #[error("Memory manager error: {0:?}")]
191     MemoryManager(MemoryManagerError),
192 
193     #[error("Eventfd write error: {0}")]
194     EventfdError(#[source] std::io::Error),
195 
196     #[error("Cannot snapshot VM: {0}")]
197     Snapshot(#[source] MigratableError),
198 
199     #[error("Cannot restore VM: {0}")]
200     Restore(#[source] MigratableError),
201 
202     #[error("Cannot send VM snapshot: {0}")]
203     SnapshotSend(#[source] MigratableError),
204 
205     #[error("Invalid restore source URL")]
206     InvalidRestoreSourceUrl,
207 
208     #[error("Failed to validate config: {0}")]
209     ConfigValidation(#[source] ValidationError),
210 
211     #[error("Too many virtio-vsock devices")]
212     TooManyVsockDevices,
213 
214     #[error("Failed serializing into JSON: {0}")]
215     SerializeJson(#[source] serde_json::Error),
216 
217     #[error("Invalid NUMA configuration")]
218     InvalidNumaConfig,
219 
220     #[error("Cannot create seccomp filter: {0}")]
221     CreateSeccompFilter(#[source] seccompiler::Error),
222 
223     #[error("Cannot apply seccomp filter: {0}")]
224     ApplySeccompFilter(#[source] seccompiler::Error),
225 
226     #[error("Failed resizing a memory zone")]
227     ResizeZone,
228 
229     #[error("Cannot activate virtio devices: {0:?}")]
230     ActivateVirtioDevices(DeviceManagerError),
231 
232     #[error("Error triggering power button: {0:?}")]
233     PowerButton(DeviceManagerError),
234 
235     #[error("Kernel lacks PVH header")]
236     KernelMissingPvhHeader,
237 
238     #[error("Failed to allocate firmware RAM: {0:?}")]
239     AllocateFirmwareMemory(MemoryManagerError),
240 
241     #[error("Error manipulating firmware file: {0}")]
242     FirmwareFile(#[source] std::io::Error),
243 
244     #[error("Firmware too big")]
245     FirmwareTooLarge,
246 
247     #[error("Failed to copy firmware to memory: {0}")]
248     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
249 
250     #[cfg(feature = "sev_snp")]
251     #[error("Error enabling SEV-SNP VM: {0}")]
252     InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),
253 
254     #[cfg(feature = "tdx")]
255     #[error("Error performing I/O on TDX firmware file: {0}")]
256     LoadTdvf(#[source] std::io::Error),
257 
258     #[cfg(feature = "tdx")]
259     #[error("Error performing I/O on the TDX payload file: {0}")]
260     LoadPayload(#[source] std::io::Error),
261 
262     #[cfg(feature = "tdx")]
263     #[error("Error parsing TDVF: {0}")]
264     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
265 
266     #[cfg(feature = "tdx")]
267     #[error("Error populating TDX HOB: {0}")]
268     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
269 
270     #[cfg(feature = "tdx")]
271     #[error("Error allocating TDVF memory: {0:?}")]
272     AllocatingTdvfMemory(crate::memory_manager::Error),
273 
274     #[cfg(feature = "tdx")]
275     #[error("Error enabling TDX VM: {0}")]
276     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
277 
278     #[cfg(feature = "tdx")]
279     #[error("Error enabling TDX memory region: {0}")]
280     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
281 
282     #[cfg(feature = "tdx")]
283     #[error("Error finalizing TDX VM: {0}")]
284     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
285 
286     #[cfg(feature = "tdx")]
287     #[error("TDX firmware missing")]
288     TdxFirmwareMissing,
289 
290     #[cfg(feature = "tdx")]
291     #[error("Invalid TDX payload type")]
292     InvalidPayloadType,
293 
294     #[cfg(feature = "guest_debug")]
295     #[error("Error debugging VM: {0:?}")]
296     Debug(DebuggableError),
297 
298     #[error("Error spawning kernel loading thread")]
299     KernelLoadThreadSpawn(std::io::Error),
300 
301     #[error("Error joining kernel loading thread")]
302     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
303 
304     #[error("Payload configuration is not bootable")]
305     InvalidPayload,
306 
307     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
308     #[error("Error coredumping VM: {0:?}")]
309     Coredump(GuestDebuggableError),
310 
311     #[cfg(feature = "igvm")]
312     #[error("Cannot open igvm file: {0}")]
313     IgvmFile(#[source] io::Error),
314 
315     #[cfg(feature = "igvm")]
316     #[error("Cannot load the igvm into memory: {0}")]
317     IgvmLoad(#[source] igvm_loader::Error),
318 }
319 pub type Result<T> = result::Result<T, Error>;
320 
321 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
322 pub enum VmState {
323     Created,
324     Running,
325     Shutdown,
326     Paused,
327     BreakPoint,
328 }
329 
330 impl VmState {
331     fn valid_transition(self, new_state: VmState) -> Result<()> {
332         match self {
333             VmState::Created => match new_state {
334                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
335                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
336                     Ok(())
337                 }
338             },
339 
340             VmState::Running => match new_state {
341                 VmState::Created | VmState::Running => {
342                     Err(Error::InvalidStateTransition(self, new_state))
343                 }
344                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
345             },
346 
347             VmState::Shutdown => match new_state {
348                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
349                     Err(Error::InvalidStateTransition(self, new_state))
350                 }
351                 VmState::Running => Ok(()),
352             },
353 
354             VmState::Paused => match new_state {
355                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
356                     Err(Error::InvalidStateTransition(self, new_state))
357                 }
358                 VmState::Running | VmState::Shutdown => Ok(()),
359             },
360             VmState::BreakPoint => match new_state {
361                 VmState::Created | VmState::Running => Ok(()),
362                 _ => Err(Error::InvalidStateTransition(self, new_state)),
363             },
364         }
365     }
366 }
367 
368 struct VmOpsHandler {
369     memory: GuestMemoryAtomic<GuestMemoryMmap>,
370     #[cfg(target_arch = "x86_64")]
371     io_bus: Arc<Bus>,
372     mmio_bus: Arc<Bus>,
373 }
374 
375 impl VmOps for VmOpsHandler {
376     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
377         self.memory
378             .memory()
379             .write(buf, GuestAddress(gpa))
380             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
381     }
382 
383     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
384         self.memory
385             .memory()
386             .read(buf, GuestAddress(gpa))
387             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
388     }
389 
390     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
391         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
392             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
393         }
394         Ok(())
395     }
396 
397     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
398         match self.mmio_bus.write(gpa, data) {
399             Err(vm_device::BusError::MissingAddressRange) => {
400                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
401             }
402             Ok(Some(barrier)) => {
403                 info!("Waiting for barrier");
404                 barrier.wait();
405                 info!("Barrier released");
406             }
407             _ => {}
408         };
409         Ok(())
410     }
411 
412     #[cfg(target_arch = "x86_64")]
413     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
414         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
415             info!("Guest PIO read to unregistered address 0x{:x}", port);
416         }
417         Ok(())
418     }
419 
420     #[cfg(target_arch = "x86_64")]
421     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
422         match self.io_bus.write(port, data) {
423             Err(vm_device::BusError::MissingAddressRange) => {
424                 info!("Guest PIO write to unregistered address 0x{:x}", port);
425             }
426             Ok(Some(barrier)) => {
427                 info!("Waiting for barrier");
428                 barrier.wait();
429                 info!("Barrier released");
430             }
431             _ => {}
432         };
433         Ok(())
434     }
435 }
436 
437 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
438     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
439 
440     cmp::min(host_phys_bits, max_phys_bits)
441 }
442 
/// A virtual machine together with the managers (CPU, memory, devices) and
/// bookkeeping state that belong to it.
pub struct Vm {
    // Kernel file opened from the payload configuration, if one is set
    // (only present in TDX builds).
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    // Initramfs file opened from the payload configuration, if one is set.
    initramfs: Option<File>,
    // Join handles of threads owned by this VM.
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    // Current lifecycle state: starts as `Paused` when built from a snapshot,
    // `Created` otherwise.
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    // Clock data taken from the VM snapshot when restoring; `None` otherwise.
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,
    // NUMA topology built from the NUMA section of the VM configuration.
    numa_nodes: NumaNodes,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    // Mirrors the `gdb` config flag in `guest_debug` builds; always `false`
    // otherwise.
    stop_on_boot: bool,
    // Handle of the asynchronous payload load, if one was started; `None`
    // when the VM is built from a snapshot.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}
464 
465 impl Vm {
    /// Signal numbers listed as handled for a VM; currently only `SIGWINCH`.
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
467 
    /// Builds a [`Vm`] on top of an already created hypervisor VM and memory
    /// manager.
    ///
    /// In order, this validates the configuration, starts loading the payload
    /// (only when not restoring from a snapshot), builds the NUMA nodes,
    /// creates the CPU manager and boot vCPUs, creates the device manager and
    /// its devices, and opens the payload files kept on the `Vm`.
    ///
    /// When `snapshot` is `Some`, the CPU and device managers receive their
    /// sections of it and the VM starts in `VmState::Paused`; otherwise it
    /// starts in `VmState::Created`.
    ///
    /// # Errors
    ///
    /// Returns an error if config validation, NUMA node creation, CPU or
    /// device manager setup, an `EventFd` clone, opening the kernel/initramfs
    /// files, or (depending on enabled features) TDX / SEV-SNP initialization
    /// or snapshot decoding fails.
    ///
    /// # Panics
    ///
    /// Panics if one of the mutexes locked here is poisoned.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        timestamp: Instant,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new_from_memory_manager");

        // Validation also yields the boot id list handed to the device
        // manager below.
        let boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        // Without IGVM support the payload load can start right away; it is
        // skipped entirely when restoring from a snapshot.
        #[cfg(not(feature = "igvm"))]
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(&memory_manager, &config)?
        } else {
            None
        };

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
        // The IOMMU is forced exactly when TDX is enabled.
        #[cfg(feature = "tdx")]
        let force_iommu = tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        #[cfg(feature = "guest_debug")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "guest_debug"))]
        let stop_on_boot = false;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::new(Bus::new());
        let mmio_bus = Arc::new(Bus::new());

        // The same buses are shared between the VM-exit handler given to the
        // CPU manager and the device manager created further down.
        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus: io_bus.clone(),
            mmio_bus: mmio_bus.clone(),
        });

        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            vm.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt.try_clone().map_err(Error::EventFdClone)?,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            &hypervisor,
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        )
        .map_err(Error::CpuManager)?;

        #[cfg(target_arch = "x86_64")]
        cpu_manager
            .lock()
            .unwrap()
            .populate_cpuid(
                &memory_manager,
                &hypervisor,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CpuManager)?;

        // Loading the igvm file is pushed down here because
        // igvm parser needs cpu_manager to retrieve cpuid leaf.
        // For the regular case, we can start loading early, but for
        // igvm case we have to wait until cpu_manager is created.
        // Currently, Microsoft Hypervisor does not provide any
        // Hypervisor specific common cpuid, we need to call get_cpuid_values
        // per cpuid through cpu_manager.
        #[cfg(feature = "igvm")]
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(&memory_manager, &config, &cpu_manager)?
        } else {
            None
        };
        // The initial TDX configuration must be done before the vCPUs are
        // created
        #[cfg(feature = "tdx")]
        if tdx_enabled {
            let cpuid = cpu_manager.lock().unwrap().common_cpuid();
            let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
            vm.tdx_init(&cpuid, max_vcpus)
                .map_err(Error::InitializeTdxVm)?;
        }

        cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
            .map_err(Error::CpuManager)?;

        // This initial SEV-SNP configuration must be done immediately after
        // vCPUs are created. As part of this initialization we are
        // transitioning the guest into secure state.
        #[cfg(feature = "sev_snp")]
        if sev_snp_enabled {
            vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
        }

        // `dynamic` is passed to the device manager; it is turned off only
        // when TDX is enabled.
        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        let device_manager = DeviceManager::new(
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            hypervisor.hypervisor_type(),
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            cpu_manager.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            boot_id_list,
            timestamp,
            snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
            dynamic,
        )
        .map_err(Error::DeviceManager)?;

        device_manager
            .lock()
            .unwrap()
            .create_devices(
                serial_pty,
                console_pty,
                console_resize_pipe,
                original_termios,
            )
            .map_err(Error::DeviceManager)?;

        // Open the kernel file if the payload names one; a missing payload or
        // kernel path yields `None` rather than an error.
        #[cfg(feature = "tdx")]
        let kernel = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.kernel.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::KernelFile)?;

        // Same handling for the optional initramfs.
        let initramfs = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.initramfs.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::InitramfsFile)?;

        // When restoring, carry over the clock stored in the VM snapshot.
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
            let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
            vm_snapshot.clock
        } else {
            None
        };

        // A VM built from a snapshot starts out paused.
        let vm_state = if snapshot.is_some() {
            VmState::Paused
        } else {
            VmState::Created
        };

        Ok(Vm {
            #[cfg(feature = "tdx")]
            kernel,
            initramfs,
            device_manager,
            config,
            threads: Vec::with_capacity(1),
            state: RwLock::new(vm_state),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock,
            numa_nodes,
            hypervisor,
            stop_on_boot,
            load_payload_handle,
        })
    }
693 
694     fn create_numa_nodes(
695         configs: Option<Vec<NumaConfig>>,
696         memory_manager: &Arc<Mutex<MemoryManager>>,
697     ) -> Result<NumaNodes> {
698         let mm = memory_manager.lock().unwrap();
699         let mm_zones = mm.memory_zones();
700         let mut numa_nodes = BTreeMap::new();
701 
702         if let Some(configs) = &configs {
703             for config in configs.iter() {
704                 if numa_nodes.contains_key(&config.guest_numa_id) {
705                     error!("Can't define twice the same NUMA node");
706                     return Err(Error::InvalidNumaConfig);
707                 }
708 
709                 let mut node = NumaNode::default();
710 
711                 if let Some(memory_zones) = &config.memory_zones {
712                     for memory_zone in memory_zones.iter() {
713                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
714                             node.memory_regions.extend(mm_zone.regions().clone());
715                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
716                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
717                             }
718                             node.memory_zones.push(memory_zone.clone());
719                         } else {
720                             error!("Unknown memory zone '{}'", memory_zone);
721                             return Err(Error::InvalidNumaConfig);
722                         }
723                     }
724                 }
725 
726                 if let Some(cpus) = &config.cpus {
727                     node.cpus.extend(cpus);
728                 }
729 
730                 if let Some(pci_segments) = &config.pci_segments {
731                     node.pci_segments.extend(pci_segments);
732                 }
733 
734                 if let Some(distances) = &config.distances {
735                     for distance in distances.iter() {
736                         let dest = distance.destination;
737                         let dist = distance.distance;
738 
739                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
740                             error!("Unknown destination NUMA node {}", dest);
741                             return Err(Error::InvalidNumaConfig);
742                         }
743 
744                         if node.distances.contains_key(&dest) {
745                             error!("Destination NUMA node {} has been already set", dest);
746                             return Err(Error::InvalidNumaConfig);
747                         }
748 
749                         node.distances.insert(dest, dist);
750                     }
751                 }
752 
753                 #[cfg(target_arch = "x86_64")]
754                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
755                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
756                         let mm_sections = sgx_epc_region.epc_sections();
757                         for sgx_epc_section in sgx_epc_sections.iter() {
758                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
759                                 node.sgx_epc_sections.push(mm_section.clone());
760                             } else {
761                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
762                                 return Err(Error::InvalidNumaConfig);
763                             }
764                         }
765                     } else {
766                         error!("Missing SGX EPC region");
767                         return Err(Error::InvalidNumaConfig);
768                     }
769                 }
770 
771                 numa_nodes.insert(config.guest_numa_id, node);
772             }
773         }
774 
775         Ok(numa_nodes)
776     }
777 
778     #[allow(clippy::too_many_arguments)]
779     pub fn new(
780         vm_config: Arc<Mutex<VmConfig>>,
781         exit_evt: EventFd,
782         reset_evt: EventFd,
783         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
784         seccomp_action: &SeccompAction,
785         hypervisor: Arc<dyn hypervisor::Hypervisor>,
786         activate_evt: EventFd,
787         serial_pty: Option<PtyPair>,
788         console_pty: Option<PtyPair>,
789         console_resize_pipe: Option<File>,
790         original_termios: Arc<Mutex<Option<termios>>>,
791         snapshot: Option<Snapshot>,
792         source_url: Option<&str>,
793         prefault: Option<bool>,
794     ) -> Result<Self> {
795         trace_scoped!("Vm::new");
796 
797         let timestamp = Instant::now();
798 
799         #[cfg(feature = "tdx")]
800         let tdx_enabled = if snapshot.is_some() {
801             false
802         } else {
803             vm_config.lock().unwrap().is_tdx_enabled()
804         };
805 
806         #[cfg(feature = "sev_snp")]
807         let sev_snp_enabled = if snapshot.is_some() {
808             false
809         } else {
810             vm_config.lock().unwrap().is_sev_snp_enabled()
811         };
812 
813         let vm = Self::create_hypervisor_vm(
814             &hypervisor,
815             #[cfg(feature = "tdx")]
816             tdx_enabled,
817             #[cfg(feature = "sev_snp")]
818             sev_snp_enabled,
819         )?;
820 
821         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
822 
823         let memory_manager = if let Some(snapshot) =
824             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
825         {
826             MemoryManager::new_from_snapshot(
827                 &snapshot,
828                 vm.clone(),
829                 &vm_config.lock().unwrap().memory.clone(),
830                 source_url,
831                 prefault.unwrap(),
832                 phys_bits,
833             )
834             .map_err(Error::MemoryManager)?
835         } else {
836             #[cfg(target_arch = "x86_64")]
837             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
838 
839             MemoryManager::new(
840                 vm.clone(),
841                 &vm_config.lock().unwrap().memory.clone(),
842                 None,
843                 phys_bits,
844                 #[cfg(feature = "tdx")]
845                 tdx_enabled,
846                 None,
847                 None,
848                 #[cfg(target_arch = "x86_64")]
849                 sgx_epc_config,
850             )
851             .map_err(Error::MemoryManager)?
852         };
853 
854         Vm::new_from_memory_manager(
855             vm_config,
856             memory_manager,
857             vm,
858             exit_evt,
859             reset_evt,
860             #[cfg(feature = "guest_debug")]
861             vm_debug_evt,
862             seccomp_action,
863             hypervisor,
864             activate_evt,
865             timestamp,
866             serial_pty,
867             console_pty,
868             console_resize_pipe,
869             original_termios,
870             snapshot,
871         )
872     }
873 
874     pub fn create_hypervisor_vm(
875         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
876         #[cfg(feature = "tdx")] tdx_enabled: bool,
877         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
878     ) -> Result<Arc<dyn hypervisor::Vm>> {
879         hypervisor.check_required_extensions().unwrap();
880 
881         cfg_if::cfg_if! {
882             if #[cfg(feature = "tdx")] {
883                 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true
884                 // Otherwise KVM_X86_LEGACY_VM: 0
885                 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM
886                 let vm = hypervisor
887                     .create_vm_with_type(u64::from(tdx_enabled))
888                     .unwrap();
889             } else if #[cfg(feature = "sev_snp")] {
890                 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true
891                 // Otherwise SEV_SNP_DISABLED: 0
892                 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false
893                 let vm = hypervisor
894                     .create_vm_with_type(u64::from(sev_snp_enabled))
895                     .unwrap();
896             } else {
897                 let vm = hypervisor.create_vm().unwrap();
898             }
899         }
900 
901         #[cfg(target_arch = "x86_64")]
902         {
903             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
904                 .unwrap();
905             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
906             vm.enable_split_irq().unwrap();
907         }
908 
909         Ok(vm)
910     }
911 
912     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
913         let initramfs = self.initramfs.as_mut().unwrap();
914         let size: usize = initramfs
915             .seek(SeekFrom::End(0))
916             .map_err(|_| Error::InitramfsLoad)?
917             .try_into()
918             .unwrap();
919         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
920 
921         let address =
922             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
923         let address = GuestAddress(address);
924 
925         guest_mem
926             .read_volatile_from(address, initramfs, size)
927             .map_err(|_| Error::InitramfsLoad)?;
928 
929         info!("Initramfs loaded: address = 0x{:x}", address.0);
930         Ok(arch::InitramfsConfig { address, size })
931     }
932 
933     pub fn generate_cmdline(
934         payload: &PayloadConfig,
935         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
936     ) -> Result<Cmdline> {
937         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
938         if let Some(s) = payload.cmdline.as_ref() {
939             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
940         }
941 
942         #[cfg(target_arch = "aarch64")]
943         for entry in device_manager.lock().unwrap().cmdline_additions() {
944             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
945         }
946         Ok(cmdline)
947     }
948 
949     #[cfg(target_arch = "aarch64")]
950     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
951         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
952         let mem = uefi_flash.memory();
953         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
954             .map_err(Error::UefiLoad)?;
955         Ok(())
956     }
957 
958     #[cfg(target_arch = "aarch64")]
959     fn load_kernel(
960         firmware: Option<File>,
961         kernel: Option<File>,
962         memory_manager: Arc<Mutex<MemoryManager>>,
963     ) -> Result<EntryPoint> {
964         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
965         let mem = guest_memory.memory();
966         let entry_addr = match (firmware, kernel) {
967             (None, Some(mut kernel)) => {
968                 match linux_loader::loader::pe::PE::load(
969                     mem.deref(),
970                     Some(arch::layout::KERNEL_START),
971                     &mut kernel,
972                     None,
973                 ) {
974                     Ok(entry_addr) => entry_addr.kernel_load,
975                     // Try to load the binary as kernel PE file at first.
976                     // If failed, retry to load it as UEFI binary.
977                     // As the UEFI binary is formatless, it must be the last option to try.
978                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
979                         Self::load_firmware(&kernel, memory_manager)?;
980                         arch::layout::UEFI_START
981                     }
982                     Err(e) => {
983                         return Err(Error::KernelLoad(e));
984                     }
985                 }
986             }
987             (Some(firmware), None) => {
988                 Self::load_firmware(&firmware, memory_manager)?;
989                 arch::layout::UEFI_START
990             }
991             _ => return Err(Error::InvalidPayload),
992         };
993 
994         Ok(EntryPoint { entry_addr })
995     }
996 
997     #[cfg(feature = "igvm")]
998     fn load_igvm(
999         igvm: File,
1000         memory_manager: Arc<Mutex<MemoryManager>>,
1001         cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1002     ) -> Result<EntryPoint> {
1003         let res = igvm_loader::load_igvm(&igvm, memory_manager, cpu_manager.clone(), "")
1004             .map_err(Error::IgvmLoad)?;
1005 
1006         cfg_if::cfg_if! {
1007             if #[cfg(feature = "sev_snp")] {
1008                 let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
1009                     EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa) }
1010                 } else {
1011                     EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip) }
1012                 };
1013             } else {
1014                let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip) };
1015             }
1016         };
1017         Ok(entry_point)
1018     }
1019 
1020     #[cfg(target_arch = "x86_64")]
1021     fn load_kernel(
1022         mut kernel: File,
1023         cmdline: Option<Cmdline>,
1024         memory_manager: Arc<Mutex<MemoryManager>>,
1025     ) -> Result<EntryPoint> {
1026         info!("Loading kernel");
1027 
1028         let mem = {
1029             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
1030             guest_memory.memory()
1031         };
1032         let entry_addr = linux_loader::loader::elf::Elf::load(
1033             mem.deref(),
1034             None,
1035             &mut kernel,
1036             Some(arch::layout::HIGH_RAM_START),
1037         )
1038         .map_err(Error::KernelLoad)?;
1039 
1040         if let Some(cmdline) = cmdline {
1041             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1042                 .map_err(Error::LoadCmdLine)?;
1043         }
1044 
1045         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1046             // Use the PVH kernel entry point to boot the guest
1047             info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1048             Ok(EntryPoint { entry_addr })
1049         } else {
1050             Err(Error::KernelMissingPvhHeader)
1051         }
1052     }
1053 
1054     #[cfg(target_arch = "x86_64")]
1055     fn load_payload(
1056         payload: &PayloadConfig,
1057         memory_manager: Arc<Mutex<MemoryManager>>,
1058         #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
1059     ) -> Result<EntryPoint> {
1060         trace_scoped!("load_payload");
1061         #[cfg(feature = "igvm")]
1062         if let Some(_igvm_file) = &payload.igvm {
1063             let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
1064             return Self::load_igvm(igvm, memory_manager, cpu_manager);
1065         }
1066         match (
1067             &payload.firmware,
1068             &payload.kernel,
1069             &payload.initramfs,
1070             &payload.cmdline,
1071         ) {
1072             (Some(firmware), None, None, None) => {
1073                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1074                 Self::load_kernel(firmware, None, memory_manager)
1075             }
1076             (None, Some(kernel), _, _) => {
1077                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1078                 let cmdline = Self::generate_cmdline(payload)?;
1079                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1080             }
1081             _ => Err(Error::InvalidPayload),
1082         }
1083     }
1084 
1085     #[cfg(target_arch = "aarch64")]
1086     fn load_payload(
1087         payload: &PayloadConfig,
1088         memory_manager: Arc<Mutex<MemoryManager>>,
1089     ) -> Result<EntryPoint> {
1090         match (&payload.firmware, &payload.kernel) {
1091             (Some(firmware), None) => {
1092                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1093                 Self::load_kernel(Some(firmware), None, memory_manager)
1094             }
1095             (None, Some(kernel)) => {
1096                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1097                 Self::load_kernel(None, Some(kernel), memory_manager)
1098             }
1099             _ => Err(Error::InvalidPayload),
1100         }
1101     }
1102 
1103     fn load_payload_async(
1104         memory_manager: &Arc<Mutex<MemoryManager>>,
1105         config: &Arc<Mutex<VmConfig>>,
1106         #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
1107     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1108         // Kernel with TDX is loaded in a different manner
1109         #[cfg(feature = "tdx")]
1110         if config.lock().unwrap().is_tdx_enabled() {
1111             return Ok(None);
1112         }
1113 
1114         config
1115             .lock()
1116             .unwrap()
1117             .payload
1118             .as_ref()
1119             .map(|payload| {
1120                 let memory_manager = memory_manager.clone();
1121                 let payload = payload.clone();
1122                 #[cfg(feature = "igvm")]
1123                 let cpu_manager = cpu_manager.clone();
1124 
1125                 std::thread::Builder::new()
1126                     .name("payload_loader".into())
1127                     .spawn(move || {
1128                         Self::load_payload(
1129                             &payload,
1130                             memory_manager,
1131                             #[cfg(feature = "igvm")]
1132                             cpu_manager,
1133                         )
1134                     })
1135                     .map_err(Error::KernelLoadThreadSpawn)
1136             })
1137             .transpose()
1138     }
1139 
1140     #[cfg(target_arch = "x86_64")]
1141     fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
1142         trace_scoped!("configure_system");
1143         info!("Configuring system");
1144         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1145 
1146         let initramfs_config = match self.initramfs {
1147             Some(_) => Some(self.load_initramfs(&mem)?),
1148             None => None,
1149         };
1150 
1151         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1152         let rsdp_addr = Some(rsdp_addr);
1153         let sgx_epc_region = self
1154             .memory_manager
1155             .lock()
1156             .unwrap()
1157             .sgx_epc_region()
1158             .as_ref()
1159             .cloned();
1160 
1161         let serial_number = self
1162             .config
1163             .lock()
1164             .unwrap()
1165             .platform
1166             .as_ref()
1167             .and_then(|p| p.serial_number.clone());
1168 
1169         let uuid = self
1170             .config
1171             .lock()
1172             .unwrap()
1173             .platform
1174             .as_ref()
1175             .and_then(|p| p.uuid.clone());
1176 
1177         let oem_strings = self
1178             .config
1179             .lock()
1180             .unwrap()
1181             .platform
1182             .as_ref()
1183             .and_then(|p| p.oem_strings.clone());
1184 
1185         let oem_strings = oem_strings
1186             .as_deref()
1187             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1188 
1189         let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1190 
1191         arch::configure_system(
1192             &mem,
1193             arch::layout::CMDLINE_START,
1194             &initramfs_config,
1195             boot_vcpus,
1196             rsdp_addr,
1197             sgx_epc_region,
1198             serial_number.as_deref(),
1199             uuid.as_deref(),
1200             oem_strings.as_deref(),
1201             topology,
1202         )
1203         .map_err(Error::ConfigureSystem)?;
1204         Ok(())
1205     }
1206 
1207     #[cfg(target_arch = "aarch64")]
1208     fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
1209         let cmdline = Self::generate_cmdline(
1210             self.config.lock().unwrap().payload.as_ref().unwrap(),
1211             &self.device_manager,
1212         )?;
1213         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1214         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1215         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1216         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1217         let initramfs_config = match self.initramfs {
1218             Some(_) => Some(self.load_initramfs(&mem)?),
1219             None => None,
1220         };
1221 
1222         let device_info = &self
1223             .device_manager
1224             .lock()
1225             .unwrap()
1226             .get_device_info()
1227             .clone();
1228 
1229         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1230             let pci_space = PciSpaceInfo {
1231                 pci_segment_id: pci_segment.id,
1232                 mmio_config_address: pci_segment.mmio_config_address,
1233                 pci_device_space_start: pci_segment.start_of_mem64_area,
1234                 pci_device_space_size: pci_segment.end_of_mem64_area
1235                     - pci_segment.start_of_mem64_area
1236                     + 1,
1237             };
1238             pci_space_info.push(pci_space);
1239         }
1240 
1241         let virtio_iommu_bdf = self
1242             .device_manager
1243             .lock()
1244             .unwrap()
1245             .iommu_attached_devices()
1246             .as_ref()
1247             .map(|(v, _)| *v);
1248 
1249         let vgic = self
1250             .device_manager
1251             .lock()
1252             .unwrap()
1253             .get_interrupt_controller()
1254             .unwrap()
1255             .lock()
1256             .unwrap()
1257             .get_vgic()
1258             .map_err(|_| {
1259                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1260                     arch::aarch64::Error::SetupGic,
1261                 ))
1262             })?;
1263 
1264         // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
1265         let pmu_supported = self
1266             .cpu_manager
1267             .lock()
1268             .unwrap()
1269             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1270             .map_err(|_| {
1271                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1272                     arch::aarch64::Error::VcpuInitPmu,
1273                 ))
1274             })?;
1275 
1276         arch::configure_system(
1277             &mem,
1278             cmdline.as_cstring().unwrap().to_str().unwrap(),
1279             vcpu_mpidrs,
1280             vcpu_topology,
1281             device_info,
1282             &initramfs_config,
1283             &pci_space_info,
1284             virtio_iommu_bdf.map(|bdf| bdf.into()),
1285             &vgic,
1286             &self.numa_nodes,
1287             pmu_supported,
1288         )
1289         .map_err(Error::ConfigureSystem)?;
1290 
1291         Ok(())
1292     }
1293 
1294     pub fn serial_pty(&self) -> Option<PtyPair> {
1295         self.device_manager.lock().unwrap().serial_pty()
1296     }
1297 
1298     pub fn console_pty(&self) -> Option<PtyPair> {
1299         self.device_manager.lock().unwrap().console_pty()
1300     }
1301 
1302     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1303         self.device_manager.lock().unwrap().console_resize_pipe()
1304     }
1305 
1306     pub fn shutdown(&mut self) -> Result<()> {
1307         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1308         let new_state = VmState::Shutdown;
1309 
1310         state.valid_transition(new_state)?;
1311 
1312         // Wake up the DeviceManager threads so they will get terminated cleanly
1313         self.device_manager
1314             .lock()
1315             .unwrap()
1316             .resume()
1317             .map_err(Error::Resume)?;
1318 
1319         self.cpu_manager
1320             .lock()
1321             .unwrap()
1322             .shutdown()
1323             .map_err(Error::CpuManager)?;
1324 
1325         // Wait for all the threads to finish
1326         for thread in self.threads.drain(..) {
1327             thread.join().map_err(Error::ThreadCleanup)?
1328         }
1329         *state = new_state;
1330 
1331         event!("vm", "shutdown");
1332 
1333         Ok(())
1334     }
1335 
1336     pub fn resize(
1337         &mut self,
1338         desired_vcpus: Option<u8>,
1339         desired_memory: Option<u64>,
1340         desired_balloon: Option<u64>,
1341     ) -> Result<()> {
1342         event!("vm", "resizing");
1343 
1344         if let Some(desired_vcpus) = desired_vcpus {
1345             if self
1346                 .cpu_manager
1347                 .lock()
1348                 .unwrap()
1349                 .resize(desired_vcpus)
1350                 .map_err(Error::CpuManager)?
1351             {
1352                 self.device_manager
1353                     .lock()
1354                     .unwrap()
1355                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1356                     .map_err(Error::DeviceManager)?;
1357             }
1358             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1359         }
1360 
1361         if let Some(desired_memory) = desired_memory {
1362             let new_region = self
1363                 .memory_manager
1364                 .lock()
1365                 .unwrap()
1366                 .resize(desired_memory)
1367                 .map_err(Error::MemoryManager)?;
1368 
1369             let memory_config = &mut self.config.lock().unwrap().memory;
1370 
1371             if let Some(new_region) = &new_region {
1372                 self.device_manager
1373                     .lock()
1374                     .unwrap()
1375                     .update_memory(new_region)
1376                     .map_err(Error::DeviceManager)?;
1377 
1378                 match memory_config.hotplug_method {
1379                     HotplugMethod::Acpi => {
1380                         self.device_manager
1381                             .lock()
1382                             .unwrap()
1383                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1384                             .map_err(Error::DeviceManager)?;
1385                     }
1386                     HotplugMethod::VirtioMem => {}
1387                 }
1388             }
1389 
1390             // We update the VM config regardless of the actual guest resize
1391             // operation result (happened or not), so that if the VM reboots
1392             // it will be running with the last configure memory size.
1393             match memory_config.hotplug_method {
1394                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1395                 HotplugMethod::VirtioMem => {
1396                     if desired_memory > memory_config.size {
1397                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1398                     } else {
1399                         memory_config.hotplugged_size = None;
1400                     }
1401                 }
1402             }
1403         }
1404 
1405         if let Some(desired_balloon) = desired_balloon {
1406             self.device_manager
1407                 .lock()
1408                 .unwrap()
1409                 .resize_balloon(desired_balloon)
1410                 .map_err(Error::DeviceManager)?;
1411 
1412             // Update the configuration value for the balloon size to ensure
1413             // a reboot would use the right value.
1414             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1415                 balloon_config.size = desired_balloon;
1416             }
1417         }
1418 
1419         event!("vm", "resized");
1420 
1421         Ok(())
1422     }
1423 
1424     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1425         let memory_config = &mut self.config.lock().unwrap().memory;
1426 
1427         if let Some(zones) = &mut memory_config.zones {
1428             for zone in zones.iter_mut() {
1429                 if zone.id == id {
1430                     if desired_memory >= zone.size {
1431                         let hotplugged_size = desired_memory - zone.size;
1432                         self.memory_manager
1433                             .lock()
1434                             .unwrap()
1435                             .resize_zone(&id, desired_memory - zone.size)
1436                             .map_err(Error::MemoryManager)?;
1437                         // We update the memory zone config regardless of the
1438                         // actual 'resize-zone' operation result (happened or
1439                         // not), so that if the VM reboots it will be running
1440                         // with the last configured memory zone size.
1441                         zone.hotplugged_size = Some(hotplugged_size);
1442 
1443                         return Ok(());
1444                     } else {
1445                         error!(
1446                             "Invalid to ask less ({}) than boot RAM ({}) for \
1447                             this memory zone",
1448                             desired_memory, zone.size,
1449                         );
1450                         return Err(Error::ResizeZone);
1451                     }
1452                 }
1453             }
1454         }
1455 
1456         error!("Could not find the memory zone {} for the resize", id);
1457         Err(Error::ResizeZone)
1458     }
1459 
1460     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1461         let pci_device_info = self
1462             .device_manager
1463             .lock()
1464             .unwrap()
1465             .add_device(&mut device_cfg)
1466             .map_err(Error::DeviceManager)?;
1467 
1468         // Update VmConfig by adding the new device. This is important to
1469         // ensure the device would be created in case of a reboot.
1470         {
1471             let mut config = self.config.lock().unwrap();
1472             add_to_config(&mut config.devices, device_cfg);
1473         }
1474 
1475         self.device_manager
1476             .lock()
1477             .unwrap()
1478             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1479             .map_err(Error::DeviceManager)?;
1480 
1481         Ok(pci_device_info)
1482     }
1483 
1484     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1485         let pci_device_info = self
1486             .device_manager
1487             .lock()
1488             .unwrap()
1489             .add_user_device(&mut device_cfg)
1490             .map_err(Error::DeviceManager)?;
1491 
1492         // Update VmConfig by adding the new device. This is important to
1493         // ensure the device would be created in case of a reboot.
1494         {
1495             let mut config = self.config.lock().unwrap();
1496             add_to_config(&mut config.user_devices, device_cfg);
1497         }
1498 
1499         self.device_manager
1500             .lock()
1501             .unwrap()
1502             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1503             .map_err(Error::DeviceManager)?;
1504 
1505         Ok(pci_device_info)
1506     }
1507 
1508     pub fn remove_device(&mut self, id: String) -> Result<()> {
1509         self.device_manager
1510             .lock()
1511             .unwrap()
1512             .remove_device(id.clone())
1513             .map_err(Error::DeviceManager)?;
1514 
1515         // Update VmConfig by removing the device. This is important to
1516         // ensure the device would not be created in case of a reboot.
1517         self.config.lock().unwrap().remove_device(&id);
1518 
1519         self.device_manager
1520             .lock()
1521             .unwrap()
1522             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1523             .map_err(Error::DeviceManager)?;
1524         Ok(())
1525     }
1526 
1527     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1528         let pci_device_info = self
1529             .device_manager
1530             .lock()
1531             .unwrap()
1532             .add_disk(&mut disk_cfg)
1533             .map_err(Error::DeviceManager)?;
1534 
1535         // Update VmConfig by adding the new device. This is important to
1536         // ensure the device would be created in case of a reboot.
1537         {
1538             let mut config = self.config.lock().unwrap();
1539             add_to_config(&mut config.disks, disk_cfg);
1540         }
1541 
1542         self.device_manager
1543             .lock()
1544             .unwrap()
1545             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1546             .map_err(Error::DeviceManager)?;
1547 
1548         Ok(pci_device_info)
1549     }
1550 
1551     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1552         let pci_device_info = self
1553             .device_manager
1554             .lock()
1555             .unwrap()
1556             .add_fs(&mut fs_cfg)
1557             .map_err(Error::DeviceManager)?;
1558 
1559         // Update VmConfig by adding the new device. This is important to
1560         // ensure the device would be created in case of a reboot.
1561         {
1562             let mut config = self.config.lock().unwrap();
1563             add_to_config(&mut config.fs, fs_cfg);
1564         }
1565 
1566         self.device_manager
1567             .lock()
1568             .unwrap()
1569             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1570             .map_err(Error::DeviceManager)?;
1571 
1572         Ok(pci_device_info)
1573     }
1574 
1575     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1576         let pci_device_info = self
1577             .device_manager
1578             .lock()
1579             .unwrap()
1580             .add_pmem(&mut pmem_cfg)
1581             .map_err(Error::DeviceManager)?;
1582 
1583         // Update VmConfig by adding the new device. This is important to
1584         // ensure the device would be created in case of a reboot.
1585         {
1586             let mut config = self.config.lock().unwrap();
1587             add_to_config(&mut config.pmem, pmem_cfg);
1588         }
1589 
1590         self.device_manager
1591             .lock()
1592             .unwrap()
1593             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1594             .map_err(Error::DeviceManager)?;
1595 
1596         Ok(pci_device_info)
1597     }
1598 
1599     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1600         let pci_device_info = self
1601             .device_manager
1602             .lock()
1603             .unwrap()
1604             .add_net(&mut net_cfg)
1605             .map_err(Error::DeviceManager)?;
1606 
1607         // Update VmConfig by adding the new device. This is important to
1608         // ensure the device would be created in case of a reboot.
1609         {
1610             let mut config = self.config.lock().unwrap();
1611             add_to_config(&mut config.net, net_cfg);
1612         }
1613 
1614         self.device_manager
1615             .lock()
1616             .unwrap()
1617             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1618             .map_err(Error::DeviceManager)?;
1619 
1620         Ok(pci_device_info)
1621     }
1622 
1623     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1624         let pci_device_info = self
1625             .device_manager
1626             .lock()
1627             .unwrap()
1628             .add_vdpa(&mut vdpa_cfg)
1629             .map_err(Error::DeviceManager)?;
1630 
1631         // Update VmConfig by adding the new device. This is important to
1632         // ensure the device would be created in case of a reboot.
1633         {
1634             let mut config = self.config.lock().unwrap();
1635             add_to_config(&mut config.vdpa, vdpa_cfg);
1636         }
1637 
1638         self.device_manager
1639             .lock()
1640             .unwrap()
1641             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1642             .map_err(Error::DeviceManager)?;
1643 
1644         Ok(pci_device_info)
1645     }
1646 
1647     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1648         let pci_device_info = self
1649             .device_manager
1650             .lock()
1651             .unwrap()
1652             .add_vsock(&mut vsock_cfg)
1653             .map_err(Error::DeviceManager)?;
1654 
1655         // Update VmConfig by adding the new device. This is important to
1656         // ensure the device would be created in case of a reboot.
1657         {
1658             let mut config = self.config.lock().unwrap();
1659             config.vsock = Some(vsock_cfg);
1660         }
1661 
1662         self.device_manager
1663             .lock()
1664             .unwrap()
1665             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1666             .map_err(Error::DeviceManager)?;
1667 
1668         Ok(pci_device_info)
1669     }
1670 
1671     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1672         Ok(self.device_manager.lock().unwrap().counters())
1673     }
1674 
1675     #[cfg(feature = "tdx")]
1676     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1677         use arch::x86_64::tdx::*;
1678 
1679         let firmware_path = self
1680             .config
1681             .lock()
1682             .unwrap()
1683             .payload
1684             .as_ref()
1685             .unwrap()
1686             .firmware
1687             .clone()
1688             .ok_or(Error::TdxFirmwareMissing)?;
1689         // The TDVF file contains a table of section as well as code
1690         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1691 
1692         // For all the sections allocate some RAM backing them
1693         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1694     }
1695 
1696     #[cfg(feature = "tdx")]
1697     fn hob_memory_resources(
1698         mut sorted_sections: Vec<TdvfSection>,
1699         guest_memory: &GuestMemoryMmap,
1700     ) -> Vec<(u64, u64, bool)> {
1701         let mut list = Vec::new();
1702 
1703         let mut current_section = sorted_sections.pop();
1704 
1705         // RAM regions interleaved with TDVF sections
1706         let mut next_start_addr = 0;
1707         for region in guest_memory.iter() {
1708             let region_start = region.start_addr().0;
1709             let region_end = region.last_addr().0;
1710             if region_start > next_start_addr {
1711                 next_start_addr = region_start;
1712             }
1713 
1714             loop {
1715                 let (start, size, ram) = if let Some(section) = &current_section {
1716                     if section.address <= next_start_addr {
1717                         (section.address, section.size, false)
1718                     } else {
1719                         let last_addr = std::cmp::min(section.address - 1, region_end);
1720                         (next_start_addr, last_addr - next_start_addr + 1, true)
1721                     }
1722                 } else {
1723                     (next_start_addr, region_end - next_start_addr + 1, true)
1724                 };
1725 
1726                 list.push((start, size, ram));
1727 
1728                 if !ram {
1729                     current_section = sorted_sections.pop();
1730                 }
1731 
1732                 next_start_addr = start + size;
1733 
1734                 if region_start > next_start_addr {
1735                     next_start_addr = region_start;
1736                 }
1737 
1738                 if next_start_addr > region_end {
1739                     break;
1740                 }
1741             }
1742         }
1743 
1744         // Once all the interleaved sections have been processed, let's simply
1745         // pull the remaining ones.
1746         if let Some(section) = current_section {
1747             list.push((section.address, section.size, false));
1748         }
1749         while let Some(section) = sorted_sections.pop() {
1750             list.push((section.address, section.size, false));
1751         }
1752 
1753         list
1754     }
1755 
    /// Writes the TDVF sections into guest memory and builds the TD HOB.
    ///
    /// Steps, in order:
    /// 1. Add a RAM region for every section whose address is not already
    ///    covered by the boot guest memory.
    /// 2. Populate the sections: firmware volumes (`Bfv`/`Cfv`) are copied
    ///    from the firmware file, the payload (if `self.kernel` is set) and
    ///    the command line are copied to their sections, and the address of
    ///    the `TdHob` section is recorded.
    /// 3. Build the HOB at the recorded address: memory resources, MMIO
    ///    resources, ACPI tables and, if a payload was copied, payload info.
    ///
    /// Returns the address of the `TdHob` section.
    ///
    /// # Errors
    ///
    /// Returns `Error::AllocatingTdvfMemory`, `Error::TdxFirmwareMissing`,
    /// `Error::LoadTdvf`, `Error::LoadPayload`, `Error::InvalidPayloadType`,
    /// `Error::PopulateHob`, or whatever `generate_cmdline` returns.
    ///
    /// # Panics
    ///
    /// Panics if the configuration has no payload, if `sections` contains no
    /// `TdHob` section, or if any copy into guest memory fails.
    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(
        &mut self,
        sections: &[TdvfSection],
        guid_found: bool,
    ) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the boot guest memory *before* we start adding TDVF RAM regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of sections as well as code
        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut payload_info = None;
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    mem.read_volatile_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    // Only remember where the HOB goes; it is built below.
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    if let Some(payload_file) = self.kernel.as_mut() {
                        // Seeking to the end yields the size of the payload.
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        // Read the setup header found at offset 0x1f1 of the
                        // image.
                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_file
                            .read_volatile(&mut payload_header.as_bytes())
                            .unwrap();

                        // 0x5372_6448 is "HdrS" in little-endian byte order,
                        // the Linux x86 boot protocol header magic.
                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Require header version >= 0x0200 and bit 0 of
                        // `loadflags` to be set.
                        // NOTE(review): bit 0 is presumably LOADED_HIGH from
                        // the Linux boot protocol - confirm.
                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Copy the whole image, from its first byte, to the
                        // section address.
                        payload_file.rewind().map_err(Error::LoadPayload)?;
                        mem.read_volatile_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    info!("Copying payload parameters to guest memory");
                    let cmdline = Self::generate_cmdline(
                        self.config.lock().unwrap().payload.as_ref().unwrap(),
                    )?;
                    // The command line is written as a NUL-terminated string.
                    mem.write_slice(
                        cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                        GuestAddress(section.address),
                    )
                    .unwrap();
                }
                _ => {}
            }
        }

        // Generate HOB (panics if no TdHob section was seen above)
        let mut hob = TdHob::start(hob_offset.unwrap());

        // Keep only the TempMem sections, sorted by descending address, which
        // is the order `hob_memory_resources` pops them in.
        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();

        for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
            hob.add_memory_resource(&mem, start, size, ram, guid_found)
                .map_err(Error::PopulateHob)?;
        }

        // MMIO regions: first the range from MEM_32BIT_DEVICES_START up to
        // APIC_START, then the device area reported by the memory manager.
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.

        for acpi_table in crate::acpi::create_acpi_tables_tdx(
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        ) {
            hob.add_acpi_table(&mem, acpi_table.as_slice())
                .map_err(Error::PopulateHob)?;
        }

        // If a payload info has been created, let's insert it into the HOB.
        if let Some(payload_info) = payload_info {
            hob.add_payload(&mem, payload_info)
                .map_err(Error::PopulateHob)?;
        }

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }
1945 
1946     #[cfg(feature = "tdx")]
1947     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1948         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1949         let mem = guest_memory.memory();
1950 
1951         for section in sections {
1952             self.vm
1953                 .tdx_init_memory_region(
1954                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
1955                     section.address,
1956                     section.size,
1957                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
1958                     section.attributes == 1,
1959                 )
1960                 .map_err(Error::InitializeTdxMemoryRegion)?;
1961         }
1962 
1963         Ok(())
1964     }
1965 
1966     // Creates ACPI tables
1967     // In case of TDX being used, this is a no-op since the tables will be
1968     // created and passed when populating the HOB.
1969 
1970     fn create_acpi_tables(&self) -> Option<GuestAddress> {
1971         #[cfg(feature = "tdx")]
1972         if self.config.lock().unwrap().is_tdx_enabled() {
1973             return None;
1974         }
1975         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
1976         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
1977         let rsdp_addr = crate::acpi::create_acpi_tables(
1978             &mem,
1979             &self.device_manager,
1980             &self.cpu_manager,
1981             &self.memory_manager,
1982             &self.numa_nodes,
1983             tpm_enabled,
1984         );
1985         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
1986 
1987         Some(rsdp_addr)
1988     }
1989 
1990     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
1991         trace_scoped!("entry_point");
1992 
1993         self.load_payload_handle
1994             .take()
1995             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
1996             .transpose()
1997     }
1998 
    /// Boots the VM, or resumes it if it is currently paused.
    ///
    /// For a fresh boot the sequence is: validate the state transition,
    /// create the ACPI tables (x86_64), wait for the payload to be loaded,
    /// configure the vCPUs, populate the TDX sections when TDX is enabled,
    /// create the ACPI tables (aarch64), configure the system, allocate the
    /// address space (x86_64), finalize TDX when applicable, and finally
    /// start the boot vCPUs and record the new state.
    ///
    /// The resulting state is `VmState::BreakPoint` when `stop_on_boot` is
    /// set and `VmState::Running` otherwise.
    ///
    /// # Errors
    ///
    /// Returns `Error::Resume` if resuming a paused VM fails, the error from
    /// `valid_transition` if the transition is not allowed,
    /// `Error::CpuManager`, `Error::MemoryManager`, `Error::FinalizeTdx` or
    /// `Error::PoisonedState` for the corresponding failures, plus any error
    /// from payload loading, TDX setup or system configuration.
    pub fn boot(&mut self) -> Result<()> {
        trace_scoped!("Vm::boot");
        info!("Booting VM");
        event!("vm", "booting");
        let current_state = self.get_state()?;
        // Booting a paused VM simply resumes it.
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        let new_state = if self.stop_on_boot {
            VmState::BreakPoint
        } else {
            VmState::Running
        };
        current_state.valid_transition(new_state)?;

        // Do earlier to parallelise with loading kernel
        #[cfg(target_arch = "x86_64")]
        let rsdp_addr = self.create_acpi_tables();

        // Load kernel synchronously or if asynchronous then wait for load to
        // finish.
        let entry_point = self.entry_point()?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        // Configure the vcpus that have been created
        let vcpus = self.cpu_manager.lock().unwrap().vcpus();
        for vcpu in vcpus {
            let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
            // Boot setup is only passed when a payload entry point exists.
            let boot_setup = entry_point.map(|e| (e, guest_memory));
            self.cpu_manager
                .lock()
                .unwrap()
                .configure_vcpu(vcpu, boot_setup)
                .map_err(Error::CpuManager)?;
        }

        #[cfg(feature = "tdx")]
        let (sections, guid_found) = if tdx_enabled {
            self.extract_tdvf_sections()?
        } else {
            (Vec::new(), false)
        };

        // Configuring the TDX regions requires that the vCPUs are created.
        #[cfg(feature = "tdx")]
        let hob_address = if tdx_enabled {
            // TDX sections are written to memory.
            self.populate_tdx_sections(&sections, guid_found)?
        } else {
            None
        };

        // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
        // available after they are configured
        #[cfg(target_arch = "aarch64")]
        let rsdp_addr = self.create_acpi_tables();

        // Configure shared state based on loaded kernel
        entry_point
            .map(|_| {
                // Safe to unwrap rsdp_addr as we know it can't be None when
                // the entry_point is Some.
                self.configure_system(rsdp_addr.unwrap())
            })
            .transpose()?;

        #[cfg(target_arch = "x86_64")]
        // Note: For x86, always call this function before invoking start boot vcpus.
        // Otherwise guest would fail to boot because we haven't created the
        // userspace mappings to update the hypervisor about the memory mappings.
        // These mappings must be created before we start the vCPU threads for
        // the very first time.
        self.memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            // Let the hypervisor know which memory ranges are shared with the
            // guest. This prevents the guest from ignoring/discarding memory
            // regions provided by the host.
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured TDX setup is complete
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        // The vCPUs are asked to start paused when booting into a breakpoint.
        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus(new_state == VmState::BreakPoint)
            .map_err(Error::CpuManager)?;

        // Only record the new state once the vCPUs have been started.
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        event!("vm", "booted");
        Ok(())
    }
2108 
2109     pub fn restore(&mut self) -> Result<()> {
2110         event!("vm", "restoring");
2111 
2112         #[cfg(target_arch = "x86_64")]
2113         // Note: For x86, always call this function before invoking start boot vcpus.
2114         // Otherwise guest would fail to boot because we haven't created the
2115         // userspace mappings to update the hypervisor about the memory mappings.
2116         // These mappings must be created before we start the vCPU threads for
2117         // the very first time for the restored VM.
2118         self.memory_manager
2119             .lock()
2120             .unwrap()
2121             .allocate_address_space()
2122             .map_err(Error::MemoryManager)?;
2123 
2124         // Now we can start all vCPUs from here.
2125         self.cpu_manager
2126             .lock()
2127             .unwrap()
2128             .start_restored_vcpus()
2129             .map_err(Error::CpuManager)?;
2130 
2131         event!("vm", "restored");
2132         Ok(())
2133     }
2134 
    /// Gets a thread-safe reference counted pointer to the VM configuration.
    ///
    /// The returned `Arc` shares the same `Mutex<VmConfig>` as the VM itself,
    /// so changes made through it are visible to the VM.
    pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
        Arc::clone(&self.config)
    }
2139 
2140     /// Get the VM state. Returns an error if the state is poisoned.
2141     pub fn get_state(&self) -> Result<VmState> {
2142         self.state
2143             .try_read()
2144             .map_err(|_| Error::PoisonedState)
2145             .map(|state| *state)
2146     }
2147 
    /// Gets the actual size of the balloon, as reported by the device manager.
    ///
    /// # Panics
    ///
    /// Panics if the device manager mutex is poisoned.
    pub fn balloon_size(&self) -> u64 {
        self.device_manager.lock().unwrap().balloon_size()
    }
2152 
2153     pub fn send_memory_fds(
2154         &mut self,
2155         socket: &mut UnixStream,
2156     ) -> std::result::Result<(), MigratableError> {
2157         for (slot, fd) in self
2158             .memory_manager
2159             .lock()
2160             .unwrap()
2161             .memory_slot_fds()
2162             .drain()
2163         {
2164             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2165                 .write_to(socket)
2166                 .map_err(|e| {
2167                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2168                 })?;
2169             socket
2170                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2171                 .map_err(|e| {
2172                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2173                 })?;
2174 
2175             let res = Response::read_from(socket)?;
2176             if res.status() != Status::Ok {
2177                 warn!("Error during memory fd migration");
2178                 Request::abandon().write_to(socket)?;
2179                 Response::read_from(socket).ok();
2180                 return Err(MigratableError::MigrateSend(anyhow!(
2181                     "Error during memory fd migration"
2182                 )));
2183             }
2184         }
2185 
2186         Ok(())
2187     }
2188 
2189     pub fn send_memory_regions<F>(
2190         &mut self,
2191         ranges: &MemoryRangeTable,
2192         fd: &mut F,
2193     ) -> std::result::Result<(), MigratableError>
2194     where
2195         F: WriteVolatile,
2196     {
2197         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2198         let mem = guest_memory.memory();
2199 
2200         for range in ranges.regions() {
2201             let mut offset: u64 = 0;
2202             // Here we are manually handling the retry in case we can't the
2203             // whole region at once because we can't use the implementation
2204             // from vm-memory::GuestMemory of write_all_to() as it is not
2205             // following the correct behavior. For more info about this issue
2206             // see: https://github.com/rust-vmm/vm-memory/issues/174
2207             loop {
2208                 let bytes_written = mem
2209                     .write_volatile_to(
2210                         GuestAddress(range.gpa + offset),
2211                         fd,
2212                         (range.length - offset) as usize,
2213                     )
2214                     .map_err(|e| {
2215                         MigratableError::MigrateSend(anyhow!(
2216                             "Error transferring memory to socket: {}",
2217                             e
2218                         ))
2219                     })?;
2220                 offset += bytes_written as u64;
2221 
2222                 if offset == range.length {
2223                     break;
2224                 }
2225             }
2226         }
2227 
2228         Ok(())
2229     }
2230 
    /// Returns the memory range table reported by the memory manager.
    ///
    /// The memory manager is queried with its boolean argument set to
    /// `false`.
    /// NOTE(review): the meaning of that flag is not visible here - confirm
    /// against `MemoryManager::memory_range_table`.
    pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        self.memory_manager
            .lock()
            .unwrap()
            .memory_range_table(false)
    }
2237 
    /// Returns the device tree handle provided by the device manager.
    pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
        self.device_manager.lock().unwrap().device_tree()
    }
2241 
    /// Asks the device manager to activate its virtio devices.
    ///
    /// # Errors
    ///
    /// Returns `Error::ActivateVirtioDevices` if the device manager reports a
    /// failure.
    pub fn activate_virtio_devices(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .activate_virtio_devices()
            .map_err(Error::ActivateVirtioDevices)
    }
2249 
2250     #[cfg(target_arch = "x86_64")]
2251     pub fn power_button(&self) -> Result<()> {
2252         return self
2253             .device_manager
2254             .lock()
2255             .unwrap()
2256             .notify_power_button()
2257             .map_err(Error::PowerButton);
2258     }
2259 
    /// Asks the device manager to signal a power button event
    /// (`notify_power_button`).
    ///
    /// # Errors
    ///
    /// Returns `Error::PowerButton` if the device manager reports a failure.
    #[cfg(target_arch = "aarch64")]
    pub fn power_button(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .notify_power_button()
            .map_err(Error::PowerButton)
    }
2268 
    /// Returns the snapshot data produced by the memory manager.
    pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
        self.memory_manager.lock().unwrap().snapshot_data()
    }
2272 
2273     #[cfg(feature = "guest_debug")]
2274     pub fn debug_request(
2275         &mut self,
2276         gdb_request: &GdbRequestPayload,
2277         cpu_id: usize,
2278     ) -> Result<GdbResponsePayload> {
2279         use GdbRequestPayload::*;
2280         match gdb_request {
2281             SetSingleStep(single_step) => {
2282                 self.set_guest_debug(cpu_id, &[], *single_step)
2283                     .map_err(Error::Debug)?;
2284             }
2285             SetHwBreakPoint(addrs) => {
2286                 self.set_guest_debug(cpu_id, addrs, false)
2287                     .map_err(Error::Debug)?;
2288             }
2289             Pause => {
2290                 self.debug_pause().map_err(Error::Debug)?;
2291             }
2292             Resume => {
2293                 self.debug_resume().map_err(Error::Debug)?;
2294             }
2295             ReadRegs => {
2296                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2297                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2298             }
2299             WriteRegs(regs) => {
2300                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2301             }
2302             ReadMem(vaddr, len) => {
2303                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2304                 let mem = self
2305                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2306                     .map_err(Error::Debug)?;
2307                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2308             }
2309             WriteMem(vaddr, data) => {
2310                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2311                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2312                     .map_err(Error::Debug)?;
2313             }
2314             ActiveVcpus => {
2315                 let active_vcpus = self.active_vcpus();
2316                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2317             }
2318         }
2319         Ok(GdbResponsePayload::CommandComplete)
2320     }
2321 
2322     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2323     fn get_dump_state(
2324         &mut self,
2325         destination_url: &str,
2326     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2327         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2328         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2329         let mut elf_phdr_num = 1;
2330         let elf_sh_info = 0;
2331         let coredump_file_path = url_to_file(destination_url)?;
2332         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2333 
2334         if mapping_num < UINT16_MAX - 2 {
2335             elf_phdr_num += mapping_num as u16;
2336         } else {
2337             panic!("mapping num beyond 65535 not supported");
2338         }
2339         let coredump_file = OpenOptions::new()
2340             .read(true)
2341             .write(true)
2342             .create_new(true)
2343             .open(coredump_file_path)
2344             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2345 
2346         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2347         let mem_data = self
2348             .memory_manager
2349             .lock()
2350             .unwrap()
2351             .coredump_memory_regions(mem_offset);
2352 
2353         Ok(DumpState {
2354             elf_note_size,
2355             elf_phdr_num,
2356             elf_sh_info,
2357             mem_offset,
2358             mem_info: Some(mem_data),
2359             file: Some(coredump_file),
2360         })
2361     }
2362 
2363     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2364     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2365         size_of::<elf::Elf64_Ehdr>() as u64
2366             + note_size as u64
2367             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2368     }
2369 }
2370 
2371 impl Pausable for Vm {
2372     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2373         event!("vm", "pausing");
2374         let mut state = self
2375             .state
2376             .try_write()
2377             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2378         let new_state = VmState::Paused;
2379 
2380         state
2381             .valid_transition(new_state)
2382             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2383 
2384         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2385         {
2386             let mut clock = self
2387                 .vm
2388                 .get_clock()
2389                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2390             clock.reset_flags();
2391             self.saved_clock = Some(clock);
2392         }
2393 
2394         // Before pausing the vCPUs activate any pending virtio devices that might
2395         // need activation between starting the pause (or e.g. a migration it's part of)
2396         self.activate_virtio_devices().map_err(|e| {
2397             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2398         })?;
2399 
2400         self.cpu_manager.lock().unwrap().pause()?;
2401         self.device_manager.lock().unwrap().pause()?;
2402 
2403         *state = new_state;
2404 
2405         event!("vm", "paused");
2406         Ok(())
2407     }
2408 
2409     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2410         event!("vm", "resuming");
2411         let mut state = self
2412             .state
2413             .try_write()
2414             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2415         let new_state = VmState::Running;
2416 
2417         state
2418             .valid_transition(new_state)
2419             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2420 
2421         self.cpu_manager.lock().unwrap().resume()?;
2422         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2423         {
2424             if let Some(clock) = &self.saved_clock {
2425                 self.vm.set_clock(clock).map_err(|e| {
2426                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2427                 })?;
2428             }
2429         }
2430         self.device_manager.lock().unwrap().resume()?;
2431 
2432         // And we're back to the Running state.
2433         *state = new_state;
2434         event!("vm", "resumed");
2435         Ok(())
2436     }
2437 }
2438 
/// VM-level data serialized as part of a snapshot.
///
/// Both fields only exist when building with the `kvm` feature for `x86_64`;
/// on other configurations the structure is empty.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    /// Clock data of the VM, if any.
    /// NOTE(review): presumably the clock saved when the VM was paused -
    /// confirm against `Vm::snapshot`.
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub clock: Option<hypervisor::ClockData>,
    /// CPUID entries generated by `arch::generate_common_cpuid` when the
    /// snapshot is taken.
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}
2446 
2447 pub const VM_SNAPSHOT_ID: &str = "vm";
2448 impl Snapshottable for Vm {
2449     fn id(&self) -> String {
2450         VM_SNAPSHOT_ID.to_string()
2451     }
2452 
2453     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2454         event!("vm", "snapshotting");
2455 
2456         #[cfg(feature = "tdx")]
2457         {
2458             if self.config.lock().unwrap().is_tdx_enabled() {
2459                 return Err(MigratableError::Snapshot(anyhow!(
2460                     "Snapshot not possible with TDX VM"
2461                 )));
2462             }
2463         }
2464 
2465         let current_state = self.get_state().unwrap();
2466         if current_state != VmState::Paused {
2467             return Err(MigratableError::Snapshot(anyhow!(
2468                 "Trying to snapshot while VM is running"
2469             )));
2470         }
2471 
2472         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2473         let common_cpuid = {
2474             let amx = self.config.lock().unwrap().cpus.features.amx;
2475             let phys_bits = physical_bits(
2476                 &self.hypervisor,
2477                 self.config.lock().unwrap().cpus.max_phys_bits,
2478             );
2479             arch::generate_common_cpuid(
2480                 &self.hypervisor,
2481                 &arch::CpuidConfig {
2482                     sgx_epc_sections: None,
2483                     phys_bits,
2484                     kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2485                     #[cfg(feature = "tdx")]
2486                     tdx: false,
2487                     amx,
2488                 },
2489             )
2490             .map_err(|e| {
2491                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2492             })?
2493         };
2494 
2495         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2496             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2497             clock: self.saved_clock,
2498             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2499             common_cpuid,
2500         })
2501         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2502 
2503         let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data));
2504 
2505         let (id, snapshot) = {
2506             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2507             (cpu_manager.id(), cpu_manager.snapshot()?)
2508         };
2509         vm_snapshot.add_snapshot(id, snapshot);
2510         let (id, snapshot) = {
2511             let mut memory_manager = self.memory_manager.lock().unwrap();
2512             (memory_manager.id(), memory_manager.snapshot()?)
2513         };
2514         vm_snapshot.add_snapshot(id, snapshot);
2515         let (id, snapshot) = {
2516             let mut device_manager = self.device_manager.lock().unwrap();
2517             (device_manager.id(), device_manager.snapshot()?)
2518         };
2519         vm_snapshot.add_snapshot(id, snapshot);
2520 
2521         event!("vm", "snapshotted");
2522         Ok(vm_snapshot)
2523     }
2524 }
2525 
2526 impl Transportable for Vm {
2527     fn send(
2528         &self,
2529         snapshot: &Snapshot,
2530         destination_url: &str,
2531     ) -> std::result::Result<(), MigratableError> {
2532         let mut snapshot_config_path = url_to_path(destination_url)?;
2533         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2534 
2535         // Create the snapshot config file
2536         let mut snapshot_config_file = OpenOptions::new()
2537             .read(true)
2538             .write(true)
2539             .create_new(true)
2540             .open(snapshot_config_path)
2541             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2542 
2543         // Serialize and write the snapshot config
2544         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2545             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2546 
2547         snapshot_config_file
2548             .write(vm_config.as_bytes())
2549             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2550 
2551         let mut snapshot_state_path = url_to_path(destination_url)?;
2552         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2553 
2554         // Create the snapshot state file
2555         let mut snapshot_state_file = OpenOptions::new()
2556             .read(true)
2557             .write(true)
2558             .create_new(true)
2559             .open(snapshot_state_path)
2560             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2561 
2562         // Serialize and write the snapshot state
2563         let vm_state =
2564             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2565 
2566         snapshot_state_file
2567             .write(&vm_state)
2568             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2569 
2570         // Tell the memory manager to also send/write its own snapshot.
2571         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2572             self.memory_manager
2573                 .lock()
2574                 .unwrap()
2575                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2576         } else {
2577             return Err(MigratableError::Restore(anyhow!(
2578                 "Missing memory manager snapshot"
2579             )));
2580         }
2581 
2582         Ok(())
2583     }
2584 }
2585 
2586 impl Migratable for Vm {
2587     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2588         self.memory_manager.lock().unwrap().start_dirty_log()?;
2589         self.device_manager.lock().unwrap().start_dirty_log()
2590     }
2591 
2592     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2593         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2594         self.device_manager.lock().unwrap().stop_dirty_log()
2595     }
2596 
2597     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2598         Ok(MemoryRangeTable::new_from_tables(vec![
2599             self.memory_manager.lock().unwrap().dirty_log()?,
2600             self.device_manager.lock().unwrap().dirty_log()?,
2601         ]))
2602     }
2603 
2604     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2605         self.memory_manager.lock().unwrap().start_migration()?;
2606         self.device_manager.lock().unwrap().start_migration()
2607     }
2608 
2609     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2610         self.memory_manager.lock().unwrap().complete_migration()?;
2611         self.device_manager.lock().unwrap().complete_migration()
2612     }
2613 }
2614 
#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Forwards the guest debug configuration (breakpoint addresses and
    /// single-stepping flag) for vCPU `cpu_id` to the CPU manager.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM if it is currently running, then marks it as being in
    /// the `BreakPoint` state.
    ///
    /// # Errors
    ///
    /// Returns `DebuggableError::Pause` if pausing fails and
    /// `DebuggableError::PoisonedState` if the state write lock cannot be
    /// acquired.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes the VM if it is currently in the `BreakPoint` state; does
    /// nothing in any other state.
    ///
    /// # Errors
    ///
    /// Returns `DebuggableError::Resume` if resuming fails.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            // The failure originates from `resume()`, so report it as a
            // resume error rather than a pause error.
            self.resume().map_err(DebuggableError::Resume)?;
        }

        Ok(())
    }

    /// Reads the core registers of vCPU `cpu_id` through the CPU manager.
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    /// Writes `regs` to vCPU `cpu_id` through the CPU manager.
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    /// Reads `len` bytes of guest memory at `vaddr`, as seen from vCPU
    /// `cpu_id`, through the CPU manager.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    /// Writes `data` to guest memory at `vaddr`, as seen from vCPU `cpu_id`,
    /// through the CPU manager.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Returns the number of active vCPUs, falling back to the number of boot
    /// vCPUs when the CPU manager reports none.
    fn active_vcpus(&self) -> usize {
        let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            self.cpu_manager.lock().unwrap().boot_vcpus() as usize
        }
    }
}
2698 
/// Largest value of an unsigned 16-bit integer (65535), widened to `u32`.
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = u16::MAX as u32;
2701 
// `Vm` relies entirely on the default methods of `Elf64Writable`.
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes a core dump of the guest to `destination_url`.
    ///
    /// A running VM is paused for the duration of the dump and resumed
    /// afterwards, whether or not the dump succeeded. A VM that was already
    /// paused stays paused. The dump consists of the ELF header, the note and
    /// load headers, the CPU notes written by the CPU manager and the guest
    /// memory written by the memory manager.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` when the VM has TDX enabled
    /// (only on builds with the `tdx` feature), when the VM state cannot be
    /// read, or when the VM is neither running nor paused. Returns
    /// `GuestDebuggableError::Pause` / `GuestDebuggableError::Resume` when the
    /// implicit pause / resume fails. Errors from the individual dump steps
    /// are propagated; if both the dump and the resume fail, the dump error
    /// is the one returned.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        // Report a failed state lookup to the caller instead of panicking.
        let current_state = self.get_state().map_err(|e| {
            GuestDebuggableError::Coredump(anyhow!("Could not get VM state: {:?}", e))
        })?;

        // `resume` records whether this call paused the VM itself.
        let resume = match current_state {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                true
            }
            VmState::Paused => false,
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        };

        // Run every dump step inside a closure so that a failure does not
        // return before the VM has had a chance to be resumed.
        let dump_result = (|| -> std::result::Result<(), GuestDebuggableError> {
            let coredump_state = self.get_dump_state(destination_url)?;

            self.write_header(&coredump_state)?;
            self.write_note(&coredump_state)?;
            self.write_loads(&coredump_state)?;

            self.cpu_manager
                .lock()
                .unwrap()
                .cpu_write_elf64_note(&coredump_state)?;
            self.cpu_manager
                .lock()
                .unwrap()
                .cpu_write_vmm_note(&coredump_state)?;

            self.memory_manager
                .lock()
                .unwrap()
                .coredump_iterate_save_mem(&coredump_state)?;

            Ok(())
        })();

        if resume {
            let resume_result = self.resume().map_err(GuestDebuggableError::Resume);
            // The dump failure is the root cause, so it takes precedence.
            dump_result?;
            resume_result
        } else {
            dump_result
        }
    }
}
2763 
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts which target states `valid_transition` accepts and rejects
    /// when starting from `state`.
    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::BreakPoint => {
                // Check the transitions from Breakpoint
                assert!(state.valid_transition(VmState::Created).is_ok());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    // Exercises the `BreakPoint` arm of the helper, which no other test runs.
    #[test]
    fn test_vm_breakpoint_transitions() {
        test_vm_state_transitions(VmState::BreakPoint);
    }

    /// Checks the `(address, size, is_ram)` resources computed by
    /// `Vm::hob_memory_resources` for several layouts of TDVF sections
    /// relative to the guest RAM regions.
    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Builds a TDVF section with the given address and size, every other
        // field left at its default value.
        let section = |address, size| TdvfSection {
            address,
            size,
            ..Default::default()
        };

        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![section(0xc000, 0x1000), section(0x1000, 0x4000)];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![section(0x1000_1000, 0x1000), section(0, 0x1000)];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![section(0x1000_0000, 0x2000), section(0, 0x2000)];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            section(0x2000_1000, 0x1000),
            section(0x2000_0000, 0x1000),
            section(0x1000, 0x1000),
            section(0, 0x1000),
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![section(0, 0x2000_0000)];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![section(0x1000_2000, 0x2000), section(0, 0x2000)];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![section(0x1000_0000, 0x4000), section(0, 0x4000)];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}
3039 
#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    // Length of each MMIO device region used by the test.
    const LEN: u64 = 4096;

    /// Creating an FDT must succeed for a guest with a serial device, a
    /// virtio device and an RTC, each occupying `LEN` bytes of MMIO space.
    #[test]
    fn test_create_fdt_with_devices() {
        let guest_mem = GuestMemoryMmap::from_ranges(&[(
            layout::RAM_START,
            (layout::FDT_MAX_SIZE + 0x1000) as usize,
        )])
        .expect("Cannot initialize memory");

        // Three devices laid out back to back, with consecutive IRQs.
        let serial = MmioDeviceInfo {
            addr: 0x00,
            len: LEN,
            irq: 33,
        };
        let virtio = MmioDeviceInfo {
            addr: LEN,
            len: LEN,
            irq: 34,
        };
        let rtc = MmioDeviceInfo {
            addr: 2 * LEN,
            len: LEN,
            irq: 35,
        };
        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = HashMap::from([
            ((DeviceType::Serial, DeviceType::Serial.to_string()), serial),
            ((DeviceType::Virtio(1), "virtio".to_string()), virtio),
            ((DeviceType::Rtc, "rtc".to_string()), rtc),
        ]);

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");

        let fdt = create_fdt(
            &guest_mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        );
        assert!(fdt.is_ok())
    }
}
3108 
/// Boots a minimal guest on a single vCPU and checks the VM exits it causes.
///
/// The guest code adds two registers, prints the resulting digit and a
/// newline to I/O port 0x3f8, then halts. Only `IoOut` and `Reset` exits are
/// accepted; any other exit reason makes the test panic.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};

    // This example based on https://lwn.net/Articles/658511/
    let guest_code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee,  /* out %al, (%dx) */
        0xf4,  /* hlt */
    ];

    // A single region of guest memory, starting at the load address.
    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let guest_mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor.
    for (slot, region) in guest_mem.iter().enumerate() {
        let user_region = vm.make_user_memory_region(
            slot as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(user_region)
            .expect("Cannot configure guest memory");
    }
    guest_mem
        .write_slice(&guest_code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Zero the code segment base and selector.
    let mut sregs = vcpu.get_sregs().expect("get sregs failed");
    sregs.cs.base = 0;
    sregs.cs.selector = 0;
    vcpu.set_sregs(&sregs).expect("set sregs failed");

    // Start executing at the loaded code, with the two operands in rax/rbx.
    let mut regs = vcpu.get_regs().expect("get regs failed");
    regs.rip = 0x1000;
    regs.rax = 2;
    regs.rbx = 3;
    regs.rflags = 2;
    vcpu.set_regs(&regs).expect("set regs failed");

    loop {
        let exit = vcpu.run().expect("run failed");
        match exit {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}
3179