xref: /cloud-hypervisor/vmm/src/vm.rs (revision 4d7a4c598ac247aaf770b00dfb057cdac891f67d)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
20 use crate::coredump::{
21     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
22 };
23 use crate::cpu;
24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair};
25 use crate::device_tree::DeviceTree;
26 #[cfg(feature = "guest_debug")]
27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
28 use crate::memory_manager::{
29     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
30 };
31 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
32 use crate::migration::get_vm_snapshot;
33 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
34 use crate::migration::url_to_file;
35 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
36 use crate::GuestMemoryMmap;
37 use crate::{
38     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
39 };
40 use anyhow::anyhow;
41 use arch::get_host_cpu_phys_bits;
42 #[cfg(target_arch = "x86_64")]
43 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
44 #[cfg(feature = "tdx")]
45 use arch::x86_64::tdx::TdvfSection;
46 use arch::EntryPoint;
47 #[cfg(target_arch = "aarch64")]
48 use arch::PciSpaceInfo;
49 use arch::{NumaNode, NumaNodes};
50 #[cfg(target_arch = "aarch64")]
51 use devices::interrupt_controller;
52 use devices::AcpiNotificationFlags;
53 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
54 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
55 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
56 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
57 use hypervisor::{HypervisorVmError, VmOps};
58 use libc::{termios, SIGWINCH};
59 use linux_loader::cmdline::Cmdline;
60 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
61 use linux_loader::elf;
62 #[cfg(target_arch = "x86_64")]
63 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
64 #[cfg(target_arch = "aarch64")]
65 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
66 use linux_loader::loader::KernelLoader;
67 use seccompiler::SeccompAction;
68 use serde::{Deserialize, Serialize};
69 use std::cmp;
70 use std::collections::BTreeMap;
71 use std::collections::HashMap;
72 use std::convert::TryInto;
73 use std::fs::{File, OpenOptions};
74 use std::io::{self, Seek, SeekFrom, Write};
75 #[cfg(feature = "tdx")]
76 use std::mem;
77 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
78 use std::mem::size_of;
79 use std::num::Wrapping;
80 use std::ops::Deref;
81 use std::os::unix::net::UnixStream;
82 use std::sync::{Arc, Mutex, RwLock};
83 use std::time::Instant;
84 use std::{result, str, thread};
85 use thiserror::Error;
86 use tracer::trace_scoped;
87 use vm_device::Bus;
88 #[cfg(feature = "tdx")]
89 use vm_memory::{Address, ByteValued, GuestMemory, GuestMemoryRegion};
90 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic};
91 use vm_migration::protocol::{Request, Response, Status};
92 use vm_migration::{
93     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
94     SnapshotData, Snapshottable, Transportable,
95 };
96 use vmm_sys_util::eventfd::EventFd;
97 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
98 
99 /// Errors associated with VM management
100 #[derive(Debug, Error)]
101 pub enum Error {
102     #[error("Cannot open kernel file: {0}")]
103     KernelFile(#[source] io::Error),
104 
105     #[error("Cannot open initramfs file: {0}")]
106     InitramfsFile(#[source] io::Error),
107 
108     #[error("Cannot load the kernel into memory: {0}")]
109     KernelLoad(#[source] linux_loader::loader::Error),
110 
111     #[cfg(target_arch = "aarch64")]
112     #[error("Cannot load the UEFI binary into memory: {0:?}")]
113     UefiLoad(arch::aarch64::uefi::Error),
114 
115     #[error("Cannot load the initramfs into memory")]
116     InitramfsLoad,
117 
118     #[error("Cannot load the kernel command line into memory: {0}")]
119     LoadCmdLine(#[source] linux_loader::loader::Error),
120 
121     #[error("Cannot modify the kernel command line: {0}")]
122     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
123 
124     #[error("Cannot create the kernel command line: {0}")]
125     CmdLineCreate(#[source] linux_loader::cmdline::Error),
126 
127     #[error("Cannot configure system: {0}")]
128     ConfigureSystem(#[source] arch::Error),
129 
130     #[cfg(target_arch = "aarch64")]
131     #[error("Cannot enable interrupt controller: {0:?}")]
132     EnableInterruptController(interrupt_controller::Error),
133 
134     #[error("VM state is poisoned")]
135     PoisonedState,
136 
137     #[error("Error from device manager: {0:?}")]
138     DeviceManager(DeviceManagerError),
139 
140     #[error("No device with id {0:?} to remove")]
141     NoDeviceToRemove(String),
142 
143     #[error("Cannot spawn a signal handler thread: {0}")]
144     SignalHandlerSpawn(#[source] io::Error),
145 
146     #[error("Failed to join on threads: {0:?}")]
147     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
148 
149     #[error("VM config is missing")]
150     VmMissingConfig,
151 
152     #[error("VM is not created")]
153     VmNotCreated,
154 
155     #[error("VM is already created")]
156     VmAlreadyCreated,
157 
158     #[error("VM is not running")]
159     VmNotRunning,
160 
161     #[error("Cannot clone EventFd: {0}")]
162     EventFdClone(#[source] io::Error),
163 
164     #[error("Invalid VM state transition: {0:?} to {1:?}")]
165     InvalidStateTransition(VmState, VmState),
166 
167     #[error("Error from CPU manager: {0}")]
168     CpuManager(#[source] cpu::Error),
169 
170     #[error("Cannot pause devices: {0}")]
171     PauseDevices(#[source] MigratableError),
172 
173     #[error("Cannot resume devices: {0}")]
174     ResumeDevices(#[source] MigratableError),
175 
176     #[error("Cannot pause CPUs: {0}")]
177     PauseCpus(#[source] MigratableError),
178 
179     #[error("Cannot resume CPUs: {0}")]
180     ResumeCpus(#[source] MigratableError),
181 
182     #[error("Cannot pause VM: {0}")]
183     Pause(#[source] MigratableError),
184 
185     #[error("Cannot resume VM: {0}")]
186     Resume(#[source] MigratableError),
187 
188     #[error("Memory manager error: {0:?}")]
189     MemoryManager(MemoryManagerError),
190 
191     #[error("Eventfd write error: {0}")]
192     EventfdError(#[source] std::io::Error),
193 
194     #[error("Cannot snapshot VM: {0}")]
195     Snapshot(#[source] MigratableError),
196 
197     #[error("Cannot restore VM: {0}")]
198     Restore(#[source] MigratableError),
199 
200     #[error("Cannot send VM snapshot: {0}")]
201     SnapshotSend(#[source] MigratableError),
202 
203     #[error("Invalid restore source URL")]
204     InvalidRestoreSourceUrl,
205 
206     #[error("Failed to validate config: {0}")]
207     ConfigValidation(#[source] ValidationError),
208 
209     #[error("Too many virtio-vsock devices")]
210     TooManyVsockDevices,
211 
212     #[error("Failed serializing into JSON: {0}")]
213     SerializeJson(#[source] serde_json::Error),
214 
215     #[error("Invalid NUMA configuration")]
216     InvalidNumaConfig,
217 
218     #[error("Cannot create seccomp filter: {0}")]
219     CreateSeccompFilter(#[source] seccompiler::Error),
220 
221     #[error("Cannot apply seccomp filter: {0}")]
222     ApplySeccompFilter(#[source] seccompiler::Error),
223 
224     #[error("Failed resizing a memory zone")]
225     ResizeZone,
226 
227     #[error("Cannot activate virtio devices: {0:?}")]
228     ActivateVirtioDevices(DeviceManagerError),
229 
230     #[error("Error triggering power button: {0:?}")]
231     PowerButton(DeviceManagerError),
232 
233     #[error("Kernel lacks PVH header")]
234     KernelMissingPvhHeader,
235 
236     #[error("Failed to allocate firmware RAM: {0:?}")]
237     AllocateFirmwareMemory(MemoryManagerError),
238 
239     #[error("Error manipulating firmware file: {0}")]
240     FirmwareFile(#[source] std::io::Error),
241 
242     #[error("Firmware too big")]
243     FirmwareTooLarge,
244 
245     #[error("Failed to copy firmware to memory: {0}")]
246     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
247 
248     #[cfg(feature = "sev_snp")]
249     #[error("Error enabling SEV-SNP VM: {0}")]
250     InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),
251 
252     #[cfg(feature = "tdx")]
253     #[error("Error performing I/O on TDX firmware file: {0}")]
254     LoadTdvf(#[source] std::io::Error),
255 
256     #[cfg(feature = "tdx")]
257     #[error("Error performing I/O on the TDX payload file: {0}")]
258     LoadPayload(#[source] std::io::Error),
259 
260     #[cfg(feature = "tdx")]
261     #[error("Error parsing TDVF: {0}")]
262     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
263 
264     #[cfg(feature = "tdx")]
265     #[error("Error populating TDX HOB: {0}")]
266     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
267 
268     #[cfg(feature = "tdx")]
269     #[error("Error allocating TDVF memory: {0:?}")]
270     AllocatingTdvfMemory(crate::memory_manager::Error),
271 
272     #[cfg(feature = "tdx")]
273     #[error("Error enabling TDX VM: {0}")]
274     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
275 
276     #[cfg(feature = "tdx")]
277     #[error("Error enabling TDX memory region: {0}")]
278     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
279 
280     #[cfg(feature = "tdx")]
281     #[error("Error finalizing TDX VM: {0}")]
282     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
283 
284     #[cfg(feature = "tdx")]
285     #[error("TDX firmware missing")]
286     TdxFirmwareMissing,
287 
288     #[cfg(feature = "tdx")]
289     #[error("Invalid TDX payload type")]
290     InvalidPayloadType,
291 
292     #[cfg(feature = "guest_debug")]
293     #[error("Error debugging VM: {0:?}")]
294     Debug(DebuggableError),
295 
296     #[error("Error spawning kernel loading thread")]
297     KernelLoadThreadSpawn(std::io::Error),
298 
299     #[error("Error joining kernel loading thread")]
300     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
301 
302     #[error("Payload configuration is not bootable")]
303     InvalidPayload,
304 
305     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
306     #[error("Error coredumping VM: {0:?}")]
307     Coredump(GuestDebuggableError),
308 }
309 pub type Result<T> = result::Result<T, Error>;
310 
311 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
312 pub enum VmState {
313     Created,
314     Running,
315     Shutdown,
316     Paused,
317     BreakPoint,
318 }
319 
320 impl VmState {
321     fn valid_transition(self, new_state: VmState) -> Result<()> {
322         match self {
323             VmState::Created => match new_state {
324                 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
325                 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
326                     Ok(())
327                 }
328             },
329 
330             VmState::Running => match new_state {
331                 VmState::Created | VmState::Running => {
332                     Err(Error::InvalidStateTransition(self, new_state))
333                 }
334                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
335             },
336 
337             VmState::Shutdown => match new_state {
338                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
339                     Err(Error::InvalidStateTransition(self, new_state))
340                 }
341                 VmState::Running => Ok(()),
342             },
343 
344             VmState::Paused => match new_state {
345                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
346                     Err(Error::InvalidStateTransition(self, new_state))
347                 }
348                 VmState::Running | VmState::Shutdown => Ok(()),
349             },
350             VmState::BreakPoint => match new_state {
351                 VmState::Created | VmState::Running => Ok(()),
352                 _ => Err(Error::InvalidStateTransition(self, new_state)),
353             },
354         }
355     }
356 }
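
// A minimal sketch (not in the upstream file) exercising the transition rules
// above under the standard test harness.
#[cfg(test)]
mod vm_state_transition_sketch {
    use super::*;

    #[test]
    fn example_transitions() {
        // A freshly created VM may start running...
        assert!(VmState::Created.valid_transition(VmState::Running).is_ok());
        // ...but re-entering the same state is rejected.
        assert!(VmState::Created.valid_transition(VmState::Created).is_err());
        // A shut-down VM can only be rebooted back into Running.
        assert!(VmState::Shutdown.valid_transition(VmState::Running).is_ok());
        assert!(VmState::Shutdown.valid_transition(VmState::Paused).is_err());
    }
}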
357 
358 struct VmOpsHandler {
359     memory: GuestMemoryAtomic<GuestMemoryMmap>,
360     #[cfg(target_arch = "x86_64")]
361     io_bus: Arc<Bus>,
362     mmio_bus: Arc<Bus>,
363 }
364 
365 impl VmOps for VmOpsHandler {
366     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
367         self.memory
368             .memory()
369             .write(buf, GuestAddress(gpa))
370             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
371     }
372 
373     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
374         self.memory
375             .memory()
376             .read(buf, GuestAddress(gpa))
377             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
378     }
379 
380     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
381         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
382             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
383         }
384         Ok(())
385     }
386 
387     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
388         match self.mmio_bus.write(gpa, data) {
389             Err(vm_device::BusError::MissingAddressRange) => {
390                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
391             }
392             Ok(Some(barrier)) => {
393                 info!("Waiting for barrier");
394                 barrier.wait();
395                 info!("Barrier released");
396             }
397             _ => {}
398         };
399         Ok(())
400     }
401 
402     #[cfg(target_arch = "x86_64")]
403     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
404         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
405             info!("Guest PIO read to unregistered address 0x{:x}", port);
406         }
407         Ok(())
408     }
409 
410     #[cfg(target_arch = "x86_64")]
411     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
412         match self.io_bus.write(port, data) {
413             Err(vm_device::BusError::MissingAddressRange) => {
414                 info!("Guest PIO write to unregistered address 0x{:x}", port);
415             }
416             Ok(Some(barrier)) => {
417                 info!("Waiting for barrier");
418                 barrier.wait();
419                 info!("Barrier released");
420             }
421             _ => {}
422         };
423         Ok(())
424     }
425 }
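
// Note (a sketch of the control flow, not a comment from the original source):
// this handler is passed to CpuManager::new() below as an Arc<dyn VmOps>, so
// the vCPU run loop can route MMIO/PIO exits onto the right Bus and guest
// memory accesses onto the GuestMemoryAtomic view. A bus write returning
// Ok(Some(barrier)) means the device asked for synchronization, and the vCPU
// thread blocks on barrier.wait() until the device releases it.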
426 
427 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
428     let host_phys_bits = get_host_cpu_phys_bits(hypervisor);
429 
430     cmp::min(host_phys_bits, max_phys_bits)
431 }
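
// Usage sketch for physical_bits() (mirrors the call in Vm::new below;
// `hypervisor` and `vm_config` are assumed to already exist):
//
//     let phys_bits =
//         physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
//
// i.e. the requested guest address width clamped to what the host CPU supports.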
432 
433 pub struct Vm {
434     #[cfg(feature = "tdx")]
435     kernel: Option<File>,
436     initramfs: Option<File>,
437     threads: Vec<thread::JoinHandle<()>>,
438     device_manager: Arc<Mutex<DeviceManager>>,
439     config: Arc<Mutex<VmConfig>>,
440     state: RwLock<VmState>,
441     cpu_manager: Arc<Mutex<cpu::CpuManager>>,
442     memory_manager: Arc<Mutex<MemoryManager>>,
443     #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
444     // The hypervisor-abstracted virtual machine.
445     vm: Arc<dyn hypervisor::Vm>,
446     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
447     saved_clock: Option<hypervisor::ClockData>,
448     numa_nodes: NumaNodes,
449     #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
450     hypervisor: Arc<dyn hypervisor::Hypervisor>,
451     stop_on_boot: bool,
452     load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
453 }
454 
455 impl Vm {
456     pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
457 
458     #[allow(clippy::too_many_arguments)]
459     pub fn new_from_memory_manager(
460         config: Arc<Mutex<VmConfig>>,
461         memory_manager: Arc<Mutex<MemoryManager>>,
462         vm: Arc<dyn hypervisor::Vm>,
463         exit_evt: EventFd,
464         reset_evt: EventFd,
465         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
466         seccomp_action: &SeccompAction,
467         hypervisor: Arc<dyn hypervisor::Hypervisor>,
468         activate_evt: EventFd,
469         timestamp: Instant,
470         serial_pty: Option<PtyPair>,
471         console_pty: Option<PtyPair>,
472         console_resize_pipe: Option<File>,
473         original_termios: Arc<Mutex<Option<termios>>>,
474         snapshot: Option<Snapshot>,
475     ) -> Result<Self> {
476         trace_scoped!("Vm::new_from_memory_manager");
477 
478         let boot_id_list = config
479             .lock()
480             .unwrap()
481             .validate()
482             .map_err(Error::ConfigValidation)?;
483 
484         let load_payload_handle = if snapshot.is_none() {
485             Self::load_payload_async(&memory_manager, &config)?
486         } else {
487             None
488         };
489 
490         info!("Booting VM from config: {:?}", &config);
491 
492         // Create NUMA nodes based on NumaConfig.
493         let numa_nodes =
494             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
495 
496         #[cfg(feature = "tdx")]
497         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
498         #[cfg(feature = "sev_snp")]
499         let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
500         #[cfg(feature = "tdx")]
501         let force_iommu = tdx_enabled;
502         #[cfg(not(feature = "tdx"))]
503         let force_iommu = false;
504 
505         #[cfg(feature = "guest_debug")]
506         let stop_on_boot = config.lock().unwrap().gdb;
507         #[cfg(not(feature = "guest_debug"))]
508         let stop_on_boot = false;
509 
510         let memory = memory_manager.lock().unwrap().guest_memory();
511         #[cfg(target_arch = "x86_64")]
512         let io_bus = Arc::new(Bus::new());
513         let mmio_bus = Arc::new(Bus::new());
514 
515         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
516             memory,
517             #[cfg(target_arch = "x86_64")]
518             io_bus: io_bus.clone(),
519             mmio_bus: mmio_bus.clone(),
520         });
521 
522         let cpus_config = { &config.lock().unwrap().cpus.clone() };
523         let cpu_manager = cpu::CpuManager::new(
524             cpus_config,
525             vm.clone(),
526             exit_evt.try_clone().map_err(Error::EventFdClone)?,
527             reset_evt.try_clone().map_err(Error::EventFdClone)?,
528             #[cfg(feature = "guest_debug")]
529             vm_debug_evt,
530             &hypervisor,
531             seccomp_action.clone(),
532             vm_ops,
533             #[cfg(feature = "tdx")]
534             tdx_enabled,
535             &numa_nodes,
536         )
537         .map_err(Error::CpuManager)?;
538 
539         #[cfg(target_arch = "x86_64")]
540         cpu_manager
541             .lock()
542             .unwrap()
543             .populate_cpuid(
544                 &memory_manager,
545                 &hypervisor,
546                 #[cfg(feature = "tdx")]
547                 tdx_enabled,
548             )
549             .map_err(Error::CpuManager)?;
550 
551         // The initial TDX configuration must be done before the vCPUs are
552         // created
553         #[cfg(feature = "tdx")]
554         if tdx_enabled {
555             let cpuid = cpu_manager.lock().unwrap().common_cpuid();
556             let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
557             vm.tdx_init(&cpuid, max_vcpus)
558                 .map_err(Error::InitializeTdxVm)?;
559         }
560 
561         cpu_manager
562             .lock()
563             .unwrap()
564             .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
565             .map_err(Error::CpuManager)?;
566 
567         // This initial SEV-SNP configuration must be done immediately after
568         // vCPUs are created. As part of this initialization we are
569         // transitioning the guest into a secure state.
570         #[cfg(feature = "sev_snp")]
571         if sev_snp_enabled {
572             vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
573         }
574 
575         #[cfg(feature = "tdx")]
576         let dynamic = !tdx_enabled;
577         #[cfg(not(feature = "tdx"))]
578         let dynamic = true;
579 
580         let device_manager = DeviceManager::new(
581             #[cfg(target_arch = "x86_64")]
582             io_bus,
583             mmio_bus,
584             hypervisor.hypervisor_type(),
585             vm.clone(),
586             config.clone(),
587             memory_manager.clone(),
588             cpu_manager.clone(),
589             exit_evt.try_clone().map_err(Error::EventFdClone)?,
590             reset_evt,
591             seccomp_action.clone(),
592             numa_nodes.clone(),
593             &activate_evt,
594             force_iommu,
595             boot_id_list,
596             timestamp,
597             snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
598             dynamic,
599         )
600         .map_err(Error::DeviceManager)?;
601 
602         device_manager
603             .lock()
604             .unwrap()
605             .create_devices(
606                 serial_pty,
607                 console_pty,
608                 console_resize_pipe,
609                 original_termios,
610             )
611             .map_err(Error::DeviceManager)?;
612 
613         #[cfg(feature = "tdx")]
614         let kernel = config
615             .lock()
616             .unwrap()
617             .payload
618             .as_ref()
619             .map(|p| p.kernel.as_ref().map(File::open))
620             .unwrap_or_default()
621             .transpose()
622             .map_err(Error::KernelFile)?;
623 
624         let initramfs = config
625             .lock()
626             .unwrap()
627             .payload
628             .as_ref()
629             .map(|p| p.initramfs.as_ref().map(File::open))
630             .unwrap_or_default()
631             .transpose()
632             .map_err(Error::InitramfsFile)?;
633 
634         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
635         let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
636             let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
637             vm_snapshot.clock
638         } else {
639             None
640         };
641 
642         let vm_state = if snapshot.is_some() {
643             VmState::Paused
644         } else {
645             VmState::Created
646         };
647 
648         Ok(Vm {
649             #[cfg(feature = "tdx")]
650             kernel,
651             initramfs,
652             device_manager,
653             config,
654             threads: Vec::with_capacity(1),
655             state: RwLock::new(vm_state),
656             cpu_manager,
657             memory_manager,
658             vm,
659             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
660             saved_clock,
661             numa_nodes,
662             hypervisor,
663             stop_on_boot,
664             load_payload_handle,
665         })
666     }
667 
668     fn create_numa_nodes(
669         configs: Option<Vec<NumaConfig>>,
670         memory_manager: &Arc<Mutex<MemoryManager>>,
671     ) -> Result<NumaNodes> {
672         let mm = memory_manager.lock().unwrap();
673         let mm_zones = mm.memory_zones();
674         let mut numa_nodes = BTreeMap::new();
675 
676         if let Some(configs) = &configs {
677             for config in configs.iter() {
678                 if numa_nodes.contains_key(&config.guest_numa_id) {
679                     error!("Can't define the same NUMA node twice");
680                     return Err(Error::InvalidNumaConfig);
681                 }
682 
683                 let mut node = NumaNode::default();
684 
685                 if let Some(memory_zones) = &config.memory_zones {
686                     for memory_zone in memory_zones.iter() {
687                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
688                             node.memory_regions.extend(mm_zone.regions().clone());
689                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
690                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
691                             }
692                             node.memory_zones.push(memory_zone.clone());
693                         } else {
694                             error!("Unknown memory zone '{}'", memory_zone);
695                             return Err(Error::InvalidNumaConfig);
696                         }
697                     }
698                 }
699 
700                 if let Some(cpus) = &config.cpus {
701                     node.cpus.extend(cpus);
702                 }
703 
704                 if let Some(pci_segments) = &config.pci_segments {
705                     node.pci_segments.extend(pci_segments);
706                 }
707 
708                 if let Some(distances) = &config.distances {
709                     for distance in distances.iter() {
710                         let dest = distance.destination;
711                         let dist = distance.distance;
712 
713                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
714                             error!("Unknown destination NUMA node {}", dest);
715                             return Err(Error::InvalidNumaConfig);
716                         }
717 
718                         if node.distances.contains_key(&dest) {
719                         error!("Destination NUMA node {} has already been set", dest);
720                             return Err(Error::InvalidNumaConfig);
721                         }
722 
723                         node.distances.insert(dest, dist);
724                     }
725                 }
726 
727                 #[cfg(target_arch = "x86_64")]
728                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
729                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
730                         let mm_sections = sgx_epc_region.epc_sections();
731                         for sgx_epc_section in sgx_epc_sections.iter() {
732                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
733                                 node.sgx_epc_sections.push(mm_section.clone());
734                             } else {
735                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
736                                 return Err(Error::InvalidNumaConfig);
737                             }
738                         }
739                     } else {
740                         error!("Missing SGX EPC region");
741                         return Err(Error::InvalidNumaConfig);
742                     }
743                 }
744 
745                 numa_nodes.insert(config.guest_numa_id, node);
746             }
747         }
748 
749         Ok(numa_nodes)
750     }
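
    // Illustrative mapping for create_numa_nodes() above (hypothetical values,
    // not from the original source): a NumaConfig with guest_numa_id = 0,
    // cpus = [0, 1], memory_zones = ["mem0"] and a distance entry
    // { destination: 1, distance: 20 } produces a NumaNode whose cpus,
    // memory_regions and distances fields mirror those settings, provided a
    // memory zone named "mem0" exists and NUMA node 1 is also defined.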
751 
752     #[allow(clippy::too_many_arguments)]
753     pub fn new(
754         vm_config: Arc<Mutex<VmConfig>>,
755         exit_evt: EventFd,
756         reset_evt: EventFd,
757         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
758         seccomp_action: &SeccompAction,
759         hypervisor: Arc<dyn hypervisor::Hypervisor>,
760         activate_evt: EventFd,
761         serial_pty: Option<PtyPair>,
762         console_pty: Option<PtyPair>,
763         console_resize_pipe: Option<File>,
764         original_termios: Arc<Mutex<Option<termios>>>,
765         snapshot: Option<Snapshot>,
766         source_url: Option<&str>,
767         prefault: Option<bool>,
768     ) -> Result<Self> {
769         trace_scoped!("Vm::new");
770 
771         let timestamp = Instant::now();
772 
773         #[cfg(feature = "tdx")]
774         let tdx_enabled = if snapshot.is_some() {
775             false
776         } else {
777             vm_config.lock().unwrap().is_tdx_enabled()
778         };
779 
780         #[cfg(feature = "sev_snp")]
781         let sev_snp_enabled = if snapshot.is_some() {
782             false
783         } else {
784             vm_config.lock().unwrap().is_sev_snp_enabled()
785         };
786 
787         let vm = Self::create_hypervisor_vm(
788             &hypervisor,
789             #[cfg(feature = "tdx")]
790             tdx_enabled,
791             #[cfg(feature = "sev_snp")]
792             sev_snp_enabled,
793         )?;
794 
795         let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);
796 
797         let memory_manager = if let Some(snapshot) =
798             snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
799         {
800             MemoryManager::new_from_snapshot(
801                 &snapshot,
802                 vm.clone(),
803                 &vm_config.lock().unwrap().memory.clone(),
804                 source_url,
805                 prefault.unwrap(),
806                 phys_bits,
807             )
808             .map_err(Error::MemoryManager)?
809         } else {
810             #[cfg(target_arch = "x86_64")]
811             let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();
812 
813             MemoryManager::new(
814                 vm.clone(),
815                 &vm_config.lock().unwrap().memory.clone(),
816                 None,
817                 phys_bits,
818                 #[cfg(feature = "tdx")]
819                 tdx_enabled,
820                 None,
821                 None,
822                 #[cfg(target_arch = "x86_64")]
823                 sgx_epc_config,
824             )
825             .map_err(Error::MemoryManager)?
826         };
827 
828         Vm::new_from_memory_manager(
829             vm_config,
830             memory_manager,
831             vm,
832             exit_evt,
833             reset_evt,
834             #[cfg(feature = "guest_debug")]
835             vm_debug_evt,
836             seccomp_action,
837             hypervisor,
838             activate_evt,
839             timestamp,
840             serial_pty,
841             console_pty,
842             console_resize_pipe,
843             original_termios,
844             snapshot,
845         )
846     }
847 
848     pub fn create_hypervisor_vm(
849         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
850         #[cfg(feature = "tdx")] tdx_enabled: bool,
851         #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
852     ) -> Result<Arc<dyn hypervisor::Vm>> {
853         hypervisor.check_required_extensions().unwrap();
854 
855         cfg_if::cfg_if! {
856             if #[cfg(feature = "tdx")] {
857                 // The VM type passed to create_vm_with_type() is derived from
858                 // tdx_enabled: KVM_X86_TDX_VM (1) when true, otherwise
859                 // KVM_X86_LEGACY_VM (0), i.e. u64::from(tdx_enabled).
860                 let vm = hypervisor
861                     .create_vm_with_type(u64::from(tdx_enabled))
862                     .unwrap();
863             } else if #[cfg(feature = "sev_snp")] {
864                 // The VM type is derived from sev_snp_enabled in the same way:
865                 // SEV_SNP_ENABLED (1) when true, otherwise SEV_SNP_DISABLED (0),
866                 // i.e. u64::from(sev_snp_enabled).
867                 let vm = hypervisor
868                     .create_vm_with_type(u64::from(sev_snp_enabled))
869                     .unwrap();
870             } else {
871                 let vm = hypervisor.create_vm().unwrap();
872             }
873         }
874 
875         #[cfg(target_arch = "x86_64")]
876         {
877             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
878                 .unwrap();
879             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
880             vm.enable_split_irq().unwrap();
881         }
882 
883         Ok(vm)
884     }
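
    // Usage sketch for create_hypervisor_vm() (mirroring the call in Vm::new
    // above), with the confidential-computing flags only present when the
    // matching features are compiled in:
    //
    //     let vm = Self::create_hypervisor_vm(
    //         &hypervisor,
    //         #[cfg(feature = "tdx")]
    //         tdx_enabled,
    //         #[cfg(feature = "sev_snp")]
    //         sev_snp_enabled,
    //     )?;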
885 
886     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
887         let mut initramfs = self.initramfs.as_ref().unwrap();
888         let size: usize = initramfs
889             .seek(SeekFrom::End(0))
890             .map_err(|_| Error::InitramfsLoad)?
891             .try_into()
892             .unwrap();
893         initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;
894 
895         let address =
896             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
897         let address = GuestAddress(address);
898 
899         guest_mem
900             .read_from(address, &mut initramfs, size)
901             .map_err(|_| Error::InitramfsLoad)?;
902 
903         info!("Initramfs loaded: address = 0x{:x}", address.0);
904         Ok(arch::InitramfsConfig { address, size })
905     }
906 
907     pub fn generate_cmdline(
908         payload: &PayloadConfig,
909         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
910     ) -> Result<Cmdline> {
911         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
912         if let Some(s) = payload.cmdline.as_ref() {
913             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
914         }
915 
916         #[cfg(target_arch = "aarch64")]
917         for entry in device_manager.lock().unwrap().cmdline_additions() {
918             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
919         }
920         Ok(cmdline)
921     }
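
    // Usage sketch for generate_cmdline() (as on the x86_64 load path below):
    // the payload's cmdline string becomes a boot Cmdline that is then written
    // into guest memory alongside the kernel:
    //
    //     let cmdline = Self::generate_cmdline(payload)?;
    //     Self::load_kernel(kernel, Some(cmdline), memory_manager)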
922 
923     #[cfg(target_arch = "aarch64")]
924     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
925         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
926         let mem = uefi_flash.memory();
927         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
928             .map_err(Error::UefiLoad)?;
929         Ok(())
930     }
931 
932     #[cfg(target_arch = "aarch64")]
933     fn load_kernel(
934         firmware: Option<File>,
935         kernel: Option<File>,
936         memory_manager: Arc<Mutex<MemoryManager>>,
937     ) -> Result<EntryPoint> {
938         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
939         let mem = guest_memory.memory();
940         let entry_addr = match (firmware, kernel) {
941             (None, Some(mut kernel)) => {
942                 match linux_loader::loader::pe::PE::load(
943                     mem.deref(),
944                     Some(arch::layout::KERNEL_START),
945                     &mut kernel,
946                     None,
947                 ) {
948                     Ok(entry_addr) => entry_addr.kernel_load,
949                     // Try to load the binary as kernel PE file at first.
950                     // Try to load the binary as a kernel PE file first.
951                     // If that fails, retry loading it as a UEFI binary.
952                     // Since a UEFI binary is formatless, it must be the last option tried.
953                         Self::load_firmware(&kernel, memory_manager)?;
954                         arch::layout::UEFI_START
955                     }
956                     Err(e) => {
957                         return Err(Error::KernelLoad(e));
958                     }
959                 }
960             }
961             (Some(firmware), None) => {
962                 Self::load_firmware(&firmware, memory_manager)?;
963                 arch::layout::UEFI_START
964             }
965             _ => return Err(Error::InvalidPayload),
966         };
967 
968         Ok(EntryPoint { entry_addr })
969     }
970 
971     #[cfg(target_arch = "x86_64")]
972     fn load_kernel(
973         mut kernel: File,
974         cmdline: Option<Cmdline>,
975         memory_manager: Arc<Mutex<MemoryManager>>,
976     ) -> Result<EntryPoint> {
977         info!("Loading kernel");
978 
979         let mem = {
980             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
981             guest_memory.memory()
982         };
983         let entry_addr = linux_loader::loader::elf::Elf::load(
984             mem.deref(),
985             None,
986             &mut kernel,
987             Some(arch::layout::HIGH_RAM_START),
988         )
989         .map_err(Error::KernelLoad)?;
990 
991         if let Some(cmdline) = cmdline {
992             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
993                 .map_err(Error::LoadCmdLine)?;
994         }
995 
996         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
997             // Use the PVH kernel entry point to boot the guest
998             info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
999             Ok(EntryPoint { entry_addr })
1000         } else {
1001             Err(Error::KernelMissingPvhHeader)
1002         }
1003     }
1004 
1005     #[cfg(target_arch = "x86_64")]
1006     fn load_payload(
1007         payload: &PayloadConfig,
1008         memory_manager: Arc<Mutex<MemoryManager>>,
1009     ) -> Result<EntryPoint> {
1010         trace_scoped!("load_payload");
1011         match (
1012             &payload.firmware,
1013             &payload.kernel,
1014             &payload.initramfs,
1015             &payload.cmdline,
1016         ) {
1017             (Some(firmware), None, None, None) => {
1018                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1019                 Self::load_kernel(firmware, None, memory_manager)
1020             }
1021             (None, Some(kernel), _, _) => {
1022                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1023                 let cmdline = Self::generate_cmdline(payload)?;
1024                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1025             }
1026             _ => Err(Error::InvalidPayload),
1027         }
1028     }
1029 
1030     #[cfg(target_arch = "aarch64")]
1031     fn load_payload(
1032         payload: &PayloadConfig,
1033         memory_manager: Arc<Mutex<MemoryManager>>,
1034     ) -> Result<EntryPoint> {
1035         match (&payload.firmware, &payload.kernel) {
1036             (Some(firmware), None) => {
1037                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1038                 Self::load_kernel(Some(firmware), None, memory_manager)
1039             }
1040             (None, Some(kernel)) => {
1041                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1042                 Self::load_kernel(None, Some(kernel), memory_manager)
1043             }
1044             _ => Err(Error::InvalidPayload),
1045         }
1046     }
1047 
1048     fn load_payload_async(
1049         memory_manager: &Arc<Mutex<MemoryManager>>,
1050         config: &Arc<Mutex<VmConfig>>,
1051     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1052         // With TDX enabled the kernel is loaded through a different path, so skip the async load.
1053         #[cfg(feature = "tdx")]
1054         if config.lock().unwrap().is_tdx_enabled() {
1055             return Ok(None);
1056         }
1057 
1058         config
1059             .lock()
1060             .unwrap()
1061             .payload
1062             .as_ref()
1063             .map(|payload| {
1064                 let memory_manager = memory_manager.clone();
1065                 let payload = payload.clone();
1066 
1067                 std::thread::Builder::new()
1068                     .name("payload_loader".into())
1069                     .spawn(move || Self::load_payload(&payload, memory_manager))
1070                     .map_err(Error::KernelLoadThreadSpawn)
1071             })
1072             .transpose()
1073     }
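
    // Note (not part of the original source text): the JoinHandle returned by
    // load_payload_async() is stored as Vm::load_payload_handle by
    // new_from_memory_manager(), so the payload is parsed on the
    // "payload_loader" thread in parallel with device setup and only joined
    // later, when the boot entry point is actually needed.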
1074 
1075     #[cfg(target_arch = "x86_64")]
1076     fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
1077         trace_scoped!("configure_system");
1078         info!("Configuring system");
1079         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1080 
1081         let initramfs_config = match self.initramfs {
1082             Some(_) => Some(self.load_initramfs(&mem)?),
1083             None => None,
1084         };
1085 
1086         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1087         let rsdp_addr = Some(rsdp_addr);
1088         let sgx_epc_region = self
1089             .memory_manager
1090             .lock()
1091             .unwrap()
1092             .sgx_epc_region()
1093             .as_ref()
1094             .cloned();
1095 
1096         let serial_number = self
1097             .config
1098             .lock()
1099             .unwrap()
1100             .platform
1101             .as_ref()
1102             .and_then(|p| p.serial_number.clone());
1103 
1104         let uuid = self
1105             .config
1106             .lock()
1107             .unwrap()
1108             .platform
1109             .as_ref()
1110             .and_then(|p| p.uuid.clone());
1111 
1112         let oem_strings = self
1113             .config
1114             .lock()
1115             .unwrap()
1116             .platform
1117             .as_ref()
1118             .and_then(|p| p.oem_strings.clone());
1119 
1120         let oem_strings = oem_strings
1121             .as_deref()
1122             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1123 
1124         arch::configure_system(
1125             &mem,
1126             arch::layout::CMDLINE_START,
1127             &initramfs_config,
1128             boot_vcpus,
1129             rsdp_addr,
1130             sgx_epc_region,
1131             serial_number.as_deref(),
1132             uuid.as_deref(),
1133             oem_strings.as_deref(),
1134         )
1135         .map_err(Error::ConfigureSystem)?;
1136         Ok(())
1137     }
1138 
1139     #[cfg(target_arch = "aarch64")]
1140     fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
1141         let cmdline = Self::generate_cmdline(
1142             self.config.lock().unwrap().payload.as_ref().unwrap(),
1143             &self.device_manager,
1144         )?;
1145         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1146         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1147         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1148         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1149         let initramfs_config = match self.initramfs {
1150             Some(_) => Some(self.load_initramfs(&mem)?),
1151             None => None,
1152         };
1153 
1154         let device_info = &self
1155             .device_manager
1156             .lock()
1157             .unwrap()
1158             .get_device_info()
1159             .clone();
1160 
1161         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1162             let pci_space = PciSpaceInfo {
1163                 pci_segment_id: pci_segment.id,
1164                 mmio_config_address: pci_segment.mmio_config_address,
1165                 pci_device_space_start: pci_segment.start_of_device_area,
1166                 pci_device_space_size: pci_segment.end_of_device_area
1167                     - pci_segment.start_of_device_area
1168                     + 1,
1169             };
1170             pci_space_info.push(pci_space);
1171         }
1172 
1173         let virtio_iommu_bdf = self
1174             .device_manager
1175             .lock()
1176             .unwrap()
1177             .iommu_attached_devices()
1178             .as_ref()
1179             .map(|(v, _)| *v);
1180 
1181         let vgic = self
1182             .device_manager
1183             .lock()
1184             .unwrap()
1185             .get_interrupt_controller()
1186             .unwrap()
1187             .lock()
1188             .unwrap()
1189             .get_vgic()
1190             .map_err(|_| {
1191                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1192                     arch::aarch64::Error::SetupGic,
1193                 ))
1194             })?;
1195 
1196         // The PMU interrupt is a PPI, so 16 must be added to it to obtain the real IRQ number.
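        // For example, assuming AARCH64_PMU_IRQ names PPI 7, the value passed to
        // init_pmu() below would be GIC interrupt 23 (an illustrative number; the
        // actual constant is defined in arch::aarch64::fdt).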
1197         let pmu_supported = self
1198             .cpu_manager
1199             .lock()
1200             .unwrap()
1201             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1202             .map_err(|_| {
1203                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1204                     arch::aarch64::Error::VcpuInitPmu,
1205                 ))
1206             })?;
1207 
1208         arch::configure_system(
1209             &mem,
1210             cmdline.as_cstring().unwrap().to_str().unwrap(),
1211             vcpu_mpidrs,
1212             vcpu_topology,
1213             device_info,
1214             &initramfs_config,
1215             &pci_space_info,
1216             virtio_iommu_bdf.map(|bdf| bdf.into()),
1217             &vgic,
1218             &self.numa_nodes,
1219             pmu_supported,
1220         )
1221         .map_err(Error::ConfigureSystem)?;
1222 
1223         Ok(())
1224     }
1225 
1226     pub fn serial_pty(&self) -> Option<PtyPair> {
1227         self.device_manager.lock().unwrap().serial_pty()
1228     }
1229 
1230     pub fn console_pty(&self) -> Option<PtyPair> {
1231         self.device_manager.lock().unwrap().console_pty()
1232     }
1233 
1234     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1235         self.device_manager.lock().unwrap().console_resize_pipe()
1236     }
1237 
1238     pub fn shutdown(&mut self) -> Result<()> {
1239         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1240         let new_state = VmState::Shutdown;
1241 
1242         state.valid_transition(new_state)?;
1243 
1244         // Wake up the DeviceManager threads so they will get terminated cleanly
1245         self.device_manager
1246             .lock()
1247             .unwrap()
1248             .resume()
1249             .map_err(Error::Resume)?;
1250 
1251         self.cpu_manager
1252             .lock()
1253             .unwrap()
1254             .shutdown()
1255             .map_err(Error::CpuManager)?;
1256 
1257         // Wait for all the threads to finish
1258         for thread in self.threads.drain(..) {
1259             thread.join().map_err(Error::ThreadCleanup)?
1260         }
1261         *state = new_state;
1262 
1263         event!("vm", "shutdown");
1264 
1265         Ok(())
1266     }
1267 
1268     pub fn resize(
1269         &mut self,
1270         desired_vcpus: Option<u8>,
1271         desired_memory: Option<u64>,
1272         desired_balloon: Option<u64>,
1273     ) -> Result<()> {
1274         event!("vm", "resizing");
1275 
1276         if let Some(desired_vcpus) = desired_vcpus {
1277             if self
1278                 .cpu_manager
1279                 .lock()
1280                 .unwrap()
1281                 .resize(desired_vcpus)
1282                 .map_err(Error::CpuManager)?
1283             {
1284                 self.device_manager
1285                     .lock()
1286                     .unwrap()
1287                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1288                     .map_err(Error::DeviceManager)?;
1289             }
1290             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1291         }
1292 
1293         if let Some(desired_memory) = desired_memory {
1294             let new_region = self
1295                 .memory_manager
1296                 .lock()
1297                 .unwrap()
1298                 .resize(desired_memory)
1299                 .map_err(Error::MemoryManager)?;
1300 
1301             let memory_config = &mut self.config.lock().unwrap().memory;
1302 
1303             if let Some(new_region) = &new_region {
1304                 self.device_manager
1305                     .lock()
1306                     .unwrap()
1307                     .update_memory(new_region)
1308                     .map_err(Error::DeviceManager)?;
1309 
1310                 match memory_config.hotplug_method {
1311                     HotplugMethod::Acpi => {
1312                         self.device_manager
1313                             .lock()
1314                             .unwrap()
1315                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1316                             .map_err(Error::DeviceManager)?;
1317                     }
1318                     HotplugMethod::VirtioMem => {}
1319                 }
1320             }
1321 
1322             // We update the VM config regardless of the actual guest resize
1323             // operation result (happened or not), so that if the VM reboots
1324         // it will be running with the last configured memory size.
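        // For example (hypothetical numbers): with a configured size of 4 GiB and
        // a desired_memory of 6 GiB, the ACPI branch sets size to 6 GiB, while the
        // virtio-mem branch records hotplugged_size = Some(2 GiB).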
1325             match memory_config.hotplug_method {
1326                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1327                 HotplugMethod::VirtioMem => {
1328                     if desired_memory > memory_config.size {
1329                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1330                     } else {
1331                         memory_config.hotplugged_size = None;
1332                     }
1333                 }
1334             }
1335         }
1336 
1337         if let Some(desired_balloon) = desired_balloon {
1338             self.device_manager
1339                 .lock()
1340                 .unwrap()
1341                 .resize_balloon(desired_balloon)
1342                 .map_err(Error::DeviceManager)?;
1343 
1344             // Update the configuration value for the balloon size to ensure
1345         // a reboot uses the right value.
1346             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1347                 balloon_config.size = desired_balloon;
1348             }
1349         }
1350 
1351         event!("vm", "resized");
1352 
1353         Ok(())
1354     }
1355 
1356     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1357         let memory_config = &mut self.config.lock().unwrap().memory;
1358 
1359         if let Some(zones) = &mut memory_config.zones {
1360             for zone in zones.iter_mut() {
1361                 if zone.id == id {
1362                     if desired_memory >= zone.size {
1363                         let hotplugged_size = desired_memory - zone.size;
1364                         self.memory_manager
1365                             .lock()
1366                             .unwrap()
1367                             .resize_zone(&id, desired_memory - zone.size)
1368                             .map_err(Error::MemoryManager)?;
1369                         // We update the memory zone config regardless of the
1370                         // actual 'resize-zone' operation result (happened or
1371                         // not), so that if the VM reboots it will be running
1372                         // with the last configured memory zone size.
1373                         zone.hotplugged_size = Some(hotplugged_size);
1374 
1375                         return Ok(());
1376                     } else {
1377                         error!(
1378                             "Invalid to request less memory ({}) than the boot \
1379                             RAM size ({}) for this memory zone",
1380                             desired_memory, zone.size,
1381                         );
1382                         return Err(Error::ResizeZone);
1383                     }
1384                 }
1385             }
1386         }
1387 
1388         error!("Could not find the memory zone {} for the resize", id);
1389         Err(Error::ResizeZone)
1390     }
1391 
1392     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1393         let pci_device_info = self
1394             .device_manager
1395             .lock()
1396             .unwrap()
1397             .add_device(&mut device_cfg)
1398             .map_err(Error::DeviceManager)?;
1399 
1400         // Update VmConfig by adding the new device. This is important to
1401         // ensure the device is created again if the VM reboots.
1402         {
1403             let mut config = self.config.lock().unwrap();
1404             add_to_config(&mut config.devices, device_cfg);
1405         }
1406 
1407         self.device_manager
1408             .lock()
1409             .unwrap()
1410             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1411             .map_err(Error::DeviceManager)?;
1412 
1413         Ok(pci_device_info)
1414     }
1415 
1416     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1417         let pci_device_info = self
1418             .device_manager
1419             .lock()
1420             .unwrap()
1421             .add_user_device(&mut device_cfg)
1422             .map_err(Error::DeviceManager)?;
1423 
1424         // Update VmConfig by adding the new device. This is important to
1425         // ensure the device is created again if the VM reboots.
1426         {
1427             let mut config = self.config.lock().unwrap();
1428             add_to_config(&mut config.user_devices, device_cfg);
1429         }
1430 
1431         self.device_manager
1432             .lock()
1433             .unwrap()
1434             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1435             .map_err(Error::DeviceManager)?;
1436 
1437         Ok(pci_device_info)
1438     }
1439 
1440     pub fn remove_device(&mut self, id: String) -> Result<()> {
1441         self.device_manager
1442             .lock()
1443             .unwrap()
1444             .remove_device(id.clone())
1445             .map_err(Error::DeviceManager)?;
1446 
1447         // Update VmConfig by removing the device. This is important to
1448         // ensure the device is not created again if the VM reboots.
1449         self.config.lock().unwrap().remove_device(&id);
1450 
1451         self.device_manager
1452             .lock()
1453             .unwrap()
1454             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1455             .map_err(Error::DeviceManager)?;
1456         Ok(())
1457     }
1458 
1459     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1460         let pci_device_info = self
1461             .device_manager
1462             .lock()
1463             .unwrap()
1464             .add_disk(&mut disk_cfg)
1465             .map_err(Error::DeviceManager)?;
1466 
1467         // Update VmConfig by adding the new device. This is important to
1468         // ensure the device is created again if the VM reboots.
1469         {
1470             let mut config = self.config.lock().unwrap();
1471             add_to_config(&mut config.disks, disk_cfg);
1472         }
1473 
1474         self.device_manager
1475             .lock()
1476             .unwrap()
1477             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1478             .map_err(Error::DeviceManager)?;
1479 
1480         Ok(pci_device_info)
1481     }
1482 
1483     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1484         let pci_device_info = self
1485             .device_manager
1486             .lock()
1487             .unwrap()
1488             .add_fs(&mut fs_cfg)
1489             .map_err(Error::DeviceManager)?;
1490 
1491         // Update VmConfig by adding the new device. This is important to
1492         // ensure the device is re-created in case of a reboot.
1493         {
1494             let mut config = self.config.lock().unwrap();
1495             add_to_config(&mut config.fs, fs_cfg);
1496         }
1497 
1498         self.device_manager
1499             .lock()
1500             .unwrap()
1501             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1502             .map_err(Error::DeviceManager)?;
1503 
1504         Ok(pci_device_info)
1505     }
1506 
1507     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1508         let pci_device_info = self
1509             .device_manager
1510             .lock()
1511             .unwrap()
1512             .add_pmem(&mut pmem_cfg)
1513             .map_err(Error::DeviceManager)?;
1514 
1515         // Update VmConfig by adding the new device. This is important to
1516         // ensure the device is re-created in case of a reboot.
1517         {
1518             let mut config = self.config.lock().unwrap();
1519             add_to_config(&mut config.pmem, pmem_cfg);
1520         }
1521 
1522         self.device_manager
1523             .lock()
1524             .unwrap()
1525             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1526             .map_err(Error::DeviceManager)?;
1527 
1528         Ok(pci_device_info)
1529     }
1530 
1531     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1532         let pci_device_info = self
1533             .device_manager
1534             .lock()
1535             .unwrap()
1536             .add_net(&mut net_cfg)
1537             .map_err(Error::DeviceManager)?;
1538 
1539         // Update VmConfig by adding the new device. This is important to
1540         // ensure the device is re-created in case of a reboot.
1541         {
1542             let mut config = self.config.lock().unwrap();
1543             add_to_config(&mut config.net, net_cfg);
1544         }
1545 
1546         self.device_manager
1547             .lock()
1548             .unwrap()
1549             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1550             .map_err(Error::DeviceManager)?;
1551 
1552         Ok(pci_device_info)
1553     }
1554 
1555     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1556         let pci_device_info = self
1557             .device_manager
1558             .lock()
1559             .unwrap()
1560             .add_vdpa(&mut vdpa_cfg)
1561             .map_err(Error::DeviceManager)?;
1562 
1563         // Update VmConfig by adding the new device. This is important to
1564         // ensure the device is re-created in case of a reboot.
1565         {
1566             let mut config = self.config.lock().unwrap();
1567             add_to_config(&mut config.vdpa, vdpa_cfg);
1568         }
1569 
1570         self.device_manager
1571             .lock()
1572             .unwrap()
1573             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1574             .map_err(Error::DeviceManager)?;
1575 
1576         Ok(pci_device_info)
1577     }
1578 
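    /// Hotplugs the virtio-vsock device. Unlike the other add_* helpers, the
    /// configuration entry is replaced rather than appended, since a VM carries
    /// at most one vsock device.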
1579     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1580         let pci_device_info = self
1581             .device_manager
1582             .lock()
1583             .unwrap()
1584             .add_vsock(&mut vsock_cfg)
1585             .map_err(Error::DeviceManager)?;
1586 
1587         // Update VmConfig by adding the new device. This is important to
1588         // ensure the device is re-created in case of a reboot.
1589         {
1590             let mut config = self.config.lock().unwrap();
1591             config.vsock = Some(vsock_cfg);
1592         }
1593 
1594         self.device_manager
1595             .lock()
1596             .unwrap()
1597             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1598             .map_err(Error::DeviceManager)?;
1599 
1600         Ok(pci_device_info)
1601     }
1602 
1603     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1604         Ok(self.device_manager.lock().unwrap().counters())
1605     }
1606 
1607     #[cfg(feature = "tdx")]
1608     fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1609         use arch::x86_64::tdx::*;
1610 
1611         let firmware_path = self
1612             .config
1613             .lock()
1614             .unwrap()
1615             .payload
1616             .as_ref()
1617             .unwrap()
1618             .firmware
1619             .clone()
1620             .ok_or(Error::TdxFirmwareMissing)?;
1621         // The TDVF file contains a table of sections as well as code
1622         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1623 
1624         // Parse the table of sections out of the TDVF file
1625         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1626     }
1627 
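    // Builds the list of (start, size, is_ram) memory resources for the HOB by
    // interleaving the guest RAM regions with the TDVF sections. The sections
    // must be sorted by descending address so that pop() walks them from the
    // lowest address upwards.
    //
    // Illustration (case 1 of test_hob_memory_resources below): RAM covering
    // [0, 0x1000_0000) with sections at 0x1000 (size 0x4000) and 0xc000
    // (size 0x1000) yields:
    //   (0, 0x1000, true), (0x1000, 0x4000, false), (0x5000, 0x7000, true),
    //   (0xc000, 0x1000, false), (0xd000, 0x0fff_3000, true)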
1628     #[cfg(feature = "tdx")]
1629     fn hob_memory_resources(
1630         mut sorted_sections: Vec<TdvfSection>,
1631         guest_memory: &GuestMemoryMmap,
1632     ) -> Vec<(u64, u64, bool)> {
1633         let mut list = Vec::new();
1634 
1635         let mut current_section = sorted_sections.pop();
1636 
1637         // RAM regions interleaved with TDVF sections
1638         let mut next_start_addr = 0;
1639         for region in guest_memory.iter() {
1640             let region_start = region.start_addr().0;
1641             let region_end = region.last_addr().0;
1642             if region_start > next_start_addr {
1643                 next_start_addr = region_start;
1644             }
1645 
1646             loop {
1647                 let (start, size, ram) = if let Some(section) = &current_section {
1648                     if section.address <= next_start_addr {
1649                         (section.address, section.size, false)
1650                     } else {
1651                         let last_addr = std::cmp::min(section.address - 1, region_end);
1652                         (next_start_addr, last_addr - next_start_addr + 1, true)
1653                     }
1654                 } else {
1655                     (next_start_addr, region_end - next_start_addr + 1, true)
1656                 };
1657 
1658                 list.push((start, size, ram));
1659 
1660                 if !ram {
1661                     current_section = sorted_sections.pop();
1662                 }
1663 
1664                 next_start_addr = start + size;
1665 
1666                 if region_start > next_start_addr {
1667                     next_start_addr = region_start;
1668                 }
1669 
1670                 if next_start_addr > region_end {
1671                     break;
1672                 }
1673             }
1674         }
1675 
1676         // Once all the interleaved sections have been processed, let's simply
1677         // pull the remaining ones.
1678         if let Some(section) = current_section {
1679             list.push((section.address, section.size, false));
1680         }
1681         while let Some(section) = sorted_sections.pop() {
1682             list.push((section.address, section.size, false));
1683         }
1684 
1685         list
1686     }
1687 
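    // Allocates backing RAM for any TDVF section that falls outside guest RAM,
    // copies the firmware volumes (and, if present, the payload and its command
    // line) into guest memory, then generates the TD HOB describing memory,
    // MMIO ranges, ACPI tables and payload. Returns the HOB guest address, if any.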
1688     #[cfg(feature = "tdx")]
1689     fn populate_tdx_sections(
1690         &mut self,
1691         sections: &[TdvfSection],
1692         guid_found: bool,
1693     ) -> Result<Option<u64>> {
1694         use arch::x86_64::tdx::*;
1695         // Get the memory end *before* we start adding TDVF ram regions
1696         let boot_guest_memory = self
1697             .memory_manager
1698             .lock()
1699             .as_ref()
1700             .unwrap()
1701             .boot_guest_memory();
1702         for section in sections {
1703             // No need to allocate if the section falls within guest RAM ranges
1704             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1705                 info!(
1706                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1707                     section
1708                 );
1709                 continue;
1710             }
1711 
1712             info!("Allocating TDVF Section: {:x?}", section);
1713             self.memory_manager
1714                 .lock()
1715                 .unwrap()
1716                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1717                 .map_err(Error::AllocatingTdvfMemory)?;
1718         }
1719 
1720         // The TDVF file contains a table of sections as well as code
1721         let firmware_path = self
1722             .config
1723             .lock()
1724             .unwrap()
1725             .payload
1726             .as_ref()
1727             .unwrap()
1728             .firmware
1729             .clone()
1730             .ok_or(Error::TdxFirmwareMissing)?;
1731         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1732 
1733         // The guest memory now has all the required regions, so it is safe
1734         // to copy from the TDVF file into it.
1735         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1736         let mem = guest_memory.memory();
1737         let mut payload_info = None;
1738         let mut hob_offset = None;
1739         for section in sections {
1740             info!("Populating TDVF Section: {:x?}", section);
1741             match section.r#type {
1742                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1743                     info!("Copying section to guest memory");
1744                     firmware_file
1745                         .seek(SeekFrom::Start(section.data_offset as u64))
1746                         .map_err(Error::LoadTdvf)?;
1747                     mem.read_from(
1748                         GuestAddress(section.address),
1749                         &mut firmware_file,
1750                         section.data_size as usize,
1751                     )
1752                     .unwrap();
1753                 }
1754                 TdvfSectionType::TdHob => {
1755                     hob_offset = Some(section.address);
1756                 }
1757                 TdvfSectionType::Payload => {
1758                     info!("Copying payload to guest memory");
1759                     if let Some(payload_file) = self.kernel.as_mut() {
1760                         let payload_size = payload_file
1761                             .seek(SeekFrom::End(0))
1762                             .map_err(Error::LoadPayload)?;
1763 
1764                         payload_file
1765                             .seek(SeekFrom::Start(0x1f1))
1766                             .map_err(Error::LoadPayload)?;
1767 
1768                         let mut payload_header = linux_loader::bootparam::setup_header::default();
1769                         payload_header
1770                             .as_bytes()
1771                             .read_from(
1772                                 0,
1773                                 payload_file,
1774                                 mem::size_of::<linux_loader::bootparam::setup_header>(),
1775                             )
1776                             .unwrap();
1777 
1778                         if payload_header.header != 0x5372_6448 {
1779                             return Err(Error::InvalidPayloadType);
1780                         }
1781 
1782                         if (payload_header.version < 0x0200)
1783                             || ((payload_header.loadflags & 0x1) == 0x0)
1784                         {
1785                             return Err(Error::InvalidPayloadType);
1786                         }
1787 
1788                         payload_file.rewind().map_err(Error::LoadPayload)?;
1789                         mem.read_from(
1790                             GuestAddress(section.address),
1791                             payload_file,
1792                             payload_size as usize,
1793                         )
1794                         .unwrap();
1795 
1796                         // Create the payload info that will be inserted into
1797                         // the HOB.
1798                         payload_info = Some(PayloadInfo {
1799                             image_type: PayloadImageType::BzImage,
1800                             entry_point: section.address,
1801                         });
1802                     }
1803                 }
1804                 TdvfSectionType::PayloadParam => {
1805                     info!("Copying payload parameters to guest memory");
1806                     let cmdline = Self::generate_cmdline(
1807                         self.config.lock().unwrap().payload.as_ref().unwrap(),
1808                     )?;
1809                     mem.write_slice(
1810                         cmdline.as_cstring().unwrap().as_bytes_with_nul(),
1811                         GuestAddress(section.address),
1812                     )
1813                     .unwrap();
1814                 }
1815                 _ => {}
1816             }
1817         }
1818 
1819         // Generate HOB
1820         let mut hob = TdHob::start(hob_offset.unwrap());
1821 
1822         let mut sorted_sections = sections.to_vec();
1823         sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));
1824 
1825         sorted_sections.sort_by_key(|section| section.address);
1826         sorted_sections.reverse();
1827 
1828         for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
1829             hob.add_memory_resource(&mem, start, size, ram, guid_found)
1830                 .map_err(Error::PopulateHob)?;
1831         }
1832 
1833         // MMIO regions
1834         hob.add_mmio_resource(
1835             &mem,
1836             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1837             arch::layout::APIC_START.raw_value()
1838                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1839         )
1840         .map_err(Error::PopulateHob)?;
1841         let start_of_device_area = self
1842             .memory_manager
1843             .lock()
1844             .unwrap()
1845             .start_of_device_area()
1846             .raw_value();
1847         let end_of_device_area = self
1848             .memory_manager
1849             .lock()
1850             .unwrap()
1851             .end_of_device_area()
1852             .raw_value();
1853         hob.add_mmio_resource(
1854             &mem,
1855             start_of_device_area,
1856             end_of_device_area - start_of_device_area,
1857         )
1858         .map_err(Error::PopulateHob)?;
1859 
1860         // Loop over the ACPI tables and copy them to the HOB.
1861 
1862         for acpi_table in crate::acpi::create_acpi_tables_tdx(
1863             &self.device_manager,
1864             &self.cpu_manager,
1865             &self.memory_manager,
1866             &self.numa_nodes,
1867         ) {
1868             hob.add_acpi_table(&mem, acpi_table.as_slice())
1869                 .map_err(Error::PopulateHob)?;
1870         }
1871 
1872         // If a payload info has been created, let's insert it into the HOB.
1873         if let Some(payload_info) = payload_info {
1874             hob.add_payload(&mem, payload_info)
1875                 .map_err(Error::PopulateHob)?;
1876         }
1877 
1878         hob.finish(&mem).map_err(Error::PopulateHob)?;
1879 
1880         Ok(hob_offset)
1881     }
1882 
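    // Registers the host mapping of each TDVF section with the hypervisor via
    // tdx_init_memory_region(), extending the measurement for sections flagged
    // with TDVF_SECTION_ATTRIBUTES_EXTENDMR.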
1883     #[cfg(feature = "tdx")]
1884     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1885         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1886         let mem = guest_memory.memory();
1887 
1888         for section in sections {
1889             self.vm
1890                 .tdx_init_memory_region(
1891                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
1892                     section.address,
1893                     section.size,
1894                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
1895                     section.attributes == 1,
1896                 )
1897                 .map_err(Error::InitializeTdxMemoryRegion)?;
1898         }
1899 
1900         Ok(())
1901     }
1902 
1903     // Creates the ACPI tables.
1904     // When TDX is enabled this is a no-op, since the tables are created
1905     // and passed to the guest when populating the HOB.
1906 
1907     fn create_acpi_tables(&self) -> Option<GuestAddress> {
1908         #[cfg(feature = "tdx")]
1909         if self.config.lock().unwrap().is_tdx_enabled() {
1910             return None;
1911         }
1912         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
1913         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
1914         let rsdp_addr = crate::acpi::create_acpi_tables(
1915             &mem,
1916             &self.device_manager,
1917             &self.cpu_manager,
1918             &self.memory_manager,
1919             &self.numa_nodes,
1920             tpm_enabled,
1921         );
1922         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
1923 
1924         Some(rsdp_addr)
1925     }
1926 
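    // Waits for the asynchronous kernel load (if one was started) and returns
    // the resulting entry point, or None when there is no payload to load.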
1927     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
1928         trace_scoped!("entry_point");
1929 
1930         self.load_payload_handle
1931             .take()
1932             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
1933             .transpose()
1934     }
1935 
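    /// Boots the VM (or simply resumes it if it is currently paused): ACPI
    /// tables are created, the kernel load is awaited, the vCPUs are configured
    /// with the resulting entry point, TDX sections are populated when enabled,
    /// and the boot vCPUs are finally started.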
1936     pub fn boot(&mut self) -> Result<()> {
1937         trace_scoped!("Vm::boot");
1938         info!("Booting VM");
1939         event!("vm", "booting");
1940         let current_state = self.get_state()?;
1941         if current_state == VmState::Paused {
1942             return self.resume().map_err(Error::Resume);
1943         }
1944 
1945         let new_state = if self.stop_on_boot {
1946             VmState::BreakPoint
1947         } else {
1948             VmState::Running
1949         };
1950         current_state.valid_transition(new_state)?;
1951 
1952         // Do this early so it can run in parallel with loading the kernel
1953         #[cfg(target_arch = "x86_64")]
1954         let rsdp_addr = self.create_acpi_tables();
1955 
1956         // Load the kernel synchronously, or, if it is being loaded
1957         // asynchronously, wait for the load to finish.
1958         let entry_point = self.entry_point()?;
1959 
1960         #[cfg(feature = "tdx")]
1961         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
1962 
1963         // Configure the vcpus that have been created
1964         let vcpus = self.cpu_manager.lock().unwrap().vcpus();
1965         for vcpu in vcpus {
1966             let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
1967             let boot_setup = entry_point.map(|e| (e, guest_memory));
1968             self.cpu_manager
1969                 .lock()
1970                 .unwrap()
1971                 .configure_vcpu(vcpu, boot_setup)
1972                 .map_err(Error::CpuManager)?;
1973         }
1974 
1975         #[cfg(feature = "tdx")]
1976         let (sections, guid_found) = if tdx_enabled {
1977             self.extract_tdvf_sections()?
1978         } else {
1979             (Vec::new(), false)
1980         };
1981 
1982         // Configuring the TDX regions requires that the vCPUs are created.
1983         #[cfg(feature = "tdx")]
1984         let hob_address = if tdx_enabled {
1985             // TDX sections are written to memory.
1986             self.populate_tdx_sections(&sections, guid_found)?
1987         } else {
1988             None
1989         };
1990 
1991         // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
1992         // available after they are configured
1993         #[cfg(target_arch = "aarch64")]
1994         let rsdp_addr = self.create_acpi_tables();
1995 
1996         // Configure shared state based on loaded kernel
1997         entry_point
1998             .map(|_| {
1999                 // Safe to unwrap rsdp_addr as we know it can't be None when
2000                 // the entry_point is Some.
2001                 self.configure_system(rsdp_addr.unwrap())
2002             })
2003             .transpose()?;
2004 
2005         #[cfg(target_arch = "x86_64")]
2006         // Note: For x86, always call this function before starting the boot vCPUs.
2007         // Otherwise the guest would fail to boot because we haven't created the
2008         // userspace mappings to update the hypervisor about the memory mappings.
2009         // These mappings must be created before we start the vCPU threads for
2010         // the very first time.
2011         self.memory_manager
2012             .lock()
2013             .unwrap()
2014             .allocate_address_space()
2015             .map_err(Error::MemoryManager)?;
2016 
2017         #[cfg(feature = "tdx")]
2018         if let Some(hob_address) = hob_address {
2019             // With the HOB address extracted the vCPUs can have
2020             // their TDX state configured.
2021             self.cpu_manager
2022                 .lock()
2023                 .unwrap()
2024                 .initialize_tdx(hob_address)
2025                 .map_err(Error::CpuManager)?;
2026             // Let the hypervisor know which memory ranges are shared with the
2027             // guest. This prevents the guest from ignoring/discarding memory
2028             // regions provided by the host.
2029             self.init_tdx_memory(&sections)?;
2030             // With TDX memory and CPU state configured, the TDX setup is complete
2031             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
2032         }
2033 
2034         self.cpu_manager
2035             .lock()
2036             .unwrap()
2037             .start_boot_vcpus(new_state == VmState::BreakPoint)
2038             .map_err(Error::CpuManager)?;
2039 
2040         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
2041         *state = new_state;
2042         event!("vm", "booted");
2043         Ok(())
2044     }
2045 
2046     pub fn restore(&mut self) -> Result<()> {
2047         event!("vm", "restoring");
2048 
2049         #[cfg(target_arch = "x86_64")]
2050         // Note: For x86, always call this function before starting the restored vCPUs.
2051         // Otherwise the guest would fail to boot because we haven't created the
2052         // userspace mappings to update the hypervisor about the memory mappings.
2053         // These mappings must be created before we start the vCPU threads for
2054         // the very first time for the restored VM.
2055         self.memory_manager
2056             .lock()
2057             .unwrap()
2058             .allocate_address_space()
2059             .map_err(Error::MemoryManager)?;
2060 
2061         // Now we can start all vCPUs from here.
2062         self.cpu_manager
2063             .lock()
2064             .unwrap()
2065             .start_restored_vcpus()
2066             .map_err(Error::CpuManager)?;
2067 
2068         event!("vm", "restored");
2069         Ok(())
2070     }
2071 
2072     /// Gets a thread-safe reference counted pointer to the VM configuration.
2073     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2074         Arc::clone(&self.config)
2075     }
2076 
2077     /// Get the VM state. Returns an error if the state is poisoned.
2078     pub fn get_state(&self) -> Result<VmState> {
2079         self.state
2080             .try_read()
2081             .map_err(|_| Error::PoisonedState)
2082             .map(|state| *state)
2083     }
2084 
2085     /// Gets the actual size of the balloon.
2086     pub fn balloon_size(&self) -> u64 {
2087         self.device_manager.lock().unwrap().balloon_size()
2088     }
2089 
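    /// For each guest memory slot, sends a memory-fd request on the migration
    /// socket, then the slot number with its file descriptor attached, and
    /// waits for the destination to acknowledge it.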
2090     pub fn send_memory_fds(
2091         &mut self,
2092         socket: &mut UnixStream,
2093     ) -> std::result::Result<(), MigratableError> {
2094         for (slot, fd) in self
2095             .memory_manager
2096             .lock()
2097             .unwrap()
2098             .memory_slot_fds()
2099             .drain()
2100         {
2101             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2102                 .write_to(socket)
2103                 .map_err(|e| {
2104                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2105                 })?;
2106             socket
2107                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2108                 .map_err(|e| {
2109                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2110                 })?;
2111 
2112             let res = Response::read_from(socket)?;
2113             if res.status() != Status::Ok {
2114                 warn!("Error during memory fd migration");
2115                 Request::abandon().write_to(socket)?;
2116                 Response::read_from(socket).ok();
2117                 return Err(MigratableError::MigrateSend(anyhow!(
2118                     "Error during memory fd migration"
2119                 )));
2120             }
2121         }
2122 
2123         Ok(())
2124     }
2125 
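    /// Writes the guest memory content covered by `ranges` to the given writer
    /// (typically the migration socket), retrying partial writes manually.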
2126     pub fn send_memory_regions<F>(
2127         &mut self,
2128         ranges: &MemoryRangeTable,
2129         fd: &mut F,
2130     ) -> std::result::Result<(), MigratableError>
2131     where
2132         F: Write,
2133     {
2134         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2135         let mem = guest_memory.memory();
2136 
2137         for range in ranges.regions() {
2138             let mut offset: u64 = 0;
2139             // Here we are manually handling the retry in case we can't write the
2140             // whole region at once, because we can't use the write_all_to()
2141             // implementation from vm-memory::GuestMemory as it does not follow
2142             // the correct behavior. For more info about this issue
2143             // see: https://github.com/rust-vmm/vm-memory/issues/174
2144             loop {
2145                 let bytes_written = mem
2146                     .write_to(
2147                         GuestAddress(range.gpa + offset),
2148                         fd,
2149                         (range.length - offset) as usize,
2150                     )
2151                     .map_err(|e| {
2152                         MigratableError::MigrateSend(anyhow!(
2153                             "Error transferring memory to socket: {}",
2154                             e
2155                         ))
2156                     })?;
2157                 offset += bytes_written as u64;
2158 
2159                 if offset == range.length {
2160                     break;
2161                 }
2162             }
2163         }
2164 
2165         Ok(())
2166     }
2167 
2168     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2169         self.memory_manager
2170             .lock()
2171             .unwrap()
2172             .memory_range_table(false)
2173     }
2174 
2175     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2176         self.device_manager.lock().unwrap().device_tree()
2177     }
2178 
2179     pub fn activate_virtio_devices(&self) -> Result<()> {
2180         self.device_manager
2181             .lock()
2182             .unwrap()
2183             .activate_virtio_devices()
2184             .map_err(Error::ActivateVirtioDevices)
2185     }
2186 
2187     #[cfg(target_arch = "x86_64")]
2188     pub fn power_button(&self) -> Result<()> {
2189         return self
2190             .device_manager
2191             .lock()
2192             .unwrap()
2193             .notify_power_button()
2194             .map_err(Error::PowerButton);
2195     }
2196 
2197     #[cfg(target_arch = "aarch64")]
2198     pub fn power_button(&self) -> Result<()> {
2199         self.device_manager
2200             .lock()
2201             .unwrap()
2202             .notify_power_button()
2203             .map_err(Error::PowerButton)
2204     }
2205 
2206     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2207         self.memory_manager.lock().unwrap().snapshot_data()
2208     }
2209 
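    /// Dispatches a single GDB request (registers, memory, breakpoints,
    /// single-step, pause/resume, vCPU count) to the right component and
    /// returns the matching response payload.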
2210     #[cfg(feature = "guest_debug")]
2211     pub fn debug_request(
2212         &mut self,
2213         gdb_request: &GdbRequestPayload,
2214         cpu_id: usize,
2215     ) -> Result<GdbResponsePayload> {
2216         use GdbRequestPayload::*;
2217         match gdb_request {
2218             SetSingleStep(single_step) => {
2219                 self.set_guest_debug(cpu_id, &[], *single_step)
2220                     .map_err(Error::Debug)?;
2221             }
2222             SetHwBreakPoint(addrs) => {
2223                 self.set_guest_debug(cpu_id, addrs, false)
2224                     .map_err(Error::Debug)?;
2225             }
2226             Pause => {
2227                 self.debug_pause().map_err(Error::Debug)?;
2228             }
2229             Resume => {
2230                 self.debug_resume().map_err(Error::Debug)?;
2231             }
2232             ReadRegs => {
2233                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2234                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2235             }
2236             WriteRegs(regs) => {
2237                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2238             }
2239             ReadMem(vaddr, len) => {
2240                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2241                 let mem = self
2242                     .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2243                     .map_err(Error::Debug)?;
2244                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2245             }
2246             WriteMem(vaddr, data) => {
2247                 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2248                 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2249                     .map_err(Error::Debug)?;
2250             }
2251             ActiveVcpus => {
2252                 let active_vcpus = self.active_vcpus();
2253                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2254             }
2255         }
2256         Ok(GdbResponsePayload::CommandComplete)
2257     }
2258 
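    // Prepares the coredump state: computes the ELF note size and the number of
    // program headers from the guest RAM mappings, creates the destination file
    // and collects the memory regions to be dumped.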
2259     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2260     fn get_dump_state(
2261         &mut self,
2262         destination_url: &str,
2263     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2264         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2265         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2266         let mut elf_phdr_num = 1;
2267         let elf_sh_info = 0;
2268         let coredump_file_path = url_to_file(destination_url)?;
2269         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2270 
2271         if mapping_num < UINT16_MAX - 2 {
2272             elf_phdr_num += mapping_num as u16;
2273         } else {
2274             panic!("mapping num beyond 65535 not supported");
2275         }
2276         let coredump_file = OpenOptions::new()
2277             .read(true)
2278             .write(true)
2279             .create_new(true)
2280             .open(coredump_file_path)
2281             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2282 
2283         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2284         let mem_data = self
2285             .memory_manager
2286             .lock()
2287             .unwrap()
2288             .coredump_memory_regions(mem_offset);
2289 
2290         Ok(DumpState {
2291             elf_note_size,
2292             elf_phdr_num,
2293             elf_sh_info,
2294             mem_offset,
2295             mem_info: Some(mem_data),
2296             file: Some(coredump_file),
2297         })
2298     }
2299 
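    // The dumped memory starts right after the ELF header, the note section and
    // the program headers:
    //   mem_offset = sizeof(Elf64_Ehdr) + note_size + sizeof(Elf64_Phdr) * phdr_num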
2300     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2301     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2302         size_of::<elf::Elf64_Ehdr>() as u64
2303             + note_size as u64
2304             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2305     }
2306 }
2307 
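// Pausing stops the vCPUs and the devices (saving the KVM clock on x86_64 so
// that guest time does not jump), while resuming restores the clock and starts
// them again.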
2308 impl Pausable for Vm {
2309     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2310         event!("vm", "pausing");
2311         let mut state = self
2312             .state
2313             .try_write()
2314             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2315         let new_state = VmState::Paused;
2316 
2317         state
2318             .valid_transition(new_state)
2319             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2320 
2321         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2322         {
2323             let mut clock = self
2324                 .vm
2325                 .get_clock()
2326                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2327             clock.reset_flags();
2328             self.saved_clock = Some(clock);
2329         }
2330 
2331         // Before pausing the vCPUs, activate any virtio devices that might still
2332         // need activation, e.g. requests raised since the pause (or the migration it is part of) was started.
2333         self.activate_virtio_devices().map_err(|e| {
2334             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2335         })?;
2336 
2337         self.cpu_manager.lock().unwrap().pause()?;
2338         self.device_manager.lock().unwrap().pause()?;
2339 
2340         *state = new_state;
2341 
2342         event!("vm", "paused");
2343         Ok(())
2344     }
2345 
2346     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2347         event!("vm", "resuming");
2348         let mut state = self
2349             .state
2350             .try_write()
2351             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2352         let new_state = VmState::Running;
2353 
2354         state
2355             .valid_transition(new_state)
2356             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2357 
2358         self.cpu_manager.lock().unwrap().resume()?;
2359         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2360         {
2361             if let Some(clock) = &self.saved_clock {
2362                 self.vm.set_clock(clock).map_err(|e| {
2363                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2364                 })?;
2365             }
2366         }
2367         self.device_manager.lock().unwrap().resume()?;
2368 
2369         // And we're back to the Running state.
2370         *state = new_state;
2371         event!("vm", "resumed");
2372         Ok(())
2373     }
2374 }
2375 
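/// VM-level state stored alongside the per-component snapshots: the saved KVM
/// clock and the common CPUID the guest was created with (x86_64/KVM only).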
2376 #[derive(Serialize, Deserialize)]
2377 pub struct VmSnapshot {
2378     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2379     pub clock: Option<hypervisor::ClockData>,
2380     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2381     pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
2382 }
2383 
2384 pub const VM_SNAPSHOT_ID: &str = "vm";
2385 impl Snapshottable for Vm {
2386     fn id(&self) -> String {
2387         VM_SNAPSHOT_ID.to_string()
2388     }
2389 
2390     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2391         event!("vm", "snapshotting");
2392 
2393         #[cfg(feature = "tdx")]
2394         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2395 
2396         #[cfg(feature = "tdx")]
2397         {
2398             if tdx_enabled {
2399                 return Err(MigratableError::Snapshot(anyhow!(
2400                     "Snapshot not possible with TDX VM"
2401                 )));
2402             }
2403         }
2404 
2405         let current_state = self.get_state().unwrap();
2406         if current_state != VmState::Paused {
2407             return Err(MigratableError::Snapshot(anyhow!(
2408                 "Trying to snapshot while VM is running"
2409             )));
2410         }
2411 
2412         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2413         let common_cpuid = {
2414             let amx = self.config.lock().unwrap().cpus.features.amx;
2415             let phys_bits = physical_bits(
2416                 &self.hypervisor,
2417                 self.config.lock().unwrap().cpus.max_phys_bits,
2418             );
2419             arch::generate_common_cpuid(
2420                 &self.hypervisor,
2421                 &arch::CpuidConfig {
2422                     sgx_epc_sections: None,
2423                     phys_bits,
2424                     kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2425                     #[cfg(feature = "tdx")]
2426                     tdx: tdx_enabled,
2427                     amx,
2428                 },
2429             )
2430             .map_err(|e| {
2431                 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2432             })?
2433         };
2434 
2435         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2436             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2437             clock: self.saved_clock,
2438             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2439             common_cpuid,
2440         })
2441         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2442 
2443         let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data));
2444 
2445         let (id, snapshot) = {
2446             let mut cpu_manager = self.cpu_manager.lock().unwrap();
2447             (cpu_manager.id(), cpu_manager.snapshot()?)
2448         };
2449         vm_snapshot.add_snapshot(id, snapshot);
2450         let (id, snapshot) = {
2451             let mut memory_manager = self.memory_manager.lock().unwrap();
2452             (memory_manager.id(), memory_manager.snapshot()?)
2453         };
2454         vm_snapshot.add_snapshot(id, snapshot);
2455         let (id, snapshot) = {
2456             let mut device_manager = self.device_manager.lock().unwrap();
2457             (device_manager.id(), device_manager.snapshot()?)
2458         };
2459         vm_snapshot.add_snapshot(id, snapshot);
2460 
2461         event!("vm", "snapshotted");
2462         Ok(vm_snapshot)
2463     }
2464 }
2465 
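// Sending a snapshot to a destination URL: the VM configuration and the VM
// state are serialized as JSON files into the snapshot directory, and the
// memory manager then sends the guest RAM content itself.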
2466 impl Transportable for Vm {
2467     fn send(
2468         &self,
2469         snapshot: &Snapshot,
2470         destination_url: &str,
2471     ) -> std::result::Result<(), MigratableError> {
2472         let mut snapshot_config_path = url_to_path(destination_url)?;
2473         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2474 
2475         // Create the snapshot config file
2476         let mut snapshot_config_file = OpenOptions::new()
2477             .read(true)
2478             .write(true)
2479             .create_new(true)
2480             .open(snapshot_config_path)
2481             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2482 
2483         // Serialize and write the snapshot config
2484         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2485             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2486 
2487         snapshot_config_file
2488             .write_all(vm_config.as_bytes())
2489             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2490 
2491         let mut snapshot_state_path = url_to_path(destination_url)?;
2492         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2493 
2494         // Create the snapshot state file
2495         let mut snapshot_state_file = OpenOptions::new()
2496             .read(true)
2497             .write(true)
2498             .create_new(true)
2499             .open(snapshot_state_path)
2500             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2501 
2502         // Serialize and write the snapshot state
2503         let vm_state =
2504             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2505 
2506         snapshot_state_file
2507             .write_all(&vm_state)
2508             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2509 
2510         // Tell the memory manager to also send/write its own snapshot.
2511         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2512             self.memory_manager
2513                 .lock()
2514                 .unwrap()
2515                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2516         } else {
2517             return Err(MigratableError::MigrateSend(anyhow!(
2518                 "Missing memory manager snapshot"
2519             )));
2520         }
2521 
2522         Ok(())
2523     }
2524 }
2525 
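// Live-migration hooks: dirty logging and the migration start/complete steps
// are simply delegated to the memory and device managers.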
2526 impl Migratable for Vm {
2527     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2528         self.memory_manager.lock().unwrap().start_dirty_log()?;
2529         self.device_manager.lock().unwrap().start_dirty_log()
2530     }
2531 
2532     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2533         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2534         self.device_manager.lock().unwrap().stop_dirty_log()
2535     }
2536 
2537     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2538         Ok(MemoryRangeTable::new_from_tables(vec![
2539             self.memory_manager.lock().unwrap().dirty_log()?,
2540             self.device_manager.lock().unwrap().dirty_log()?,
2541         ]))
2542     }
2543 
2544     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2545         self.memory_manager.lock().unwrap().start_migration()?;
2546         self.device_manager.lock().unwrap().start_migration()
2547     }
2548 
2549     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2550         self.memory_manager.lock().unwrap().complete_migration()?;
2551         self.device_manager.lock().unwrap().complete_migration()
2552     }
2553 }
2554 
2555 #[cfg(feature = "guest_debug")]
2556 impl Debuggable for Vm {
2557     fn set_guest_debug(
2558         &self,
2559         cpu_id: usize,
2560         addrs: &[GuestAddress],
2561         singlestep: bool,
2562     ) -> std::result::Result<(), DebuggableError> {
2563         self.cpu_manager
2564             .lock()
2565             .unwrap()
2566             .set_guest_debug(cpu_id, addrs, singlestep)
2567     }
2568 
2569     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2570         if *self.state.read().unwrap() == VmState::Running {
2571             self.pause().map_err(DebuggableError::Pause)?;
2572         }
2573 
2574         let mut state = self
2575             .state
2576             .try_write()
2577             .map_err(|_| DebuggableError::PoisonedState)?;
2578         *state = VmState::BreakPoint;
2579         Ok(())
2580     }
2581 
2582     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2583         if *self.state.read().unwrap() == VmState::BreakPoint {
2584             self.resume().map_err(DebuggableError::Pause)?;
2585         }
2586 
2587         Ok(())
2588     }
2589 
2590     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2591         self.cpu_manager.lock().unwrap().read_regs(cpu_id)
2592     }
2593 
2594     fn write_regs(
2595         &self,
2596         cpu_id: usize,
2597         regs: &CoreRegs,
2598     ) -> std::result::Result<(), DebuggableError> {
2599         self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
2600     }
2601 
2602     fn read_mem(
2603         &self,
2604         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2605         cpu_id: usize,
2606         vaddr: GuestAddress,
2607         len: usize,
2608     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2609         self.cpu_manager
2610             .lock()
2611             .unwrap()
2612             .read_mem(guest_memory, cpu_id, vaddr, len)
2613     }
2614 
2615     fn write_mem(
2616         &self,
2617         guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2618         cpu_id: usize,
2619         vaddr: &GuestAddress,
2620         data: &[u8],
2621     ) -> std::result::Result<(), DebuggableError> {
2622         self.cpu_manager
2623             .lock()
2624             .unwrap()
2625             .write_mem(guest_memory, cpu_id, vaddr, data)
2626     }
2627 
2628     fn active_vcpus(&self) -> usize {
2629         let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
2630         if active_vcpus > 0 {
2631             active_vcpus
2632         } else {
2633             // The VM is not booted yet. Report boot_vcpus() instead.
2634             self.cpu_manager.lock().unwrap().boot_vcpus() as usize
2635         }
2636     }
2637 }
2638 
2639 #[cfg(feature = "guest_debug")]
2640 pub const UINT16_MAX: u32 = 65535;
2641 
2642 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2643 impl Elf64Writable for Vm {}
2644 
2645 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2646 impl GuestDebuggable for Vm {
2647     fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
2648         event!("vm", "coredumping");
2649 
2650         let mut resume = false;
2651 
2652         #[cfg(feature = "tdx")]
2653         {
2654             if let Some(ref platform) = self.config.lock().unwrap().platform {
2655                 if platform.tdx {
2656                     return Err(GuestDebuggableError::Coredump(anyhow!(
2657                         "Coredump not possible with TDX VM"
2658                     )));
2659                 }
2660             }
2661         }
2662 
2663         match self.get_state().unwrap() {
2664             VmState::Running => {
2665                 self.pause().map_err(GuestDebuggableError::Pause)?;
2666                 resume = true;
2667             }
2668             VmState::Paused => {}
2669             _ => {
2670                 return Err(GuestDebuggableError::Coredump(anyhow!(
2671                     "Trying to coredump while VM is not running or paused"
2672                 )));
2673             }
2674         }
2675 
2676         let coredump_state = self.get_dump_state(destination_url)?;
2677 
2678         self.write_header(&coredump_state)?;
2679         self.write_note(&coredump_state)?;
2680         self.write_loads(&coredump_state)?;
2681 
2682         self.cpu_manager
2683             .lock()
2684             .unwrap()
2685             .cpu_write_elf64_note(&coredump_state)?;
2686         self.cpu_manager
2687             .lock()
2688             .unwrap()
2689             .cpu_write_vmm_note(&coredump_state)?;
2690 
2691         self.memory_manager
2692             .lock()
2693             .unwrap()
2694             .coredump_iterate_save_mem(&coredump_state)?;
2695 
2696         if resume {
2697             self.resume().map_err(GuestDebuggableError::Resume)?;
2698         }
2699 
2700         Ok(())
2701     }
2702 }
2703 
2704 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2705 #[cfg(test)]
2706 mod tests {
2707     use super::*;
2708 
2709     fn test_vm_state_transitions(state: VmState) {
2710         match state {
2711             VmState::Created => {
2712                 // Check the transitions from Created
2713                 assert!(state.valid_transition(VmState::Created).is_err());
2714                 assert!(state.valid_transition(VmState::Running).is_ok());
2715                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2716                 assert!(state.valid_transition(VmState::Paused).is_ok());
2717                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2718             }
2719             VmState::Running => {
2720                 // Check the transitions from Running
2721                 assert!(state.valid_transition(VmState::Created).is_err());
2722                 assert!(state.valid_transition(VmState::Running).is_err());
2723                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2724                 assert!(state.valid_transition(VmState::Paused).is_ok());
2725                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2726             }
2727             VmState::Shutdown => {
2728                 // Check the transitions from Shutdown
2729                 assert!(state.valid_transition(VmState::Created).is_err());
2730                 assert!(state.valid_transition(VmState::Running).is_ok());
2731                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2732                 assert!(state.valid_transition(VmState::Paused).is_err());
2733                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2734             }
2735             VmState::Paused => {
2736                 // Check the transitions from Paused
2737                 assert!(state.valid_transition(VmState::Created).is_err());
2738                 assert!(state.valid_transition(VmState::Running).is_ok());
2739                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2740                 assert!(state.valid_transition(VmState::Paused).is_err());
2741                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2742             }
2743             VmState::BreakPoint => {
2744                 // Check the transitions from Breakpoint
2745                 assert!(state.valid_transition(VmState::Created).is_ok());
2746                 assert!(state.valid_transition(VmState::Running).is_ok());
2747                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2748                 assert!(state.valid_transition(VmState::Paused).is_err());
2749                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2750             }
2751         }
2752     }
2753 
2754     #[test]
2755     fn test_vm_created_transitions() {
2756         test_vm_state_transitions(VmState::Created);
2757     }
2758 
2759     #[test]
2760     fn test_vm_running_transitions() {
2761         test_vm_state_transitions(VmState::Running);
2762     }
2763 
2764     #[test]
2765     fn test_vm_shutdown_transitions() {
2766         test_vm_state_transitions(VmState::Shutdown);
2767     }
2768 
2769     #[test]
2770     fn test_vm_paused_transitions() {
2771         test_vm_state_transitions(VmState::Paused);
2772     }
2773 
2774     #[cfg(feature = "tdx")]
2775     #[test]
2776     fn test_hob_memory_resources() {
2777         // Case 1: Two TDVF sections in the middle of the RAM
2778         let sections = vec![
2779             TdvfSection {
2780                 address: 0xc000,
2781                 size: 0x1000,
2782                 ..Default::default()
2783             },
2784             TdvfSection {
2785                 address: 0x1000,
2786                 size: 0x4000,
2787                 ..Default::default()
2788             },
2789         ];
2790         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
2791         let expected = vec![
2792             (0, 0x1000, true),
2793             (0x1000, 0x4000, false),
2794             (0x5000, 0x7000, true),
2795             (0xc000, 0x1000, false),
2796             (0xd000, 0x0fff_3000, true),
2797         ];
2798         assert_eq!(
2799             expected,
2800             Vm::hob_memory_resources(
2801                 sections,
2802                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2803             )
2804         );
2805 
2806         // Case 2: Two TDVF sections with no conflict with the RAM
2807         let sections = vec![
2808             TdvfSection {
2809                 address: 0x1000_1000,
2810                 size: 0x1000,
2811                 ..Default::default()
2812             },
2813             TdvfSection {
2814                 address: 0,
2815                 size: 0x1000,
2816                 ..Default::default()
2817             },
2818         ];
2819         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2820         let expected = vec![
2821             (0, 0x1000, false),
2822             (0x1000, 0x1000_0000, true),
2823             (0x1000_1000, 0x1000, false),
2824         ];
2825         assert_eq!(
2826             expected,
2827             Vm::hob_memory_resources(
2828                 sections,
2829                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2830             )
2831         );
2832 
2833         // Case 3: Two TDVF sections with partial conflicts with the RAM
2834         let sections = vec![
2835             TdvfSection {
2836                 address: 0x1000_0000,
2837                 size: 0x2000,
2838                 ..Default::default()
2839             },
2840             TdvfSection {
2841                 address: 0,
2842                 size: 0x2000,
2843                 ..Default::default()
2844             },
2845         ];
2846         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
2847         let expected = vec![
2848             (0, 0x2000, false),
2849             (0x2000, 0x0fff_e000, true),
2850             (0x1000_0000, 0x2000, false),
2851         ];
2852         assert_eq!(
2853             expected,
2854             Vm::hob_memory_resources(
2855                 sections,
2856                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2857             )
2858         );
2859 
2860         // Case 4: Two TDVF sections with no conflict before the RAM and two
2861         // more additional sections with no conflict after the RAM.
2862         let sections = vec![
2863             TdvfSection {
2864                 address: 0x2000_1000,
2865                 size: 0x1000,
2866                 ..Default::default()
2867             },
2868             TdvfSection {
2869                 address: 0x2000_0000,
2870                 size: 0x1000,
2871                 ..Default::default()
2872             },
2873             TdvfSection {
2874                 address: 0x1000,
2875                 size: 0x1000,
2876                 ..Default::default()
2877             },
2878             TdvfSection {
2879                 address: 0,
2880                 size: 0x1000,
2881                 ..Default::default()
2882             },
2883         ];
2884         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
2885         let expected = vec![
2886             (0, 0x1000, false),
2887             (0x1000, 0x1000, false),
2888             (0x4000, 0x1000_0000, true),
2889             (0x2000_0000, 0x1000, false),
2890             (0x2000_1000, 0x1000, false),
2891         ];
2892         assert_eq!(
2893             expected,
2894             Vm::hob_memory_resources(
2895                 sections,
2896                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
2897             )
2898         );
2899 
2900         // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
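        // The sections at [0, 0x2000) and [0x1000_2000, 0x1000_4000) sit
        // exactly in the gaps before and between the two RAM regions, so both
        // RAM regions are reported unchanged.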
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
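        // The section at 0 pushes the start of the first RAM region to 0x4000,
        // while the section at 0x1000_0000 cuts its tail at 0x1000_0000 and
        // pushes the start of the second RAM region to 0x1000_4000.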
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    const LEN: u64 = 4096;

    #[test]
    fn test_create_fdt_with_devices() {
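        // Build a guest memory region large enough to hold the FDT, describe
        // three MMIO devices (serial, virtio and RTC on interrupts 33-35),
        // create a vGIC with the default configuration (presumably for a
        // single vCPU here) and check that the FDT can be generated.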
        let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");
        assert!(create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .is_ok())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example is based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee,  /* out %al, (%dx) */
        0xf4,  /* hlt */
    ];
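    // With rax = 2 and rbx = 3 set below, the guest adds them, converts the
    // sum to ASCII ('5'), writes it and a newline to port 0x3f8, then halts.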

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

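    // The vCPU starts in real mode; with a zero CS base and selector, rip =
    // 0x1000 points at the code loaded at guest physical address 0x1000.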
    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
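    // Bit 1 of RFLAGS is reserved and must always be set.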
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

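    // Keep running the vCPU, echoing any port output, until the final hlt
    // (surfaced here as VmExit::Reset) ends the test.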
    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}
