xref: /cloud-hypervisor/vmm/src/vm.rs (revision 6f8bd27cf7629733582d930519e98d19e90afb16)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::{
15     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
16     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
17 };
18 use crate::config::{NumaConfig, PayloadConfig};
19 #[cfg(feature = "guest_debug")]
20 use crate::coredump::{
21     CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
22 };
23 use crate::cpu;
24 use crate::device_manager::{Console, DeviceManager, DeviceManagerError, PtyPair};
25 use crate::device_tree::DeviceTree;
26 #[cfg(feature = "guest_debug")]
27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
28 use crate::memory_manager::{
29     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
30 };
31 #[cfg(feature = "guest_debug")]
32 use crate::migration::url_to_file;
33 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
34 use crate::seccomp_filters::{get_seccomp_filter, Thread};
35 use crate::GuestMemoryMmap;
36 use crate::{
37     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
38 };
39 use anyhow::anyhow;
40 use arch::get_host_cpu_phys_bits;
41 #[cfg(target_arch = "x86_64")]
42 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
43 #[cfg(feature = "tdx")]
44 use arch::x86_64::tdx::TdvfSection;
45 use arch::EntryPoint;
46 #[cfg(target_arch = "aarch64")]
47 use arch::PciSpaceInfo;
48 use arch::{NumaNode, NumaNodes};
49 #[cfg(target_arch = "aarch64")]
50 use devices::gic::GIC_V3_ITS_SNAPSHOT_ID;
51 #[cfg(target_arch = "aarch64")]
52 use devices::interrupt_controller;
53 use devices::AcpiNotificationFlags;
54 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
55 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
56 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
57 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
58 use hypervisor::{HypervisorVmError, VmOps};
59 use linux_loader::cmdline::Cmdline;
60 #[cfg(feature = "guest_debug")]
61 use linux_loader::elf;
62 #[cfg(target_arch = "x86_64")]
63 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
64 #[cfg(target_arch = "aarch64")]
65 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
66 use linux_loader::loader::KernelLoader;
67 use seccompiler::{apply_filter, SeccompAction};
68 use serde::{Deserialize, Serialize};
69 use signal_hook::{consts::SIGWINCH, iterator::backend::Handle, iterator::Signals};
70 use std::cmp;
71 use std::collections::BTreeMap;
72 use std::collections::HashMap;
73 use std::convert::TryInto;
74 use std::fs::{File, OpenOptions};
75 use std::io::{self, Seek, SeekFrom, Write};
76 #[cfg(feature = "tdx")]
77 use std::mem;
78 #[cfg(feature = "guest_debug")]
79 use std::mem::size_of;
80 use std::num::Wrapping;
81 use std::ops::Deref;
82 use std::os::unix::net::UnixStream;
83 use std::panic::AssertUnwindSafe;
84 use std::sync::{Arc, Mutex, RwLock};
85 use std::time::Instant;
86 use std::{result, str, thread};
87 use thiserror::Error;
88 use tracer::trace_scoped;
89 use vm_device::Bus;
90 #[cfg(feature = "tdx")]
91 use vm_memory::{Address, ByteValued, GuestMemory, GuestMemoryRegion};
92 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic};
93 use vm_migration::protocol::{Request, Response, Status};
94 use vm_migration::{
95     protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
96     SnapshotDataSection, Snapshottable, Transportable,
97 };
98 use vmm_sys_util::eventfd::EventFd;
99 use vmm_sys_util::signal::unblock_signal;
100 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
101 use vmm_sys_util::terminal::Terminal;
102 
103 /// Errors associated with VM management
104 #[derive(Debug, Error)]
105 pub enum Error {
106     #[error("Cannot open kernel file: {0}")]
107     KernelFile(#[source] io::Error),
108 
109     #[error("Cannot open initramfs file: {0}")]
110     InitramfsFile(#[source] io::Error),
111 
112     #[error("Cannot load the kernel into memory: {0}")]
113     KernelLoad(#[source] linux_loader::loader::Error),
114 
115     #[cfg(target_arch = "aarch64")]
116     #[error("Cannot load the UEFI binary in memory: {0:?}")]
117     UefiLoad(arch::aarch64::uefi::Error),
118 
119     #[error("Cannot load the initramfs into memory")]
120     InitramfsLoad,
121 
122     #[error("Cannot load the kernel command line in memory: {0}")]
123     LoadCmdLine(#[source] linux_loader::loader::Error),
124 
125     #[error("Cannot modify the kernel command line: {0}")]
126     CmdLineInsertStr(#[source] linux_loader::cmdline::Error),
127 
128     #[error("Cannot create the kernel command line: {0}")]
129     CmdLineCreate(#[source] linux_loader::cmdline::Error),
130 
131     #[error("Cannot configure system: {0}")]
132     ConfigureSystem(#[source] arch::Error),
133 
134     #[cfg(target_arch = "aarch64")]
135     #[error("Cannot enable interrupt controller: {0:?}")]
136     EnableInterruptController(interrupt_controller::Error),
137 
138     #[error("VM state is poisoned")]
139     PoisonedState,
140 
141     #[error("Error from device manager: {0:?}")]
142     DeviceManager(DeviceManagerError),
143 
144     #[error("Cannot setup terminal in raw mode: {0}")]
145     SetTerminalRaw(#[source] vmm_sys_util::errno::Error),
146 
147     #[error("Cannot setup terminal in canonical mode.: {0}")]
148     SetTerminalCanon(#[source] vmm_sys_util::errno::Error),
149 
150     #[error("Cannot spawn a signal handler thread: {0}")]
151     SignalHandlerSpawn(#[source] io::Error),
152 
153     #[error("Failed to join on threads: {0:?}")]
154     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
155 
156     #[error("VM config is missing")]
157     VmMissingConfig,
158 
159     #[error("VM is not created")]
160     VmNotCreated,
161 
162     #[error("VM is already created")]
163     VmAlreadyCreated,
164 
165     #[error("VM is not running")]
166     VmNotRunning,
167 
168     #[error("Cannot clone EventFd: {0}")]
169     EventFdClone(#[source] io::Error),
170 
171     #[error("invalid VM state transition: {0:?} to {1:?}")]
172     InvalidStateTransition(VmState, VmState),
173 
174     #[error("Error from CPU manager: {0}")]
175     CpuManager(#[source] cpu::Error),
176 
177     #[error("Cannot pause devices: {0}")]
178     PauseDevices(#[source] MigratableError),
179 
180     #[error("Cannot resume devices: {0}")]
181     ResumeDevices(#[source] MigratableError),
182 
183     #[error("Cannot pause CPUs: {0}")]
184     PauseCpus(#[source] MigratableError),
185 
186     #[error("Cannot resume cpus: {0}")]
187     ResumeCpus(#[source] MigratableError),
188 
189     #[error("Cannot pause VM: {0}")]
190     Pause(#[source] MigratableError),
191 
192     #[error("Cannot resume VM: {0}")]
193     Resume(#[source] MigratableError),
194 
195     #[error("Memory manager error: {0:?}")]
196     MemoryManager(MemoryManagerError),
197 
198     #[error("Eventfd write error: {0}")]
199     EventfdError(#[source] std::io::Error),
200 
201     #[error("Cannot snapshot VM: {0}")]
202     Snapshot(#[source] MigratableError),
203 
204     #[error("Cannot restore VM: {0}")]
205     Restore(#[source] MigratableError),
206 
207     #[error("Cannot send VM snapshot: {0}")]
208     SnapshotSend(#[source] MigratableError),
209 
210     #[error("Invalid restore source URL")]
211     InvalidRestoreSourceUrl,
212 
213     #[error("Failed to validate config: {0}")]
214     ConfigValidation(#[source] ValidationError),
215 
216     #[error("Too many virtio-vsock devices")]
217     TooManyVsockDevices,
218 
219     #[error("Failed serializing into JSON: {0}")]
220     SerializeJson(#[source] serde_json::Error),
221 
222     #[error("Invalid NUMA configuration")]
223     InvalidNumaConfig,
224 
225     #[error("Cannot create seccomp filter: {0}")]
226     CreateSeccompFilter(#[source] seccompiler::Error),
227 
228     #[error("Cannot apply seccomp filter: {0}")]
229     ApplySeccompFilter(#[source] seccompiler::Error),
230 
231     #[error("Failed resizing a memory zone")]
232     ResizeZone,
233 
234     #[error("Cannot activate virtio devices: {0:?}")]
235     ActivateVirtioDevices(DeviceManagerError),
236 
237     #[error("Error triggering power button: {0:?}")]
238     PowerButton(DeviceManagerError),
239 
240     #[error("Kernel lacks PVH header")]
241     KernelMissingPvhHeader,
242 
243     #[error("Failed to allocate firmware RAM: {0:?}")]
244     AllocateFirmwareMemory(MemoryManagerError),
245 
246     #[error("Error manipulating firmware file: {0}")]
247     FirmwareFile(#[source] std::io::Error),
248 
249     #[error("Firmware too big")]
250     FirmwareTooLarge,
251 
252     #[error("Failed to copy firmware to memory: {0}")]
253     FirmwareLoad(#[source] vm_memory::GuestMemoryError),
254 
255     #[cfg(feature = "tdx")]
256     #[error("Error performing I/O on TDX firmware file: {0}")]
257     LoadTdvf(#[source] std::io::Error),
258 
259     #[cfg(feature = "tdx")]
260     #[error("Error performing I/O on the TDX payload file: {0}")]
261     LoadPayload(#[source] std::io::Error),
262 
263     #[cfg(feature = "tdx")]
264     #[error("Error parsing TDVF: {0}")]
265     ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),
266 
267     #[cfg(feature = "tdx")]
268     #[error("Error populating TDX HOB: {0}")]
269     PopulateHob(#[source] arch::x86_64::tdx::TdvfError),
270 
271     #[cfg(feature = "tdx")]
272     #[error("Error allocating TDVF memory: {0:?}")]
273     AllocatingTdvfMemory(crate::memory_manager::Error),
274 
275     #[cfg(feature = "tdx")]
276     #[error("Error enabling TDX VM: {0}")]
277     InitializeTdxVm(#[source] hypervisor::HypervisorVmError),
278 
279     #[cfg(feature = "tdx")]
280     #[error("Error enabling TDX memory region: {0}")]
281     InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),
282 
283     #[cfg(feature = "tdx")]
284     #[error("Error finalizing TDX VM: {0}")]
285     FinalizeTdx(#[source] hypervisor::HypervisorVmError),
286 
287     #[cfg(feature = "tdx")]
288     #[error("TDX firmware missing")]
289     TdxFirmwareMissing,
290 
291     #[cfg(feature = "tdx")]
292     #[error("Invalid TDX payload type")]
293     InvalidPayloadType,
294 
295     #[cfg(feature = "guest_debug")]
296     #[error("Error debugging VM: {0:?}")]
297     Debug(DebuggableError),
298 
299     #[error("Error spawning kernel loading thread")]
300     KernelLoadThreadSpawn(std::io::Error),
301 
302     #[error("Error joining kernel loading thread")]
303     KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
304 
305     #[error("Payload configuration is not bootable")]
306     InvalidPayload,
307 
308     #[cfg(feature = "guest_debug")]
309     #[error("Error coredumping VM: {0:?}")]
310     Coredump(GuestDebuggableError),
311 }
312 pub type Result<T> = result::Result<T, Error>;
313 
314 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
315 pub enum VmState {
316     Created,
317     Running,
318     Shutdown,
319     Paused,
320     BreakPoint,
321 }
322 
323 impl VmState {
324     fn valid_transition(self, new_state: VmState) -> Result<()> {
325         match self {
326             VmState::Created => match new_state {
327                 VmState::Created | VmState::Shutdown => {
328                     Err(Error::InvalidStateTransition(self, new_state))
329                 }
330                 VmState::Running | VmState::Paused | VmState::BreakPoint => Ok(()),
331             },
332 
333             VmState::Running => match new_state {
334                 VmState::Created | VmState::Running => {
335                     Err(Error::InvalidStateTransition(self, new_state))
336                 }
337                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
338             },
339 
340             VmState::Shutdown => match new_state {
341                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
342                     Err(Error::InvalidStateTransition(self, new_state))
343                 }
344                 VmState::Running => Ok(()),
345             },
346 
347             VmState::Paused => match new_state {
348                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
349                     Err(Error::InvalidStateTransition(self, new_state))
350                 }
351                 VmState::Running | VmState::Shutdown => Ok(()),
352             },
353             VmState::BreakPoint => match new_state {
354                 VmState::Created | VmState::Running => Ok(()),
355                 _ => Err(Error::InvalidStateTransition(self, new_state)),
356             },
357         }
358     }
359 }
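// A minimal sketch (not part of the upstream file): an illustrative test module
// exercising the transition table above; the module and test names are hypothetical.
#[cfg(test)]
mod vm_state_transition_examples {
    use super::*;

    #[test]
    fn created_to_running_is_allowed() {
        assert!(VmState::Created.valid_transition(VmState::Running).is_ok());
    }

    #[test]
    fn shutdown_only_transitions_back_to_running() {
        assert!(VmState::Shutdown.valid_transition(VmState::Running).is_ok());
        assert!(VmState::Shutdown.valid_transition(VmState::Paused).is_err());
    }
}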
360 
361 struct VmOpsHandler {
362     memory: GuestMemoryAtomic<GuestMemoryMmap>,
363     #[cfg(target_arch = "x86_64")]
364     io_bus: Arc<Bus>,
365     mmio_bus: Arc<Bus>,
366 }
367 
368 impl VmOps for VmOpsHandler {
369     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
370         self.memory
371             .memory()
372             .write(buf, GuestAddress(gpa))
373             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
374     }
375 
376     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
377         self.memory
378             .memory()
379             .read(buf, GuestAddress(gpa))
380             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
381     }
382 
383     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
384         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
385             info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
386         }
387         Ok(())
388     }
389 
390     fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
391         match self.mmio_bus.write(gpa, data) {
392             Err(vm_device::BusError::MissingAddressRange) => {
393                 info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
394             }
395             Ok(Some(barrier)) => {
396                 info!("Waiting for barrier");
397                 barrier.wait();
398                 info!("Barrier released");
399             }
400             _ => {}
401         };
402         Ok(())
403     }
404 
405     #[cfg(target_arch = "x86_64")]
406     fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
407         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
408             info!("Guest PIO read to unregistered address 0x{:x}", port);
409         }
410         Ok(())
411     }
412 
413     #[cfg(target_arch = "x86_64")]
414     fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
415         match self.io_bus.write(port, data) {
416             Err(vm_device::BusError::MissingAddressRange) => {
417                 info!("Guest PIO write to unregistered address 0x{:x}", port);
418             }
419             Ok(Some(barrier)) => {
420                 info!("Waiting for barrier");
421                 barrier.wait();
422                 info!("Barrier released");
423             }
424             _ => {}
425         };
426         Ok(())
427     }
428 }
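// Note on the VmOps handlers above: reads and writes to unregistered MMIO/PIO
// addresses are only logged at info level and otherwise ignored, while a
// successful bus write may return a barrier that is waited on before the vCPU
// is allowed to continue.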
429 
430 pub fn physical_bits(max_phys_bits: u8) -> u8 {
431     let host_phys_bits = get_host_cpu_phys_bits();
432 
433     cmp::min(host_phys_bits, max_phys_bits)
434 }
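// Example (hypothetical values): on a host exposing 43 physical address bits,
// physical_bits(46) returns 43, whereas physical_bits(40) returns 40.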
435 
436 pub struct Vm {
437     #[cfg(feature = "tdx")]
438     kernel: Option<File>,
439     initramfs: Option<File>,
440     threads: Vec<thread::JoinHandle<()>>,
441     device_manager: Arc<Mutex<DeviceManager>>,
442     config: Arc<Mutex<VmConfig>>,
443     on_tty: bool,
444     signals: Option<Handle>,
445     state: RwLock<VmState>,
446     cpu_manager: Arc<Mutex<cpu::CpuManager>>,
447     memory_manager: Arc<Mutex<MemoryManager>>,
448     #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
449     // The hypervisor-abstracted virtual machine.
450     vm: Arc<dyn hypervisor::Vm>,
451     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
452     saved_clock: Option<hypervisor::ClockData>,
453     numa_nodes: NumaNodes,
454     seccomp_action: SeccompAction,
455     exit_evt: EventFd,
456     hypervisor: Arc<dyn hypervisor::Hypervisor>,
457     stop_on_boot: bool,
458     load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
459 }
460 
461 impl Vm {
462     pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];
463 
464     #[allow(clippy::too_many_arguments)]
465     pub fn new_from_memory_manager(
466         config: Arc<Mutex<VmConfig>>,
467         memory_manager: Arc<Mutex<MemoryManager>>,
468         vm: Arc<dyn hypervisor::Vm>,
469         exit_evt: EventFd,
470         reset_evt: EventFd,
471         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
472         seccomp_action: &SeccompAction,
473         hypervisor: Arc<dyn hypervisor::Hypervisor>,
474         activate_evt: EventFd,
475         restoring: bool,
476         timestamp: Instant,
477         snapshot: Option<&Snapshot>,
478     ) -> Result<Self> {
479         trace_scoped!("Vm::new_from_memory_manager");
480 
481         let boot_id_list = config
482             .lock()
483             .unwrap()
484             .validate()
485             .map_err(Error::ConfigValidation)?;
486 
487         let load_payload_handle = if !restoring {
488             Self::load_payload_async(&memory_manager, &config)?
489         } else {
490             None
491         };
492 
493         info!("Booting VM from config: {:?}", &config);
494 
495         // Create NUMA nodes based on NumaConfig.
496         let numa_nodes =
497             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
498 
499         #[cfg(feature = "tdx")]
500         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
501         #[cfg(feature = "tdx")]
502         let force_iommu = tdx_enabled;
503         #[cfg(not(feature = "tdx"))]
504         let force_iommu = false;
505 
506         #[cfg(feature = "guest_debug")]
507         let stop_on_boot = config.lock().unwrap().gdb;
508         #[cfg(not(feature = "guest_debug"))]
509         let stop_on_boot = false;
510 
511         let memory = memory_manager.lock().unwrap().guest_memory();
512         #[cfg(target_arch = "x86_64")]
513         let io_bus = Arc::new(Bus::new());
514         let mmio_bus = Arc::new(Bus::new());
515 
516         let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
517             memory,
518             #[cfg(target_arch = "x86_64")]
519             io_bus: io_bus.clone(),
520             mmio_bus: mmio_bus.clone(),
521         });
522 
523         let cpus_config = { &config.lock().unwrap().cpus.clone() };
524         let cpu_manager = cpu::CpuManager::new(
525             cpus_config,
526             &memory_manager,
527             vm.clone(),
528             exit_evt.try_clone().map_err(Error::EventFdClone)?,
529             reset_evt.try_clone().map_err(Error::EventFdClone)?,
530             #[cfg(feature = "guest_debug")]
531             vm_debug_evt,
532             hypervisor.clone(),
533             seccomp_action.clone(),
534             vm_ops,
535             #[cfg(feature = "tdx")]
536             tdx_enabled,
537             &numa_nodes,
538         )
539         .map_err(Error::CpuManager)?;
540 
541         cpu_manager
542             .lock()
543             .unwrap()
544             .create_boot_vcpus()
545             .map_err(Error::CpuManager)?;
546 
547         #[cfg(feature = "tdx")]
548         let dynamic = !tdx_enabled;
549         #[cfg(not(feature = "tdx"))]
550         let dynamic = true;
551 
552         let device_manager = DeviceManager::new(
553             #[cfg(target_arch = "x86_64")]
554             io_bus,
555             mmio_bus,
556             hypervisor.hypervisor_type(),
557             vm.clone(),
558             config.clone(),
559             memory_manager.clone(),
560             cpu_manager.clone(),
561             exit_evt.try_clone().map_err(Error::EventFdClone)?,
562             reset_evt,
563             seccomp_action.clone(),
564             numa_nodes.clone(),
565             &activate_evt,
566             force_iommu,
567             restoring,
568             boot_id_list,
569             timestamp,
570             snapshot_from_id(snapshot, DEVICE_MANAGER_SNAPSHOT_ID),
571             dynamic,
572         )
573         .map_err(Error::DeviceManager)?;
574 
575         // SAFETY: trivially safe
576         let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO) } != 0;
577 
578         #[cfg(feature = "tdx")]
579         let kernel = config
580             .lock()
581             .unwrap()
582             .payload
583             .as_ref()
584             .map(|p| p.kernel.as_ref().map(File::open))
585             .unwrap_or_default()
586             .transpose()
587             .map_err(Error::KernelFile)?;
588 
589         let initramfs = config
590             .lock()
591             .unwrap()
592             .payload
593             .as_ref()
594             .map(|p| p.initramfs.as_ref().map(File::open))
595             .unwrap_or_default()
596             .transpose()
597             .map_err(Error::InitramfsFile)?;
598 
599         Ok(Vm {
600             #[cfg(feature = "tdx")]
601             kernel,
602             initramfs,
603             device_manager,
604             config,
605             on_tty,
606             threads: Vec::with_capacity(1),
607             signals: None,
608             state: RwLock::new(VmState::Created),
609             cpu_manager,
610             memory_manager,
611             vm,
612             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
613             saved_clock: None,
614             numa_nodes,
615             seccomp_action: seccomp_action.clone(),
616             exit_evt,
617             hypervisor,
618             stop_on_boot,
619             load_payload_handle,
620         })
621     }
622 
623     fn create_numa_nodes(
624         configs: Option<Vec<NumaConfig>>,
625         memory_manager: &Arc<Mutex<MemoryManager>>,
626     ) -> Result<NumaNodes> {
627         let mm = memory_manager.lock().unwrap();
628         let mm_zones = mm.memory_zones();
629         let mut numa_nodes = BTreeMap::new();
630 
631         if let Some(configs) = &configs {
632             for config in configs.iter() {
633                 if numa_nodes.contains_key(&config.guest_numa_id) {
634                     error!("Can't define twice the same NUMA node");
635                     return Err(Error::InvalidNumaConfig);
636                 }
637 
638                 let mut node = NumaNode::default();
639 
640                 if let Some(memory_zones) = &config.memory_zones {
641                     for memory_zone in memory_zones.iter() {
642                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
643                             node.memory_regions.extend(mm_zone.regions().clone());
644                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
645                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
646                             }
647                             node.memory_zones.push(memory_zone.clone());
648                         } else {
649                             error!("Unknown memory zone '{}'", memory_zone);
650                             return Err(Error::InvalidNumaConfig);
651                         }
652                     }
653                 }
654 
655                 if let Some(cpus) = &config.cpus {
656                     node.cpus.extend(cpus);
657                 }
658 
659                 if let Some(distances) = &config.distances {
660                     for distance in distances.iter() {
661                         let dest = distance.destination;
662                         let dist = distance.distance;
663 
664                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
665                             error!("Unknown destination NUMA node {}", dest);
666                             return Err(Error::InvalidNumaConfig);
667                         }
668 
669                         if node.distances.contains_key(&dest) {
670                             error!("Destination NUMA node {} has been already set", dest);
671                             return Err(Error::InvalidNumaConfig);
672                         }
673 
674                         node.distances.insert(dest, dist);
675                     }
676                 }
677 
678                 #[cfg(target_arch = "x86_64")]
679                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
680                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
681                         let mm_sections = sgx_epc_region.epc_sections();
682                         for sgx_epc_section in sgx_epc_sections.iter() {
683                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
684                                 node.sgx_epc_sections.push(mm_section.clone());
685                             } else {
686                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
687                                 return Err(Error::InvalidNumaConfig);
688                             }
689                         }
690                     } else {
691                         error!("Missing SGX EPC region");
692                         return Err(Error::InvalidNumaConfig);
693                     }
694                 }
695 
696                 numa_nodes.insert(config.guest_numa_id, node);
697             }
698         }
699 
700         Ok(numa_nodes)
701     }
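    // Summary of the validation above: each guest_numa_id must be unique, every
    // referenced memory zone (and, on x86_64, SGX EPC section) must exist in the
    // MemoryManager, and every distance destination must name a defined NUMA node
    // and may only be set once; any violation returns Error::InvalidNumaConfig.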
702 
703     #[allow(clippy::too_many_arguments)]
704     pub fn new(
705         config: Arc<Mutex<VmConfig>>,
706         exit_evt: EventFd,
707         reset_evt: EventFd,
708         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
709         seccomp_action: &SeccompAction,
710         hypervisor: Arc<dyn hypervisor::Hypervisor>,
711         activate_evt: EventFd,
712         serial_pty: Option<PtyPair>,
713         console_pty: Option<PtyPair>,
714         console_resize_pipe: Option<File>,
715     ) -> Result<Self> {
716         trace_scoped!("Vm::new");
717 
718         let timestamp = Instant::now();
719 
720         #[cfg(feature = "tdx")]
721         let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
722 
723         let vm = Self::create_hypervisor_vm(
724             &hypervisor,
725             #[cfg(feature = "tdx")]
726             tdx_enabled,
727         )?;
728 
729         let phys_bits = physical_bits(config.lock().unwrap().cpus.max_phys_bits);
730 
731         #[cfg(target_arch = "x86_64")]
732         let sgx_epc_config = config.lock().unwrap().sgx_epc.clone();
733 
734         let memory_manager = MemoryManager::new(
735             vm.clone(),
736             &config.lock().unwrap().memory.clone(),
737             None,
738             phys_bits,
739             #[cfg(feature = "tdx")]
740             tdx_enabled,
741             None,
742             None,
743             #[cfg(target_arch = "x86_64")]
744             sgx_epc_config,
745         )
746         .map_err(Error::MemoryManager)?;
747 
748         let new_vm = Vm::new_from_memory_manager(
749             config,
750             memory_manager,
751             vm,
752             exit_evt,
753             reset_evt,
754             #[cfg(feature = "guest_debug")]
755             vm_debug_evt,
756             seccomp_action,
757             hypervisor,
758             activate_evt,
759             false,
760             timestamp,
761             None,
762         )?;
763 
764         // The device manager must create the devices from here as it is part
765         // of the regular code path creating everything from scratch.
766         new_vm
767             .device_manager
768             .lock()
769             .unwrap()
770             .create_devices(serial_pty, console_pty, console_resize_pipe)
771             .map_err(Error::DeviceManager)?;
772         Ok(new_vm)
773     }
774 
775     #[allow(clippy::too_many_arguments)]
776     pub fn new_from_snapshot(
777         snapshot: &Snapshot,
778         vm_config: Arc<Mutex<VmConfig>>,
779         exit_evt: EventFd,
780         reset_evt: EventFd,
781         #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
782         source_url: Option<&str>,
783         prefault: bool,
784         seccomp_action: &SeccompAction,
785         hypervisor: Arc<dyn hypervisor::Hypervisor>,
786         activate_evt: EventFd,
787     ) -> Result<Self> {
788         let timestamp = Instant::now();
789 
790         let vm = Self::create_hypervisor_vm(
791             &hypervisor,
792             #[cfg(feature = "tdx")]
793             false,
794         )?;
795 
796         let memory_manager = if let Some(memory_manager_snapshot) =
797             snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
798         {
799             let phys_bits = physical_bits(vm_config.lock().unwrap().cpus.max_phys_bits);
800             MemoryManager::new_from_snapshot(
801                 memory_manager_snapshot,
802                 vm.clone(),
803                 &vm_config.lock().unwrap().memory.clone(),
804                 source_url,
805                 prefault,
806                 phys_bits,
807             )
808             .map_err(Error::MemoryManager)?
809         } else {
810             return Err(Error::Restore(MigratableError::Restore(anyhow!(
811                 "Missing memory manager snapshot"
812             ))));
813         };
814 
815         Vm::new_from_memory_manager(
816             vm_config,
817             memory_manager,
818             vm,
819             exit_evt,
820             reset_evt,
821             #[cfg(feature = "guest_debug")]
822             vm_debug_evt,
823             seccomp_action,
824             hypervisor,
825             activate_evt,
826             true,
827             timestamp,
828             Some(snapshot),
829         )
830     }
831 
832     pub fn create_hypervisor_vm(
833         hypervisor: &Arc<dyn hypervisor::Hypervisor>,
834         #[cfg(feature = "tdx")] tdx_enabled: bool,
835     ) -> Result<Arc<dyn hypervisor::Vm>> {
836         hypervisor.check_required_extensions().unwrap();
837 
838         #[cfg(feature = "tdx")]
839         let vm = hypervisor
840             .create_vm_with_type(if tdx_enabled {
841                 2 // KVM_X86_TDX_VM
842             } else {
843                 0 // KVM_X86_LEGACY_VM
844             })
845             .unwrap();
846         #[cfg(not(feature = "tdx"))]
847         let vm = hypervisor.create_vm().unwrap();
848 
849         #[cfg(target_arch = "x86_64")]
850         {
851             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
852                 .unwrap();
853             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
854             vm.enable_split_irq().unwrap();
855         }
856 
857         Ok(vm)
858     }
859 
860     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
861         let mut initramfs = self.initramfs.as_ref().unwrap();
862         let size: usize = initramfs
863             .seek(SeekFrom::End(0))
864             .map_err(|_| Error::InitramfsLoad)?
865             .try_into()
866             .unwrap();
867         initramfs
868             .seek(SeekFrom::Start(0))
869             .map_err(|_| Error::InitramfsLoad)?;
870 
871         let address =
872             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
873         let address = GuestAddress(address);
874 
875         guest_mem
876             .read_from(address, &mut initramfs, size)
877             .map_err(|_| Error::InitramfsLoad)?;
878 
879         info!("Initramfs loaded: address = 0x{:x}", address.0);
880         Ok(arch::InitramfsConfig { address, size })
881     }
882 
883     fn generate_cmdline(
884         payload: &PayloadConfig,
885         #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
886     ) -> Result<Cmdline> {
887         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
888         if let Some(s) = payload.cmdline.as_ref() {
889             cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
890         }
891 
892         #[cfg(target_arch = "aarch64")]
893         for entry in device_manager.lock().unwrap().cmdline_additions() {
894             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
895         }
896         Ok(cmdline)
897     }
898 
899     #[cfg(target_arch = "aarch64")]
900     fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
901         let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
902         let mem = uefi_flash.memory();
903         arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
904             .map_err(Error::UefiLoad)?;
905         Ok(())
906     }
907 
908     #[cfg(target_arch = "aarch64")]
909     fn load_kernel(
910         firmware: Option<File>,
911         kernel: Option<File>,
912         memory_manager: Arc<Mutex<MemoryManager>>,
913     ) -> Result<EntryPoint> {
914         let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
915         let mem = guest_memory.memory();
916         let entry_addr = match (firmware, kernel) {
917             (None, Some(mut kernel)) => {
918                 match linux_loader::loader::pe::PE::load(
919                     mem.deref(),
920                     Some(arch::layout::KERNEL_START),
921                     &mut kernel,
922                     None,
923                 ) {
924                     Ok(entry_addr) => entry_addr.kernel_load,
925                     // Try to load the binary as a PE kernel image first.
926                     // If that fails, retry loading it as a UEFI binary. Since the
927                     // UEFI binary has no recognizable format, it must be the last option tried.
928                     Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
929                         Self::load_firmware(&kernel, memory_manager)?;
930                         arch::layout::UEFI_START
931                     }
932                     Err(e) => {
933                         return Err(Error::KernelLoad(e));
934                     }
935                 }
936             }
937             (Some(firmware), None) => {
938                 Self::load_firmware(&firmware, memory_manager)?;
939                 arch::layout::UEFI_START
940             }
941             _ => return Err(Error::InvalidPayload),
942         };
943 
944         Ok(EntryPoint { entry_addr })
945     }
946 
947     #[cfg(target_arch = "x86_64")]
948     fn load_kernel(
949         mut kernel: File,
950         cmdline: Option<Cmdline>,
951         memory_manager: Arc<Mutex<MemoryManager>>,
952     ) -> Result<EntryPoint> {
953         info!("Loading kernel");
954 
955         let mem = {
956             let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
957             guest_memory.memory()
958         };
959         let entry_addr = linux_loader::loader::elf::Elf::load(
960             mem.deref(),
961             None,
962             &mut kernel,
963             Some(arch::layout::HIGH_RAM_START),
964         )
965         .map_err(Error::KernelLoad)?;
966 
967         if let Some(cmdline) = cmdline {
968             linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
969                 .map_err(Error::LoadCmdLine)?;
970         }
971 
972         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
973             // Use the PVH kernel entry point to boot the guest
974             info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
975             Ok(EntryPoint {
976                 entry_addr: Some(entry_addr),
977             })
978         } else {
979             Err(Error::KernelMissingPvhHeader)
980         }
981     }
982 
983     #[cfg(target_arch = "x86_64")]
984     fn load_payload(
985         payload: &PayloadConfig,
986         memory_manager: Arc<Mutex<MemoryManager>>,
987     ) -> Result<EntryPoint> {
988         trace_scoped!("load_payload");
989         match (
990             &payload.firmware,
991             &payload.kernel,
992             &payload.initramfs,
993             &payload.cmdline,
994         ) {
995             (Some(firmware), None, None, None) => {
996                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
997                 Self::load_kernel(firmware, None, memory_manager)
998             }
999             (None, Some(kernel), _, _) => {
1000                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1001                 let cmdline = Self::generate_cmdline(payload)?;
1002                 Self::load_kernel(kernel, Some(cmdline), memory_manager)
1003             }
1004             _ => Err(Error::InvalidPayload),
1005         }
1006     }
1007 
1008     #[cfg(target_arch = "aarch64")]
1009     fn load_payload(
1010         payload: &PayloadConfig,
1011         memory_manager: Arc<Mutex<MemoryManager>>,
1012     ) -> Result<EntryPoint> {
1013         match (&payload.firmware, &payload.kernel) {
1014             (Some(firmware), None) => {
1015                 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
1016                 Self::load_kernel(Some(firmware), None, memory_manager)
1017             }
1018             (None, Some(kernel)) => {
1019                 let kernel = File::open(kernel).map_err(Error::KernelFile)?;
1020                 Self::load_kernel(None, Some(kernel), memory_manager)
1021             }
1022             _ => Err(Error::InvalidPayload),
1023         }
1024     }
1025 
1026     fn load_payload_async(
1027         memory_manager: &Arc<Mutex<MemoryManager>>,
1028         config: &Arc<Mutex<VmConfig>>,
1029     ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
1030         // Kernel with TDX is loaded in a different manner
1031         #[cfg(feature = "tdx")]
1032         if config.lock().unwrap().is_tdx_enabled() {
1033             return Ok(None);
1034         }
1035 
1036         config
1037             .lock()
1038             .unwrap()
1039             .payload
1040             .as_ref()
1041             .map(|payload| {
1042                 let memory_manager = memory_manager.clone();
1043                 let payload = payload.clone();
1044 
1045                 std::thread::Builder::new()
1046                     .name("payload_loader".into())
1047                     .spawn(move || Self::load_payload(&payload, memory_manager))
1048                     .map_err(Error::KernelLoadThreadSpawn)
1049             })
1050             .transpose()
1051     }
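    // Note: the payload is loaded on a dedicated "payload_loader" thread so that
    // VM construction can proceed in parallel; the returned handle is stored on
    // the Vm and joined later. When TDX is enabled the payload is loaded elsewhere
    // and this helper returns None.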
1052 
1053     #[cfg(target_arch = "x86_64")]
1054     fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
1055         trace_scoped!("configure_system");
1056         info!("Configuring system");
1057         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1058 
1059         let initramfs_config = match self.initramfs {
1060             Some(_) => Some(self.load_initramfs(&mem)?),
1061             None => None,
1062         };
1063 
1064         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1065         let rsdp_addr = Some(rsdp_addr);
1066         let sgx_epc_region = self
1067             .memory_manager
1068             .lock()
1069             .unwrap()
1070             .sgx_epc_region()
1071             .as_ref()
1072             .cloned();
1073 
1074         let serial_number = self
1075             .config
1076             .lock()
1077             .unwrap()
1078             .platform
1079             .as_ref()
1080             .and_then(|p| p.serial_number.clone());
1081 
1082         let uuid = self
1083             .config
1084             .lock()
1085             .unwrap()
1086             .platform
1087             .as_ref()
1088             .and_then(|p| p.uuid.clone());
1089 
1090         let oem_strings = self
1091             .config
1092             .lock()
1093             .unwrap()
1094             .platform
1095             .as_ref()
1096             .and_then(|p| p.oem_strings.clone());
1097 
1098         let oem_strings = oem_strings
1099             .as_deref()
1100             .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());
1101 
1102         arch::configure_system(
1103             &mem,
1104             arch::layout::CMDLINE_START,
1105             &initramfs_config,
1106             boot_vcpus,
1107             rsdp_addr,
1108             sgx_epc_region,
1109             serial_number.as_deref(),
1110             uuid.as_deref(),
1111             oem_strings.as_deref(),
1112         )
1113         .map_err(Error::ConfigureSystem)?;
1114         Ok(())
1115     }
1116 
1117     #[cfg(target_arch = "aarch64")]
1118     fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
1119         let cmdline = Self::generate_cmdline(
1120             self.config.lock().unwrap().payload.as_ref().unwrap(),
1121             &self.device_manager,
1122         )?;
1123         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1124         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1125         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1126         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1127         let initramfs_config = match self.initramfs {
1128             Some(_) => Some(self.load_initramfs(&mem)?),
1129             None => None,
1130         };
1131 
1132         let device_info = &self
1133             .device_manager
1134             .lock()
1135             .unwrap()
1136             .get_device_info()
1137             .clone();
1138 
1139         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1140             let pci_space = PciSpaceInfo {
1141                 pci_segment_id: pci_segment.id,
1142                 mmio_config_address: pci_segment.mmio_config_address,
1143                 pci_device_space_start: pci_segment.start_of_device_area,
1144                 pci_device_space_size: pci_segment.end_of_device_area
1145                     - pci_segment.start_of_device_area
1146                     + 1,
1147             };
1148             pci_space_info.push(pci_space);
1149         }
1150 
1151         let virtio_iommu_bdf = self
1152             .device_manager
1153             .lock()
1154             .unwrap()
1155             .iommu_attached_devices()
1156             .as_ref()
1157             .map(|(v, _)| *v);
1158 
1159         let vgic = self
1160             .device_manager
1161             .lock()
1162             .unwrap()
1163             .get_interrupt_controller()
1164             .unwrap()
1165             .lock()
1166             .unwrap()
1167             .get_vgic()
1168             .map_err(|_| {
1169                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1170                     arch::aarch64::Error::SetupGic,
1171                 ))
1172             })?;
1173 
1174         // The PMU interrupt is a PPI, so 16 must be added to obtain the real IRQ number.
1175         let pmu_supported = self
1176             .cpu_manager
1177             .lock()
1178             .unwrap()
1179             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1180             .map_err(|_| {
1181                 Error::ConfigureSystem(arch::Error::PlatformSpecific(
1182                     arch::aarch64::Error::VcpuInitPmu,
1183                 ))
1184             })?;
1185 
1186         arch::configure_system(
1187             &mem,
1188             cmdline.as_cstring().unwrap().to_str().unwrap(),
1189             vcpu_mpidrs,
1190             vcpu_topology,
1191             device_info,
1192             &initramfs_config,
1193             &pci_space_info,
1194             virtio_iommu_bdf.map(|bdf| bdf.into()),
1195             &vgic,
1196             &self.numa_nodes,
1197             pmu_supported,
1198         )
1199         .map_err(Error::ConfigureSystem)?;
1200 
1201         Ok(())
1202     }
1203 
1204     pub fn serial_pty(&self) -> Option<PtyPair> {
1205         self.device_manager.lock().unwrap().serial_pty()
1206     }
1207 
1208     pub fn console_pty(&self) -> Option<PtyPair> {
1209         self.device_manager.lock().unwrap().console_pty()
1210     }
1211 
1212     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1213         self.device_manager.lock().unwrap().console_resize_pipe()
1214     }
1215 
1216     pub fn shutdown(&mut self) -> Result<()> {
1217         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1218         let new_state = VmState::Shutdown;
1219 
1220         state.valid_transition(new_state)?;
1221 
1222         if self.on_tty {
1223             // Don't forget to set the terminal back to canonical mode
1224             // before exiting.
1225             io::stdin()
1226                 .lock()
1227                 .set_canon_mode()
1228                 .map_err(Error::SetTerminalCanon)?;
1229         }
1230 
1231         // Trigger the termination of the signal_handler thread
1232         if let Some(signals) = self.signals.take() {
1233             signals.close();
1234         }
1235 
1236         // Wake up the DeviceManager threads so they will get terminated cleanly
1237         self.device_manager
1238             .lock()
1239             .unwrap()
1240             .resume()
1241             .map_err(Error::Resume)?;
1242 
1243         self.cpu_manager
1244             .lock()
1245             .unwrap()
1246             .shutdown()
1247             .map_err(Error::CpuManager)?;
1248 
1249         // Wait for all the threads to finish
1250         for thread in self.threads.drain(..) {
1251             thread.join().map_err(Error::ThreadCleanup)?
1252         }
1253         *state = new_state;
1254 
1255         event!("vm", "shutdown");
1256 
1257         Ok(())
1258     }
1259 
1260     pub fn resize(
1261         &mut self,
1262         desired_vcpus: Option<u8>,
1263         desired_memory: Option<u64>,
1264         desired_balloon: Option<u64>,
1265     ) -> Result<()> {
1266         event!("vm", "resizing");
1267 
1268         if let Some(desired_vcpus) = desired_vcpus {
1269             if self
1270                 .cpu_manager
1271                 .lock()
1272                 .unwrap()
1273                 .resize(desired_vcpus)
1274                 .map_err(Error::CpuManager)?
1275             {
1276                 self.device_manager
1277                     .lock()
1278                     .unwrap()
1279                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1280                     .map_err(Error::DeviceManager)?;
1281             }
1282             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1283         }
1284 
1285         if let Some(desired_memory) = desired_memory {
1286             let new_region = self
1287                 .memory_manager
1288                 .lock()
1289                 .unwrap()
1290                 .resize(desired_memory)
1291                 .map_err(Error::MemoryManager)?;
1292 
1293             let mut memory_config = &mut self.config.lock().unwrap().memory;
1294 
1295             if let Some(new_region) = &new_region {
1296                 self.device_manager
1297                     .lock()
1298                     .unwrap()
1299                     .update_memory(new_region)
1300                     .map_err(Error::DeviceManager)?;
1301 
1302                 match memory_config.hotplug_method {
1303                     HotplugMethod::Acpi => {
1304                         self.device_manager
1305                             .lock()
1306                             .unwrap()
1307                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1308                             .map_err(Error::DeviceManager)?;
1309                     }
1310                     HotplugMethod::VirtioMem => {}
1311                 }
1312             }
1313 
1314             // We update the VM config regardless of the actual guest resize
1315             // operation result (happened or not), so that if the VM reboots
1316             // it will be running with the last configured memory size.
1317             match memory_config.hotplug_method {
1318                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1319                 HotplugMethod::VirtioMem => {
1320                     if desired_memory > memory_config.size {
1321                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1322                     } else {
1323                         memory_config.hotplugged_size = None;
1324                     }
1325                 }
1326             }
1327         }
1328 
1329         if let Some(desired_balloon) = desired_balloon {
1330             self.device_manager
1331                 .lock()
1332                 .unwrap()
1333                 .resize_balloon(desired_balloon)
1334                 .map_err(Error::DeviceManager)?;
1335 
1336             // Update the configuration value for the balloon size to ensure
1337             // a reboot would use the right value.
1338             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1339                 balloon_config.size = desired_balloon;
1340             }
1341         }
1342 
1343         event!("vm", "resized");
1344 
1345         Ok(())
1346     }
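    // Example of the accounting above (hypothetical sizes): with the virtio-mem
    // hotplug method and a boot size of 4 GiB, resizing to 6 GiB records
    // hotplugged_size = 2 GiB, while resizing to 4 GiB or below clears it; with
    // the ACPI method the configured memory size itself is updated instead.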
1347 
1348     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1349         let memory_config = &mut self.config.lock().unwrap().memory;
1350 
1351         if let Some(zones) = &mut memory_config.zones {
1352             for zone in zones.iter_mut() {
1353                 if zone.id == id {
1354                     if desired_memory >= zone.size {
1355                         let hotplugged_size = desired_memory - zone.size;
1356                         self.memory_manager
1357                             .lock()
1358                             .unwrap()
1359                             .resize_zone(&id, desired_memory - zone.size)
1360                             .map_err(Error::MemoryManager)?;
1361                         // We update the memory zone config regardless of the
1362                         // actual 'resize-zone' operation result (happened or
1363                         // not), so that if the VM reboots it will be running
1364                         // with the last configured memory zone size.
1365                         zone.hotplugged_size = Some(hotplugged_size);
1366 
1367                         return Ok(());
1368                     } else {
1369                         error!(
1370                             "Invalid to ask less ({}) than boot RAM ({}) for \
1371                             this memory zone",
1372                             desired_memory, zone.size,
1373                         );
1374                         return Err(Error::ResizeZone);
1375                     }
1376                 }
1377             }
1378         }
1379 
1380         error!("Could not find the memory zone {} for the resize", id);
1381         Err(Error::ResizeZone)
1382     }
1383 
1384     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1385         let pci_device_info = self
1386             .device_manager
1387             .lock()
1388             .unwrap()
1389             .add_device(&mut device_cfg)
1390             .map_err(Error::DeviceManager)?;
1391 
1392         // Update VmConfig by adding the new device. This is important to
1393         // ensure the device is re-created in case of a reboot.
1394         {
1395             let mut config = self.config.lock().unwrap();
1396             add_to_config(&mut config.devices, device_cfg);
1397         }
1398 
1399         self.device_manager
1400             .lock()
1401             .unwrap()
1402             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1403             .map_err(Error::DeviceManager)?;
1404 
1405         Ok(pci_device_info)
1406     }
1407 
1408     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1409         let pci_device_info = self
1410             .device_manager
1411             .lock()
1412             .unwrap()
1413             .add_user_device(&mut device_cfg)
1414             .map_err(Error::DeviceManager)?;
1415 
1416         // Update VmConfig by adding the new device. This is important to
1417         // ensure the device is re-created in case of a reboot.
1418         {
1419             let mut config = self.config.lock().unwrap();
1420             add_to_config(&mut config.user_devices, device_cfg);
1421         }
1422 
1423         self.device_manager
1424             .lock()
1425             .unwrap()
1426             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1427             .map_err(Error::DeviceManager)?;
1428 
1429         Ok(pci_device_info)
1430     }
1431 
1432     pub fn remove_device(&mut self, id: String) -> Result<()> {
1433         self.device_manager
1434             .lock()
1435             .unwrap()
1436             .remove_device(id.clone())
1437             .map_err(Error::DeviceManager)?;
1438 
1439         // Update VmConfig by removing the device. This is important to
1440         // ensure the device is not re-created in case of a reboot.
1441         let mut config = self.config.lock().unwrap();
1442 
1443         // Remove if VFIO device
1444         if let Some(devices) = config.devices.as_mut() {
1445             devices.retain(|dev| dev.id.as_ref() != Some(&id));
1446         }
1447 
1448         // Remove if VFIO user device
1449         if let Some(user_devices) = config.user_devices.as_mut() {
1450             user_devices.retain(|dev| dev.id.as_ref() != Some(&id));
1451         }
1452 
1453         // Remove if disk device
1454         if let Some(disks) = config.disks.as_mut() {
1455             disks.retain(|dev| dev.id.as_ref() != Some(&id));
1456         }
1457 
1458         // Remove if fs device
1459         if let Some(fs) = config.fs.as_mut() {
1460             fs.retain(|dev| dev.id.as_ref() != Some(&id));
1461         }
1462 
1463         // Remove if net device
1464         if let Some(net) = config.net.as_mut() {
1465             net.retain(|dev| dev.id.as_ref() != Some(&id));
1466         }
1467 
1468         // Remove if pmem device
1469         if let Some(pmem) = config.pmem.as_mut() {
1470             pmem.retain(|dev| dev.id.as_ref() != Some(&id));
1471         }
1472 
1473         // Remove if vDPA device
1474         if let Some(vdpa) = config.vdpa.as_mut() {
1475             vdpa.retain(|dev| dev.id.as_ref() != Some(&id));
1476         }
1477 
1478         // Remove if vsock device
1479         if let Some(vsock) = config.vsock.as_ref() {
1480             if vsock.id.as_ref() == Some(&id) {
1481                 config.vsock = None;
1482             }
1483         }
1484 
1485         self.device_manager
1486             .lock()
1487             .unwrap()
1488             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1489             .map_err(Error::DeviceManager)?;
1490         Ok(())
1491     }
1492 
1493     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1494         let pci_device_info = self
1495             .device_manager
1496             .lock()
1497             .unwrap()
1498             .add_disk(&mut disk_cfg)
1499             .map_err(Error::DeviceManager)?;
1500 
1501         // Update VmConfig by adding the new device. This is important to
1502         // ensure the device will be created in case of a reboot.
1503         {
1504             let mut config = self.config.lock().unwrap();
1505             add_to_config(&mut config.disks, disk_cfg);
1506         }
1507 
1508         self.device_manager
1509             .lock()
1510             .unwrap()
1511             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1512             .map_err(Error::DeviceManager)?;
1513 
1514         Ok(pci_device_info)
1515     }
1516 
1517     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1518         let pci_device_info = self
1519             .device_manager
1520             .lock()
1521             .unwrap()
1522             .add_fs(&mut fs_cfg)
1523             .map_err(Error::DeviceManager)?;
1524 
1525         // Update VmConfig by adding the new device. This is important to
1526         // ensure the device will be created in case of a reboot.
1527         {
1528             let mut config = self.config.lock().unwrap();
1529             add_to_config(&mut config.fs, fs_cfg);
1530         }
1531 
1532         self.device_manager
1533             .lock()
1534             .unwrap()
1535             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1536             .map_err(Error::DeviceManager)?;
1537 
1538         Ok(pci_device_info)
1539     }
1540 
1541     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1542         let pci_device_info = self
1543             .device_manager
1544             .lock()
1545             .unwrap()
1546             .add_pmem(&mut pmem_cfg)
1547             .map_err(Error::DeviceManager)?;
1548 
1549         // Update VmConfig by adding the new device. This is important to
1550         // ensure the device will be created in case of a reboot.
1551         {
1552             let mut config = self.config.lock().unwrap();
1553             add_to_config(&mut config.pmem, pmem_cfg);
1554         }
1555 
1556         self.device_manager
1557             .lock()
1558             .unwrap()
1559             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1560             .map_err(Error::DeviceManager)?;
1561 
1562         Ok(pci_device_info)
1563     }
1564 
1565     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1566         let pci_device_info = self
1567             .device_manager
1568             .lock()
1569             .unwrap()
1570             .add_net(&mut net_cfg)
1571             .map_err(Error::DeviceManager)?;
1572 
1573         // Update VmConfig by adding the new device. This is important to
1574         // ensure the device will be created in case of a reboot.
1575         {
1576             let mut config = self.config.lock().unwrap();
1577             add_to_config(&mut config.net, net_cfg);
1578         }
1579 
1580         self.device_manager
1581             .lock()
1582             .unwrap()
1583             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1584             .map_err(Error::DeviceManager)?;
1585 
1586         Ok(pci_device_info)
1587     }
1588 
1589     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1590         let pci_device_info = self
1591             .device_manager
1592             .lock()
1593             .unwrap()
1594             .add_vdpa(&mut vdpa_cfg)
1595             .map_err(Error::DeviceManager)?;
1596 
1597         // Update VmConfig by adding the new device. This is important to
1598         // ensure the device will be created in case of a reboot.
1599         {
1600             let mut config = self.config.lock().unwrap();
1601             add_to_config(&mut config.vdpa, vdpa_cfg);
1602         }
1603 
1604         self.device_manager
1605             .lock()
1606             .unwrap()
1607             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1608             .map_err(Error::DeviceManager)?;
1609 
1610         Ok(pci_device_info)
1611     }
1612 
1613     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1614         let pci_device_info = self
1615             .device_manager
1616             .lock()
1617             .unwrap()
1618             .add_vsock(&mut vsock_cfg)
1619             .map_err(Error::DeviceManager)?;
1620 
1621         // Update VmConfig by adding the new device. This is important to
1622         // ensure the device will be created in case of a reboot.
1623         {
1624             let mut config = self.config.lock().unwrap();
1625             config.vsock = Some(vsock_cfg);
1626         }
1627 
1628         self.device_manager
1629             .lock()
1630             .unwrap()
1631             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1632             .map_err(Error::DeviceManager)?;
1633 
1634         Ok(pci_device_info)
1635     }
1636 
1637     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1638         Ok(self.device_manager.lock().unwrap().counters())
1639     }
1640 
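         // Runs on the dedicated signal handling thread: unblocks the signals the
         // VM handles in this thread, then loops over incoming signals and updates
         // the console size on every SIGWINCH.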
1641     fn signal_handler(mut signals: Signals, console_input_clone: Arc<Console>) {
1642         for sig in &Vm::HANDLED_SIGNALS {
1643             unblock_signal(*sig).unwrap();
1644         }
1645 
1646         for signal in signals.forever() {
1647             if signal == SIGWINCH {
1648                 console_input_clone.update_console_size();
1649             }
1650         }
1651     }
1652 
1653     #[cfg(feature = "tdx")]
1654     fn init_tdx(&mut self) -> Result<()> {
1655         let cpuid = self.cpu_manager.lock().unwrap().common_cpuid();
1656         let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32;
1657         self.vm
1658             .tdx_init(&cpuid, max_vcpus)
1659             .map_err(Error::InitializeTdxVm)?;
1660         Ok(())
1661     }
1662 
1663     #[cfg(feature = "tdx")]
1664     fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> {
1665         use arch::x86_64::tdx::*;
1666 
1667         let firmware_path = self
1668             .config
1669             .lock()
1670             .unwrap()
1671             .payload
1672             .as_ref()
1673             .unwrap()
1674             .firmware
1675             .clone()
1676             .ok_or(Error::TdxFirmwareMissing)?;
1677         // The TDVF file contains a table of sections as well as code
1678         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1679 
1680         // Parse the table of TDVF sections from the firmware file
1681         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1682     }
1683 
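         // Builds the list of memory resources to be described in the TD HOB as
         // (start, size, is_ram) tuples. `sorted_sections` is expected to be sorted
         // by descending address so that pop() yields the lowest section first;
         // RAM entries are emitted for the gaps between TDVF sections, and any
         // remaining sections beyond the end of guest RAM are appended as non-RAM
         // entries. For instance, with guest RAM [0, 0x1000_0000) and sections at
         // 0x1000 (size 0x4000) and 0xc000 (size 0x1000), the result is:
         //   (0, 0x1000, true), (0x1000, 0x4000, false), (0x5000, 0x7000, true),
         //   (0xc000, 0x1000, false), (0xd000, 0x0fff_3000, true)
         // (see test_hob_memory_resources below).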
1684     #[cfg(feature = "tdx")]
1685     fn hob_memory_resources(
1686         mut sorted_sections: Vec<TdvfSection>,
1687         guest_memory: &GuestMemoryMmap,
1688     ) -> Vec<(u64, u64, bool)> {
1689         let mut list = Vec::new();
1690 
1691         let mut current_section = sorted_sections.pop();
1692 
1693         // RAM regions interleaved with TDVF sections
1694         let mut next_start_addr = 0;
1695         for region in guest_memory.iter() {
1696             let region_start = region.start_addr().0;
1697             let region_end = region.last_addr().0;
1698             if region_start > next_start_addr {
1699                 next_start_addr = region_start;
1700             }
1701 
1702             loop {
1703                 let (start, size, ram) = if let Some(section) = &current_section {
1704                     if section.address <= next_start_addr {
1705                         (section.address, section.size, false)
1706                     } else {
1707                         let last_addr = std::cmp::min(section.address - 1, region_end);
1708                         (next_start_addr, last_addr - next_start_addr + 1, true)
1709                     }
1710                 } else {
1711                     (next_start_addr, region_end - next_start_addr + 1, true)
1712                 };
1713 
1714                 list.push((start, size, ram));
1715 
1716                 if !ram {
1717                     current_section = sorted_sections.pop();
1718                 }
1719 
1720                 next_start_addr = start + size;
1721 
1722                 if region_start > next_start_addr {
1723                     next_start_addr = region_start;
1724                 }
1725 
1726                 if next_start_addr > region_end {
1727                     break;
1728                 }
1729             }
1730         }
1731 
1732         // Once all the interleaved sections have been processed, let's simply
1733         // pull the remaining ones.
1734         if let Some(section) = current_section {
1735             list.push((section.address, section.size, false));
1736         }
1737         while let Some(section) = sorted_sections.pop() {
1738             list.push((section.address, section.size, false));
1739         }
1740 
1741         list
1742     }
1743 
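         // Copies the TDVF sections into guest memory and generates the TD HOB:
         // RAM is allocated for any section that falls outside the existing guest
         // RAM, the BFV/CFV (and optional payload and payload parameters) are
         // copied from the firmware file, and the HOB is filled with memory
         // resources, MMIO ranges, ACPI tables and payload information. Returns
         // the guest address of the TdHob section, if present.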
1744     #[cfg(feature = "tdx")]
1745     fn populate_tdx_sections(&mut self, sections: &[TdvfSection]) -> Result<Option<u64>> {
1746         use arch::x86_64::tdx::*;
1747         // Get the memory end *before* we start adding TDVF ram regions
1748         let boot_guest_memory = self
1749             .memory_manager
1750             .lock()
1751             .as_ref()
1752             .unwrap()
1753             .boot_guest_memory();
1754         for section in sections {
1755             // No need to allocate if the section falls within guest RAM ranges
1756             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1757                 info!(
1758                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1759                     section
1760                 );
1761                 continue;
1762             }
1763 
1764             info!("Allocating TDVF Section: {:x?}", section);
1765             self.memory_manager
1766                 .lock()
1767                 .unwrap()
1768                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1769                 .map_err(Error::AllocatingTdvfMemory)?;
1770         }
1771 
1772         // The TDVF file contains a table of sections as well as code
1773         let firmware_path = self
1774             .config
1775             .lock()
1776             .unwrap()
1777             .payload
1778             .as_ref()
1779             .unwrap()
1780             .firmware
1781             .clone()
1782             .ok_or(Error::TdxFirmwareMissing)?;
1783         let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1784 
1785         // The guest memory now has all the required regions so it
1786         // is safe to copy from the TDVF file into it.
1787         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1788         let mem = guest_memory.memory();
1789         let mut payload_info = None;
1790         let mut hob_offset = None;
1791         for section in sections {
1792             info!("Populating TDVF Section: {:x?}", section);
1793             match section.r#type {
1794                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1795                     info!("Copying section to guest memory");
1796                     firmware_file
1797                         .seek(SeekFrom::Start(section.data_offset as u64))
1798                         .map_err(Error::LoadTdvf)?;
1799                     mem.read_from(
1800                         GuestAddress(section.address),
1801                         &mut firmware_file,
1802                         section.data_size as usize,
1803                     )
1804                     .unwrap();
1805                 }
1806                 TdvfSectionType::TdHob => {
1807                     hob_offset = Some(section.address);
1808                 }
1809                 TdvfSectionType::Payload => {
1810                     info!("Copying payload to guest memory");
1811                     if let Some(payload_file) = self.kernel.as_mut() {
1812                         let payload_size = payload_file
1813                             .seek(SeekFrom::End(0))
1814                             .map_err(Error::LoadPayload)?;
1815 
1816                         payload_file
1817                             .seek(SeekFrom::Start(0x1f1))
1818                             .map_err(Error::LoadPayload)?;
1819 
1820                         let mut payload_header = linux_loader::bootparam::setup_header::default();
1821                         payload_header
1822                             .as_bytes()
1823                             .read_from(
1824                                 0,
1825                                 payload_file,
1826                                 mem::size_of::<linux_loader::bootparam::setup_header>(),
1827                             )
1828                             .unwrap();
1829 
1830                         if payload_header.header != 0x5372_6448 {
1831                             return Err(Error::InvalidPayloadType);
1832                         }
1833 
1834                         if (payload_header.version < 0x0200)
1835                             || ((payload_header.loadflags & 0x1) == 0x0)
1836                         {
1837                             return Err(Error::InvalidPayloadType);
1838                         }
1839 
1840                         payload_file
1841                             .seek(SeekFrom::Start(0))
1842                             .map_err(Error::LoadPayload)?;
1843                         mem.read_from(
1844                             GuestAddress(section.address),
1845                             payload_file,
1846                             payload_size as usize,
1847                         )
1848                         .unwrap();
1849 
1850                         // Create the payload info that will be inserted into
1851                         // the HOB.
1852                         payload_info = Some(PayloadInfo {
1853                             image_type: PayloadImageType::BzImage,
1854                             entry_point: section.address,
1855                         });
1856                     }
1857                 }
1858                 TdvfSectionType::PayloadParam => {
1859                     info!("Copying payload parameters to guest memory");
1860                     let cmdline = Self::generate_cmdline(
1861                         self.config.lock().unwrap().payload.as_ref().unwrap(),
1862                     )?;
1863                     mem.write_slice(
1864                         cmdline.as_cstring().unwrap().as_bytes_with_nul(),
1865                         GuestAddress(section.address),
1866                     )
1867                     .unwrap();
1868                 }
1869                 _ => {}
1870             }
1871         }
1872 
1873         // Generate HOB
1874         let mut hob = TdHob::start(hob_offset.unwrap());
1875 
1876         let mut sorted_sections = sections.to_vec();
1877         sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));
1878 
1879         sorted_sections.sort_by_key(|section| section.address);
1880         sorted_sections.reverse();
1881 
1882         for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
1883             hob.add_memory_resource(&mem, start, size, ram)
1884                 .map_err(Error::PopulateHob)?;
1885         }
1886 
1887         // MMIO regions
1888         hob.add_mmio_resource(
1889             &mem,
1890             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1891             arch::layout::APIC_START.raw_value()
1892                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1893         )
1894         .map_err(Error::PopulateHob)?;
1895         let start_of_device_area = self
1896             .memory_manager
1897             .lock()
1898             .unwrap()
1899             .start_of_device_area()
1900             .raw_value();
1901         let end_of_device_area = self
1902             .memory_manager
1903             .lock()
1904             .unwrap()
1905             .end_of_device_area()
1906             .raw_value();
1907         hob.add_mmio_resource(
1908             &mem,
1909             start_of_device_area,
1910             end_of_device_area - start_of_device_area,
1911         )
1912         .map_err(Error::PopulateHob)?;
1913 
1914         // Loop over the ACPI tables and copy them to the HOB.
1915 
1916         for acpi_table in crate::acpi::create_acpi_tables_tdx(
1917             &self.device_manager,
1918             &self.cpu_manager,
1919             &self.memory_manager,
1920             &self.numa_nodes,
1921         ) {
1922             hob.add_acpi_table(&mem, acpi_table.as_slice())
1923                 .map_err(Error::PopulateHob)?;
1924         }
1925 
1926         // If a payload info has been created, let's insert it into the HOB.
1927         if let Some(payload_info) = payload_info {
1928             hob.add_payload(&mem, payload_info)
1929                 .map_err(Error::PopulateHob)?;
1930         }
1931 
1932         hob.finish(&mem).map_err(Error::PopulateHob)?;
1933 
1934         Ok(hob_offset)
1935     }
1936 
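         // Registers every TDVF section with the hypervisor, passing along whether
         // the section's contents should be extended into the TD measurement
         // (TDVF_SECTION_ATTRIBUTES_EXTENDMR).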
1937     #[cfg(feature = "tdx")]
1938     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1939         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1940         let mem = guest_memory.memory();
1941 
1942         for section in sections {
1943             self.vm
1944                 .tdx_init_memory_region(
1945                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
1946                     section.address,
1947                     section.size,
1948                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
1949                     section.attributes == 1,
1950                 )
1951                 .map_err(Error::InitializeTdxMemoryRegion)?;
1952         }
1953 
1954         Ok(())
1955     }
1956 
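         // Spawns the "vm_signal_handler" thread with its own seccomp filter
         // applied. Any failure inside the thread (filter application error or a
         // panic) is reported by writing to the exit event fd.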
1957     fn setup_signal_handler(&mut self) -> Result<()> {
1958         let console = self.device_manager.lock().unwrap().console().clone();
1959         let signals = Signals::new(Vm::HANDLED_SIGNALS);
1960         match signals {
1961             Ok(signals) => {
1962                 self.signals = Some(signals.handle());
1963                 let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?;
1964                 let signal_handler_seccomp_filter = get_seccomp_filter(
1965                     &self.seccomp_action,
1966                     Thread::SignalHandler,
1967                     self.hypervisor.hypervisor_type(),
1968                 )
1969                 .map_err(Error::CreateSeccompFilter)?;
1970                 self.threads.push(
1971                     thread::Builder::new()
1972                         .name("vm_signal_handler".to_string())
1973                         .spawn(move || {
1974                             if !signal_handler_seccomp_filter.is_empty() {
1975                                 if let Err(e) = apply_filter(&signal_handler_seccomp_filter)
1976                                     .map_err(Error::ApplySeccompFilter)
1977                                 {
1978                                     error!("Error applying seccomp filter: {:?}", e);
1979                                     exit_evt.write(1).ok();
1980                                     return;
1981                                 }
1982                             }
1983                             std::panic::catch_unwind(AssertUnwindSafe(|| {
1984                                 Vm::signal_handler(signals, console);
1985                             }))
1986                             .map_err(|_| {
1987                                 error!("signal_handler thread panicked");
1988                                 exit_evt.write(1).ok()
1989                             })
1990                             .ok();
1991                         })
1992                         .map_err(Error::SignalHandlerSpawn)?,
1993                 );
1994             }
1995             Err(e) => error!("Failed to register signal handlers: {}", e),
1996         }
1997         Ok(())
1998     }
1999 
2000     fn setup_tty(&self) -> Result<()> {
2001         if self.on_tty {
2002             io::stdin()
2003                 .lock()
2004                 .set_raw_mode()
2005                 .map_err(Error::SetTerminalRaw)?;
2006         }
2007 
2008         Ok(())
2009     }
2010 
2011     // Creates the ACPI tables.
2012     // When TDX is enabled this is a no-op since the tables are created
2013     // and passed to the guest when the HOB is populated.
2014 
2015     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2016         #[cfg(feature = "tdx")]
2017         if self.config.lock().unwrap().is_tdx_enabled() {
2018             return None;
2019         }
2020         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2021         let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2022         let rsdp_addr = crate::acpi::create_acpi_tables(
2023             &mem,
2024             &self.device_manager,
2025             &self.cpu_manager,
2026             &self.memory_manager,
2027             &self.numa_nodes,
2028             tpm_enabled,
2029         );
2030         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2031 
2032         Some(rsdp_addr)
2033     }
2034 
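         // Retrieves the kernel entry point by joining the asynchronous payload
         // loading thread, if one was started. Returns Ok(None) when no payload
         // loading thread exists.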
2035     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2036         trace_scoped!("entry_point");
2037 
2038         self.load_payload_handle
2039             .take()
2040             .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2041             .transpose()
2042     }
2043 
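         // Boots the VM (or resumes it when it is currently paused). The high
         // level sequence is: create the ACPI tables, install the signal handler,
         // wait for the payload to be loaded, perform the TDX specific
         // initialization when enabled, configure the vCPUs and the system, then
         // start the boot vCPUs (or hold them at a breakpoint when stop_on_boot
         // is set).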
2044     pub fn boot(&mut self) -> Result<()> {
2045         trace_scoped!("Vm::boot");
2046         info!("Booting VM");
2047         event!("vm", "booting");
2048         let current_state = self.get_state()?;
2049         if current_state == VmState::Paused {
2050             return self.resume().map_err(Error::Resume);
2051         }
2052 
2053         let new_state = if self.stop_on_boot {
2054             VmState::BreakPoint
2055         } else {
2056             VmState::Running
2057         };
2058         current_state.valid_transition(new_state)?;
2059 
2060         // Do this early to parallelise with loading the kernel
2061         #[cfg(target_arch = "x86_64")]
2062         let rsdp_addr = self.create_acpi_tables();
2063 
2064         self.setup_signal_handler()?;
2065         self.setup_tty()?;
2066 
2067         // Load the kernel synchronously, or wait for the asynchronous load
2068         // to finish.
2069         let entry_point = self.entry_point()?;
2070 
2071         #[cfg(feature = "tdx")]
2072         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2073 
2074         // The initial TDX configuration must be done before the vCPUs are
2075         // created
2076         #[cfg(feature = "tdx")]
2077         if tdx_enabled {
2078             self.init_tdx()?;
2079         }
2080 
2081         // Configure the vcpus that have been created
2082         let vcpus = self.cpu_manager.lock().unwrap().vcpus();
2083         for vcpu in vcpus {
2084             self.cpu_manager
2085                 .lock()
2086                 .unwrap()
2087                 .configure_vcpu(vcpu, entry_point, None)
2088                 .map_err(Error::CpuManager)?;
2089         }
2090 
2091         #[cfg(feature = "tdx")]
2092         let sections = if tdx_enabled {
2093             self.extract_tdvf_sections()?
2094         } else {
2095             Vec::new()
2096         };
2097 
2098         // Configuring the TDX regions requires that the vCPUs are created.
2099         #[cfg(feature = "tdx")]
2100         let hob_address = if tdx_enabled {
2101             // TDX sections are written to memory.
2102             self.populate_tdx_sections(&sections)?
2103         } else {
2104             None
2105         };
2106 
2107         // On aarch64 the ACPI tables depend on the vCPU MPIDR values, which are
2108         // only available after the vCPUs have been configured
2109         #[cfg(target_arch = "aarch64")]
2110         let rsdp_addr = self.create_acpi_tables();
2111 
2112         // Configure shared state based on loaded kernel
2113         entry_point
2114             .map(|_| {
2115                 // Safe to unwrap rsdp_addr as we know it can't be None when
2116                 // the entry_point is Some.
2117                 self.configure_system(rsdp_addr.unwrap())
2118             })
2119             .transpose()?;
2120 
2121         #[cfg(feature = "tdx")]
2122         if let Some(hob_address) = hob_address {
2123             // With the HOB address extracted the vCPUs can have
2124             // their TDX state configured.
2125             self.cpu_manager
2126                 .lock()
2127                 .unwrap()
2128                 .initialize_tdx(hob_address)
2129                 .map_err(Error::CpuManager)?;
2130             // Let the hypervisor know which memory ranges are shared with the
2131             // guest. This prevents the guest from ignoring/discarding memory
2132             // regions provided by the host.
2133             self.init_tdx_memory(&sections)?;
2134             // With TDX memory and CPU state configured TDX setup is complete
2135             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
2136         }
2137 
2138         self.cpu_manager
2139             .lock()
2140             .unwrap()
2141             .start_boot_vcpus(new_state == VmState::BreakPoint)
2142             .map_err(Error::CpuManager)?;
2143 
2144         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
2145         *state = new_state;
2146         event!("vm", "booted");
2147         Ok(())
2148     }
2149 
2150     /// Gets a thread-safe reference counted pointer to the VM configuration.
2151     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2152         Arc::clone(&self.config)
2153     }
2154 
2155     /// Get the VM state. Returns an error if the state is poisoned.
2156     pub fn get_state(&self) -> Result<VmState> {
2157         self.state
2158             .try_read()
2159             .map_err(|_| Error::PoisonedState)
2160             .map(|state| *state)
2161     }
2162 
2163     /// Load saved clock from snapshot
2164     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2165     pub fn load_clock_from_snapshot(
2166         &mut self,
2167         snapshot: &Snapshot,
2168     ) -> Result<Option<hypervisor::ClockData>> {
2169         use crate::migration::get_vm_snapshot;
2170         let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
2171         self.saved_clock = vm_snapshot.clock;
2172         Ok(self.saved_clock)
2173     }
2174 
2175     #[cfg(target_arch = "aarch64")]
2176     /// Add the vGIC section to the VM snapshot.
2177     fn add_vgic_snapshot_section(
2178         &self,
2179         vm_snapshot: &mut Snapshot,
2180     ) -> std::result::Result<(), MigratableError> {
2181         let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
2182         self.device_manager
2183             .lock()
2184             .unwrap()
2185             .get_interrupt_controller()
2186             .unwrap()
2187             .lock()
2188             .unwrap()
2189             .set_gicr_typers(&saved_vcpu_states);
2190 
2191         vm_snapshot.add_snapshot(
2192             self.device_manager
2193                 .lock()
2194                 .unwrap()
2195                 .get_interrupt_controller()
2196                 .unwrap()
2197                 .lock()
2198                 .unwrap()
2199                 .snapshot()?,
2200         );
2201 
2202         Ok(())
2203     }
2204 
2205     #[cfg(target_arch = "aarch64")]
2206     /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing.
2207     fn restore_vgic_and_enable_interrupt(
2208         &self,
2209         vm_snapshot: &Snapshot,
2210     ) -> std::result::Result<(), MigratableError> {
2211         let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
2212 
2213         // The PMU interrupt is a PPI, so 16 must be added to get the real IRQ number.
2214         self.cpu_manager
2215             .lock()
2216             .unwrap()
2217             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
2218             .map_err(|e| MigratableError::Restore(anyhow!("Error init PMU: {:?}", e)))?;
2219 
2220         // Here we prepare the GICR_TYPER registers from the restored vCPU states.
2221         self.device_manager
2222             .lock()
2223             .unwrap()
2224             .get_interrupt_controller()
2225             .unwrap()
2226             .lock()
2227             .unwrap()
2228             .set_gicr_typers(&saved_vcpu_states);
2229 
2230         // Restore GIC states.
2231         if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) {
2232             self.device_manager
2233                 .lock()
2234                 .unwrap()
2235                 .get_interrupt_controller()
2236                 .unwrap()
2237                 .lock()
2238                 .unwrap()
2239                 .restore(*gicv3_its_snapshot.clone())?;
2240         } else {
2241             return Err(MigratableError::Restore(anyhow!(
2242                 "Missing GicV3Its snapshot"
2243             )));
2244         }
2245 
2246         Ok(())
2247     }
2248 
2249     /// Gets the actual size of the balloon.
2250     pub fn balloon_size(&self) -> u64 {
2251         self.device_manager.lock().unwrap().balloon_size()
2252     }
2253 
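         // Sends the file descriptors backing the guest memory slots to the
         // migration destination: for each slot a memory fd request is written to
         // the socket, the (slot, fd) pair is sent, and the destination's response
         // is checked before moving on to the next slot.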
2254     pub fn send_memory_fds(
2255         &mut self,
2256         socket: &mut UnixStream,
2257     ) -> std::result::Result<(), MigratableError> {
2258         for (slot, fd) in self
2259             .memory_manager
2260             .lock()
2261             .unwrap()
2262             .memory_slot_fds()
2263             .drain()
2264         {
2265             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2266                 .write_to(socket)
2267                 .map_err(|e| {
2268                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2269                 })?;
2270             socket
2271                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2272                 .map_err(|e| {
2273                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2274                 })?;
2275 
2276             let res = Response::read_from(socket)?;
2277             if res.status() != Status::Ok {
2278                 warn!("Error during memory fd migration");
2279                 Request::abandon().write_to(socket)?;
2280                 Response::read_from(socket).ok();
2281                 return Err(MigratableError::MigrateSend(anyhow!(
2282                     "Error during memory fd migration"
2283                 )));
2284             }
2285         }
2286 
2287         Ok(())
2288     }
2289 
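         // Streams the content of the given guest memory ranges to the provided
         // writer (typically the migration socket), retrying on partial writes
         // until each range has been fully transferred.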
2290     pub fn send_memory_regions<F>(
2291         &mut self,
2292         ranges: &MemoryRangeTable,
2293         fd: &mut F,
2294     ) -> std::result::Result<(), MigratableError>
2295     where
2296         F: Write,
2297     {
2298         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2299         let mem = guest_memory.memory();
2300 
2301         for range in ranges.regions() {
2302             let mut offset: u64 = 0;
2303             // Here we manually handle the retry in case we can't write the
2304             // whole region at once, because we can't use write_all_to() from
2305             // vm-memory::GuestMemory as its implementation does not follow
2306             // the correct behavior. For more info about this issue see:
2307             // https://github.com/rust-vmm/vm-memory/issues/174
2308             loop {
2309                 let bytes_written = mem
2310                     .write_to(
2311                         GuestAddress(range.gpa + offset),
2312                         fd,
2313                         (range.length - offset) as usize,
2314                     )
2315                     .map_err(|e| {
2316                         MigratableError::MigrateSend(anyhow!(
2317                             "Error transferring memory to socket: {}",
2318                             e
2319                         ))
2320                     })?;
2321                 offset += bytes_written as u64;
2322 
2323                 if offset == range.length {
2324                     break;
2325                 }
2326             }
2327         }
2328 
2329         Ok(())
2330     }
2331 
2332     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2333         self.memory_manager
2334             .lock()
2335             .unwrap()
2336             .memory_range_table(false)
2337     }
2338 
2339     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2340         self.device_manager.lock().unwrap().device_tree()
2341     }
2342 
2343     pub fn activate_virtio_devices(&self) -> Result<()> {
2344         self.device_manager
2345             .lock()
2346             .unwrap()
2347             .activate_virtio_devices()
2348             .map_err(Error::ActivateVirtioDevices)
2349     }
2350 
2351     #[cfg(target_arch = "x86_64")]
2352     pub fn power_button(&self) -> Result<()> {
2353         self
2354             .device_manager
2355             .lock()
2356             .unwrap()
2357             .notify_power_button()
2358             .map_err(Error::PowerButton)
2359     }
2360 
2361     #[cfg(target_arch = "aarch64")]
2362     pub fn power_button(&self) -> Result<()> {
2363         self.device_manager
2364             .lock()
2365             .unwrap()
2366             .notify_power_button()
2367             .map_err(Error::PowerButton)
2368     }
2369 
2370     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2371         self.memory_manager.lock().unwrap().snapshot_data()
2372     }
2373 
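         // Dispatches a GDB request to the corresponding Debuggable operation for
         // the given vCPU and wraps the result into a GdbResponsePayload.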
2374     #[cfg(feature = "guest_debug")]
2375     pub fn debug_request(
2376         &mut self,
2377         gdb_request: &GdbRequestPayload,
2378         cpu_id: usize,
2379     ) -> Result<GdbResponsePayload> {
2380         use GdbRequestPayload::*;
2381         match gdb_request {
2382             SetSingleStep(single_step) => {
2383                 self.set_guest_debug(cpu_id, &[], *single_step)
2384                     .map_err(Error::Debug)?;
2385             }
2386             SetHwBreakPoint(addrs) => {
2387                 self.set_guest_debug(cpu_id, addrs, false)
2388                     .map_err(Error::Debug)?;
2389             }
2390             Pause => {
2391                 self.debug_pause().map_err(Error::Debug)?;
2392             }
2393             Resume => {
2394                 self.debug_resume().map_err(Error::Debug)?;
2395             }
2396             ReadRegs => {
2397                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2398                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2399             }
2400             WriteRegs(regs) => {
2401                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2402             }
2403             ReadMem(vaddr, len) => {
2404                 let mem = self.read_mem(cpu_id, *vaddr, *len).map_err(Error::Debug)?;
2405                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2406             }
2407             WriteMem(vaddr, data) => {
2408                 self.write_mem(cpu_id, vaddr, data).map_err(Error::Debug)?;
2409             }
2410             ActiveVcpus => {
2411                 let active_vcpus = self.active_vcpus();
2412                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2413             }
2414         }
2415         Ok(GdbResponsePayload::CommandComplete)
2416     }
2417 
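         // Prepares the state needed to write a guest coredump: computes the ELF
         // note size for all boot vCPUs, derives the number of program headers
         // from the guest RAM mappings, creates the output file and records where
         // the memory contents will start within it.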
2418     #[cfg(feature = "guest_debug")]
2419     fn get_dump_state(
2420         &mut self,
2421         destination_url: &str,
2422     ) -> std::result::Result<DumpState, GuestDebuggableError> {
2423         let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2424         let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2425         let mut elf_phdr_num = 1;
2426         let elf_sh_info = 0;
2427         let coredump_file_path = url_to_file(destination_url)?;
2428         let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2429 
2430         if mapping_num < UINT16_MAX - 2 {
2431             elf_phdr_num += mapping_num as u16;
2432         } else {
2433             panic!("mapping num beyond 65535 not supported");
2434         }
2435         let coredump_file = OpenOptions::new()
2436             .read(true)
2437             .write(true)
2438             .create_new(true)
2439             .open(coredump_file_path)
2440             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2441 
2442         let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2443         let mem_data = self
2444             .memory_manager
2445             .lock()
2446             .unwrap()
2447             .coredump_memory_regions(mem_offset);
2448 
2449         Ok(DumpState {
2450             elf_note_size,
2451             elf_phdr_num,
2452             elf_sh_info,
2453             mem_offset,
2454             mem_info: Some(mem_data),
2455             file: Some(coredump_file),
2456         })
2457     }
2458 
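         // The memory contents of the coredump start right after the ELF header,
         // the notes and the program header table:
         //   mem_offset = sizeof(Elf64_Ehdr) + note_size + phdr_num * sizeof(Elf64_Phdr)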
2459     #[cfg(feature = "guest_debug")]
2460     fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2461         size_of::<elf::Elf64_Ehdr>() as u64
2462             + note_size as u64
2463             + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2464     }
2465 }
2466 
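     // Pausing a VM saves the KVM clock (on x86_64), activates any virtio devices
     // still pending activation, and then pauses the vCPUs and devices. Resuming
     // performs the inverse operations and restores the saved clock.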
2467 impl Pausable for Vm {
2468     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2469         event!("vm", "pausing");
2470         let mut state = self
2471             .state
2472             .try_write()
2473             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2474         let new_state = VmState::Paused;
2475 
2476         state
2477             .valid_transition(new_state)
2478             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2479 
2480         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2481         {
2482             let mut clock = self
2483                 .vm
2484                 .get_clock()
2485                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2486             clock.reset_flags();
2487             self.saved_clock = Some(clock);
2488         }
2489 
2490         // Before pausing the vCPUs, activate any pending virtio devices that
2491         // might need activation since the pause (or a migration it is part of) was started.
2492         self.activate_virtio_devices().map_err(|e| {
2493             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2494         })?;
2495 
2496         self.cpu_manager.lock().unwrap().pause()?;
2497         self.device_manager.lock().unwrap().pause()?;
2498 
2499         *state = new_state;
2500 
2501         event!("vm", "paused");
2502         Ok(())
2503     }
2504 
2505     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2506         event!("vm", "resuming");
2507         let mut state = self
2508             .state
2509             .try_write()
2510             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2511         let new_state = VmState::Running;
2512 
2513         state
2514             .valid_transition(new_state)
2515             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2516 
2517         self.cpu_manager.lock().unwrap().resume()?;
2518         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2519         {
2520             if let Some(clock) = &self.saved_clock {
2521                 self.vm.set_clock(clock).map_err(|e| {
2522                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2523                 })?;
2524             }
2525         }
2526         self.device_manager.lock().unwrap().resume()?;
2527 
2528         // And we're back to the Running state.
2529         *state = new_state;
2530         event!("vm", "resumed");
2531         Ok(())
2532     }
2533 }
2534 
2535 #[derive(Serialize, Deserialize)]
2536 pub struct VmSnapshot {
2537     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2538     pub clock: Option<hypervisor::ClockData>,
2539     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2540     pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
2541 }
2542 
2543 pub const VM_SNAPSHOT_ID: &str = "vm";
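     // A VM snapshot aggregates the CPU manager, memory manager, device manager
     // (and, on aarch64, vGIC) snapshots, plus a data section carrying the saved
     // clock and the common CPUID on x86_64/KVM.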
2544 impl Snapshottable for Vm {
2545     fn id(&self) -> String {
2546         VM_SNAPSHOT_ID.to_string()
2547     }
2548 
2549     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2550         event!("vm", "snapshotting");
2551 
2552         #[cfg(feature = "tdx")]
2553         let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2554 
2555         #[cfg(feature = "tdx")]
2556         {
2557             if tdx_enabled {
2558                 return Err(MigratableError::Snapshot(anyhow!(
2559                     "Snapshot not possible with TDX VM"
2560                 )));
2561             }
2562         }
2563 
2564         let current_state = self.get_state().unwrap();
2565         if current_state != VmState::Paused {
2566             return Err(MigratableError::Snapshot(anyhow!(
2567                 "Trying to snapshot while VM is running"
2568             )));
2569         }
2570 
2571         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2572         let common_cpuid = {
2573             let phys_bits = physical_bits(self.config.lock().unwrap().cpus.max_phys_bits);
2574             arch::generate_common_cpuid(
2575                 self.hypervisor.clone(),
2576                 None,
2577                 None,
2578                 phys_bits,
2579                 self.config.lock().unwrap().cpus.kvm_hyperv,
2580                 #[cfg(feature = "tdx")]
2581                 tdx_enabled,
2582             )
2583             .map_err(|e| {
2584                 MigratableError::Snapshot(anyhow!("Error generating common cpuid: {:?}", e))
2585             })?
2586         };
2587 
2588         let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID);
2589         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2590             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2591             clock: self.saved_clock,
2592             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2593             common_cpuid,
2594         })
2595         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2596 
2597         vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?);
2598         vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?);
2599 
2600         #[cfg(target_arch = "aarch64")]
2601         self.add_vgic_snapshot_section(&mut vm_snapshot)
2602             .map_err(|e| MigratableError::Snapshot(e.into()))?;
2603 
2604         vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?);
2605         vm_snapshot.add_data_section(SnapshotDataSection {
2606             id: format!("{}-section", VM_SNAPSHOT_ID),
2607             snapshot: vm_snapshot_data,
2608         });
2609 
2610         event!("vm", "snapshotted");
2611         Ok(vm_snapshot)
2612     }
2613 
2614     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
2615         event!("vm", "restoring");
2616 
2617         let current_state = self
2618             .get_state()
2619             .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?;
2620         let new_state = VmState::Paused;
2621         current_state.valid_transition(new_state).map_err(|e| {
2622             MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e))
2623         })?;
2624 
2625         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2626         self.load_clock_from_snapshot(&snapshot)
2627             .map_err(|e| MigratableError::Restore(anyhow!("Error restoring clock: {:?}", e)))?;
2628 
2629         if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
2630             self.device_manager
2631                 .lock()
2632                 .unwrap()
2633                 .restore(*device_manager_snapshot.clone())?;
2634         } else {
2635             return Err(MigratableError::Restore(anyhow!(
2636                 "Missing device manager snapshot"
2637             )));
2638         }
2639 
2640         if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) {
2641             self.cpu_manager
2642                 .lock()
2643                 .unwrap()
2644                 .restore(*cpu_manager_snapshot.clone())?;
2645         } else {
2646             return Err(MigratableError::Restore(anyhow!(
2647                 "Missing CPU manager snapshot"
2648             )));
2649         }
2650 
2651         #[cfg(target_arch = "aarch64")]
2652         self.restore_vgic_and_enable_interrupt(&snapshot)?;
2653 
2654         if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
2655             self.device_manager
2656                 .lock()
2657                 .unwrap()
2658                 .restore_devices(*device_manager_snapshot.clone())?;
2659         } else {
2660             return Err(MigratableError::Restore(anyhow!(
2661                 "Missing device manager snapshot"
2662             )));
2663         }
2664 
2665         // Now we can start all vCPUs from here.
2666         self.cpu_manager
2667             .lock()
2668             .unwrap()
2669             .start_restored_vcpus()
2670             .map_err(|e| {
2671                 MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e))
2672             })?;
2673 
2674         self.setup_signal_handler().map_err(|e| {
2675             MigratableError::Restore(anyhow!("Could not setup signal handler: {:#?}", e))
2676         })?;
2677         self.setup_tty()
2678             .map_err(|e| MigratableError::Restore(anyhow!("Could not setup tty: {:#?}", e)))?;
2679 
2680         let mut state = self
2681             .state
2682             .try_write()
2683             .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?;
2684         *state = new_state;
2685 
2686         event!("vm", "restored");
2687         Ok(())
2688     }
2689 }
2690 
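     // Sending a snapshot writes the VM configuration and the VM state as JSON
     // files (SNAPSHOT_CONFIG_FILE / SNAPSHOT_STATE_FILE) under the destination
     // path, then lets the memory manager transfer the guest RAM itself.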
2691 impl Transportable for Vm {
2692     fn send(
2693         &self,
2694         snapshot: &Snapshot,
2695         destination_url: &str,
2696     ) -> std::result::Result<(), MigratableError> {
2697         let mut snapshot_config_path = url_to_path(destination_url)?;
2698         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2699 
2700         // Create the snapshot config file
2701         let mut snapshot_config_file = OpenOptions::new()
2702             .read(true)
2703             .write(true)
2704             .create_new(true)
2705             .open(snapshot_config_path)
2706             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2707 
2708         // Serialize and write the snapshot config
2709         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2710             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2711 
2712         snapshot_config_file
2713             .write(vm_config.as_bytes())
2714             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2715 
2716         let mut snapshot_state_path = url_to_path(destination_url)?;
2717         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2718 
2719         // Create the snapshot state file
2720         let mut snapshot_state_file = OpenOptions::new()
2721             .read(true)
2722             .write(true)
2723             .create_new(true)
2724             .open(snapshot_state_path)
2725             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2726 
2727         // Serialize and write the snapshot state
2728         let vm_state =
2729             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2730 
2731         snapshot_state_file
2732             .write(&vm_state)
2733             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2734 
2735         // Tell the memory manager to also send/write its own snapshot.
2736         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2737             self.memory_manager
2738                 .lock()
2739                 .unwrap()
2740                 .send(&memory_manager_snapshot.clone(), destination_url)?;
2741         } else {
2742             return Err(MigratableError::MigrateSend(anyhow!(
2743                 "Missing memory manager snapshot"
2744             )));
2745         }
2746 
2747         Ok(())
2748     }
2749 }
2750 
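     // Dirty logging and migration start/completion are delegated to the memory
     // manager and the device manager; dirty_log() merges both of their dirty
     // range tables into a single MemoryRangeTable.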
2751 impl Migratable for Vm {
2752     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2753         self.memory_manager.lock().unwrap().start_dirty_log()?;
2754         self.device_manager.lock().unwrap().start_dirty_log()
2755     }
2756 
2757     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2758         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2759         self.device_manager.lock().unwrap().stop_dirty_log()
2760     }
2761 
2762     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2763         Ok(MemoryRangeTable::new_from_tables(vec![
2764             self.memory_manager.lock().unwrap().dirty_log()?,
2765             self.device_manager.lock().unwrap().dirty_log()?,
2766         ]))
2767     }
2768 
2769     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2770         self.memory_manager.lock().unwrap().start_migration()?;
2771         self.device_manager.lock().unwrap().start_migration()
2772     }
2773 
2774     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2775         self.memory_manager.lock().unwrap().complete_migration()?;
2776         self.device_manager.lock().unwrap().complete_migration()
2777     }
2778 }
2779 
2780 #[cfg(feature = "guest_debug")]
2781 impl Debuggable for Vm {
2782     fn set_guest_debug(
2783         &self,
2784         cpu_id: usize,
2785         addrs: &[GuestAddress],
2786         singlestep: bool,
2787     ) -> std::result::Result<(), DebuggableError> {
2788         self.cpu_manager
2789             .lock()
2790             .unwrap()
2791             .set_guest_debug(cpu_id, addrs, singlestep)
2792     }
2793 
2794     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2795         if *self.state.read().unwrap() == VmState::Running {
2796             self.pause().map_err(DebuggableError::Pause)?;
2797         }
2798 
2799         let mut state = self
2800             .state
2801             .try_write()
2802             .map_err(|_| DebuggableError::PoisonedState)?;
2803         *state = VmState::BreakPoint;
2804         Ok(())
2805     }
2806 
2807     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2808         if *self.state.read().unwrap() == VmState::BreakPoint {
2809             self.resume().map_err(DebuggableError::Pause)?;
2810         }
2811 
2812         Ok(())
2813     }
2814 
2815     fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2816         self.cpu_manager.lock().unwrap().read_regs(cpu_id)
2817     }
2818 
2819     fn write_regs(
2820         &self,
2821         cpu_id: usize,
2822         regs: &CoreRegs,
2823     ) -> std::result::Result<(), DebuggableError> {
2824         self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
2825     }
2826 
2827     fn read_mem(
2828         &self,
2829         cpu_id: usize,
2830         vaddr: GuestAddress,
2831         len: usize,
2832     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2833         self.cpu_manager
2834             .lock()
2835             .unwrap()
2836             .read_mem(cpu_id, vaddr, len)
2837     }
2838 
2839     fn write_mem(
2840         &self,
2841         cpu_id: usize,
2842         vaddr: &GuestAddress,
2843         data: &[u8],
2844     ) -> std::result::Result<(), DebuggableError> {
2845         self.cpu_manager
2846             .lock()
2847             .unwrap()
2848             .write_mem(cpu_id, vaddr, data)
2849     }
2850 
2851     fn active_vcpus(&self) -> usize {
2852         let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
2853         if active_vcpus > 0 {
2854             active_vcpus
2855         } else {
2856             // The VM is not booted yet. Report boot_vcpus() instead.
2857             self.cpu_manager.lock().unwrap().boot_vcpus() as usize
2858         }
2859     }
2860 }
2861 
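     // e_phnum in the ELF header is a 16 bit field, which bounds the number of
     // program headers (and therefore guest RAM mappings) a coredump can describe.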
2862 #[cfg(feature = "guest_debug")]
2863 pub const UINT16_MAX: u32 = 65535;
2864 
2865 #[cfg(feature = "guest_debug")]
2866 impl Elf64Writable for Vm {}
2867 
2868 #[cfg(feature = "guest_debug")]
2869 impl GuestDebuggable for Vm {
2870     fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
2871         event!("vm", "coredumping");
2872 
2873         #[cfg(feature = "tdx")]
2874         {
2875             if let Some(ref platform) = self.config.lock().unwrap().platform {
2876                 if platform.tdx {
2877                     return Err(GuestDebuggableError::Coredump(anyhow!(
2878                         "Coredump not possible with TDX VM"
2879                     )));
2880                 }
2881             }
2882         }
2883 
2884         let current_state = self.get_state().unwrap();
2885         if current_state != VmState::Paused {
2886             return Err(GuestDebuggableError::Coredump(anyhow!(
2887                 "Trying to coredump while VM is running"
2888             )));
2889         }
2890 
2891         let coredump_state = self.get_dump_state(destination_url)?;
2892 
2893         self.write_header(&coredump_state)?;
2894         self.write_note(&coredump_state)?;
2895         self.write_loads(&coredump_state)?;
2896 
2897         self.cpu_manager
2898             .lock()
2899             .unwrap()
2900             .cpu_write_elf64_note(&coredump_state)?;
2901         self.cpu_manager
2902             .lock()
2903             .unwrap()
2904             .cpu_write_vmm_note(&coredump_state)?;
2905 
2906         self.memory_manager
2907             .lock()
2908             .unwrap()
2909             .coredump_iterate_save_mem(&coredump_state)
2910     }
2911 }
2912 
2913 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2914 #[cfg(test)]
2915 mod tests {
2916     use super::*;
2917 
2918     fn test_vm_state_transitions(state: VmState) {
2919         match state {
2920             VmState::Created => {
2921                 // Check the transitions from Created
2922                 assert!(state.valid_transition(VmState::Created).is_err());
2923                 assert!(state.valid_transition(VmState::Running).is_ok());
2924                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2925                 assert!(state.valid_transition(VmState::Paused).is_ok());
2926                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2927             }
2928             VmState::Running => {
2929                 // Check the transitions from Running
2930                 assert!(state.valid_transition(VmState::Created).is_err());
2931                 assert!(state.valid_transition(VmState::Running).is_err());
2932                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2933                 assert!(state.valid_transition(VmState::Paused).is_ok());
2934                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2935             }
2936             VmState::Shutdown => {
2937                 // Check the transitions from Shutdown
2938                 assert!(state.valid_transition(VmState::Created).is_err());
2939                 assert!(state.valid_transition(VmState::Running).is_ok());
2940                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2941                 assert!(state.valid_transition(VmState::Paused).is_err());
2942                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2943             }
2944             VmState::Paused => {
2945                 // Check the transitions from Paused
2946                 assert!(state.valid_transition(VmState::Created).is_err());
2947                 assert!(state.valid_transition(VmState::Running).is_ok());
2948                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2949                 assert!(state.valid_transition(VmState::Paused).is_err());
2950                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2951             }
2952             VmState::BreakPoint => {
2953             // Check the transitions from BreakPoint
2954                 assert!(state.valid_transition(VmState::Created).is_ok());
2955                 assert!(state.valid_transition(VmState::Running).is_ok());
2956                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2957                 assert!(state.valid_transition(VmState::Paused).is_err());
2958                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2959             }
2960         }
2961     }
2962 
2963     #[test]
2964     fn test_vm_created_transitions() {
2965         test_vm_state_transitions(VmState::Created);
2966     }
2967 
2968     #[test]
2969     fn test_vm_running_transitions() {
2970         test_vm_state_transitions(VmState::Running);
2971     }
2972 
2973     #[test]
2974     fn test_vm_shutdown_transitions() {
2975         test_vm_state_transitions(VmState::Shutdown);
2976     }
2977 
2978     #[test]
2979     fn test_vm_paused_transitions() {
2980         test_vm_state_transitions(VmState::Paused);
2981     }
2982 
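    // Also exercise the BreakPoint arm of the helper above; the variant is
    // matched there unconditionally, so this test runs under the same cfg as
    // the other transition tests.
    #[test]
    fn test_vm_breakpoint_transitions() {
        test_vm_state_transitions(VmState::BreakPoint);
    }
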
2983     #[cfg(feature = "tdx")]
2984     #[test]
2985     fn test_hob_memory_resources() {
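        // hob_memory_resources() flattens the TDVF sections and the guest RAM
        // into (start, size, is_ram) tuples: `true` ranges are guest RAM and
        // `false` ranges are covered by a TDVF section, as the expected values
        // below illustrate.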
2986         // Case 1: Two TDVF sections in the middle of the RAM
2987         let sections = vec![
2988             TdvfSection {
2989                 address: 0xc000,
2990                 size: 0x1000,
2991                 ..Default::default()
2992             },
2993             TdvfSection {
2994                 address: 0x1000,
2995                 size: 0x4000,
2996                 ..Default::default()
2997             },
2998         ];
2999         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
3000         let expected = vec![
3001             (0, 0x1000, true),
3002             (0x1000, 0x4000, false),
3003             (0x5000, 0x7000, true),
3004             (0xc000, 0x1000, false),
3005             (0xd000, 0x0fff_3000, true),
3006         ];
3007         assert_eq!(
3008             expected,
3009             Vm::hob_memory_resources(
3010                 sections,
3011                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3012             )
3013         );
3014 
3015         // Case 2: Two TDVF sections with no conflict with the RAM
3016         let sections = vec![
3017             TdvfSection {
3018                 address: 0x1000_1000,
3019                 size: 0x1000,
3020                 ..Default::default()
3021             },
3022             TdvfSection {
3023                 address: 0,
3024                 size: 0x1000,
3025                 ..Default::default()
3026             },
3027         ];
3028         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3029         let expected = vec![
3030             (0, 0x1000, false),
3031             (0x1000, 0x1000_0000, true),
3032             (0x1000_1000, 0x1000, false),
3033         ];
3034         assert_eq!(
3035             expected,
3036             Vm::hob_memory_resources(
3037                 sections,
3038                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3039             )
3040         );
3041 
3042         // Case 3: Two TDVF sections with partial conflicts with the RAM
3043         let sections = vec![
3044             TdvfSection {
3045                 address: 0x1000_0000,
3046                 size: 0x2000,
3047                 ..Default::default()
3048             },
3049             TdvfSection {
3050                 address: 0,
3051                 size: 0x2000,
3052                 ..Default::default()
3053             },
3054         ];
3055         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3056         let expected = vec![
3057             (0, 0x2000, false),
3058             (0x2000, 0x0fff_e000, true),
3059             (0x1000_0000, 0x2000, false),
3060         ];
3061         assert_eq!(
3062             expected,
3063             Vm::hob_memory_resources(
3064                 sections,
3065                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3066             )
3067         );
3068 
3069         // Case 4: Two TDVF sections with no conflict before the RAM and two
3070         // more sections with no conflict after the RAM.
3071         let sections = vec![
3072             TdvfSection {
3073                 address: 0x2000_1000,
3074                 size: 0x1000,
3075                 ..Default::default()
3076             },
3077             TdvfSection {
3078                 address: 0x2000_0000,
3079                 size: 0x1000,
3080                 ..Default::default()
3081             },
3082             TdvfSection {
3083                 address: 0x1000,
3084                 size: 0x1000,
3085                 ..Default::default()
3086             },
3087             TdvfSection {
3088                 address: 0,
3089                 size: 0x1000,
3090                 ..Default::default()
3091             },
3092         ];
3093         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
3094         let expected = vec![
3095             (0, 0x1000, false),
3096             (0x1000, 0x1000, false),
3097             (0x4000, 0x1000_0000, true),
3098             (0x2000_0000, 0x1000, false),
3099             (0x2000_1000, 0x1000, false),
3100         ];
3101         assert_eq!(
3102             expected,
3103             Vm::hob_memory_resources(
3104                 sections,
3105                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3106             )
3107         );
3108 
3109         // Case 5: One TDVF section overriding the entire RAM
3110         let sections = vec![TdvfSection {
3111             address: 0,
3112             size: 0x2000_0000,
3113             ..Default::default()
3114         }];
3115         let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3116         let expected = vec![(0, 0x2000_0000, false)];
3117         assert_eq!(
3118             expected,
3119             Vm::hob_memory_resources(
3120                 sections,
3121                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3122             )
3123         );
3124 
3125         // Case 6: Two TDVF sections with no conflict with 2 RAM regions
3126         let sections = vec![
3127             TdvfSection {
3128                 address: 0x1000_2000,
3129                 size: 0x2000,
3130                 ..Default::default()
3131             },
3132             TdvfSection {
3133                 address: 0,
3134                 size: 0x2000,
3135                 ..Default::default()
3136             },
3137         ];
3138         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3139             (GuestAddress(0x2000), 0x1000_0000),
3140             (GuestAddress(0x1000_4000), 0x1000_0000),
3141         ];
3142         let expected = vec![
3143             (0, 0x2000, false),
3144             (0x2000, 0x1000_0000, true),
3145             (0x1000_2000, 0x2000, false),
3146             (0x1000_4000, 0x1000_0000, true),
3147         ];
3148         assert_eq!(
3149             expected,
3150             Vm::hob_memory_resources(
3151                 sections,
3152                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3153             )
3154         );
3155 
3156         // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
3157         let sections = vec![
3158             TdvfSection {
3159                 address: 0x1000_0000,
3160                 size: 0x4000,
3161                 ..Default::default()
3162             },
3163             TdvfSection {
3164                 address: 0,
3165                 size: 0x4000,
3166                 ..Default::default()
3167             },
3168         ];
3169         let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3170             (GuestAddress(0x1000), 0x1000_0000),
3171             (GuestAddress(0x1000_3000), 0x1000_0000),
3172         ];
3173         let expected = vec![
3174             (0, 0x4000, false),
3175             (0x4000, 0x0fff_c000, true),
3176             (0x1000_0000, 0x4000, false),
3177             (0x1000_4000, 0x0fff_f000, true),
3178         ];
3179         assert_eq!(
3180             expected,
3181             Vm::hob_memory_resources(
3182                 sections,
3183                 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3184             )
3185         );
3186     }
3187 }
3188 
3189 #[cfg(target_arch = "aarch64")]
3190 #[cfg(test)]
3191 mod tests {
3192     use super::*;
3193     use crate::GuestMemoryMmap;
3194     use arch::aarch64::fdt::create_fdt;
3195     use arch::aarch64::layout;
3196     use arch::{DeviceType, MmioDeviceInfo};
3197     use devices::gic::Gic;
3198 
3199     const LEN: u64 = 4096;
3200 
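    // Builds a flattened device tree for a single-vCPU guest with three MMIO
    // devices (serial, virtio, RTC) and a default vGIC, and checks that
    // create_fdt() succeeds.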
3201     #[test]
3202     fn test_create_fdt_with_devices() {
3203         let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
3204         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
3205 
3206         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
3207             (
3208                 (DeviceType::Serial, DeviceType::Serial.to_string()),
3209                 MmioDeviceInfo {
3210                     addr: 0x00,
3211                     len: LEN,
3212                     irq: 33,
3213                 },
3214             ),
3215             (
3216                 (DeviceType::Virtio(1), "virtio".to_string()),
3217                 MmioDeviceInfo {
3218                     addr: LEN,
3219                     len: LEN,
3220                     irq: 34,
3221                 },
3222             ),
3223             (
3224                 (DeviceType::Rtc, "rtc".to_string()),
3225                 MmioDeviceInfo {
3226                     addr: 2 * LEN,
3227                     len: LEN,
3228                     irq: 35,
3229                 },
3230             ),
3231         ]
3232         .iter()
3233         .cloned()
3234         .collect();
3235 
3236         let hv = hypervisor::new().unwrap();
3237         let vm = hv.create_vm().unwrap();
3238         let gic = vm
3239             .create_vgic(Gic::create_default_config(1))
3240             .expect("Cannot create gic");
3241         assert!(create_fdt(
3242             &mem,
3243             "console=tty0",
3244             vec![0],
3245             Some((0, 0, 0)),
3246             &dev_info,
3247             &gic,
3248             &None,
3249             &Vec::new(),
3250             &BTreeMap::new(),
3251             None,
3252             true,
3253         )
3254         .is_ok())
3255     }
3256 }
3257 
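// Minimal KVM smoke test: map a single page of guest memory, load a short
// real-mode snippet into it, and run one vCPU until it halts, printing each
// I/O write the guest makes to port 0x3f8.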
3258 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
3259 #[test]
3260 pub fn test_vm() {
3261     use hypervisor::VmExit;
3262     use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
3263     // This example is based on https://lwn.net/Articles/658511/
3264     let code = [
3265         0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
3266         0x00, 0xd8, /* add %bl, %al */
3267         0x04, b'0', /* add $'0', %al */
3268         0xee, /* out %al, (%dx) */
3269         0xb0, b'\n', /* mov $'\n', %al */
3270         0xee,  /* out %al, (%dx) */
3271         0xf4,  /* hlt */
3272     ];
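    // With rax = 2 and rbx = 3 (set further down), the guest computes 2 + 3,
    // converts the result to ASCII and writes "5\n" to port 0x3f8 before
    // halting.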
3273 
3274     let mem_size = 0x1000;
3275     let load_addr = GuestAddress(0x1000);
3276     let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();
3277 
3278     let hv = hypervisor::new().unwrap();
3279     let vm = hv.create_vm().expect("new VM creation failed");
3280 
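    // Register every guest memory region with the hypervisor; the two trailing
    // `false` arguments are taken here to mean a writable mapping with
    // dirty-page logging disabled.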
3281     for (index, region) in mem.iter().enumerate() {
3282         let mem_region = vm.make_user_memory_region(
3283             index as u32,
3284             region.start_addr().raw_value(),
3285             region.len(),
3286             region.as_ptr() as u64,
3287             false,
3288             false,
3289         );
3290 
3291         vm.create_user_memory_region(mem_region)
3292             .expect("Cannot configure guest memory");
3293     }
3294     mem.write_slice(&code, load_addr)
3295         .expect("Writing code to memory failed");
3296 
3297     let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");
3298 
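    // With a zeroed code segment, the real-mode rip of 0x1000 set below
    // resolves to the guest-physical address where the code blob was just
    // written.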
3299     let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
3300     vcpu_sregs.cs.base = 0;
3301     vcpu_sregs.cs.selector = 0;
3302     vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");
3303 
3304     let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
3305     vcpu_regs.rip = 0x1000;
3306     vcpu_regs.rax = 2;
3307     vcpu_regs.rbx = 3;
3308     vcpu_regs.rflags = 2;
3309     vcpu.set_regs(&vcpu_regs).expect("set regs failed");
3310 
3311     loop {
3312         match vcpu.run().expect("run failed") {
3313             VmExit::IoOut(addr, data) => {
3314                 println!(
3315                     "IO out -- addr: {:#x} data [{:?}]",
3316                     addr,
3317                     str::from_utf8(data).unwrap()
3318                 );
3319             }
3320             VmExit::Reset => {
3321                 println!("HLT");
3322                 break;
3323             }
3324             r => panic!("unexpected exit reason: {:?}", r),
3325         }
3326     }
3327 }
3328