xref: /cloud-hypervisor/vmm/src/vm.rs (revision 9af2968a7dc47b89bf07ea9dc5e735084efcfa3a)
// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

#[cfg(feature = "acpi")]
use crate::config::NumaConfig;
use crate::config::{
    DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, ValidationError,
    VmConfig, VsockConfig,
};
use crate::cpu;
use crate::device_manager::{
    self, get_win_size, Console, DeviceManager, DeviceManagerError, PtyPair,
};
use crate::device_tree::DeviceTree;
use crate::memory_manager::{Error as MemoryManagerError, MemoryManager};
use crate::migration::{get_vm_snapshot, url_to_path, VM_SNAPSHOT_FILE};
use crate::seccomp_filters::{get_seccomp_filter, Thread};
use crate::{GuestMemoryMmap, GuestRegionMmap};
use crate::{
    PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
};
use anyhow::anyhow;
use arch::get_host_cpu_phys_bits;
#[cfg(feature = "tdx")]
use arch::x86_64::tdx::TdvfSection;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::SgxEpcSection;
use arch::EntryPoint;
use devices::AcpiNotificationFlags;
use hypervisor::vm::{HypervisorVmError, VmmOps};
use linux_loader::cmdline::Cmdline;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
#[cfg(target_arch = "aarch64")]
use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
use linux_loader::loader::KernelLoader;
use seccomp::{SeccompAction, SeccompFilter};
use signal_hook::{
    consts::{SIGINT, SIGTERM, SIGWINCH},
    iterator::backend::Handle,
    iterator::Signals,
};
use std::cmp;
use std::collections::{BTreeMap, HashMap};
use std::convert::TryInto;
use std::ffi::CString;
#[cfg(target_arch = "x86_64")]
use std::fmt;
use std::fs::{File, OpenOptions};
use std::io::{self, Read, Write};
use std::io::{Seek, SeekFrom};
use std::num::Wrapping;
use std::ops::Deref;
use std::sync::{Arc, Mutex, RwLock};
use std::{result, str, thread};
use vm_device::Bus;
use vm_memory::{
    Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
    GuestMemoryRegion,
};
use vm_migration::{
    protocol::{MemoryRange, MemoryRangeTable},
    Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::terminal::Terminal;

#[cfg(target_arch = "aarch64")]
use arch::aarch64::gic::gicv3_its::kvm::{KvmGicV3Its, GIC_V3_ITS_SNAPSHOT_ID};
#[cfg(target_arch = "aarch64")]
use arch::aarch64::gic::kvm::create_gic;
#[cfg(target_arch = "aarch64")]
use devices::interrupt_controller::{self, InterruptController};

/// Errors associated with VM management
#[derive(Debug)]
pub enum Error {
    /// Cannot open the kernel image
    KernelFile(io::Error),

    /// Cannot open the initramfs image
    InitramfsFile(io::Error),

    /// Cannot load the kernel in memory
    KernelLoad(linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    /// Cannot load the UEFI binary in memory
    UefiLoad(arch::aarch64::uefi::Error),

    /// Cannot load the initramfs in memory
    InitramfsLoad,

    /// Cannot load the command line in memory
    LoadCmdLine(linux_loader::loader::Error),

    /// Cannot modify the command line
    CmdLineInsertStr(linux_loader::cmdline::Error),

    /// Cannot convert command line into CString
    CmdLineCString(std::ffi::NulError),

    /// Cannot configure system
    ConfigureSystem(arch::Error),

    /// Cannot enable interrupt controller
    #[cfg(target_arch = "aarch64")]
    EnableInterruptController(interrupt_controller::Error),

    /// The VM state lock is poisoned
    PoisonedState,

    /// Cannot create a device manager.
    DeviceManager(DeviceManagerError),

    /// Write to the console failed.
    Console(vmm_sys_util::errno::Error),

    /// Write to the pty console failed.
    PtyConsole(io::Error),

    /// Cannot set up terminal in raw mode.
    SetTerminalRaw(vmm_sys_util::errno::Error),

    /// Cannot set up terminal in canonical mode.
    SetTerminalCanon(vmm_sys_util::errno::Error),

    /// Memory overflow
    MemOverflow,

    /// Cannot spawn a signal handler thread
    SignalHandlerSpawn(io::Error),

    /// Failed to join on vCPU threads
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    /// VM config is missing.
    VmMissingConfig,

    /// VM is not created
    VmNotCreated,

    /// VM is already created
    VmAlreadyCreated,

    /// VM is not running
    VmNotRunning,

    /// Cannot clone EventFd.
    EventFdClone(io::Error),

    /// Invalid VM state transition
    InvalidStateTransition(VmState, VmState),

    /// Error from CPU handling
    CpuManager(cpu::Error),

    /// Cannot pause devices
    PauseDevices(MigratableError),

    /// Cannot resume devices
    ResumeDevices(MigratableError),

    /// Cannot pause CPUs
    PauseCpus(MigratableError),

    /// Cannot resume CPUs
    ResumeCpus(MigratableError),

    /// Cannot pause VM
    Pause(MigratableError),

    /// Cannot resume VM
    Resume(MigratableError),

    /// Memory manager error
    MemoryManager(MemoryManagerError),

    /// Eventfd write error
    EventfdError(std::io::Error),

    /// Cannot snapshot VM
    Snapshot(MigratableError),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot send VM snapshot
    SnapshotSend(MigratableError),

    /// Cannot convert source URL from Path into &str
    RestoreSourceUrlPathToStr,

    /// Failed to validate config
    ConfigValidation(ValidationError),

    /// No more than one virtio-vsock device
    TooManyVsockDevices,

    /// Failed serializing into JSON
    SerializeJson(serde_json::Error),

    /// Invalid configuration for NUMA.
    InvalidNumaConfig,

    /// Cannot create seccomp filter
    CreateSeccompFilter(seccomp::SeccompError),

    /// Cannot apply seccomp filter
    ApplySeccompFilter(seccomp::Error),

    /// Failed resizing a memory zone.
    ResizeZone,

    /// Cannot activate virtio devices
    ActivateVirtioDevices(device_manager::DeviceManagerError),

    /// Power button not supported
    PowerButtonNotSupported,

    /// Error triggering power button
    PowerButton(device_manager::DeviceManagerError),

    /// Kernel lacks PVH header
    KernelMissingPvhHeader,

    /// Error doing I/O on TDX firmware file
    #[cfg(feature = "tdx")]
    LoadTdvf(std::io::Error),

    /// Error parsing TDVF
    #[cfg(feature = "tdx")]
    ParseTdvf(arch::x86_64::tdx::TdvfError),

    /// Error populating HOB
    #[cfg(feature = "tdx")]
    PopulateHob(arch::x86_64::tdx::TdvfError),

    /// Error allocating TDVF memory
    #[cfg(feature = "tdx")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    /// Error enabling TDX VM
    #[cfg(feature = "tdx")]
    InitializeTdxVm(hypervisor::HypervisorVmError),

    /// Error enabling TDX memory region
    #[cfg(feature = "tdx")]
    InitializeTdxMemoryRegion(hypervisor::HypervisorVmError),

    /// Error finalizing TDX setup
    #[cfg(feature = "tdx")]
    FinalizeTdx(hypervisor::HypervisorVmError),
}
pub type Result<T> = result::Result<T, Error>;

#[derive(Clone, Default)]
pub struct NumaNode {
    memory_regions: Vec<Arc<GuestRegionMmap>>,
    hotplug_regions: Vec<Arc<GuestRegionMmap>>,
    cpus: Vec<u8>,
    distances: BTreeMap<u32, u8>,
    memory_zones: Vec<String>,
    #[cfg(target_arch = "x86_64")]
    sgx_epc_sections: Vec<SgxEpcSection>,
}

impl NumaNode {
    pub fn memory_regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
        &self.memory_regions
    }

    pub fn hotplug_regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
        &self.hotplug_regions
    }

    pub fn cpus(&self) -> &Vec<u8> {
        &self.cpus
    }

    pub fn distances(&self) -> &BTreeMap<u32, u8> {
        &self.distances
    }

    pub fn memory_zones(&self) -> &Vec<String> {
        &self.memory_zones
    }

    #[cfg(target_arch = "x86_64")]
    pub fn sgx_epc_sections(&self) -> &Vec<SgxEpcSection> {
        &self.sgx_epc_sections
    }
}

pub type NumaNodes = BTreeMap<u32, NumaNode>;

#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
}

impl VmState {
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Paused => Ok(()),
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
        }
    }
}
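
// A minimal sanity-check sketch of the transition rules above, added for
// illustration only: it exercises a few representative transitions and is
// not an exhaustive test of the state machine.
#[cfg(test)]
mod vm_state_transition_sketch {
    use super::*;

    #[test]
    fn created_can_start_but_not_shut_down() {
        assert!(VmState::Created.valid_transition(VmState::Running).is_ok());
        assert!(VmState::Created
            .valid_transition(VmState::Shutdown)
            .is_err());
    }

    #[test]
    fn shutdown_can_only_go_back_to_running() {
        assert!(VmState::Shutdown.valid_transition(VmState::Running).is_ok());
        assert!(VmState::Shutdown.valid_transition(VmState::Paused).is_err());
    }
}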

// Debug I/O port
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT: u16 = 0x80;
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT_PREFIX: &str = "Debug I/O port";

#[cfg(target_arch = "x86_64")]
/// Debug I/O port, see:
/// https://www.intel.com/content/www/us/en/support/articles/000005500/boards-and-kits.html
///
/// Since we're not a physical platform, we can freely assign code ranges for
/// debugging specific parts of our virtual platform.
pub enum DebugIoPortRange {
    Firmware,
    Bootloader,
    Kernel,
    Userspace,
    Custom,
}
#[cfg(target_arch = "x86_64")]
impl DebugIoPortRange {
    fn from_u8(value: u8) -> DebugIoPortRange {
        match value {
            0x00..=0x1f => DebugIoPortRange::Firmware,
            0x20..=0x3f => DebugIoPortRange::Bootloader,
            0x40..=0x5f => DebugIoPortRange::Kernel,
            0x60..=0x7f => DebugIoPortRange::Userspace,
            _ => DebugIoPortRange::Custom,
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl fmt::Display for DebugIoPortRange {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            DebugIoPortRange::Firmware => write!(f, "{}: Firmware", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Bootloader => write!(f, "{}: Bootloader", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Kernel => write!(f, "{}: Kernel", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Userspace => write!(f, "{}: Userspace", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Custom => write!(f, "{}: Custom", DEBUG_IOPORT_PREFIX),
        }
    }
}
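
// Illustrative sketch of how raw debug I/O port codes map onto the ranges
// above: a code of 0x2a falls in the 0x20..=0x3f bootloader window, while
// anything past 0x7f is reported as a custom range.
#[cfg(all(test, target_arch = "x86_64"))]
mod debug_ioport_range_sketch {
    use super::*;

    #[test]
    fn codes_map_to_expected_ranges() {
        assert_eq!(
            DebugIoPortRange::from_u8(0x2a).to_string(),
            "Debug I/O port: Bootloader"
        );
        assert_eq!(
            DebugIoPortRange::from_u8(0xff).to_string(),
            "Debug I/O port: Custom"
        );
    }
}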

struct VmOps {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
    #[cfg(target_arch = "x86_64")]
    timestamp: std::time::Instant,
}

impl VmOps {
    #[cfg(target_arch = "x86_64")]
    // Log debug I/O port codes. Note the use of subsec_micros() rather than
    // as_micros(), since the format string prints whole seconds followed by
    // the fractional microsecond part.
    fn log_debug_ioport(&self, code: u8) {
        let elapsed = self.timestamp.elapsed();

        debug!(
            "[{} code 0x{:x}] {}.{:>06} seconds",
            DebugIoPortRange::from_u8(code),
            code,
            elapsed.as_secs(),
            elapsed.subsec_micros()
        );
    }
}

impl VmmOps for VmOps {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> hypervisor::vm::Result<usize> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> hypervisor::vm::Result<usize> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            warn!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    fn mmio_write(&self, gpa: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            warn!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
        if port == DEBUG_IOPORT as u64 && data.len() == 1 {
            self.log_debug_ioport(data[0]);
            return Ok(());
        }

        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}
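
// Sketch of the VmOps forwarding behavior: guest memory accesses go straight
// to the underlying GuestMemoryMmap, so a write followed by a read at the
// same GPA round-trips. Illustrative only; it assumes a tiny one-page guest
// address space built with GuestMemoryMmap::from_ranges() and empty buses
// created via vm_device::Bus::new().
#[cfg(test)]
mod vm_ops_sketch {
    use super::*;

    #[test]
    fn guest_mem_write_then_read_round_trips() {
        let mem = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x1000)]).unwrap();
        let vm_ops = VmOps {
            memory: GuestMemoryAtomic::new(mem),
            #[cfg(target_arch = "x86_64")]
            io_bus: Arc::new(Bus::new()),
            mmio_bus: Arc::new(Bus::new()),
            #[cfg(target_arch = "x86_64")]
            timestamp: std::time::Instant::now(),
        };

        assert_eq!(vm_ops.guest_mem_write(0x100, b"vmm").unwrap(), 3);
        let mut buf = [0u8; 3];
        assert_eq!(vm_ops.guest_mem_read(0x100, &mut buf).unwrap(), 3);
        assert_eq!(&buf, b"vmm");
    }
}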

pub fn physical_bits(max_phys_bits: Option<u8>, #[cfg(feature = "tdx")] tdx_enabled: bool) -> u8 {
    #[cfg(not(feature = "tdx"))]
    let host_phys_bits = get_host_cpu_phys_bits();
    #[cfg(feature = "tdx")]
    let mut host_phys_bits = get_host_cpu_phys_bits();

    #[cfg(feature = "tdx")]
    if tdx_enabled {
        // When running a TDX guest, the Guest Physical Address space is
        // limited by a shared bit that is located on bit 47 for 4-level
        // paging, and on bit 51 for 5-level paging (when the GPAW bit is 1).
        // In order to keep things simple, and since a 47-bit address space is
        // 128TiB large, we ensure to limit the physical addressable space to
        // 47 bits when running TDX.
        host_phys_bits = std::cmp::min(host_phys_bits, 47)
    }

    cmp::min(host_phys_bits, max_phys_bits.unwrap_or(host_phys_bits))
}
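
// Sketch of the clamping behavior of physical_bits(): with no explicit
// request the host width is used unchanged, and a request larger than the
// host width is capped. Illustrative only, for non-TDX builds (TDX builds
// take the extra tdx_enabled argument and additionally cap at 47 bits).
#[cfg(all(test, not(feature = "tdx")))]
mod physical_bits_sketch {
    use super::*;

    #[test]
    fn clamps_to_host_width() {
        let host = get_host_cpu_phys_bits();
        assert_eq!(physical_bits(None), host);
        assert_eq!(physical_bits(Some(host + 1)), host);
        assert_eq!(physical_bits(Some(host - 1)), host - 1);
    }
}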

pub struct Vm {
    kernel: Option<File>,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    on_tty: bool,
    signals: Option<Handle>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(not(feature = "kvm"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,
    #[cfg(feature = "acpi")]
    numa_nodes: NumaNodes,
    seccomp_action: SeccompAction,
    exit_evt: EventFd,
}

impl Vm {
    #[allow(clippy::too_many_arguments)]
    fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))] _saved_clock: Option<
            hypervisor::ClockData,
        >,
        activate_evt: EventFd,
    ) -> Result<Self> {
        config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        #[cfg(feature = "acpi")]
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let force_iommu = config.lock().unwrap().tdx.is_some();
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        let device_manager = DeviceManager::new(
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            &exit_evt,
            &reset_evt,
            seccomp_action.clone(),
            #[cfg(feature = "acpi")]
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
        )
        .map_err(Error::DeviceManager)?;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::clone(device_manager.lock().unwrap().io_bus());
        let mmio_bus = Arc::clone(device_manager.lock().unwrap().mmio_bus());
        // Create the VmOps structure, which implements the VmmOps trait,
        // and send it to the hypervisor.
        let vm_ops: Arc<Box<dyn VmmOps>> = Arc::new(Box::new(VmOps {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            #[cfg(target_arch = "x86_64")]
            timestamp: std::time::Instant::now(),
        }));

        let exit_evt_clone = exit_evt.try_clone().map_err(Error::EventFdClone)?;
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        let cpu_manager = cpu::CpuManager::new(
            &config.lock().unwrap().cpus.clone(),
            &device_manager,
            &memory_manager,
            vm.clone(),
            exit_evt_clone,
            reset_evt,
            hypervisor,
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            #[cfg(feature = "acpi")]
            &numa_nodes,
        )
        .map_err(Error::CpuManager)?;

        let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO as i32) } != 0;
        let kernel = config
            .lock()
            .unwrap()
            .kernel
            .as_ref()
            .map(|k| File::open(&k.path))
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .initramfs
            .as_ref()
            .map(|i| File::open(&i.path))
            .transpose()
            .map_err(Error::InitramfsFile)?;

        Ok(Vm {
            kernel,
            initramfs,
            device_manager,
            config,
            on_tty,
            threads: Vec::with_capacity(1),
            signals: None,
            state: RwLock::new(VmState::Created),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock: _saved_clock,
            #[cfg(feature = "acpi")]
            numa_nodes,
            seccomp_action: seccomp_action.clone(),
            exit_evt,
        })
    }

    #[cfg(feature = "acpi")]
    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Cannot define the same NUMA node twice");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has already been set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
    ) -> Result<Self> {
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        hypervisor.check_required_extensions().unwrap();
        #[cfg(feature = "tdx")]
        let vm = hypervisor
            .create_vm_with_type(if tdx_enabled {
                2 // KVM_X86_TDX_VM
            } else {
                0 // KVM_X86_LEGACY_VM
            })
            .unwrap();
        #[cfg(not(feature = "tdx"))]
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let phys_bits = physical_bits(
            config.lock().unwrap().cpus.max_phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        );
        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            false,
            phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        )
        .map_err(Error::MemoryManager)?;

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_config) = config.lock().unwrap().sgx_epc.clone() {
                memory_manager
                    .lock()
                    .unwrap()
                    .setup_sgx(sgx_epc_config, &vm)
                    .map_err(Error::MemoryManager)?;
            }
        }

        let new_vm = Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            None,
            activate_evt,
        )?;

        // The device manager must create the devices from here as it is part
        // of the regular code path creating everything from scratch.
        new_vm
            .device_manager
            .lock()
            .unwrap()
            .create_devices(serial_pty, console_pty)
            .map_err(Error::DeviceManager)?;
        Ok(new_vm)
    }

    #[allow(clippy::too_many_arguments)]
    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        exit_evt: EventFd,
        reset_evt: EventFd,
        source_url: Option<&str>,
        prefault: bool,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();
        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        let config = vm_snapshot.config;
        if let Some(state) = vm_snapshot.state {
            vm.set_state(state)
                .map_err(|e| Error::Restore(MigratableError::Restore(e.into())))?;
        }

        let memory_manager = if let Some(memory_manager_snapshot) =
            snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
        {
            let phys_bits = physical_bits(
                config.lock().unwrap().cpus.max_phys_bits,
                #[cfg(feature = "tdx")]
                config.lock().unwrap().tdx.is_some(),
            );
            MemoryManager::new_from_snapshot(
                memory_manager_snapshot,
                vm.clone(),
                &config.lock().unwrap().memory.clone(),
                source_url,
                prefault,
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            return Err(Error::Restore(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            ))));
        };

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            vm_snapshot.clock,
            activate_evt,
        )
    }

    pub fn new_from_migration(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();
        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let phys_bits = physical_bits(
            config.lock().unwrap().cpus.max_phys_bits,
            #[cfg(feature = "tdx")]
            config.lock().unwrap().tdx.is_some(),
        );

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            false,
            phys_bits,
            #[cfg(feature = "tdx")]
            false,
        )
        .map_err(Error::MemoryManager)?;

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            None,
            activate_evt,
        )
    }

    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs = self.initramfs.as_ref().unwrap();
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs
            .seek(SeekFrom::Start(0))
            .map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    fn get_cmdline(&mut self) -> Result<CString> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE);
        cmdline
            .insert_str(self.config.lock().unwrap().cmdline.args.clone())
            .map_err(Error::CmdLineInsertStr)?;
        for entry in self.device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        CString::new(cmdline).map_err(Error::CmdLineCString)
    }

    #[cfg(target_arch = "aarch64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        let entry_addr = match linux_loader::loader::pe::PE::load(
            mem.deref(),
            Some(GuestAddress(arch::get_kernel_start())),
            &mut kernel,
            None,
        ) {
            Ok(entry_addr) => entry_addr,
            // Try to load the binary as a kernel PE file first.
            // If that fails, retry loading it as a UEFI binary.
            // As the UEFI binary is formatless, it must be the last option to try.
            Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                arch::aarch64::uefi::load_uefi(
                    mem.deref(),
                    GuestAddress(arch::get_uefi_start()),
                    &mut kernel,
                )
                .map_err(Error::UefiLoad)?;
                // The entry point offset in UEFI image is always 0.
                return Ok(EntryPoint {
                    entry_addr: GuestAddress(arch::get_uefi_start()),
                });
            }
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        let entry_point_addr: GuestAddress = entry_addr.kernel_load;

        Ok(EntryPoint {
            entry_addr: entry_point_addr,
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        info!("Loading kernel");
        let cmdline_cstring = self.get_cmdline()?;
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        let entry_addr = match linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        ) {
            Ok(entry_addr) => entry_addr,
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        linux_loader::loader::load_cmdline(
            mem.deref(),
            arch::layout::CMDLINE_START,
            &cmdline_cstring,
        )
        .map_err(Error::LoadCmdLine)?;

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest
            info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint { entry_addr })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self) -> Result<()> {
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();

        #[allow(unused_mut, unused_assignments)]
        let mut rsdp_addr: Option<GuestAddress> = None;

        #[cfg(feature = "acpi")]
        {
            rsdp_addr = Some(crate::acpi::create_acpi_tables(
                &mem,
                &self.device_manager,
                &self.cpu_manager,
                &self.memory_manager,
                &self.numa_nodes,
            ));
            info!(
                "Created ACPI tables: rsdp_addr = 0x{:x}",
                rsdp_addr.unwrap().0
            );
        }

        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            &initramfs_config,
            boot_vcpus,
            rsdp_addr,
            sgx_epc_region,
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn configure_system(&mut self) -> Result<()> {
        let cmdline_cstring = self.get_cmdline()?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        let pci_space_start: GuestAddress = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .start_of_device_area();

        let pci_space_end: GuestAddress = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .end_of_device_area();

        let pci_space_size = pci_space_end
            .checked_offset_from(pci_space_start)
            .ok_or(Error::MemOverflow)?
            + 1;

        let pci_space = (pci_space_start.0, pci_space_size);

        #[cfg(feature = "acpi")]
        {
            let _ = crate::acpi::create_acpi_tables(
                &mem,
                &self.device_manager,
                &self.cpu_manager,
                &self.memory_manager,
                &self.numa_nodes,
            );
        }

        let gic_device = create_gic(
            &self.memory_manager.lock().as_ref().unwrap().vm,
            self.cpu_manager.lock().unwrap().boot_vcpus() as u64,
        )
        .map_err(|e| {
            Error::ConfigureSystem(arch::Error::AArch64Setup(arch::aarch64::Error::SetupGic(e)))
        })?;

        arch::configure_system(
            &mem,
            &cmdline_cstring,
            vcpu_mpidrs,
            device_info,
            &initramfs_config,
            &pci_space,
            &*gic_device,
        )
        .map_err(Error::ConfigureSystem)?;

        // Update the GIC entity in device manager
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gic_device(Arc::new(Mutex::new(gic_device)));

        // Activate gic device
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .enable()
            .map_err(Error::EnableInterruptController)?;

        Ok(())
    }

    pub fn serial_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().serial_pty()
    }

    pub fn console_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().console_pty()
    }

    pub fn shutdown(&mut self) -> Result<()> {
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        let new_state = VmState::Shutdown;

        state.valid_transition(new_state)?;

        if self.on_tty {
            // Don't forget to set the terminal back to canonical mode
            // before exiting.
            io::stdin()
                .lock()
                .set_canon_mode()
                .map_err(Error::SetTerminalCanon)?;
        }

        // Trigger the termination of the signal_handler thread
        if let Some(signals) = self.signals.take() {
            signals.close();
        }

        // Wake up the DeviceManager threads so they will get terminated cleanly
        self.device_manager
            .lock()
            .unwrap()
            .resume()
            .map_err(Error::Resume)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .shutdown()
            .map_err(Error::CpuManager)?;

        // Wait for all the threads to finish
        for thread in self.threads.drain(..) {
            thread.join().map_err(Error::ThreadCleanup)?
        }
        *state = new_state;

        event!("vm", "shutdown");

        Ok(())
    }

    pub fn resize(
        &mut self,
        desired_vcpus: Option<u8>,
        desired_memory: Option<u64>,
        desired_balloon: Option<u64>,
    ) -> Result<()> {
        event!("vm", "resizing");

        if let Some(desired_vcpus) = desired_vcpus {
            if self
                .cpu_manager
                .lock()
                .unwrap()
                .resize(desired_vcpus)
                .map_err(Error::CpuManager)?
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
        }

        if let Some(desired_memory) = desired_memory {
            let new_region = self
                .memory_manager
                .lock()
                .unwrap()
                .resize(desired_memory)
                .map_err(Error::MemoryManager)?;

            let mut memory_config = &mut self.config.lock().unwrap().memory;

            if let Some(new_region) = &new_region {
                self.device_manager
                    .lock()
                    .unwrap()
                    .update_memory(new_region)
                    .map_err(Error::DeviceManager)?;

                match memory_config.hotplug_method {
                    HotplugMethod::Acpi => {
                        self.device_manager
                            .lock()
                            .unwrap()
                            .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                            .map_err(Error::DeviceManager)?;
                    }
                    HotplugMethod::VirtioMem => {}
                }
            }

            // We update the VM config regardless of the actual guest resize
            // operation result (happened or not), so that if the VM reboots
            // it will be running with the last configured memory size.
            match memory_config.hotplug_method {
                HotplugMethod::Acpi => memory_config.size = desired_memory,
                HotplugMethod::VirtioMem => {
                    if desired_memory > memory_config.size {
                        memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                    } else {
                        memory_config.hotplugged_size = None;
                    }
                }
            }
        }

        if let Some(desired_balloon) = desired_balloon {
            self.device_manager
                .lock()
                .unwrap()
                .resize_balloon(desired_balloon)
                .map_err(Error::DeviceManager)?;

            // Update the configuration value for the balloon size to ensure
            // a reboot would use the right value.
            if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
                balloon_config.size = desired_balloon;
            }
        }

        event!("vm", "resized");

        Ok(())
    }

    pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
        let memory_config = &mut self.config.lock().unwrap().memory;

        if let Some(zones) = &mut memory_config.zones {
            for zone in zones.iter_mut() {
                if zone.id == id {
                    if desired_memory >= zone.size {
                        let hotplugged_size = desired_memory - zone.size;
                        self.memory_manager
                            .lock()
                            .unwrap()
                            .resize_zone(&id, desired_memory - zone.size)
                            .map_err(Error::MemoryManager)?;
                        // We update the memory zone config regardless of the
                        // actual 'resize-zone' operation result (happened or
                        // not), so that if the VM reboots it will be running
                        // with the last configured memory zone size.
                        zone.hotplugged_size = Some(hotplugged_size);

                        return Ok(());
                    } else {
                        error!(
                            "Cannot request less ({}) than boot RAM ({}) for \
                            this memory zone",
                            desired_memory, zone.size,
                        );
                        return Err(Error::ResizeZone);
                    }
                }
            }
        }

        error!("Could not find the memory zone {} for the resize", id);
        Err(Error::ResizeZone)
    }

    fn add_to_config<T>(devices: &mut Option<Vec<T>>, device: T) {
        if let Some(devices) = devices {
            devices.push(device);
        } else {
            *devices = Some(vec![device]);
        }
    }

    pub fn add_device(&mut self, mut _device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.devices, _device_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_device(&mut _device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device will be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.devices, _device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn remove_device(&mut self, _id: String) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .remove_device(_id.clone())
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by removing the device. This is important to
        // ensure the device will not be created in case of a reboot.
        let mut config = self.config.lock().unwrap();

        // Remove if VFIO device
        if let Some(devices) = config.devices.as_mut() {
            devices.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if disk device
        if let Some(disks) = config.disks.as_mut() {
            disks.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if net device
        if let Some(net) = config.net.as_mut() {
            net.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if pmem device
        if let Some(pmem) = config.pmem.as_mut() {
            pmem.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if vsock device
        if let Some(vsock) = config.vsock.as_ref() {
            if vsock.id.as_ref() == Some(&_id) {
                config.vsock = None;
            }
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;
        Ok(())
    }

    pub fn add_disk(&mut self, mut _disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.disks, _disk_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_disk(&mut _disk_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device will be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.disks, _disk_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_fs(&mut self, mut _fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.fs, _fs_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_fs(&mut _fs_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device will be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.fs, _fs_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_pmem(&mut self, mut _pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.pmem, _pmem_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_pmem(&mut _pmem_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device will be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.pmem, _pmem_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_net(&mut self, mut _net_cfg: NetConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.net, _net_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_net(&mut _net_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device will be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.net, _net_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_vsock(&mut self, mut _vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
        if self.config.lock().unwrap().vsock.is_some() {
            return Err(Error::TooManyVsockDevices);
        }

        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            config.vsock = Some(_vsock_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vsock(&mut _vsock_cfg)
            .map_err(Error::DeviceManager)?;

1563         // Update VmConfig by adding the new device. This is important to
1564         // ensure the device will be created in case of a reboot.
1565         {
1566             let mut config = self.config.lock().unwrap();
1567             config.vsock = Some(_vsock_cfg);
1568         }
1569 
1570         self.device_manager
1571             .lock()
1572             .unwrap()
1573             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1574             .map_err(Error::DeviceManager)?;
1575 
1576         Ok(pci_device_info)
1577     }
1578 
1579     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1580         Ok(self.device_manager.lock().unwrap().counters())
1581     }
1582 
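    // Dedicated signal-handling loop: SIGWINCH propagates host terminal
    // resize events to the guest console, while SIGTERM/SIGINT restore the
    // terminal to canonical mode (when running on a tty) and request a VMM
    // exit through `exit_evt`.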
1583     fn os_signal_handler(
1584         mut signals: Signals,
1585         console_input_clone: Arc<Console>,
1586         on_tty: bool,
1587         exit_evt: EventFd,
1588     ) {
1589         for signal in signals.forever() {
1590             match signal {
1591                 SIGWINCH => {
1592                     let (col, row) = get_win_size();
1593                     console_input_clone.update_console_size(col, row);
1594                 }
1595                 SIGTERM | SIGINT => {
1596                     if on_tty {
1597                         io::stdin()
1598                             .lock()
1599                             .set_canon_mode()
1600                             .expect("failed to restore terminal mode");
1601                     }
1602                     if exit_evt.write(1).is_err() {
1603                         std::process::exit(1);
1604                     }
1605                 }
1606                 _ => (),
1607             }
1608         }
1609     }
1610 
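    // TDX bring-up: the TD must be configured with the common CPUID table
    // and the maximum vCPU count before any vCPU is created.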
1611     #[cfg(feature = "tdx")]
1612     fn init_tdx(&mut self) -> Result<()> {
1613         let cpuid = self.cpu_manager.lock().unwrap().common_cpuid();
1614         let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32;
1615         self.vm
1616             .tdx_init(&cpuid, max_vcpus)
1617             .map_err(Error::InitializeTdxVm)?;
1618         Ok(())
1619     }
1620 
1621     #[cfg(feature = "tdx")]
1622     fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> {
1623         use arch::x86_64::tdx::*;
1624         // The TDVF file contains a table of sections as well as code
1625         let mut firmware_file =
1626             File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
1627                 .map_err(Error::LoadTdvf)?;
1628 
1629         // Parse the section table; RAM backing the sections is allocated later
1630         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1631     }
1632 
1633     #[cfg(feature = "tdx")]
1634     fn populate_tdx_sections(&mut self, sections: &[TdvfSection]) -> Result<Option<u64>> {
1635         use arch::x86_64::tdx::*;
1636         // Get the memory end *before* we start adding TDVF ram regions
1637         let boot_guest_memory = self
1638             .memory_manager
1639             .lock()
1640             .as_ref()
1641             .unwrap()
1642             .boot_guest_memory();
1643         for section in sections {
1644             // No need to allocate if the section falls within guest RAM ranges
1645             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1646                 info!(
1647                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1648                     section
1649                 );
1650                 continue;
1651             }
1652 
1653             info!("Allocating TDVF Section: {:x?}", section);
1654             self.memory_manager
1655                 .lock()
1656                 .unwrap()
1657                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1658                 .map_err(Error::AllocatingTdvfMemory)?;
1659         }
1660 
1661         // The TDVF file contains a table of sections as well as code
1662         let mut firmware_file =
1663             File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
1664                 .map_err(Error::LoadTdvf)?;
1665 
1666         // The guest memory now has all the required regions, so it is safe
1667         // to copy the TDVF file contents into it.
1668         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1669         let mem = guest_memory.memory();
1670         let mut hob_offset = None;
1671         for section in sections {
1672             info!("Populating TDVF Section: {:x?}", section);
1673             match section.r#type {
1674                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1675                     info!("Copying section to guest memory");
1676                     firmware_file
1677                         .seek(SeekFrom::Start(section.data_offset as u64))
1678                         .map_err(Error::LoadTdvf)?;
1679                     mem.read_from(
1680                         GuestAddress(section.address),
1681                         &mut firmware_file,
1682                         section.data_size as usize,
1683                     )
1684                     .unwrap();
1685                 }
1686                 TdvfSectionType::TdHob => {
1687                     hob_offset = Some(section.address);
1688                 }
1689                 _ => {}
1690             }
1691         }
1692 
1693         // Generate the HOB (Hand-Off Block) list describing the memory layout
1694         let mut hob = TdHob::start(hob_offset.unwrap());
1695 
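        // Build a stack of the remaining sections: BFV/CFV are filtered out
        // (already copied above), and the rest are sorted by address and
        // reversed so that pop() yields the lowest-addressed section first.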
1696         let mut sorted_sections = sections.to_vec();
1697         sorted_sections.retain(|section| {
1698             !matches!(section.r#type, TdvfSectionType::Bfv | TdvfSectionType::Cfv)
1699         });
1700         sorted_sections.sort_by_key(|section| section.address);
1701         sorted_sections.reverse();
1702         let mut current_section = sorted_sections.pop();
1703 
1704         // RAM regions interleaved with TDVF sections
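        // Walk each boot RAM region in address order, emitting one HOB memory
        // resource per contiguous range: ranges covered by a TDVF section are
        // reported with ram == false, everything else as RAM.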
1705         let mut next_start_addr = 0;
1706         for region in boot_guest_memory.iter() {
1707             let region_start = region.start_addr().0;
1708             let region_end = region.last_addr().0;
1709             if region_start > next_start_addr {
1710                 next_start_addr = region_start;
1711             }
1712 
1713             loop {
1714                 let (start, size, ram) = if let Some(section) = &current_section {
1715                     if section.address <= next_start_addr {
1716                         (section.address, section.size, false)
1717                     } else {
1718                         let last_addr = std::cmp::min(section.address - 1, region_end);
1719                         (next_start_addr, last_addr - next_start_addr + 1, true)
1720                     }
1721                 } else {
1722                     (next_start_addr, region_end - next_start_addr + 1, true)
1723                 };
1724 
1725                 hob.add_memory_resource(&mem, start, size, ram)
1726                     .map_err(Error::PopulateHob)?;
1727 
1728                 if !ram {
1729                     current_section = sorted_sections.pop();
1730                 }
1731 
1732                 next_start_addr = start + size;
1733 
1734                 if next_start_addr > region_end {
1735                     break;
1736                 }
1737             }
1738         }
1739 
1740         // MMIO regions
1741         hob.add_mmio_resource(
1742             &mem,
1743             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1744             arch::layout::APIC_START.raw_value()
1745                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1746         )
1747         .map_err(Error::PopulateHob)?;
1748         let start_of_device_area = self
1749             .memory_manager
1750             .lock()
1751             .unwrap()
1752             .start_of_device_area()
1753             .raw_value();
1754         let end_of_device_area = self
1755             .memory_manager
1756             .lock()
1757             .unwrap()
1758             .end_of_device_area()
1759             .raw_value();
1760         hob.add_mmio_resource(
1761             &mem,
1762             start_of_device_area,
1763             end_of_device_area - start_of_device_area,
1764         )
1765         .map_err(Error::PopulateHob)?;
1766 
1767         hob.finish(&mem).map_err(Error::PopulateHob)?;
1768 
1769         Ok(hob_offset)
1770     }
1771 
1772     #[cfg(feature = "tdx")]
1773     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1774         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1775         let mem = guest_memory.memory();
1776 
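        // Register each TDVF section with the TDX module: its host virtual
        // address, guest physical address, size, and whether its contents
        // must be extended into the TD measurement (EXTENDMR attribute).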
1777         for section in sections {
1778             self.vm
1779                 .tdx_init_memory_region(
1780                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
1781                     section.address,
1782                     section.size,
1783                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
1784                     section.attributes == 1,
1785                 )
1786                 .map_err(Error::InitializeTdxMemoryRegion)?;
1787         }
1788         Ok(())
1789     }
1790 
1791     pub fn boot(&mut self) -> Result<()> {
1792         info!("Booting VM");
1793         event!("vm", "booting");
1794         let current_state = self.get_state()?;
1795         if current_state == VmState::Paused {
1796             return self.resume().map_err(Error::Resume);
1797         }
1798 
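        // Boot sequence: load the kernel (if one is configured), perform the
        // platform-specific setup (TDX when enabled), create and configure
        // the boot vCPUs, then start them and wire up console signal
        // handling.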
1799         let new_state = VmState::Running;
1800         current_state.valid_transition(new_state)?;
1801 
1802         // Load kernel if configured
1803         let entry_point = if self.kernel.is_some() {
1804             Some(self.load_kernel()?)
1805         } else {
1806             None
1807         };
1808 
1809         // The initial TDX configuration must be done before the vCPUs are
1810         // created
1811         #[cfg(feature = "tdx")]
1812         if self.config.lock().unwrap().tdx.is_some() {
1813             self.init_tdx()?;
1814         }
1815 
1816         // Create and configure vcpus
1817         self.cpu_manager
1818             .lock()
1819             .unwrap()
1820             .create_boot_vcpus(entry_point)
1821             .map_err(Error::CpuManager)?;
1822 
1823         #[cfg(feature = "tdx")]
1824         let sections = self.extract_tdvf_sections()?;
1825 
1826         // Configuring the TDX regions requires that the vCPUs are created
1827         #[cfg(feature = "tdx")]
1828         let hob_address = if self.config.lock().unwrap().tdx.is_some() {
1829             self.populate_tdx_sections(&sections)?
1830         } else {
1831             None
1832         };
1833 
1834         // Configure shared state based on loaded kernel
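        // `map` + `transpose` turns the Option<Result<()>> from running
        // configure_system() only when an entry point exists into a
        // Result<Option<()>> that `?` can propagate.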
1835         entry_point.map(|_| self.configure_system()).transpose()?;
1836 
1837         #[cfg(feature = "tdx")]
1838         if let Some(hob_address) = hob_address {
1839             // With the HOB address extracted, the vCPUs can have
1840             // their TDX state configured.
1841             self.cpu_manager
1842                 .lock()
1843                 .unwrap()
1844                 .initialize_tdx(hob_address)
1845                 .map_err(Error::CpuManager)?;
1846             self.init_tdx_memory(&sections)?;
1847             // With TDX memory and CPU state configured, TDX setup is complete
1848             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
1849         }
1850 
1851         self.cpu_manager
1852             .lock()
1853             .unwrap()
1854             .start_boot_vcpus()
1855             .map_err(Error::CpuManager)?;
1856 
1857         if self
1858             .device_manager
1859             .lock()
1860             .unwrap()
1861             .console()
1862             .input_enabled()
1863         {
1864             let console = self.device_manager.lock().unwrap().console().clone();
1865             let signals = Signals::new(&[SIGWINCH, SIGINT, SIGTERM]);
1866             match signals {
1867                 Ok(signals) => {
1868                     self.signals = Some(signals.handle());
1869                     let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?;
1870                     let on_tty = self.on_tty;
1871                     let signal_handler_seccomp_filter =
1872                         get_seccomp_filter(&self.seccomp_action, Thread::SignalHandler)
1873                             .map_err(Error::CreateSeccompFilter)?;
1874                     self.threads.push(
1875                         thread::Builder::new()
1876                             .name("signal_handler".to_string())
1877                             .spawn(move || {
1878                                 if let Err(e) = SeccompFilter::apply(signal_handler_seccomp_filter)
1879                                     .map_err(Error::ApplySeccompFilter)
1880                                 {
1881                                     error!("Error applying seccomp filter: {:?}", e);
1882                                     return;
1883                                 }
1884 
1885                                 Vm::os_signal_handler(signals, console, on_tty, exit_evt);
1886                             })
1887                             .map_err(Error::SignalHandlerSpawn)?,
1888                     );
1889                 }
1890                 Err(e) => error!("Unable to register signal handlers: {}", e),
1891             }
1892 
1893             if self.on_tty {
1894                 io::stdin()
1895                     .lock()
1896                     .set_raw_mode()
1897                     .map_err(Error::SetTerminalRaw)?;
1898             }
1899         }
1900 
1901         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1902         *state = new_state;
1903         event!("vm", "booted");
1904         Ok(())
1905     }
1906 
1907     pub fn handle_pty(&self) -> Result<()> {
1908         // Could be a little dangerous: this picks up a lock on device_manager
1909         // and goes into a blocking read. If the epoll loop starts being
1910         // serviced by multiple threads, this will likely need to be revisited.
1911         let dm = self.device_manager.lock().unwrap();
1912         let mut out = [0u8; 64];
1913         if let Some(mut pty) = dm.serial_pty() {
1914             let count = pty.main.read(&mut out).map_err(Error::PtyConsole)?;
1915             let console = dm.console();
1916             if console.input_enabled() {
1917                 console
1918                     .queue_input_bytes_serial(&out[..count])
1919                     .map_err(Error::Console)?;
1920             }
1921         };
1922         let count = match dm.console_pty() {
1923             Some(mut pty) => pty.main.read(&mut out).map_err(Error::PtyConsole)?,
1924             None => return Ok(()),
1925         };
1926         let console = dm.console();
1927         if console.input_enabled() {
1928             console.queue_input_bytes_console(&out[..count])
1929         }
1930 
1931         Ok(())
1932     }
1933 
1934     pub fn handle_stdin(&self) -> Result<()> {
1935         let mut out = [0u8; 64];
1936         let count = io::stdin()
1937             .lock()
1938             .read_raw(&mut out)
1939             .map_err(Error::Console)?;
1940 
1941         // Replace "\n" with "\r" to deal with Windows SAC (#1170)
1942         if count == 1 && out[0] == 0x0a {
1943             out[0] = 0x0d;
1944         }
1945 
1946         if self
1947             .device_manager
1948             .lock()
1949             .unwrap()
1950             .console()
1951             .input_enabled()
1952         {
1953             self.device_manager
1954                 .lock()
1955                 .unwrap()
1956                 .console()
1957                 .queue_input_bytes(&out[..count])
1958                 .map_err(Error::Console)?;
1959         }
1960 
1961         Ok(())
1962     }
1963 
1964     /// Gets a thread-safe reference counted pointer to the VM configuration.
1965     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
1966         Arc::clone(&self.config)
1967     }
1968 
1969     /// Get the VM state. Returns an error if the state is poisoned.
1970     pub fn get_state(&self) -> Result<VmState> {
1971         self.state
1972             .try_read()
1973             .map_err(|_| Error::PoisonedState)
1974             .map(|state| *state)
1975     }
1976 
1977     /// Load saved clock from snapshot
1978     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
1979     pub fn load_clock_from_snapshot(
1980         &mut self,
1981         snapshot: &Snapshot,
1982     ) -> Result<Option<hypervisor::ClockData>> {
1983         let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
1984         self.saved_clock = vm_snapshot.clock;
1985         Ok(self.saved_clock)
1986     }
1987 
1988     #[cfg(target_arch = "aarch64")]
1989     /// Add the vGIC section to the VM snapshot.
1990     fn add_vgic_snapshot_section(
1991         &self,
1992         vm_snapshot: &mut Snapshot,
1993     ) -> std::result::Result<(), MigratableError> {
1994         let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
1995         let gic_device = Arc::clone(
1996             self.device_manager
1997                 .lock()
1998                 .unwrap()
1999                 .get_interrupt_controller()
2000                 .unwrap()
2001                 .lock()
2002                 .unwrap()
2003                 .get_gic_device()
2004                 .unwrap(),
2005         );
2006 
2007         gic_device
2008             .lock()
2009             .unwrap()
2010             .set_gicr_typers(&saved_vcpu_states);
2011 
2012         vm_snapshot.add_snapshot(
2013             gic_device
2014                 .lock()
2015                 .unwrap()
2016                 .as_any_concrete_mut()
2017                 .downcast_mut::<KvmGicV3Its>()
2018                 .unwrap()
2019                 .snapshot()?,
2020         );
2021 
2022         Ok(())
2023     }
2024 
2025     #[cfg(target_arch = "aarch64")]
2026     /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing.
2027     fn restore_vgic_and_enable_interrupt(
2028         &self,
2029         vm_snapshot: &Snapshot,
2030     ) -> std::result::Result<(), MigratableError> {
2031         let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
2032         // The number of vCPUs is the same as the number of saved vCPU states.
2033         let vcpu_numbers = saved_vcpu_states.len();
2034 
2035         // Create a GIC device here, as the GIC will not be created when
2036         // restoring the device manager. Its state is then restored from the
2037         // GICv3-ITS snapshot further below.
2038         let mut gic_device = create_gic(&self.vm, vcpu_numbers.try_into().unwrap())
2039             .map_err(|e| MigratableError::Restore(anyhow!("Could not create GIC: {:#?}", e)))?;
2040 
2041         // Here we prepare the GICR_TYPER registers from the restored vCPU states.
2042         gic_device.set_gicr_typers(&saved_vcpu_states);
2043 
2044         let gic_device = Arc::new(Mutex::new(gic_device));
2045         // Update the GIC entity in device manager
2046         self.device_manager
2047             .lock()
2048             .unwrap()
2049             .get_interrupt_controller()
2050             .unwrap()
2051             .lock()
2052             .unwrap()
2053             .set_gic_device(Arc::clone(&gic_device));
2054 
2055         // Restore GIC states.
2056         if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) {
2057             gic_device
2058                 .lock()
2059                 .unwrap()
2060                 .as_any_concrete_mut()
2061                 .downcast_mut::<KvmGicV3Its>()
2062                 .unwrap()
2063                 .restore(*gicv3_its_snapshot.clone())?;
2064         } else {
2065             return Err(MigratableError::Restore(anyhow!(
2066                 "Missing GicV3Its snapshot"
2067             )));
2068         }
2069 
2070         // Activate the GIC device
2071         self.device_manager
2072             .lock()
2073             .unwrap()
2074             .get_interrupt_controller()
2075             .unwrap()
2076             .lock()
2077             .unwrap()
2078             .enable()
2079             .map_err(|e| {
2080                 MigratableError::Restore(anyhow!(
2081                     "Could not enable interrupt controller routing: {:#?}",
2082                     e
2083                 ))
2084             })?;
2085 
2086         Ok(())
2087     }
2088 
2089     /// Gets the actual size of the balloon.
2090     pub fn balloon_size(&self) -> u64 {
2091         self.device_manager.lock().unwrap().balloon_size()
2092     }
2093 
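    // Live migration helpers: a MemoryRangeTable lists (gpa, length) pairs;
    // receive_memory_regions fills each range from the given reader, and
    // send_memory_regions writes each range out to the given writer.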
2094     pub fn receive_memory_regions<F>(
2095         &mut self,
2096         ranges: &MemoryRangeTable,
2097         fd: &mut F,
2098     ) -> std::result::Result<(), MigratableError>
2099     where
2100         F: Read,
2101     {
2102         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2103         let mem = guest_memory.memory();
2104 
2105         for range in ranges.regions() {
2106             mem.read_exact_from(GuestAddress(range.gpa), fd, range.length as usize)
2107                 .map_err(|e| {
2108                     MigratableError::MigrateReceive(anyhow!(
2109                         "Error transferring memory from socket: {}",
2110                         e
2111                     ))
2112                 })?;
2113         }
2114         Ok(())
2115     }
2116 
2117     pub fn send_memory_regions<F>(
2118         &mut self,
2119         ranges: &MemoryRangeTable,
2120         fd: &mut F,
2121     ) -> std::result::Result<(), MigratableError>
2122     where
2123         F: Write,
2124     {
2125         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2126         let mem = guest_memory.memory();
2127 
2128         for range in ranges.regions() {
2129             mem.write_all_to(GuestAddress(range.gpa), fd, range.length as usize)
2130                 .map_err(|e| {
2131                     MigratableError::MigrateSend(anyhow!(
2132                         "Error transferring memory to socket: {}",
2133                         e
2134                     ))
2135                 })?;
2136         }
2137 
2138         Ok(())
2139     }
2140 
2141     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2142         let mut table = MemoryRangeTable::default();
2143         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2144 
2145         for region in guest_memory.memory().iter() {
2146             table.push(MemoryRange {
2147                 gpa: region.start_addr().raw_value(),
2148                 length: region.len() as u64,
2149             });
2150         }
2151 
2152         Ok(table)
2153     }
2154 
2155     pub fn start_memory_dirty_log(&self) -> std::result::Result<(), MigratableError> {
2156         self.memory_manager.lock().unwrap().start_memory_dirty_log()
2157     }
2158 
2159     pub fn dirty_memory_range_table(
2160         &self,
2161     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
2162         self.memory_manager
2163             .lock()
2164             .unwrap()
2165             .dirty_memory_range_table()
2166     }
2167 
2168     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2169         self.device_manager.lock().unwrap().device_tree()
2170     }
2171 
2172     pub fn activate_virtio_devices(&self) -> Result<()> {
2173         self.device_manager
2174             .lock()
2175             .unwrap()
2176             .activate_virtio_devices()
2177             .map_err(Error::ActivateVirtioDevices)
2178     }
2179 
2180     #[cfg(target_arch = "x86_64")]
2181     pub fn power_button(&self) -> Result<()> {
2182         #[cfg(feature = "acpi")]
2183         return self
2184             .device_manager
2185             .lock()
2186             .unwrap()
2187             .notify_power_button()
2188             .map_err(Error::PowerButton);
2189         #[cfg(not(feature = "acpi"))]
2190         Err(Error::PowerButtonNotSupported)
2191     }
2192 
2193     #[cfg(target_arch = "aarch64")]
2194     pub fn power_button(&self) -> Result<()> {
2195         self.device_manager
2196             .lock()
2197             .unwrap()
2198             .notify_power_button()
2199             .map_err(Error::PowerButton)
2200     }
2201 }
2202 
2203 impl Pausable for Vm {
2204     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2205         event!("vm", "pausing");
2206         let mut state = self
2207             .state
2208             .try_write()
2209             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2210         let new_state = VmState::Paused;
2211 
2212         state
2213             .valid_transition(new_state)
2214             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2215 
2216         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2217         {
2218             let mut clock = self
2219                 .vm
2220                 .get_clock()
2221                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2222             // Reset clock flags.
2223             clock.flags = 0;
2224             self.saved_clock = Some(clock);
2225         }
2226         self.cpu_manager.lock().unwrap().pause()?;
2227         self.device_manager.lock().unwrap().pause()?;
2228 
2229         *state = new_state;
2230 
2231         event!("vm", "paused");
2232         Ok(())
2233     }
2234 
2235     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2236         event!("vm", "resuming");
2237         let mut state = self
2238             .state
2239             .try_write()
2240             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2241         let new_state = VmState::Running;
2242 
2243         state
2244             .valid_transition(new_state)
2245             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2246 
2247         self.cpu_manager.lock().unwrap().resume()?;
2248         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2249         {
2250             if let Some(clock) = &self.saved_clock {
2251                 self.vm.set_clock(clock).map_err(|e| {
2252                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2253                 })?;
2254             }
2255         }
2256         self.device_manager.lock().unwrap().resume()?;
2257 
2258         // And we're back to the Running state.
2259         *state = new_state;
2260         event!("vm", "resumed");
2261         Ok(())
2262     }
2263 }
2264 
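/// VM-level state embedded in a snapshot: the VM configuration, the saved
/// guest clock (KVM on x86_64 only) and the hypervisor VM state.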
2265 #[derive(Serialize, Deserialize)]
2266 pub struct VmSnapshot {
2267     pub config: Arc<Mutex<VmConfig>>,
2268     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2269     pub clock: Option<hypervisor::ClockData>,
2270     pub state: Option<hypervisor::VmState>,
2271 }
2272 
2273 pub const VM_SNAPSHOT_ID: &str = "vm";
2274 impl Snapshottable for Vm {
2275     fn id(&self) -> String {
2276         VM_SNAPSHOT_ID.to_string()
2277     }
2278 
2279     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2280         event!("vm", "snapshotting");
2281 
2282         #[cfg(feature = "tdx")]
2283         {
2284             if self.config.lock().unwrap().tdx.is_some() {
2285                 return Err(MigratableError::Snapshot(anyhow!(
2286                     "Snapshot not possible with TDX VM"
2287                 )));
2288             }
2289         }
2290 
2291         let current_state = self.get_state().map_err(|e| MigratableError::Snapshot(anyhow!("Could not get VM state: {:#?}", e)))?;
2292         if current_state != VmState::Paused {
2293             return Err(MigratableError::Snapshot(anyhow!(
2294                 "Trying to snapshot while VM is running"
2295             )));
2296         }
2297 
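        // A VM snapshot aggregates the CPU manager, memory manager and
        // device manager snapshots (plus the vGIC on aarch64), together with
        // a data section carrying the serialized VmSnapshot below.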
2298         let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID);
2299         let vm_state = self
2300             .vm
2301             .state()
2302             .map_err(|e| MigratableError::Snapshot(e.into()))?;
2303         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2304             config: self.get_config(),
2305             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2306             clock: self.saved_clock,
2307             state: Some(vm_state),
2308         })
2309         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2310 
2311         vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?);
2312         vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?);
2313 
2314         #[cfg(target_arch = "aarch64")]
2315         self.add_vgic_snapshot_section(&mut vm_snapshot)
2316             .map_err(|e| MigratableError::Snapshot(e.into()))?;
2317 
2318         vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?);
2319         vm_snapshot.add_data_section(SnapshotDataSection {
2320             id: format!("{}-section", VM_SNAPSHOT_ID),
2321             snapshot: vm_snapshot_data,
2322         });
2323 
2324         event!("vm", "snapshotted");
2325         Ok(vm_snapshot)
2326     }
2327 
2328     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
2329         event!("vm", "restoring");
2330 
2331         let current_state = self
2332             .get_state()
2333             .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?;
2334         let new_state = VmState::Paused;
2335         current_state.valid_transition(new_state).map_err(|e| {
2336             MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e))
2337         })?;
2338 
2339         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2340             self.memory_manager
2341                 .lock()
2342                 .unwrap()
2343                 .restore(*memory_manager_snapshot.clone())?;
2344         } else {
2345             return Err(MigratableError::Restore(anyhow!(
2346                 "Missing memory manager snapshot"
2347             )));
2348         }
2349 
2350         if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) {
2351             self.cpu_manager
2352                 .lock()
2353                 .unwrap()
2354                 .restore(*cpu_manager_snapshot.clone())?;
2355         } else {
2356             return Err(MigratableError::Restore(anyhow!(
2357                 "Missing CPU manager snapshot"
2358             )));
2359         }
2360 
2361         if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
2362             self.device_manager
2363                 .lock()
2364                 .unwrap()
2365                 .restore(*device_manager_snapshot.clone())?;
2366         } else {
2367             return Err(MigratableError::Restore(anyhow!(
2368                 "Missing device manager snapshot"
2369             )));
2370         }
2371 
2372         #[cfg(target_arch = "aarch64")]
2373         self.restore_vgic_and_enable_interrupt(&snapshot)?;
2374 
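        // Second pass: restore the devices themselves, now that the
        // interrupt controller (vGIC on aarch64) is back in place.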
2375         if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
2376             self.device_manager
2377                 .lock()
2378                 .unwrap()
2379                 .restore_devices(*device_manager_snapshot.clone())?;
2380         } else {
2381             return Err(MigratableError::Restore(anyhow!(
2382                 "Missing device manager snapshot"
2383             )));
2384         }
2385 
2386         // Now we can start all vCPUs from here.
2387         self.cpu_manager
2388             .lock()
2389             .unwrap()
2390             .start_restored_vcpus()
2391             .map_err(|e| {
2392                 MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e))
2393             })?;
2394 
2395         if self
2396             .device_manager
2397             .lock()
2398             .unwrap()
2399             .console()
2400             .input_enabled()
2401         {
2402             let console = self.device_manager.lock().unwrap().console().clone();
2403             let signals = Signals::new(&[SIGWINCH, SIGINT, SIGTERM]);
2404             match signals {
2405                 Ok(signals) => {
2406                     self.signals = Some(signals.handle());
2407 
2408                     let on_tty = self.on_tty;
2409                     let signal_handler_seccomp_filter =
2410                         get_seccomp_filter(&self.seccomp_action, Thread::SignalHandler).map_err(
2411                             |e| {
2412                                 MigratableError::Restore(anyhow!(
2413                                     "Could not create seccomp filter: {:#?}",
2414                                     Error::CreateSeccompFilter(e)
2415                                 ))
2416                             },
2417                         )?;
2418                     let exit_evt = self.exit_evt.try_clone().map_err(|e| {
2419                         MigratableError::Restore(anyhow!("Could not clone exit event fd: {:?}", e))
2420                     })?;
2421 
2422                     self.threads.push(
2423                         thread::Builder::new()
2424                             .name("signal_handler".to_string())
2425                             .spawn(move || {
2426                                 if let Err(e) = SeccompFilter::apply(signal_handler_seccomp_filter)
2427                                     .map_err(Error::ApplySeccompFilter)
2428                                 {
2429                                     error!("Error applying seccomp filter: {:?}", e);
2430                                     return;
2431                                 }
2432 
2433                                 Vm::os_signal_handler(signals, console, on_tty, exit_evt)
2434                             })
2435                             .map_err(|e| {
2436                                 MigratableError::Restore(anyhow!(
2437                                     "Could not start console signal thread: {:#?}",
2438                                     e
2439                                 ))
2440                             })?,
2441                     );
2442                 }
2443                 Err(e) => error!("Unable to register signal handlers: {}", e),
2444             }
2445 
2446             if self.on_tty {
2447                 io::stdin().lock().set_raw_mode().map_err(|e| {
2448                     MigratableError::Restore(anyhow!(
2449                         "Could not set terminal in raw mode: {:#?}",
2450                         e
2451                     ))
2452                 })?;
2453             }
2454         }
2455 
2456         let mut state = self
2457             .state
2458             .try_write()
2459             .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?;
2460         *state = new_state;
2461 
2462         event!("vm", "restored");
2463         Ok(())
2464     }
2465 }
2466 
2467 impl Transportable for Vm {
2468     fn send(
2469         &self,
2470         snapshot: &Snapshot,
2471         destination_url: &str,
2472     ) -> std::result::Result<(), MigratableError> {
2473         let mut vm_snapshot_path = url_to_path(destination_url)?;
2474         vm_snapshot_path.push(VM_SNAPSHOT_FILE);
2475 
2476         // Create the snapshot file
2477         let mut vm_snapshot_file = OpenOptions::new()
2478             .read(true)
2479             .write(true)
2480             .create_new(true)
2481             .open(vm_snapshot_path)
2482             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2483 
2484         // Serialize and write the snapshot
2485         let vm_snapshot =
2486             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2487 
2488         vm_snapshot_file
2489             .write_all(&vm_snapshot)
2490             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2491 
2492         // Tell the memory manager to also send/write its own snapshot.
2493         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2494             self.memory_manager
2495                 .lock()
2496                 .unwrap()
2497                 .send(&*memory_manager_snapshot.clone(), destination_url)?;
2498         } else {
2499             return Err(MigratableError::MigrateSend(anyhow!(
2500                 "Missing memory manager snapshot"
2501             )));
2502         }
2503 
2504         Ok(())
2505     }
2506 }
2507 impl Migratable for Vm {}
2508 
2509 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2510 #[cfg(test)]
2511 mod tests {
2512     use super::*;
2513 
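    // Exhaustively checks the transitions allowed out of a given state:
    // Created -> {Running, Paused}, Running -> {Shutdown, Paused},
    // Shutdown -> Running and Paused -> {Running, Shutdown}; everything
    // else must be rejected.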
2514     fn test_vm_state_transitions(state: VmState) {
2515         match state {
2516             VmState::Created => {
2517                 // Check the transitions from Created
2518                 assert!(state.valid_transition(VmState::Created).is_err());
2519                 assert!(state.valid_transition(VmState::Running).is_ok());
2520                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2521                 assert!(state.valid_transition(VmState::Paused).is_ok());
2522             }
2523             VmState::Running => {
2524                 // Check the transitions from Running
2525                 assert!(state.valid_transition(VmState::Created).is_err());
2526                 assert!(state.valid_transition(VmState::Running).is_err());
2527                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2528                 assert!(state.valid_transition(VmState::Paused).is_ok());
2529             }
2530             VmState::Shutdown => {
2531                 // Check the transitions from Shutdown
2532                 assert!(state.valid_transition(VmState::Created).is_err());
2533                 assert!(state.valid_transition(VmState::Running).is_ok());
2534                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2535                 assert!(state.valid_transition(VmState::Paused).is_err());
2536             }
2537             VmState::Paused => {
2538                 // Check the transitions from Paused
2539                 assert!(state.valid_transition(VmState::Created).is_err());
2540                 assert!(state.valid_transition(VmState::Running).is_ok());
2541                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2542                 assert!(state.valid_transition(VmState::Paused).is_err());
2543             }
2544         }
2545     }
2546 
2547     #[test]
2548     fn test_vm_created_transitions() {
2549         test_vm_state_transitions(VmState::Created);
2550     }
2551 
2552     #[test]
2553     fn test_vm_running_transitions() {
2554         test_vm_state_transitions(VmState::Running);
2555     }
2556 
2557     #[test]
2558     fn test_vm_shutdown_transitions() {
2559         test_vm_state_transitions(VmState::Shutdown);
2560     }
2561 
2562     #[test]
2563     fn test_vm_paused_transitions() {
2564         test_vm_state_transitions(VmState::Paused);
2565     }
2566 }
2567 
2568 #[cfg(target_arch = "aarch64")]
2569 #[cfg(test)]
2570 mod tests {
2571     use super::*;
2572     use crate::GuestMemoryMmap;
2573     use arch::aarch64::fdt::create_fdt;
2574     use arch::aarch64::gic::kvm::create_gic;
2575     use arch::aarch64::layout;
2576     use arch::{DeviceType, MmioDeviceInfo};
2577     use vm_memory::GuestAddress;
2578 
2579     const LEN: u64 = 4096;
2580 
2581     #[test]
2582     fn test_create_fdt_with_devices() {
2583         let regions = vec![(
2584             GuestAddress(layout::RAM_64BIT_START),
2585             (layout::FDT_MAX_SIZE + 0x1000) as usize,
2586         )];
2587         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
2588 
2589         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
2590             (
2591                 (DeviceType::Serial, DeviceType::Serial.to_string()),
2592                 MmioDeviceInfo {
2593                     addr: 0x00,
2594                     irq: 33,
2595                 },
2596             ),
2597             (
2598                 (DeviceType::Virtio(1), "virtio".to_string()),
2599                 MmioDeviceInfo { addr: LEN, irq: 34 },
2600             ),
2601             (
2602                 (DeviceType::Rtc, "rtc".to_string()),
2603                 MmioDeviceInfo {
2604                     addr: 2 * LEN,
2605                     irq: 35,
2606                 },
2607             ),
2608         ]
2609         .iter()
2610         .cloned()
2611         .collect();
2612 
2613         let hv = hypervisor::new().unwrap();
2614         let vm = hv.create_vm().unwrap();
2615         let gic = create_gic(&vm, 1).unwrap();
2616         assert!(create_fdt(
2617             &mem,
2618             &CString::new("console=tty0").unwrap(),
2619             vec![0],
2620             &dev_info,
2621             &*gic,
2622             &None,
2623             &(0x1_0000_0000, 0x1_0000),
2624         )
2625         .is_ok())
2626     }
2627 }
2628 
2629 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2630 #[test]
2631 pub fn test_vm() {
2632     use hypervisor::VmExit;
2633     use vm_memory::{GuestMemory, GuestMemoryRegion};
2634     // This example is based on https://lwn.net/Articles/658511/
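    // The guest adds %rax (2) and %rbx (3), converts the sum to ASCII ('5')
    // and writes it, followed by a newline, to I/O port 0x3f8 before halting.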
2635     let code = [
2636         0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
2637         0x00, 0xd8, /* add %bl, %al */
2638         0x04, b'0', /* add $'0', %al */
2639         0xee, /* out %al, (%dx) */
2640         0xb0, b'\n', /* mov $'\n', %al */
2641         0xee,  /* out %al, (%dx) */
2642         0xf4,  /* hlt */
2643     ];
2644 
2645     let mem_size = 0x1000;
2646     let load_addr = GuestAddress(0x1000);
2647     let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();
2648 
2649     let hv = hypervisor::new().unwrap();
2650     let vm = hv.create_vm().expect("new VM creation failed");
2651 
2652     for (index, region) in mem.iter().enumerate() {
2653         let mem_region = vm.make_user_memory_region(
2654             index as u32,
2655             region.start_addr().raw_value(),
2656             region.len() as u64,
2657             region.as_ptr() as u64,
2658             false,
2659             false,
2660         );
2661 
2662         vm.create_user_memory_region(mem_region)
2663             .expect("Cannot configure guest memory");
2664     }
2665     mem.write_slice(&code, load_addr)
2666         .expect("Writing code to memory failed");
2667 
2668     let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");
2669 
2670     let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
2671     vcpu_sregs.cs.base = 0;
2672     vcpu_sregs.cs.selector = 0;
2673     vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");
2674 
2675     let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
2676     vcpu_regs.rip = 0x1000;
2677     vcpu_regs.rax = 2;
2678     vcpu_regs.rbx = 3;
2679     vcpu_regs.rflags = 2;
2680     vcpu.set_regs(&vcpu_regs).expect("set regs failed");
2681 
2682     loop {
2683         match vcpu.run().expect("run failed") {
2684             VmExit::IoOut(addr, data) => {
2685                 println!(
2686                     "IO out -- addr: {:#x} data [{:?}]",
2687                     addr,
2688                     str::from_utf8(data).unwrap()
2689                 );
2690             }
2691             VmExit::Reset => {
2692                 println!("HLT");
2693                 break;
2694             }
2695             r => panic!("unexpected exit reason: {:?}", r),
2696         }
2697     }
2698 }
2699