xref: /cloud-hypervisor/vmm/src/vm.rs (revision 7d7bfb2034001d4cb15df2ddc56d2d350c8da30f)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 //
5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
6 // Use of this source code is governed by a BSD-style license that can be
7 // found in the LICENSE-BSD-3-Clause file.
8 //
9 // Copyright © 2019 Intel Corporation
10 //
11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
12 //
13 
14 use crate::config::NumaConfig;
15 use crate::config::{
16     add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
17     UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
18 };
19 use crate::cpu;
20 use crate::device_manager::{self, Console, DeviceManager, DeviceManagerError, PtyPair};
21 use crate::device_tree::DeviceTree;
22 #[cfg(feature = "gdb")]
23 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
24 use crate::memory_manager::{
25     Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
26 };
27 use crate::migration::{get_vm_snapshot, url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
28 use crate::seccomp_filters::{get_seccomp_filter, Thread};
29 use crate::GuestMemoryMmap;
30 use crate::{
31     PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
32 };
33 use anyhow::anyhow;
34 use arch::get_host_cpu_phys_bits;
35 #[cfg(target_arch = "x86_64")]
36 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
37 #[cfg(feature = "tdx")]
38 use arch::x86_64::tdx::TdvfSection;
39 use arch::EntryPoint;
40 #[cfg(target_arch = "aarch64")]
41 use arch::PciSpaceInfo;
42 use arch::{NumaNode, NumaNodes};
43 use devices::AcpiNotificationFlags;
44 #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
45 use gdbstub_arch::x86::reg::X86_64CoreRegs;
46 use hypervisor::vm::{HypervisorVmError, VmmOps};
47 use linux_loader::cmdline::Cmdline;
48 #[cfg(target_arch = "x86_64")]
49 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
50 #[cfg(target_arch = "aarch64")]
51 use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
52 use linux_loader::loader::KernelLoader;
53 use seccompiler::{apply_filter, SeccompAction};
54 use signal_hook::{
55     consts::{SIGINT, SIGTERM, SIGWINCH},
56     iterator::backend::Handle,
57     iterator::Signals,
58 };
59 use std::cmp;
60 use std::collections::BTreeMap;
61 use std::collections::HashMap;
62 use std::convert::TryInto;
63 #[cfg(target_arch = "x86_64")]
64 use std::fmt;
65 use std::fs::{File, OpenOptions};
66 use std::io::{self, Read, Write};
67 use std::io::{Seek, SeekFrom};
68 #[cfg(feature = "tdx")]
69 use std::mem;
70 use std::num::Wrapping;
71 use std::ops::Deref;
72 use std::os::unix::net::UnixStream;
73 use std::panic::AssertUnwindSafe;
74 use std::sync::{Arc, Mutex, RwLock};
75 use std::{result, str, thread};
76 use vm_device::Bus;
77 #[cfg(target_arch = "x86_64")]
78 use vm_device::BusDevice;
79 #[cfg(target_arch = "x86_64")]
80 use vm_memory::Address;
81 #[cfg(feature = "tdx")]
82 use vm_memory::{ByteValued, GuestMemory, GuestMemoryRegion};
83 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic};
84 use vm_migration::protocol::{Request, Response, Status};
85 use vm_migration::{
86     protocol::MemoryRangeTable, Migratable, MigratableError, Pausable, Snapshot,
87     SnapshotDataSection, Snapshottable, Transportable,
88 };
89 use vmm_sys_util::eventfd::EventFd;
90 use vmm_sys_util::signal::unblock_signal;
91 use vmm_sys_util::sock_ctrl_msg::ScmSocket;
92 use vmm_sys_util::terminal::Terminal;
93 
94 #[cfg(target_arch = "aarch64")]
95 use arch::aarch64::gic::gicv3_its::kvm::{KvmGicV3Its, GIC_V3_ITS_SNAPSHOT_ID};
96 #[cfg(target_arch = "aarch64")]
97 use arch::aarch64::gic::kvm::create_gic;
98 #[cfg(target_arch = "aarch64")]
99 use devices::interrupt_controller::{self, InterruptController};
100 
101 /// Errors associated with VM management
102 #[derive(Debug)]
103 pub enum Error {
104     /// Cannot open the kernel image
105     KernelFile(io::Error),
106 
107     /// Cannot open the initramfs image
108     InitramfsFile(io::Error),
109 
110     /// Cannot load the kernel in memory
111     KernelLoad(linux_loader::loader::Error),
112 
113     #[cfg(target_arch = "aarch64")]
114     /// Cannot load the UEFI binary in memory
115     UefiLoad(arch::aarch64::uefi::Error),
116 
117     /// Cannot load the initramfs in memory
118     InitramfsLoad,
119 
120     /// Cannot load the command line in memory
121     LoadCmdLine(linux_loader::loader::Error),
122 
123     /// Cannot modify the command line
124     CmdLineInsertStr(linux_loader::cmdline::Error),
125 
126     /// Cannot configure system
127     ConfigureSystem(arch::Error),
128 
129     /// Cannot enable interrupt controller
130     #[cfg(target_arch = "aarch64")]
131     EnableInterruptController(interrupt_controller::Error),
132 
133     PoisonedState,
134 
135     /// Cannot create a device manager.
136     DeviceManager(DeviceManagerError),
137 
138     /// Write to the console failed.
139     Console(vmm_sys_util::errno::Error),
140 
141     /// Write to the pty console failed.
142     PtyConsole(io::Error),
143 
144     /// Cannot setup terminal in raw mode.
145     SetTerminalRaw(vmm_sys_util::errno::Error),
146 
147     /// Cannot setup terminal in canonical mode.
148     SetTerminalCanon(vmm_sys_util::errno::Error),
149 
150     /// Memory overflow
151     MemOverflow,
152 
153     /// Cannot spawn a signal handler thread
154     SignalHandlerSpawn(io::Error),
155 
156     /// Failed to join on vCPU threads
157     ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),
158 
159     /// VM config is missing.
160     VmMissingConfig,
161 
162     /// VM is not created
163     VmNotCreated,
164 
165     /// VM is already created
166     VmAlreadyCreated,
167 
168     /// VM is not running
169     VmNotRunning,
170 
171     /// Cannot clone EventFd.
172     EventFdClone(io::Error),
173 
174     /// Invalid VM state transition
175     InvalidStateTransition(VmState, VmState),
176 
177     /// Error from CPU handling
178     CpuManager(cpu::Error),
179 
180     /// Cannot pause devices
181     PauseDevices(MigratableError),
182 
183     /// Cannot resume devices
184     ResumeDevices(MigratableError),
185 
186     /// Cannot pause CPUs
187     PauseCpus(MigratableError),
188 
189     /// Cannot resume CPUs
190     ResumeCpus(MigratableError),
191 
192     /// Cannot pause VM
193     Pause(MigratableError),
194 
195     /// Cannot resume VM
196     Resume(MigratableError),
197 
198     /// Memory manager error
199     MemoryManager(MemoryManagerError),
200 
201     /// Eventfd write error
202     EventfdError(std::io::Error),
203 
204     /// Cannot snapshot VM
205     Snapshot(MigratableError),
206 
207     /// Cannot restore VM
208     Restore(MigratableError),
209 
210     /// Cannot send VM snapshot
211     SnapshotSend(MigratableError),
212 
213     /// Cannot convert source URL from Path into &str
214     RestoreSourceUrlPathToStr,
215 
216     /// Failed to validate config
217     ConfigValidation(ValidationError),
218 
219     /// No more than one virtio-vsock device is allowed
220     TooManyVsockDevices,
221 
222     /// Failed serializing into JSON
223     SerializeJson(serde_json::Error),
224 
225     /// Invalid configuration for NUMA.
226     InvalidNumaConfig,
227 
228     /// Cannot create seccomp filter
229     CreateSeccompFilter(seccompiler::Error),
230 
231     /// Cannot apply seccomp filter
232     ApplySeccompFilter(seccompiler::Error),
233 
234     /// Failed resizing a memory zone.
235     ResizeZone,
236 
237     /// Cannot activate virtio devices
238     ActivateVirtioDevices(device_manager::DeviceManagerError),
239 
240     /// Error triggering power button
241     PowerButton(device_manager::DeviceManagerError),
242 
243     /// Kernel lacks PVH header
244     KernelMissingPvhHeader,
245 
246     /// Failed to allocate firmware RAM
247     AllocateFirmwareMemory(MemoryManagerError),
248 
249     /// Error manipulating firmware file
250     FirmwareFile(std::io::Error),
251 
252     /// Firmware too big
253     FirmwareTooLarge,
254 
255     /// Failed to copy the firmware into guest memory
256     FirmwareLoad(vm_memory::GuestMemoryError),
257 
258     /// Error performing I/O on TDX firmware file
259     #[cfg(feature = "tdx")]
260     LoadTdvf(std::io::Error),
261 
262     /// Error performing I/O on the payload file
263     #[cfg(feature = "tdx")]
264     LoadPayload(std::io::Error),
265 
266     /// Error parsing TDVF
267     #[cfg(feature = "tdx")]
268     ParseTdvf(arch::x86_64::tdx::TdvfError),
269 
270     /// Error populating HOB
271     #[cfg(feature = "tdx")]
272     PopulateHob(arch::x86_64::tdx::TdvfError),
273 
274     /// Error allocating TDVF memory
275     #[cfg(feature = "tdx")]
276     AllocatingTdvfMemory(crate::memory_manager::Error),
277 
278     /// Error enabling TDX VM
279     #[cfg(feature = "tdx")]
280     InitializeTdxVm(hypervisor::HypervisorVmError),
281 
282     /// Error enabling TDX memory region
283     #[cfg(feature = "tdx")]
284     InitializeTdxMemoryRegion(hypervisor::HypervisorVmError),
285 
286     /// Error finalizing TDX setup
287     #[cfg(feature = "tdx")]
288     FinalizeTdx(hypervisor::HypervisorVmError),
289 
290     /// Invalid payload type
291     #[cfg(feature = "tdx")]
292     InvalidPayloadType,
293 
294     /// Error debugging VM
295     #[cfg(feature = "gdb")]
296     Debug(DebuggableError),
297 }
298 pub type Result<T> = result::Result<T, Error>;
299 
300 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq)]
301 pub enum VmState {
302     Created,
303     Running,
304     Shutdown,
305     Paused,
306     BreakPoint,
307 }
308 
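// Allowed state transitions, as enforced by valid_transition() below:
//   Created    -> Running | Paused | BreakPoint
//   Running    -> Paused | Shutdown | BreakPoint
//   Shutdown   -> Running
//   Paused     -> Running | Shutdown
//   BreakPoint -> Created | Running
// Any other transition is rejected with Error::InvalidStateTransition.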
309 impl VmState {
310     fn valid_transition(self, new_state: VmState) -> Result<()> {
311         match self {
312             VmState::Created => match new_state {
313                 VmState::Created | VmState::Shutdown => {
314                     Err(Error::InvalidStateTransition(self, new_state))
315                 }
316                 VmState::Running | VmState::Paused | VmState::BreakPoint => Ok(()),
317             },
318 
319             VmState::Running => match new_state {
320                 VmState::Created | VmState::Running => {
321                     Err(Error::InvalidStateTransition(self, new_state))
322                 }
323                 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
324             },
325 
326             VmState::Shutdown => match new_state {
327                 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
328                     Err(Error::InvalidStateTransition(self, new_state))
329                 }
330                 VmState::Running => Ok(()),
331             },
332 
333             VmState::Paused => match new_state {
334                 VmState::Created | VmState::Paused | VmState::BreakPoint => {
335                     Err(Error::InvalidStateTransition(self, new_state))
336                 }
337                 VmState::Running | VmState::Shutdown => Ok(()),
338             },
339             VmState::BreakPoint => match new_state {
340                 VmState::Created | VmState::Running => Ok(()),
341                 _ => Err(Error::InvalidStateTransition(self, new_state)),
342             },
343         }
344     }
345 }
346 
347 // Debug I/O port
348 #[cfg(target_arch = "x86_64")]
349 const DEBUG_IOPORT: u16 = 0x80;
350 #[cfg(target_arch = "x86_64")]
351 const DEBUG_IOPORT_PREFIX: &str = "Debug I/O port";
352 
353 #[cfg(target_arch = "x86_64")]
354 /// Debug I/O port, see:
355 /// https://www.intel.com/content/www/us/en/support/articles/000005500/boards-and-kits.html
356 ///
357 /// Since we're not a physical platform, we can freely assign code ranges for
358 /// debugging specific parts of our virtual platform.
359 pub enum DebugIoPortRange {
360     Firmware,
361     Bootloader,
362     Kernel,
363     Userspace,
364     Custom,
365 }
366 #[cfg(target_arch = "x86_64")]
367 impl DebugIoPortRange {
368     fn from_u8(value: u8) -> DebugIoPortRange {
369         match value {
370             0x00..=0x1f => DebugIoPortRange::Firmware,
371             0x20..=0x3f => DebugIoPortRange::Bootloader,
372             0x40..=0x5f => DebugIoPortRange::Kernel,
373             0x60..=0x7f => DebugIoPortRange::Userspace,
374             _ => DebugIoPortRange::Custom,
375         }
376     }
377 }
378 
379 #[cfg(target_arch = "x86_64")]
380 impl fmt::Display for DebugIoPortRange {
381     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
382         match self {
383             DebugIoPortRange::Firmware => write!(f, "{}: Firmware", DEBUG_IOPORT_PREFIX),
384             DebugIoPortRange::Bootloader => write!(f, "{}: Bootloader", DEBUG_IOPORT_PREFIX),
385             DebugIoPortRange::Kernel => write!(f, "{}: Kernel", DEBUG_IOPORT_PREFIX),
386             DebugIoPortRange::Userspace => write!(f, "{}: Userspace", DEBUG_IOPORT_PREFIX),
387             DebugIoPortRange::Custom => write!(f, "{}: Custom", DEBUG_IOPORT_PREFIX),
388         }
389     }
390 }
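// Example (values illustrative): firmware writing 0x21 to port 0x80 early in boot
// is logged by VmOps::log_debug_ioport() as "[Debug I/O port: Bootloader code 0x21]
// <secs>.<usecs> seconds", since 0x21 falls in the 0x20..=0x3f range. The code
// values a given firmware emits are its own convention; only the ranges above are
// defined by this VMM.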
391 
392 struct VmOps {
393     memory: GuestMemoryAtomic<GuestMemoryMmap>,
394     #[cfg(target_arch = "x86_64")]
395     io_bus: Arc<Bus>,
396     mmio_bus: Arc<Bus>,
397     #[cfg(target_arch = "x86_64")]
398     timestamp: std::time::Instant,
399     #[cfg(target_arch = "x86_64")]
400     pci_config_io: Arc<Mutex<dyn BusDevice>>,
401 }
402 
403 impl VmOps {
404     #[cfg(target_arch = "x86_64")]
405     // Log debug I/O port codes written by the guest.
406     fn log_debug_ioport(&self, code: u8) {
407         let elapsed = self.timestamp.elapsed();
408 
409         info!(
410             "[{} code 0x{:x}] {}.{:>06} seconds",
411             DebugIoPortRange::from_u8(code),
412             code,
413             elapsed.as_secs(),
414             elapsed.subsec_micros()
415         );
416     }
417 }
418 
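// VmmOps is the hypervisor's callback interface for guest exits. MMIO accesses go
// straight to the mmio_bus. On x86_64, port I/O writes are dispatched in order:
// the debug port (0x80) is logged, the PCI configuration window
// (PCI_CONFIG_IO_PORT..+PCI_CONFIG_IO_PORT_SIZE) is routed to the PCI config
// device, and everything else goes to the io_bus. Accesses to unregistered
// addresses only produce a warning rather than faulting the guest.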
419 impl VmmOps for VmOps {
420     fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> hypervisor::vm::Result<usize> {
421         self.memory
422             .memory()
423             .write(buf, GuestAddress(gpa))
424             .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
425     }
426 
427     fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> hypervisor::vm::Result<usize> {
428         self.memory
429             .memory()
430             .read(buf, GuestAddress(gpa))
431             .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
432     }
433 
434     fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
435         if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
436             warn!("Guest MMIO read to unregistered address 0x{:x}", gpa);
437         }
438         Ok(())
439     }
440 
441     fn mmio_write(&self, gpa: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
442         match self.mmio_bus.write(gpa, data) {
443             Err(vm_device::BusError::MissingAddressRange) => {
444                 warn!("Guest MMIO write to unregistered address 0x{:x}", gpa);
445             }
446             Ok(Some(barrier)) => {
447                 info!("Waiting for barrier");
448                 barrier.wait();
449                 info!("Barrier released");
450             }
451             _ => {}
452         };
453         Ok(())
454     }
455 
456     #[cfg(target_arch = "x86_64")]
457     fn pio_read(&self, port: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
458         use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE};
459 
460         if (PCI_CONFIG_IO_PORT..(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE)).contains(&port) {
461             self.pci_config_io.lock().unwrap().read(
462                 PCI_CONFIG_IO_PORT,
463                 port - PCI_CONFIG_IO_PORT,
464                 data,
465             );
466             return Ok(());
467         }
468 
469         if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
470             warn!("Guest PIO read to unregistered address 0x{:x}", port);
471         }
472         Ok(())
473     }
474 
475     #[cfg(target_arch = "x86_64")]
476     fn pio_write(&self, port: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
477         use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE};
478 
479         if port == DEBUG_IOPORT as u64 && data.len() == 1 {
480             self.log_debug_ioport(data[0]);
481             return Ok(());
482         }
483 
484         if (PCI_CONFIG_IO_PORT..(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE)).contains(&port) {
485             self.pci_config_io.lock().unwrap().write(
486                 PCI_CONFIG_IO_PORT,
487                 port - PCI_CONFIG_IO_PORT,
488                 data,
489             );
490             return Ok(());
491         }
492 
493         match self.io_bus.write(port, data) {
494             Err(vm_device::BusError::MissingAddressRange) => {
495                 warn!("Guest PIO write to unregistered address 0x{:x}", port);
496             }
497             Ok(Some(barrier)) => {
498                 info!("Waiting for barrier");
499                 barrier.wait();
500                 info!("Barrier released");
501             }
502             _ => {}
503         };
504         Ok(())
505     }
506 }
507 
508 pub fn physical_bits(max_phys_bits: u8) -> u8 {
509     let host_phys_bits = get_host_cpu_phys_bits();
510 
511     cmp::min(host_phys_bits, max_phys_bits)
512 }
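// For example, on a host reporting 46 physical address bits, physical_bits(48)
// returns 46 while physical_bits(40) returns 40: the guest is never given more
// physical address bits than the host supports.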
513 
514 pub const HANDLED_SIGNALS: [i32; 3] = [SIGWINCH, SIGTERM, SIGINT];
515 
516 pub struct Vm {
517     kernel: Option<File>,
518     initramfs: Option<File>,
519     threads: Vec<thread::JoinHandle<()>>,
520     device_manager: Arc<Mutex<DeviceManager>>,
521     config: Arc<Mutex<VmConfig>>,
522     on_tty: bool,
523     signals: Option<Handle>,
524     state: RwLock<VmState>,
525     cpu_manager: Arc<Mutex<cpu::CpuManager>>,
526     memory_manager: Arc<Mutex<MemoryManager>>,
527     #[cfg_attr(not(feature = "kvm"), allow(dead_code))]
528     // The hypervisor abstracted virtual machine.
529     vm: Arc<dyn hypervisor::Vm>,
530     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
531     saved_clock: Option<hypervisor::ClockData>,
532 
533     numa_nodes: NumaNodes,
534     seccomp_action: SeccompAction,
535     exit_evt: EventFd,
536     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
537     hypervisor: Arc<dyn hypervisor::Hypervisor>,
538     stop_on_boot: bool,
539 }
540 
541 impl Vm {
542     #[allow(clippy::too_many_arguments)]
543     fn new_from_memory_manager(
544         config: Arc<Mutex<VmConfig>>,
545         memory_manager: Arc<Mutex<MemoryManager>>,
546         vm: Arc<dyn hypervisor::Vm>,
547         exit_evt: EventFd,
548         reset_evt: EventFd,
549         #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
550         seccomp_action: &SeccompAction,
551         hypervisor: Arc<dyn hypervisor::Hypervisor>,
552         activate_evt: EventFd,
553         restoring: bool,
554     ) -> Result<Self> {
555         config
556             .lock()
557             .unwrap()
558             .validate()
559             .map_err(Error::ConfigValidation)?;
560 
561         info!("Booting VM from config: {:?}", &config);
562 
563         // Create NUMA nodes based on NumaConfig.
564         let numa_nodes =
565             Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;
566 
567         #[cfg(feature = "tdx")]
568         let force_iommu = config.lock().unwrap().tdx.is_some();
569         #[cfg(not(feature = "tdx"))]
570         let force_iommu = false;
571 
572         #[cfg(feature = "gdb")]
573         let stop_on_boot = config.lock().unwrap().gdb;
574         #[cfg(not(feature = "gdb"))]
575         let stop_on_boot = false;
576 
577         let device_manager = DeviceManager::new(
578             vm.clone(),
579             config.clone(),
580             memory_manager.clone(),
581             &exit_evt,
582             &reset_evt,
583             seccomp_action.clone(),
584             numa_nodes.clone(),
585             &activate_evt,
586             force_iommu,
587             restoring,
588         )
589         .map_err(Error::DeviceManager)?;
590 
591         let memory = memory_manager.lock().unwrap().guest_memory();
592         #[cfg(target_arch = "x86_64")]
593         let io_bus = Arc::clone(device_manager.lock().unwrap().io_bus());
594         let mmio_bus = Arc::clone(device_manager.lock().unwrap().mmio_bus());
595         // Create the VmOps structure, which implements the VmmOps trait,
596         // and hand it over to the hypervisor.
597 
598         #[cfg(target_arch = "x86_64")]
599         let pci_config_io =
600             device_manager.lock().unwrap().pci_config_io() as Arc<Mutex<dyn BusDevice>>;
601         let vm_ops: Arc<dyn VmmOps> = Arc::new(VmOps {
602             memory,
603             #[cfg(target_arch = "x86_64")]
604             io_bus,
605             mmio_bus,
606             #[cfg(target_arch = "x86_64")]
607             timestamp: std::time::Instant::now(),
608             #[cfg(target_arch = "x86_64")]
609             pci_config_io,
610         });
611 
612         let exit_evt_clone = exit_evt.try_clone().map_err(Error::EventFdClone)?;
613         #[cfg(feature = "tdx")]
614         let tdx_enabled = config.lock().unwrap().tdx.is_some();
615         let cpus_config = { &config.lock().unwrap().cpus.clone() };
616         let cpu_manager = cpu::CpuManager::new(
617             cpus_config,
618             &device_manager,
619             &memory_manager,
620             vm.clone(),
621             exit_evt_clone,
622             reset_evt,
623             #[cfg(feature = "gdb")]
624             vm_debug_evt,
625             hypervisor.clone(),
626             seccomp_action.clone(),
627             vm_ops,
628             #[cfg(feature = "tdx")]
629             tdx_enabled,
630             &numa_nodes,
631         )
632         .map_err(Error::CpuManager)?;
633 
634         let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO as i32) } != 0;
635         let kernel = config
636             .lock()
637             .unwrap()
638             .kernel
639             .as_ref()
640             .map(|k| File::open(&k.path))
641             .transpose()
642             .map_err(Error::KernelFile)?;
643 
644         let initramfs = config
645             .lock()
646             .unwrap()
647             .initramfs
648             .as_ref()
649             .map(|i| File::open(&i.path))
650             .transpose()
651             .map_err(Error::InitramfsFile)?;
652 
653         Ok(Vm {
654             kernel,
655             initramfs,
656             device_manager,
657             config,
658             on_tty,
659             threads: Vec::with_capacity(1),
660             signals: None,
661             state: RwLock::new(VmState::Created),
662             cpu_manager,
663             memory_manager,
664             vm,
665             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
666             saved_clock: None,
667 
668             numa_nodes,
669             seccomp_action: seccomp_action.clone(),
670             exit_evt,
671             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
672             hypervisor,
673             stop_on_boot,
674         })
675     }
676 
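    // Builds the NumaNodes map from the optional NumaConfig list. Illustrative
    // example (names/values hypothetical): two configs with guest_numa_id 0 and 1,
    // each referencing one memory zone ("mem0"/"mem1") and with a distance of 20
    // from node 0 to node 1, yield two NumaNode entries keyed by guest_numa_id.
    // An unknown memory zone, a duplicated guest_numa_id, or a distance whose
    // destination is not a defined node is rejected with Error::InvalidNumaConfig.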
677     fn create_numa_nodes(
678         configs: Option<Vec<NumaConfig>>,
679         memory_manager: &Arc<Mutex<MemoryManager>>,
680     ) -> Result<NumaNodes> {
681         let mm = memory_manager.lock().unwrap();
682         let mm_zones = mm.memory_zones();
683         let mut numa_nodes = BTreeMap::new();
684 
685         if let Some(configs) = &configs {
686             for config in configs.iter() {
687                 if numa_nodes.contains_key(&config.guest_numa_id) {
688                     error!("Cannot define the same NUMA node twice");
689                     return Err(Error::InvalidNumaConfig);
690                 }
691 
692                 let mut node = NumaNode::default();
693 
694                 if let Some(memory_zones) = &config.memory_zones {
695                     for memory_zone in memory_zones.iter() {
696                         if let Some(mm_zone) = mm_zones.get(memory_zone) {
697                             node.memory_regions.extend(mm_zone.regions().clone());
698                             if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
699                                 node.hotplug_regions.push(virtiomem_zone.region().clone());
700                             }
701                             node.memory_zones.push(memory_zone.clone());
702                         } else {
703                             error!("Unknown memory zone '{}'", memory_zone);
704                             return Err(Error::InvalidNumaConfig);
705                         }
706                     }
707                 }
708 
709                 if let Some(cpus) = &config.cpus {
710                     node.cpus.extend(cpus);
711                 }
712 
713                 if let Some(distances) = &config.distances {
714                     for distance in distances.iter() {
715                         let dest = distance.destination;
716                         let dist = distance.distance;
717 
718                         if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
719                             error!("Unknown destination NUMA node {}", dest);
720                             return Err(Error::InvalidNumaConfig);
721                         }
722 
723                         if node.distances.contains_key(&dest) {
724                             error!("Destination NUMA node {} has already been set", dest);
725                             return Err(Error::InvalidNumaConfig);
726                         }
727 
728                         node.distances.insert(dest, dist);
729                     }
730                 }
731 
732                 #[cfg(target_arch = "x86_64")]
733                 if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
734                     if let Some(sgx_epc_region) = mm.sgx_epc_region() {
735                         let mm_sections = sgx_epc_region.epc_sections();
736                         for sgx_epc_section in sgx_epc_sections.iter() {
737                             if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
738                                 node.sgx_epc_sections.push(mm_section.clone());
739                             } else {
740                                 error!("Unknown SGX EPC section '{}'", sgx_epc_section);
741                                 return Err(Error::InvalidNumaConfig);
742                             }
743                         }
744                     } else {
745                         error!("Missing SGX EPC region");
746                         return Err(Error::InvalidNumaConfig);
747                     }
748                 }
749 
750                 numa_nodes.insert(config.guest_numa_id, node);
751             }
752         }
753 
754         Ok(numa_nodes)
755     }
756 
757     #[allow(clippy::too_many_arguments)]
758     pub fn new(
759         config: Arc<Mutex<VmConfig>>,
760         exit_evt: EventFd,
761         reset_evt: EventFd,
762         #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
763         seccomp_action: &SeccompAction,
764         hypervisor: Arc<dyn hypervisor::Hypervisor>,
765         activate_evt: EventFd,
766         serial_pty: Option<PtyPair>,
767         console_pty: Option<PtyPair>,
768         console_resize_pipe: Option<File>,
769     ) -> Result<Self> {
770         #[cfg(feature = "tdx")]
771         let tdx_enabled = config.lock().unwrap().tdx.is_some();
772         hypervisor.check_required_extensions().unwrap();
773         #[cfg(feature = "tdx")]
774         let vm = hypervisor
775             .create_vm_with_type(if tdx_enabled {
776                 2 // KVM_X86_TDX_VM
777             } else {
778                 0 // KVM_X86_LEGACY_VM
779             })
780             .unwrap();
781         #[cfg(not(feature = "tdx"))]
782         let vm = hypervisor.create_vm().unwrap();
783 
784         #[cfg(target_arch = "x86_64")]
785         {
786             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
787                 .unwrap();
788             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
789             vm.enable_split_irq().unwrap();
790         }
791 
792         let phys_bits = physical_bits(config.lock().unwrap().cpus.max_phys_bits);
793 
794         #[cfg(target_arch = "x86_64")]
795         let sgx_epc_config = config.lock().unwrap().sgx_epc.clone();
796 
797         let memory_manager = MemoryManager::new(
798             vm.clone(),
799             &config.lock().unwrap().memory.clone(),
800             None,
801             phys_bits,
802             #[cfg(feature = "tdx")]
803             tdx_enabled,
804             None,
805             None,
806             #[cfg(target_arch = "x86_64")]
807             sgx_epc_config,
808         )
809         .map_err(Error::MemoryManager)?;
810 
811         let new_vm = Vm::new_from_memory_manager(
812             config,
813             memory_manager,
814             vm,
815             exit_evt,
816             reset_evt,
817             #[cfg(feature = "gdb")]
818             vm_debug_evt,
819             seccomp_action,
820             hypervisor,
821             activate_evt,
822             false,
823         )?;
824 
825         // The device manager must create the devices from here as it is part
826         // of the regular code path creating everything from scratch.
827         new_vm
828             .device_manager
829             .lock()
830             .unwrap()
831             .create_devices(serial_pty, console_pty, console_resize_pipe)
832             .map_err(Error::DeviceManager)?;
833         Ok(new_vm)
834     }
835 
836     #[allow(clippy::too_many_arguments)]
837     pub fn new_from_snapshot(
838         snapshot: &Snapshot,
839         vm_config: Arc<Mutex<VmConfig>>,
840         exit_evt: EventFd,
841         reset_evt: EventFd,
842         #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
843         source_url: Option<&str>,
844         prefault: bool,
845         seccomp_action: &SeccompAction,
846         hypervisor: Arc<dyn hypervisor::Hypervisor>,
847         activate_evt: EventFd,
848     ) -> Result<Self> {
849         hypervisor.check_required_extensions().unwrap();
850         let vm = hypervisor.create_vm().unwrap();
851 
852         #[cfg(target_arch = "x86_64")]
853         {
854             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
855                 .unwrap();
856             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
857             vm.enable_split_irq().unwrap();
858         }
859 
860         let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
861         if let Some(state) = vm_snapshot.state {
862             vm.set_state(state)
863                 .map_err(|e| Error::Restore(MigratableError::Restore(e.into())))?;
864         }
865 
866         let memory_manager = if let Some(memory_manager_snapshot) =
867             snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
868         {
869             let phys_bits = physical_bits(vm_config.lock().unwrap().cpus.max_phys_bits);
870             MemoryManager::new_from_snapshot(
871                 memory_manager_snapshot,
872                 vm.clone(),
873                 &vm_config.lock().unwrap().memory.clone(),
874                 source_url,
875                 prefault,
876                 phys_bits,
877             )
878             .map_err(Error::MemoryManager)?
879         } else {
880             return Err(Error::Restore(MigratableError::Restore(anyhow!(
881                 "Missing memory manager snapshot"
882             ))));
883         };
884 
885         Vm::new_from_memory_manager(
886             vm_config,
887             memory_manager,
888             vm,
889             exit_evt,
890             reset_evt,
891             #[cfg(feature = "gdb")]
892             vm_debug_evt,
893             seccomp_action,
894             hypervisor,
895             activate_evt,
896             true,
897         )
898     }
899 
900     #[allow(clippy::too_many_arguments)]
901     pub fn new_from_migration(
902         config: Arc<Mutex<VmConfig>>,
903         exit_evt: EventFd,
904         reset_evt: EventFd,
905         #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
906         seccomp_action: &SeccompAction,
907         hypervisor: Arc<dyn hypervisor::Hypervisor>,
908         activate_evt: EventFd,
909         memory_manager_data: &MemoryManagerSnapshotData,
910         existing_memory_files: Option<HashMap<u32, File>>,
911     ) -> Result<Self> {
912         hypervisor.check_required_extensions().unwrap();
913         let vm = hypervisor.create_vm().unwrap();
914 
915         #[cfg(target_arch = "x86_64")]
916         {
917             vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
918                 .unwrap();
919             vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
920             vm.enable_split_irq().unwrap();
921         }
922 
923         let phys_bits = physical_bits(config.lock().unwrap().cpus.max_phys_bits);
924 
925         let memory_manager = MemoryManager::new(
926             vm.clone(),
927             &config.lock().unwrap().memory.clone(),
928             None,
929             phys_bits,
930             #[cfg(feature = "tdx")]
931             false,
932             Some(memory_manager_data),
933             existing_memory_files,
934             #[cfg(target_arch = "x86_64")]
935             None,
936         )
937         .map_err(Error::MemoryManager)?;
938 
939         Vm::new_from_memory_manager(
940             config,
941             memory_manager,
942             vm,
943             exit_evt,
944             reset_evt,
945             #[cfg(feature = "gdb")]
946             vm_debug_evt,
947             seccomp_action,
948             hypervisor,
949             activate_evt,
950             true,
951         )
952     }
953 
954     fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
955         let mut initramfs = self.initramfs.as_ref().unwrap();
956         let size: usize = initramfs
957             .seek(SeekFrom::End(0))
958             .map_err(|_| Error::InitramfsLoad)?
959             .try_into()
960             .unwrap();
961         initramfs
962             .seek(SeekFrom::Start(0))
963             .map_err(|_| Error::InitramfsLoad)?;
964 
965         let address =
966             arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
967         let address = GuestAddress(address);
968 
969         guest_mem
970             .read_from(address, &mut initramfs, size)
971             .map_err(|_| Error::InitramfsLoad)?;
972 
973         info!("Initramfs loaded: address = 0x{:x}", address.0);
974         Ok(arch::InitramfsConfig { address, size })
975     }
976 
977     fn get_cmdline(&mut self) -> Result<Cmdline> {
978         let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE);
979         cmdline
980             .insert_str(self.config.lock().unwrap().cmdline.args.clone())
981             .map_err(Error::CmdLineInsertStr)?;
982         for entry in self.device_manager.lock().unwrap().cmdline_additions() {
983             cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
984         }
985         Ok(cmdline)
986     }
987 
988     #[cfg(target_arch = "aarch64")]
989     fn load_kernel(&mut self) -> Result<EntryPoint> {
990         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
991         let mem = guest_memory.memory();
992         let mut kernel = self.kernel.as_ref().unwrap();
993         let entry_addr = match linux_loader::loader::pe::PE::load(
994             mem.deref(),
995             Some(arch::layout::KERNEL_START),
996             &mut kernel,
997             None,
998         ) {
999             Ok(entry_addr) => entry_addr,
1000             // Try to load the binary as a kernel PE file first.
1001             // If that fails, retry loading it as a UEFI binary.
1002             // As the UEFI binary has no magic number to check, it must be the last format tried.
1003             Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
1004                 arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut kernel)
1005                     .map_err(Error::UefiLoad)?;
1006                 // The entry point offset in the UEFI image is always 0.
1007                 return Ok(EntryPoint {
1008                     entry_addr: arch::layout::UEFI_START,
1009                 });
1010             }
1011             Err(e) => {
1012                 return Err(Error::KernelLoad(e));
1013             }
1014         };
1015 
1016         let entry_point_addr: GuestAddress = entry_addr.kernel_load;
1017 
1018         Ok(EntryPoint {
1019             entry_addr: entry_point_addr,
1020         })
1021     }
1022 
1023     #[cfg(target_arch = "x86_64")]
1024     fn load_kernel(&mut self) -> Result<EntryPoint> {
1025         use linux_loader::loader::{elf::Error::InvalidElfMagicNumber, Error::Elf};
1026         info!("Loading kernel");
1027         let cmdline = self.get_cmdline()?;
1028         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1029         let mem = guest_memory.memory();
1030         let mut kernel = self.kernel.as_ref().unwrap();
1031         let entry_addr = match linux_loader::loader::elf::Elf::load(
1032             mem.deref(),
1033             None,
1034             &mut kernel,
1035             Some(arch::layout::HIGH_RAM_START),
1036         ) {
1037             Ok(entry_addr) => entry_addr,
1038             Err(e) => match e {
1039                 Elf(InvalidElfMagicNumber) => {
1040                     // Not an ELF header - assume raw binary data / firmware
1041                     let size = kernel.seek(SeekFrom::End(0)).map_err(Error::FirmwareFile)?;
1042 
1043                     // The OVMF firmware is 4MiB, so limit the raw binary size to that
1044                     if size > 4 << 20 {
1045                         return Err(Error::FirmwareTooLarge);
1046                     }
1047 
1048                     // Place the firmware so that it ends at the 4GiB boundary
1049                     let load_address = GuestAddress(4 << 30)
1050                         .checked_sub(size)
1051                         .ok_or(Error::FirmwareTooLarge)?;
1052 
1053                     info!(
1054                         "Loading RAW firmware at 0x{:x} (size: {})",
1055                         load_address.raw_value(),
1056                         size
1057                     );
1058 
1059                     self.memory_manager
1060                         .lock()
1061                         .unwrap()
1062                         .add_ram_region(load_address, size as usize)
1063                         .map_err(Error::AllocateFirmwareMemory)?;
1064 
1065                     kernel
1066                         .seek(SeekFrom::Start(0))
1067                         .map_err(Error::FirmwareFile)?;
1068                     guest_memory
1069                         .memory()
1070                         .read_exact_from(load_address, &mut kernel, size as usize)
1071                         .map_err(Error::FirmwareLoad)?;
1072 
1073                     return Ok(EntryPoint { entry_addr: None });
1074                 }
1075                 _ => {
1076                     return Err(Error::KernelLoad(e));
1077                 }
1078             },
1079         };
1080 
1081         linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
1082             .map_err(Error::LoadCmdLine)?;
1083 
1084         if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
1085             // Use the PVH kernel entry point to boot the guest
1086             info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
1087             Ok(EntryPoint {
1088                 entry_addr: Some(entry_addr),
1089             })
1090         } else {
1091             Err(Error::KernelMissingPvhHeader)
1092         }
1093     }
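    // Note on the raw firmware path above: a 4MiB image is placed so that it ends
    // exactly at the 4GiB boundary, i.e. at 0x1_0000_0000 - 0x40_0000 = 0xffc0_0000,
    // which is where x86 firmware conventionally expects to sit (just below 4GiB).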
1094 
1095     #[cfg(target_arch = "x86_64")]
1096     fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
1097         info!("Configuring system");
1098         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1099 
1100         let initramfs_config = match self.initramfs {
1101             Some(_) => Some(self.load_initramfs(&mem)?),
1102             None => None,
1103         };
1104 
1105         let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
1106         let rsdp_addr = Some(rsdp_addr);
1107         let sgx_epc_region = self
1108             .memory_manager
1109             .lock()
1110             .unwrap()
1111             .sgx_epc_region()
1112             .as_ref()
1113             .cloned();
1114 
1115         arch::configure_system(
1116             &mem,
1117             arch::layout::CMDLINE_START,
1118             &initramfs_config,
1119             boot_vcpus,
1120             rsdp_addr,
1121             sgx_epc_region,
1122         )
1123         .map_err(Error::ConfigureSystem)?;
1124         Ok(())
1125     }
1126 
1127     #[cfg(target_arch = "aarch64")]
1128     fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
1129         let cmdline = self.get_cmdline()?;
1130         let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
1131         let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
1132         let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
1133         let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
1134         let initramfs_config = match self.initramfs {
1135             Some(_) => Some(self.load_initramfs(&mem)?),
1136             None => None,
1137         };
1138 
1139         let device_info = &self
1140             .device_manager
1141             .lock()
1142             .unwrap()
1143             .get_device_info()
1144             .clone();
1145 
1146         for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
1147             let pci_space = PciSpaceInfo {
1148                 pci_segment_id: pci_segment.id,
1149                 mmio_config_address: pci_segment.mmio_config_address,
1150                 pci_device_space_start: pci_segment.start_of_device_area,
1151                 pci_device_space_size: pci_segment.end_of_device_area
1152                     - pci_segment.start_of_device_area
1153                     + 1,
1154             };
1155             pci_space_info.push(pci_space);
1156         }
1157 
1158         let virtio_iommu_bdf = self
1159             .device_manager
1160             .lock()
1161             .unwrap()
1162             .iommu_attached_devices()
1163             .as_ref()
1164             .map(|(v, _)| *v);
1165 
1166         let gic_device = create_gic(
1167             &self.memory_manager.lock().as_ref().unwrap().vm,
1168             self.cpu_manager.lock().unwrap().boot_vcpus() as u64,
1169         )
1170         .map_err(|e| {
1171             Error::ConfigureSystem(arch::Error::AArch64Setup(arch::aarch64::Error::SetupGic(e)))
1172         })?;
1173 
1174         // The PMU interrupt is a PPI, so add 16 to its number to get the real IRQ number.
1175         let pmu_supported = self
1176             .cpu_manager
1177             .lock()
1178             .unwrap()
1179             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
1180             .map_err(|_| {
1181                 Error::ConfigureSystem(arch::Error::AArch64Setup(arch::aarch64::Error::VcpuInitPmu))
1182             })?;
1183 
1184         arch::configure_system(
1185             &mem,
1186             cmdline.as_str(),
1187             vcpu_mpidrs,
1188             vcpu_topology,
1189             device_info,
1190             &initramfs_config,
1191             &pci_space_info,
1192             virtio_iommu_bdf.map(|bdf| bdf.into()),
1193             &*gic_device,
1194             &self.numa_nodes,
1195             pmu_supported,
1196         )
1197         .map_err(Error::ConfigureSystem)?;
1198 
1199         // Update the GIC entity in the device manager
1200         self.device_manager
1201             .lock()
1202             .unwrap()
1203             .get_interrupt_controller()
1204             .unwrap()
1205             .lock()
1206             .unwrap()
1207             .set_gic_device(Arc::new(Mutex::new(gic_device)));
1208 
1209         // Activate the GIC device
1210         self.device_manager
1211             .lock()
1212             .unwrap()
1213             .get_interrupt_controller()
1214             .unwrap()
1215             .lock()
1216             .unwrap()
1217             .enable()
1218             .map_err(Error::EnableInterruptController)?;
1219 
1220         Ok(())
1221     }
1222 
1223     pub fn serial_pty(&self) -> Option<PtyPair> {
1224         self.device_manager.lock().unwrap().serial_pty()
1225     }
1226 
1227     pub fn console_pty(&self) -> Option<PtyPair> {
1228         self.device_manager.lock().unwrap().console_pty()
1229     }
1230 
1231     pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
1232         self.device_manager.lock().unwrap().console_resize_pipe()
1233     }
1234 
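    // Shutdown sequence: restore the terminal to canonical mode, stop the
    // signal-handler thread, resume the DeviceManager so its paused worker threads
    // can observe the exit and terminate, shut down the vCPUs, then join the
    // remaining threads before recording the new state.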
1235     pub fn shutdown(&mut self) -> Result<()> {
1236         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1237         let new_state = VmState::Shutdown;
1238 
1239         state.valid_transition(new_state)?;
1240 
1241         if self.on_tty {
1242             // Don't forget to set the terminal back to canonical mode
1243             // before exiting.
1244             io::stdin()
1245                 .lock()
1246                 .set_canon_mode()
1247                 .map_err(Error::SetTerminalCanon)?;
1248         }
1249 
1250         // Trigger the termination of the signal_handler thread
1251         if let Some(signals) = self.signals.take() {
1252             signals.close();
1253         }
1254 
1255         // Wake up the DeviceManager threads so they will get terminated cleanly
1256         self.device_manager
1257             .lock()
1258             .unwrap()
1259             .resume()
1260             .map_err(Error::Resume)?;
1261 
1262         self.cpu_manager
1263             .lock()
1264             .unwrap()
1265             .shutdown()
1266             .map_err(Error::CpuManager)?;
1267 
1268         // Wait for all the threads to finish
1269         for thread in self.threads.drain(..) {
1270             thread.join().map_err(Error::ThreadCleanup)?
1271         }
1272         *state = new_state;
1273 
1274         event!("vm", "shutdown");
1275 
1276         Ok(())
1277     }
1278 
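    // Resizes vCPUs, RAM and/or the balloon. Worked example for RAM (numbers
    // illustrative): with a boot memory size of 1 GiB and hotplug_method=VirtioMem,
    // a request for 3 GiB keeps memory.size at 1 GiB and records
    // hotplugged_size = 2 GiB, while with hotplug_method=Acpi the same request
    // rewrites memory.size to 3 GiB and, if a region was actually added, notifies
    // the guest through ACPI.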
1279     pub fn resize(
1280         &mut self,
1281         desired_vcpus: Option<u8>,
1282         desired_memory: Option<u64>,
1283         desired_balloon: Option<u64>,
1284     ) -> Result<()> {
1285         event!("vm", "resizing");
1286 
1287         if let Some(desired_vcpus) = desired_vcpus {
1288             if self
1289                 .cpu_manager
1290                 .lock()
1291                 .unwrap()
1292                 .resize(desired_vcpus)
1293                 .map_err(Error::CpuManager)?
1294             {
1295                 self.device_manager
1296                     .lock()
1297                     .unwrap()
1298                     .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1299                     .map_err(Error::DeviceManager)?;
1300             }
1301             self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1302         }
1303 
1304         if let Some(desired_memory) = desired_memory {
1305             let new_region = self
1306                 .memory_manager
1307                 .lock()
1308                 .unwrap()
1309                 .resize(desired_memory)
1310                 .map_err(Error::MemoryManager)?;
1311 
1312             let mut memory_config = &mut self.config.lock().unwrap().memory;
1313 
1314             if let Some(new_region) = &new_region {
1315                 self.device_manager
1316                     .lock()
1317                     .unwrap()
1318                     .update_memory(new_region)
1319                     .map_err(Error::DeviceManager)?;
1320 
1321                 match memory_config.hotplug_method {
1322                     HotplugMethod::Acpi => {
1323                         self.device_manager
1324                             .lock()
1325                             .unwrap()
1326                             .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1327                             .map_err(Error::DeviceManager)?;
1328                     }
1329                     HotplugMethod::VirtioMem => {}
1330                 }
1331             }
1332 
1333             // We update the VM config regardless of whether the guest resize
1334             // operation actually happened, so that if the VM reboots it will
1335             // run with the last configured memory size.
1336             match memory_config.hotplug_method {
1337                 HotplugMethod::Acpi => memory_config.size = desired_memory,
1338                 HotplugMethod::VirtioMem => {
1339                     if desired_memory > memory_config.size {
1340                         memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1341                     } else {
1342                         memory_config.hotplugged_size = None;
1343                     }
1344                 }
1345             }
1346         }
1347 
1348         if let Some(desired_balloon) = desired_balloon {
1349             self.device_manager
1350                 .lock()
1351                 .unwrap()
1352                 .resize_balloon(desired_balloon)
1353                 .map_err(Error::DeviceManager)?;
1354 
1355             // Update the configuration value for the balloon size to ensure
1356             // a reboot will use the right value.
1357             if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1358                 balloon_config.size = desired_balloon;
1359             }
1360         }
1361 
1362         event!("vm", "resized");
1363 
1364         Ok(())
1365     }
1366 
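    // Resizes a single memory zone. Only sizes at or above the zone's boot size are
    // accepted; the difference is recorded in the zone's hotplugged_size so a reboot
    // comes back with the same amount of memory.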
1367     pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1368         let memory_config = &mut self.config.lock().unwrap().memory;
1369 
1370         if let Some(zones) = &mut memory_config.zones {
1371             for zone in zones.iter_mut() {
1372                 if zone.id == id {
1373                     if desired_memory >= zone.size {
1374                         let hotplugged_size = desired_memory - zone.size;
1375                         self.memory_manager
1376                             .lock()
1377                             .unwrap()
1378                             .resize_zone(&id, desired_memory - zone.size)
1379                             .map_err(Error::MemoryManager)?;
1380                         // We update the memory zone config regardless of the
1381                         // actual 'resize-zone' operation result (happened or
1382                         // not), so that if the VM reboots it will be running
1383                         // with the last configured memory zone size.
1384                         zone.hotplugged_size = Some(hotplugged_size);
1385 
1386                         return Ok(());
1387                     } else {
1388                         error!(
1389                             "Requested size ({}) is smaller than the boot RAM size ({}) for \
1390                             this memory zone",
1391                             desired_memory, zone.size,
1392                         );
1393                         return Err(Error::ResizeZone);
1394                     }
1395                 }
1396             }
1397         }
1398 
1399         error!("Could not find the memory zone {} for the resize", id);
1400         Err(Error::ResizeZone)
1401     }
1402 
1403     pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1404         let pci_device_info = self
1405             .device_manager
1406             .lock()
1407             .unwrap()
1408             .add_device(&mut device_cfg)
1409             .map_err(Error::DeviceManager)?;
1410 
1411         // Update VmConfig by adding the new device. This is important to
1412         // ensure the device will be re-created in case of a reboot.
1413         {
1414             let mut config = self.config.lock().unwrap();
1415             add_to_config(&mut config.devices, device_cfg);
1416         }
1417 
1418         self.device_manager
1419             .lock()
1420             .unwrap()
1421             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1422             .map_err(Error::DeviceManager)?;
1423 
1424         Ok(pci_device_info)
1425     }
1426 
1427     pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1428         let pci_device_info = self
1429             .device_manager
1430             .lock()
1431             .unwrap()
1432             .add_user_device(&mut device_cfg)
1433             .map_err(Error::DeviceManager)?;
1434 
1435         // Update VmConfig by adding the new device. This is important to
1436         // ensure the device will be re-created in case of a reboot.
1437         {
1438             let mut config = self.config.lock().unwrap();
1439             add_to_config(&mut config.user_devices, device_cfg);
1440         }
1441 
1442         self.device_manager
1443             .lock()
1444             .unwrap()
1445             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1446             .map_err(Error::DeviceManager)?;
1447 
1448         Ok(pci_device_info)
1449     }
1450 
1451     pub fn remove_device(&mut self, id: String) -> Result<()> {
1452         self.device_manager
1453             .lock()
1454             .unwrap()
1455             .remove_device(id.clone())
1456             .map_err(Error::DeviceManager)?;
1457 
1458         // Update VmConfig by removing the device. This is important to
1459         // ensure the device will not be re-created in case of a reboot.
1460         let mut config = self.config.lock().unwrap();
1461 
1462         // Remove if VFIO device
1463         if let Some(devices) = config.devices.as_mut() {
1464             devices.retain(|dev| dev.id.as_ref() != Some(&id));
1465         }
1466 
1467         // Remove if VFIO user device
1468         if let Some(user_devices) = config.user_devices.as_mut() {
1469             user_devices.retain(|dev| dev.id.as_ref() != Some(&id));
1470         }
1471 
1472         // Remove if disk device
1473         if let Some(disks) = config.disks.as_mut() {
1474             disks.retain(|dev| dev.id.as_ref() != Some(&id));
1475         }
1476 
1477         // Remove if net device
1478         if let Some(net) = config.net.as_mut() {
1479             net.retain(|dev| dev.id.as_ref() != Some(&id));
1480         }
1481 
1482         // Remove if pmem device
1483         if let Some(pmem) = config.pmem.as_mut() {
1484             pmem.retain(|dev| dev.id.as_ref() != Some(&id));
1485         }
1486 
1487         // Remove if vDPA device
1488         if let Some(vdpa) = config.vdpa.as_mut() {
1489             vdpa.retain(|dev| dev.id.as_ref() != Some(&id));
1490         }
1491 
1492         // Remove if vsock device
1493         if let Some(vsock) = config.vsock.as_ref() {
1494             if vsock.id.as_ref() == Some(&id) {
1495                 config.vsock = None;
1496             }
1497         }
1498 
1499         self.device_manager
1500             .lock()
1501             .unwrap()
1502             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1503             .map_err(Error::DeviceManager)?;
1504         Ok(())
1505     }
1506 
1507     pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1508         let pci_device_info = self
1509             .device_manager
1510             .lock()
1511             .unwrap()
1512             .add_disk(&mut disk_cfg)
1513             .map_err(Error::DeviceManager)?;
1514 
1515         // Update VmConfig by adding the new device. This is important to
1516         // ensure the device will be created in case of a reboot.
1517         {
1518             let mut config = self.config.lock().unwrap();
1519             add_to_config(&mut config.disks, disk_cfg);
1520         }
1521 
1522         self.device_manager
1523             .lock()
1524             .unwrap()
1525             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1526             .map_err(Error::DeviceManager)?;
1527 
1528         Ok(pci_device_info)
1529     }
1530 
1531     pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1532         let pci_device_info = self
1533             .device_manager
1534             .lock()
1535             .unwrap()
1536             .add_fs(&mut fs_cfg)
1537             .map_err(Error::DeviceManager)?;
1538 
1539         // Update VmConfig by adding the new device. This is important to
1540         // ensure the device will be created in case of a reboot.
1541         {
1542             let mut config = self.config.lock().unwrap();
1543             add_to_config(&mut config.fs, fs_cfg);
1544         }
1545 
1546         self.device_manager
1547             .lock()
1548             .unwrap()
1549             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1550             .map_err(Error::DeviceManager)?;
1551 
1552         Ok(pci_device_info)
1553     }
1554 
1555     pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1556         let pci_device_info = self
1557             .device_manager
1558             .lock()
1559             .unwrap()
1560             .add_pmem(&mut pmem_cfg)
1561             .map_err(Error::DeviceManager)?;
1562 
1563         // Update VmConfig by adding the new device. This is important to
1564         // ensure the device will be created in case of a reboot.
1565         {
1566             let mut config = self.config.lock().unwrap();
1567             add_to_config(&mut config.pmem, pmem_cfg);
1568         }
1569 
1570         self.device_manager
1571             .lock()
1572             .unwrap()
1573             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1574             .map_err(Error::DeviceManager)?;
1575 
1576         Ok(pci_device_info)
1577     }
1578 
1579     pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1580         let pci_device_info = self
1581             .device_manager
1582             .lock()
1583             .unwrap()
1584             .add_net(&mut net_cfg)
1585             .map_err(Error::DeviceManager)?;
1586 
1587         // Update VmConfig by adding the new device. This is important to
1588         // ensure the device will be created in case of a reboot.
1589         {
1590             let mut config = self.config.lock().unwrap();
1591             add_to_config(&mut config.net, net_cfg);
1592         }
1593 
1594         self.device_manager
1595             .lock()
1596             .unwrap()
1597             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1598             .map_err(Error::DeviceManager)?;
1599 
1600         Ok(pci_device_info)
1601     }
1602 
1603     pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1604         let pci_device_info = self
1605             .device_manager
1606             .lock()
1607             .unwrap()
1608             .add_vdpa(&mut vdpa_cfg)
1609             .map_err(Error::DeviceManager)?;
1610 
1611         // Update VmConfig by adding the new device. This is important to
1612         // ensure the device will be created in case of a reboot.
1613         {
1614             let mut config = self.config.lock().unwrap();
1615             add_to_config(&mut config.vdpa, vdpa_cfg);
1616         }
1617 
1618         self.device_manager
1619             .lock()
1620             .unwrap()
1621             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1622             .map_err(Error::DeviceManager)?;
1623 
1624         Ok(pci_device_info)
1625     }
1626 
1627     pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1628         let pci_device_info = self
1629             .device_manager
1630             .lock()
1631             .unwrap()
1632             .add_vsock(&mut vsock_cfg)
1633             .map_err(Error::DeviceManager)?;
1634 
1635         // Update VmConfig by adding the new device. This is important to
1636         // ensure the device will be created in case of a reboot.
1637         {
1638             let mut config = self.config.lock().unwrap();
1639             config.vsock = Some(vsock_cfg);
1640         }
1641 
1642         self.device_manager
1643             .lock()
1644             .unwrap()
1645             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1646             .map_err(Error::DeviceManager)?;
1647 
1648         Ok(pci_device_info)
1649     }
1650 
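         // Returns the device counters collected by the DeviceManager, keyed by
         // device identifier and then by counter name.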
1651     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1652         Ok(self.device_manager.lock().unwrap().counters())
1653     }
1654 
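         // Runs on the dedicated "signal_handler" thread: unblocks the handled
         // signals, then reacts to SIGWINCH by refreshing the console size, and
         // to SIGTERM/SIGINT by restoring the terminal to canonical mode (when
         // running on a TTY) and signalling the exit event, falling back to
         // exiting the process if that write fails.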
1655     fn os_signal_handler(
1656         mut signals: Signals,
1657         console_input_clone: Arc<Console>,
1658         on_tty: bool,
1659         exit_evt: &EventFd,
1660     ) {
1661         for sig in &HANDLED_SIGNALS {
1662             unblock_signal(*sig).unwrap();
1663         }
1664 
1665         for signal in signals.forever() {
1666             match signal {
1667                 SIGWINCH => {
1668                     console_input_clone.update_console_size();
1669                 }
1670                 SIGTERM | SIGINT => {
1671                     if on_tty {
1672                         io::stdin()
1673                             .lock()
1674                             .set_canon_mode()
1675                             .expect("failed to restore terminal mode");
1676                     }
1677                     if exit_evt.write(1).is_err() {
1678                         std::process::exit(1);
1679                     }
1680                 }
1681                 _ => (),
1682             }
1683         }
1684     }
1685 
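         // Initializes the TDX context for this VM from the common CPUID and the
         // maximum vCPU count. This must happen before the vCPUs are created
         // (see boot()).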
1686     #[cfg(feature = "tdx")]
1687     fn init_tdx(&mut self) -> Result<()> {
1688         let cpuid = self.cpu_manager.lock().unwrap().common_cpuid();
1689         let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32;
1690         self.vm
1691             .tdx_init(&cpuid, max_vcpus)
1692             .map_err(Error::InitializeTdxVm)?;
1693         Ok(())
1694     }
1695 
1696     #[cfg(feature = "tdx")]
1697     fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> {
1698         use arch::x86_64::tdx::*;
1699         // The TDVF file contains a table of sections as well as code
1700         let mut firmware_file =
1701             File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
1702                 .map_err(Error::LoadTdvf)?;
1703 
1704         // Parse the table of sections out of the TDVF firmware file
1705         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1706     }
1707 
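         // Backs every TDVF section that is not already covered by guest RAM
         // with a dedicated RAM region, copies the firmware volumes, payload and
         // payload parameters from the TDVF file into guest memory, and then
         // builds the TD HOB (memory and MMIO resources, ACPI tables, payload
         // info). Returns the guest address of the HOB, if any.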
1708     #[cfg(feature = "tdx")]
1709     fn populate_tdx_sections(&mut self, sections: &[TdvfSection]) -> Result<Option<u64>> {
1710         use arch::x86_64::tdx::*;
1711         // Get the memory end *before* we start adding TDVF ram regions
1712         let boot_guest_memory = self
1713             .memory_manager
1714             .lock()
1715             .as_ref()
1716             .unwrap()
1717             .boot_guest_memory();
1718         for section in sections {
1719             // No need to allocate if the section falls within guest RAM ranges
1720             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1721                 info!(
1722                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1723                     section
1724                 );
1725                 continue;
1726             }
1727 
1728             info!("Allocating TDVF Section: {:x?}", section);
1729             self.memory_manager
1730                 .lock()
1731                 .unwrap()
1732                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1733                 .map_err(Error::AllocatingTdvfMemory)?;
1734         }
1735 
1736         // The TDVF file contains a table of sections as well as code
1737         let mut firmware_file =
1738             File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
1739                 .map_err(Error::LoadTdvf)?;
1740 
1741         // The guest memory at this point now has all the required regions so it
1742         // is safe to copy from the TDVF file into it.
1743         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1744         let mem = guest_memory.memory();
1745         let mut payload_info = None;
1746         let mut hob_offset = None;
1747         for section in sections {
1748             info!("Populating TDVF Section: {:x?}", section);
1749             match section.r#type {
1750                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1751                     info!("Copying section to guest memory");
1752                     firmware_file
1753                         .seek(SeekFrom::Start(section.data_offset as u64))
1754                         .map_err(Error::LoadTdvf)?;
1755                     mem.read_from(
1756                         GuestAddress(section.address),
1757                         &mut firmware_file,
1758                         section.data_size as usize,
1759                     )
1760                     .unwrap();
1761                 }
1762                 TdvfSectionType::TdHob => {
1763                     hob_offset = Some(section.address);
1764                 }
1765                 TdvfSectionType::Payload => {
1766                     info!("Copying payload to guest memory");
1767                     if let Some(payload_file) = self.kernel.as_mut() {
1768                         let payload_size = payload_file
1769                             .seek(SeekFrom::End(0))
1770                             .map_err(Error::LoadPayload)?;
1771 
1772                         payload_file
1773                             .seek(SeekFrom::Start(0x1f1))
1774                             .map_err(Error::LoadPayload)?;
1775 
1776                         let mut payload_header = linux_loader::bootparam::setup_header::default();
1777                         payload_header
1778                             .as_bytes()
1779                             .read_from(
1780                                 0,
1781                                 payload_file,
1782                                 mem::size_of::<linux_loader::bootparam::setup_header>(),
1783                             )
1784                             .unwrap();
1785 
1786                         if payload_header.header != 0x5372_6448 {
1787                             return Err(Error::InvalidPayloadType);
1788                         }
1789 
1790                         if (payload_header.version < 0x0200)
1791                             || ((payload_header.loadflags & 0x1) == 0x0)
1792                         {
1793                             return Err(Error::InvalidPayloadType);
1794                         }
1795 
1796                         payload_file
1797                             .seek(SeekFrom::Start(0))
1798                             .map_err(Error::LoadPayload)?;
1799                         mem.read_from(
1800                             GuestAddress(section.address),
1801                             payload_file,
1802                             payload_size as usize,
1803                         )
1804                         .unwrap();
1805 
1806                         // Create the payload info that will be inserted into
1807                         // the HOB.
1808                         payload_info = Some(PayloadInfo {
1809                             image_type: PayloadImageType::BzImage,
1810                             entry_point: section.address,
1811                         });
1812                     }
1813                 }
1814                 TdvfSectionType::PayloadParam => {
1815                     info!("Copying payload parameters to guest memory");
1816                     let cmdline = self.get_cmdline()?;
1817                     mem.write_slice(cmdline.as_str().as_bytes(), GuestAddress(section.address))
1818                         .unwrap();
1819                 }
1820                 _ => {}
1821             }
1822         }
1823 
1824         // Generate HOB
1825         let mut hob = TdHob::start(hob_offset.unwrap());
1826 
1827         let mut sorted_sections = sections.to_vec();
1828         sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));
1829 
1830         sorted_sections.sort_by_key(|section| section.address);
1831         sorted_sections.reverse();
1832         let mut current_section = sorted_sections.pop();
1833 
1834         // RAM regions interleaved with TDVF sections
1835         let mut next_start_addr = 0;
1836         for region in boot_guest_memory.iter() {
1837             let region_start = region.start_addr().0;
1838             let region_end = region.last_addr().0;
1839             if region_start > next_start_addr {
1840                 next_start_addr = region_start;
1841             }
1842 
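                 // Walk this RAM region, emitting HOB memory resources that
                 // alternate between TempMem TDVF sections (ram = false) and the
                 // RAM ranges around them (ram = true), until the whole region
                 // has been covered.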
1843             loop {
1844                 let (start, size, ram) = if let Some(section) = &current_section {
1845                     if section.address <= next_start_addr {
1846                         (section.address, section.size, false)
1847                     } else {
1848                         let last_addr = std::cmp::min(section.address - 1, region_end);
1849                         (next_start_addr, last_addr - next_start_addr + 1, true)
1850                     }
1851                 } else {
1852                     (next_start_addr, region_end - next_start_addr + 1, true)
1853                 };
1854 
1855                 hob.add_memory_resource(&mem, start, size, ram)
1856                     .map_err(Error::PopulateHob)?;
1857 
1858                 if !ram {
1859                     current_section = sorted_sections.pop();
1860                 }
1861 
1862                 next_start_addr = start + size;
1863 
1864                 if region_start > next_start_addr {
1865                     next_start_addr = region_start;
1866                 }
1867 
1868                 if next_start_addr > region_end {
1869                     break;
1870                 }
1871             }
1872         }
1873 
1874         // MMIO regions
1875         hob.add_mmio_resource(
1876             &mem,
1877             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1878             arch::layout::APIC_START.raw_value()
1879                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1880         )
1881         .map_err(Error::PopulateHob)?;
1882         let start_of_device_area = self
1883             .memory_manager
1884             .lock()
1885             .unwrap()
1886             .start_of_device_area()
1887             .raw_value();
1888         let end_of_device_area = self
1889             .memory_manager
1890             .lock()
1891             .unwrap()
1892             .end_of_device_area()
1893             .raw_value();
1894         hob.add_mmio_resource(
1895             &mem,
1896             start_of_device_area,
1897             end_of_device_area - start_of_device_area,
1898         )
1899         .map_err(Error::PopulateHob)?;
1900 
1901         // Loop over the ACPI tables and copy them to the HOB.
1902 
1903         for acpi_table in crate::acpi::create_acpi_tables_tdx(
1904             &self.device_manager,
1905             &self.cpu_manager,
1906             &self.memory_manager,
1907             &self.numa_nodes,
1908         ) {
1909             hob.add_acpi_table(&mem, acpi_table.as_slice())
1910                 .map_err(Error::PopulateHob)?;
1911         }
1912 
1913         // If a payload info has been created, let's insert it into the HOB.
1914         if let Some(payload_info) = payload_info {
1915             hob.add_payload(&mem, payload_info)
1916                 .map_err(Error::PopulateHob)?;
1917         }
1918 
1919         hob.finish(&mem).map_err(Error::PopulateHob)?;
1920 
1921         Ok(hob_offset)
1922     }
1923 
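         // Registers each TDVF section with the hypervisor as a TDX memory
         // region, indicating whether its content must be extended into the TD
         // measurement based on the TDVF_SECTION_ATTRIBUTES_EXTENDMR attribute.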
1924     #[cfg(feature = "tdx")]
1925     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1926         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1927         let mem = guest_memory.memory();
1928 
1929         for section in sections {
1930             self.vm
1931                 .tdx_init_memory_region(
1932                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
1933                     section.address,
1934                     section.size,
1935                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
1936                     section.attributes == 1,
1937                 )
1938                 .map_err(Error::InitializeTdxMemoryRegion)?;
1939         }
1940 
1941         Ok(())
1942     }
1943 
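         // Spawns the "signal_handler" thread with its own seccomp filter
         // applied; any panic in the handler is caught and reported through the
         // exit event so the VMM can shut down cleanly.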
1944     fn setup_signal_handler(&mut self) -> Result<()> {
1945         let console = self.device_manager.lock().unwrap().console().clone();
1946         let signals = Signals::new(&HANDLED_SIGNALS);
1947         match signals {
1948             Ok(signals) => {
1949                 self.signals = Some(signals.handle());
1950                 let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?;
1951                 let on_tty = self.on_tty;
1952                 let signal_handler_seccomp_filter =
1953                     get_seccomp_filter(&self.seccomp_action, Thread::SignalHandler)
1954                         .map_err(Error::CreateSeccompFilter)?;
1955                 self.threads.push(
1956                     thread::Builder::new()
1957                         .name("signal_handler".to_string())
1958                         .spawn(move || {
1959                             if !signal_handler_seccomp_filter.is_empty() {
1960                                 if let Err(e) = apply_filter(&signal_handler_seccomp_filter)
1961                                     .map_err(Error::ApplySeccompFilter)
1962                                 {
1963                                     error!("Error applying seccomp filter: {:?}", e);
1964                                     exit_evt.write(1).ok();
1965                                     return;
1966                                 }
1967                             }
1968                             std::panic::catch_unwind(AssertUnwindSafe(|| {
1969                                 Vm::os_signal_handler(signals, console, on_tty, &exit_evt);
1970                             }))
1971                             .map_err(|_| {
1972                                 error!("signal_handler thread panicked");
1973                                 exit_evt.write(1).ok()
1974                             })
1975                             .ok();
1976                         })
1977                         .map_err(Error::SignalHandlerSpawn)?,
1978                 );
1979             }
1980             Err(e) => error!("Could not register signal handlers: {}", e),
1981         }
1982         Ok(())
1983     }
1984 
1985     fn setup_tty(&self) -> Result<()> {
1986         if self.on_tty {
1987             io::stdin()
1988                 .lock()
1989                 .set_raw_mode()
1990                 .map_err(Error::SetTerminalRaw)?;
1991         }
1992 
1993         Ok(())
1994     }
1995 
1996     // Creates ACPI tables
1997     // In case of TDX being used, this is a no-op since the tables will be
1998     // created and passed when populating the HOB.
1999 
2000     fn create_acpi_tables(&self) -> Option<GuestAddress> {
2001         #[cfg(feature = "tdx")]
2002         if self.config.lock().unwrap().tdx.is_some() {
2003             return None;
2004         }
2005 
2006         let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2007 
2008         let rsdp_addr = crate::acpi::create_acpi_tables(
2009             &mem,
2010             &self.device_manager,
2011             &self.cpu_manager,
2012             &self.memory_manager,
2013             &self.numa_nodes,
2014         );
2015         info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2016 
2017         Some(rsdp_addr)
2018     }
2019 
2020     fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2021         Ok(if self.kernel.as_ref().is_some() {
2022             #[cfg(feature = "tdx")]
2023             if self.config.lock().unwrap().tdx.is_some() {
2024                 return Ok(None);
2025             }
2026             Some(self.load_kernel()?)
2027         } else {
2028             None
2029         })
2030     }
2031 
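         // Boot sequence: resume if the VM is only paused, otherwise load the
         // kernel (skipped for TDX, which boots through the TDVF payload),
         // create the boot vCPUs, build the ACPI tables, populate the TDX
         // sections and HOB when TDX is enabled, configure the system, and
         // finally start the vCPUs, unless the VM was created with
         // stop_on_boot, in which case it stays in the BreakPoint state.
         //
         // Illustrative caller-side sketch (not part of this source), assuming
         // a fully constructed `vm: Vm`:
         //
         //     vm.boot()?;
         //     assert!(matches!(vm.get_state(), Ok(VmState::Running)));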
2032     pub fn boot(&mut self) -> Result<()> {
2033         info!("Booting VM");
2034         event!("vm", "booting");
2035         let current_state = self.get_state()?;
2036         if current_state == VmState::Paused {
2037             return self.resume().map_err(Error::Resume);
2038         }
2039 
2040         let new_state = if self.stop_on_boot {
2041             VmState::BreakPoint
2042         } else {
2043             VmState::Running
2044         };
2045         current_state.valid_transition(new_state)?;
2046 
2047         // Load kernel if configured
2048         let entry_point = self.entry_point()?;
2049 
2050         // The initial TDX configuration must be done before the vCPUs are
2051         // created
2052         #[cfg(feature = "tdx")]
2053         if self.config.lock().unwrap().tdx.is_some() {
2054             self.init_tdx()?;
2055         }
2056 
2057         // Create and configure vcpus
2058         self.cpu_manager
2059             .lock()
2060             .unwrap()
2061             .create_boot_vcpus(entry_point)
2062             .map_err(Error::CpuManager)?;
2063 
2064         #[cfg(feature = "tdx")]
2065         let sections = if self.config.lock().unwrap().tdx.is_some() {
2066             self.extract_tdvf_sections()?
2067         } else {
2068             Vec::new()
2069         };
2070 
2071         let rsdp_addr = self.create_acpi_tables();
2072 
2073         // Configuring the TDX regions requires that the vCPUs are created.
2074         #[cfg(feature = "tdx")]
2075         let hob_address = if self.config.lock().unwrap().tdx.is_some() {
2076             // TDX sections are written to memory.
2077             self.populate_tdx_sections(&sections)?
2078         } else {
2079             None
2080         };
2081 
2082         // Configure shared state based on loaded kernel
2083         entry_point
2084             .map(|_| {
2085                 // Safe to unwrap rsdp_addr as we know it can't be None when
2086                 // the entry_point is Some.
2087                 self.configure_system(rsdp_addr.unwrap())
2088             })
2089             .transpose()?;
2090 
2091         #[cfg(feature = "tdx")]
2092         if let Some(hob_address) = hob_address {
2093             // With the HOB address extracted the vCPUs can have
2094             // their TDX state configured.
2095             self.cpu_manager
2096                 .lock()
2097                 .unwrap()
2098                 .initialize_tdx(hob_address)
2099                 .map_err(Error::CpuManager)?;
2100             // Let the hypervisor know which memory ranges are shared with the
2101             // guest. This prevents the guest from ignoring/discarding memory
2102             // regions provided by the host.
2103             self.init_tdx_memory(&sections)?;
2104             // With TDX memory and CPU state configured TDX setup is complete
2105             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
2106         }
2107 
2108         if new_state == VmState::Running {
2109             self.cpu_manager
2110                 .lock()
2111                 .unwrap()
2112                 .start_boot_vcpus()
2113                 .map_err(Error::CpuManager)?;
2114         }
2115 
2116         self.setup_signal_handler()?;
2117         self.setup_tty()?;
2118 
2119         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
2120         *state = new_state;
2121         event!("vm", "booted");
2122         Ok(())
2123     }
2124 
2125     /// Gets a thread-safe reference counted pointer to the VM configuration.
2126     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2127         Arc::clone(&self.config)
2128     }
2129 
2130     /// Get the VM state. Returns an error if the state is poisoned.
2131     pub fn get_state(&self) -> Result<VmState> {
2132         self.state
2133             .try_read()
2134             .map_err(|_| Error::PoisonedState)
2135             .map(|state| *state)
2136     }
2137 
2138     /// Load saved clock from snapshot
2139     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2140     pub fn load_clock_from_snapshot(
2141         &mut self,
2142         snapshot: &Snapshot,
2143     ) -> Result<Option<hypervisor::ClockData>> {
2144         let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
2145         self.saved_clock = vm_snapshot.clock;
2146         Ok(self.saved_clock)
2147     }
2148 
2149     #[cfg(target_arch = "aarch64")]
2150     /// Add the vGIC section to the VM snapshot.
2151     fn add_vgic_snapshot_section(
2152         &self,
2153         vm_snapshot: &mut Snapshot,
2154     ) -> std::result::Result<(), MigratableError> {
2155         let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
2156         let gic_device = Arc::clone(
2157             self.device_manager
2158                 .lock()
2159                 .unwrap()
2160                 .get_interrupt_controller()
2161                 .unwrap()
2162                 .lock()
2163                 .unwrap()
2164                 .get_gic_device()
2165                 .unwrap(),
2166         );
2167 
2168         gic_device
2169             .lock()
2170             .unwrap()
2171             .set_gicr_typers(&saved_vcpu_states);
2172 
2173         vm_snapshot.add_snapshot(
2174             if let Some(gicv3_its) = gic_device
2175                 .lock()
2176                 .unwrap()
2177                 .as_any_concrete_mut()
2178                 .downcast_mut::<KvmGicV3Its>()
2179             {
2180                 gicv3_its.snapshot()?
2181             } else {
2182                 return Err(MigratableError::Snapshot(anyhow!(
2183                     "GicDevice downcast to KvmGicV3Its failed when snapshotting VM!"
2184                 )));
2185             },
2186         );
2187 
2188         Ok(())
2189     }
2190 
2191     #[cfg(target_arch = "aarch64")]
2192     /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing.
2193     fn restore_vgic_and_enable_interrupt(
2194         &self,
2195         vm_snapshot: &Snapshot,
2196     ) -> std::result::Result<(), MigratableError> {
2197         let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
2198         // The number of vCPUs is the same as the number of saved vCPU states.
2199         let vcpu_numbers = saved_vcpu_states.len();
2200 
2201         // Creating a GIC device here, as the GIC will not be created when
2202         // restoring the device manager. Its state (including the ITS) is
2203         // restored from the snapshot further below.
2204         let mut gic_device = create_gic(&self.vm, vcpu_numbers.try_into().unwrap())
2205             .map_err(|e| MigratableError::Restore(anyhow!("Could not create GIC: {:#?}", e)))?;
2206 
2207         // The PMU interrupt is a PPI, so 16 must be added to get the real IRQ number.
2208         self.cpu_manager
2209             .lock()
2210             .unwrap()
2211             .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
2212             .map_err(|e| MigratableError::Restore(anyhow!("Error init PMU: {:?}", e)))?;
2213 
2214         // Here we prepare the GICR_TYPER registers from the restored vCPU states.
2215         gic_device.set_gicr_typers(&saved_vcpu_states);
2216 
2217         let gic_device = Arc::new(Mutex::new(gic_device));
2218         // Update the GIC entity in device manager
2219         self.device_manager
2220             .lock()
2221             .unwrap()
2222             .get_interrupt_controller()
2223             .unwrap()
2224             .lock()
2225             .unwrap()
2226             .set_gic_device(Arc::clone(&gic_device));
2227 
2228         // Restore GIC states.
2229         if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) {
2230             if let Some(gicv3_its) = gic_device
2231                 .lock()
2232                 .unwrap()
2233                 .as_any_concrete_mut()
2234                 .downcast_mut::<KvmGicV3Its>()
2235             {
2236                 gicv3_its.restore(*gicv3_its_snapshot.clone())?;
2237             } else {
2238                 return Err(MigratableError::Restore(anyhow!(
2239                     "GicDevice downcast to KvmGicV3Its failed when restoring VM!"
2240                 )));
2241             };
2242         } else {
2243             return Err(MigratableError::Restore(anyhow!(
2244                 "Missing GicV3Its snapshot"
2245             )));
2246         }
2247 
2248         // Activate gic device
2249         self.device_manager
2250             .lock()
2251             .unwrap()
2252             .get_interrupt_controller()
2253             .unwrap()
2254             .lock()
2255             .unwrap()
2256             .enable()
2257             .map_err(|e| {
2258                 MigratableError::Restore(anyhow!(
2259                     "Could not enable interrupt controller routing: {:#?}",
2260                     e
2261                 ))
2262             })?;
2263 
2264         Ok(())
2265     }
2266 
2267     /// Gets the actual size of the balloon.
2268     pub fn balloon_size(&self) -> u64 {
2269         self.device_manager.lock().unwrap().balloon_size()
2270     }
2271 
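         // Destination side of a live migration: reads each memory range
         // advertised in the MemoryRangeTable from `fd` and writes it directly
         // into guest memory at the corresponding guest physical address.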
2272     pub fn receive_memory_regions<F>(
2273         &mut self,
2274         ranges: &MemoryRangeTable,
2275         fd: &mut F,
2276     ) -> std::result::Result<(), MigratableError>
2277     where
2278         F: Read,
2279     {
2280         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2281         let mem = guest_memory.memory();
2282 
2283         for range in ranges.regions() {
2284             let mut offset: u64 = 0;
2285             // Here we manually handle the retry in case we can't read the
2286             // whole region at once, because we can't use the implementation
2287             // of read_exact_from() from vm-memory::GuestMemory as it does not
2288             // follow the correct behavior. For more info about this issue
2289             // see: https://github.com/rust-vmm/vm-memory/issues/174
2290             loop {
2291                 let bytes_read = mem
2292                     .read_from(
2293                         GuestAddress(range.gpa + offset),
2294                         fd,
2295                         (range.length - offset) as usize,
2296                     )
2297                     .map_err(|e| {
2298                         MigratableError::MigrateReceive(anyhow!(
2299                             "Error receiving memory from socket: {}",
2300                             e
2301                         ))
2302                     })?;
2303                 offset += bytes_read as u64;
2304 
2305                 if offset == range.length {
2306                     break;
2307                 }
2308             }
2309         }
2310 
2311         Ok(())
2312     }
2313 
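         // Used for local migrations: sends the file descriptor backing every
         // guest memory slot to the destination over the UNIX socket (the slot
         // id as payload, the fd attached via send_with_fd), aborting the
         // migration if the peer does not acknowledge a transfer.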
2314     pub fn send_memory_fds(
2315         &mut self,
2316         socket: &mut UnixStream,
2317     ) -> std::result::Result<(), MigratableError> {
2318         for (slot, fd) in self
2319             .memory_manager
2320             .lock()
2321             .unwrap()
2322             .memory_slot_fds()
2323             .drain()
2324         {
2325             Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2326                 .write_to(socket)
2327                 .map_err(|e| {
2328                     MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2329                 })?;
2330             socket
2331                 .send_with_fd(&slot.to_le_bytes()[..], fd)
2332                 .map_err(|e| {
2333                     MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2334                 })?;
2335 
2336             let res = Response::read_from(socket)?;
2337             if res.status() != Status::Ok {
2338                 warn!("Error during memory fd migration");
2339                 Request::abandon().write_to(socket)?;
2340                 Response::read_from(socket).ok();
2341                 return Err(MigratableError::MigrateSend(anyhow!(
2342                     "Error during memory fd migration"
2343                 )));
2344             }
2345         }
2346 
2347         Ok(())
2348     }
2349 
2350     pub fn send_memory_regions<F>(
2351         &mut self,
2352         ranges: &MemoryRangeTable,
2353         fd: &mut F,
2354     ) -> std::result::Result<(), MigratableError>
2355     where
2356         F: Write,
2357     {
2358         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2359         let mem = guest_memory.memory();
2360 
2361         for range in ranges.regions() {
2362             let mut offset: u64 = 0;
2363             // Here we manually handle the retry in case we can't write the
2364             // whole region at once, because we can't use the implementation
2365             // of write_all_to() from vm-memory::GuestMemory as it does not
2366             // follow the correct behavior. For more info about this issue
2367             // see: https://github.com/rust-vmm/vm-memory/issues/174
2368             loop {
2369                 let bytes_written = mem
2370                     .write_to(
2371                         GuestAddress(range.gpa + offset),
2372                         fd,
2373                         (range.length - offset) as usize,
2374                     )
2375                     .map_err(|e| {
2376                         MigratableError::MigrateSend(anyhow!(
2377                             "Error transferring memory to socket: {}",
2378                             e
2379                         ))
2380                     })?;
2381                 offset += bytes_written as u64;
2382 
2383                 if offset == range.length {
2384                     break;
2385                 }
2386             }
2387         }
2388 
2389         Ok(())
2390     }
2391 
2392     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2393         self.memory_manager
2394             .lock()
2395             .unwrap()
2396             .memory_range_table(false)
2397     }
2398 
2399     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2400         self.device_manager.lock().unwrap().device_tree()
2401     }
2402 
2403     pub fn activate_virtio_devices(&self) -> Result<()> {
2404         self.device_manager
2405             .lock()
2406             .unwrap()
2407             .activate_virtio_devices()
2408             .map_err(Error::ActivateVirtioDevices)
2409     }
2410 
2411     #[cfg(target_arch = "x86_64")]
2412     pub fn power_button(&self) -> Result<()> {
2413         self
2414             .device_manager
2415             .lock()
2416             .unwrap()
2417             .notify_power_button()
2418             .map_err(Error::PowerButton)
2419     }
2420 
2421     #[cfg(target_arch = "aarch64")]
2422     pub fn power_button(&self) -> Result<()> {
2423         self.device_manager
2424             .lock()
2425             .unwrap()
2426             .notify_power_button()
2427             .map_err(Error::PowerButton)
2428     }
2429 
2430     pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2431         self.memory_manager.lock().unwrap().snapshot_data()
2432     }
2433 
2434     #[cfg(all(target_arch = "x86_64", feature = "gdb"))]
2435     pub fn debug_request(
2436         &mut self,
2437         gdb_request: &GdbRequestPayload,
2438         cpu_id: usize,
2439     ) -> Result<GdbResponsePayload> {
2440         use GdbRequestPayload::*;
2441         match gdb_request {
2442             SetSingleStep(single_step) => {
2443                 self.set_guest_debug(cpu_id, &[], *single_step)
2444                     .map_err(Error::Debug)?;
2445             }
2446             SetHwBreakPoint(addrs) => {
2447                 self.set_guest_debug(cpu_id, addrs, false)
2448                     .map_err(Error::Debug)?;
2449             }
2450             Pause => {
2451                 self.debug_pause().map_err(Error::Debug)?;
2452             }
2453             Resume => {
2454                 self.debug_resume().map_err(Error::Debug)?;
2455             }
2456             ReadRegs => {
2457                 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2458                 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2459             }
2460             WriteRegs(regs) => {
2461                 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2462             }
2463             ReadMem(vaddr, len) => {
2464                 let mem = self.read_mem(cpu_id, *vaddr, *len).map_err(Error::Debug)?;
2465                 return Ok(GdbResponsePayload::MemoryRegion(mem));
2466             }
2467             WriteMem(vaddr, data) => {
2468                 self.write_mem(cpu_id, vaddr, data).map_err(Error::Debug)?;
2469             }
2470             ActiveVcpus => {
2471                 let active_vcpus = self.active_vcpus();
2472                 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2473             }
2474         }
2475         Ok(GdbResponsePayload::CommandComplete)
2476     }
2477 }
2478 
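     // Pausing saves the KVM clock on x86_64 and activates any virtio devices
     // still pending activation before stopping the vCPUs and devices; resuming
     // restarts the vCPUs, restores the saved clock and resumes the devices.
     //
     // Illustrative sketch (not part of this source) of the typical flow around
     // a snapshot:
     //
     //     vm.pause()?;                // Running -> Paused
     //     let snap = vm.snapshot()?;  // only allowed while Paused
     //     vm.resume()?;               // Paused -> Running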
2479 impl Pausable for Vm {
2480     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2481         event!("vm", "pausing");
2482         let mut state = self
2483             .state
2484             .try_write()
2485             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2486         let new_state = VmState::Paused;
2487 
2488         state
2489             .valid_transition(new_state)
2490             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2491 
2492         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2493         {
2494             let mut clock = self
2495                 .vm
2496                 .get_clock()
2497                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2498             // Reset clock flags.
2499             clock.flags = 0;
2500             self.saved_clock = Some(clock);
2501         }
2502 
2503         // Before pausing the vCPUs, activate any virtio devices that are still
2504         // pending activation since this pause (or the migration it is part of) started.
2505         self.activate_virtio_devices().map_err(|e| {
2506             MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2507         })?;
2508 
2509         self.cpu_manager.lock().unwrap().pause()?;
2510         self.device_manager.lock().unwrap().pause()?;
2511 
2512         *state = new_state;
2513 
2514         event!("vm", "paused");
2515         Ok(())
2516     }
2517 
2518     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2519         event!("vm", "resuming");
2520         let mut state = self
2521             .state
2522             .try_write()
2523             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2524         let new_state = VmState::Running;
2525 
2526         state
2527             .valid_transition(new_state)
2528             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2529 
2530         self.cpu_manager.lock().unwrap().resume()?;
2531         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2532         {
2533             if let Some(clock) = &self.saved_clock {
2534                 self.vm.set_clock(clock).map_err(|e| {
2535                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2536                 })?;
2537             }
2538         }
2539         self.device_manager.lock().unwrap().resume()?;
2540 
2541         // And we're back to the Running state.
2542         *state = new_state;
2543         event!("vm", "resumed");
2544         Ok(())
2545     }
2546 }
2547 
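     // State stored in the "vm-section" data section of the VM snapshot: the
     // hypervisor VM state plus, on x86_64 with KVM, the saved clock and the
     // common CPUID captured at snapshot time.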
2548 #[derive(Serialize, Deserialize)]
2549 pub struct VmSnapshot {
2550     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2551     pub clock: Option<hypervisor::ClockData>,
2552     pub state: Option<hypervisor::VmState>,
2553     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2554     pub common_cpuid: hypervisor::CpuId,
2555 }
2556 
2557 pub const VM_SNAPSHOT_ID: &str = "vm";
2558 impl Snapshottable for Vm {
2559     fn id(&self) -> String {
2560         VM_SNAPSHOT_ID.to_string()
2561     }
2562 
2563     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2564         event!("vm", "snapshotting");
2565 
2566         #[cfg(feature = "tdx")]
2567         {
2568             if self.config.lock().unwrap().tdx.is_some() {
2569                 return Err(MigratableError::Snapshot(anyhow!(
2570                     "Snapshot not possible with TDX VM"
2571                 )));
2572             }
2573         }
2574 
2575         let current_state = self.get_state().unwrap();
2576         if current_state != VmState::Paused {
2577             return Err(MigratableError::Snapshot(anyhow!(
2578                 "Trying to snapshot while VM is running"
2579             )));
2580         }
2581 
2582         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2583         let common_cpuid = {
2584             #[cfg(feature = "tdx")]
2585             let tdx_enabled = self.config.lock().unwrap().tdx.is_some();
2586             let phys_bits = physical_bits(self.config.lock().unwrap().cpus.max_phys_bits);
2587             arch::generate_common_cpuid(
2588                 self.hypervisor.clone(),
2589                 None,
2590                 None,
2591                 phys_bits,
2592                 self.config.lock().unwrap().cpus.kvm_hyperv,
2593                 #[cfg(feature = "tdx")]
2594                 tdx_enabled,
2595             )
2596             .map_err(|e| {
2597                 MigratableError::Snapshot(anyhow!("Error generating common cpuid: {:?}", e))
2598             })?
2599         };
2600 
2601         let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID);
2602         let vm_state = self
2603             .vm
2604             .state()
2605             .map_err(|e| MigratableError::Snapshot(e.into()))?;
2606         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2607             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2608             clock: self.saved_clock,
2609             state: Some(vm_state),
2610             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2611             common_cpuid,
2612         })
2613         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2614 
2615         vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?);
2616         vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?);
2617 
2618         #[cfg(target_arch = "aarch64")]
2619         self.add_vgic_snapshot_section(&mut vm_snapshot)
2620             .map_err(|e| MigratableError::Snapshot(e.into()))?;
2621 
2622         vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?);
2623         vm_snapshot.add_data_section(SnapshotDataSection {
2624             id: format!("{}-section", VM_SNAPSHOT_ID),
2625             snapshot: vm_snapshot_data,
2626         });
2627 
2628         event!("vm", "snapshotted");
2629         Ok(vm_snapshot)
2630     }
2631 
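         // Restore order matters here: memory manager first, then CPU manager,
         // then device manager (and the vGIC on aarch64), after which the devices
         // themselves are restored and the vCPUs are started again.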
2632     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
2633         event!("vm", "restoring");
2634 
2635         let current_state = self
2636             .get_state()
2637             .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?;
2638         let new_state = VmState::Paused;
2639         current_state.valid_transition(new_state).map_err(|e| {
2640             MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e))
2641         })?;
2642 
2643         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2644         self.load_clock_from_snapshot(&snapshot)
2645             .map_err(|e| MigratableError::Restore(anyhow!("Error restoring clock: {:?}", e)))?;
2646 
2647         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2648             self.memory_manager
2649                 .lock()
2650                 .unwrap()
2651                 .restore(*memory_manager_snapshot.clone())?;
2652         } else {
2653             return Err(MigratableError::Restore(anyhow!(
2654                 "Missing memory manager snapshot"
2655             )));
2656         }
2657 
2658         if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) {
2659             self.cpu_manager
2660                 .lock()
2661                 .unwrap()
2662                 .restore(*cpu_manager_snapshot.clone())?;
2663         } else {
2664             return Err(MigratableError::Restore(anyhow!(
2665                 "Missing CPU manager snapshot"
2666             )));
2667         }
2668 
2669         if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
2670             self.device_manager
2671                 .lock()
2672                 .unwrap()
2673                 .restore(*device_manager_snapshot.clone())?;
2674         } else {
2675             return Err(MigratableError::Restore(anyhow!(
2676                 "Missing device manager snapshot"
2677             )));
2678         }
2679 
2680         #[cfg(target_arch = "aarch64")]
2681         self.restore_vgic_and_enable_interrupt(&snapshot)?;
2682 
2683         if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
2684             self.device_manager
2685                 .lock()
2686                 .unwrap()
2687                 .restore_devices(*device_manager_snapshot.clone())?;
2688         } else {
2689             return Err(MigratableError::Restore(anyhow!(
2690                 "Missing device manager snapshot"
2691             )));
2692         }
2693 
2694         // Now we can start all vCPUs from here.
2695         self.cpu_manager
2696             .lock()
2697             .unwrap()
2698             .start_restored_vcpus()
2699             .map_err(|e| {
2700                 MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e))
2701             })?;
2702 
2703         self.setup_signal_handler().map_err(|e| {
2704             MigratableError::Restore(anyhow!("Could not setup signal handler: {:#?}", e))
2705         })?;
2706         self.setup_tty()
2707             .map_err(|e| MigratableError::Restore(anyhow!("Could not setup tty: {:#?}", e)))?;
2708 
2709         let mut state = self
2710             .state
2711             .try_write()
2712             .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?;
2713         *state = new_state;
2714 
2715         event!("vm", "restored");
2716         Ok(())
2717     }
2718 }
2719 
2720 impl Transportable for Vm {
2721     fn send(
2722         &self,
2723         snapshot: &Snapshot,
2724         destination_url: &str,
2725     ) -> std::result::Result<(), MigratableError> {
2726         let mut snapshot_config_path = url_to_path(destination_url)?;
2727         snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2728 
2729         // Create the snapshot config file
2730         let mut snapshot_config_file = OpenOptions::new()
2731             .read(true)
2732             .write(true)
2733             .create_new(true)
2734             .open(snapshot_config_path)
2735             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2736 
2737         // Serialize and write the snapshot config
2738         let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2739             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2740 
2741         snapshot_config_file
2742             .write_all(vm_config.as_bytes())
2743             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2744 
2745         let mut snapshot_state_path = url_to_path(destination_url)?;
2746         snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2747 
2748         // Create the snapshot state file
2749         let mut snapshot_state_file = OpenOptions::new()
2750             .read(true)
2751             .write(true)
2752             .create_new(true)
2753             .open(snapshot_state_path)
2754             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2755 
2756         // Serialize and write the snapshot state
2757         let vm_state =
2758             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2759 
2760         snapshot_state_file
2761             .write_all(&vm_state)
2762             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2763 
2764         // Tell the memory manager to also send/write its own snapshot.
2765         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2766             self.memory_manager
2767                 .lock()
2768                 .unwrap()
2769                 .send(&*memory_manager_snapshot.clone(), destination_url)?;
2770         } else {
2771             return Err(MigratableError::MigrateSend(anyhow!(
2772                 "Missing memory manager snapshot"
2773             )));
2774         }
2775 
2776         Ok(())
2777     }
2778 }
2779 
2780 impl Migratable for Vm {
2781     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2782         self.memory_manager.lock().unwrap().start_dirty_log()?;
2783         self.device_manager.lock().unwrap().start_dirty_log()
2784     }
2785 
2786     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2787         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2788         self.device_manager.lock().unwrap().stop_dirty_log()
2789     }
2790 
2791     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2792         Ok(MemoryRangeTable::new_from_tables(vec![
2793             self.memory_manager.lock().unwrap().dirty_log()?,
2794             self.device_manager.lock().unwrap().dirty_log()?,
2795         ]))
2796     }
2797 
2798     fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2799         self.memory_manager.lock().unwrap().start_migration()?;
2800         self.device_manager.lock().unwrap().start_migration()
2801     }
2802 
2803     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2804         self.memory_manager.lock().unwrap().complete_migration()?;
2805         self.device_manager.lock().unwrap().complete_migration()
2806     }
2807 }
2808 
2809 #[cfg(feature = "gdb")]
2810 impl Debuggable for Vm {
2811     fn set_guest_debug(
2812         &self,
2813         cpu_id: usize,
2814         addrs: &[GuestAddress],
2815         singlestep: bool,
2816     ) -> std::result::Result<(), DebuggableError> {
2817         self.cpu_manager
2818             .lock()
2819             .unwrap()
2820             .set_guest_debug(cpu_id, addrs, singlestep)
2821     }
2822 
2823     fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2824         if !self.cpu_manager.lock().unwrap().vcpus_paused() {
2825             self.pause().map_err(DebuggableError::Pause)?;
2826         }
2827         let mut state = self
2828             .state
2829             .try_write()
2830             .map_err(|_| DebuggableError::PoisonedState)?;
2831         *state = VmState::BreakPoint;
2832         Ok(())
2833     }
2834 
2835     fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2836         if !self.cpu_manager.lock().unwrap().vcpus_paused() {
2837             self.cpu_manager
2838                 .lock()
2839                 .unwrap()
2840                 .start_boot_vcpus()
2841                 .map_err(|e| {
2842                     DebuggableError::Resume(MigratableError::Resume(anyhow!(
2843                         "Could not start boot vCPUs: {:?}",
2844                         e
2845                     )))
2846                 })?;
2847         } else {
2848             self.resume().map_err(DebuggableError::Resume)?;
2849         }
2850         let mut state = self
2851             .state
2852             .try_write()
2853             .map_err(|_| DebuggableError::PoisonedState)?;
2854         *state = VmState::Running;
2855         Ok(())
2856     }
2857 
2858     fn read_regs(&self, cpu_id: usize) -> std::result::Result<X86_64CoreRegs, DebuggableError> {
2859         self.cpu_manager.lock().unwrap().read_regs(cpu_id)
2860     }
2861 
2862     fn write_regs(
2863         &self,
2864         cpu_id: usize,
2865         regs: &X86_64CoreRegs,
2866     ) -> std::result::Result<(), DebuggableError> {
2867         self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
2868     }
2869 
2870     fn read_mem(
2871         &self,
2872         cpu_id: usize,
2873         vaddr: GuestAddress,
2874         len: usize,
2875     ) -> std::result::Result<Vec<u8>, DebuggableError> {
2876         self.cpu_manager
2877             .lock()
2878             .unwrap()
2879             .read_mem(cpu_id, vaddr, len)
2880     }
2881 
2882     fn write_mem(
2883         &self,
2884         cpu_id: usize,
2885         vaddr: &GuestAddress,
2886         data: &[u8],
2887     ) -> std::result::Result<(), DebuggableError> {
2888         self.cpu_manager
2889             .lock()
2890             .unwrap()
2891             .write_mem(cpu_id, vaddr, data)
2892     }
2893 
2894     fn active_vcpus(&self) -> usize {
2895         let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
2896         if active_vcpus > 0 {
2897             active_vcpus
2898         } else {
2899             // The VM is not booted yet. Report boot_vcpus() instead.
2900             self.cpu_manager.lock().unwrap().boot_vcpus() as usize
2901         }
2902     }
2903 }
2904 
2905 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2906 #[cfg(test)]
2907 mod tests {
2908     use super::*;
2909 
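         // Expected transition matrix exercised below ("ok" = valid transition,
         // "-" = rejected; rows are the current state, columns the target):
         //
         //                Created  Running  Shutdown  Paused  BreakPoint
         //   Created         -        ok       -        ok        ok
         //   Running         -        -        ok       ok        ok
         //   Shutdown        -        ok       -        -         -
         //   Paused          -        ok       ok       -         -
         //   BreakPoint      ok       ok       -        -         -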
2910     fn test_vm_state_transitions(state: VmState) {
2911         match state {
2912             VmState::Created => {
2913                 // Check the transitions from Created
2914                 assert!(state.valid_transition(VmState::Created).is_err());
2915                 assert!(state.valid_transition(VmState::Running).is_ok());
2916                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2917                 assert!(state.valid_transition(VmState::Paused).is_ok());
2918                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2919             }
2920             VmState::Running => {
2921                 // Check the transitions from Running
2922                 assert!(state.valid_transition(VmState::Created).is_err());
2923                 assert!(state.valid_transition(VmState::Running).is_err());
2924                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2925                 assert!(state.valid_transition(VmState::Paused).is_ok());
2926                 assert!(state.valid_transition(VmState::BreakPoint).is_ok());
2927             }
2928             VmState::Shutdown => {
2929                 // Check the transitions from Shutdown
2930                 assert!(state.valid_transition(VmState::Created).is_err());
2931                 assert!(state.valid_transition(VmState::Running).is_ok());
2932                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2933                 assert!(state.valid_transition(VmState::Paused).is_err());
2934                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2935             }
2936             VmState::Paused => {
2937                 // Check the transitions from Paused
2938                 assert!(state.valid_transition(VmState::Created).is_err());
2939                 assert!(state.valid_transition(VmState::Running).is_ok());
2940                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2941                 assert!(state.valid_transition(VmState::Paused).is_err());
2942                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2943             }
2944             VmState::BreakPoint => {
2945                 // Check the transitions from BreakPoint
2946                 assert!(state.valid_transition(VmState::Created).is_ok());
2947                 assert!(state.valid_transition(VmState::Running).is_ok());
2948                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2949                 assert!(state.valid_transition(VmState::Paused).is_err());
2950                 assert!(state.valid_transition(VmState::BreakPoint).is_err());
2951             }
2952         }
2953     }
2954 
2955     #[test]
2956     fn test_vm_created_transitions() {
2957         test_vm_state_transitions(VmState::Created);
2958     }
2959 
2960     #[test]
2961     fn test_vm_running_transitions() {
2962         test_vm_state_transitions(VmState::Running);
2963     }
2964 
2965     #[test]
2966     fn test_vm_shutdown_transitions() {
2967         test_vm_state_transitions(VmState::Shutdown);
2968     }
2969 
2970     #[test]
2971     fn test_vm_paused_transitions() {
2972         test_vm_state_transitions(VmState::Paused);
2973     }
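
    // The helper above also covers the BreakPoint state; exercise that arm too.
    #[test]
    fn test_vm_breakpoint_transitions() {
        test_vm_state_transitions(VmState::BreakPoint);
    }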
2974 }
2975 
2976 #[cfg(target_arch = "aarch64")]
2977 #[cfg(test)]
2978 mod tests {
2979     use super::*;
2980     use crate::GuestMemoryMmap;
2981     use arch::aarch64::fdt::create_fdt;
2982     use arch::aarch64::gic::kvm::create_gic;
2983     use arch::aarch64::layout;
2984     use arch::{DeviceType, MmioDeviceInfo};
2985 
2986     const LEN: u64 = 4096;
2987 
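    // Builds a minimal guest (one RAM region large enough for the FDT, three
    // MMIO devices and a single-vCPU GIC) and checks that FDT creation succeeds.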
2988     #[test]
2989     fn test_create_fdt_with_devices() {
2990         let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
2991         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
2992 
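        // Three MMIO devices packed into consecutive LEN-sized windows: a serial
        // port at 0x0, a virtio device at LEN and an RTC at 2 * LEN, wired to
        // SPIs 33, 34 and 35 respectively.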
2993         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
2994             (
2995                 (DeviceType::Serial, DeviceType::Serial.to_string()),
2996                 MmioDeviceInfo {
2997                     addr: 0x00,
2998                     len: LEN,
2999                     irq: 33,
3000                 },
3001             ),
3002             (
3003                 (DeviceType::Virtio(1), "virtio".to_string()),
3004                 MmioDeviceInfo {
3005                     addr: LEN,
3006                     len: LEN,
3007                     irq: 34,
3008                 },
3009             ),
3010             (
3011                 (DeviceType::Rtc, "rtc".to_string()),
3012                 MmioDeviceInfo {
3013                     addr: 2 * LEN,
3014                     len: LEN,
3015                     irq: 35,
3016                 },
3017             ),
3018         ]
3019         .iter()
3020         .cloned()
3021         .collect();
3022 
3023         let hv = hypervisor::new().unwrap();
3024         let vm = hv.create_vm().unwrap();
3025         let gic = create_gic(&vm, 1).unwrap();
3026         assert!(create_fdt(
3027             &mem,
3028             "console=tty0",
3029             vec![0],
3030             Some((0, 0, 0)),
3031             &dev_info,
3032             &*gic,
3033             &None,
3034             &Vec::new(),
3035             &BTreeMap::new(),
3036             None,
3037             true,
3038         )
3039         .is_ok())
3040     }
3041 }
3042 
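// Smoke test for the hypervisor abstraction: load a handful of real-mode
// instructions into guest memory, run them on a single vCPU and check the
// resulting port I/O and HLT exits.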
3043 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
3044 #[test]
3045 pub fn test_vm() {
3046     use hypervisor::VmExit;
3047     use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
3048     // This example is based on https://lwn.net/Articles/658511/
3049     let code = [
3050         0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
3051         0x00, 0xd8, /* add %bl, %al */
3052         0x04, b'0', /* add $'0', %al */
3053         0xee, /* out %al, (%dx) */
3054         0xb0, b'\n', /* mov $'\n', %al */
3055         0xee,  /* out %al, (%dx) */
3056         0xf4,  /* hlt */
3057     ];
3058 
3059     let mem_size = 0x1000;
3060     let load_addr = GuestAddress(0x1000);
3061     let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();
3062 
3063     let hv = hypervisor::new().unwrap();
3064     let vm = hv.create_vm().expect("new VM creation failed");
3065 
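    // Hand every guest memory region to the hypervisor as a user memory region
    // so the code page written below is backed by host memory. The two boolean
    // flags leave the region writable and without dirty-page logging.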
3066     for (index, region) in mem.iter().enumerate() {
3067         let mem_region = vm.make_user_memory_region(
3068             index as u32,
3069             region.start_addr().raw_value(),
3070             region.len() as u64,
3071             region.as_ptr() as u64,
3072             false,
3073             false,
3074         );
3075 
3076         vm.create_user_memory_region(mem_region)
3077             .expect("Cannot configure guest memory");
3078     }
3079     mem.write_slice(&code, load_addr)
3080         .expect("Writing code to memory failed");
3081 
3082     let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");
3083 
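    // Clear the code segment base and selector so that, in real mode, execution
    // starts exactly at the RIP programmed below.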
3084     let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
3085     vcpu_sregs.cs.base = 0;
3086     vcpu_sregs.cs.selector = 0;
3087     vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");
3088 
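    // rip points at the loaded code; rax = 2 and rbx = 3 are the operands of the
    // add, so the guest prints '5' followed by a newline on port 0x3f8. Bit 1 of
    // rflags is reserved and must always be set.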
3089     let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
3090     vcpu_regs.rip = 0x1000;
3091     vcpu_regs.rax = 2;
3092     vcpu_regs.rbx = 3;
3093     vcpu_regs.rflags = 2;
3094     vcpu.set_regs(&vcpu_regs).expect("set regs failed");
3095 
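    // Run the vCPU until it halts: the two port writes surface as IoOut exits,
    // and the final hlt is reported as a Reset exit, which the match below
    // treats as the end of the test.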
3096     loop {
3097         match vcpu.run().expect("run failed") {
3098             VmExit::IoOut(addr, data) => {
3099                 println!(
3100                     "IO out -- addr: {:#x} data [{:?}]",
3101                     addr,
3102                     str::from_utf8(data).unwrap()
3103                 );
3104             }
3105             VmExit::Reset => {
3106                 println!("HLT");
3107                 break;
3108             }
3109             r => panic!("unexpected exit reason: {:?}", r),
3110         }
3111     }
3112 }
3113