xref: /cloud-hypervisor/vmm/src/vm.rs (revision f67b3f79ea19c9a66e04074cbbf5d292f6529e43)
// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

#[cfg(any(target_arch = "aarch64", feature = "acpi"))]
use crate::config::NumaConfig;
use crate::config::{
    ConsoleOutputMode, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
    UserDeviceConfig, ValidationError, VmConfig, VsockConfig,
};
use crate::device_manager::{self, Console, DeviceManager, DeviceManagerError, PtyPair};
use crate::device_tree::DeviceTree;
use crate::memory_manager::{Error as MemoryManagerError, MemoryManager};
use crate::migration::{get_vm_snapshot, url_to_path, VM_SNAPSHOT_FILE};
use crate::seccomp_filters::{get_seccomp_filter, Thread};
use crate::GuestMemoryMmap;
use crate::{cpu, EpollDispatch};
use crate::{
    PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
};
use anyhow::anyhow;
use arch::get_host_cpu_phys_bits;
#[cfg(feature = "tdx")]
use arch::x86_64::tdx::TdvfSection;
use arch::EntryPoint;
#[cfg(any(target_arch = "aarch64", feature = "acpi"))]
use arch::{NumaNode, NumaNodes};
use devices::AcpiNotificationFlags;
use hypervisor::vm::{HypervisorVmError, VmmOps};
use linux_loader::cmdline::Cmdline;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
#[cfg(target_arch = "aarch64")]
use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
use linux_loader::loader::KernelLoader;
use seccompiler::{apply_filter, SeccompAction};
use signal_hook::{
    consts::{SIGINT, SIGTERM, SIGWINCH},
    iterator::backend::Handle,
    iterator::Signals,
};
use std::cmp;
#[cfg(any(target_arch = "aarch64", feature = "acpi"))]
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::convert::TryInto;
use std::ffi::CString;
#[cfg(target_arch = "x86_64")]
use std::fmt;
use std::fs::{File, OpenOptions};
use std::io::{self, Read, Write};
use std::io::{Seek, SeekFrom};
use std::num::Wrapping;
use std::ops::Deref;
use std::panic::AssertUnwindSafe;
use std::sync::{Arc, Mutex, RwLock};
use std::{result, str, thread};
use vm_device::Bus;
use vm_memory::{
    Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
    GuestMemoryRegion,
};
use vm_migration::{
    protocol::{MemoryRange, MemoryRangeTable},
    Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::unblock_signal;
use vmm_sys_util::terminal::Terminal;

#[cfg(target_arch = "aarch64")]
use arch::aarch64::gic::gicv3_its::kvm::{KvmGicV3Its, GIC_V3_ITS_SNAPSHOT_ID};
#[cfg(target_arch = "aarch64")]
use arch::aarch64::gic::kvm::create_gic;
#[cfg(target_arch = "aarch64")]
use devices::interrupt_controller::{self, InterruptController};

/// Errors associated with VM management
#[derive(Debug)]
pub enum Error {
    /// Cannot open the kernel image
    KernelFile(io::Error),

    /// Cannot open the initramfs image
    InitramfsFile(io::Error),

    /// Cannot load the kernel in memory
    KernelLoad(linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    /// Cannot load the UEFI binary in memory
    UefiLoad(arch::aarch64::uefi::Error),

    /// Cannot load the initramfs in memory
    InitramfsLoad,

    /// Cannot load the command line in memory
    LoadCmdLine(linux_loader::loader::Error),

    /// Cannot modify the command line
    CmdLineInsertStr(linux_loader::cmdline::Error),

    /// Cannot convert command line into CString
    CmdLineCString(std::ffi::NulError),

    /// Cannot configure system
    ConfigureSystem(arch::Error),

    /// Cannot enable interrupt controller
    #[cfg(target_arch = "aarch64")]
    EnableInterruptController(interrupt_controller::Error),

    /// The VM state lock is poisoned
    PoisonedState,

    /// Cannot create a device manager.
    DeviceManager(DeviceManagerError),

    /// Write to the console failed.
    Console(vmm_sys_util::errno::Error),

    /// Write to the pty console failed.
    PtyConsole(io::Error),

    /// Cannot set up terminal in raw mode.
    SetTerminalRaw(vmm_sys_util::errno::Error),

    /// Cannot set up terminal in canonical mode.
    SetTerminalCanon(vmm_sys_util::errno::Error),

    /// Memory overflow
    MemOverflow,

    /// Cannot spawn a signal handler thread
    SignalHandlerSpawn(io::Error),

    /// Failed to join on vCPU threads
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    /// VM config is missing.
    VmMissingConfig,

    /// VM is not created
    VmNotCreated,

    /// VM is already created
    VmAlreadyCreated,

    /// VM is not running
    VmNotRunning,

    /// Cannot clone EventFd.
    EventFdClone(io::Error),

    /// Invalid VM state transition
    InvalidStateTransition(VmState, VmState),

    /// Error from CPU handling
    CpuManager(cpu::Error),

    /// Cannot pause devices
    PauseDevices(MigratableError),

    /// Cannot resume devices
    ResumeDevices(MigratableError),

    /// Cannot pause CPUs
    PauseCpus(MigratableError),

    /// Cannot resume CPUs
    ResumeCpus(MigratableError),

    /// Cannot pause VM
    Pause(MigratableError),

    /// Cannot resume VM
    Resume(MigratableError),

    /// Memory manager error
    MemoryManager(MemoryManagerError),

    /// Eventfd write error
    EventfdError(std::io::Error),

    /// Cannot snapshot VM
    Snapshot(MigratableError),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot send VM snapshot
    SnapshotSend(MigratableError),

    /// Cannot convert source URL from Path into &str
    RestoreSourceUrlPathToStr,

    /// Failed to validate config
    ConfigValidation(ValidationError),

    /// No more than one virtio-vsock device is allowed
    TooManyVsockDevices,

    /// Failed serializing into JSON
    SerializeJson(serde_json::Error),

    /// Invalid configuration for NUMA.
    InvalidNumaConfig,

    /// Cannot create seccomp filter
    CreateSeccompFilter(seccompiler::Error),

    /// Cannot apply seccomp filter
    ApplySeccompFilter(seccompiler::Error),

    /// Failed resizing a memory zone.
    ResizeZone,

    /// Cannot activate virtio devices
    ActivateVirtioDevices(device_manager::DeviceManagerError),

    /// Power button not supported
    PowerButtonNotSupported,

    /// Error triggering power button
    PowerButton(device_manager::DeviceManagerError),

    /// Kernel lacks PVH header
    KernelMissingPvhHeader,

    /// Error doing I/O on TDX firmware file
    #[cfg(feature = "tdx")]
    LoadTdvf(std::io::Error),

    /// Error parsing TDVF
    #[cfg(feature = "tdx")]
    ParseTdvf(arch::x86_64::tdx::TdvfError),

    /// Error populating HOB
    #[cfg(feature = "tdx")]
    PopulateHob(arch::x86_64::tdx::TdvfError),

    /// Error allocating TDVF memory
    #[cfg(feature = "tdx")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    /// Error enabling TDX VM
    #[cfg(feature = "tdx")]
    InitializeTdxVm(hypervisor::HypervisorVmError),

    /// Error enabling TDX memory region
    #[cfg(feature = "tdx")]
    InitializeTdxMemoryRegion(hypervisor::HypervisorVmError),

    /// Error finalizing TDX setup
    #[cfg(feature = "tdx")]
    FinalizeTdx(hypervisor::HypervisorVmError),
}
pub type Result<T> = result::Result<T, Error>;

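/// The lifecycle states of a VM, as tracked by the VMM.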
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
}

impl VmState {
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Paused => Ok(()),
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
        }
    }
}
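// Illustrative checks against the transition table above:
//
//     assert!(VmState::Created.valid_transition(VmState::Running).is_ok());
//     assert!(VmState::Running.valid_transition(VmState::Paused).is_ok());
//     assert!(VmState::Created.valid_transition(VmState::Shutdown).is_err());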

// Debug I/O port
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT: u16 = 0x80;
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT_PREFIX: &str = "Debug I/O port";

#[cfg(target_arch = "x86_64")]
/// Debug I/O port, see:
/// https://www.intel.com/content/www/us/en/support/articles/000005500/boards-and-kits.html
///
/// Since we're not a physical platform, we can freely assign code ranges for
/// debugging specific parts of our virtual platform.
pub enum DebugIoPortRange {
    Firmware,
    Bootloader,
    Kernel,
    Userspace,
    Custom,
}
#[cfg(target_arch = "x86_64")]
impl DebugIoPortRange {
    fn from_u8(value: u8) -> DebugIoPortRange {
        match value {
            0x00..=0x1f => DebugIoPortRange::Firmware,
            0x20..=0x3f => DebugIoPortRange::Bootloader,
            0x40..=0x5f => DebugIoPortRange::Kernel,
            0x60..=0x7f => DebugIoPortRange::Userspace,
            _ => DebugIoPortRange::Custom,
        }
    }
}
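// E.g. a debug code of 0x28 falls within 0x20..=0x3f and is therefore
// reported as DebugIoPortRange::Bootloader.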

#[cfg(target_arch = "x86_64")]
impl fmt::Display for DebugIoPortRange {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            DebugIoPortRange::Firmware => write!(f, "{}: Firmware", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Bootloader => write!(f, "{}: Bootloader", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Kernel => write!(f, "{}: Kernel", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Userspace => write!(f, "{}: Userspace", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Custom => write!(f, "{}: Custom", DEBUG_IOPORT_PREFIX),
        }
    }
}

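/// The guest memory and port/MMIO buses needed to service VM exits,
/// handed to the hypervisor as its `VmmOps` implementation.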
struct VmOps {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
    #[cfg(target_arch = "x86_64")]
    timestamp: std::time::Instant,
}

impl VmOps {
    #[cfg(target_arch = "x86_64")]
    // Log debug I/O port codes.
    fn log_debug_ioport(&self, code: u8) {
        let elapsed = self.timestamp.elapsed();

        debug!(
            "[{} code 0x{:x}] {}.{:>06} seconds",
            DebugIoPortRange::from_u8(code),
            code,
            elapsed.as_secs(),
            elapsed.as_micros()
        );
    }
}

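// Callbacks invoked on guest exits: plain reads and writes of guest memory,
// plus PIO/MMIO accesses dispatched onto the corresponding device buses.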
impl VmmOps for VmOps {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> hypervisor::vm::Result<usize> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> hypervisor::vm::Result<usize> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            warn!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    fn mmio_write(&self, gpa: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            warn!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
        if port == DEBUG_IOPORT as u64 && data.len() == 1 {
            self.log_debug_ioport(data[0]);
            return Ok(());
        }

        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}

pub fn physical_bits(max_phys_bits: Option<u8>, #[cfg(feature = "tdx")] tdx_enabled: bool) -> u8 {
    #[cfg(not(feature = "tdx"))]
    let host_phys_bits = get_host_cpu_phys_bits();
    #[cfg(feature = "tdx")]
    let mut host_phys_bits = get_host_cpu_phys_bits();

    #[cfg(feature = "tdx")]
    if tdx_enabled {
        // When running a TDX guest, the Guest Physical Address space is
        // limited by a shared bit that is located on bit 47 for 4 level
        // paging, and on bit 51 for 5 level paging (when the GPAW bit is 1).
        // To keep things simple, and since a 47-bit address space is already
        // 128 TiB large, we limit the physical addressable space to 47 bits
        // when running TDX.
        host_phys_bits = std::cmp::min(host_phys_bits, 47)
    }

    cmp::min(host_phys_bits, max_phys_bits.unwrap_or(host_phys_bits))
}
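// For example, on a host reporting 46 physical address bits,
// `physical_bits(None)` yields 46 while `physical_bits(Some(40))` yields 40;
// with the "tdx" feature and TDX enabled, the result is further capped at 47.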

/// Signals handled by the VM: SIGWINCH propagates terminal resizes to the
/// guest console, while SIGTERM and SIGINT trigger an exit.
pub const HANDLED_SIGNALS: [i32; 3] = [SIGWINCH, SIGTERM, SIGINT];

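/// A single virtual machine instance, tying together the device, CPU and
/// memory managers, the hypervisor VM handle and the current `VmState`.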
pub struct Vm {
    kernel: Option<File>,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    on_tty: bool,
    signals: Option<Handle>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(not(feature = "kvm"), allow(dead_code))]
    // The hypervisor-abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,
    #[cfg(any(target_arch = "aarch64", feature = "acpi"))]
    numa_nodes: NumaNodes,
    seccomp_action: SeccompAction,
    exit_evt: EventFd,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
}

impl Vm {
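    /// Common construction path shared by `new`, `new_from_snapshot` and
    /// `new_from_migration`: validates the configuration, builds the NUMA
    /// topology, then brings up the device and CPU managers around an
    /// existing memory manager.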
    #[allow(clippy::too_many_arguments)]
    fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))] _saved_clock: Option<
            hypervisor::ClockData,
        >,
        activate_evt: EventFd,
        restoring: bool,
    ) -> Result<Self> {
        config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        #[cfg(any(target_arch = "aarch64", feature = "acpi"))]
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let force_iommu = config.lock().unwrap().tdx.is_some();
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        let device_manager = DeviceManager::new(
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            &exit_evt,
            &reset_evt,
            seccomp_action.clone(),
            #[cfg(any(target_arch = "aarch64", feature = "acpi"))]
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            restoring,
        )
        .map_err(Error::DeviceManager)?;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::clone(device_manager.lock().unwrap().io_bus());
        let mmio_bus = Arc::clone(device_manager.lock().unwrap().mmio_bus());
        // Create the VmOps structure, which implements the VmmOps trait,
        // and send it to the hypervisor.
        let vm_ops: Arc<dyn VmmOps> = Arc::new(VmOps {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            #[cfg(target_arch = "x86_64")]
            timestamp: std::time::Instant::now(),
        });

        let exit_evt_clone = exit_evt.try_clone().map_err(Error::EventFdClone)?;
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        let cpu_manager = cpu::CpuManager::new(
            &config.lock().unwrap().cpus.clone(),
            &device_manager,
            &memory_manager,
            vm.clone(),
            exit_evt_clone,
            reset_evt,
            hypervisor.clone(),
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            #[cfg(any(target_arch = "aarch64", feature = "acpi"))]
            &numa_nodes,
        )
        .map_err(Error::CpuManager)?;

        let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO as i32) } != 0;
        let kernel = config
            .lock()
            .unwrap()
            .kernel
            .as_ref()
            .map(|k| File::open(&k.path))
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .initramfs
            .as_ref()
            .map(|i| File::open(&i.path))
            .transpose()
            .map_err(Error::InitramfsFile)?;

        Ok(Vm {
            kernel,
            initramfs,
            device_manager,
            config,
            on_tty,
            threads: Vec::with_capacity(1),
            signals: None,
            state: RwLock::new(VmState::Created),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock: _saved_clock,
            #[cfg(any(target_arch = "aarch64", feature = "acpi"))]
            numa_nodes,
            seccomp_action: seccomp_action.clone(),
            exit_evt,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            hypervisor,
        })
    }

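    /// Builds the guest NUMA topology from the `NumaConfig` list, resolving
    /// memory zone (and, on x86_64, SGX EPC section) names against the
    /// memory manager and rejecting duplicate or dangling references.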
    #[cfg(any(target_arch = "aarch64", feature = "acpi"))]
    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Cannot define the same NUMA node twice");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has already been set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

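    /// Creates a brand new VM: allocates the hypervisor VM (TDX-aware when
    /// the feature is enabled), sizes the guest physical address space,
    /// builds the memory manager and creates all devices from scratch.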
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
    ) -> Result<Self> {
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        hypervisor.check_required_extensions().unwrap();
        #[cfg(feature = "tdx")]
        let vm = hypervisor
            .create_vm_with_type(if tdx_enabled {
                2 // KVM_X86_TDX_VM
            } else {
                0 // KVM_X86_LEGACY_VM
            })
            .unwrap();
        #[cfg(not(feature = "tdx"))]
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let phys_bits = physical_bits(
            config.lock().unwrap().cpus.max_phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        );
        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            false,
            phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        )
        .map_err(Error::MemoryManager)?;

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_config) = config.lock().unwrap().sgx_epc.clone() {
                memory_manager
                    .lock()
                    .unwrap()
                    .setup_sgx(sgx_epc_config, &vm)
                    .map_err(Error::MemoryManager)?;
            }
        }

        let new_vm = Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            None,
            activate_evt,
            false,
        )?;

        // The device manager must create the devices from here, as this is
        // part of the regular code path creating everything from scratch.
        new_vm
            .device_manager
            .lock()
            .unwrap()
            .create_devices(serial_pty, console_pty, console_resize_pipe)
            .map_err(Error::DeviceManager)?;
        Ok(new_vm)
    }

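    /// Restores a VM from a snapshot: the memory manager is rebuilt from its
    /// own snapshot (fetched from `source_url`) before the common
    /// construction path runs with `restoring` set.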
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        exit_evt: EventFd,
        reset_evt: EventFd,
        source_url: Option<&str>,
        prefault: bool,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();
        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        let config = vm_snapshot.config;
        if let Some(state) = vm_snapshot.state {
            vm.set_state(state)
                .map_err(|e| Error::Restore(MigratableError::Restore(e.into())))?;
        }

        let memory_manager = if let Some(memory_manager_snapshot) =
            snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
        {
            let phys_bits = physical_bits(
                config.lock().unwrap().cpus.max_phys_bits,
                #[cfg(feature = "tdx")]
                config.lock().unwrap().tdx.is_some(),
            );
            MemoryManager::new_from_snapshot(
                memory_manager_snapshot,
                vm.clone(),
                &config.lock().unwrap().memory.clone(),
                source_url,
                prefault,
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            return Err(Error::Restore(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            ))));
        };

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            vm_snapshot.clock,
            activate_evt,
            true,
        )
    }

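    /// Creates the receiving side of a live migration: like `new`, but with
    /// `restoring` set so that devices are not created up front.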
    pub fn new_from_migration(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();
        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let phys_bits = physical_bits(
            config.lock().unwrap().cpus.max_phys_bits,
            #[cfg(feature = "tdx")]
            config.lock().unwrap().tdx.is_some(),
        );

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            false,
            phys_bits,
            #[cfg(feature = "tdx")]
            false,
        )
        .map_err(Error::MemoryManager)?;

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            None,
            activate_evt,
            true,
        )
    }

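    /// Loads the initramfs image into guest memory and returns its placement
    /// for the boot configuration.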
    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs = self.initramfs.as_ref().unwrap();
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs
            .seek(SeekFrom::Start(0))
            .map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

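    /// Assembles the kernel command line from the VM configuration plus any
    /// additions requested by the device manager.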
    fn get_cmdline(&mut self) -> Result<CString> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE);
        cmdline
            .insert_str(self.config.lock().unwrap().cmdline.args.clone())
            .map_err(Error::CmdLineInsertStr)?;
        for entry in self.device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        CString::new(cmdline).map_err(Error::CmdLineCString)
    }

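    /// Loads the guest kernel on aarch64: first as a PE image, falling back
    /// to a raw UEFI binary if the PE magic number check fails.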
    #[cfg(target_arch = "aarch64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        let entry_addr = match linux_loader::loader::pe::PE::load(
            mem.deref(),
            Some(GuestAddress(arch::get_kernel_start())),
            &mut kernel,
            None,
        ) {
            Ok(entry_addr) => entry_addr,
            // Try loading the binary as a kernel PE file first. If that
            // fails, retry loading it as a UEFI binary. Since the UEFI
            // binary is formatless, it must be the last option tried.
            Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                arch::aarch64::uefi::load_uefi(
                    mem.deref(),
                    GuestAddress(arch::get_uefi_start()),
                    &mut kernel,
                )
                .map_err(Error::UefiLoad)?;
                // The entry point offset in a UEFI image is always 0.
                return Ok(EntryPoint {
                    entry_addr: GuestAddress(arch::get_uefi_start()),
                });
            }
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        let entry_point_addr: GuestAddress = entry_addr.kernel_load;

        Ok(EntryPoint {
            entry_addr: entry_point_addr,
        })
    }

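    /// Loads the guest kernel on x86_64 as an ELF image and copies the
    /// command line into guest memory; only kernels exposing a PVH entry
    /// point are accepted.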
    #[cfg(target_arch = "x86_64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        info!("Loading kernel");
        let cmdline_cstring = self.get_cmdline()?;
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        let entry_addr = match linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        ) {
            Ok(entry_addr) => entry_addr,
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        linux_loader::loader::load_cmdline(
            mem.deref(),
            arch::layout::CMDLINE_START,
            &cmdline_cstring,
        )
        .map_err(Error::LoadCmdLine)?;

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest
            info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint { entry_addr })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

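    /// Final boot-time setup on x86_64: loads the initramfs, creates the
    /// ACPI tables (when enabled) and hands everything to
    /// `arch::configure_system`.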
    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self) -> Result<()> {
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();

        #[allow(unused_mut, unused_assignments)]
        let mut rsdp_addr: Option<GuestAddress> = None;

        #[cfg(feature = "acpi")]
        {
            rsdp_addr = Some(crate::acpi::create_acpi_tables(
                &mem,
                &self.device_manager,
                &self.cpu_manager,
                &self.memory_manager,
                &self.numa_nodes,
            ));
            info!(
                "Created ACPI tables: rsdp_addr = 0x{:x}",
                rsdp_addr.unwrap().0
            );
        }

        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            &initramfs_config,
            boot_vcpus,
            rsdp_addr,
            sgx_epc_region,
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }

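    /// Final boot-time setup on aarch64: loads the initramfs, computes the
    /// PCI space, creates the GIC and the ACPI tables (when enabled), calls
    /// `arch::configure_system`, then wires the GIC into the interrupt
    /// controller and enables it.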
    #[cfg(target_arch = "aarch64")]
    fn configure_system(&mut self) -> Result<()> {
        let cmdline_cstring = self.get_cmdline()?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        let pci_space_start: GuestAddress = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .start_of_device_area();

        let pci_space_end: GuestAddress = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .end_of_device_area();

        let pci_space_size = pci_space_end
            .checked_offset_from(pci_space_start)
            .ok_or(Error::MemOverflow)?
            + 1;

        let pci_space = (pci_space_start.0, pci_space_size);

        #[cfg(feature = "acpi")]
        {
            let _ = crate::acpi::create_acpi_tables(
                &mem,
                &self.device_manager,
                &self.cpu_manager,
                &self.memory_manager,
                &self.numa_nodes,
            );
        }

        let gic_device = create_gic(
            &self.memory_manager.lock().as_ref().unwrap().vm,
            self.cpu_manager.lock().unwrap().boot_vcpus() as u64,
        )
        .map_err(|e| {
            Error::ConfigureSystem(arch::Error::AArch64Setup(arch::aarch64::Error::SetupGic(e)))
        })?;

        arch::configure_system(
            &mem,
            &cmdline_cstring,
            vcpu_mpidrs,
            vcpu_topology,
            device_info,
            &initramfs_config,
            &pci_space,
            &*gic_device,
            &self.numa_nodes,
        )
        .map_err(Error::ConfigureSystem)?;

        // Update the GIC entity in the device manager
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gic_device(Arc::new(Mutex::new(gic_device)));

        // Activate the GIC device
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .enable()
            .map_err(Error::EnableInterruptController)?;

        Ok(())
    }

    pub fn serial_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().serial_pty()
    }

    pub fn console_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().console_pty()
    }

    pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
        self.device_manager.lock().unwrap().console_resize_pipe()
    }

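    /// Shuts the VM down: restores the terminal to canonical mode, stops the
    /// signal-handler thread, resumes any paused devices so their worker
    /// threads can exit, shuts down the vCPUs and joins all VM threads.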
    pub fn shutdown(&mut self) -> Result<()> {
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        let new_state = VmState::Shutdown;

        state.valid_transition(new_state)?;

        if self.on_tty {
            // Don't forget to set the terminal back to canonical mode
            // before exiting.
            io::stdin()
                .lock()
                .set_canon_mode()
                .map_err(Error::SetTerminalCanon)?;
        }

        // Trigger the termination of the signal_handler thread
        if let Some(signals) = self.signals.take() {
            signals.close();
        }

        // Wake up the DeviceManager threads so they will get terminated cleanly
        self.device_manager
            .lock()
            .unwrap()
            .resume()
            .map_err(Error::Resume)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .shutdown()
            .map_err(Error::CpuManager)?;

        // Wait for all the threads to finish
        for thread in self.threads.drain(..) {
            thread.join().map_err(Error::ThreadCleanup)?
        }
        *state = new_state;

        event!("vm", "shutdown");

        Ok(())
    }

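    /// Hotplug-resizes the VM: vCPU count, memory size and/or balloon size.
    /// Each accepted change is also persisted into the `VmConfig` so that a
    /// reboot keeps the new values.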
    pub fn resize(
        &mut self,
        desired_vcpus: Option<u8>,
        desired_memory: Option<u64>,
        desired_balloon: Option<u64>,
    ) -> Result<()> {
        event!("vm", "resizing");

        if let Some(desired_vcpus) = desired_vcpus {
            if self
                .cpu_manager
                .lock()
                .unwrap()
                .resize(desired_vcpus)
                .map_err(Error::CpuManager)?
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
        }

        if let Some(desired_memory) = desired_memory {
            let new_region = self
                .memory_manager
                .lock()
                .unwrap()
                .resize(desired_memory)
                .map_err(Error::MemoryManager)?;

            let mut memory_config = &mut self.config.lock().unwrap().memory;

            if let Some(new_region) = &new_region {
                self.device_manager
                    .lock()
                    .unwrap()
                    .update_memory(new_region)
                    .map_err(Error::DeviceManager)?;

                match memory_config.hotplug_method {
                    HotplugMethod::Acpi => {
                        self.device_manager
                            .lock()
                            .unwrap()
                            .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                            .map_err(Error::DeviceManager)?;
                    }
                    HotplugMethod::VirtioMem => {}
                }
            }

            // We update the VM config regardless of the actual guest resize
            // operation result (happened or not), so that if the VM reboots
            // it will be running with the last configured memory size.
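            // For example, with HotplugMethod::VirtioMem and a boot size of
            // 1 GiB, resizing to 3 GiB records a hotplugged_size of 2 GiB,
            // while resizing back to 1 GiB (or below) clears it.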
            match memory_config.hotplug_method {
                HotplugMethod::Acpi => memory_config.size = desired_memory,
                HotplugMethod::VirtioMem => {
                    if desired_memory > memory_config.size {
                        memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                    } else {
                        memory_config.hotplugged_size = None;
                    }
                }
            }
        }

        if let Some(desired_balloon) = desired_balloon {
            self.device_manager
                .lock()
                .unwrap()
                .resize_balloon(desired_balloon)
                .map_err(Error::DeviceManager)?;

            // Update the configuration value for the balloon size to ensure
            // a reboot would use the right value.
            if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
                balloon_config.size = desired_balloon;
            }
        }

        event!("vm", "resized");

        Ok(())
    }

    pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
        let memory_config = &mut self.config.lock().unwrap().memory;

        if let Some(zones) = &mut memory_config.zones {
            for zone in zones.iter_mut() {
                if zone.id == id {
                    if desired_memory >= zone.size {
                        let hotplugged_size = desired_memory - zone.size;
                        self.memory_manager
                            .lock()
                            .unwrap()
                            .resize_zone(&id, desired_memory - zone.size)
                            .map_err(Error::MemoryManager)?;
                        // We update the memory zone config regardless of the
                        // actual 'resize-zone' operation result (happened or
                        // not), so that if the VM reboots it will be running
                        // with the last configured memory zone size.
                        zone.hotplugged_size = Some(hotplugged_size);

                        return Ok(());
                    } else {
                        error!(
                            "Cannot request less memory ({}) than the boot \
                            RAM ({}) for this memory zone",
                            desired_memory, zone.size,
                        );
                        return Err(Error::ResizeZone);
                    }
                }
            }
        }

        error!("Could not find the memory zone {} for the resize", id);
        Err(Error::ResizeZone)
    }

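    /// Appends `device` to an optional device list, creating the list on
    /// first use.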
    fn add_to_config<T>(devices: &mut Option<Vec<T>>, device: T) {
        if let Some(devices) = devices {
            devices.push(device);
        } else {
            *devices = Some(vec![device]);
        }
    }

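    // The add_* and remove_device methods below share one pattern: validate
    // the change on a clone of the config, apply it through the
    // DeviceManager, persist it into the real config for the next reboot,
    // then raise an ACPI hotplug notification.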
    pub fn add_device(&mut self, mut _device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.devices, _device_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_device(&mut _device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.devices, _device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.user_devices, device_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_user_device(&mut device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.user_devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn remove_device(&mut self, _id: String) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .remove_device(_id.clone())
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by removing the device. This is important to
        // ensure the device is not re-created in case of a reboot.
        let mut config = self.config.lock().unwrap();

        // Remove if VFIO device
        if let Some(devices) = config.devices.as_mut() {
            devices.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if disk device
        if let Some(disks) = config.disks.as_mut() {
            disks.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if net device
        if let Some(net) = config.net.as_mut() {
            net.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if pmem device
        if let Some(pmem) = config.pmem.as_mut() {
            pmem.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if vsock device
        if let Some(vsock) = config.vsock.as_ref() {
            if vsock.id.as_ref() == Some(&_id) {
                config.vsock = None;
            }
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;
        Ok(())
    }

    pub fn add_disk(&mut self, mut _disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.disks, _disk_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_disk(&mut _disk_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.disks, _disk_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_fs(&mut self, mut _fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.fs, _fs_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_fs(&mut _fs_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.fs, _fs_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_pmem(&mut self, mut _pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.pmem, _pmem_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_pmem(&mut _pmem_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.pmem, _pmem_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }
1524 
1525     pub fn add_net(&mut self, mut _net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1526         {
1527             // Validate on a clone of the config
1528             let mut config = self.config.lock().unwrap().clone();
1529             Self::add_to_config(&mut config.net, _net_cfg.clone());
1530             config.validate().map_err(Error::ConfigValidation)?;
1531         }
1532 
1533         let pci_device_info = self
1534             .device_manager
1535             .lock()
1536             .unwrap()
1537             .add_net(&mut _net_cfg)
1538             .map_err(Error::DeviceManager)?;
1539 
1540         // Update VmConfig by adding the new device. This is important to
1541         // ensure the device will be re-created on a reboot.
1542         {
1543             let mut config = self.config.lock().unwrap();
1544             Self::add_to_config(&mut config.net, _net_cfg);
1545         }
1546 
1547         self.device_manager
1548             .lock()
1549             .unwrap()
1550             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1551             .map_err(Error::DeviceManager)?;
1552 
1553         Ok(pci_device_info)
1554     }
1555 
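         /// Hotplug a virtio-vsock device. Only a single vsock device is
         /// supported per VM.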
1556     pub fn add_vsock(&mut self, mut _vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1557         if self.config.lock().unwrap().vsock.is_some() {
1558             return Err(Error::TooManyVsockDevices);
1559         }
1560 
1561         {
1562             // Validate on a clone of the config
1563             let mut config = self.config.lock().unwrap().clone();
1564             config.vsock = Some(_vsock_cfg.clone());
1565             config.validate().map_err(Error::ConfigValidation)?;
1566         }
1567 
1568         let pci_device_info = self
1569             .device_manager
1570             .lock()
1571             .unwrap()
1572             .add_vsock(&mut _vsock_cfg)
1573             .map_err(Error::DeviceManager)?;
1574 
1575         // Update VmConfig by adding the new device. This is important to
1576         // ensure the device will be re-created on a reboot.
1577         {
1578             let mut config = self.config.lock().unwrap();
1579             config.vsock = Some(_vsock_cfg);
1580         }
1581 
1582         self.device_manager
1583             .lock()
1584             .unwrap()
1585             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1586             .map_err(Error::DeviceManager)?;
1587 
1588         Ok(pci_device_info)
1589     }
1590 
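         /// Collect the per-device activity counters from the device manager.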
1591     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1592         Ok(self.device_manager.lock().unwrap().counters())
1593     }
1594 
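         /// Body of the dedicated signal-handling thread: forwards SIGWINCH to
         /// the console, and turns SIGTERM/SIGINT into an exit event after
         /// restoring the terminal to canonical mode when running on a TTY.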
1595     fn os_signal_handler(
1596         mut signals: Signals,
1597         console_input_clone: Arc<Console>,
1598         on_tty: bool,
1599         exit_evt: &EventFd,
1600     ) {
1601         for sig in HANDLED_SIGNALS {
1602             unblock_signal(sig).unwrap();
1603         }
1604 
1605         for signal in signals.forever() {
1606             match signal {
1607                 SIGWINCH => {
1608                     console_input_clone.update_console_size();
1609                 }
1610                 SIGTERM | SIGINT => {
1611                     if on_tty {
1612                         io::stdin()
1613                             .lock()
1614                             .set_canon_mode()
1615                             .expect("failed to restore terminal mode");
1616                     }
1617                     if exit_evt.write(1).is_err() {
1618                         std::process::exit(1);
1619                     }
1620                 }
1621                 _ => (),
1622             }
1623         }
1624     }
1625 
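         /// Initialize the TD by passing the common CPUID and the maximum vCPU
         /// count to the hypervisor.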
1626     #[cfg(feature = "tdx")]
1627     fn init_tdx(&mut self) -> Result<()> {
1628         let cpuid = self.cpu_manager.lock().unwrap().common_cpuid();
1629         let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32;
1630         self.vm
1631             .tdx_init(&cpuid, max_vcpus)
1632             .map_err(Error::InitializeTdxVm)?;
1633         Ok(())
1634     }
1635 
1636     #[cfg(feature = "tdx")]
1637     fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> {
1638         use arch::x86_64::tdx::*;
1639         // The TDVF file contains a table of sections as well as code
1640         let mut firmware_file =
1641             File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
1642                 .map_err(Error::LoadTdvf)?;
1643 
1644         // Parse the section table; RAM backing the sections is allocated
1644         // later, in populate_tdx_sections()
1645         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1646     }
1647 
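         /// Allocate guest RAM for TDVF sections that are not already covered by
         /// guest memory, copy the BFV/CFV firmware volumes into place, and build
         /// the TD HOB describing the memory and MMIO resources. Returns the
         /// guest address of the HOB section, if one was found.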
1648     #[cfg(feature = "tdx")]
1649     fn populate_tdx_sections(&mut self, sections: &[TdvfSection]) -> Result<Option<u64>> {
1650         use arch::x86_64::tdx::*;
1651         // Get the boot guest memory map *before* we start adding TDVF RAM regions
1652         let boot_guest_memory = self
1653             .memory_manager
1654             .lock()
1655             .as_ref()
1656             .unwrap()
1657             .boot_guest_memory();
1658         for section in sections {
1659             // No need to allocate if the section falls within guest RAM ranges
1660             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1661                 info!(
1662                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1663                     section
1664                 );
1665                 continue;
1666             }
1667 
1668             info!("Allocating TDVF Section: {:x?}", section);
1669             self.memory_manager
1670                 .lock()
1671                 .unwrap()
1672                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1673                 .map_err(Error::AllocatingTdvfMemory)?;
1674         }
1675 
1676         // The TDVF file contains a table of sections as well as code
1677         let mut firmware_file =
1678             File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
1679                 .map_err(Error::LoadTdvf)?;
1680 
1681         // The guest memory at this point now has all the required regions so it
1682         // is safe to copy from the TDVF file into it.
1683         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1684         let mem = guest_memory.memory();
1685         let mut hob_offset = None;
1686         for section in sections {
1687             info!("Populating TDVF Section: {:x?}", section);
1688             match section.r#type {
1689                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1690                     info!("Copying section to guest memory");
1691                     firmware_file
1692                         .seek(SeekFrom::Start(section.data_offset as u64))
1693                         .map_err(Error::LoadTdvf)?;
1694                     mem.read_from(
1695                         GuestAddress(section.address),
1696                         &mut firmware_file,
1697                         section.data_size as usize,
1698                     )
1699                     .unwrap();
1700                 }
1701                 TdvfSectionType::TdHob => {
1702                     hob_offset = Some(section.address);
1703                 }
1704                 _ => {}
1705             }
1706         }
1707 
1708         // Generate HOB
1709         let mut hob = TdHob::start(hob_offset.unwrap());
1710 
1711         let mut sorted_sections = sections.to_vec();
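             // Keep only the non-firmware-volume sections, sorted by descending
             // address, so that pop() hands back the lowest-addressed section first.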
1712         sorted_sections.retain(|section| {
1713             !matches!(section.r#type, TdvfSectionType::Bfv | TdvfSectionType::Cfv)
1714         });
1715         sorted_sections.sort_by_key(|section| section.address);
1716         sorted_sections.reverse();
1717         let mut current_section = sorted_sections.pop();
1718 
1719         // RAM regions interleaved with TDVF sections
1720         let mut next_start_addr = 0;
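         // Walk every guest RAM region and emit HOB memory resources for it,
         // marking the spans covered by TDVF sections as non-RAM and the gaps
         // between them as RAM.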
1721         for region in boot_guest_memory.iter() {
1722             let region_start = region.start_addr().0;
1723             let region_end = region.last_addr().0;
1724             if region_start > next_start_addr {
1725                 next_start_addr = region_start;
1726             }
1727 
1728             loop {
1729                 let (start, size, ram) = if let Some(section) = &current_section {
1730                     if section.address <= next_start_addr {
1731                         (section.address, section.size, false)
1732                     } else {
1733                         let last_addr = std::cmp::min(section.address - 1, region_end);
1734                         (next_start_addr, last_addr - next_start_addr + 1, true)
1735                     }
1736                 } else {
1737                     (next_start_addr, region_end - next_start_addr + 1, true)
1738                 };
1739 
1740                 hob.add_memory_resource(&mem, start, size, ram)
1741                     .map_err(Error::PopulateHob)?;
1742 
1743                 if !ram {
1744                     current_section = sorted_sections.pop();
1745                 }
1746 
1747                 next_start_addr = start + size;
1748 
1749                 if next_start_addr > region_end {
1750                     break;
1751                 }
1752             }
1753         }
1754 
1755         // MMIO regions
1756         hob.add_mmio_resource(
1757             &mem,
1758             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1759             arch::layout::APIC_START.raw_value()
1760                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1761         )
1762         .map_err(Error::PopulateHob)?;
1763         let start_of_device_area = self
1764             .memory_manager
1765             .lock()
1766             .unwrap()
1767             .start_of_device_area()
1768             .raw_value();
1769         let end_of_device_area = self
1770             .memory_manager
1771             .lock()
1772             .unwrap()
1773             .end_of_device_area()
1774             .raw_value();
1775         hob.add_mmio_resource(
1776             &mem,
1777             start_of_device_area,
1778             end_of_device_area - start_of_device_area,
1779         )
1780         .map_err(Error::PopulateHob)?;
1781 
1782         hob.finish(&mem).map_err(Error::PopulateHob)?;
1783 
1784         Ok(hob_offset)
1785     }
1786 
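         /// Register each TDVF section's backing memory with the hypervisor,
         /// extending the TD measurement for sections whose attributes request it.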
1787     #[cfg(feature = "tdx")]
1788     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1789         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
1790         let mem = guest_memory.memory();
1791 
1792         for section in sections {
1793             self.vm
1794                 .tdx_init_memory_region(
1795                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
1796                     section.address,
1797                     section.size,
1798                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
1799                     section.attributes == 1,
1800                 )
1801                 .map_err(Error::InitializeTdxMemoryRegion)?;
1802         }
1803         Ok(())
1804     }
1805 
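         /// Spawn a seccomp-filtered thread to service the signals listed in
         /// HANDLED_SIGNALS, keeping the iterator handle in self.signals.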
1806     fn setup_signal_handler(&mut self) -> Result<()> {
1807         let console = self.device_manager.lock().unwrap().console().clone();
1808         let signals = Signals::new(&HANDLED_SIGNALS);
1809         match signals {
1810             Ok(signals) => {
1811                 self.signals = Some(signals.handle());
1812                 let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?;
1813                 let on_tty = self.on_tty;
1814                 let signal_handler_seccomp_filter =
1815                     get_seccomp_filter(&self.seccomp_action, Thread::SignalHandler)
1816                         .map_err(Error::CreateSeccompFilter)?;
1817                 self.threads.push(
1818                     thread::Builder::new()
1819                         .name("signal_handler".to_string())
1820                         .spawn(move || {
1821                             if !signal_handler_seccomp_filter.is_empty() {
1822                                 if let Err(e) = apply_filter(&signal_handler_seccomp_filter)
1823                                     .map_err(Error::ApplySeccompFilter)
1824                                 {
1825                                     error!("Error applying seccomp filter: {:?}", e);
1826                                     exit_evt.write(1).ok();
1827                                     return;
1828                                 }
1829                             }
1830                             std::panic::catch_unwind(AssertUnwindSafe(|| {
1831                                 Vm::os_signal_handler(signals, console, on_tty, &exit_evt);
1832                             }))
1833                             .map_err(|_| {
1834                                 error!("signal_handler thread panicked");
1835                                 exit_evt.write(1).ok()
1836                             })
1837                             .ok();
1838                         })
1839                         .map_err(Error::SignalHandlerSpawn)?,
1840                 );
1841             }
1842             Err(e) => error!("Failed to register signal handlers: {}", e),
1843         }
1844         Ok(())
1845     }
1846 
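         /// Put stdin into raw mode when the VM console is attached to a TTY.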
1847     fn setup_tty(&self) -> Result<()> {
1848         if self.on_tty {
1849             io::stdin()
1850                 .lock()
1851                 .set_raw_mode()
1852                 .map_err(Error::SetTerminalRaw)?;
1853         }
1854 
1855         Ok(())
1856     }
1857 
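         /// Boot the VM: load the kernel if one is configured, perform any TDX
         /// setup, create and start the boot vCPUs, and move to the Running
         /// state. Booting a paused VM simply resumes it.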
1858     pub fn boot(&mut self) -> Result<()> {
1859         info!("Booting VM");
1860         event!("vm", "booting");
1861         let current_state = self.get_state()?;
1862         if current_state == VmState::Paused {
1863             return self.resume().map_err(Error::Resume);
1864         }
1865 
1866         let new_state = VmState::Running;
1867         current_state.valid_transition(new_state)?;
1868 
1869         // Load kernel if configured
1870         let entry_point = if self.kernel.is_some() {
1871             Some(self.load_kernel()?)
1872         } else {
1873             None
1874         };
1875 
1876         // The initial TDX configuration must be done before the vCPUs are
1877         // created
1878         #[cfg(feature = "tdx")]
1879         if self.config.lock().unwrap().tdx.is_some() {
1880             self.init_tdx()?;
1881         }
1882 
1883         // Create and configure vcpus
1884         self.cpu_manager
1885             .lock()
1886             .unwrap()
1887             .create_boot_vcpus(entry_point)
1888             .map_err(Error::CpuManager)?;
1889 
1890         #[cfg(feature = "tdx")]
1891         let sections = self.extract_tdvf_sections()?;
1892 
1893         // Configuring the TDX regions requires that the vCPUs are created
1894         #[cfg(feature = "tdx")]
1895         let hob_address = if self.config.lock().unwrap().tdx.is_some() {
1896             self.populate_tdx_sections(&sections)?
1897         } else {
1898             None
1899         };
1900 
1901         // Configure shared state based on loaded kernel
1902         entry_point.map(|_| self.configure_system()).transpose()?;
1903 
1904         #[cfg(feature = "tdx")]
1905         if let Some(hob_address) = hob_address {
1906             // With the HOB address extracted the vCPUs can have
1907             // their TDX state configured.
1908             self.cpu_manager
1909                 .lock()
1910                 .unwrap()
1911                 .initialize_tdx(hob_address)
1912                 .map_err(Error::CpuManager)?;
1913             self.init_tdx_memory(&sections)?;
1914             // With TDX memory and CPU state configured TDX setup is complete
1915             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
1916         }
1917 
1918         self.cpu_manager
1919             .lock()
1920             .unwrap()
1921             .start_boot_vcpus()
1922             .map_err(Error::CpuManager)?;
1923 
1924         self.setup_signal_handler()?;
1925         self.setup_tty()?;
1926 
1927         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1928         *state = new_state;
1929         event!("vm", "booted");
1930         Ok(())
1931     }
1932 
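         /// Forward bytes pending on the serial PTY to the serial console input
         /// queue.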
1933     pub fn handle_pty(&self, event: EpollDispatch) -> Result<()> {
1934         // Could be a little dangerous: this picks up a lock on device_manager
1935         // and goes into a blocking read. If the epoll loop starts to be
1936         // serviced by multiple threads, this will likely need to be revisited.
1937         let dm = self.device_manager.lock().unwrap();
1938 
1939         if matches!(event, EpollDispatch::SerialPty) {
1940             if let Some(mut pty) = dm.serial_pty() {
1941                 let mut out = [0u8; 64];
1942                 let count = pty.main.read(&mut out).map_err(Error::PtyConsole)?;
1943                 let console = dm.console();
1944                 console
1945                     .queue_input_bytes_serial(&out[..count])
1946                     .map_err(Error::Console)?;
1947             };
1948         }
1949 
1950         Ok(())
1951     }
1952 
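         /// Read raw bytes from stdin and queue them to the serial console when
         /// the serial device is in TTY mode.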
1953     pub fn handle_stdin(&self) -> Result<()> {
1954         let mut out = [0u8; 64];
1955         let count = io::stdin()
1956             .lock()
1957             .read_raw(&mut out)
1958             .map_err(Error::Console)?;
1959 
1960         // Replace "\n" with "\r" to deal with Windows SAC (#1170)
1961         if count == 1 && out[0] == 0x0a {
1962             out[0] = 0x0d;
1963         }
1964 
1965         if matches!(
1966             self.config.lock().unwrap().serial.mode,
1967             ConsoleOutputMode::Tty
1968         ) {
1969             self.device_manager
1970                 .lock()
1971                 .unwrap()
1972                 .console()
1973                 .queue_input_bytes_serial(&out[..count])
1974                 .map_err(Error::Console)?;
1975         }
1976 
1977         Ok(())
1978     }
1979 
1980     /// Gets a thread-safe reference counted pointer to the VM configuration.
1981     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
1982         Arc::clone(&self.config)
1983     }
1984 
1985     /// Get the VM state. Returns an error if the state is poisoned.
1986     pub fn get_state(&self) -> Result<VmState> {
1987         self.state
1988             .try_read()
1989             .map_err(|_| Error::PoisonedState)
1990             .map(|state| *state)
1991     }
1992 
1993     /// Load saved clock from snapshot
1994     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
1995     pub fn load_clock_from_snapshot(
1996         &mut self,
1997         snapshot: &Snapshot,
1998     ) -> Result<Option<hypervisor::ClockData>> {
1999         let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
2000         self.saved_clock = vm_snapshot.clock;
2001         Ok(self.saved_clock)
2002     }
2003 
2004     #[cfg(target_arch = "aarch64")]
2005     /// Add the vGIC section to the VM snapshot.
2006     fn add_vgic_snapshot_section(
2007         &self,
2008         vm_snapshot: &mut Snapshot,
2009     ) -> std::result::Result<(), MigratableError> {
2010         let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
2011         let gic_device = Arc::clone(
2012             self.device_manager
2013                 .lock()
2014                 .unwrap()
2015                 .get_interrupt_controller()
2016                 .unwrap()
2017                 .lock()
2018                 .unwrap()
2019                 .get_gic_device()
2020                 .unwrap(),
2021         );
2022 
2023         gic_device
2024             .lock()
2025             .unwrap()
2026             .set_gicr_typers(&saved_vcpu_states);
2027 
2028         vm_snapshot.add_snapshot(
2029             if let Some(gicv3_its) = gic_device
2030                 .lock()
2031                 .unwrap()
2032                 .as_any_concrete_mut()
2033                 .downcast_mut::<KvmGicV3Its>()
2034             {
2035                 gicv3_its.snapshot()?
2036             } else {
2037                 return Err(MigratableError::Snapshot(anyhow!(
2038                     "GicDevice downcast to KvmGicV3Its failed when snapshotting VM!"
2039                 )));
2040             },
2041         );
2042 
2043         Ok(())
2044     }
2045 
2046     #[cfg(target_arch = "aarch64")]
2047     /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing.
2048     fn restore_vgic_and_enable_interrupt(
2049         &self,
2050         vm_snapshot: &Snapshot,
2051     ) -> std::result::Result<(), MigratableError> {
2052         let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
2053         // The number of vCPUs is the same as the number of saved vCPU states.
2054         let vcpu_numbers = saved_vcpu_states.len();
2055 
2056         // Create a GIC device here, as the GIC will not be created when
2057         // restoring the device manager. Its state is restored from the
2058         // GICv3-ITS snapshot further below.
2059         let mut gic_device = create_gic(&self.vm, vcpu_numbers.try_into().unwrap())
2060             .map_err(|e| MigratableError::Restore(anyhow!("Could not create GIC: {:#?}", e)))?;
2061 
2062         // Here we prepare the GICR_TYPER registers from the restored vCPU states.
2063         gic_device.set_gicr_typers(&saved_vcpu_states);
2064 
2065         let gic_device = Arc::new(Mutex::new(gic_device));
2066         // Update the GIC entity in device manager
2067         self.device_manager
2068             .lock()
2069             .unwrap()
2070             .get_interrupt_controller()
2071             .unwrap()
2072             .lock()
2073             .unwrap()
2074             .set_gic_device(Arc::clone(&gic_device));
2075 
2076         // Restore GIC states.
2077         if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) {
2078             if let Some(gicv3_its) = gic_device
2079                 .lock()
2080                 .unwrap()
2081                 .as_any_concrete_mut()
2082                 .downcast_mut::<KvmGicV3Its>()
2083             {
2084                 gicv3_its.restore(*gicv3_its_snapshot.clone())?;
2085             } else {
2086                 return Err(MigratableError::Restore(anyhow!(
2087                     "GicDevice downcast to KvmGicV3Its failed when restoring VM!"
2088                 )));
2089             };
2090         } else {
2091             return Err(MigratableError::Restore(anyhow!(
2092                 "Missing GicV3Its snapshot"
2093             )));
2094         }
2095 
2096         // Activate gic device
2097         self.device_manager
2098             .lock()
2099             .unwrap()
2100             .get_interrupt_controller()
2101             .unwrap()
2102             .lock()
2103             .unwrap()
2104             .enable()
2105             .map_err(|e| {
2106                 MigratableError::Restore(anyhow!(
2107                     "Could not enable interrupt controller routing: {:#?}",
2108                     e
2109                 ))
2110             })?;
2111 
2112         Ok(())
2113     }
2114 
2115     /// Gets the actual size of the balloon.
2116     pub fn balloon_size(&self) -> u64 {
2117         self.device_manager.lock().unwrap().balloon_size()
2118     }
2119 
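         /// Read the given memory ranges from `fd` directly into guest memory
         /// (the receive side of a live migration).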
2120     pub fn receive_memory_regions<F>(
2121         &mut self,
2122         ranges: &MemoryRangeTable,
2123         fd: &mut F,
2124     ) -> std::result::Result<(), MigratableError>
2125     where
2126         F: Read,
2127     {
2128         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2129         let mem = guest_memory.memory();
2130 
2131         for range in ranges.regions() {
2132             mem.read_exact_from(GuestAddress(range.gpa), fd, range.length as usize)
2133                 .map_err(|e| {
2134                     MigratableError::MigrateReceive(anyhow!(
2135                         "Error transferring memory from socket: {}",
2136                         e
2137                     ))
2138                 })?;
2139         }
2140         Ok(())
2141     }
2142 
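         /// Write the given guest memory ranges out to `fd` (the send side of a
         /// live migration).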
2143     pub fn send_memory_regions<F>(
2144         &mut self,
2145         ranges: &MemoryRangeTable,
2146         fd: &mut F,
2147     ) -> std::result::Result<(), MigratableError>
2148     where
2149         F: Write,
2150     {
2151         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2152         let mem = guest_memory.memory();
2153 
2154         for range in ranges.regions() {
2155             mem.write_all_to(GuestAddress(range.gpa), fd, range.length as usize)
2156                 .map_err(|e| {
2157                     MigratableError::MigrateSend(anyhow!(
2158                         "Error transferring memory to socket: {}",
2159                         e
2160                     ))
2161                 })?;
2162         }
2163 
2164         Ok(())
2165     }
2166 
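         /// Build a table describing every guest RAM region as GPA/length pairs.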
2167     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2168         let mut table = MemoryRangeTable::default();
2169         let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2170 
2171         for region in guest_memory.memory().iter() {
2172             table.push(MemoryRange {
2173                 gpa: region.start_addr().raw_value(),
2174                 length: region.len() as u64,
2175             });
2176         }
2177 
2178         Ok(table)
2179     }
2180 
2181     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2182         self.device_manager.lock().unwrap().device_tree()
2183     }
2184 
2185     pub fn activate_virtio_devices(&self) -> Result<()> {
2186         self.device_manager
2187             .lock()
2188             .unwrap()
2189             .activate_virtio_devices()
2190             .map_err(Error::ActivateVirtioDevices)
2191     }
2192 
2193     #[cfg(target_arch = "x86_64")]
2194     pub fn power_button(&self) -> Result<()> {
2195         #[cfg(feature = "acpi")]
2196         return self
2197             .device_manager
2198             .lock()
2199             .unwrap()
2200             .notify_power_button()
2201             .map_err(Error::PowerButton);
2202         #[cfg(not(feature = "acpi"))]
2203         Err(Error::PowerButtonNotSupported)
2204     }
2205 
2206     #[cfg(target_arch = "aarch64")]
2207     pub fn power_button(&self) -> Result<()> {
2208         self.device_manager
2209             .lock()
2210             .unwrap()
2211             .notify_power_button()
2212             .map_err(Error::PowerButton)
2213     }
2214 }
2215 
2216 impl Pausable for Vm {
2217     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2218         event!("vm", "pausing");
2219         let mut state = self
2220             .state
2221             .try_write()
2222             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2223         let new_state = VmState::Paused;
2224 
2225         state
2226             .valid_transition(new_state)
2227             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2228 
2229         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2230         {
2231             let mut clock = self
2232                 .vm
2233                 .get_clock()
2234                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2235             // Reset clock flags.
2236             clock.flags = 0;
2237             self.saved_clock = Some(clock);
2238         }
2239         self.cpu_manager.lock().unwrap().pause()?;
2240         self.device_manager.lock().unwrap().pause()?;
2241 
2242         *state = new_state;
2243 
2244         event!("vm", "paused");
2245         Ok(())
2246     }
2247 
2248     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2249         event!("vm", "resuming");
2250         let mut state = self
2251             .state
2252             .try_write()
2253             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2254         let new_state = VmState::Running;
2255 
2256         state
2257             .valid_transition(new_state)
2258             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2259 
2260         self.cpu_manager.lock().unwrap().resume()?;
2261         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2262         {
2263             if let Some(clock) = &self.saved_clock {
2264                 self.vm.set_clock(clock).map_err(|e| {
2265                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2266                 })?;
2267             }
2268         }
2269         self.device_manager.lock().unwrap().resume()?;
2270 
2271         // And we're back to the Running state.
2272         *state = new_state;
2273         event!("vm", "resumed");
2274         Ok(())
2275     }
2276 }
2277 
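     /// Serialized VM-level state stored in the "vm" snapshot section.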
2278 #[derive(Serialize, Deserialize)]
2279 pub struct VmSnapshot {
2280     pub config: Arc<Mutex<VmConfig>>,
2281     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2282     pub clock: Option<hypervisor::ClockData>,
2283     pub state: Option<hypervisor::VmState>,
2284     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2285     pub common_cpuid: hypervisor::CpuId,
2286 }
2287 
2288 pub const VM_SNAPSHOT_ID: &str = "vm";
2289 impl Snapshottable for Vm {
2290     fn id(&self) -> String {
2291         VM_SNAPSHOT_ID.to_string()
2292     }
2293 
2294     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2295         event!("vm", "snapshotting");
2296 
2297         #[cfg(feature = "tdx")]
2298         {
2299             if self.config.lock().unwrap().tdx.is_some() {
2300                 return Err(MigratableError::Snapshot(anyhow!(
2301                     "Snapshot not possible with TDX VM"
2302                 )));
2303             }
2304         }
2305 
2306         let current_state = self.get_state().unwrap();
2307         if current_state != VmState::Paused {
2308             return Err(MigratableError::Snapshot(anyhow!(
2309                 "Trying to snapshot while VM is running"
2310             )));
2311         }
2312 
2313         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2314         let common_cpuid = {
2315             #[cfg(feature = "tdx")]
2316             let tdx_enabled = self.config.lock().unwrap().tdx.is_some();
2317             let phys_bits = physical_bits(
2318                 self.config.lock().unwrap().cpus.max_phys_bits,
2319                 #[cfg(feature = "tdx")]
2320                 tdx_enabled,
2321             );
2322             arch::generate_common_cpuid(
2323                 self.hypervisor.clone(),
2324                 None,
2325                 None,
2326                 phys_bits,
2327                 self.config.lock().unwrap().cpus.kvm_hyperv,
2328                 #[cfg(feature = "tdx")]
2329                 tdx_enabled,
2330             )
2331             .map_err(|e| {
2332                 MigratableError::Snapshot(anyhow!("Error generating common cpuid: {:?}", e))
2333             })?
2334         };
2335 
2336         let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID);
2337         let vm_state = self
2338             .vm
2339             .state()
2340             .map_err(|e| MigratableError::Snapshot(e.into()))?;
2341         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2342             config: self.get_config(),
2343             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2344             clock: self.saved_clock,
2345             state: Some(vm_state),
2346             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2347             common_cpuid,
2348         })
2349         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2350 
2351         vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?);
2352         vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?);
2353 
2354         #[cfg(target_arch = "aarch64")]
2355         self.add_vgic_snapshot_section(&mut vm_snapshot)
2356             .map_err(|e| MigratableError::Snapshot(e.into()))?;
2357 
2358         vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?);
2359         vm_snapshot.add_data_section(SnapshotDataSection {
2360             id: format!("{}-section", VM_SNAPSHOT_ID),
2361             snapshot: vm_snapshot_data,
2362         });
2363 
2364         event!("vm", "snapshotted");
2365         Ok(vm_snapshot)
2366     }
2367 
2368     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
2369         event!("vm", "restoring");
2370 
2371         let current_state = self
2372             .get_state()
2373             .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?;
2374         let new_state = VmState::Paused;
2375         current_state.valid_transition(new_state).map_err(|e| {
2376             MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e))
2377         })?;
2378 
2379         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2380             self.memory_manager
2381                 .lock()
2382                 .unwrap()
2383                 .restore(*memory_manager_snapshot.clone())?;
2384         } else {
2385             return Err(MigratableError::Restore(anyhow!(
2386                 "Missing memory manager snapshot"
2387             )));
2388         }
2389 
2390         if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) {
2391             self.cpu_manager
2392                 .lock()
2393                 .unwrap()
2394                 .restore(*cpu_manager_snapshot.clone())?;
2395         } else {
2396             return Err(MigratableError::Restore(anyhow!(
2397                 "Missing CPU manager snapshot"
2398             )));
2399         }
2400 
2401         if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
2402             self.device_manager
2403                 .lock()
2404                 .unwrap()
2405                 .restore(*device_manager_snapshot.clone())?;
2406         } else {
2407             return Err(MigratableError::Restore(anyhow!(
2408                 "Missing device manager snapshot"
2409             )));
2410         }
2411 
2412         #[cfg(target_arch = "aarch64")]
2413         self.restore_vgic_and_enable_interrupt(&snapshot)?;
2414 
2415         if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
2416             self.device_manager
2417                 .lock()
2418                 .unwrap()
2419                 .restore_devices(*device_manager_snapshot.clone())?;
2420         } else {
2421             return Err(MigratableError::Restore(anyhow!(
2422                 "Missing device manager snapshot"
2423             )));
2424         }
2425 
2426         // Now we can start all vCPUs from here.
2427         self.cpu_manager
2428             .lock()
2429             .unwrap()
2430             .start_restored_vcpus()
2431             .map_err(|e| {
2432                 MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e))
2433             })?;
2434 
2435         self.setup_signal_handler().map_err(|e| {
2436             MigratableError::Restore(anyhow!("Could not setup signal handler: {:#?}", e))
2437         })?;
2438         self.setup_tty()
2439             .map_err(|e| MigratableError::Restore(anyhow!("Could not setup tty: {:#?}", e)))?;
2440 
2441         let mut state = self
2442             .state
2443             .try_write()
2444             .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?;
2445         *state = new_state;
2446 
2447         event!("vm", "restored");
2448         Ok(())
2449     }
2450 }
2451 
2452 impl Transportable for Vm {
2453     fn send(
2454         &self,
2455         snapshot: &Snapshot,
2456         destination_url: &str,
2457     ) -> std::result::Result<(), MigratableError> {
2458         let mut vm_snapshot_path = url_to_path(destination_url)?;
2459         vm_snapshot_path.push(VM_SNAPSHOT_FILE);
2460 
2461         // Create the snapshot file
2462         let mut vm_snapshot_file = OpenOptions::new()
2463             .read(true)
2464             .write(true)
2465             .create_new(true)
2466             .open(vm_snapshot_path)
2467             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2468 
2469         // Serialize and write the snapshot
2470         let vm_snapshot =
2471             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2472 
2473         vm_snapshot_file
2474             .write_all(&vm_snapshot)
2475             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2476 
2477         // Tell the memory manager to also send/write its own snapshot.
2478         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2479             self.memory_manager
2480                 .lock()
2481                 .unwrap()
2482                 .send(&*memory_manager_snapshot.clone(), destination_url)?;
2483         } else {
2484             return Err(MigratableError::MigrateSend(anyhow!(
2485                 "Missing memory manager snapshot"
2486             )));
2487         }
2488 
2489         Ok(())
2490     }
2491 }
2492 
2493 impl Migratable for Vm {
2494     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2495         self.memory_manager.lock().unwrap().start_dirty_log()?;
2496         self.device_manager.lock().unwrap().start_dirty_log()
2497     }
2498 
2499     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2500         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2501         self.device_manager.lock().unwrap().stop_dirty_log()
2502     }
2503 
2504     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2505         Ok(MemoryRangeTable::new_from_tables(vec![
2506             self.memory_manager.lock().unwrap().dirty_log()?,
2507             self.device_manager.lock().unwrap().dirty_log()?,
2508         ]))
2509     }
2510 
2511     fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2512         self.memory_manager.lock().unwrap().complete_migration()?;
2513         self.device_manager.lock().unwrap().complete_migration()
2514     }
2515 }
2516 
2517 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2518 #[cfg(test)]
2519 mod tests {
2520     use super::*;
2521 
2522     fn test_vm_state_transitions(state: VmState) {
2523         match state {
2524             VmState::Created => {
2525                 // Check the transitions from Created
2526                 assert!(state.valid_transition(VmState::Created).is_err());
2527                 assert!(state.valid_transition(VmState::Running).is_ok());
2528                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2529                 assert!(state.valid_transition(VmState::Paused).is_ok());
2530             }
2531             VmState::Running => {
2532                 // Check the transitions from Running
2533                 assert!(state.valid_transition(VmState::Created).is_err());
2534                 assert!(state.valid_transition(VmState::Running).is_err());
2535                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2536                 assert!(state.valid_transition(VmState::Paused).is_ok());
2537             }
2538             VmState::Shutdown => {
2539                 // Check the transitions from Shutdown
2540                 assert!(state.valid_transition(VmState::Created).is_err());
2541                 assert!(state.valid_transition(VmState::Running).is_ok());
2542                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2543                 assert!(state.valid_transition(VmState::Paused).is_err());
2544             }
2545             VmState::Paused => {
2546                 // Check the transitions from Paused
2547                 assert!(state.valid_transition(VmState::Created).is_err());
2548                 assert!(state.valid_transition(VmState::Running).is_ok());
2549                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2550                 assert!(state.valid_transition(VmState::Paused).is_err());
2551             }
2552         }
2553     }
2554 
2555     #[test]
2556     fn test_vm_created_transitions() {
2557         test_vm_state_transitions(VmState::Created);
2558     }
2559 
2560     #[test]
2561     fn test_vm_running_transitions() {
2562         test_vm_state_transitions(VmState::Running);
2563     }
2564 
2565     #[test]
2566     fn test_vm_shutdown_transitions() {
2567         test_vm_state_transitions(VmState::Shutdown);
2568     }
2569 
2570     #[test]
2571     fn test_vm_paused_transitions() {
2572         test_vm_state_transitions(VmState::Paused);
2573     }
2574 }
2575 
2576 #[cfg(target_arch = "aarch64")]
2577 #[cfg(test)]
2578 mod tests {
2579     use super::*;
2580     use crate::GuestMemoryMmap;
2581     use arch::aarch64::fdt::create_fdt;
2582     use arch::aarch64::gic::kvm::create_gic;
2583     use arch::aarch64::layout;
2584     use arch::{DeviceType, MmioDeviceInfo};
2585     use vm_memory::GuestAddress;
2586 
2587     const LEN: u64 = 4096;
2588 
2589     #[test]
2590     fn test_create_fdt_with_devices() {
2591         let regions = vec![(
2592             GuestAddress(layout::RAM_64BIT_START),
2593             (layout::FDT_MAX_SIZE + 0x1000) as usize,
2594         )];
2595         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
2596 
2597         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
2598             (
2599                 (DeviceType::Serial, DeviceType::Serial.to_string()),
2600                 MmioDeviceInfo {
2601                     addr: 0x00,
2602                     irq: 33,
2603                 },
2604             ),
2605             (
2606                 (DeviceType::Virtio(1), "virtio".to_string()),
2607                 MmioDeviceInfo { addr: LEN, irq: 34 },
2608             ),
2609             (
2610                 (DeviceType::Rtc, "rtc".to_string()),
2611                 MmioDeviceInfo {
2612                     addr: 2 * LEN,
2613                     irq: 35,
2614                 },
2615             ),
2616         ]
2617         .iter()
2618         .cloned()
2619         .collect();
2620 
2621         let hv = hypervisor::new().unwrap();
2622         let vm = hv.create_vm().unwrap();
2623         let gic = create_gic(&vm, 1).unwrap();
2624         assert!(create_fdt(
2625             &mem,
2626             &CString::new("console=tty0").unwrap(),
2627             vec![0],
2628             Some((0, 0, 0)),
2629             &dev_info,
2630             &*gic,
2631             &None,
2632             &(0x1_0000_0000, 0x1_0000),
2633             &BTreeMap::new(),
2634         )
2635         .is_ok())
2636     }
2637 }
2638 
2639 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2640 #[test]
2641 pub fn test_vm() {
2642     use hypervisor::VmExit;
2643     use vm_memory::{GuestMemory, GuestMemoryRegion};
2644     // This example is based on https://lwn.net/Articles/658511/
2645     let code = [
2646         0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
2647         0x00, 0xd8, /* add %bl, %al */
2648         0x04, b'0', /* add $'0', %al */
2649         0xee, /* out %al, (%dx) */
2650         0xb0, b'\n', /* mov $'\n', %al */
2651         0xee,  /* out %al, (%dx) */
2652         0xf4,  /* hlt */
2653     ];
2654 
2655     let mem_size = 0x1000;
2656     let load_addr = GuestAddress(0x1000);
2657     let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();
2658 
2659     let hv = hypervisor::new().unwrap();
2660     let vm = hv.create_vm().expect("new VM creation failed");
2661 
2662     for (index, region) in mem.iter().enumerate() {
2663         let mem_region = vm.make_user_memory_region(
2664             index as u32,
2665             region.start_addr().raw_value(),
2666             region.len() as u64,
2667             region.as_ptr() as u64,
2668             false,
2669             false,
2670         );
2671 
2672         vm.create_user_memory_region(mem_region)
2673             .expect("Cannot configure guest memory");
2674     }
2675     mem.write_slice(&code, load_addr)
2676         .expect("Writing code to memory failed");
2677 
2678     let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");
2679 
2680     let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
2681     vcpu_sregs.cs.base = 0;
2682     vcpu_sregs.cs.selector = 0;
2683     vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");
2684 
2685     let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
2686     vcpu_regs.rip = 0x1000;
2687     vcpu_regs.rax = 2;
2688     vcpu_regs.rbx = 3;
2689     vcpu_regs.rflags = 2;
2690     vcpu.set_regs(&vcpu_regs).expect("set regs failed");
2691 
2692     loop {
2693         match vcpu.run().expect("run failed") {
2694             VmExit::IoOut(addr, data) => {
2695                 println!(
2696                     "IO out -- addr: {:#x} data [{:?}]",
2697                     addr,
2698                     str::from_utf8(data).unwrap()
2699                 );
2700             }
2701             VmExit::Reset => {
2702                 println!("HLT");
2703                 break;
2704             }
2705             r => panic!("unexpected exit reason: {:?}", r),
2706         }
2707     }
2708 }
2709