xref: /cloud-hypervisor/vmm/src/vm.rs (revision f7f2f25a574b1b2dba22c094fc8226d404157d15)
// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

#[cfg(feature = "acpi")]
use crate::config::NumaConfig;
use crate::config::{
    DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, ValidationError,
    VmConfig, VsockConfig,
};
use crate::cpu;
use crate::device_manager::{
    self, get_win_size, Console, DeviceManager, DeviceManagerError, PtyPair,
};
use crate::device_tree::DeviceTree;
use crate::memory_manager::{Error as MemoryManagerError, MemoryManager};
use crate::migration::{get_vm_snapshot, url_to_path, VM_SNAPSHOT_FILE};
use crate::seccomp_filters::{get_seccomp_filter, Thread};
use crate::{GuestMemoryMmap, GuestRegionMmap};
use crate::{
    PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
};
use anyhow::anyhow;
use arch::get_host_cpu_phys_bits;
#[cfg(feature = "tdx")]
use arch::x86_64::tdx::TdvfSection;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::SgxEpcSection;
use arch::EntryPoint;
use devices::AcpiNotificationFlags;
use hypervisor::vm::{HypervisorVmError, VmmOps};
use linux_loader::cmdline::Cmdline;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
#[cfg(target_arch = "aarch64")]
use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
use linux_loader::loader::KernelLoader;
use seccomp::{SeccompAction, SeccompFilter};
use signal_hook::{
    consts::{SIGINT, SIGTERM, SIGWINCH},
    iterator::backend::Handle,
    iterator::Signals,
};
use std::cmp;
use std::collections::{BTreeMap, HashMap};
use std::convert::TryInto;
use std::ffi::CString;
#[cfg(target_arch = "x86_64")]
use std::fmt;
use std::fs::{File, OpenOptions};
use std::io::{self, Read, Write};
use std::io::{Seek, SeekFrom};
use std::num::Wrapping;
use std::ops::Deref;
use std::sync::{Arc, Mutex, RwLock};
use std::{result, str, thread};
use vm_device::Bus;
use vm_memory::{
    Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
    GuestMemoryRegion,
};
use vm_migration::{
    protocol::{MemoryRange, MemoryRangeTable},
    Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::terminal::Terminal;

#[cfg(target_arch = "aarch64")]
use arch::aarch64::gic::gicv3_its::kvm::{KvmGicV3Its, GIC_V3_ITS_SNAPSHOT_ID};
#[cfg(target_arch = "aarch64")]
use arch::aarch64::gic::kvm::create_gic;
#[cfg(target_arch = "aarch64")]
use devices::interrupt_controller::{self, InterruptController};

/// Errors associated with VM management
#[derive(Debug)]
pub enum Error {
    /// Cannot open the kernel image
    KernelFile(io::Error),

    /// Cannot open the initramfs image
    InitramfsFile(io::Error),

    /// Cannot load the kernel in memory
    KernelLoad(linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    /// Cannot load the UEFI binary in memory
    UefiLoad(arch::aarch64::uefi::Error),

    /// Cannot load the initramfs in memory
    InitramfsLoad,

    /// Cannot load the command line in memory
    LoadCmdLine(linux_loader::loader::Error),

    /// Cannot modify the command line
    CmdLineInsertStr(linux_loader::cmdline::Error),

    /// Cannot convert command line into CString
    CmdLineCString(std::ffi::NulError),

    /// Cannot configure system
    ConfigureSystem(arch::Error),

    /// Cannot enable interrupt controller
    #[cfg(target_arch = "aarch64")]
    EnableInterruptController(interrupt_controller::Error),

    /// Failed to acquire the VM state lock
    PoisonedState,

    /// Cannot create a device manager.
    DeviceManager(DeviceManagerError),

    /// Write to the console failed.
    Console(vmm_sys_util::errno::Error),

    /// Write to the pty console failed.
    PtyConsole(io::Error),

    /// Cannot set up terminal in raw mode.
    SetTerminalRaw(vmm_sys_util::errno::Error),

    /// Cannot set up terminal in canonical mode.
    SetTerminalCanon(vmm_sys_util::errno::Error),

    /// Memory overflow
    MemOverflow,

    /// Cannot spawn a signal handler thread
    SignalHandlerSpawn(io::Error),

    /// Failed to join on vCPU threads
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    /// VM config is missing.
    VmMissingConfig,

    /// VM is not created
    VmNotCreated,

    /// VM is already created
    VmAlreadyCreated,

    /// VM is not running
    VmNotRunning,

    /// Cannot clone EventFd.
    EventFdClone(io::Error),

    /// Invalid VM state transition
    InvalidStateTransition(VmState, VmState),

    /// Error from CPU handling
    CpuManager(cpu::Error),

    /// Cannot pause devices
    PauseDevices(MigratableError),

    /// Cannot resume devices
    ResumeDevices(MigratableError),

    /// Cannot pause CPUs
    PauseCpus(MigratableError),

    /// Cannot resume CPUs
    ResumeCpus(MigratableError),

    /// Cannot pause VM
    Pause(MigratableError),

    /// Cannot resume VM
    Resume(MigratableError),

    /// Memory manager error
    MemoryManager(MemoryManagerError),

    /// Eventfd write error
    EventfdError(std::io::Error),

    /// Cannot snapshot VM
    Snapshot(MigratableError),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot send VM snapshot
    SnapshotSend(MigratableError),

    /// Cannot convert source URL from Path into &str
    RestoreSourceUrlPathToStr,

    /// Failed to validate config
    ConfigValidation(ValidationError),

    /// No more than one virtio-vsock device
    TooManyVsockDevices,

    /// Failed serializing into JSON
    SerializeJson(serde_json::Error),

    /// Invalid configuration for NUMA.
    InvalidNumaConfig,

    /// Cannot create seccomp filter
    CreateSeccompFilter(seccomp::SeccompError),

    /// Cannot apply seccomp filter
    ApplySeccompFilter(seccomp::Error),

    /// Failed resizing a memory zone.
    ResizeZone,

    /// Cannot activate virtio devices
    ActivateVirtioDevices(device_manager::DeviceManagerError),

    /// Power button not supported
    PowerButtonNotSupported,

    /// Error triggering power button
    PowerButton(device_manager::DeviceManagerError),

    /// Kernel lacks PVH header
    KernelMissingPvhHeader,

    /// Error doing I/O on TDX firmware file
    #[cfg(feature = "tdx")]
    LoadTdvf(std::io::Error),

    /// Error parsing TDVF
    #[cfg(feature = "tdx")]
    ParseTdvf(arch::x86_64::tdx::TdvfError),

    /// Error populating HOB
    #[cfg(feature = "tdx")]
    PopulateHob(arch::x86_64::tdx::TdvfError),

    /// Error allocating TDVF memory
    #[cfg(feature = "tdx")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    /// Error enabling TDX VM
    #[cfg(feature = "tdx")]
    InitializeTdxVm(hypervisor::HypervisorVmError),

    /// Error enabling TDX memory region
    #[cfg(feature = "tdx")]
    InitializeTdxMemoryRegion(hypervisor::HypervisorVmError),

    /// Error finalizing TDX setup
    #[cfg(feature = "tdx")]
    FinalizeTdx(hypervisor::HypervisorVmError),
}
pub type Result<T> = result::Result<T, Error>;

#[derive(Clone, Default)]
pub struct NumaNode {
    memory_regions: Vec<Arc<GuestRegionMmap>>,
    hotplug_regions: Vec<Arc<GuestRegionMmap>>,
    cpus: Vec<u8>,
    distances: BTreeMap<u32, u8>,
    memory_zones: Vec<String>,
    #[cfg(target_arch = "x86_64")]
    sgx_epc_sections: Vec<SgxEpcSection>,
}

impl NumaNode {
    pub fn memory_regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
        &self.memory_regions
    }

    pub fn hotplug_regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
        &self.hotplug_regions
    }

    pub fn cpus(&self) -> &Vec<u8> {
        &self.cpus
    }

    pub fn distances(&self) -> &BTreeMap<u32, u8> {
        &self.distances
    }

    pub fn memory_zones(&self) -> &Vec<String> {
        &self.memory_zones
    }

    #[cfg(target_arch = "x86_64")]
    pub fn sgx_epc_sections(&self) -> &Vec<SgxEpcSection> {
        &self.sgx_epc_sections
    }
}

pub type NumaNodes = BTreeMap<u32, NumaNode>;

#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
}

impl VmState {
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Paused => Ok(()),
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
        }
    }
}
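
// A minimal sketch of the transition rules above (hypothetical test module,
// not part of the original file): a freshly created VM may start running or
// be paused, and a shutdown VM can only go back to running (i.e. reboot).
#[cfg(test)]
mod vm_state_transition_tests {
    use super::*;

    #[test]
    fn test_valid_transitions() {
        assert!(VmState::Created.valid_transition(VmState::Running).is_ok());
        assert!(VmState::Created.valid_transition(VmState::Shutdown).is_err());
        assert!(VmState::Running.valid_transition(VmState::Paused).is_ok());
        assert!(VmState::Shutdown.valid_transition(VmState::Running).is_ok());
        assert!(VmState::Shutdown.valid_transition(VmState::Paused).is_err());
        assert!(VmState::Paused.valid_transition(VmState::Shutdown).is_ok());
    }
}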

// Debug I/O port
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT: u16 = 0x80;
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT_PREFIX: &str = "Debug I/O port";

#[cfg(target_arch = "x86_64")]
/// Debug I/O port, see:
/// https://www.intel.com/content/www/us/en/support/articles/000005500/boards-and-kits.html
///
/// Since we're not a physical platform, we can freely assign code ranges for
/// debugging specific parts of our virtual platform.
pub enum DebugIoPortRange {
    Firmware,
    Bootloader,
    Kernel,
    Userspace,
    Custom,
}
#[cfg(target_arch = "x86_64")]
impl DebugIoPortRange {
    fn from_u8(value: u8) -> DebugIoPortRange {
        match value {
            0x00..=0x1f => DebugIoPortRange::Firmware,
            0x20..=0x3f => DebugIoPortRange::Bootloader,
            0x40..=0x5f => DebugIoPortRange::Kernel,
            0x60..=0x7f => DebugIoPortRange::Userspace,
            _ => DebugIoPortRange::Custom,
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl fmt::Display for DebugIoPortRange {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            DebugIoPortRange::Firmware => write!(f, "{}: Firmware", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Bootloader => write!(f, "{}: Bootloader", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Kernel => write!(f, "{}: Kernel", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Userspace => write!(f, "{}: Userspace", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Custom => write!(f, "{}: Custom", DEBUG_IOPORT_PREFIX),
        }
    }
}
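
// Hedged illustration (hypothetical test, not in the original source): the
// 8-bit code written to port 0x80 is bucketed into 32-value bands, with
// anything above 0x7f reported as Custom.
#[cfg(test)]
#[cfg(target_arch = "x86_64")]
mod debug_ioport_tests {
    use super::*;

    #[test]
    fn test_from_u8_bands() {
        assert!(matches!(DebugIoPortRange::from_u8(0x10), DebugIoPortRange::Firmware));
        assert!(matches!(DebugIoPortRange::from_u8(0x20), DebugIoPortRange::Bootloader));
        assert!(matches!(DebugIoPortRange::from_u8(0x5f), DebugIoPortRange::Kernel));
        assert!(matches!(DebugIoPortRange::from_u8(0x60), DebugIoPortRange::Userspace));
        assert!(matches!(DebugIoPortRange::from_u8(0xff), DebugIoPortRange::Custom));
    }
}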

struct VmOps {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
    #[cfg(target_arch = "x86_64")]
    timestamp: std::time::Instant,
}

impl VmOps {
    #[cfg(target_arch = "x86_64")]
    // Log debug I/O port codes along with the time elapsed since the VmOps
    // structure was created.
    fn log_debug_ioport(&self, code: u8) {
        let elapsed = self.timestamp.elapsed();

        debug!(
            "[{} code 0x{:x}] {}.{:>06} seconds",
            DebugIoPortRange::from_u8(code),
            code,
            elapsed.as_secs(),
            elapsed.subsec_micros()
        );
    }
}

impl VmmOps for VmOps {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> hypervisor::vm::Result<usize> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> hypervisor::vm::Result<usize> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            warn!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    fn mmio_write(&self, gpa: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            // Some devices return a barrier so the vCPU thread blocks until
            // the write has been fully handled before resuming the guest.
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            warn!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
        // Intercept single-byte writes to the debug I/O port.
        if port == DEBUG_IOPORT as u64 && data.len() == 1 {
            self.log_debug_ioport(data[0]);
            return Ok(());
        }

        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}
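
// Hedged usage sketch (hypothetical, not part of this file): the hypervisor's
// vCPU run loop dispatches guest exits back through these hooks, roughly:
//
//   ExitReason::MmioWrite { gpa, data } => vm_ops.mmio_write(gpa, &data)?,
//   ExitReason::MmioRead { gpa, .. } => vm_ops.mmio_read(gpa, &mut data)?,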

pub fn physical_bits(max_phys_bits: Option<u8>, #[cfg(feature = "tdx")] tdx_enabled: bool) -> u8 {
    #[cfg(not(feature = "tdx"))]
    let host_phys_bits = get_host_cpu_phys_bits();
    #[cfg(feature = "tdx")]
    let mut host_phys_bits = get_host_cpu_phys_bits();

    #[cfg(feature = "tdx")]
    if tdx_enabled {
        // When running a TDX guest, the Guest Physical Address space is
        // limited by a shared bit that is located on bit 47 for 4-level
        // paging, and on bit 51 for 5-level paging (when the GPAW bit is 1).
        // In order to keep things simple, and since a 47-bit address space is
        // already 128TiB large, we limit the physical addressable space to
        // 47 bits when running TDX.
        host_phys_bits = std::cmp::min(host_phys_bits, 47)
    }

    cmp::min(host_phys_bits, max_phys_bits.unwrap_or(host_phys_bits))
}
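
// Illustrative check (hypothetical test, assuming a non-TDX build): the
// result never exceeds what the host CPU reports, and an explicit
// `max_phys_bits` can only lower it.
#[cfg(test)]
#[cfg(not(feature = "tdx"))]
mod physical_bits_tests {
    use super::*;

    #[test]
    fn test_physical_bits_capped_by_host() {
        let host = get_host_cpu_phys_bits();
        assert_eq!(physical_bits(None), host);
        assert_eq!(physical_bits(Some(host)), host);
        assert_eq!(physical_bits(Some(1)), 1);
    }
}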

pub struct Vm {
    kernel: Option<File>,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    on_tty: bool,
    signals: Option<Handle>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(not(feature = "kvm"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,
    #[cfg(feature = "acpi")]
    numa_nodes: NumaNodes,
    seccomp_action: SeccompAction,
    exit_evt: EventFd,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
}

impl Vm {
    #[allow(clippy::too_many_arguments)]
    fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))] _saved_clock: Option<
            hypervisor::ClockData,
        >,
        activate_evt: EventFd,
    ) -> Result<Self> {
        config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        #[cfg(feature = "acpi")]
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let force_iommu = config.lock().unwrap().tdx.is_some();
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        let device_manager = DeviceManager::new(
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            &exit_evt,
            &reset_evt,
            seccomp_action.clone(),
            #[cfg(feature = "acpi")]
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
        )
        .map_err(Error::DeviceManager)?;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::clone(device_manager.lock().unwrap().io_bus());
        let mmio_bus = Arc::clone(device_manager.lock().unwrap().mmio_bus());
        // Create the VmOps structure, which implements the VmmOps trait,
        // and send it to the hypervisor.
        let vm_ops: Arc<dyn VmmOps> = Arc::new(VmOps {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            #[cfg(target_arch = "x86_64")]
            timestamp: std::time::Instant::now(),
        });

        let exit_evt_clone = exit_evt.try_clone().map_err(Error::EventFdClone)?;
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        let cpu_manager = cpu::CpuManager::new(
            &config.lock().unwrap().cpus.clone(),
            &device_manager,
            &memory_manager,
            vm.clone(),
            exit_evt_clone,
            reset_evt,
            hypervisor.clone(),
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            #[cfg(feature = "acpi")]
            &numa_nodes,
        )
        .map_err(Error::CpuManager)?;

        let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO as i32) } != 0;
        let kernel = config
            .lock()
            .unwrap()
            .kernel
            .as_ref()
            .map(|k| File::open(&k.path))
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .initramfs
            .as_ref()
            .map(|i| File::open(&i.path))
            .transpose()
            .map_err(Error::InitramfsFile)?;

        Ok(Vm {
            kernel,
            initramfs,
            device_manager,
            config,
            on_tty,
            threads: Vec::with_capacity(1),
            signals: None,
            state: RwLock::new(VmState::Created),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock: _saved_clock,
            #[cfg(feature = "acpi")]
            numa_nodes,
            seccomp_action: seccomp_action.clone(),
            exit_evt,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            hypervisor,
        })
    }

    #[cfg(feature = "acpi")]
    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Can't define the same NUMA node twice");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has already been set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }
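
    // Illustrative mapping (hypothetical values): a NumaConfig with
    // guest_numa_id = 0, cpus = [0, 1], memory_zones = ["mem0"] and a
    // distance of 20 to node 1 yields a NumaNode backed by the regions of
    // zone "mem0", with cpus [0, 1] and a distances map of {1: 20}.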

    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
    ) -> Result<Self> {
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        hypervisor.check_required_extensions().unwrap();
        #[cfg(feature = "tdx")]
        let vm = hypervisor
            .create_vm_with_type(if tdx_enabled {
                2 // KVM_X86_TDX_VM
            } else {
                0 // KVM_X86_LEGACY_VM
            })
            .unwrap();
        #[cfg(not(feature = "tdx"))]
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let phys_bits = physical_bits(
            config.lock().unwrap().cpus.max_phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        );
        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            false,
            phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        )
        .map_err(Error::MemoryManager)?;

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_config) = config.lock().unwrap().sgx_epc.clone() {
                memory_manager
                    .lock()
                    .unwrap()
                    .setup_sgx(sgx_epc_config, &vm)
                    .map_err(Error::MemoryManager)?;
            }
        }

        let new_vm = Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            None,
            activate_evt,
        )?;

        // The device manager must create the devices from here as it is part
        // of the regular code path creating everything from scratch.
        new_vm
            .device_manager
            .lock()
            .unwrap()
            .create_devices(serial_pty, console_pty)
            .map_err(Error::DeviceManager)?;
        Ok(new_vm)
    }

    #[allow(clippy::too_many_arguments)]
    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        exit_evt: EventFd,
        reset_evt: EventFd,
        source_url: Option<&str>,
        prefault: bool,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();
        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        let config = vm_snapshot.config;
        if let Some(state) = vm_snapshot.state {
            vm.set_state(state)
                .map_err(|e| Error::Restore(MigratableError::Restore(e.into())))?;
        }

        let memory_manager = if let Some(memory_manager_snapshot) =
            snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
        {
            let phys_bits = physical_bits(
                config.lock().unwrap().cpus.max_phys_bits,
                #[cfg(feature = "tdx")]
                config.lock().unwrap().tdx.is_some(),
            );
            MemoryManager::new_from_snapshot(
                memory_manager_snapshot,
                vm.clone(),
                &config.lock().unwrap().memory.clone(),
                source_url,
                prefault,
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            return Err(Error::Restore(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            ))));
        };

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            vm_snapshot.clock,
            activate_evt,
        )
    }

    pub fn new_from_migration(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();
        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let phys_bits = physical_bits(
            config.lock().unwrap().cpus.max_phys_bits,
            #[cfg(feature = "tdx")]
            config.lock().unwrap().tdx.is_some(),
        );

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            false,
            phys_bits,
            #[cfg(feature = "tdx")]
            false,
        )
        .map_err(Error::MemoryManager)?;

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            None,
            activate_evt,
        )
    }

    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs = self.initramfs.as_ref().unwrap();
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs
            .seek(SeekFrom::Start(0))
            .map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    fn get_cmdline(&mut self) -> Result<CString> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE);
        cmdline
            .insert_str(self.config.lock().unwrap().cmdline.args.clone())
            .map_err(Error::CmdLineInsertStr)?;
        for entry in self.device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        CString::new(cmdline).map_err(Error::CmdLineCString)
    }

    #[cfg(target_arch = "aarch64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        // Try to load the binary as a kernel PE file first. If that fails,
        // retry loading it as a UEFI binary. As the UEFI binary is
        // formatless, it must be the last option to try.
        let entry_addr = match linux_loader::loader::pe::PE::load(
            mem.deref(),
            Some(GuestAddress(arch::get_kernel_start())),
            &mut kernel,
            None,
        ) {
            Ok(entry_addr) => entry_addr,
            Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                arch::aarch64::uefi::load_uefi(
                    mem.deref(),
                    GuestAddress(arch::get_uefi_start()),
                    &mut kernel,
                )
                .map_err(Error::UefiLoad)?;
                // The entry point offset in a UEFI image is always 0.
                return Ok(EntryPoint {
                    entry_addr: GuestAddress(arch::get_uefi_start()),
                });
            }
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        let entry_point_addr: GuestAddress = entry_addr.kernel_load;

        Ok(EntryPoint {
            entry_addr: entry_point_addr,
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        info!("Loading kernel");
        let cmdline_cstring = self.get_cmdline()?;
        let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        let entry_addr = match linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        ) {
            Ok(entry_addr) => entry_addr,
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        linux_loader::loader::load_cmdline(
            mem.deref(),
            arch::layout::CMDLINE_START,
            &cmdline_cstring,
        )
        .map_err(Error::LoadCmdLine)?;

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest
            info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint { entry_addr })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self) -> Result<()> {
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();

        #[allow(unused_mut, unused_assignments)]
        let mut rsdp_addr: Option<GuestAddress> = None;

        #[cfg(feature = "acpi")]
        {
            rsdp_addr = Some(crate::acpi::create_acpi_tables(
                &mem,
                &self.device_manager,
                &self.cpu_manager,
                &self.memory_manager,
                &self.numa_nodes,
            ));
            info!(
                "Created ACPI tables: rsdp_addr = 0x{:x}",
                rsdp_addr.unwrap().0
            );
        }

        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            &initramfs_config,
            boot_vcpus,
            rsdp_addr,
            sgx_epc_region,
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn configure_system(&mut self) -> Result<()> {
        let cmdline_cstring = self.get_cmdline()?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        let pci_space_start: GuestAddress = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area();

        let pci_space_end: GuestAddress = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area();

        let pci_space_size = pci_space_end
            .checked_offset_from(pci_space_start)
            .ok_or(Error::MemOverflow)?
            + 1;

        let pci_space = (pci_space_start.0, pci_space_size);

        #[cfg(feature = "acpi")]
        {
            let _ = crate::acpi::create_acpi_tables(
                &mem,
                &self.device_manager,
                &self.cpu_manager,
                &self.memory_manager,
                &self.numa_nodes,
            );
        }

        let gic_device = create_gic(
            &self.memory_manager.lock().unwrap().vm,
            self.cpu_manager.lock().unwrap().boot_vcpus() as u64,
        )
        .map_err(|e| {
            Error::ConfigureSystem(arch::Error::AArch64Setup(arch::aarch64::Error::SetupGic(e)))
        })?;

        arch::configure_system(
            &mem,
            &cmdline_cstring,
            vcpu_mpidrs,
            vcpu_topology,
            device_info,
            &initramfs_config,
            &pci_space,
            &*gic_device,
        )
        .map_err(Error::ConfigureSystem)?;

        // Update the GIC entity in the device manager
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gic_device(Arc::new(Mutex::new(gic_device)));

        // Activate the GIC device
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .enable()
            .map_err(Error::EnableInterruptController)?;

        Ok(())
    }

    pub fn serial_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().serial_pty()
    }

    pub fn console_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().console_pty()
    }

    pub fn shutdown(&mut self) -> Result<()> {
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        let new_state = VmState::Shutdown;

        state.valid_transition(new_state)?;

        if self.on_tty {
            // Don't forget to set the terminal back to canonical mode
            // before exiting.
            io::stdin()
                .lock()
                .set_canon_mode()
                .map_err(Error::SetTerminalCanon)?;
        }

        // Trigger the termination of the signal_handler thread
        if let Some(signals) = self.signals.take() {
            signals.close();
        }

        // Wake up the DeviceManager threads so they will get terminated cleanly
        self.device_manager
            .lock()
            .unwrap()
            .resume()
            .map_err(Error::Resume)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .shutdown()
            .map_err(Error::CpuManager)?;

        // Wait for all the threads to finish
        for thread in self.threads.drain(..) {
            thread.join().map_err(Error::ThreadCleanup)?
        }
        *state = new_state;

        event!("vm", "shutdown");

        Ok(())
    }

    pub fn resize(
        &mut self,
        desired_vcpus: Option<u8>,
        desired_memory: Option<u64>,
        desired_balloon: Option<u64>,
    ) -> Result<()> {
        event!("vm", "resizing");

        if let Some(desired_vcpus) = desired_vcpus {
            if self
                .cpu_manager
                .lock()
                .unwrap()
                .resize(desired_vcpus)
                .map_err(Error::CpuManager)?
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
        }

        if let Some(desired_memory) = desired_memory {
            let new_region = self
                .memory_manager
                .lock()
                .unwrap()
                .resize(desired_memory)
                .map_err(Error::MemoryManager)?;

            let mut memory_config = &mut self.config.lock().unwrap().memory;

            if let Some(new_region) = &new_region {
                self.device_manager
                    .lock()
                    .unwrap()
                    .update_memory(new_region)
                    .map_err(Error::DeviceManager)?;

                match memory_config.hotplug_method {
                    HotplugMethod::Acpi => {
                        self.device_manager
                            .lock()
                            .unwrap()
                            .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                            .map_err(Error::DeviceManager)?;
                    }
                    HotplugMethod::VirtioMem => {}
                }
            }

            // We update the VM config regardless of whether the actual guest
            // resize operation happened, so that if the VM reboots it will be
            // running with the last configured memory size.
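            // Worked example (hypothetical numbers): a guest booted with
            // size = 1 GiB and resized to 3 GiB via virtio-mem records
            // hotplugged_size = Some(2 GiB); resizing back to 1 GiB or below
            // clears it to None.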
            match memory_config.hotplug_method {
                HotplugMethod::Acpi => memory_config.size = desired_memory,
                HotplugMethod::VirtioMem => {
                    if desired_memory > memory_config.size {
                        memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                    } else {
                        memory_config.hotplugged_size = None;
                    }
                }
            }
        }

        if let Some(desired_balloon) = desired_balloon {
            self.device_manager
                .lock()
                .unwrap()
                .resize_balloon(desired_balloon)
                .map_err(Error::DeviceManager)?;

            // Update the configuration value for the balloon size to ensure
            // a reboot would use the right value.
            if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
                balloon_config.size = desired_balloon;
            }
        }

        event!("vm", "resized");

        Ok(())
    }

    pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
        let memory_config = &mut self.config.lock().unwrap().memory;

        if let Some(zones) = &mut memory_config.zones {
            for zone in zones.iter_mut() {
                if zone.id == id {
                    if desired_memory >= zone.size {
                        let hotplugged_size = desired_memory - zone.size;
                        self.memory_manager
                            .lock()
                            .unwrap()
                            .resize_zone(&id, desired_memory - zone.size)
                            .map_err(Error::MemoryManager)?;
                        // We update the memory zone config regardless of the
                        // actual 'resize-zone' operation result (happened or
                        // not), so that if the VM reboots it will be running
                        // with the last configured memory zone size.
                        zone.hotplugged_size = Some(hotplugged_size);

                        return Ok(());
                    } else {
                        error!(
                            "Invalid to ask for less ({}) than the boot RAM \
                            size ({}) of this memory zone",
                            desired_memory, zone.size,
                        );
                        return Err(Error::ResizeZone);
                    }
                }
            }
        }

        error!("Could not find the memory zone {} for the resize", id);
        Err(Error::ResizeZone)
    }
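
    // Worked example (hypothetical numbers): resizing a zone defined with
    // size = 512 MiB up to 768 MiB records hotplugged_size = Some(256 MiB),
    // while any request below 512 MiB is rejected with Error::ResizeZone.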

    fn add_to_config<T>(devices: &mut Option<Vec<T>>, device: T) {
        if let Some(devices) = devices {
            devices.push(device);
        } else {
            *devices = Some(vec![device]);
        }
    }

    pub fn add_device(&mut self, mut _device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.devices, _device_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_device(&mut _device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is recreated in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.devices, _device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn remove_device(&mut self, _id: String) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .remove_device(_id.clone())
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by removing the device. This is important to
        // ensure the device is not recreated in case of a reboot.
        let mut config = self.config.lock().unwrap();

        // Remove if VFIO device
        if let Some(devices) = config.devices.as_mut() {
            devices.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if disk device
        if let Some(disks) = config.disks.as_mut() {
            disks.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if net device
        if let Some(net) = config.net.as_mut() {
            net.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if pmem device
        if let Some(pmem) = config.pmem.as_mut() {
            pmem.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if vsock device
        if let Some(vsock) = config.vsock.as_ref() {
            if vsock.id.as_ref() == Some(&_id) {
                config.vsock = None;
            }
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;
        Ok(())
    }

    pub fn add_disk(&mut self, mut _disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.disks, _disk_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_disk(&mut _disk_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is recreated in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.disks, _disk_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_fs(&mut self, mut _fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.fs, _fs_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_fs(&mut _fs_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is recreated in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.fs, _fs_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_pmem(&mut self, mut _pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.pmem, _pmem_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_pmem(&mut _pmem_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is recreated in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.pmem, _pmem_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_net(&mut self, mut _net_cfg: NetConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.net, _net_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_net(&mut _net_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is recreated in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.net, _net_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }
1549 
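    /// Hot plugs a vsock device and returns its PCI device information. Only
    /// one vsock device per VM is supported.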
1550     pub fn add_vsock(&mut self, mut _vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1551         if self.config.lock().unwrap().vsock.is_some() {
1552             return Err(Error::TooManyVsockDevices);
1553         }
1554 
1555         {
1556             // Validate on a clone of the config
1557             let mut config = self.config.lock().unwrap().clone();
1558             config.vsock = Some(_vsock_cfg.clone());
1559             config.validate().map_err(Error::ConfigValidation)?;
1560         }
1561 
1562         let pci_device_info = self
1563             .device_manager
1564             .lock()
1565             .unwrap()
1566             .add_vsock(&mut _vsock_cfg)
1567             .map_err(Error::DeviceManager)?;
1568 
1569         // Update VmConfig by adding the new device. This is important to
1570         // ensure the device will be re-created in case of a reboot.
1571         {
1572             let mut config = self.config.lock().unwrap();
1573             config.vsock = Some(_vsock_cfg);
1574         }
1575 
1576         self.device_manager
1577             .lock()
1578             .unwrap()
1579             .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1580             .map_err(Error::DeviceManager)?;
1581 
1582         Ok(pci_device_info)
1583     }
1584 
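    /// Returns the per-device counters exposed by the device manager.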
1585     pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1586         Ok(self.device_manager.lock().unwrap().counters())
1587     }
1588 
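    /// Runs on a dedicated thread and reacts to OS signals: SIGWINCH updates
    /// the console size, while SIGTERM/SIGINT restore the terminal mode and
    /// trigger the exit event.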
1589     fn os_signal_handler(
1590         mut signals: Signals,
1591         console_input_clone: Arc<Console>,
1592         on_tty: bool,
1593         exit_evt: EventFd,
1594     ) {
1595         for signal in signals.forever() {
1596             match signal {
1597                 SIGWINCH => {
1598                     let (col, row) = get_win_size();
1599                     console_input_clone.update_console_size(col, row);
1600                 }
1601                 SIGTERM | SIGINT => {
1602                     if on_tty {
1603                         io::stdin()
1604                             .lock()
1605                             .set_canon_mode()
1606                             .expect("failed to restore terminal mode");
1607                     }
1608                     if exit_evt.write(1).is_err() {
1609                         std::process::exit(1);
1610                     }
1611                 }
1612                 _ => (),
1613             }
1614         }
1615     }
1616 
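    /// Performs the initial TDX configuration (CPUID and maximum vCPU count)
    /// on the hypervisor VM. This must happen before the vCPUs are created.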
1617     #[cfg(feature = "tdx")]
1618     fn init_tdx(&mut self) -> Result<()> {
1619         let cpuid = self.cpu_manager.lock().unwrap().common_cpuid();
1620         let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32;
1621         self.vm
1622             .tdx_init(&cpuid, max_vcpus)
1623             .map_err(Error::InitializeTdxVm)?;
1624         Ok(())
1625     }
1626 
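    /// Parses the table of sections out of the TDVF firmware file.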
1627     #[cfg(feature = "tdx")]
1628     fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> {
1629         use arch::x86_64::tdx::*;
1630         // The TDVF file contains a table of sections as well as code
1631         let mut firmware_file =
1632             File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
1633                 .map_err(Error::LoadTdvf)?;
1634 
1635         // Parse the table of sections from the TDVF file
1636         parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1637     }
1638 
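    /// Allocates RAM for the TDVF sections that need it, copies the firmware
    /// volumes into guest memory and generates the HOB describing the memory
    /// and MMIO layout. Returns the guest address of the TD HOB section, if
    /// one was found.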
1639     #[cfg(feature = "tdx")]
1640     fn populate_tdx_sections(&mut self, sections: &[TdvfSection]) -> Result<Option<u64>> {
1641         use arch::x86_64::tdx::*;
1642         // Get the memory end *before* we start adding TDVF ram regions
1643         let boot_guest_memory = self
1644             .memory_manager
1645             .lock()
1646             .unwrap()
1647             .boot_guest_memory();
1649         for section in sections {
1650             // No need to allocate if the section falls within guest RAM ranges
1651             if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
1652                 info!(
1653                     "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
1654                     section
1655                 );
1656                 continue;
1657             }
1658 
1659             info!("Allocating TDVF Section: {:x?}", section);
1660             self.memory_manager
1661                 .lock()
1662                 .unwrap()
1663                 .add_ram_region(GuestAddress(section.address), section.size as usize)
1664                 .map_err(Error::AllocatingTdvfMemory)?;
1665         }
1666 
1667         // The TDVF file contains a table of sections as well as code
1668         let mut firmware_file =
1669             File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
1670                 .map_err(Error::LoadTdvf)?;
1671 
1672         // The guest memory now has all the required regions, so it is safe
1673         // to copy from the TDVF file into it.
1674         let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
1675         let mem = guest_memory.memory();
1676         let mut hob_offset = None;
1677         for section in sections {
1678             info!("Populating TDVF Section: {:x?}", section);
1679             match section.r#type {
1680                 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
1681                     info!("Copying section to guest memory");
1682                     firmware_file
1683                         .seek(SeekFrom::Start(section.data_offset as u64))
1684                         .map_err(Error::LoadTdvf)?;
1685                     mem.read_from(
1686                         GuestAddress(section.address),
1687                         &mut firmware_file,
1688                         section.data_size as usize,
1689                     )
1690                     .unwrap();
1691                 }
1692                 TdvfSectionType::TdHob => {
1693                     hob_offset = Some(section.address);
1694                 }
1695                 _ => {}
1696             }
1697         }
1698 
1699         // Generate HOB
1700         let mut hob = TdHob::start(hob_offset.unwrap());
1701 
1702         let mut sorted_sections = sections.to_vec();
1703         sorted_sections.retain(|section| {
1704             !matches!(section.r#type, TdvfSectionType::Bfv | TdvfSectionType::Cfv)
1705         });
1706         sorted_sections.sort_by_key(|section| section.address);
1707         sorted_sections.reverse();
1708         let mut current_section = sorted_sections.pop();
1709 
1710         // RAM regions interleaved with TDVF sections: walk each guest RAM
        // region, emitting the TDVF sections that fall inside it as non-RAM
        // resources in the HOB and the gaps between them as RAM resources.
1711         let mut next_start_addr = 0;
1712         for region in boot_guest_memory.iter() {
1713             let region_start = region.start_addr().0;
1714             let region_end = region.last_addr().0;
1715             if region_start > next_start_addr {
1716                 next_start_addr = region_start;
1717             }
1718 
1719             loop {
1720                 let (start, size, ram) = if let Some(section) = &current_section {
1721                     if section.address <= next_start_addr {
1722                         (section.address, section.size, false)
1723                     } else {
1724                         let last_addr = std::cmp::min(section.address - 1, region_end);
1725                         (next_start_addr, last_addr - next_start_addr + 1, true)
1726                     }
1727                 } else {
1728                     (next_start_addr, region_end - next_start_addr + 1, true)
1729                 };
1730 
1731                 hob.add_memory_resource(&mem, start, size, ram)
1732                     .map_err(Error::PopulateHob)?;
1733 
1734                 if !ram {
1735                     current_section = sorted_sections.pop();
1736                 }
1737 
1738                 next_start_addr = start + size;
1739 
1740                 if next_start_addr > region_end {
1741                     break;
1742                 }
1743             }
1744         }
1745 
1746         // MMIO regions
1747         hob.add_mmio_resource(
1748             &mem,
1749             arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1750             arch::layout::APIC_START.raw_value()
1751                 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
1752         )
1753         .map_err(Error::PopulateHob)?;
1754         let start_of_device_area = self
1755             .memory_manager
1756             .lock()
1757             .unwrap()
1758             .start_of_device_area()
1759             .raw_value();
1760         let end_of_device_area = self
1761             .memory_manager
1762             .lock()
1763             .unwrap()
1764             .end_of_device_area()
1765             .raw_value();
1766         hob.add_mmio_resource(
1767             &mem,
1768             start_of_device_area,
1769             end_of_device_area - start_of_device_area,
1770         )
1771         .map_err(Error::PopulateHob)?;
1772 
1773         hob.finish(&mem).map_err(Error::PopulateHob)?;
1774 
1775         Ok(hob_offset)
1776     }
1777 
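    /// Registers each TDVF section with the hypervisor as a TDX memory
    /// region, extending the measurement for the sections that request it.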
1778     #[cfg(feature = "tdx")]
1779     fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
1780         let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
1781         let mem = guest_memory.memory();
1782 
1783         for section in sections {
1784             self.vm
1785                 .tdx_init_memory_region(
1786                     mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
1787                     section.address,
1788                     section.size,
1789                     /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
1790                     section.attributes == 1,
1791                 )
1792                 .map_err(Error::InitializeTdxMemoryRegion)?;
1793         }
1794         Ok(())
1795     }
1796 
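    /// Boots the VM: loads the kernel if one is configured, creates and
    /// configures the vCPUs (including the TDX setup when enabled), starts
    /// the boot vCPUs and installs the console signal handler. If the VM is
    /// currently paused, it is resumed instead.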
1797     pub fn boot(&mut self) -> Result<()> {
1798         info!("Booting VM");
1799         event!("vm", "booting");
1800         let current_state = self.get_state()?;
1801         if current_state == VmState::Paused {
1802             return self.resume().map_err(Error::Resume);
1803         }
1804 
1805         let new_state = VmState::Running;
1806         current_state.valid_transition(new_state)?;
1807 
1808         // Load kernel if configured
1809         let entry_point = if self.kernel.is_some() {
1810             Some(self.load_kernel()?)
1811         } else {
1812             None
1813         };
1814 
1815         // The initial TDX configuration must be done before the vCPUs are
1816         // created
1817         #[cfg(feature = "tdx")]
1818         if self.config.lock().unwrap().tdx.is_some() {
1819             self.init_tdx()?;
1820         }
1821 
1822         // Create and configure vcpus
1823         self.cpu_manager
1824             .lock()
1825             .unwrap()
1826             .create_boot_vcpus(entry_point)
1827             .map_err(Error::CpuManager)?;
1828 
1829         #[cfg(feature = "tdx")]
1830         let sections = self.extract_tdvf_sections()?;
1831 
1832         // Configuring the TDX regions requires that the vCPUs are created
1833         #[cfg(feature = "tdx")]
1834         let hob_address = if self.config.lock().unwrap().tdx.is_some() {
1835             self.populate_tdx_sections(&sections)?
1836         } else {
1837             None
1838         };
1839 
1840         // Configure shared state based on loaded kernel
1841         entry_point.map(|_| self.configure_system()).transpose()?;
1842 
1843         #[cfg(feature = "tdx")]
1844         if let Some(hob_address) = hob_address {
1845             // With the HOB address extracted the vCPUs can have
1846             // their TDX state configured.
1847             self.cpu_manager
1848                 .lock()
1849                 .unwrap()
1850                 .initialize_tdx(hob_address)
1851                 .map_err(Error::CpuManager)?;
1852             self.init_tdx_memory(&sections)?;
1853             // With TDX memory and CPU state configured TDX setup is complete
1854             self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
1855         }
1856 
1857         self.cpu_manager
1858             .lock()
1859             .unwrap()
1860             .start_boot_vcpus()
1861             .map_err(Error::CpuManager)?;
1862 
1863         if self
1864             .device_manager
1865             .lock()
1866             .unwrap()
1867             .console()
1868             .input_enabled()
1869         {
1870             let console = self.device_manager.lock().unwrap().console().clone();
1871             let signals = Signals::new(&[SIGWINCH, SIGINT, SIGTERM]);
1872             match signals {
1873                 Ok(signals) => {
1874                     self.signals = Some(signals.handle());
1875                     let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?;
1876                     let on_tty = self.on_tty;
1877                     let signal_handler_seccomp_filter =
1878                         get_seccomp_filter(&self.seccomp_action, Thread::SignalHandler)
1879                             .map_err(Error::CreateSeccompFilter)?;
1880                     self.threads.push(
1881                         thread::Builder::new()
1882                             .name("signal_handler".to_string())
1883                             .spawn(move || {
1884                                 if let Err(e) = SeccompFilter::apply(signal_handler_seccomp_filter)
1885                                     .map_err(Error::ApplySeccompFilter)
1886                                 {
1887                                     error!("Error applying seccomp filter: {:?}", e);
1888                                     return;
1889                                 }
1890 
1891                                 Vm::os_signal_handler(signals, console, on_tty, exit_evt);
1892                             })
1893                             .map_err(Error::SignalHandlerSpawn)?,
1894                     );
1895                 }
1896                 Err(e) => error!("Signal not found {}", e),
1897             }
1898 
1899             if self.on_tty {
1900                 io::stdin()
1901                     .lock()
1902                     .set_raw_mode()
1903                     .map_err(Error::SetTerminalRaw)?;
1904             }
1905         }
1906 
1907         let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
1908         *state = new_state;
1909         event!("vm", "booted");
1910         Ok(())
1911     }
1912 
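    /// Forwards any pending input from the serial and console PTYs to the
    /// guest.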
1913     pub fn handle_pty(&self) -> Result<()> {
1914         // Could be a little dangerous: this picks up a lock on the device
1915         // manager and goes into a blocking read. If the epoll loop starts
1916         // being serviced by multiple threads, this will likely need revisiting.
1917         let dm = self.device_manager.lock().unwrap();
1918         let mut out = [0u8; 64];
1919         if let Some(mut pty) = dm.serial_pty() {
1920             let count = pty.main.read(&mut out).map_err(Error::PtyConsole)?;
1921             let console = dm.console();
1922             if console.input_enabled() {
1923                 console
1924                     .queue_input_bytes_serial(&out[..count])
1925                     .map_err(Error::Console)?;
1926             }
1927         };
1928         let count = match dm.console_pty() {
1929             Some(mut pty) => pty.main.read(&mut out).map_err(Error::PtyConsole)?,
1930             None => return Ok(()),
1931         };
1932         let console = dm.console();
1933         if console.input_enabled() {
1934             console.queue_input_bytes_console(&out[..count])
1935         }
1936 
1937         Ok(())
1938     }
1939 
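    /// Reads raw input from stdin and forwards it to the guest console.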
1940     pub fn handle_stdin(&self) -> Result<()> {
1941         let mut out = [0u8; 64];
1942         let count = io::stdin()
1943             .lock()
1944             .read_raw(&mut out)
1945             .map_err(Error::Console)?;
1946 
1947         // Replace "\n" with "\r" to deal with Windows SAC (#1170)
1948         if count == 1 && out[0] == 0x0a {
1949             out[0] = 0x0d;
1950         }
1951 
1952         if self
1953             .device_manager
1954             .lock()
1955             .unwrap()
1956             .console()
1957             .input_enabled()
1958         {
1959             self.device_manager
1960                 .lock()
1961                 .unwrap()
1962                 .console()
1963                 .queue_input_bytes(&out[..count])
1964                 .map_err(Error::Console)?;
1965         }
1966 
1967         Ok(())
1968     }
1969 
1970     /// Gets a thread-safe reference counted pointer to the VM configuration.
1971     pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
1972         Arc::clone(&self.config)
1973     }
1974 
1975     /// Get the VM state. Returns an error if the state is poisoned.
1976     pub fn get_state(&self) -> Result<VmState> {
1977         self.state
1978             .try_read()
1979             .map_err(|_| Error::PoisonedState)
1980             .map(|state| *state)
1981     }
1982 
1983     /// Load saved clock from snapshot
1984     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
1985     pub fn load_clock_from_snapshot(
1986         &mut self,
1987         snapshot: &Snapshot,
1988     ) -> Result<Option<hypervisor::ClockData>> {
1989         let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
1990         self.saved_clock = vm_snapshot.clock;
1991         Ok(self.saved_clock)
1992     }
1993 
1994     #[cfg(target_arch = "aarch64")]
1995     /// Add the vGIC section to the VM snapshot.
1996     fn add_vgic_snapshot_section(
1997         &self,
1998         vm_snapshot: &mut Snapshot,
1999     ) -> std::result::Result<(), MigratableError> {
2000         let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
2001         let gic_device = Arc::clone(
2002             self.device_manager
2003                 .lock()
2004                 .unwrap()
2005                 .get_interrupt_controller()
2006                 .unwrap()
2007                 .lock()
2008                 .unwrap()
2009                 .get_gic_device()
2010                 .unwrap(),
2011         );
2012 
2013         gic_device
2014             .lock()
2015             .unwrap()
2016             .set_gicr_typers(&saved_vcpu_states);
2017 
2018         vm_snapshot.add_snapshot(
2019             gic_device
2020                 .lock()
2021                 .unwrap()
2022                 .as_any_concrete_mut()
2023                 .downcast_mut::<KvmGicV3Its>()
2024                 .unwrap()
2025                 .snapshot()?,
2026         );
2027 
2028         Ok(())
2029     }
2030 
2031     #[cfg(target_arch = "aarch64")]
2032     /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing.
2033     fn restore_vgic_and_enable_interrupt(
2034         &self,
2035         vm_snapshot: &Snapshot,
2036     ) -> std::result::Result<(), MigratableError> {
2037         let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
2038         // The number of vCPUs is the same as the number of saved vCPU states.
2039         let vcpu_numbers = saved_vcpu_states.len();
2040 
2041         // Create a GIC device here, as the GIC will not be created when
2042         // restoring the device manager. Its GICv3-ITS state is restored
2043         // from the snapshot below.
2044         let mut gic_device = create_gic(&self.vm, vcpu_numbers.try_into().unwrap())
2045             .map_err(|e| MigratableError::Restore(anyhow!("Could not create GIC: {:#?}", e)))?;
2046 
2047         // Here we prepare the GICR_TYPER registers from the restored vCPU states.
2048         gic_device.set_gicr_typers(&saved_vcpu_states);
2049 
2050         let gic_device = Arc::new(Mutex::new(gic_device));
2051         // Update the GIC entity in device manager
2052         self.device_manager
2053             .lock()
2054             .unwrap()
2055             .get_interrupt_controller()
2056             .unwrap()
2057             .lock()
2058             .unwrap()
2059             .set_gic_device(Arc::clone(&gic_device));
2060 
2061         // Restore GIC states.
2062         if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) {
2063             gic_device
2064                 .lock()
2065                 .unwrap()
2066                 .as_any_concrete_mut()
2067                 .downcast_mut::<KvmGicV3Its>()
2068                 .unwrap()
2069                 .restore(*gicv3_its_snapshot.clone())?;
2070         } else {
2071             return Err(MigratableError::Restore(anyhow!(
2072                 "Missing GicV3Its snapshot"
2073             )));
2074         }
2075 
2076         // Activate the GIC device
2077         self.device_manager
2078             .lock()
2079             .unwrap()
2080             .get_interrupt_controller()
2081             .unwrap()
2082             .lock()
2083             .unwrap()
2084             .enable()
2085             .map_err(|e| {
2086                 MigratableError::Restore(anyhow!(
2087                     "Could not enable interrupt controller routing: {:#?}",
2088                     e
2089                 ))
2090             })?;
2091 
2092         Ok(())
2093     }
2094 
2095     /// Gets the actual size of the balloon.
2096     pub fn balloon_size(&self) -> u64 {
2097         self.device_manager.lock().unwrap().balloon_size()
2098     }
2099 
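    /// Reads the content of each memory range in `ranges` from `fd` and
    /// writes it into guest memory. This is the receiving end of a memory
    /// transfer during migration.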
2100     pub fn receive_memory_regions<F>(
2101         &mut self,
2102         ranges: &MemoryRangeTable,
2103         fd: &mut F,
2104     ) -> std::result::Result<(), MigratableError>
2105     where
2106         F: Read,
2107     {
2108         let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
2109         let mem = guest_memory.memory();
2110 
2111         for range in ranges.regions() {
2112             mem.read_exact_from(GuestAddress(range.gpa), fd, range.length as usize)
2113                 .map_err(|e| {
2114                     MigratableError::MigrateReceive(anyhow!(
2115                         "Error transferring memory to socket: {}",
2116                         e
2117                     ))
2118                 })?;
2119         }
2120         Ok(())
2121     }
2122 
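    /// Writes the guest memory content of each range in `ranges` out to
    /// `fd`. This is the sending end of a memory transfer during migration.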
2123     pub fn send_memory_regions<F>(
2124         &mut self,
2125         ranges: &MemoryRangeTable,
2126         fd: &mut F,
2127     ) -> std::result::Result<(), MigratableError>
2128     where
2129         F: Write,
2130     {
2131         let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
2132         let mem = guest_memory.memory();
2133 
2134         for range in ranges.regions() {
2135             mem.write_all_to(GuestAddress(range.gpa), fd, range.length as usize)
2136                 .map_err(|e| {
2137                     MigratableError::MigrateSend(anyhow!(
2138                         "Error transferring memory to socket: {}",
2139                         e
2140                     ))
2141                 })?;
2142         }
2143 
2144         Ok(())
2145     }
2146 
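    /// Builds a table describing the guest memory ranges currently backing
    /// the VM.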
2147     pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2148         let mut table = MemoryRangeTable::default();
2149         let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
2150 
2151         for region in guest_memory.memory().iter() {
2152             table.push(MemoryRange {
2153                 gpa: region.start_addr().raw_value(),
2154                 length: region.len() as u64,
2155             });
2156         }
2157 
2158         Ok(table)
2159     }
2160 
2161     pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2162         self.device_manager.lock().unwrap().device_tree()
2163     }
2164 
2165     pub fn activate_virtio_devices(&self) -> Result<()> {
2166         self.device_manager
2167             .lock()
2168             .unwrap()
2169             .activate_virtio_devices()
2170             .map_err(Error::ActivateVirtioDevices)
2171     }
2172 
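    /// Notifies the guest of an ACPI power button event, or fails if ACPI
    /// support is not compiled in.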
2173     #[cfg(target_arch = "x86_64")]
2174     pub fn power_button(&self) -> Result<()> {
2175         #[cfg(feature = "acpi")]
2176         return self
2177             .device_manager
2178             .lock()
2179             .unwrap()
2180             .notify_power_button()
2181             .map_err(Error::PowerButton);
2182         #[cfg(not(feature = "acpi"))]
2183         Err(Error::PowerButtonNotSupported)
2184     }
2185 
2186     #[cfg(target_arch = "aarch64")]
2187     pub fn power_button(&self) -> Result<()> {
2188         self.device_manager
2189             .lock()
2190             .unwrap()
2191             .notify_power_button()
2192             .map_err(Error::PowerButton)
2193     }
2194 }
2195 
2196 impl Pausable for Vm {
2197     fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2198         event!("vm", "pausing");
2199         let mut state = self
2200             .state
2201             .try_write()
2202             .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2203         let new_state = VmState::Paused;
2204 
2205         state
2206             .valid_transition(new_state)
2207             .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2208 
2209         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2210         {
2211             let mut clock = self
2212                 .vm
2213                 .get_clock()
2214                 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2215             // Reset clock flags.
2216             clock.flags = 0;
2217             self.saved_clock = Some(clock);
2218         }
2219         self.cpu_manager.lock().unwrap().pause()?;
2220         self.device_manager.lock().unwrap().pause()?;
2221 
2222         *state = new_state;
2223 
2224         event!("vm", "paused");
2225         Ok(())
2226     }
2227 
2228     fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2229         event!("vm", "resuming");
2230         let mut state = self
2231             .state
2232             .try_write()
2233             .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2234         let new_state = VmState::Running;
2235 
2236         state
2237             .valid_transition(new_state)
2238             .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2239 
2240         self.cpu_manager.lock().unwrap().resume()?;
2241         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2242         {
2243             if let Some(clock) = &self.saved_clock {
2244                 self.vm.set_clock(clock).map_err(|e| {
2245                     MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2246                 })?;
2247             }
2248         }
2249         self.device_manager.lock().unwrap().resume()?;
2250 
2251         // And we're back to the Running state.
2252         *state = new_state;
2253         event!("vm", "resumed");
2254         Ok(())
2255     }
2256 }
2257 
2258 #[derive(Serialize, Deserialize)]
2259 pub struct VmSnapshot {
2260     pub config: Arc<Mutex<VmConfig>>,
2261     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2262     pub clock: Option<hypervisor::ClockData>,
2263     pub state: Option<hypervisor::VmState>,
2264     #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2265     pub common_cpuid: hypervisor::CpuId,
2266 }
2267 
2268 pub const VM_SNAPSHOT_ID: &str = "vm";
2269 impl Snapshottable for Vm {
2270     fn id(&self) -> String {
2271         VM_SNAPSHOT_ID.to_string()
2272     }
2273 
2274     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2275         event!("vm", "snapshotting");
2276 
2277         #[cfg(feature = "tdx")]
2278         {
2279             if self.config.lock().unwrap().tdx.is_some() {
2280                 return Err(MigratableError::Snapshot(anyhow!(
2281                     "Snapshot not possible with TDX VM"
2282                 )));
2283             }
2284         }
2285 
2286         let current_state = self.get_state().unwrap();
2287         if current_state != VmState::Paused {
2288             return Err(MigratableError::Snapshot(anyhow!(
2289                 "Trying to snapshot while VM is running"
2290             )));
2291         }
2292 
2293         #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2294         let common_cpuid = {
2295             #[cfg(feature = "tdx")]
2296             let tdx_enabled = self.config.lock().unwrap().tdx.is_some();
2297             let phys_bits = physical_bits(
2298                 self.config.lock().unwrap().cpus.max_phys_bits,
2299                 #[cfg(feature = "tdx")]
2300                 tdx_enabled,
2301             );
2302             arch::generate_common_cpuid(
2303                 self.hypervisor.clone(),
2304                 None,
2305                 None,
2306                 phys_bits,
2307                 self.config.lock().unwrap().cpus.kvm_hyperv,
2308                 #[cfg(feature = "tdx")]
2309                 tdx_enabled,
2310             )
2311             .map_err(|e| {
2312                 MigratableError::Snapshot(anyhow!("Error generating common cpuid: {:?}", e))
2313             })?
2314         };
2315 
2316         let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID);
2317         let vm_state = self
2318             .vm
2319             .state()
2320             .map_err(|e| MigratableError::Snapshot(e.into()))?;
2321         let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
2322             config: self.get_config(),
2323             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2324             clock: self.saved_clock,
2325             state: Some(vm_state),
2326             #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2327             common_cpuid,
2328         })
2329         .map_err(|e| MigratableError::Snapshot(e.into()))?;
2330 
2331         vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?);
2332         vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?);
2333 
2334         #[cfg(target_arch = "aarch64")]
2335         self.add_vgic_snapshot_section(&mut vm_snapshot)
2336             .map_err(|e| MigratableError::Snapshot(e.into()))?;
2337 
2338         vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?);
2339         vm_snapshot.add_data_section(SnapshotDataSection {
2340             id: format!("{}-section", VM_SNAPSHOT_ID),
2341             snapshot: vm_snapshot_data,
2342         });
2343 
2344         event!("vm", "snapshotted");
2345         Ok(vm_snapshot)
2346     }
2347 
2348     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
2349         event!("vm", "restoring");
2350 
2351         let current_state = self
2352             .get_state()
2353             .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?;
2354         let new_state = VmState::Paused;
2355         current_state.valid_transition(new_state).map_err(|e| {
2356             MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e))
2357         })?;
2358 
2359         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2360             self.memory_manager
2361                 .lock()
2362                 .unwrap()
2363                 .restore(*memory_manager_snapshot.clone())?;
2364         } else {
2365             return Err(MigratableError::Restore(anyhow!(
2366                 "Missing memory manager snapshot"
2367             )));
2368         }
2369 
2370         if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) {
2371             self.cpu_manager
2372                 .lock()
2373                 .unwrap()
2374                 .restore(*cpu_manager_snapshot.clone())?;
2375         } else {
2376             return Err(MigratableError::Restore(anyhow!(
2377                 "Missing CPU manager snapshot"
2378             )));
2379         }
2380 
2381         if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
2382             self.device_manager
2383                 .lock()
2384                 .unwrap()
2385                 .restore(*device_manager_snapshot.clone())?;
2386         } else {
2387             return Err(MigratableError::Restore(anyhow!(
2388                 "Missing device manager snapshot"
2389             )));
2390         }
2391 
2392         #[cfg(target_arch = "aarch64")]
2393         self.restore_vgic_and_enable_interrupt(&snapshot)?;
2394 
2395         if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
2396             self.device_manager
2397                 .lock()
2398                 .unwrap()
2399                 .restore_devices(*device_manager_snapshot.clone())?;
2400         } else {
2401             return Err(MigratableError::Restore(anyhow!(
2402                 "Missing device manager snapshot"
2403             )));
2404         }
2405 
2406         // Now we can start all vCPUs from here.
2407         self.cpu_manager
2408             .lock()
2409             .unwrap()
2410             .start_restored_vcpus()
2411             .map_err(|e| {
2412                 MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e))
2413             })?;
2414 
2415         if self
2416             .device_manager
2417             .lock()
2418             .unwrap()
2419             .console()
2420             .input_enabled()
2421         {
2422             let console = self.device_manager.lock().unwrap().console().clone();
2423             let signals = Signals::new(&[SIGWINCH, SIGINT, SIGTERM]);
2424             match signals {
2425                 Ok(signals) => {
2426                     self.signals = Some(signals.handle());
2427 
2428                     let on_tty = self.on_tty;
2429                     let signal_handler_seccomp_filter =
2430                         get_seccomp_filter(&self.seccomp_action, Thread::SignalHandler).map_err(
2431                             |e| {
2432                                 MigratableError::Restore(anyhow!(
2433                                     "Could not create seccomp filter: {:#?}",
2434                                     Error::CreateSeccompFilter(e)
2435                                 ))
2436                             },
2437                         )?;
2438                     let exit_evt = self.exit_evt.try_clone().map_err(|e| {
2439                         MigratableError::Restore(anyhow!("Could not clone exit event fd: {:?}", e))
2440                     })?;
2441 
2442                     self.threads.push(
2443                         thread::Builder::new()
2444                             .name("signal_handler".to_string())
2445                             .spawn(move || {
2446                                 if let Err(e) = SeccompFilter::apply(signal_handler_seccomp_filter)
2447                                     .map_err(Error::ApplySeccompFilter)
2448                                 {
2449                                     error!("Error applying seccomp filter: {:?}", e);
2450                                     return;
2451                                 }
2452 
2453                                 Vm::os_signal_handler(signals, console, on_tty, exit_evt)
2454                             })
2455                             .map_err(|e| {
2456                                 MigratableError::Restore(anyhow!(
2457                                     "Could not start console signal thread: {:#?}",
2458                                     e
2459                                 ))
2460                             })?,
2461                     );
2462                 }
2463                 Err(e) => error!("Signal not found {}", e),
2464             }
2465 
2466             if self.on_tty {
2467                 io::stdin().lock().set_raw_mode().map_err(|e| {
2468                     MigratableError::Restore(anyhow!(
2469                         "Could not set terminal in raw mode: {:#?}",
2470                         e
2471                     ))
2472                 })?;
2473             }
2474         }
2475 
2476         let mut state = self
2477             .state
2478             .try_write()
2479             .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?;
2480         *state = new_state;
2481 
2482         event!("vm", "restored");
2483         Ok(())
2484     }
2485 }
2486 
2487 impl Transportable for Vm {
2488     fn send(
2489         &self,
2490         snapshot: &Snapshot,
2491         destination_url: &str,
2492     ) -> std::result::Result<(), MigratableError> {
2493         let mut vm_snapshot_path = url_to_path(destination_url)?;
2494         vm_snapshot_path.push(VM_SNAPSHOT_FILE);
2495 
2496         // Create the snapshot file
2497         let mut vm_snapshot_file = OpenOptions::new()
2498             .read(true)
2499             .write(true)
2500             .create_new(true)
2501             .open(vm_snapshot_path)
2502             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2503 
2504         // Serialize and write the snapshot
2505         let vm_snapshot =
2506             serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2507 
2508         vm_snapshot_file
2509             .write(&vm_snapshot)
2510             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2511 
2512         // Tell the memory manager to also send/write its own snapshot.
2513         if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2514             self.memory_manager
2515                 .lock()
2516                 .unwrap()
2517                 .send(&*memory_manager_snapshot.clone(), destination_url)?;
2518         } else {
2519             return Err(MigratableError::MigrateSend(anyhow!(
2520                 "Missing memory manager snapshot"
2521             )));
2522         }
2523 
2524         Ok(())
2525     }
2526 }
2527 
2528 impl Migratable for Vm {
2529     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2530         self.memory_manager.lock().unwrap().start_dirty_log()?;
2531         self.device_manager.lock().unwrap().start_dirty_log()
2532     }
2533 
2534     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2535         self.memory_manager.lock().unwrap().stop_dirty_log()?;
2536         self.device_manager.lock().unwrap().stop_dirty_log()
2537     }
2538 
2539     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2540         Ok(MemoryRangeTable::new_from_tables(vec![
2541             self.memory_manager.lock().unwrap().dirty_log()?,
2542             self.device_manager.lock().unwrap().dirty_log()?,
2543         ]))
2544     }
2545 }
2546 
2547 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2548 #[cfg(test)]
2549 mod tests {
2550     use super::*;
2551 
2552     fn test_vm_state_transitions(state: VmState) {
2553         match state {
2554             VmState::Created => {
2555                 // Check the transitions from Created
2556                 assert!(state.valid_transition(VmState::Created).is_err());
2557                 assert!(state.valid_transition(VmState::Running).is_ok());
2558                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2559                 assert!(state.valid_transition(VmState::Paused).is_ok());
2560             }
2561             VmState::Running => {
2562                 // Check the transitions from Running
2563                 assert!(state.valid_transition(VmState::Created).is_err());
2564                 assert!(state.valid_transition(VmState::Running).is_err());
2565                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2566                 assert!(state.valid_transition(VmState::Paused).is_ok());
2567             }
2568             VmState::Shutdown => {
2569                 // Check the transitions from Shutdown
2570                 assert!(state.valid_transition(VmState::Created).is_err());
2571                 assert!(state.valid_transition(VmState::Running).is_ok());
2572                 assert!(state.valid_transition(VmState::Shutdown).is_err());
2573                 assert!(state.valid_transition(VmState::Paused).is_err());
2574             }
2575             VmState::Paused => {
2576                 // Check the transitions from Paused
2577                 assert!(state.valid_transition(VmState::Created).is_err());
2578                 assert!(state.valid_transition(VmState::Running).is_ok());
2579                 assert!(state.valid_transition(VmState::Shutdown).is_ok());
2580                 assert!(state.valid_transition(VmState::Paused).is_err());
2581             }
2582         }
2583     }
2584 
2585     #[test]
2586     fn test_vm_created_transitions() {
2587         test_vm_state_transitions(VmState::Created);
2588     }
2589 
2590     #[test]
2591     fn test_vm_running_transitions() {
2592         test_vm_state_transitions(VmState::Running);
2593     }
2594 
2595     #[test]
2596     fn test_vm_shutdown_transitions() {
2597         test_vm_state_transitions(VmState::Shutdown);
2598     }
2599 
2600     #[test]
2601     fn test_vm_paused_transitions() {
2602         test_vm_state_transitions(VmState::Paused);
2603     }
2604 }
2605 
2606 #[cfg(target_arch = "aarch64")]
2607 #[cfg(test)]
2608 mod tests {
2609     use super::*;
2610     use crate::GuestMemoryMmap;
2611     use arch::aarch64::fdt::create_fdt;
2612     use arch::aarch64::gic::kvm::create_gic;
2613     use arch::aarch64::layout;
2614     use arch::{DeviceType, MmioDeviceInfo};
2615     use vm_memory::GuestAddress;
2616 
2617     const LEN: u64 = 4096;
2618 
2619     #[test]
2620     fn test_create_fdt_with_devices() {
2621         let regions = vec![(
2622             GuestAddress(layout::RAM_64BIT_START),
2623             (layout::FDT_MAX_SIZE + 0x1000) as usize,
2624         )];
2625         let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
2626 
2627         let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
2628             (
2629                 (DeviceType::Serial, DeviceType::Serial.to_string()),
2630                 MmioDeviceInfo {
2631                     addr: 0x00,
2632                     irq: 33,
2633                 },
2634             ),
2635             (
2636                 (DeviceType::Virtio(1), "virtio".to_string()),
2637                 MmioDeviceInfo { addr: LEN, irq: 34 },
2638             ),
2639             (
2640                 (DeviceType::Rtc, "rtc".to_string()),
2641                 MmioDeviceInfo {
2642                     addr: 2 * LEN,
2643                     irq: 35,
2644                 },
2645             ),
2646         ]
2647         .iter()
2648         .cloned()
2649         .collect();
2650 
2651         let hv = hypervisor::new().unwrap();
2652         let vm = hv.create_vm().unwrap();
2653         let gic = create_gic(&vm, 1).unwrap();
2654         assert!(create_fdt(
2655             &mem,
2656             &CString::new("console=tty0").unwrap(),
2657             vec![0],
2658             Some((0, 0, 0)),
2659             &dev_info,
2660             &*gic,
2661             &None,
2662             &(0x1_0000_0000, 0x1_0000),
2663         )
2664         .is_ok())
2665     }
2666 }
2667 
2668 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2669 #[test]
2670 pub fn test_vm() {
2671     use hypervisor::VmExit;
2672     use vm_memory::{GuestMemory, GuestMemoryRegion};
2673     // This example is based on https://lwn.net/Articles/658511/
2674     let code = [
2675         0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
2676         0x00, 0xd8, /* add %bl, %al */
2677         0x04, b'0', /* add $'0', %al */
2678         0xee, /* out %al, (%dx) */
2679         0xb0, b'\n', /* mov $'\n', %al */
2680         0xee,  /* out %al, (%dx) */
2681         0xf4,  /* hlt */
2682     ];
2683 
2684     let mem_size = 0x1000;
2685     let load_addr = GuestAddress(0x1000);
2686     let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();
2687 
2688     let hv = hypervisor::new().unwrap();
2689     let vm = hv.create_vm().expect("new VM creation failed");
2690 
2691     for (index, region) in mem.iter().enumerate() {
2692         let mem_region = vm.make_user_memory_region(
2693             index as u32,
2694             region.start_addr().raw_value(),
2695             region.len() as u64,
2696             region.as_ptr() as u64,
2697             false,
2698             false,
2699         );
2700 
2701         vm.create_user_memory_region(mem_region)
2702             .expect("Cannot configure guest memory");
2703     }
2704     mem.write_slice(&code, load_addr)
2705         .expect("Writing code to memory failed");
2706 
2707     let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");
2708 
2709     let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
2710     vcpu_sregs.cs.base = 0;
2711     vcpu_sregs.cs.selector = 0;
2712     vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");
2713 
2714     let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
2715     vcpu_regs.rip = 0x1000;
2716     vcpu_regs.rax = 2;
2717     vcpu_regs.rbx = 3;
2718     vcpu_regs.rflags = 2;
2719     vcpu.set_regs(&vcpu_regs).expect("set regs failed");
2720 
2721     loop {
2722         match vcpu.run().expect("run failed") {
2723             VmExit::IoOut(addr, data) => {
2724                 println!(
2725                     "IO out -- addr: {:#x} data [{:?}]",
2726                     addr,
2727                     str::from_utf8(data).unwrap()
2728                 );
2729             }
2730             VmExit::Reset => {
2731                 println!("HLT");
2732                 break;
2733             }
2734             r => panic!("unexpected exit reason: {:?}", r),
2735         }
2736     }
2737 }
2738