// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use std::collections::{BTreeMap, HashMap};
use std::fs::{File, OpenOptions};
use std::io::{self, Seek, SeekFrom, Write};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::num::Wrapping;
use std::ops::Deref;
use std::os::unix::net::UnixStream;
use std::sync::{Arc, Mutex, RwLock};
#[cfg(not(target_arch = "riscv64"))]
use std::time::Instant;
use std::{cmp, result, str, thread};

use anyhow::anyhow;
#[cfg(target_arch = "x86_64")]
use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
#[cfg(feature = "tdx")]
use arch::x86_64::tdx::TdvfSection;
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
use arch::PciSpaceInfo;
use arch::{get_host_cpu_phys_bits, EntryPoint, NumaNode, NumaNodes};
#[cfg(target_arch = "aarch64")]
use devices::interrupt_controller;
use devices::AcpiNotificationFlags;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
#[cfg(target_arch = "aarch64")]
use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ;
use hypervisor::{HypervisorVmError, VmOps};
use libc::{termios, SIGWINCH};
use linux_loader::cmdline::Cmdline;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::bzimage::BzImage;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
use linux_loader::loader::KernelLoader;
use seccompiler::SeccompAction;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::Bus;
#[cfg(feature = "tdx")]
use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
use vm_memory::{
    Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
};
use vm_migration::protocol::{MemoryRangeTable, Request, Response};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::sock_ctrl_msg::ScmSocket;

use crate::config::{add_to_config, ValidationError};
use crate::console_devices::{ConsoleDeviceError, ConsoleInfo};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
};
use crate::device_manager::{DeviceManager, DeviceManagerError};
use crate::device_tree::DeviceTree;
#[cfg(feature = "guest_debug")]
use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
#[cfg(feature = "igvm")]
use crate::igvm::igvm_loader;
use crate::landlock::LandlockError;
use crate::memory_manager::{
    Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
};
#[cfg(target_arch = "x86_64")]
use crate::migration::get_vm_snapshot;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::migration::url_to_file;
use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
use crate::vm_config::{
    DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, NumaConfig, PayloadConfig,
    PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig,
};
use crate::{
    cpu, GuestMemoryMmap, PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID,
    MEMORY_MANAGER_SNAPSHOT_ID,
};

/// Errors associated with VM management
#[derive(Debug, Error)]
pub enum Error {
    #[error("Cannot open kernel file")]
    KernelFile(#[source] io::Error),

    #[error("Cannot open initramfs file")]
    InitramfsFile(#[source] io::Error),

    #[error("Cannot load the kernel into memory")]
    KernelLoad(#[source] linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot load the UEFI binary into memory")]
    UefiLoad(#[source] arch::aarch64::uefi::Error),

    #[error("Cannot load the initramfs into memory")]
    InitramfsLoad,

    #[error("Cannot load the kernel command line into memory")]
    LoadCmdLine(#[source] linux_loader::loader::Error),

    #[error("Failed to apply landlock config during vm_create")]
    ApplyLandlock(#[source] LandlockError),

    #[error("Cannot modify the kernel command line")]
    CmdLineInsertStr(#[source] linux_loader::cmdline::Error),

    #[error("Cannot create the kernel command line")]
    CmdLineCreate(#[source] linux_loader::cmdline::Error),

    #[error("Cannot configure system")]
    ConfigureSystem(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot enable interrupt controller")]
    EnableInterruptController(#[source] interrupt_controller::Error),

    #[error("VM state is poisoned")]
    PoisonedState,

    #[error("Error from device manager")]
    DeviceManager(#[source] DeviceManagerError),

    #[error("Error initializing VM")]
    InitializeVm(#[source] hypervisor::HypervisorVmError),

    #[error("No device with id {0:?} to remove")]
    NoDeviceToRemove(String),

    #[error("Cannot spawn a signal handler thread")]
    SignalHandlerSpawn(#[source] io::Error),

    #[error("Failed to join on threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("VM config is missing")]
    VmMissingConfig,

    #[error("VM is not created")]
    VmNotCreated,

    #[error("VM is already created")]
    VmAlreadyCreated,

    #[error("VM is not running")]
    VmNotRunning,

    #[error("Cannot clone EventFd")]
    EventFdClone(#[source] io::Error),

    #[error("Invalid VM state transition: {0:?} to {1:?}")]
    InvalidStateTransition(VmState, VmState),

    #[error("Error from CPU manager")]
    CpuManager(#[source] cpu::Error),

    #[error("Cannot pause devices")]
    PauseDevices(#[source] MigratableError),

    #[error("Cannot resume devices")]
    ResumeDevices(#[source] MigratableError),

    #[error("Cannot pause CPUs")]
    PauseCpus(#[source] MigratableError),

    #[error("Cannot resume CPUs")]
    ResumeCpus(#[source] MigratableError),

    #[error("Cannot pause VM")]
    Pause(#[source] MigratableError),

    #[error("Cannot resume VM")]
    Resume(#[source] MigratableError),

    #[error("Memory manager error")]
    MemoryManager(#[source] MemoryManagerError),

    #[error("Eventfd write error")]
    EventfdError(#[source] std::io::Error),

    #[error("Cannot snapshot VM")]
    Snapshot(#[source] MigratableError),

    #[error("Cannot restore VM")]
    Restore(#[source] MigratableError),

    #[error("Cannot send VM snapshot")]
    SnapshotSend(#[source] MigratableError),

    #[error("Invalid restore source URL")]
    InvalidRestoreSourceUrl,

    #[error("Failed to validate config")]
    ConfigValidation(#[source] ValidationError),

    #[error("Too many virtio-vsock devices")]
    TooManyVsockDevices,

    #[error("Failed serializing into JSON")]
    SerializeJson(#[source] serde_json::Error),

    #[error("Invalid NUMA configuration")]
    InvalidNumaConfig,

    #[error("Cannot create seccomp filter")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Failed resizing a memory zone")]
    ResizeZone,

    #[error("Cannot activate virtio devices")]
    ActivateVirtioDevices(#[source] DeviceManagerError),

    #[error("Error triggering power button")]
    PowerButton(#[source] DeviceManagerError),

    #[error("Kernel lacks PVH header")]
    KernelMissingPvhHeader,

    #[error("Failed to allocate firmware RAM")]
    AllocateFirmwareMemory(#[source] MemoryManagerError),

    #[error("Error manipulating firmware file")]
    FirmwareFile(#[source] std::io::Error),

    #[error("Firmware too big")]
    FirmwareTooLarge,

    #[error("Failed to copy firmware to memory")]
    FirmwareLoad(#[source] vm_memory::GuestMemoryError),

    #[cfg(feature = "sev_snp")]
    #[error("Error enabling SEV-SNP VM")]
    InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on TDX firmware file")]
    LoadTdvf(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on the TDX payload file")]
    LoadPayload(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error parsing TDVF")]
    ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error populating TDX HOB")]
    PopulateHob(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error allocating TDVF memory")]
    AllocatingTdvfMemory(#[source] crate::memory_manager::Error),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX VM")]
    InitializeTdxVm(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX memory region")]
    InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error finalizing TDX VM")]
    FinalizeTdx(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("TDX firmware missing")]
    TdxFirmwareMissing,

    #[cfg(feature = "tdx")]
    #[error("Invalid TDX payload type")]
    InvalidPayloadType,

    #[cfg(feature = "guest_debug")]
    #[error("Error debugging VM")]
    Debug(#[source] DebuggableError),

    #[error("Error spawning kernel loading thread")]
    KernelLoadThreadSpawn(#[source] std::io::Error),

    #[error("Error joining kernel loading thread")]
    KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Payload configuration is not bootable")]
    InvalidPayload,

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    #[error("Error coredumping VM")]
    Coredump(#[source] GuestDebuggableError),

    #[cfg(feature = "igvm")]
    #[error("Cannot open IGVM file")]
    IgvmFile(#[source] io::Error),

    #[cfg(feature = "igvm")]
    #[error("Cannot load the IGVM file into memory")]
    IgvmLoad(#[source] igvm_loader::Error),

    #[error("Error injecting NMI")]
    ErrorNmi,

    #[error("Error resuming the VM")]
    ResumeVm(#[source] hypervisor::HypervisorVmError),

    #[error("Error creating console devices")]
    CreateConsoleDevices(#[source] ConsoleDeviceError),

    #[error("Error locking disk images: Another instance likely holds a lock")]
    LockingError(#[source] DeviceManagerError),
}
pub type Result<T> = result::Result<T, Error>;

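/// High-level lifecycle state of a [`Vm`].
///
/// Allowed transitions between these states are enforced by
/// [`VmState::valid_transition`].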
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
    BreakPoint,
}

impl VmState {
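    /// Check whether a transition from `self` to `new_state` is allowed,
    /// returning `Error::InvalidStateTransition` otherwise.
    ///
    /// A minimal illustration (not compiled as a doc test):
    ///
    /// ```ignore
    /// assert!(VmState::Created.valid_transition(VmState::Running).is_ok());
    /// assert!(VmState::Shutdown.valid_transition(VmState::Paused).is_err());
    /// ```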
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
                VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
                    Ok(())
                }
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
            VmState::BreakPoint => match new_state {
                VmState::Created | VmState::Running => Ok(()),
                _ => Err(Error::InvalidStateTransition(self, new_state)),
            },
        }
    }
}

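/// Handler implementing [`VmOps`]: it routes guest memory accesses to the
/// guest address space and MMIO/PIO accesses to the corresponding device
/// buses.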
struct VmOpsHandler {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
}

impl VmOps for VmOpsHandler {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
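        // The bus write may hand back a barrier; waiting on it serializes
        // this vCPU with the device (e.g. while a virtio device is being
        // activated) before the write is considered complete.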
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            info!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}

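/// Returns the number of physical address bits usable by the guest: the
/// configured `max_phys_bits`, capped by what the host CPU supports.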
pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
    let host_phys_bits = get_host_cpu_phys_bits(hypervisor);

    cmp::min(host_phys_bits, max_phys_bits)
}

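/// A single virtual machine instance, tying together the hypervisor VM,
/// CPU manager, memory manager, and device manager, along with the per-VM
/// configuration and lifecycle state.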
pub struct Vm {
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor-abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
    #[cfg(not(target_arch = "riscv64"))]
    numa_nodes: NumaNodes,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    #[cfg(not(target_arch = "riscv64"))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    stop_on_boot: bool,
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}

impl Vm {
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];

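    /// Builds a [`Vm`] on top of an already constructed [`MemoryManager`],
    /// creating the CPU manager, device manager, and interrupt controller,
    /// and kicking off asynchronous payload loading when not restoring from
    /// a snapshot.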
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        #[cfg(not(target_arch = "riscv64"))] timestamp: Instant,
        console_info: Option<ConsoleInfo>,
        console_resize_pipe: Option<Arc<File>>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new_from_memory_manager");

        let boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
        #[cfg(feature = "tdx")]
        let force_iommu = tdx_enabled;
        #[cfg(feature = "sev_snp")]
        let force_iommu = sev_snp_enabled;
        #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
        let force_iommu = false;

        #[cfg(feature = "guest_debug")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "guest_debug"))]
        let stop_on_boot = false;

        let memory = memory_manager.lock().unwrap().guest_memory();
        let io_bus = Arc::new(Bus::new());
        let mmio_bus = Arc::new(Bus::new());

        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus: io_bus.clone(),
            mmio_bus: mmio_bus.clone(),
        });

        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            vm.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt.try_clone().map_err(Error::EventFdClone)?,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            &hypervisor,
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        )
        .map_err(Error::CpuManager)?;

        #[cfg(target_arch = "x86_64")]
        cpu_manager
            .lock()
            .unwrap()
            .populate_cpuid(
                &memory_manager,
                &hypervisor,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CpuManager)?;

        // The initial TDX configuration must be done before the vCPUs are
        // created
        #[cfg(feature = "tdx")]
        if tdx_enabled {
            let cpuid = cpu_manager.lock().unwrap().common_cpuid();
            let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
            vm.tdx_init(&cpuid, max_vcpus)
                .map_err(Error::InitializeTdxVm)?;
        }

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        #[cfg(feature = "kvm")]
        let is_kvm = matches!(
            hypervisor.hypervisor_type(),
            hypervisor::HypervisorType::Kvm
        );
        #[cfg(feature = "mshv")]
        let is_mshv = matches!(
            hypervisor.hypervisor_type(),
            hypervisor::HypervisorType::Mshv
        );

        let device_manager = DeviceManager::new(
            io_bus,
            mmio_bus,
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            cpu_manager.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            boot_id_list,
            #[cfg(not(target_arch = "riscv64"))]
            timestamp,
            snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
            dynamic,
        )
        .map_err(Error::DeviceManager)?;

        // For MSHV, the interrupt controller must be created before the VM
        // is initialized, because the GICD base address has to be set before
        // VM initialization.
        #[cfg(feature = "mshv")]
        {
            if is_mshv {
                let ic = device_manager
                    .lock()
                    .unwrap()
                    .create_interrupt_controller()
                    .map_err(Error::DeviceManager)?;

                vm.init().map_err(Error::InitializeVm)?;

                device_manager
                    .lock()
                    .unwrap()
                    .create_devices(
                        console_info.clone(),
                        console_resize_pipe.clone(),
                        original_termios.clone(),
                        ic,
                    )
                    .map_err(Error::DeviceManager)?;
            }
        }

        memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        #[cfg(target_arch = "aarch64")]
        memory_manager
            .lock()
            .unwrap()
            .add_uefi_flash()
            .map_err(Error::MemoryManager)?;

        // Loading the IGVM file is pushed down here because the IGVM parser
        // needs cpu_manager to retrieve CPUID leaves. Currently, the
        // Microsoft Hypervisor does not provide a hypervisor-specific common
        // CPUID, so each CPUID value has to be queried through cpu_manager's
        // get_cpuid_values.
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(
                &memory_manager,
                &config,
                #[cfg(feature = "igvm")]
                &cpu_manager,
                #[cfg(feature = "sev_snp")]
                sev_snp_enabled,
            )?
        } else {
            None
        };

        cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
            .map_err(Error::CpuManager)?;

        // For KVM, the interrupt controller must be created after the boot
        // vCPUs, because the GIC state is restored from the snapshot as part
        // of boot vCPU creation. For KVM guests this means the interrupt
        // controller has to be created after the restore.
        #[cfg(feature = "kvm")]
        {
            if is_kvm {
                let ic = device_manager
                    .lock()
                    .unwrap()
                    .create_interrupt_controller()
                    .map_err(Error::DeviceManager)?;

                vm.init().map_err(Error::InitializeVm)?;

                device_manager
                    .lock()
                    .unwrap()
                    .create_devices(console_info, console_resize_pipe, original_termios, ic)
                    .map_err(Error::DeviceManager)?;
            }
        }

        // This initial SEV-SNP configuration must be done immediately after
        // the vCPUs are created. As part of this initialization the guest is
        // transitioned into the secure state.
        #[cfg(feature = "sev_snp")]
        if sev_snp_enabled {
            vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
        }

        #[cfg(feature = "tdx")]
        let kernel = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.kernel.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.initramfs.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::InitramfsFile)?;

        #[cfg(target_arch = "x86_64")]
        let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
            let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
            vm_snapshot.clock
        } else {
            None
        };

        let vm_state = if snapshot.is_some() {
            VmState::Paused
        } else {
            VmState::Created
        };

        Ok(Vm {
            #[cfg(feature = "tdx")]
            kernel,
            initramfs,
            device_manager,
            config,
            threads: Vec::with_capacity(1),
            state: RwLock::new(vm_state),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(target_arch = "x86_64")]
            saved_clock,
            #[cfg(not(target_arch = "riscv64"))]
            numa_nodes,
            #[cfg(not(target_arch = "riscv64"))]
            hypervisor,
            stop_on_boot,
            load_payload_handle,
        })
    }

    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Can't define the same NUMA node twice");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(pci_segments) = &config.pci_segments {
                    node.pci_segments.extend(pci_segments);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has already been set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

    #[allow(clippy::too_many_arguments)]
    pub fn new(
        vm_config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        console_info: Option<ConsoleInfo>,
        console_resize_pipe: Option<Arc<File>>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
        source_url: Option<&str>,
        prefault: Option<bool>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new");

        #[cfg(not(target_arch = "riscv64"))]
        let timestamp = Instant::now();

        #[cfg(feature = "tdx")]
        let tdx_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_tdx_enabled()
        };

        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_sev_snp_enabled()
        };

        let vm = Self::create_hypervisor_vm(
            &hypervisor,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
            #[cfg(feature = "sev_snp")]
            vm_config.lock().unwrap().memory.total_size(),
        )?;

        let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);

        let memory_manager = if let Some(snapshot) =
            snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
        {
            MemoryManager::new_from_snapshot(
                &snapshot,
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                source_url,
                prefault.unwrap(),
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            #[cfg(target_arch = "x86_64")]
            let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();

            MemoryManager::new(
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                None,
                phys_bits,
                #[cfg(feature = "tdx")]
                tdx_enabled,
                None,
                None,
                #[cfg(target_arch = "x86_64")]
                sgx_epc_config,
            )
            .map_err(Error::MemoryManager)?
        };

        Vm::new_from_memory_manager(
            vm_config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            #[cfg(not(target_arch = "riscv64"))]
            timestamp,
            console_info,
            console_resize_pipe,
            original_termios,
            snapshot,
        )
    }

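    /// Creates the underlying hypervisor VM object, selecting the VM type
    /// (legacy, TDX, or SEV-SNP) according to the enabled features, then
    /// applies the x86_64-specific setup (identity map, TSS, split IRQ).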
    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
        #[cfg(feature = "sev_snp")] mem_size: u64,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        hypervisor.check_required_extensions().unwrap();

        cfg_if::cfg_if! {
            if #[cfg(feature = "tdx")] {
                // tdx_enabled maps to the VM type: KVM_X86_TDX_VM (1) when
                // true, KVM_X86_LEGACY_VM (0) otherwise.
                let vm = hypervisor
                    .create_vm_with_type(u64::from(tdx_enabled))
                    .unwrap();
            } else if #[cfg(feature = "sev_snp")] {
                // sev_snp_enabled maps to the VM type: SEV_SNP_ENABLED (1)
                // when true, SEV_SNP_DISABLED (0) otherwise.
                let vm = hypervisor
                    .create_vm_with_type_and_memory(u64::from(sev_snp_enabled), mem_size)
                    .unwrap();
            } else {
                let vm = hypervisor.create_vm().unwrap();
            }
        }

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }

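    /// Reads the initramfs file into guest memory at the architecture's
    /// preferred load address and returns its placement.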
    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let initramfs = self.initramfs.as_mut().unwrap();
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_volatile_from(address, initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

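    /// Builds the kernel command line from the payload configuration,
    /// appending any device-generated additions on aarch64/riscv64.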
    pub fn generate_cmdline(
        payload: &PayloadConfig,
        #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] device_manager: &Arc<
            Mutex<DeviceManager>,
        >,
    ) -> Result<Cmdline> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
        if let Some(s) = payload.cmdline.as_ref() {
            cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
        }

        #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
        for entry in device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        Ok(cmdline)
    }

    #[cfg(target_arch = "aarch64")]
    fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
        let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
        let mem = uefi_flash.memory();
        arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
            .map_err(Error::UefiLoad)?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn load_kernel(
        firmware: Option<File>,
        kernel: Option<File>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let entry_addr = match (firmware, kernel) {
            (None, Some(mut kernel)) => {
                match linux_loader::loader::pe::PE::load(
                    mem.deref(),
                    Some(arch::layout::KERNEL_START),
                    &mut kernel,
                    None,
                ) {
                    Ok(entry_addr) => entry_addr.kernel_load,
                    // Try to load the binary as a kernel PE file first. If
                    // that fails, retry loading it as a UEFI binary. Since
                    // the UEFI binary is formatless, it must be the last
                    // option tried.
                    Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                        Self::load_firmware(&kernel, memory_manager)?;
                        arch::layout::UEFI_START
                    }
                    Err(e) => {
                        return Err(Error::KernelLoad(e));
                    }
                }
            }
            (Some(firmware), None) => {
                Self::load_firmware(&firmware, memory_manager)?;
                arch::layout::UEFI_START
            }
            _ => return Err(Error::InvalidPayload),
        };

        Ok(EntryPoint { entry_addr })
    }

    #[cfg(target_arch = "riscv64")]
    fn load_kernel(
        firmware: Option<File>,
        kernel: Option<File>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
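        // Round the kernel load address up to a 2 MiB boundary, the image
        // alignment expected for the riscv64 kernel.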
        let alignment = 0x20_0000;
        let aligned_kernel_addr =
            (arch::layout::KERNEL_START.0 + (alignment - 1)) & !(alignment - 1);
        let entry_addr = match (firmware, kernel) {
            (None, Some(mut kernel)) => {
                match linux_loader::loader::pe::PE::load(
                    mem.deref(),
                    Some(GuestAddress(aligned_kernel_addr)),
                    &mut kernel,
                    None,
                ) {
                    Ok(entry_addr) => entry_addr.kernel_load,
                    // Try to load the binary as a kernel PE file first. If
                    // that fails, retry loading it as a UEFI binary. Since
                    // the UEFI binary is formatless, it must be the last
                    // option tried.
                    Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                        // TODO: UEFI for riscv64 is scheduled for a later stage.
                        unimplemented!()
                    }
                    Err(e) => {
                        return Err(Error::KernelLoad(e));
                    }
                }
            }
            (Some(_firmware), None) => {
                // TODO: UEFI for riscv64 is scheduled for a later stage.
                unimplemented!()
            }
            _ => return Err(Error::InvalidPayload),
        };

        Ok(EntryPoint { entry_addr })
    }

    #[cfg(feature = "igvm")]
    fn load_igvm(
        igvm: File,
        memory_manager: Arc<Mutex<MemoryManager>>,
        cpu_manager: Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] host_data: &Option<String>,
    ) -> Result<EntryPoint> {
        let res = igvm_loader::load_igvm(
            &igvm,
            memory_manager,
            cpu_manager.clone(),
            "",
            #[cfg(feature = "sev_snp")]
            host_data,
        )
        .map_err(Error::IgvmLoad)?;

        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
                    EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
                } else {
                    EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
                };
            } else {
                let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
            }
        };
        Ok(entry_point)
    }

    #[cfg(target_arch = "x86_64")]
    fn load_kernel(
        mut kernel: File,
        cmdline: Option<Cmdline>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        info!("Loading kernel");

        let mem = {
            let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
            guest_memory.memory()
        };

        // Try ELF binary with PVH boot.
        let entry_addr = linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        )
        // Try loading kernel as bzImage.
        .or_else(|_| {
            BzImage::load(
                mem.deref(),
                None,
                &mut kernel,
                Some(arch::layout::HIGH_RAM_START),
            )
        })
        .map_err(Error::KernelLoad)?;

        if let Some(cmdline) = cmdline {
            linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
                .map_err(Error::LoadCmdLine)?;
        }

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest.
            info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint {
                entry_addr,
                setup_header: None,
            })
        } else if entry_addr.setup_header.is_some() {
            // Use the bzImage 32-bit entry point to boot the guest.
            info!(
                "bzImage kernel loaded: entry_addr = 0x{:x}",
                entry_addr.kernel_load.0
            );
            Ok(EntryPoint {
                entry_addr: entry_addr.kernel_load,
                setup_header: entry_addr.setup_header,
            })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

    #[cfg(target_arch = "x86_64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
        #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<EntryPoint> {
        trace_scoped!("load_payload");
        #[cfg(feature = "igvm")]
        {
            if let Some(_igvm_file) = &payload.igvm {
                let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
                #[cfg(feature = "sev_snp")]
                if sev_snp_enabled {
                    return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
                }
                #[cfg(not(feature = "sev_snp"))]
                return Self::load_igvm(igvm, memory_manager, cpu_manager);
            }
        }
        match (
            &payload.firmware,
            &payload.kernel,
            &payload.initramfs,
            &payload.cmdline,
        ) {
            (Some(firmware), None, None, None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(firmware, None, memory_manager)
            }
            (None, Some(kernel), _, _) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                let cmdline = Self::generate_cmdline(payload)?;
                Self::load_kernel(kernel, Some(cmdline), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        match (&payload.firmware, &payload.kernel) {
            (Some(firmware), None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(Some(firmware), None, memory_manager)
            }
            (None, Some(kernel)) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                Self::load_kernel(None, Some(kernel), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

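    /// Spawns a thread that loads the boot payload (firmware, kernel, or
    /// IGVM file) into guest memory, so the load overlaps with the rest of
    /// the VM setup. Returns `None` when there is nothing to load this way
    /// (no payload configured, or TDX, which loads its kernel differently).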
    fn load_payload_async(
        memory_manager: &Arc<Mutex<MemoryManager>>,
        config: &Arc<Mutex<VmConfig>>,
        #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
        // The kernel is loaded in a different manner when TDX is enabled.
        #[cfg(feature = "tdx")]
        if config.lock().unwrap().is_tdx_enabled() {
            return Ok(None);
        }

        config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|payload| {
                let memory_manager = memory_manager.clone();
                let payload = payload.clone();
                #[cfg(feature = "igvm")]
                let cpu_manager = cpu_manager.clone();

                std::thread::Builder::new()
                    .name("payload_loader".into())
                    .spawn(move || {
                        Self::load_payload(
                            &payload,
                            memory_manager,
                            #[cfg(feature = "igvm")]
                            cpu_manager,
                            #[cfg(feature = "sev_snp")]
                            sev_snp_enabled,
                        )
                    })
                    .map_err(Error::KernelLoadThreadSpawn)
            })
            .transpose()
    }

    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
        trace_scoped!("configure_system");
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
        let rsdp_addr = Some(rsdp_addr);
        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        let serial_number = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.serial_number.clone());

        let uuid = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.uuid.clone());

        let oem_strings = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.oem_strings.clone());

        let oem_strings = oem_strings
            .as_deref()
            .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());

        let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            arch::layout::CMDLINE_MAX_SIZE,
            &initramfs_config,
            boot_vcpus,
            entry_addr.setup_header,
            rsdp_addr,
            sgx_epc_region,
            serial_number.as_deref(),
            uuid.as_deref(),
            oem_strings.as_deref(),
            topology,
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn configure_system(
        &mut self,
        _rsdp_addr: GuestAddress,
        _entry_addr: EntryPoint,
    ) -> Result<()> {
        let cmdline = Self::generate_cmdline(
            self.config.lock().unwrap().payload.as_ref().unwrap(),
            &self.device_manager,
        )?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
            let pci_space = PciSpaceInfo {
                pci_segment_id: pci_segment.id,
                mmio_config_address: pci_segment.mmio_config_address,
                pci_device_space_start: pci_segment.start_of_mem64_area,
                pci_device_space_size: pci_segment.end_of_mem64_area
                    - pci_segment.start_of_mem64_area
                    + 1,
            };
            pci_space_info.push(pci_space);
        }

        let virtio_iommu_bdf = self
            .device_manager
            .lock()
            .unwrap()
            .iommu_attached_devices()
            .as_ref()
            .map(|(v, _)| *v);

        let vgic = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .get_vgic()
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::SetupGic,
                ))
            })?;

        // The PMU interrupt is a PPI, so 16 needs to be added to it to get
        // the real IRQ number.
        let pmu_supported = self
            .cpu_manager
            .lock()
            .unwrap()
            .init_pmu(AARCH64_PMU_IRQ + 16)
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::VcpuInitPmu,
                ))
            })?;

        arch::configure_system(
            &mem,
            cmdline.as_cstring().unwrap().to_str().unwrap(),
            vcpu_mpidrs,
            vcpu_topology,
            device_info,
            &initramfs_config,
            &pci_space_info,
            virtio_iommu_bdf.map(|bdf| bdf.into()),
            &vgic,
            &self.numa_nodes,
            pmu_supported,
        )
        .map_err(Error::ConfigureSystem)?;

        Ok(())
    }

    #[cfg(target_arch = "riscv64")]
    fn configure_system(&mut self) -> Result<()> {
        let cmdline = Self::generate_cmdline(
            self.config.lock().unwrap().payload.as_ref().unwrap(),
            &self.device_manager,
        )?;
        let num_vcpu = self.cpu_manager.lock().unwrap().vcpus().len();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
            let pci_space = PciSpaceInfo {
                pci_segment_id: pci_segment.id,
                mmio_config_address: pci_segment.mmio_config_address,
                pci_device_space_start: pci_segment.start_of_mem64_area,
                pci_device_space_size: pci_segment.end_of_mem64_area
                    - pci_segment.start_of_mem64_area
                    + 1,
            };
            pci_space_info.push(pci_space);
        }

        // TODO: IOMMU for riscv64 is not yet supported in the kernel.

        let vaia = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .get_vaia()
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::riscv64::Error::SetupAia,
                ))
            })?;

        // TODO: PMU support for riscv64 is scheduled for a later stage.

        arch::configure_system(
            &mem,
            cmdline.as_cstring().unwrap().to_str().unwrap(),
            num_vcpu as u32,
            device_info,
            &initramfs_config,
            &pci_space_info,
            &vaia,
        )
        .map_err(Error::ConfigureSystem)?;

        Ok(())
    }

    pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
        self.device_manager.lock().unwrap().console_resize_pipe()
    }

    pub fn shutdown(&mut self) -> Result<()> {
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        let new_state = VmState::Shutdown;

        state.valid_transition(new_state)?;

        // Wake up the DeviceManager threads so they will get terminated cleanly.
        self.device_manager
            .lock()
            .unwrap()
            .resume()
            .map_err(Error::Resume)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .shutdown()
            .map_err(Error::CpuManager)?;

        // Wait for all the threads to finish.
        for thread in self.threads.drain(..) {
            thread.join().map_err(Error::ThreadCleanup)?
        }
        *state = new_state;

        Ok(())
    }

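    /// Hot-resizes the VM: adjusts the number of vCPUs, the amount of RAM,
    /// and/or the balloon size, notifying the guest (via ACPI or virtio-mem)
    /// and updating the stored configuration so a reboot keeps the new
    /// values.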
resize( &mut self, desired_vcpus: Option<u8>, desired_memory: Option<u64>, desired_balloon: Option<u64>, ) -> Result<()>1569 pub fn resize(
1570 &mut self,
1571 desired_vcpus: Option<u8>,
1572 desired_memory: Option<u64>,
1573 desired_balloon: Option<u64>,
1574 ) -> Result<()> {
1575 event!("vm", "resizing");
1576
1577 if let Some(desired_vcpus) = desired_vcpus {
1578 if self
1579 .cpu_manager
1580 .lock()
1581 .unwrap()
1582 .resize(desired_vcpus)
1583 .map_err(Error::CpuManager)?
1584 {
1585 self.device_manager
1586 .lock()
1587 .unwrap()
1588 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
1589 .map_err(Error::DeviceManager)?;
1590 }
1591 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
1592 }
1593
1594 if let Some(desired_memory) = desired_memory {
1595 let new_region = self
1596 .memory_manager
1597 .lock()
1598 .unwrap()
1599 .resize(desired_memory)
1600 .map_err(Error::MemoryManager)?;
1601
1602 let memory_config = &mut self.config.lock().unwrap().memory;
1603
1604 if let Some(new_region) = &new_region {
1605 self.device_manager
1606 .lock()
1607 .unwrap()
1608 .update_memory(new_region)
1609 .map_err(Error::DeviceManager)?;
1610
1611 match memory_config.hotplug_method {
1612 HotplugMethod::Acpi => {
1613 self.device_manager
1614 .lock()
1615 .unwrap()
1616 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
1617 .map_err(Error::DeviceManager)?;
1618 }
1619 HotplugMethod::VirtioMem => {}
1620 }
1621 }
1622
1623 // We update the VM config regardless of the actual guest resize
1624 // operation result (happened or not), so that if the VM reboots
1625 // it will be running with the last configure memory size.
1626 match memory_config.hotplug_method {
1627 HotplugMethod::Acpi => memory_config.size = desired_memory,
1628 HotplugMethod::VirtioMem => {
1629 if desired_memory > memory_config.size {
1630 memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
1631 } else {
1632 memory_config.hotplugged_size = None;
1633 }
1634 }
1635 }
1636 }
1637
1638 if let Some(desired_balloon) = desired_balloon {
1639 self.device_manager
1640 .lock()
1641 .unwrap()
1642 .resize_balloon(desired_balloon)
1643 .map_err(Error::DeviceManager)?;
1644
1645 // Update the configuration value for the balloon size to ensure
1646 // a reboot would use the right value.
1647 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
1648 balloon_config.size = desired_balloon;
1649 }
1650 }
1651
1652 event!("vm", "resized");
1653
1654 Ok(())
1655 }
1656
resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()>1657 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
1658 let memory_config = &mut self.config.lock().unwrap().memory;
1659
1660 if let Some(zones) = &mut memory_config.zones {
1661 for zone in zones.iter_mut() {
1662 if zone.id == id {
1663 if desired_memory >= zone.size {
1664 let hotplugged_size = desired_memory - zone.size;
1665 self.memory_manager
1666 .lock()
1667 .unwrap()
1668 .resize_zone(&id, desired_memory - zone.size)
1669 .map_err(Error::MemoryManager)?;
1670 // We update the memory zone config regardless of the
1671 // actual 'resize-zone' operation result (happened or
1672 // not), so that if the VM reboots it will be running
1673 // with the last configured memory zone size.
1674 zone.hotplugged_size = Some(hotplugged_size);
1675
1676 return Ok(());
1677 } else {
1678 error!(
1679 "Invalid to ask less ({}) than boot RAM ({}) for \
1680 this memory zone",
1681 desired_memory, zone.size,
1682 );
1683 return Err(Error::ResizeZone);
1684 }
1685 }
1686 }
1687 }
1688
1689 error!("Could not find the memory zone {} for the resize", id);
1690 Err(Error::ResizeZone)
1691 }
1692
add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo>1693 pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
1694 let pci_device_info = self
1695 .device_manager
1696 .lock()
1697 .unwrap()
1698 .add_device(&mut device_cfg)
1699 .map_err(Error::DeviceManager)?;
1700
1701 // Update VmConfig by adding the new device. This is important to
1702 // ensure the device would be created in case of a reboot.
1703 {
1704 let mut config = self.config.lock().unwrap();
1705 add_to_config(&mut config.devices, device_cfg);
1706 }
1707
1708 self.device_manager
1709 .lock()
1710 .unwrap()
1711 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1712 .map_err(Error::DeviceManager)?;
1713
1714 Ok(pci_device_info)
1715 }
1716
add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo>1717 pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
1718 let pci_device_info = self
1719 .device_manager
1720 .lock()
1721 .unwrap()
1722 .add_user_device(&mut device_cfg)
1723 .map_err(Error::DeviceManager)?;
1724
1725 // Update VmConfig by adding the new device. This is important to
1726 // ensure the device would be created in case of a reboot.
1727 {
1728 let mut config = self.config.lock().unwrap();
1729 add_to_config(&mut config.user_devices, device_cfg);
1730 }
1731
1732 self.device_manager
1733 .lock()
1734 .unwrap()
1735 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1736 .map_err(Error::DeviceManager)?;
1737
1738 Ok(pci_device_info)
1739 }
1740
remove_device(&mut self, id: String) -> Result<()>1741 pub fn remove_device(&mut self, id: String) -> Result<()> {
1742 self.device_manager
1743 .lock()
1744 .unwrap()
1745 .remove_device(id.clone())
1746 .map_err(Error::DeviceManager)?;
1747
1748 // Update VmConfig by removing the device. This is important to
1749 // ensure the device would not be created in case of a reboot.
1750 self.config.lock().unwrap().remove_device(&id);
1751
1752 self.device_manager
1753 .lock()
1754 .unwrap()
1755 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1756 .map_err(Error::DeviceManager)?;
1757 Ok(())
1758 }
1759
pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
1761 let pci_device_info = self
1762 .device_manager
1763 .lock()
1764 .unwrap()
1765 .add_disk(&mut disk_cfg)
1766 .map_err(Error::DeviceManager)?;
1767
1768 // Update VmConfig by adding the new device. This is important to
1769 // ensure the device would be created in case of a reboot.
1770 {
1771 let mut config = self.config.lock().unwrap();
1772 add_to_config(&mut config.disks, disk_cfg);
1773 }
1774
1775 self.device_manager
1776 .lock()
1777 .unwrap()
1778 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1779 .map_err(Error::DeviceManager)?;
1780
1781 Ok(pci_device_info)
1782 }
1783
pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
1785 let pci_device_info = self
1786 .device_manager
1787 .lock()
1788 .unwrap()
1789 .add_fs(&mut fs_cfg)
1790 .map_err(Error::DeviceManager)?;
1791
1792 // Update VmConfig by adding the new device. This is important to
1793 // ensure the device would be created in case of a reboot.
1794 {
1795 let mut config = self.config.lock().unwrap();
1796 add_to_config(&mut config.fs, fs_cfg);
1797 }
1798
1799 self.device_manager
1800 .lock()
1801 .unwrap()
1802 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1803 .map_err(Error::DeviceManager)?;
1804
1805 Ok(pci_device_info)
1806 }
1807
pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
1809 let pci_device_info = self
1810 .device_manager
1811 .lock()
1812 .unwrap()
1813 .add_pmem(&mut pmem_cfg)
1814 .map_err(Error::DeviceManager)?;
1815
1816 // Update VmConfig by adding the new device. This is important to
1817 // ensure the device would be created in case of a reboot.
1818 {
1819 let mut config = self.config.lock().unwrap();
1820 add_to_config(&mut config.pmem, pmem_cfg);
1821 }
1822
1823 self.device_manager
1824 .lock()
1825 .unwrap()
1826 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1827 .map_err(Error::DeviceManager)?;
1828
1829 Ok(pci_device_info)
1830 }
1831
pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
1833 let pci_device_info = self
1834 .device_manager
1835 .lock()
1836 .unwrap()
1837 .add_net(&mut net_cfg)
1838 .map_err(Error::DeviceManager)?;
1839
1840 // Update VmConfig by adding the new device. This is important to
1841 // ensure the device would be created in case of a reboot.
1842 {
1843 let mut config = self.config.lock().unwrap();
1844 add_to_config(&mut config.net, net_cfg);
1845 }
1846
1847 self.device_manager
1848 .lock()
1849 .unwrap()
1850 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1851 .map_err(Error::DeviceManager)?;
1852
1853 Ok(pci_device_info)
1854 }
1855
pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
1857 let pci_device_info = self
1858 .device_manager
1859 .lock()
1860 .unwrap()
1861 .add_vdpa(&mut vdpa_cfg)
1862 .map_err(Error::DeviceManager)?;
1863
1864 // Update VmConfig by adding the new device. This is important to
1865 // ensure the device would be created in case of a reboot.
1866 {
1867 let mut config = self.config.lock().unwrap();
1868 add_to_config(&mut config.vdpa, vdpa_cfg);
1869 }
1870
1871 self.device_manager
1872 .lock()
1873 .unwrap()
1874 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1875 .map_err(Error::DeviceManager)?;
1876
1877 Ok(pci_device_info)
1878 }
1879
pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
1881 let pci_device_info = self
1882 .device_manager
1883 .lock()
1884 .unwrap()
1885 .add_vsock(&mut vsock_cfg)
1886 .map_err(Error::DeviceManager)?;
1887
1888 // Update VmConfig by adding the new device. This is important to
1889 // ensure the device would be created in case of a reboot.
1890 {
1891 let mut config = self.config.lock().unwrap();
1892 config.vsock = Some(vsock_cfg);
1893 }
1894
1895 self.device_manager
1896 .lock()
1897 .unwrap()
1898 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
1899 .map_err(Error::DeviceManager)?;
1900
1901 Ok(pci_device_info)
1902 }
1903
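/// Returns per-device counters, keyed by device id and then by counter
/// name. A sketch of consuming them (the exact counter names depend on
/// the devices present):
///
/// ```ignore
/// for (device, counters) in vm.counters()? {
///     for (name, value) in counters {
///         println!("{device}: {name} = {}", value.0);
///     }
/// }
/// ```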
pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
1905 Ok(self.device_manager.lock().unwrap().counters())
1906 }
1907
1908 #[cfg(feature = "tdx")]
fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
1910 use arch::x86_64::tdx::*;
1911
1912 let firmware_path = self
1913 .config
1914 .lock()
1915 .unwrap()
1916 .payload
1917 .as_ref()
1918 .unwrap()
1919 .firmware
1920 .clone()
1921 .ok_or(Error::TdxFirmwareMissing)?;
// The TDVF file contains a table of sections as well as code
1923 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
1924
// Parse the section table; RAM backing the sections is allocated
// later, in populate_tdx_sections().
1926 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
1927 }
1928
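/// Interleaves RAM regions from `guest_memory` with the TDVF sections in
/// `sorted_sections` (expected sorted by descending address so they can be
/// popped in ascending order), producing `(start, size, is_ram)` tuples
/// for the TD HOB.
///
/// A worked example, mirroring the unit tests at the bottom of this file:
/// with RAM covering `0..0x1000_0000` and sections at `0x1000..0x5000` and
/// `0xc000..0xd000`, the result is `[(0, 0x1000, true),
/// (0x1000, 0x4000, false), (0x5000, 0x7000, true), (0xc000, 0x1000, false),
/// (0xd000, 0x0fff_3000, true)]`.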
1929 #[cfg(feature = "tdx")]
fn hob_memory_resources(
1931 mut sorted_sections: Vec<TdvfSection>,
1932 guest_memory: &GuestMemoryMmap,
1933 ) -> Vec<(u64, u64, bool)> {
1934 let mut list = Vec::new();
1935
1936 let mut current_section = sorted_sections.pop();
1937
1938 // RAM regions interleaved with TDVF sections
1939 let mut next_start_addr = 0;
1940 for region in guest_memory.iter() {
1941 let region_start = region.start_addr().0;
1942 let region_end = region.last_addr().0;
1943 if region_start > next_start_addr {
1944 next_start_addr = region_start;
1945 }
1946
1947 loop {
let (start, size, ram) = if let Some(section) = &current_section {
1949 if section.address <= next_start_addr {
1950 (section.address, section.size, false)
1951 } else {
1952 let last_addr = std::cmp::min(section.address - 1, region_end);
1953 (next_start_addr, last_addr - next_start_addr + 1, true)
1954 }
1955 } else {
1956 (next_start_addr, region_end - next_start_addr + 1, true)
1957 };
1958
1959 list.push((start, size, ram));
1960
1961 if !ram {
1962 current_section = sorted_sections.pop();
1963 }
1964
1965 next_start_addr = start + size;
1966
1967 if region_start > next_start_addr {
1968 next_start_addr = region_start;
1969 }
1970
1971 if next_start_addr > region_end {
1972 break;
1973 }
1974 }
1975 }
1976
1977 // Once all the interleaved sections have been processed, let's simply
1978 // pull the remaining ones.
1979 if let Some(section) = current_section {
1980 list.push((section.address, section.size, false));
1981 }
1982 while let Some(section) = sorted_sections.pop() {
1983 list.push((section.address, section.size, false));
1984 }
1985
1986 list
1987 }
1988
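/// Allocates RAM for any TDVF section that falls outside guest RAM, copies
/// the firmware volumes (BFV/CFV), the optional payload and its command
/// line into guest memory, then builds the TD HOB describing memory, MMIO
/// ranges, ACPI tables and the payload. Returns the guest address of the
/// HOB, if the firmware declared a `TdHob` section.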
1989 #[cfg(feature = "tdx")]
fn populate_tdx_sections(
1991 &mut self,
1992 sections: &[TdvfSection],
1993 guid_found: bool,
1994 ) -> Result<Option<u64>> {
1995 use arch::x86_64::tdx::*;
1996 // Get the memory end *before* we start adding TDVF ram regions
1997 let boot_guest_memory = self
1998 .memory_manager
1999 .lock()
2000 .as_ref()
2001 .unwrap()
2002 .boot_guest_memory();
2003 for section in sections {
2004 // No need to allocate if the section falls within guest RAM ranges
2005 if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
2006 info!(
2007 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
2008 section
2009 );
2010 continue;
2011 }
2012
2013 info!("Allocating TDVF Section: {:x?}", section);
2014 self.memory_manager
2015 .lock()
2016 .unwrap()
2017 .add_ram_region(GuestAddress(section.address), section.size as usize)
2018 .map_err(Error::AllocatingTdvfMemory)?;
2019 }
2020
// The TDVF file contains a table of sections as well as code
2022 let firmware_path = self
2023 .config
2024 .lock()
2025 .unwrap()
2026 .payload
2027 .as_ref()
2028 .unwrap()
2029 .firmware
2030 .clone()
2031 .ok_or(Error::TdxFirmwareMissing)?;
2032 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;
2033
2034 // The guest memory at this point now has all the required regions so it
2035 // is safe to copy from the TDVF file into it.
2036 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2037 let mem = guest_memory.memory();
2038 let mut payload_info = None;
2039 let mut hob_offset = None;
2040 for section in sections {
2041 info!("Populating TDVF Section: {:x?}", section);
2042 match section.r#type {
2043 TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
2044 info!("Copying section to guest memory");
2045 firmware_file
2046 .seek(SeekFrom::Start(section.data_offset as u64))
2047 .map_err(Error::LoadTdvf)?;
2048 mem.read_volatile_from(
2049 GuestAddress(section.address),
2050 &mut firmware_file,
2051 section.data_size as usize,
2052 )
2053 .unwrap();
2054 }
2055 TdvfSectionType::TdHob => {
2056 hob_offset = Some(section.address);
2057 }
2058 TdvfSectionType::Payload => {
2059 info!("Copying payload to guest memory");
2060 if let Some(payload_file) = self.kernel.as_mut() {
2061 let payload_size = payload_file
2062 .seek(SeekFrom::End(0))
2063 .map_err(Error::LoadPayload)?;
2064
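// 0x1f1 is the offset of the Linux x86 boot protocol `setup_header`
// inside a bzImage; the header is read from there and validated below.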
2065 payload_file
2066 .seek(SeekFrom::Start(0x1f1))
2067 .map_err(Error::LoadPayload)?;
2068
2069 let mut payload_header = linux_loader::bootparam::setup_header::default();
2070 payload_file
2071 .read_volatile(&mut payload_header.as_bytes())
2072 .unwrap();
2073
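// The `header` field must contain the "HdrS" boot protocol magic
// (0x5372_6448 when read little-endian); the subsequent check requires
// a protocol version >= 2.00 with the LOADED_HIGH flag set.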
2074 if payload_header.header != 0x5372_6448 {
2075 return Err(Error::InvalidPayloadType);
2076 }
2077
2078 if (payload_header.version < 0x0200)
2079 || ((payload_header.loadflags & 0x1) == 0x0)
2080 {
2081 return Err(Error::InvalidPayloadType);
2082 }
2083
2084 payload_file.rewind().map_err(Error::LoadPayload)?;
2085 mem.read_volatile_from(
2086 GuestAddress(section.address),
2087 payload_file,
2088 payload_size as usize,
2089 )
2090 .unwrap();
2091
2092 // Create the payload info that will be inserted into
2093 // the HOB.
2094 payload_info = Some(PayloadInfo {
2095 image_type: PayloadImageType::BzImage,
2096 entry_point: section.address,
2097 });
2098 }
2099 }
2100 TdvfSectionType::PayloadParam => {
2101 info!("Copying payload parameters to guest memory");
2102 let cmdline = Self::generate_cmdline(
2103 self.config.lock().unwrap().payload.as_ref().unwrap(),
2104 )?;
2105 mem.write_slice(
2106 cmdline.as_cstring().unwrap().as_bytes_with_nul(),
2107 GuestAddress(section.address),
2108 )
2109 .unwrap();
2110 }
2111 _ => {}
2112 }
2113 }
2114
2115 // Generate HOB
2116 let mut hob = TdHob::start(hob_offset.unwrap());
2117
2118 let mut sorted_sections = sections.to_vec();
2119 sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));
2120
2121 sorted_sections.sort_by_key(|section| section.address);
2122 sorted_sections.reverse();
2123
2124 for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
2125 hob.add_memory_resource(&mem, start, size, ram, guid_found)
2126 .map_err(Error::PopulateHob)?;
2127 }
2128
2129 // MMIO regions
2130 hob.add_mmio_resource(
2131 &mem,
2132 arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
2133 arch::layout::APIC_START.raw_value()
2134 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
2135 )
2136 .map_err(Error::PopulateHob)?;
2137 let start_of_device_area = self
2138 .memory_manager
2139 .lock()
2140 .unwrap()
2141 .start_of_device_area()
2142 .raw_value();
2143 let end_of_device_area = self
2144 .memory_manager
2145 .lock()
2146 .unwrap()
2147 .end_of_device_area()
2148 .raw_value();
2149 hob.add_mmio_resource(
2150 &mem,
2151 start_of_device_area,
2152 end_of_device_area - start_of_device_area,
2153 )
2154 .map_err(Error::PopulateHob)?;
2155
2156 // Loop over the ACPI tables and copy them to the HOB.
2157
2158 for acpi_table in crate::acpi::create_acpi_tables_tdx(
2159 &self.device_manager,
2160 &self.cpu_manager,
2161 &self.memory_manager,
2162 &self.numa_nodes,
2163 ) {
2164 hob.add_acpi_table(&mem, acpi_table.as_slice())
2165 .map_err(Error::PopulateHob)?;
2166 }
2167
2168 // If a payload info has been created, let's insert it into the HOB.
2169 if let Some(payload_info) = payload_info {
2170 hob.add_payload(&mem, payload_info)
2171 .map_err(Error::PopulateHob)?;
2172 }
2173
2174 hob.finish(&mem).map_err(Error::PopulateHob)?;
2175
2176 Ok(hob_offset)
2177 }
2178
2179 #[cfg(feature = "tdx")]
fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
2181 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2182 let mem = guest_memory.memory();
2183
2184 for section in sections {
2185 self.vm
2186 .tdx_init_memory_region(
2187 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
2188 section.address,
2189 section.size,
2190 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
2191 section.attributes == 1,
2192 )
2193 .map_err(Error::InitializeTdxMemoryRegion)?;
2194 }
2195
2196 Ok(())
2197 }
2198
// Creates the ACPI tables. In the case of TDX this is a no-op, since the
// tables are created and passed along when populating the HOB.
2203 #[cfg(not(target_arch = "riscv64"))]
fn create_acpi_tables(&self) -> Option<GuestAddress> {
2205 #[cfg(feature = "tdx")]
2206 if self.config.lock().unwrap().is_tdx_enabled() {
2207 return None;
2208 }
2209 let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
2210 let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
2211 let rsdp_addr = crate::acpi::create_acpi_tables(
2212 &mem,
2213 &self.device_manager,
2214 &self.cpu_manager,
2215 &self.memory_manager,
2216 &self.numa_nodes,
2217 tpm_enabled,
2218 );
2219 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);
2220
2221 Some(rsdp_addr)
2222 }
2223
fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
2225 trace_scoped!("entry_point");
2226
2227 self.load_payload_handle
2228 .take()
2229 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2230 .transpose()
2231 }
2232
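/// Boots the VM: takes the advisory disk locks, validates the state
/// transition, creates the ACPI tables, waits for the payload load to
/// finish, configures the vCPUs (plus, for TDX, the TDVF sections and the
/// HOB) and finally starts the boot vCPUs. Calling `boot()` on a paused VM
/// simply resumes it.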
pub fn boot(&mut self) -> Result<()> {
2234 trace_scoped!("Vm::boot");
2235 let current_state = self.get_state()?;
2236 if current_state == VmState::Paused {
2237 return self.resume().map_err(Error::Resume);
2238 }
2239
2240 // We acquire all advisory disk image locks here and not on device creation
2241 // to enable live-migration without locking issues.
2242 self.device_manager
2243 .lock()
2244 .unwrap()
2245 .try_lock_disks()
2246 .map_err(Error::LockingError)?;
2247
2248 let new_state = if self.stop_on_boot {
2249 VmState::BreakPoint
2250 } else {
2251 VmState::Running
2252 };
2253 current_state.valid_transition(new_state)?;
2254
// Do this early to parallelise with loading the kernel
2256 #[cfg(target_arch = "x86_64")]
2257 cfg_if::cfg_if! {
2258 if #[cfg(feature = "sev_snp")] {
2259 let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
2260 let rsdp_addr = if sev_snp_enabled {
// In the case of a SEV-SNP guest the ACPI tables are provided
// via IGVM, so skip the creation of ACPI tables and set the
// RSDP address to None.
2264 None
2265 } else {
2266 self.create_acpi_tables()
2267 };
2268 } else {
2269 let rsdp_addr = self.create_acpi_tables();
2270 }
2271 }
2272
// Load the kernel synchronously, or if it was loaded asynchronously,
// wait for the load to finish.
2275 let entry_point = self.entry_point()?;
2276
2277 #[cfg(feature = "tdx")]
2278 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();
2279
2280 #[cfg(target_arch = "aarch64")]
2281 let vgic = self
2282 .device_manager
2283 .lock()
2284 .unwrap()
2285 .get_interrupt_controller()
2286 .unwrap()
2287 .lock()
2288 .unwrap()
2289 .get_vgic()
2290 .unwrap();
2291
2292 #[cfg(target_arch = "aarch64")]
2293 let redist_addr = vgic.lock().unwrap().device_properties();
2294
2295 // Configure the vcpus that have been created
2296 let vcpus = self.cpu_manager.lock().unwrap().vcpus();
2297 for vcpu in vcpus {
2298 let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
2299 let boot_setup = entry_point.map(|e| (e, guest_memory));
2300 self.cpu_manager
2301 .lock()
2302 .unwrap()
2303 .configure_vcpu(vcpu.clone(), boot_setup)
2304 .map_err(Error::CpuManager)?;
2305
2306 #[cfg(target_arch = "aarch64")]
2307 vcpu.lock()
2308 .unwrap()
2309 .set_gic_redistributor_addr(redist_addr[2], redist_addr[3])
2310 .map_err(Error::CpuManager)?;
2311 }
2312
2313 #[cfg(feature = "tdx")]
2314 let (sections, guid_found) = if tdx_enabled {
2315 self.extract_tdvf_sections()?
2316 } else {
2317 (Vec::new(), false)
2318 };
2319
2320 // Configuring the TDX regions requires that the vCPUs are created.
2321 #[cfg(feature = "tdx")]
2322 let hob_address = if tdx_enabled {
2323 // TDX sections are written to memory.
self.populate_tdx_sections(&sections, guid_found)?
2325 } else {
2326 None
2327 };
2328
2329 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
2330 // available after they are configured
2331 #[cfg(target_arch = "aarch64")]
2332 let rsdp_addr = self.create_acpi_tables();
2333
2334 #[cfg(not(target_arch = "riscv64"))]
2335 // Configure shared state based on loaded kernel
2336 entry_point
2337 .map(|entry_point| {
2338 // Safe to unwrap rsdp_addr as we know it can't be None when
2339 // the entry_point is Some.
2340 self.configure_system(rsdp_addr.unwrap(), entry_point)
2341 })
2342 .transpose()?;
2343
2344 #[cfg(target_arch = "riscv64")]
2345 self.configure_system().unwrap();
2346
2347 #[cfg(feature = "tdx")]
2348 if let Some(hob_address) = hob_address {
2349 // With the HOB address extracted the vCPUs can have
2350 // their TDX state configured.
2351 self.cpu_manager
2352 .lock()
2353 .unwrap()
2354 .initialize_tdx(hob_address)
2355 .map_err(Error::CpuManager)?;
2356 // Let the hypervisor know which memory ranges are shared with the
2357 // guest. This prevents the guest from ignoring/discarding memory
2358 // regions provided by the host.
self.init_tdx_memory(&sections)?;
2360 // With TDX memory and CPU state configured TDX setup is complete
2361 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
2362 }
2363
2364 // Resume the vm for MSHV
2365 if current_state == VmState::Created {
2366 self.vm.resume().map_err(Error::ResumeVm)?;
2367 }
2368
2369 self.cpu_manager
2370 .lock()
2371 .unwrap()
2372 .start_boot_vcpus(new_state == VmState::BreakPoint)
2373 .map_err(Error::CpuManager)?;
2374
2375 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
2376 *state = new_state;
2377 Ok(())
2378 }
2379
pub fn restore(&mut self) -> Result<()> {
2381 event!("vm", "restoring");
2382
2383 // We acquire all advisory disk image locks again.
2384 self.device_manager
2385 .lock()
2386 .unwrap()
2387 .try_lock_disks()
2388 .map_err(Error::LockingError)?;
2389
2390 // Now we can start all vCPUs from here.
2391 self.cpu_manager
2392 .lock()
2393 .unwrap()
2394 .start_restored_vcpus()
2395 .map_err(Error::CpuManager)?;
2396
2397 event!("vm", "restored");
2398 Ok(())
2399 }
2400
/// Gets a thread-safe, reference-counted pointer to the VM configuration.
pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
2403 Arc::clone(&self.config)
2404 }
2405
2406 /// Get the VM state. Returns an error if the state is poisoned.
pub fn get_state(&self) -> Result<VmState> {
2408 self.state
2409 .try_read()
2410 .map_err(|_| Error::PoisonedState)
2411 .map(|state| *state)
2412 }
2413
2414 /// Gets the actual size of the balloon.
pub fn balloon_size(&self) -> u64 {
2416 self.device_manager.lock().unwrap().balloon_size()
2417 }
2418
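/// Sends the guest memory slot file descriptors to the destination VMM.
/// For every slot this writes a `memory_fd` request, then the slot number
/// with the fd attached as an `SCM_RIGHTS` ancillary message, and waits
/// for the peer's acknowledgement before moving on to the next slot.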
pub fn send_memory_fds(
2420 &mut self,
2421 socket: &mut UnixStream,
2422 ) -> std::result::Result<(), MigratableError> {
2423 for (slot, fd) in self
2424 .memory_manager
2425 .lock()
2426 .unwrap()
2427 .memory_slot_fds()
2428 .drain()
2429 {
2430 Request::memory_fd(std::mem::size_of_val(&slot) as u64)
2431 .write_to(socket)
2432 .map_err(|e| {
2433 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
2434 })?;
2435 socket
2436 .send_with_fd(&slot.to_le_bytes()[..], fd)
2437 .map_err(|e| {
2438 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
2439 })?;
2440
2441 Response::read_from(socket)?.ok_or_abandon(
2442 socket,
2443 MigratableError::MigrateSend(anyhow!("Error during memory fd migration")),
2444 )?;
2445 }
2446
2447 Ok(())
2448 }
2449
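/// Streams the content of each memory range in `ranges` to `fd`, looping
/// on short writes (see the in-line comment below about vm-memory's
/// `write_all_to()` behavior).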
pub fn send_memory_regions<F>(
2451 &mut self,
2452 ranges: &MemoryRangeTable,
2453 fd: &mut F,
2454 ) -> std::result::Result<(), MigratableError>
2455 where
2456 F: WriteVolatile,
2457 {
2458 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2459 let mem = guest_memory.memory();
2460
2461 for range in ranges.regions() {
2462 let mut offset: u64 = 0;
// Here we are manually handling the retry in case we can't write the
// whole region at once, because we can't use the implementation
2465 // from vm-memory::GuestMemory of write_all_to() as it is not
2466 // following the correct behavior. For more info about this issue
2467 // see: https://github.com/rust-vmm/vm-memory/issues/174
2468 loop {
2469 let bytes_written = mem
2470 .write_volatile_to(
2471 GuestAddress(range.gpa + offset),
2472 fd,
2473 (range.length - offset) as usize,
2474 )
2475 .map_err(|e| {
2476 MigratableError::MigrateSend(anyhow!(
2477 "Error transferring memory to socket: {}",
2478 e
2479 ))
2480 })?;
2481 offset += bytes_written as u64;
2482
2483 if offset == range.length {
2484 break;
2485 }
2486 }
2487 }
2488
2489 Ok(())
2490 }
2491
pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2493 self.memory_manager
2494 .lock()
2495 .unwrap()
2496 .memory_range_table(false)
2497 }
2498
pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
2500 self.device_manager.lock().unwrap().device_tree()
2501 }
2502
2503 /// Release all advisory locks held for the disk images.
2504 ///
/// This should only be called when the VM is stopped and the VMM is supposed
2506 /// to shut down. A new VMM, either after a live migration or a
2507 /// state save/resume cycle, should then acquire all locks before the VM
2508 /// starts to run.
pub fn release_disk_locks(&self) -> Result<()> {
2510 self.device_manager
2511 .lock()
2512 .unwrap()
2513 .release_disk_locks()
2514 .map_err(Error::LockingError)?;
2515 Ok(())
2516 }
2517
pub fn activate_virtio_devices(&self) -> Result<()> {
2519 self.device_manager
2520 .lock()
2521 .unwrap()
2522 .activate_virtio_devices()
2523 .map_err(Error::ActivateVirtioDevices)
2524 }
2525
2526 #[cfg(target_arch = "x86_64")]
pub fn power_button(&self) -> Result<()> {
self.device_manager
.lock()
.unwrap()
.notify_power_button()
.map_err(Error::PowerButton)
2534 }
2535
2536 #[cfg(target_arch = "aarch64")]
pub fn power_button(&self) -> Result<()> {
2538 self.device_manager
2539 .lock()
2540 .unwrap()
2541 .notify_power_button()
2542 .map_err(Error::PowerButton)
2543 }
2544
2545 #[cfg(target_arch = "riscv64")]
pub fn power_button(&self) -> Result<()> {
2547 unimplemented!()
2548 }
2549
pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
2551 self.memory_manager.lock().unwrap().snapshot_data()
2552 }
2553
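/// Dispatches a single gdbstub request to the matching VM or vCPU
/// facility. Requests that produce data (register reads, memory reads,
/// the active vCPU count) return early with their payload; all others
/// complete with `GdbResponsePayload::CommandComplete`.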
2554 #[cfg(feature = "guest_debug")]
pub fn debug_request(
2556 &mut self,
2557 gdb_request: &GdbRequestPayload,
2558 cpu_id: usize,
2559 ) -> Result<GdbResponsePayload> {
2560 use GdbRequestPayload::*;
2561 match gdb_request {
2562 SetSingleStep(single_step) => {
2563 self.set_guest_debug(cpu_id, &[], *single_step)
2564 .map_err(Error::Debug)?;
2565 }
2566 SetHwBreakPoint(addrs) => {
2567 self.set_guest_debug(cpu_id, addrs, false)
2568 .map_err(Error::Debug)?;
2569 }
2570 Pause => {
2571 self.debug_pause().map_err(Error::Debug)?;
2572 }
2573 Resume => {
2574 self.debug_resume().map_err(Error::Debug)?;
2575 }
2576 ReadRegs => {
2577 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
2578 return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
2579 }
2580 WriteRegs(regs) => {
2581 self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
2582 }
2583 ReadMem(vaddr, len) => {
2584 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2585 let mem = self
2586 .read_mem(&guest_memory, cpu_id, *vaddr, *len)
2587 .map_err(Error::Debug)?;
2588 return Ok(GdbResponsePayload::MemoryRegion(mem));
2589 }
2590 WriteMem(vaddr, data) => {
2591 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
2592 self.write_mem(&guest_memory, cpu_id, vaddr, data)
2593 .map_err(Error::Debug)?;
2594 }
2595 ActiveVcpus => {
2596 let active_vcpus = self.active_vcpus();
2597 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
2598 }
2599 }
2600 Ok(GdbResponsePayload::CommandComplete)
2601 }
2602
2603 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
fn get_dump_state(
2605 &mut self,
2606 destination_url: &str,
2607 ) -> std::result::Result<DumpState, GuestDebuggableError> {
2608 let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
2609 let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
2610 let mut elf_phdr_num = 1;
2611 let elf_sh_info = 0;
2612 let coredump_file_path = url_to_file(destination_url)?;
2613 let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();
2614
2615 if mapping_num < UINT16_MAX - 2 {
2616 elf_phdr_num += mapping_num as u16;
2617 } else {
2618 panic!("mapping num beyond 65535 not supported");
2619 }
2620 let coredump_file = OpenOptions::new()
2621 .read(true)
2622 .write(true)
2623 .create_new(true)
2624 .open(coredump_file_path)
2625 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2626
2627 let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
2628 let mem_data = self
2629 .memory_manager
2630 .lock()
2631 .unwrap()
2632 .coredump_memory_regions(mem_offset);
2633
2634 Ok(DumpState {
2635 elf_note_size,
2636 elf_phdr_num,
2637 elf_sh_info,
2638 mem_offset,
2639 mem_info: Some(mem_data),
2640 file: Some(coredump_file),
2641 })
2642 }
2643
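/// Computes the file offset at which guest memory starts inside the
/// coredump: one ELF header, then the note section, then `phdr_num`
/// program headers.
///
/// A worked sketch with illustrative numbers: an `Elf64_Ehdr` is 64 bytes
/// and an `Elf64_Phdr` is 56 bytes, so with a 0x1000-byte note section and
/// 3 program headers the offset would be `64 + 0x1000 + 3 * 56 = 0x10e8`.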
2644 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
2646 size_of::<elf::Elf64_Ehdr>() as u64
2647 + note_size as u64
2648 + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
2649 }
2650
pub fn nmi(&self) -> Result<()> {
self.cpu_manager
.lock()
.unwrap()
.nmi()
.map_err(|_| Error::ErrorNmi)
2658 }
2659 }
2660
2661 impl Pausable for Vm {
fn pause(&mut self) -> std::result::Result<(), MigratableError> {
2663 event!("vm", "pausing");
2664 let mut state = self
2665 .state
2666 .try_write()
2667 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
2668 let new_state = VmState::Paused;
2669
2670 state
2671 .valid_transition(new_state)
2672 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;
2673
2674 #[cfg(target_arch = "x86_64")]
2675 {
2676 let mut clock = self
2677 .vm
2678 .get_clock()
2679 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
2680 clock.reset_flags();
2681 self.saved_clock = Some(clock);
2682 }
2683
// Before pausing the vCPUs, activate any virtio devices that might have
// become pending for activation since the pause (or e.g. the migration
// it is part of) was started.
2686 self.activate_virtio_devices().map_err(|e| {
2687 MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
2688 })?;
2689
2690 self.cpu_manager.lock().unwrap().pause()?;
2691 self.device_manager.lock().unwrap().pause()?;
2692
2693 self.vm
2694 .pause()
2695 .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?;
2696
2697 *state = new_state;
2698
2699 event!("vm", "paused");
2700 Ok(())
2701 }
2702
fn resume(&mut self) -> std::result::Result<(), MigratableError> {
2704 event!("vm", "resuming");
2705 let current_state = self.get_state().unwrap();
2706 let mut state = self
2707 .state
2708 .try_write()
2709 .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
2710 let new_state = VmState::Running;
2711
2712 state
2713 .valid_transition(new_state)
2714 .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;
2715
2716 self.cpu_manager.lock().unwrap().resume()?;
2717 #[cfg(target_arch = "x86_64")]
2718 {
2719 if let Some(clock) = &self.saved_clock {
2720 self.vm.set_clock(clock).map_err(|e| {
2721 MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
2722 })?;
2723 }
2724 }
2725
2726 if current_state == VmState::Paused {
2727 self.vm
2728 .resume()
2729 .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?;
2730 }
2731
2732 self.device_manager.lock().unwrap().resume()?;
2733
2734 // And we're back to the Running state.
2735 *state = new_state;
2736 event!("vm", "resumed");
2737 Ok(())
2738 }
2739 }
2740
2741 #[derive(Serialize, Deserialize)]
2742 pub struct VmSnapshot {
2743 #[cfg(target_arch = "x86_64")]
2744 pub clock: Option<hypervisor::ClockData>,
2745 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2746 pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
2747 }
2748
2749 pub const VM_SNAPSHOT_ID: &str = "vm";
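// A VM snapshot is hierarchical: the "vm" snapshot stores the saved clock
// and (on KVM/x86_64) the common CPUID as its own state, and nests the
// cpu-manager, memory-manager and device-manager snapshots as children.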
2750 impl Snapshottable for Vm {
fn id(&self) -> String {
2752 VM_SNAPSHOT_ID.to_string()
2753 }
2754
fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
2756 event!("vm", "snapshotting");
2757
2758 #[cfg(feature = "tdx")]
2759 {
2760 if self.config.lock().unwrap().is_tdx_enabled() {
2761 return Err(MigratableError::Snapshot(anyhow!(
2762 "Snapshot not possible with TDX VM"
2763 )));
2764 }
2765 }
2766
2767 let current_state = self.get_state().unwrap();
2768 if current_state != VmState::Paused {
2769 return Err(MigratableError::Snapshot(anyhow!(
2770 "Trying to snapshot while VM is running"
2771 )));
2772 }
2773
2774 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2775 let common_cpuid = {
2776 let amx = self.config.lock().unwrap().cpus.features.amx;
2777 let phys_bits = physical_bits(
2778 &self.hypervisor,
2779 self.config.lock().unwrap().cpus.max_phys_bits,
2780 );
2781 arch::generate_common_cpuid(
2782 &self.hypervisor,
2783 &arch::CpuidConfig {
2784 sgx_epc_sections: None,
2785 phys_bits,
2786 kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
2787 #[cfg(feature = "tdx")]
2788 tdx: false,
2789 amx,
2790 },
2791 )
2792 .map_err(|e| {
2793 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
2794 })?
2795 };
2796
2797 let vm_snapshot_state = VmSnapshot {
2798 #[cfg(target_arch = "x86_64")]
2799 clock: self.saved_clock,
2800 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
2801 common_cpuid,
2802 };
2803
2804 let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?;
2805
2806 let (id, snapshot) = {
2807 let mut cpu_manager = self.cpu_manager.lock().unwrap();
2808 (cpu_manager.id(), cpu_manager.snapshot()?)
2809 };
2810 vm_snapshot.add_snapshot(id, snapshot);
2811 let (id, snapshot) = {
2812 let mut memory_manager = self.memory_manager.lock().unwrap();
2813 (memory_manager.id(), memory_manager.snapshot()?)
2814 };
2815 vm_snapshot.add_snapshot(id, snapshot);
2816 let (id, snapshot) = {
2817 let mut device_manager = self.device_manager.lock().unwrap();
2818 (device_manager.id(), device_manager.snapshot()?)
2819 };
2820 vm_snapshot.add_snapshot(id, snapshot);
2821
2822 event!("vm", "snapshotted");
2823 Ok(vm_snapshot)
2824 }
2825 }
2826
2827 impl Transportable for Vm {
fn send(
2829 &self,
2830 snapshot: &Snapshot,
2831 destination_url: &str,
2832 ) -> std::result::Result<(), MigratableError> {
2833 let mut snapshot_config_path = url_to_path(destination_url)?;
2834 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);
2835
2836 // Create the snapshot config file
2837 let mut snapshot_config_file = OpenOptions::new()
2838 .read(true)
2839 .write(true)
2840 .create_new(true)
2841 .open(snapshot_config_path)
2842 .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2843
2844 // Serialize and write the snapshot config
2845 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
2846 .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2847
2848 snapshot_config_file
2849 .write(vm_config.as_bytes())
2850 .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2851
2852 let mut snapshot_state_path = url_to_path(destination_url)?;
2853 snapshot_state_path.push(SNAPSHOT_STATE_FILE);
2854
2855 // Create the snapshot state file
2856 let mut snapshot_state_file = OpenOptions::new()
2857 .read(true)
2858 .write(true)
2859 .create_new(true)
2860 .open(snapshot_state_path)
2861 .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2862
2863 // Serialize and write the snapshot state
2864 let vm_state =
2865 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;
2866
2867 snapshot_state_file
2868 .write(&vm_state)
2869 .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2870
2871 // Tell the memory manager to also send/write its own snapshot.
2872 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
2873 self.memory_manager
2874 .lock()
2875 .unwrap()
2876 .send(&memory_manager_snapshot.clone(), destination_url)?;
2877 } else {
2878 return Err(MigratableError::Restore(anyhow!(
2879 "Missing memory manager snapshot"
2880 )));
2881 }
2882
2883 Ok(())
2884 }
2885 }
2886
2887 impl Migratable for Vm {
fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2889 self.memory_manager.lock().unwrap().start_dirty_log()?;
2890 self.device_manager.lock().unwrap().start_dirty_log()
2891 }
2892
fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2894 self.memory_manager.lock().unwrap().stop_dirty_log()?;
2895 self.device_manager.lock().unwrap().stop_dirty_log()
2896 }
2897
fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2899 Ok(MemoryRangeTable::new_from_tables(vec![
2900 self.memory_manager.lock().unwrap().dirty_log()?,
2901 self.device_manager.lock().unwrap().dirty_log()?,
2902 ]))
2903 }
2904
fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
2906 self.memory_manager.lock().unwrap().start_migration()?;
2907 self.device_manager.lock().unwrap().start_migration()
2908 }
2909
fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
2911 self.memory_manager.lock().unwrap().complete_migration()?;
2912 self.device_manager.lock().unwrap().complete_migration()
2913 }
2914 }
2915
2916 #[cfg(feature = "guest_debug")]
2917 impl Debuggable for Vm {
fn set_guest_debug(
2919 &self,
2920 cpu_id: usize,
2921 addrs: &[GuestAddress],
2922 singlestep: bool,
2923 ) -> std::result::Result<(), DebuggableError> {
2924 self.cpu_manager
2925 .lock()
2926 .unwrap()
2927 .set_guest_debug(cpu_id, addrs, singlestep)
2928 }
2929
fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
2931 if *self.state.read().unwrap() == VmState::Running {
2932 self.pause().map_err(DebuggableError::Pause)?;
2933 }
2934
2935 let mut state = self
2936 .state
2937 .try_write()
2938 .map_err(|_| DebuggableError::PoisonedState)?;
2939 *state = VmState::BreakPoint;
2940 Ok(())
2941 }
2942
fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
2944 if *self.state.read().unwrap() == VmState::BreakPoint {
2945 self.resume().map_err(DebuggableError::Pause)?;
2946 }
2947
2948 Ok(())
2949 }
2950
fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
2952 self.cpu_manager.lock().unwrap().read_regs(cpu_id)
2953 }
2954
fn write_regs(
2956 &self,
2957 cpu_id: usize,
2958 regs: &CoreRegs,
2959 ) -> std::result::Result<(), DebuggableError> {
2960 self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
2961 }
2962
fn read_mem(
2964 &self,
2965 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2966 cpu_id: usize,
2967 vaddr: GuestAddress,
2968 len: usize,
2969 ) -> std::result::Result<Vec<u8>, DebuggableError> {
2970 self.cpu_manager
2971 .lock()
2972 .unwrap()
2973 .read_mem(guest_memory, cpu_id, vaddr, len)
2974 }
2975
fn write_mem(
2977 &self,
2978 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
2979 cpu_id: usize,
2980 vaddr: &GuestAddress,
2981 data: &[u8],
2982 ) -> std::result::Result<(), DebuggableError> {
2983 self.cpu_manager
2984 .lock()
2985 .unwrap()
2986 .write_mem(guest_memory, cpu_id, vaddr, data)
2987 }
2988
fn active_vcpus(&self) -> usize {
2990 let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
2991 if active_vcpus > 0 {
2992 active_vcpus
2993 } else {
2994 // The VM is not booted yet. Report boot_vcpus() instead.
2995 self.cpu_manager.lock().unwrap().boot_vcpus() as usize
2996 }
2997 }
2998 }
2999
3000 #[cfg(feature = "guest_debug")]
3001 pub const UINT16_MAX: u32 = 65535;
3002
3003 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
3004 impl Elf64Writable for Vm {}
3005
3006 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
3007 impl GuestDebuggable for Vm {
fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
3009 event!("vm", "coredumping");
3010
3011 let mut resume = false;
3012
3013 #[cfg(feature = "tdx")]
3014 {
3015 if let Some(ref platform) = self.config.lock().unwrap().platform {
3016 if platform.tdx {
3017 return Err(GuestDebuggableError::Coredump(anyhow!(
3018 "Coredump not possible with TDX VM"
3019 )));
3020 }
3021 }
3022 }
3023
3024 match self.get_state().unwrap() {
3025 VmState::Running => {
3026 self.pause().map_err(GuestDebuggableError::Pause)?;
3027 resume = true;
3028 }
3029 VmState::Paused => {}
3030 _ => {
3031 return Err(GuestDebuggableError::Coredump(anyhow!(
3032 "Trying to coredump while VM is not running or paused"
3033 )));
3034 }
3035 }
3036
3037 let coredump_state = self.get_dump_state(destination_url)?;
3038
3039 self.write_header(&coredump_state)?;
3040 self.write_note(&coredump_state)?;
3041 self.write_loads(&coredump_state)?;
3042
3043 self.cpu_manager
3044 .lock()
3045 .unwrap()
3046 .cpu_write_elf64_note(&coredump_state)?;
3047 self.cpu_manager
3048 .lock()
3049 .unwrap()
3050 .cpu_write_vmm_note(&coredump_state)?;
3051
3052 self.memory_manager
3053 .lock()
3054 .unwrap()
3055 .coredump_iterate_save_mem(&coredump_state)?;
3056
3057 if resume {
3058 self.resume().map_err(GuestDebuggableError::Resume)?;
3059 }
3060
3061 Ok(())
3062 }
3063 }
3064
3065 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
3066 #[cfg(test)]
3067 mod tests {
3068 use super::*;
3069
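// Expected transition matrix exercised below (row: from, column: to):
//
//              Created  Running  Shutdown  Paused  BreakPoint
// Created        no       yes      yes      yes      yes
// Running        no       no       yes      yes      yes
// Shutdown       no       yes      no       no       no
// Paused         no       yes      yes      no       no
// BreakPoint     yes      yes      no       no       no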
fn test_vm_state_transitions(state: VmState) {
3071 match state {
3072 VmState::Created => {
3073 // Check the transitions from Created
3074 state.valid_transition(VmState::Created).unwrap_err();
3075 state.valid_transition(VmState::Running).unwrap();
3076 state.valid_transition(VmState::Shutdown).unwrap();
3077 state.valid_transition(VmState::Paused).unwrap();
3078 state.valid_transition(VmState::BreakPoint).unwrap();
3079 }
3080 VmState::Running => {
3081 // Check the transitions from Running
3082 state.valid_transition(VmState::Created).unwrap_err();
3083 state.valid_transition(VmState::Running).unwrap_err();
3084 state.valid_transition(VmState::Shutdown).unwrap();
3085 state.valid_transition(VmState::Paused).unwrap();
3086 state.valid_transition(VmState::BreakPoint).unwrap();
3087 }
3088 VmState::Shutdown => {
3089 // Check the transitions from Shutdown
3090 state.valid_transition(VmState::Created).unwrap_err();
3091 state.valid_transition(VmState::Running).unwrap();
3092 state.valid_transition(VmState::Shutdown).unwrap_err();
3093 state.valid_transition(VmState::Paused).unwrap_err();
3094 state.valid_transition(VmState::BreakPoint).unwrap_err();
3095 }
3096 VmState::Paused => {
3097 // Check the transitions from Paused
3098 state.valid_transition(VmState::Created).unwrap_err();
3099 state.valid_transition(VmState::Running).unwrap();
3100 state.valid_transition(VmState::Shutdown).unwrap();
3101 state.valid_transition(VmState::Paused).unwrap_err();
3102 state.valid_transition(VmState::BreakPoint).unwrap_err();
3103 }
3104 VmState::BreakPoint => {
3105 // Check the transitions from Breakpoint
3106 state.valid_transition(VmState::Created).unwrap();
3107 state.valid_transition(VmState::Running).unwrap();
3108 state.valid_transition(VmState::Shutdown).unwrap_err();
3109 state.valid_transition(VmState::Paused).unwrap_err();
3110 state.valid_transition(VmState::BreakPoint).unwrap_err();
3111 }
3112 }
3113 }
3114
3115 #[test]
fn test_vm_created_transitions() {
3117 test_vm_state_transitions(VmState::Created);
3118 }
3119
3120 #[test]
fn test_vm_running_transitions() {
3122 test_vm_state_transitions(VmState::Running);
3123 }
3124
3125 #[test]
fn test_vm_shutdown_transitions() {
3127 test_vm_state_transitions(VmState::Shutdown);
3128 }
3129
3130 #[test]
fn test_vm_paused_transitions() {
3132 test_vm_state_transitions(VmState::Paused);
3133 }
3134
3135 #[cfg(feature = "tdx")]
3136 #[test]
fn test_hob_memory_resources() {
3138 // Case 1: Two TDVF sections in the middle of the RAM
3139 let sections = vec![
3140 TdvfSection {
3141 address: 0xc000,
3142 size: 0x1000,
3143 ..Default::default()
3144 },
3145 TdvfSection {
3146 address: 0x1000,
3147 size: 0x4000,
3148 ..Default::default()
3149 },
3150 ];
3151 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
3152 let expected = vec![
3153 (0, 0x1000, true),
3154 (0x1000, 0x4000, false),
3155 (0x5000, 0x7000, true),
3156 (0xc000, 0x1000, false),
3157 (0xd000, 0x0fff_3000, true),
3158 ];
3159 assert_eq!(
3160 expected,
3161 Vm::hob_memory_resources(
3162 sections,
3163 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3164 )
3165 );
3166
3167 // Case 2: Two TDVF sections with no conflict with the RAM
3168 let sections = vec![
3169 TdvfSection {
3170 address: 0x1000_1000,
3171 size: 0x1000,
3172 ..Default::default()
3173 },
3174 TdvfSection {
3175 address: 0,
3176 size: 0x1000,
3177 ..Default::default()
3178 },
3179 ];
3180 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3181 let expected = vec![
3182 (0, 0x1000, false),
3183 (0x1000, 0x1000_0000, true),
3184 (0x1000_1000, 0x1000, false),
3185 ];
3186 assert_eq!(
3187 expected,
3188 Vm::hob_memory_resources(
3189 sections,
3190 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3191 )
3192 );
3193
3194 // Case 3: Two TDVF sections with partial conflicts with the RAM
3195 let sections = vec![
3196 TdvfSection {
3197 address: 0x1000_0000,
3198 size: 0x2000,
3199 ..Default::default()
3200 },
3201 TdvfSection {
3202 address: 0,
3203 size: 0x2000,
3204 ..Default::default()
3205 },
3206 ];
3207 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3208 let expected = vec![
3209 (0, 0x2000, false),
3210 (0x2000, 0x0fff_e000, true),
3211 (0x1000_0000, 0x2000, false),
3212 ];
3213 assert_eq!(
3214 expected,
3215 Vm::hob_memory_resources(
3216 sections,
3217 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3218 )
3219 );
3220
3221 // Case 4: Two TDVF sections with no conflict before the RAM and two
3222 // more additional sections with no conflict after the RAM.
3223 let sections = vec![
3224 TdvfSection {
3225 address: 0x2000_1000,
3226 size: 0x1000,
3227 ..Default::default()
3228 },
3229 TdvfSection {
3230 address: 0x2000_0000,
3231 size: 0x1000,
3232 ..Default::default()
3233 },
3234 TdvfSection {
3235 address: 0x1000,
3236 size: 0x1000,
3237 ..Default::default()
3238 },
3239 TdvfSection {
3240 address: 0,
3241 size: 0x1000,
3242 ..Default::default()
3243 },
3244 ];
3245 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
3246 let expected = vec![
3247 (0, 0x1000, false),
3248 (0x1000, 0x1000, false),
3249 (0x4000, 0x1000_0000, true),
3250 (0x2000_0000, 0x1000, false),
3251 (0x2000_1000, 0x1000, false),
3252 ];
3253 assert_eq!(
3254 expected,
3255 Vm::hob_memory_resources(
3256 sections,
3257 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3258 )
3259 );
3260
// Case 5: One TDVF section covering the entire RAM
3262 let sections = vec![TdvfSection {
3263 address: 0,
3264 size: 0x2000_0000,
3265 ..Default::default()
3266 }];
3267 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
3268 let expected = vec![(0, 0x2000_0000, false)];
3269 assert_eq!(
3270 expected,
3271 Vm::hob_memory_resources(
3272 sections,
3273 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3274 )
3275 );
3276
3277 // Case 6: Two TDVF sections with no conflict with 2 RAM regions
3278 let sections = vec![
3279 TdvfSection {
3280 address: 0x1000_2000,
3281 size: 0x2000,
3282 ..Default::default()
3283 },
3284 TdvfSection {
3285 address: 0,
3286 size: 0x2000,
3287 ..Default::default()
3288 },
3289 ];
3290 let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3291 (GuestAddress(0x2000), 0x1000_0000),
3292 (GuestAddress(0x1000_4000), 0x1000_0000),
3293 ];
3294 let expected = vec![
3295 (0, 0x2000, false),
3296 (0x2000, 0x1000_0000, true),
3297 (0x1000_2000, 0x2000, false),
3298 (0x1000_4000, 0x1000_0000, true),
3299 ];
3300 assert_eq!(
3301 expected,
3302 Vm::hob_memory_resources(
3303 sections,
3304 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3305 )
3306 );
3307
3308 // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
3309 let sections = vec![
3310 TdvfSection {
3311 address: 0x1000_0000,
3312 size: 0x4000,
3313 ..Default::default()
3314 },
3315 TdvfSection {
3316 address: 0,
3317 size: 0x4000,
3318 ..Default::default()
3319 },
3320 ];
3321 let guest_ranges: Vec<(GuestAddress, usize)> = vec![
3322 (GuestAddress(0x1000), 0x1000_0000),
3323 (GuestAddress(0x1000_3000), 0x1000_0000),
3324 ];
3325 let expected = vec![
3326 (0, 0x4000, false),
3327 (0x4000, 0x0fff_c000, true),
3328 (0x1000_0000, 0x4000, false),
3329 (0x1000_4000, 0x0fff_f000, true),
3330 ];
3331 assert_eq!(
3332 expected,
3333 Vm::hob_memory_resources(
3334 sections,
3335 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
3336 )
3337 );
3338 }
3339 }
3340
3341 #[cfg(target_arch = "aarch64")]
3342 #[cfg(test)]
3343 mod tests {
3344 use arch::aarch64::fdt::create_fdt;
3345 use arch::aarch64::layout;
3346 use arch::{DeviceType, MmioDeviceInfo};
3347 use devices::gic::Gic;
3348
3349 use super::*;
3350
3351 const LEN: u64 = 4096;
3352
3353 #[test]
fn test_create_fdt_with_devices() {
3355 let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");
3357
3358 let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
3359 (
3360 (DeviceType::Serial, DeviceType::Serial.to_string()),
3361 MmioDeviceInfo {
3362 addr: 0x00,
3363 len: LEN,
3364 irq: 33,
3365 },
3366 ),
3367 (
3368 (DeviceType::Virtio(1), "virtio".to_string()),
3369 MmioDeviceInfo {
3370 addr: LEN,
3371 len: LEN,
3372 irq: 34,
3373 },
3374 ),
3375 (
3376 (DeviceType::Rtc, "rtc".to_string()),
3377 MmioDeviceInfo {
3378 addr: 2 * LEN,
3379 len: LEN,
3380 irq: 35,
3381 },
3382 ),
3383 ]
3384 .iter()
3385 .cloned()
3386 .collect();
3387
3388 let hv = hypervisor::new().unwrap();
3389 let vm = hv.create_vm().unwrap();
3390 let gic = vm
3391 .create_vgic(Gic::create_default_config(1))
3392 .expect("Cannot create gic");
3393 create_fdt(
3394 &mem,
3395 "console=tty0",
3396 vec![0],
3397 Some((0, 0, 0)),
3398 &dev_info,
3399 &gic,
3400 &None,
3401 &Vec::new(),
3402 &BTreeMap::new(),
3403 None,
3404 true,
3405 )
3406 .unwrap();
3407 }
3408 }
3409
3410 #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
3411 #[test]
pub fn test_vm() {
3413 use hypervisor::VmExit;
3414 use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
// This example is based on https://lwn.net/Articles/658511/
3416 let code = [
3417 0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
3418 0x00, 0xd8, /* add %bl, %al */
3419 0x04, b'0', /* add $'0', %al */
3420 0xee, /* out %al, (%dx) */
3421 0xb0, b'\n', /* mov $'\n', %al */
3422 0xee, /* out %al, (%dx) */
3423 0xf4, /* hlt */
3424 ];
3425
3426 let mem_size = 0x1000;
3427 let load_addr = GuestAddress(0x1000);
3428 let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();
3429
3430 let hv = hypervisor::new().unwrap();
3431 let vm = hv.create_vm().expect("new VM creation failed");
3432
3433 for (index, region) in mem.iter().enumerate() {
3434 let mem_region = vm.make_user_memory_region(
3435 index as u32,
3436 region.start_addr().raw_value(),
3437 region.len(),
3438 region.as_ptr() as u64,
3439 false,
3440 false,
3441 );
3442
3443 vm.create_user_memory_region(mem_region)
3444 .expect("Cannot configure guest memory");
3445 }
3446 mem.write_slice(&code, load_addr)
3447 .expect("Writing code to memory failed");
3448
3449 let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");
3450
3451 let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
3452 vcpu_sregs.cs.base = 0;
3453 vcpu_sregs.cs.selector = 0;
3454 vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");
3455
3456 let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
3457 vcpu_regs.set_rip(0x1000);
3458 vcpu_regs.set_rax(2);
3459 vcpu_regs.set_rbx(3);
3460 vcpu_regs.set_rflags(2);
3461 vcpu.set_regs(&vcpu_regs).expect("set regs failed");
3462
3463 loop {
3464 match vcpu.run().expect("run failed") {
3465 VmExit::Reset => {
3466 println!("HLT");
3467 break;
3468 }
3469 VmExit::Ignore => {}
3470 r => panic!("unexpected exit reason: {r:?}"),
3471 }
3472 }
3473 }
3474