1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::{ 15 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 16 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 17 }; 18 use crate::config::{NumaConfig, PayloadConfig}; 19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 20 use crate::coredump::{ 21 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 22 }; 23 use crate::cpu; 24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair}; 25 use crate::device_tree::DeviceTree; 26 #[cfg(feature = "guest_debug")] 27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 28 #[cfg(feature = "igvm")] 29 use crate::igvm::igvm_loader; 30 use crate::memory_manager::{ 31 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 32 }; 33 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 34 use crate::migration::get_vm_snapshot; 35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 36 use crate::migration::url_to_file; 37 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 38 use crate::GuestMemoryMmap; 39 use crate::{ 40 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 41 }; 42 use anyhow::anyhow; 43 use arch::get_host_cpu_phys_bits; 44 #[cfg(target_arch = "x86_64")] 45 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 46 #[cfg(feature = "tdx")] 47 use arch::x86_64::tdx::TdvfSection; 48 use 
arch::EntryPoint; 49 #[cfg(target_arch = "aarch64")] 50 use arch::PciSpaceInfo; 51 use arch::{NumaNode, NumaNodes}; 52 #[cfg(target_arch = "aarch64")] 53 use devices::interrupt_controller; 54 use devices::AcpiNotificationFlags; 55 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 56 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 57 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 58 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 59 use hypervisor::{HypervisorVmError, VmOps}; 60 use libc::{termios, SIGWINCH}; 61 use linux_loader::cmdline::Cmdline; 62 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 63 use linux_loader::elf; 64 #[cfg(target_arch = "x86_64")] 65 use linux_loader::loader::bzimage::BzImage; 66 #[cfg(target_arch = "x86_64")] 67 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 68 #[cfg(target_arch = "aarch64")] 69 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 70 use linux_loader::loader::KernelLoader; 71 use seccompiler::SeccompAction; 72 use serde::{Deserialize, Serialize}; 73 use std::cmp; 74 use std::collections::BTreeMap; 75 use std::collections::HashMap; 76 use std::fs::{File, OpenOptions}; 77 use std::io::{self, Seek, SeekFrom, Write}; 78 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 79 use std::mem::size_of; 80 use std::num::Wrapping; 81 use std::ops::Deref; 82 use std::os::unix::net::UnixStream; 83 use std::sync::{Arc, Mutex, RwLock}; 84 use std::time::Instant; 85 use std::{result, str, thread}; 86 use thiserror::Error; 87 use tracer::trace_scoped; 88 use vm_device::Bus; 89 #[cfg(feature = "tdx")] 90 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile}; 91 use vm_memory::{ 92 Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile, 93 }; 94 use vm_migration::protocol::{Request, Response, Status}; 95 use vm_migration::{ 96 protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, 
Pausable, Snapshot, 97 SnapshotData, Snapshottable, Transportable, 98 }; 99 use vmm_sys_util::eventfd::EventFd; 100 use vmm_sys_util::sock_ctrl_msg::ScmSocket; 101 102 /// Errors associated with VM management 103 #[derive(Debug, Error)] 104 pub enum Error { 105 #[error("Cannot open kernel file: {0}")] 106 KernelFile(#[source] io::Error), 107 108 #[error("Cannot open initramfs file: {0}")] 109 InitramfsFile(#[source] io::Error), 110 111 #[error("Cannot load the kernel into memory: {0}")] 112 KernelLoad(#[source] linux_loader::loader::Error), 113 114 #[cfg(target_arch = "aarch64")] 115 #[error("Cannot load the UEFI binary in memory: {0:?}")] 116 UefiLoad(arch::aarch64::uefi::Error), 117 118 #[error("Cannot load the initramfs into memory")] 119 InitramfsLoad, 120 121 #[error("Cannot load the kernel command line in memory: {0}")] 122 LoadCmdLine(#[source] linux_loader::loader::Error), 123 124 #[error("Cannot modify the kernel command line: {0}")] 125 CmdLineInsertStr(#[source] linux_loader::cmdline::Error), 126 127 #[error("Cannot create the kernel command line: {0}")] 128 CmdLineCreate(#[source] linux_loader::cmdline::Error), 129 130 #[error("Cannot configure system: {0}")] 131 ConfigureSystem(#[source] arch::Error), 132 133 #[cfg(target_arch = "aarch64")] 134 #[error("Cannot enable interrupt controller: {0:?}")] 135 EnableInterruptController(interrupt_controller::Error), 136 137 #[error("VM state is poisoned")] 138 PoisonedState, 139 140 #[error("Error from device manager: {0:?}")] 141 DeviceManager(DeviceManagerError), 142 143 #[error("No device with id {0:?} to remove")] 144 NoDeviceToRemove(String), 145 146 #[error("Cannot spawn a signal handler thread: {0}")] 147 SignalHandlerSpawn(#[source] io::Error), 148 149 #[error("Failed to join on threads: {0:?}")] 150 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 151 152 #[error("VM config is missing")] 153 VmMissingConfig, 154 155 #[error("VM is not created")] 156 VmNotCreated, 157 158 
#[error("VM is already created")] 159 VmAlreadyCreated, 160 161 #[error("VM is not running")] 162 VmNotRunning, 163 164 #[error("Cannot clone EventFd: {0}")] 165 EventFdClone(#[source] io::Error), 166 167 #[error("invalid VM state transition: {0:?} to {1:?}")] 168 InvalidStateTransition(VmState, VmState), 169 170 #[error("Error from CPU manager: {0}")] 171 CpuManager(#[source] cpu::Error), 172 173 #[error("Cannot pause devices: {0}")] 174 PauseDevices(#[source] MigratableError), 175 176 #[error("Cannot resume devices: {0}")] 177 ResumeDevices(#[source] MigratableError), 178 179 #[error("Cannot pause CPUs: {0}")] 180 PauseCpus(#[source] MigratableError), 181 182 #[error("Cannot resume cpus: {0}")] 183 ResumeCpus(#[source] MigratableError), 184 185 #[error("Cannot pause VM: {0}")] 186 Pause(#[source] MigratableError), 187 188 #[error("Cannot resume VM: {0}")] 189 Resume(#[source] MigratableError), 190 191 #[error("Memory manager error: {0:?}")] 192 MemoryManager(MemoryManagerError), 193 194 #[error("Eventfd write error: {0}")] 195 EventfdError(#[source] std::io::Error), 196 197 #[error("Cannot snapshot VM: {0}")] 198 Snapshot(#[source] MigratableError), 199 200 #[error("Cannot restore VM: {0}")] 201 Restore(#[source] MigratableError), 202 203 #[error("Cannot send VM snapshot: {0}")] 204 SnapshotSend(#[source] MigratableError), 205 206 #[error("Invalid restore source URL")] 207 InvalidRestoreSourceUrl, 208 209 #[error("Failed to validate config: {0}")] 210 ConfigValidation(#[source] ValidationError), 211 212 #[error("Too many virtio-vsock devices")] 213 TooManyVsockDevices, 214 215 #[error("Failed serializing into JSON: {0}")] 216 SerializeJson(#[source] serde_json::Error), 217 218 #[error("Invalid NUMA configuration")] 219 InvalidNumaConfig, 220 221 #[error("Cannot create seccomp filter: {0}")] 222 CreateSeccompFilter(#[source] seccompiler::Error), 223 224 #[error("Cannot apply seccomp filter: {0}")] 225 ApplySeccompFilter(#[source] seccompiler::Error), 226 227 
#[error("Failed resizing a memory zone")] 228 ResizeZone, 229 230 #[error("Cannot activate virtio devices: {0:?}")] 231 ActivateVirtioDevices(DeviceManagerError), 232 233 #[error("Error triggering power button: {0:?}")] 234 PowerButton(DeviceManagerError), 235 236 #[error("Kernel lacks PVH header")] 237 KernelMissingPvhHeader, 238 239 #[error("Failed to allocate firmware RAM: {0:?}")] 240 AllocateFirmwareMemory(MemoryManagerError), 241 242 #[error("Error manipulating firmware file: {0}")] 243 FirmwareFile(#[source] std::io::Error), 244 245 #[error("Firmware too big")] 246 FirmwareTooLarge, 247 248 #[error("Failed to copy firmware to memory: {0}")] 249 FirmwareLoad(#[source] vm_memory::GuestMemoryError), 250 251 #[cfg(feature = "sev_snp")] 252 #[error("Error enabling SEV-SNP VM: {0}")] 253 InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError), 254 255 #[cfg(feature = "tdx")] 256 #[error("Error performing I/O on TDX firmware file: {0}")] 257 LoadTdvf(#[source] std::io::Error), 258 259 #[cfg(feature = "tdx")] 260 #[error("Error performing I/O on the TDX payload file: {0}")] 261 LoadPayload(#[source] std::io::Error), 262 263 #[cfg(feature = "tdx")] 264 #[error("Error parsing TDVF: {0}")] 265 ParseTdvf(#[source] arch::x86_64::tdx::TdvfError), 266 267 #[cfg(feature = "tdx")] 268 #[error("Error populating TDX HOB: {0}")] 269 PopulateHob(#[source] arch::x86_64::tdx::TdvfError), 270 271 #[cfg(feature = "tdx")] 272 #[error("Error allocating TDVF memory: {0:?}")] 273 AllocatingTdvfMemory(crate::memory_manager::Error), 274 275 #[cfg(feature = "tdx")] 276 #[error("Error enabling TDX VM: {0}")] 277 InitializeTdxVm(#[source] hypervisor::HypervisorVmError), 278 279 #[cfg(feature = "tdx")] 280 #[error("Error enabling TDX memory region: {0}")] 281 InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError), 282 283 #[cfg(feature = "tdx")] 284 #[error("Error finalizing TDX VM: {0}")] 285 FinalizeTdx(#[source] hypervisor::HypervisorVmError), 286 287 #[cfg(feature = 
"tdx")] 288 #[error("TDX firmware missing")] 289 TdxFirmwareMissing, 290 291 #[cfg(feature = "tdx")] 292 #[error("Invalid TDX payload type")] 293 InvalidPayloadType, 294 295 #[cfg(feature = "guest_debug")] 296 #[error("Error debugging VM: {0:?}")] 297 Debug(DebuggableError), 298 299 #[error("Error spawning kernel loading thread")] 300 KernelLoadThreadSpawn(std::io::Error), 301 302 #[error("Error joining kernel loading thread")] 303 KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 304 305 #[error("Payload configuration is not bootable")] 306 InvalidPayload, 307 308 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 309 #[error("Error coredumping VM: {0:?}")] 310 Coredump(GuestDebuggableError), 311 312 #[cfg(feature = "igvm")] 313 #[error("Cannot open igvm file: {0}")] 314 IgvmFile(#[source] io::Error), 315 316 #[cfg(feature = "igvm")] 317 #[error("Cannot load the igvm into memory: {0}")] 318 IgvmLoad(#[source] igvm_loader::Error), 319 } 320 pub type Result<T> = result::Result<T, Error>; 321 322 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] 323 pub enum VmState { 324 Created, 325 Running, 326 Shutdown, 327 Paused, 328 BreakPoint, 329 } 330 331 impl VmState { 332 fn valid_transition(self, new_state: VmState) -> Result<()> { 333 match self { 334 VmState::Created => match new_state { 335 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)), 336 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => { 337 Ok(()) 338 } 339 }, 340 341 VmState::Running => match new_state { 342 VmState::Created | VmState::Running => { 343 Err(Error::InvalidStateTransition(self, new_state)) 344 } 345 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()), 346 }, 347 348 VmState::Shutdown => match new_state { 349 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => { 350 Err(Error::InvalidStateTransition(self, new_state)) 351 } 352 VmState::Running 
=> Ok(()), 353 }, 354 355 VmState::Paused => match new_state { 356 VmState::Created | VmState::Paused | VmState::BreakPoint => { 357 Err(Error::InvalidStateTransition(self, new_state)) 358 } 359 VmState::Running | VmState::Shutdown => Ok(()), 360 }, 361 VmState::BreakPoint => match new_state { 362 VmState::Created | VmState::Running => Ok(()), 363 _ => Err(Error::InvalidStateTransition(self, new_state)), 364 }, 365 } 366 } 367 } 368 369 struct VmOpsHandler { 370 memory: GuestMemoryAtomic<GuestMemoryMmap>, 371 #[cfg(target_arch = "x86_64")] 372 io_bus: Arc<Bus>, 373 mmio_bus: Arc<Bus>, 374 } 375 376 impl VmOps for VmOpsHandler { 377 fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> { 378 self.memory 379 .memory() 380 .write(buf, GuestAddress(gpa)) 381 .map_err(|e| HypervisorVmError::GuestMemWrite(e.into())) 382 } 383 384 fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> { 385 self.memory 386 .memory() 387 .read(buf, GuestAddress(gpa)) 388 .map_err(|e| HypervisorVmError::GuestMemRead(e.into())) 389 } 390 391 fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 392 if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) { 393 info!("Guest MMIO read to unregistered address 0x{:x}", gpa); 394 } 395 Ok(()) 396 } 397 398 fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 399 match self.mmio_bus.write(gpa, data) { 400 Err(vm_device::BusError::MissingAddressRange) => { 401 info!("Guest MMIO write to unregistered address 0x{:x}", gpa); 402 } 403 Ok(Some(barrier)) => { 404 info!("Waiting for barrier"); 405 barrier.wait(); 406 info!("Barrier released"); 407 } 408 _ => {} 409 }; 410 Ok(()) 411 } 412 413 #[cfg(target_arch = "x86_64")] 414 fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 415 if let Err(vm_device::BusError::MissingAddressRange) = 
self.io_bus.read(port, data) { 416 info!("Guest PIO read to unregistered address 0x{:x}", port); 417 } 418 Ok(()) 419 } 420 421 #[cfg(target_arch = "x86_64")] 422 fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 423 match self.io_bus.write(port, data) { 424 Err(vm_device::BusError::MissingAddressRange) => { 425 info!("Guest PIO write to unregistered address 0x{:x}", port); 426 } 427 Ok(Some(barrier)) => { 428 info!("Waiting for barrier"); 429 barrier.wait(); 430 info!("Barrier released"); 431 } 432 _ => {} 433 }; 434 Ok(()) 435 } 436 } 437 438 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 { 439 let host_phys_bits = get_host_cpu_phys_bits(hypervisor); 440 441 cmp::min(host_phys_bits, max_phys_bits) 442 } 443 444 pub struct Vm { 445 #[cfg(feature = "tdx")] 446 kernel: Option<File>, 447 initramfs: Option<File>, 448 threads: Vec<thread::JoinHandle<()>>, 449 device_manager: Arc<Mutex<DeviceManager>>, 450 config: Arc<Mutex<VmConfig>>, 451 state: RwLock<VmState>, 452 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 453 memory_manager: Arc<Mutex<MemoryManager>>, 454 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 455 // The hypervisor abstracted virtual machine. 
456 vm: Arc<dyn hypervisor::Vm>, 457 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 458 saved_clock: Option<hypervisor::ClockData>, 459 numa_nodes: NumaNodes, 460 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 461 hypervisor: Arc<dyn hypervisor::Hypervisor>, 462 stop_on_boot: bool, 463 load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>, 464 } 465 466 impl Vm { 467 pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH]; 468 469 #[allow(clippy::too_many_arguments)] 470 pub fn new_from_memory_manager( 471 config: Arc<Mutex<VmConfig>>, 472 memory_manager: Arc<Mutex<MemoryManager>>, 473 vm: Arc<dyn hypervisor::Vm>, 474 exit_evt: EventFd, 475 reset_evt: EventFd, 476 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 477 seccomp_action: &SeccompAction, 478 hypervisor: Arc<dyn hypervisor::Hypervisor>, 479 activate_evt: EventFd, 480 timestamp: Instant, 481 serial_pty: Option<PtyPair>, 482 console_pty: Option<PtyPair>, 483 debug_console_pty: Option<PtyPair>, 484 console_resize_pipe: Option<File>, 485 original_termios: Arc<Mutex<Option<termios>>>, 486 snapshot: Option<Snapshot>, 487 ) -> Result<Self> { 488 trace_scoped!("Vm::new_from_memory_manager"); 489 490 let boot_id_list = config 491 .lock() 492 .unwrap() 493 .validate() 494 .map_err(Error::ConfigValidation)?; 495 496 #[cfg(not(feature = "igvm"))] 497 let load_payload_handle = if snapshot.is_none() { 498 Self::load_payload_async(&memory_manager, &config)? 499 } else { 500 None 501 }; 502 503 info!("Booting VM from config: {:?}", &config); 504 505 // Create NUMA nodes based on NumaConfig. 
506 let numa_nodes = 507 Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?; 508 509 #[cfg(feature = "tdx")] 510 let tdx_enabled = config.lock().unwrap().is_tdx_enabled(); 511 #[cfg(feature = "sev_snp")] 512 let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled(); 513 #[cfg(feature = "tdx")] 514 let force_iommu = tdx_enabled; 515 #[cfg(feature = "sev_snp")] 516 let force_iommu = sev_snp_enabled; 517 #[cfg(not(any(feature = "tdx", feature = "sev_snp")))] 518 let force_iommu = false; 519 520 #[cfg(feature = "guest_debug")] 521 let stop_on_boot = config.lock().unwrap().gdb; 522 #[cfg(not(feature = "guest_debug"))] 523 let stop_on_boot = false; 524 525 let memory = memory_manager.lock().unwrap().guest_memory(); 526 #[cfg(target_arch = "x86_64")] 527 let io_bus = Arc::new(Bus::new()); 528 let mmio_bus = Arc::new(Bus::new()); 529 530 let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler { 531 memory, 532 #[cfg(target_arch = "x86_64")] 533 io_bus: io_bus.clone(), 534 mmio_bus: mmio_bus.clone(), 535 }); 536 537 let cpus_config = { &config.lock().unwrap().cpus.clone() }; 538 let cpu_manager = cpu::CpuManager::new( 539 cpus_config, 540 vm.clone(), 541 exit_evt.try_clone().map_err(Error::EventFdClone)?, 542 reset_evt.try_clone().map_err(Error::EventFdClone)?, 543 #[cfg(feature = "guest_debug")] 544 vm_debug_evt, 545 &hypervisor, 546 seccomp_action.clone(), 547 vm_ops, 548 #[cfg(feature = "tdx")] 549 tdx_enabled, 550 &numa_nodes, 551 #[cfg(feature = "sev_snp")] 552 sev_snp_enabled, 553 ) 554 .map_err(Error::CpuManager)?; 555 556 #[cfg(target_arch = "x86_64")] 557 cpu_manager 558 .lock() 559 .unwrap() 560 .populate_cpuid( 561 &memory_manager, 562 &hypervisor, 563 #[cfg(feature = "tdx")] 564 tdx_enabled, 565 ) 566 .map_err(Error::CpuManager)?; 567 568 // Loading the igvm file is pushed down here because 569 // igvm parser needs cpu_manager to retrieve cpuid leaf. 
570 // For the regular case, we can start loading early, but for 571 // igvm case we have to wait until cpu_manager is created. 572 // Currently, Microsoft Hypervisor does not provide any 573 // Hypervisor specific common cpuid, we need to call get_cpuid_values 574 // per cpuid through cpu_manager. 575 #[cfg(feature = "igvm")] 576 let load_payload_handle = if snapshot.is_none() { 577 Self::load_payload_async( 578 &memory_manager, 579 &config, 580 &cpu_manager, 581 #[cfg(feature = "sev_snp")] 582 sev_snp_enabled, 583 )? 584 } else { 585 None 586 }; 587 // The initial TDX configuration must be done before the vCPUs are 588 // created 589 #[cfg(feature = "tdx")] 590 if tdx_enabled { 591 let cpuid = cpu_manager.lock().unwrap().common_cpuid(); 592 let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32; 593 vm.tdx_init(&cpuid, max_vcpus) 594 .map_err(Error::InitializeTdxVm)?; 595 } 596 597 cpu_manager 598 .lock() 599 .unwrap() 600 .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID)) 601 .map_err(Error::CpuManager)?; 602 603 // This initial SEV-SNP configuration must be done immediately after 604 // vCPUs are created. As part of this initialization we are 605 // transitioning the guest into secure state. 
606 #[cfg(feature = "sev_snp")] 607 if sev_snp_enabled { 608 vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?; 609 } 610 611 #[cfg(feature = "tdx")] 612 let dynamic = !tdx_enabled; 613 #[cfg(not(feature = "tdx"))] 614 let dynamic = true; 615 616 let device_manager = DeviceManager::new( 617 #[cfg(target_arch = "x86_64")] 618 io_bus, 619 mmio_bus, 620 hypervisor.hypervisor_type(), 621 vm.clone(), 622 config.clone(), 623 memory_manager.clone(), 624 cpu_manager.clone(), 625 exit_evt.try_clone().map_err(Error::EventFdClone)?, 626 reset_evt, 627 seccomp_action.clone(), 628 numa_nodes.clone(), 629 &activate_evt, 630 force_iommu, 631 boot_id_list, 632 timestamp, 633 snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID), 634 dynamic, 635 ) 636 .map_err(Error::DeviceManager)?; 637 638 device_manager 639 .lock() 640 .unwrap() 641 .create_devices( 642 serial_pty, 643 console_pty, 644 debug_console_pty, 645 console_resize_pipe, 646 original_termios, 647 ) 648 .map_err(Error::DeviceManager)?; 649 650 #[cfg(feature = "tdx")] 651 let kernel = config 652 .lock() 653 .unwrap() 654 .payload 655 .as_ref() 656 .map(|p| p.kernel.as_ref().map(File::open)) 657 .unwrap_or_default() 658 .transpose() 659 .map_err(Error::KernelFile)?; 660 661 let initramfs = config 662 .lock() 663 .unwrap() 664 .payload 665 .as_ref() 666 .map(|p| p.initramfs.as_ref().map(File::open)) 667 .unwrap_or_default() 668 .transpose() 669 .map_err(Error::InitramfsFile)?; 670 671 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 672 let saved_clock = if let Some(snapshot) = snapshot.as_ref() { 673 let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?; 674 vm_snapshot.clock 675 } else { 676 None 677 }; 678 679 let vm_state = if snapshot.is_some() { 680 VmState::Paused 681 } else { 682 VmState::Created 683 }; 684 685 Ok(Vm { 686 #[cfg(feature = "tdx")] 687 kernel, 688 initramfs, 689 device_manager, 690 config, 691 threads: Vec::with_capacity(1), 692 state: RwLock::new(vm_state), 693 
cpu_manager, 694 memory_manager, 695 vm, 696 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 697 saved_clock, 698 numa_nodes, 699 hypervisor, 700 stop_on_boot, 701 load_payload_handle, 702 }) 703 } 704 705 fn create_numa_nodes( 706 configs: Option<Vec<NumaConfig>>, 707 memory_manager: &Arc<Mutex<MemoryManager>>, 708 ) -> Result<NumaNodes> { 709 let mm = memory_manager.lock().unwrap(); 710 let mm_zones = mm.memory_zones(); 711 let mut numa_nodes = BTreeMap::new(); 712 713 if let Some(configs) = &configs { 714 for config in configs.iter() { 715 if numa_nodes.contains_key(&config.guest_numa_id) { 716 error!("Can't define twice the same NUMA node"); 717 return Err(Error::InvalidNumaConfig); 718 } 719 720 let mut node = NumaNode::default(); 721 722 if let Some(memory_zones) = &config.memory_zones { 723 for memory_zone in memory_zones.iter() { 724 if let Some(mm_zone) = mm_zones.get(memory_zone) { 725 node.memory_regions.extend(mm_zone.regions().clone()); 726 if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() { 727 node.hotplug_regions.push(virtiomem_zone.region().clone()); 728 } 729 node.memory_zones.push(memory_zone.clone()); 730 } else { 731 error!("Unknown memory zone '{}'", memory_zone); 732 return Err(Error::InvalidNumaConfig); 733 } 734 } 735 } 736 737 if let Some(cpus) = &config.cpus { 738 node.cpus.extend(cpus); 739 } 740 741 if let Some(pci_segments) = &config.pci_segments { 742 node.pci_segments.extend(pci_segments); 743 } 744 745 if let Some(distances) = &config.distances { 746 for distance in distances.iter() { 747 let dest = distance.destination; 748 let dist = distance.distance; 749 750 if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) { 751 error!("Unknown destination NUMA node {}", dest); 752 return Err(Error::InvalidNumaConfig); 753 } 754 755 if node.distances.contains_key(&dest) { 756 error!("Destination NUMA node {} has been already set", dest); 757 return Err(Error::InvalidNumaConfig); 758 } 759 760 node.distances.insert(dest, dist); 
761 } 762 } 763 764 #[cfg(target_arch = "x86_64")] 765 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 766 if let Some(sgx_epc_region) = mm.sgx_epc_region() { 767 let mm_sections = sgx_epc_region.epc_sections(); 768 for sgx_epc_section in sgx_epc_sections.iter() { 769 if let Some(mm_section) = mm_sections.get(sgx_epc_section) { 770 node.sgx_epc_sections.push(mm_section.clone()); 771 } else { 772 error!("Unknown SGX EPC section '{}'", sgx_epc_section); 773 return Err(Error::InvalidNumaConfig); 774 } 775 } 776 } else { 777 error!("Missing SGX EPC region"); 778 return Err(Error::InvalidNumaConfig); 779 } 780 } 781 782 numa_nodes.insert(config.guest_numa_id, node); 783 } 784 } 785 786 Ok(numa_nodes) 787 } 788 789 #[allow(clippy::too_many_arguments)] 790 pub fn new( 791 vm_config: Arc<Mutex<VmConfig>>, 792 exit_evt: EventFd, 793 reset_evt: EventFd, 794 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 795 seccomp_action: &SeccompAction, 796 hypervisor: Arc<dyn hypervisor::Hypervisor>, 797 activate_evt: EventFd, 798 serial_pty: Option<PtyPair>, 799 console_pty: Option<PtyPair>, 800 debug_console_pty: Option<PtyPair>, 801 console_resize_pipe: Option<File>, 802 original_termios: Arc<Mutex<Option<termios>>>, 803 snapshot: Option<Snapshot>, 804 source_url: Option<&str>, 805 prefault: Option<bool>, 806 ) -> Result<Self> { 807 trace_scoped!("Vm::new"); 808 809 let timestamp = Instant::now(); 810 811 #[cfg(feature = "tdx")] 812 let tdx_enabled = if snapshot.is_some() { 813 false 814 } else { 815 vm_config.lock().unwrap().is_tdx_enabled() 816 }; 817 818 #[cfg(feature = "sev_snp")] 819 let sev_snp_enabled = if snapshot.is_some() { 820 false 821 } else { 822 vm_config.lock().unwrap().is_sev_snp_enabled() 823 }; 824 825 let vm = Self::create_hypervisor_vm( 826 &hypervisor, 827 #[cfg(feature = "tdx")] 828 tdx_enabled, 829 #[cfg(feature = "sev_snp")] 830 sev_snp_enabled, 831 )?; 832 833 let phys_bits = physical_bits(&hypervisor, 
vm_config.lock().unwrap().cpus.max_phys_bits); 834 835 let memory_manager = if let Some(snapshot) = 836 snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID) 837 { 838 MemoryManager::new_from_snapshot( 839 &snapshot, 840 vm.clone(), 841 &vm_config.lock().unwrap().memory.clone(), 842 source_url, 843 prefault.unwrap(), 844 phys_bits, 845 ) 846 .map_err(Error::MemoryManager)? 847 } else { 848 #[cfg(target_arch = "x86_64")] 849 let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone(); 850 851 MemoryManager::new( 852 vm.clone(), 853 &vm_config.lock().unwrap().memory.clone(), 854 None, 855 phys_bits, 856 #[cfg(feature = "tdx")] 857 tdx_enabled, 858 None, 859 None, 860 #[cfg(target_arch = "x86_64")] 861 sgx_epc_config, 862 ) 863 .map_err(Error::MemoryManager)? 864 }; 865 866 Vm::new_from_memory_manager( 867 vm_config, 868 memory_manager, 869 vm, 870 exit_evt, 871 reset_evt, 872 #[cfg(feature = "guest_debug")] 873 vm_debug_evt, 874 seccomp_action, 875 hypervisor, 876 activate_evt, 877 timestamp, 878 serial_pty, 879 console_pty, 880 debug_console_pty, 881 console_resize_pipe, 882 original_termios, 883 snapshot, 884 ) 885 } 886 887 pub fn create_hypervisor_vm( 888 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 889 #[cfg(feature = "tdx")] tdx_enabled: bool, 890 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 891 ) -> Result<Arc<dyn hypervisor::Vm>> { 892 hypervisor.check_required_extensions().unwrap(); 893 894 cfg_if::cfg_if! 
{ 895 if #[cfg(feature = "tdx")] { 896 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true 897 // Otherwise KVM_X86_LEGACY_VM: 0 898 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM 899 let vm = hypervisor 900 .create_vm_with_type(u64::from(tdx_enabled)) 901 .unwrap(); 902 } else if #[cfg(feature = "sev_snp")] { 903 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true 904 // Otherwise SEV_SNP_DISABLED: 0 905 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false 906 let vm = hypervisor 907 .create_vm_with_type(u64::from(sev_snp_enabled)) 908 .unwrap(); 909 } else { 910 let vm = hypervisor.create_vm().unwrap(); 911 } 912 } 913 914 #[cfg(target_arch = "x86_64")] 915 { 916 vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0) 917 .unwrap(); 918 vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap(); 919 vm.enable_split_irq().unwrap(); 920 } 921 922 Ok(vm) 923 } 924 925 fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> { 926 let initramfs = self.initramfs.as_mut().unwrap(); 927 let size: usize = initramfs 928 .seek(SeekFrom::End(0)) 929 .map_err(|_| Error::InitramfsLoad)? 
930 .try_into() 931 .unwrap(); 932 initramfs.rewind().map_err(|_| Error::InitramfsLoad)?; 933 934 let address = 935 arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?; 936 let address = GuestAddress(address); 937 938 guest_mem 939 .read_volatile_from(address, initramfs, size) 940 .map_err(|_| Error::InitramfsLoad)?; 941 942 info!("Initramfs loaded: address = 0x{:x}", address.0); 943 Ok(arch::InitramfsConfig { address, size }) 944 } 945 946 pub fn generate_cmdline( 947 payload: &PayloadConfig, 948 #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>, 949 ) -> Result<Cmdline> { 950 let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?; 951 if let Some(s) = payload.cmdline.as_ref() { 952 cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?; 953 } 954 955 #[cfg(target_arch = "aarch64")] 956 for entry in device_manager.lock().unwrap().cmdline_additions() { 957 cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?; 958 } 959 Ok(cmdline) 960 } 961 962 #[cfg(target_arch = "aarch64")] 963 fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> { 964 let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash(); 965 let mem = uefi_flash.memory(); 966 arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware) 967 .map_err(Error::UefiLoad)?; 968 Ok(()) 969 } 970 971 #[cfg(target_arch = "aarch64")] 972 fn load_kernel( 973 firmware: Option<File>, 974 kernel: Option<File>, 975 memory_manager: Arc<Mutex<MemoryManager>>, 976 ) -> Result<EntryPoint> { 977 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 978 let mem = guest_memory.memory(); 979 let entry_addr = match (firmware, kernel) { 980 (None, Some(mut kernel)) => { 981 match linux_loader::loader::pe::PE::load( 982 mem.deref(), 983 Some(arch::layout::KERNEL_START), 984 &mut kernel, 985 None, 986 ) { 987 Ok(entry_addr) => 
entry_addr.kernel_load, 988 // Try to load the binary as kernel PE file at first. 989 // If failed, retry to load it as UEFI binary. 990 // As the UEFI binary is formatless, it must be the last option to try. 991 Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => { 992 Self::load_firmware(&kernel, memory_manager)?; 993 arch::layout::UEFI_START 994 } 995 Err(e) => { 996 return Err(Error::KernelLoad(e)); 997 } 998 } 999 } 1000 (Some(firmware), None) => { 1001 Self::load_firmware(&firmware, memory_manager)?; 1002 arch::layout::UEFI_START 1003 } 1004 _ => return Err(Error::InvalidPayload), 1005 }; 1006 1007 Ok(EntryPoint { entry_addr }) 1008 } 1009 1010 #[cfg(feature = "igvm")] 1011 fn load_igvm( 1012 igvm: File, 1013 memory_manager: Arc<Mutex<MemoryManager>>, 1014 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 1015 #[cfg(feature = "sev_snp")] host_data: &Option<String>, 1016 ) -> Result<EntryPoint> { 1017 let res = igvm_loader::load_igvm( 1018 &igvm, 1019 memory_manager, 1020 cpu_manager.clone(), 1021 "", 1022 #[cfg(feature = "sev_snp")] 1023 host_data, 1024 ) 1025 .map_err(Error::IgvmLoad)?; 1026 1027 cfg_if::cfg_if! 
{ 1028 if #[cfg(feature = "sev_snp")] { 1029 let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() { 1030 EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None } 1031 } else { 1032 EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None } 1033 }; 1034 } else { 1035 let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }; 1036 } 1037 }; 1038 Ok(entry_point) 1039 } 1040 1041 #[cfg(target_arch = "x86_64")] 1042 fn load_kernel( 1043 mut kernel: File, 1044 cmdline: Option<Cmdline>, 1045 memory_manager: Arc<Mutex<MemoryManager>>, 1046 ) -> Result<EntryPoint> { 1047 info!("Loading kernel"); 1048 1049 let mem = { 1050 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 1051 guest_memory.memory() 1052 }; 1053 1054 // Try ELF binary with PVH boot. 1055 let entry_addr = linux_loader::loader::elf::Elf::load( 1056 mem.deref(), 1057 None, 1058 &mut kernel, 1059 Some(arch::layout::HIGH_RAM_START), 1060 ) 1061 // Try loading kernel as bzImage. 
1062 .or_else(|_| { 1063 BzImage::load( 1064 mem.deref(), 1065 None, 1066 &mut kernel, 1067 Some(arch::layout::HIGH_RAM_START), 1068 ) 1069 }) 1070 .map_err(Error::KernelLoad)?; 1071 1072 if let Some(cmdline) = cmdline { 1073 linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline) 1074 .map_err(Error::LoadCmdLine)?; 1075 } 1076 1077 if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap { 1078 // Use the PVH kernel entry point to boot the guest 1079 info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0); 1080 Ok(EntryPoint { 1081 entry_addr, 1082 setup_header: None, 1083 }) 1084 } else if entry_addr.setup_header.is_some() { 1085 // Use the bzImage 32bit entry point to boot the guest 1086 info!( 1087 "bzImage kernel loaded: entry_addr = 0x{:x}", 1088 entry_addr.kernel_load.0 1089 ); 1090 Ok(EntryPoint { 1091 entry_addr: entry_addr.kernel_load, 1092 setup_header: entry_addr.setup_header, 1093 }) 1094 } else { 1095 Err(Error::KernelMissingPvhHeader) 1096 } 1097 } 1098 1099 #[cfg(target_arch = "x86_64")] 1100 fn load_payload( 1101 payload: &PayloadConfig, 1102 memory_manager: Arc<Mutex<MemoryManager>>, 1103 #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>, 1104 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 1105 ) -> Result<EntryPoint> { 1106 trace_scoped!("load_payload"); 1107 #[cfg(feature = "igvm")] 1108 { 1109 if let Some(_igvm_file) = &payload.igvm { 1110 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?; 1111 #[cfg(feature = "sev_snp")] 1112 if sev_snp_enabled { 1113 return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data); 1114 } 1115 #[cfg(not(feature = "sev_snp"))] 1116 return Self::load_igvm(igvm, memory_manager, cpu_manager); 1117 } 1118 } 1119 match ( 1120 &payload.firmware, 1121 &payload.kernel, 1122 &payload.initramfs, 1123 &payload.cmdline, 1124 ) { 1125 (Some(firmware), None, None, None) => { 1126 let firmware = 
File::open(firmware).map_err(Error::FirmwareFile)?; 1127 Self::load_kernel(firmware, None, memory_manager) 1128 } 1129 (None, Some(kernel), _, _) => { 1130 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1131 let cmdline = Self::generate_cmdline(payload)?; 1132 Self::load_kernel(kernel, Some(cmdline), memory_manager) 1133 } 1134 _ => Err(Error::InvalidPayload), 1135 } 1136 } 1137 1138 #[cfg(target_arch = "aarch64")] 1139 fn load_payload( 1140 payload: &PayloadConfig, 1141 memory_manager: Arc<Mutex<MemoryManager>>, 1142 ) -> Result<EntryPoint> { 1143 match (&payload.firmware, &payload.kernel) { 1144 (Some(firmware), None) => { 1145 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; 1146 Self::load_kernel(Some(firmware), None, memory_manager) 1147 } 1148 (None, Some(kernel)) => { 1149 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1150 Self::load_kernel(None, Some(kernel), memory_manager) 1151 } 1152 _ => Err(Error::InvalidPayload), 1153 } 1154 } 1155 1156 fn load_payload_async( 1157 memory_manager: &Arc<Mutex<MemoryManager>>, 1158 config: &Arc<Mutex<VmConfig>>, 1159 #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>, 1160 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 1161 ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> { 1162 // Kernel with TDX is loaded in a different manner 1163 #[cfg(feature = "tdx")] 1164 if config.lock().unwrap().is_tdx_enabled() { 1165 return Ok(None); 1166 } 1167 1168 config 1169 .lock() 1170 .unwrap() 1171 .payload 1172 .as_ref() 1173 .map(|payload| { 1174 let memory_manager = memory_manager.clone(); 1175 let payload = payload.clone(); 1176 #[cfg(feature = "igvm")] 1177 let cpu_manager = cpu_manager.clone(); 1178 1179 std::thread::Builder::new() 1180 .name("payload_loader".into()) 1181 .spawn(move || { 1182 Self::load_payload( 1183 &payload, 1184 memory_manager, 1185 #[cfg(feature = "igvm")] 1186 cpu_manager, 1187 #[cfg(feature = "sev_snp")] 1188 sev_snp_enabled, 
1189 ) 1190 }) 1191 .map_err(Error::KernelLoadThreadSpawn) 1192 }) 1193 .transpose() 1194 } 1195 1196 #[cfg(target_arch = "x86_64")] 1197 fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> { 1198 trace_scoped!("configure_system"); 1199 info!("Configuring system"); 1200 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1201 1202 let initramfs_config = match self.initramfs { 1203 Some(_) => Some(self.load_initramfs(&mem)?), 1204 None => None, 1205 }; 1206 1207 let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); 1208 let rsdp_addr = Some(rsdp_addr); 1209 let sgx_epc_region = self 1210 .memory_manager 1211 .lock() 1212 .unwrap() 1213 .sgx_epc_region() 1214 .as_ref() 1215 .cloned(); 1216 1217 let serial_number = self 1218 .config 1219 .lock() 1220 .unwrap() 1221 .platform 1222 .as_ref() 1223 .and_then(|p| p.serial_number.clone()); 1224 1225 let uuid = self 1226 .config 1227 .lock() 1228 .unwrap() 1229 .platform 1230 .as_ref() 1231 .and_then(|p| p.uuid.clone()); 1232 1233 let oem_strings = self 1234 .config 1235 .lock() 1236 .unwrap() 1237 .platform 1238 .as_ref() 1239 .and_then(|p| p.oem_strings.clone()); 1240 1241 let oem_strings = oem_strings 1242 .as_deref() 1243 .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>()); 1244 1245 let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1246 1247 arch::configure_system( 1248 &mem, 1249 arch::layout::CMDLINE_START, 1250 arch::layout::CMDLINE_MAX_SIZE, 1251 &initramfs_config, 1252 boot_vcpus, 1253 entry_addr.setup_header, 1254 rsdp_addr, 1255 sgx_epc_region, 1256 serial_number.as_deref(), 1257 uuid.as_deref(), 1258 oem_strings.as_deref(), 1259 topology, 1260 ) 1261 .map_err(Error::ConfigureSystem)?; 1262 Ok(()) 1263 } 1264 1265 #[cfg(target_arch = "aarch64")] 1266 fn configure_system( 1267 &mut self, 1268 _rsdp_addr: GuestAddress, 1269 _entry_addr: EntryPoint, 1270 ) -> Result<()> { 1271 let cmdline = 
Self::generate_cmdline( 1272 self.config.lock().unwrap().payload.as_ref().unwrap(), 1273 &self.device_manager, 1274 )?; 1275 let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs(); 1276 let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1277 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1278 let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new(); 1279 let initramfs_config = match self.initramfs { 1280 Some(_) => Some(self.load_initramfs(&mem)?), 1281 None => None, 1282 }; 1283 1284 let device_info = &self 1285 .device_manager 1286 .lock() 1287 .unwrap() 1288 .get_device_info() 1289 .clone(); 1290 1291 for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() { 1292 let pci_space = PciSpaceInfo { 1293 pci_segment_id: pci_segment.id, 1294 mmio_config_address: pci_segment.mmio_config_address, 1295 pci_device_space_start: pci_segment.start_of_mem64_area, 1296 pci_device_space_size: pci_segment.end_of_mem64_area 1297 - pci_segment.start_of_mem64_area 1298 + 1, 1299 }; 1300 pci_space_info.push(pci_space); 1301 } 1302 1303 let virtio_iommu_bdf = self 1304 .device_manager 1305 .lock() 1306 .unwrap() 1307 .iommu_attached_devices() 1308 .as_ref() 1309 .map(|(v, _)| *v); 1310 1311 let vgic = self 1312 .device_manager 1313 .lock() 1314 .unwrap() 1315 .get_interrupt_controller() 1316 .unwrap() 1317 .lock() 1318 .unwrap() 1319 .get_vgic() 1320 .map_err(|_| { 1321 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1322 arch::aarch64::Error::SetupGic, 1323 )) 1324 })?; 1325 1326 // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number. 
1327 let pmu_supported = self 1328 .cpu_manager 1329 .lock() 1330 .unwrap() 1331 .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16) 1332 .map_err(|_| { 1333 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1334 arch::aarch64::Error::VcpuInitPmu, 1335 )) 1336 })?; 1337 1338 arch::configure_system( 1339 &mem, 1340 cmdline.as_cstring().unwrap().to_str().unwrap(), 1341 vcpu_mpidrs, 1342 vcpu_topology, 1343 device_info, 1344 &initramfs_config, 1345 &pci_space_info, 1346 virtio_iommu_bdf.map(|bdf| bdf.into()), 1347 &vgic, 1348 &self.numa_nodes, 1349 pmu_supported, 1350 ) 1351 .map_err(Error::ConfigureSystem)?; 1352 1353 Ok(()) 1354 } 1355 1356 pub fn serial_pty(&self) -> Option<PtyPair> { 1357 self.device_manager.lock().unwrap().serial_pty() 1358 } 1359 1360 pub fn console_pty(&self) -> Option<PtyPair> { 1361 self.device_manager.lock().unwrap().console_pty() 1362 } 1363 1364 pub fn debug_console_pty(&self) -> Option<PtyPair> { 1365 self.device_manager.lock().unwrap().debug_console_pty() 1366 } 1367 1368 pub fn console_resize_pipe(&self) -> Option<Arc<File>> { 1369 self.device_manager.lock().unwrap().console_resize_pipe() 1370 } 1371 1372 pub fn shutdown(&mut self) -> Result<()> { 1373 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 1374 let new_state = VmState::Shutdown; 1375 1376 state.valid_transition(new_state)?; 1377 1378 // Wake up the DeviceManager threads so they will get terminated cleanly 1379 self.device_manager 1380 .lock() 1381 .unwrap() 1382 .resume() 1383 .map_err(Error::Resume)?; 1384 1385 self.cpu_manager 1386 .lock() 1387 .unwrap() 1388 .shutdown() 1389 .map_err(Error::CpuManager)?; 1390 1391 // Wait for all the threads to finish 1392 for thread in self.threads.drain(..) { 1393 thread.join().map_err(Error::ThreadCleanup)? 
1394 } 1395 *state = new_state; 1396 1397 event!("vm", "shutdown"); 1398 1399 Ok(()) 1400 } 1401 1402 pub fn resize( 1403 &mut self, 1404 desired_vcpus: Option<u8>, 1405 desired_memory: Option<u64>, 1406 desired_balloon: Option<u64>, 1407 ) -> Result<()> { 1408 event!("vm", "resizing"); 1409 1410 if let Some(desired_vcpus) = desired_vcpus { 1411 if self 1412 .cpu_manager 1413 .lock() 1414 .unwrap() 1415 .resize(desired_vcpus) 1416 .map_err(Error::CpuManager)? 1417 { 1418 self.device_manager 1419 .lock() 1420 .unwrap() 1421 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1422 .map_err(Error::DeviceManager)?; 1423 } 1424 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1425 } 1426 1427 if let Some(desired_memory) = desired_memory { 1428 let new_region = self 1429 .memory_manager 1430 .lock() 1431 .unwrap() 1432 .resize(desired_memory) 1433 .map_err(Error::MemoryManager)?; 1434 1435 let memory_config = &mut self.config.lock().unwrap().memory; 1436 1437 if let Some(new_region) = &new_region { 1438 self.device_manager 1439 .lock() 1440 .unwrap() 1441 .update_memory(new_region) 1442 .map_err(Error::DeviceManager)?; 1443 1444 match memory_config.hotplug_method { 1445 HotplugMethod::Acpi => { 1446 self.device_manager 1447 .lock() 1448 .unwrap() 1449 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1450 .map_err(Error::DeviceManager)?; 1451 } 1452 HotplugMethod::VirtioMem => {} 1453 } 1454 } 1455 1456 // We update the VM config regardless of the actual guest resize 1457 // operation result (happened or not), so that if the VM reboots 1458 // it will be running with the last configure memory size. 
1459 match memory_config.hotplug_method { 1460 HotplugMethod::Acpi => memory_config.size = desired_memory, 1461 HotplugMethod::VirtioMem => { 1462 if desired_memory > memory_config.size { 1463 memory_config.hotplugged_size = Some(desired_memory - memory_config.size); 1464 } else { 1465 memory_config.hotplugged_size = None; 1466 } 1467 } 1468 } 1469 } 1470 1471 if let Some(desired_balloon) = desired_balloon { 1472 self.device_manager 1473 .lock() 1474 .unwrap() 1475 .resize_balloon(desired_balloon) 1476 .map_err(Error::DeviceManager)?; 1477 1478 // Update the configuration value for the balloon size to ensure 1479 // a reboot would use the right value. 1480 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon { 1481 balloon_config.size = desired_balloon; 1482 } 1483 } 1484 1485 event!("vm", "resized"); 1486 1487 Ok(()) 1488 } 1489 1490 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { 1491 let memory_config = &mut self.config.lock().unwrap().memory; 1492 1493 if let Some(zones) = &mut memory_config.zones { 1494 for zone in zones.iter_mut() { 1495 if zone.id == id { 1496 if desired_memory >= zone.size { 1497 let hotplugged_size = desired_memory - zone.size; 1498 self.memory_manager 1499 .lock() 1500 .unwrap() 1501 .resize_zone(&id, desired_memory - zone.size) 1502 .map_err(Error::MemoryManager)?; 1503 // We update the memory zone config regardless of the 1504 // actual 'resize-zone' operation result (happened or 1505 // not), so that if the VM reboots it will be running 1506 // with the last configured memory zone size. 
1507 zone.hotplugged_size = Some(hotplugged_size); 1508 1509 return Ok(()); 1510 } else { 1511 error!( 1512 "Invalid to ask less ({}) than boot RAM ({}) for \ 1513 this memory zone", 1514 desired_memory, zone.size, 1515 ); 1516 return Err(Error::ResizeZone); 1517 } 1518 } 1519 } 1520 } 1521 1522 error!("Could not find the memory zone {} for the resize", id); 1523 Err(Error::ResizeZone) 1524 } 1525 1526 pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1527 let pci_device_info = self 1528 .device_manager 1529 .lock() 1530 .unwrap() 1531 .add_device(&mut device_cfg) 1532 .map_err(Error::DeviceManager)?; 1533 1534 // Update VmConfig by adding the new device. This is important to 1535 // ensure the device would be created in case of a reboot. 1536 { 1537 let mut config = self.config.lock().unwrap(); 1538 add_to_config(&mut config.devices, device_cfg); 1539 } 1540 1541 self.device_manager 1542 .lock() 1543 .unwrap() 1544 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1545 .map_err(Error::DeviceManager)?; 1546 1547 Ok(pci_device_info) 1548 } 1549 1550 pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> { 1551 let pci_device_info = self 1552 .device_manager 1553 .lock() 1554 .unwrap() 1555 .add_user_device(&mut device_cfg) 1556 .map_err(Error::DeviceManager)?; 1557 1558 // Update VmConfig by adding the new device. This is important to 1559 // ensure the device would be created in case of a reboot. 
1560 { 1561 let mut config = self.config.lock().unwrap(); 1562 add_to_config(&mut config.user_devices, device_cfg); 1563 } 1564 1565 self.device_manager 1566 .lock() 1567 .unwrap() 1568 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1569 .map_err(Error::DeviceManager)?; 1570 1571 Ok(pci_device_info) 1572 } 1573 1574 pub fn remove_device(&mut self, id: String) -> Result<()> { 1575 self.device_manager 1576 .lock() 1577 .unwrap() 1578 .remove_device(id.clone()) 1579 .map_err(Error::DeviceManager)?; 1580 1581 // Update VmConfig by removing the device. This is important to 1582 // ensure the device would not be created in case of a reboot. 1583 self.config.lock().unwrap().remove_device(&id); 1584 1585 self.device_manager 1586 .lock() 1587 .unwrap() 1588 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1589 .map_err(Error::DeviceManager)?; 1590 Ok(()) 1591 } 1592 1593 pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> { 1594 let pci_device_info = self 1595 .device_manager 1596 .lock() 1597 .unwrap() 1598 .add_disk(&mut disk_cfg) 1599 .map_err(Error::DeviceManager)?; 1600 1601 // Update VmConfig by adding the new device. This is important to 1602 // ensure the device would be created in case of a reboot. 1603 { 1604 let mut config = self.config.lock().unwrap(); 1605 add_to_config(&mut config.disks, disk_cfg); 1606 } 1607 1608 self.device_manager 1609 .lock() 1610 .unwrap() 1611 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1612 .map_err(Error::DeviceManager)?; 1613 1614 Ok(pci_device_info) 1615 } 1616 1617 pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> { 1618 let pci_device_info = self 1619 .device_manager 1620 .lock() 1621 .unwrap() 1622 .add_fs(&mut fs_cfg) 1623 .map_err(Error::DeviceManager)?; 1624 1625 // Update VmConfig by adding the new device. This is important to 1626 // ensure the device would be created in case of a reboot. 
1627 { 1628 let mut config = self.config.lock().unwrap(); 1629 add_to_config(&mut config.fs, fs_cfg); 1630 } 1631 1632 self.device_manager 1633 .lock() 1634 .unwrap() 1635 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1636 .map_err(Error::DeviceManager)?; 1637 1638 Ok(pci_device_info) 1639 } 1640 1641 pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> { 1642 let pci_device_info = self 1643 .device_manager 1644 .lock() 1645 .unwrap() 1646 .add_pmem(&mut pmem_cfg) 1647 .map_err(Error::DeviceManager)?; 1648 1649 // Update VmConfig by adding the new device. This is important to 1650 // ensure the device would be created in case of a reboot. 1651 { 1652 let mut config = self.config.lock().unwrap(); 1653 add_to_config(&mut config.pmem, pmem_cfg); 1654 } 1655 1656 self.device_manager 1657 .lock() 1658 .unwrap() 1659 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1660 .map_err(Error::DeviceManager)?; 1661 1662 Ok(pci_device_info) 1663 } 1664 1665 pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> { 1666 let pci_device_info = self 1667 .device_manager 1668 .lock() 1669 .unwrap() 1670 .add_net(&mut net_cfg) 1671 .map_err(Error::DeviceManager)?; 1672 1673 // Update VmConfig by adding the new device. This is important to 1674 // ensure the device would be created in case of a reboot. 1675 { 1676 let mut config = self.config.lock().unwrap(); 1677 add_to_config(&mut config.net, net_cfg); 1678 } 1679 1680 self.device_manager 1681 .lock() 1682 .unwrap() 1683 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1684 .map_err(Error::DeviceManager)?; 1685 1686 Ok(pci_device_info) 1687 } 1688 1689 pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> { 1690 let pci_device_info = self 1691 .device_manager 1692 .lock() 1693 .unwrap() 1694 .add_vdpa(&mut vdpa_cfg) 1695 .map_err(Error::DeviceManager)?; 1696 1697 // Update VmConfig by adding the new device. 
This is important to 1698 // ensure the device would be created in case of a reboot. 1699 { 1700 let mut config = self.config.lock().unwrap(); 1701 add_to_config(&mut config.vdpa, vdpa_cfg); 1702 } 1703 1704 self.device_manager 1705 .lock() 1706 .unwrap() 1707 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1708 .map_err(Error::DeviceManager)?; 1709 1710 Ok(pci_device_info) 1711 } 1712 1713 pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> { 1714 let pci_device_info = self 1715 .device_manager 1716 .lock() 1717 .unwrap() 1718 .add_vsock(&mut vsock_cfg) 1719 .map_err(Error::DeviceManager)?; 1720 1721 // Update VmConfig by adding the new device. This is important to 1722 // ensure the device would be created in case of a reboot. 1723 { 1724 let mut config = self.config.lock().unwrap(); 1725 config.vsock = Some(vsock_cfg); 1726 } 1727 1728 self.device_manager 1729 .lock() 1730 .unwrap() 1731 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1732 .map_err(Error::DeviceManager)?; 1733 1734 Ok(pci_device_info) 1735 } 1736 1737 pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> { 1738 Ok(self.device_manager.lock().unwrap().counters()) 1739 } 1740 1741 #[cfg(feature = "tdx")] 1742 fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> { 1743 use arch::x86_64::tdx::*; 1744 1745 let firmware_path = self 1746 .config 1747 .lock() 1748 .unwrap() 1749 .payload 1750 .as_ref() 1751 .unwrap() 1752 .firmware 1753 .clone() 1754 .ok_or(Error::TdxFirmwareMissing)?; 1755 // The TDVF file contains a table of section as well as code 1756 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1757 1758 // For all the sections allocate some RAM backing them 1759 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf) 1760 } 1761 1762 #[cfg(feature = "tdx")] 1763 fn hob_memory_resources( 1764 mut sorted_sections: Vec<TdvfSection>, 1765 guest_memory: 
&GuestMemoryMmap, 1766 ) -> Vec<(u64, u64, bool)> { 1767 let mut list = Vec::new(); 1768 1769 let mut current_section = sorted_sections.pop(); 1770 1771 // RAM regions interleaved with TDVF sections 1772 let mut next_start_addr = 0; 1773 for region in guest_memory.iter() { 1774 let region_start = region.start_addr().0; 1775 let region_end = region.last_addr().0; 1776 if region_start > next_start_addr { 1777 next_start_addr = region_start; 1778 } 1779 1780 loop { 1781 let (start, size, ram) = if let Some(section) = ¤t_section { 1782 if section.address <= next_start_addr { 1783 (section.address, section.size, false) 1784 } else { 1785 let last_addr = std::cmp::min(section.address - 1, region_end); 1786 (next_start_addr, last_addr - next_start_addr + 1, true) 1787 } 1788 } else { 1789 (next_start_addr, region_end - next_start_addr + 1, true) 1790 }; 1791 1792 list.push((start, size, ram)); 1793 1794 if !ram { 1795 current_section = sorted_sections.pop(); 1796 } 1797 1798 next_start_addr = start + size; 1799 1800 if region_start > next_start_addr { 1801 next_start_addr = region_start; 1802 } 1803 1804 if next_start_addr > region_end { 1805 break; 1806 } 1807 } 1808 } 1809 1810 // Once all the interleaved sections have been processed, let's simply 1811 // pull the remaining ones. 
1812 if let Some(section) = current_section { 1813 list.push((section.address, section.size, false)); 1814 } 1815 while let Some(section) = sorted_sections.pop() { 1816 list.push((section.address, section.size, false)); 1817 } 1818 1819 list 1820 } 1821 1822 #[cfg(feature = "tdx")] 1823 fn populate_tdx_sections( 1824 &mut self, 1825 sections: &[TdvfSection], 1826 guid_found: bool, 1827 ) -> Result<Option<u64>> { 1828 use arch::x86_64::tdx::*; 1829 // Get the memory end *before* we start adding TDVF ram regions 1830 let boot_guest_memory = self 1831 .memory_manager 1832 .lock() 1833 .as_ref() 1834 .unwrap() 1835 .boot_guest_memory(); 1836 for section in sections { 1837 // No need to allocate if the section falls within guest RAM ranges 1838 if boot_guest_memory.address_in_range(GuestAddress(section.address)) { 1839 info!( 1840 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM", 1841 section 1842 ); 1843 continue; 1844 } 1845 1846 info!("Allocating TDVF Section: {:x?}", section); 1847 self.memory_manager 1848 .lock() 1849 .unwrap() 1850 .add_ram_region(GuestAddress(section.address), section.size as usize) 1851 .map_err(Error::AllocatingTdvfMemory)?; 1852 } 1853 1854 // The TDVF file contains a table of section as well as code 1855 let firmware_path = self 1856 .config 1857 .lock() 1858 .unwrap() 1859 .payload 1860 .as_ref() 1861 .unwrap() 1862 .firmware 1863 .clone() 1864 .ok_or(Error::TdxFirmwareMissing)?; 1865 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1866 1867 // The guest memory at this point now has all the required regions so it 1868 // is safe to copy from the TDVF file into it. 
1869 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1870 let mem = guest_memory.memory(); 1871 let mut payload_info = None; 1872 let mut hob_offset = None; 1873 for section in sections { 1874 info!("Populating TDVF Section: {:x?}", section); 1875 match section.r#type { 1876 TdvfSectionType::Bfv | TdvfSectionType::Cfv => { 1877 info!("Copying section to guest memory"); 1878 firmware_file 1879 .seek(SeekFrom::Start(section.data_offset as u64)) 1880 .map_err(Error::LoadTdvf)?; 1881 mem.read_volatile_from( 1882 GuestAddress(section.address), 1883 &mut firmware_file, 1884 section.data_size as usize, 1885 ) 1886 .unwrap(); 1887 } 1888 TdvfSectionType::TdHob => { 1889 hob_offset = Some(section.address); 1890 } 1891 TdvfSectionType::Payload => { 1892 info!("Copying payload to guest memory"); 1893 if let Some(payload_file) = self.kernel.as_mut() { 1894 let payload_size = payload_file 1895 .seek(SeekFrom::End(0)) 1896 .map_err(Error::LoadPayload)?; 1897 1898 payload_file 1899 .seek(SeekFrom::Start(0x1f1)) 1900 .map_err(Error::LoadPayload)?; 1901 1902 let mut payload_header = linux_loader::bootparam::setup_header::default(); 1903 payload_file 1904 .read_volatile(&mut payload_header.as_bytes()) 1905 .unwrap(); 1906 1907 if payload_header.header != 0x5372_6448 { 1908 return Err(Error::InvalidPayloadType); 1909 } 1910 1911 if (payload_header.version < 0x0200) 1912 || ((payload_header.loadflags & 0x1) == 0x0) 1913 { 1914 return Err(Error::InvalidPayloadType); 1915 } 1916 1917 payload_file.rewind().map_err(Error::LoadPayload)?; 1918 mem.read_volatile_from( 1919 GuestAddress(section.address), 1920 payload_file, 1921 payload_size as usize, 1922 ) 1923 .unwrap(); 1924 1925 // Create the payload info that will be inserted into 1926 // the HOB. 
1927 payload_info = Some(PayloadInfo { 1928 image_type: PayloadImageType::BzImage, 1929 entry_point: section.address, 1930 }); 1931 } 1932 } 1933 TdvfSectionType::PayloadParam => { 1934 info!("Copying payload parameters to guest memory"); 1935 let cmdline = Self::generate_cmdline( 1936 self.config.lock().unwrap().payload.as_ref().unwrap(), 1937 )?; 1938 mem.write_slice( 1939 cmdline.as_cstring().unwrap().as_bytes_with_nul(), 1940 GuestAddress(section.address), 1941 ) 1942 .unwrap(); 1943 } 1944 _ => {} 1945 } 1946 } 1947 1948 // Generate HOB 1949 let mut hob = TdHob::start(hob_offset.unwrap()); 1950 1951 let mut sorted_sections = sections.to_vec(); 1952 sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem)); 1953 1954 sorted_sections.sort_by_key(|section| section.address); 1955 sorted_sections.reverse(); 1956 1957 for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) { 1958 hob.add_memory_resource(&mem, start, size, ram, guid_found) 1959 .map_err(Error::PopulateHob)?; 1960 } 1961 1962 // MMIO regions 1963 hob.add_mmio_resource( 1964 &mem, 1965 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1966 arch::layout::APIC_START.raw_value() 1967 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1968 ) 1969 .map_err(Error::PopulateHob)?; 1970 let start_of_device_area = self 1971 .memory_manager 1972 .lock() 1973 .unwrap() 1974 .start_of_device_area() 1975 .raw_value(); 1976 let end_of_device_area = self 1977 .memory_manager 1978 .lock() 1979 .unwrap() 1980 .end_of_device_area() 1981 .raw_value(); 1982 hob.add_mmio_resource( 1983 &mem, 1984 start_of_device_area, 1985 end_of_device_area - start_of_device_area, 1986 ) 1987 .map_err(Error::PopulateHob)?; 1988 1989 // Loop over the ACPI tables and copy them to the HOB. 
1990 1991 for acpi_table in crate::acpi::create_acpi_tables_tdx( 1992 &self.device_manager, 1993 &self.cpu_manager, 1994 &self.memory_manager, 1995 &self.numa_nodes, 1996 ) { 1997 hob.add_acpi_table(&mem, acpi_table.as_slice()) 1998 .map_err(Error::PopulateHob)?; 1999 } 2000 2001 // If a payload info has been created, let's insert it into the HOB. 2002 if let Some(payload_info) = payload_info { 2003 hob.add_payload(&mem, payload_info) 2004 .map_err(Error::PopulateHob)?; 2005 } 2006 2007 hob.finish(&mem).map_err(Error::PopulateHob)?; 2008 2009 Ok(hob_offset) 2010 } 2011 2012 #[cfg(feature = "tdx")] 2013 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 2014 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2015 let mem = guest_memory.memory(); 2016 2017 for section in sections { 2018 self.vm 2019 .tdx_init_memory_region( 2020 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 2021 section.address, 2022 section.size, 2023 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 2024 section.attributes == 1, 2025 ) 2026 .map_err(Error::InitializeTdxMemoryRegion)?; 2027 } 2028 2029 Ok(()) 2030 } 2031 2032 // Creates ACPI tables 2033 // In case of TDX being used, this is a no-op since the tables will be 2034 // created and passed when populating the HOB. 
2035 2036 fn create_acpi_tables(&self) -> Option<GuestAddress> { 2037 #[cfg(feature = "tdx")] 2038 if self.config.lock().unwrap().is_tdx_enabled() { 2039 return None; 2040 } 2041 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 2042 let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); 2043 let rsdp_addr = crate::acpi::create_acpi_tables( 2044 &mem, 2045 &self.device_manager, 2046 &self.cpu_manager, 2047 &self.memory_manager, 2048 &self.numa_nodes, 2049 tpm_enabled, 2050 ); 2051 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 2052 2053 Some(rsdp_addr) 2054 } 2055 2056 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 2057 trace_scoped!("entry_point"); 2058 2059 self.load_payload_handle 2060 .take() 2061 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 2062 .transpose() 2063 } 2064 2065 pub fn boot(&mut self) -> Result<()> { 2066 trace_scoped!("Vm::boot"); 2067 info!("Booting VM"); 2068 event!("vm", "booting"); 2069 let current_state = self.get_state()?; 2070 if current_state == VmState::Paused { 2071 return self.resume().map_err(Error::Resume); 2072 } 2073 2074 let new_state = if self.stop_on_boot { 2075 VmState::BreakPoint 2076 } else { 2077 VmState::Running 2078 }; 2079 current_state.valid_transition(new_state)?; 2080 2081 // Do earlier to parallelise with loading kernel 2082 #[cfg(target_arch = "x86_64")] 2083 cfg_if::cfg_if! { 2084 if #[cfg(feature = "sev_snp")] { 2085 let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled(); 2086 let rsdp_addr = if sev_snp_enabled { 2087 // In case of SEV-SNP guest ACPI tables are provided via 2088 // IGVM. So skip the creation of ACPI tables and set the 2089 // rsdp addr to None. 2090 None 2091 } else { 2092 self.create_acpi_tables() 2093 }; 2094 } else { 2095 let rsdp_addr = self.create_acpi_tables(); 2096 } 2097 } 2098 2099 // Load kernel synchronously or if asynchronous then wait for load to 2100 // finish. 
2101 let entry_point = self.entry_point()?; 2102 2103 #[cfg(feature = "tdx")] 2104 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2105 2106 // Configure the vcpus that have been created 2107 let vcpus = self.cpu_manager.lock().unwrap().vcpus(); 2108 for vcpu in vcpus { 2109 let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2110 let boot_setup = entry_point.map(|e| (e, guest_memory)); 2111 self.cpu_manager 2112 .lock() 2113 .unwrap() 2114 .configure_vcpu(vcpu, boot_setup) 2115 .map_err(Error::CpuManager)?; 2116 } 2117 2118 #[cfg(feature = "tdx")] 2119 let (sections, guid_found) = if tdx_enabled { 2120 self.extract_tdvf_sections()? 2121 } else { 2122 (Vec::new(), false) 2123 }; 2124 2125 // Configuring the TDX regions requires that the vCPUs are created. 2126 #[cfg(feature = "tdx")] 2127 let hob_address = if tdx_enabled { 2128 // TDX sections are written to memory. 2129 self.populate_tdx_sections(§ions, guid_found)? 2130 } else { 2131 None 2132 }; 2133 2134 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 2135 // available after they are configured 2136 #[cfg(target_arch = "aarch64")] 2137 let rsdp_addr = self.create_acpi_tables(); 2138 2139 // Configure shared state based on loaded kernel 2140 entry_point 2141 .map(|entry_point| { 2142 // Safe to unwrap rsdp_addr as we know it can't be None when 2143 // the entry_point is Some. 2144 self.configure_system(rsdp_addr.unwrap(), entry_point) 2145 }) 2146 .transpose()?; 2147 2148 #[cfg(target_arch = "x86_64")] 2149 // Note: For x86, always call this function before invoking start boot vcpus. 2150 // Otherwise guest would fail to boot because we haven't created the 2151 // userspace mappings to update the hypervisor about the memory mappings. 2152 // These mappings must be created before we start the vCPU threads for 2153 // the very first time. 
2154 self.memory_manager 2155 .lock() 2156 .unwrap() 2157 .allocate_address_space() 2158 .map_err(Error::MemoryManager)?; 2159 2160 #[cfg(feature = "tdx")] 2161 if let Some(hob_address) = hob_address { 2162 // With the HOB address extracted the vCPUs can have 2163 // their TDX state configured. 2164 self.cpu_manager 2165 .lock() 2166 .unwrap() 2167 .initialize_tdx(hob_address) 2168 .map_err(Error::CpuManager)?; 2169 // Let the hypervisor know which memory ranges are shared with the 2170 // guest. This prevents the guest from ignoring/discarding memory 2171 // regions provided by the host. 2172 self.init_tdx_memory(§ions)?; 2173 // With TDX memory and CPU state configured TDX setup is complete 2174 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2175 } 2176 2177 self.cpu_manager 2178 .lock() 2179 .unwrap() 2180 .start_boot_vcpus(new_state == VmState::BreakPoint) 2181 .map_err(Error::CpuManager)?; 2182 2183 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2184 *state = new_state; 2185 event!("vm", "booted"); 2186 Ok(()) 2187 } 2188 2189 pub fn restore(&mut self) -> Result<()> { 2190 event!("vm", "restoring"); 2191 2192 #[cfg(target_arch = "x86_64")] 2193 // Note: For x86, always call this function before invoking start boot vcpus. 2194 // Otherwise guest would fail to boot because we haven't created the 2195 // userspace mappings to update the hypervisor about the memory mappings. 2196 // These mappings must be created before we start the vCPU threads for 2197 // the very first time for the restored VM. 2198 self.memory_manager 2199 .lock() 2200 .unwrap() 2201 .allocate_address_space() 2202 .map_err(Error::MemoryManager)?; 2203 2204 // Now we can start all vCPUs from here. 2205 self.cpu_manager 2206 .lock() 2207 .unwrap() 2208 .start_restored_vcpus() 2209 .map_err(Error::CpuManager)?; 2210 2211 event!("vm", "restored"); 2212 Ok(()) 2213 } 2214 2215 /// Gets a thread-safe reference counted pointer to the VM configuration. 
pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
        Arc::clone(&self.config)
    }

    /// Get the VM state. Returns an error if the state is poisoned.
    pub fn get_state(&self) -> Result<VmState> {
        self.state
            .try_read()
            .map_err(|_| Error::PoisonedState)
            .map(|state| *state)
    }

    /// Gets the actual size of the balloon.
    pub fn balloon_size(&self) -> u64 {
        self.device_manager.lock().unwrap().balloon_size()
    }

    /// Sends every memory slot file descriptor held by the memory manager over
    /// `socket`.
    ///
    /// For each `(slot, fd)` pair a `memory_fd` request is written, followed by
    /// the little-endian slot bytes with the descriptor attached, and a
    /// response is then read back. On a non-`Ok` status the migration is
    /// abandoned and an error is returned.
    ///
    /// # Errors
    ///
    /// Returns `MigratableError::MigrateSend` when a request or descriptor
    /// cannot be sent or when the peer reports a failure.
    pub fn send_memory_fds(
        &mut self,
        socket: &mut UnixStream,
    ) -> std::result::Result<(), MigratableError> {
        for (slot, fd) in self
            .memory_manager
            .lock()
            .unwrap()
            .memory_slot_fds()
            .drain()
        {
            Request::memory_fd(std::mem::size_of_val(&slot) as u64)
                .write_to(socket)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
                })?;
            socket
                .send_with_fd(&slot.to_le_bytes()[..], fd)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
                })?;

            let res = Response::read_from(socket)?;
            if res.status() != Status::Ok {
                warn!("Error during memory fd migration");
                Request::abandon().write_to(socket)?;
                // Best effort: the reply to the abandon request is not needed.
                Response::read_from(socket).ok();
                return Err(MigratableError::MigrateSend(anyhow!(
                    "Error during memory fd migration"
                )));
            }
        }

        Ok(())
    }

    /// Writes the guest memory covered by `ranges` to `fd`.
    ///
    /// Each range is written starting at its guest physical address; short
    /// writes are retried from the point reached until the whole range has
    /// been transferred.
    ///
    /// # Errors
    ///
    /// Returns `MigratableError::MigrateSend` if a write fails, or if a write
    /// makes no progress while data is still pending.
    pub fn send_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: WriteVolatile,
    {
        let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't send the
            // whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of write_all_to() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = mem
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        fd,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| {
                        MigratableError::MigrateSend(anyhow!(
                            "Error transferring memory to socket: {}",
                            e
                        ))
                    })?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }

                // A zero-length write with data still pending would otherwise
                // make this loop spin forever.
                if bytes_written == 0 {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Error transferring memory to socket: wrote zero bytes"
                    )));
                }
            }
        }

        Ok(())
    }

    /// Returns the memory range table reported by the memory manager
    /// (requested with the `false` flag).
    pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        self.memory_manager
            .lock()
            .unwrap()
            .memory_range_table(false)
    }

    /// Returns the shared device tree owned by the device manager.
    pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
        self.device_manager.lock().unwrap().device_tree()
    }

    /// Asks the device manager to activate virtio devices.
    ///
    /// # Errors
    ///
    /// Returns `Error::ActivateVirtioDevices` if the device manager fails.
    pub fn activate_virtio_devices(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .activate_virtio_devices()
            .map_err(Error::ActivateVirtioDevices)
    }

    /// Notifies the guest of a power button press through the device manager.
    ///
    /// # Errors
    ///
    /// Returns `Error::PowerButton` if the notification fails.
    #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
    pub fn power_button(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .notify_power_button()
            .map_err(Error::PowerButton)
    }

    /// Returns the snapshot data of the memory manager.
    pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
        self.memory_manager.lock().unwrap().snapshot_data()
    }

    /// Dispatches a GDB request for vCPU `cpu_id` to the matching debug
    /// operation.
    ///
    /// Requests that produce data (`ReadRegs`, `ReadMem`, `ActiveVcpus`) return
    /// the corresponding payload; every other request returns
    /// `GdbResponsePayload::CommandComplete`.
    ///
    /// # Errors
    ///
    /// Any failure of the underlying operation is returned as `Error::Debug`.
    #[cfg(feature = "guest_debug")]
    pub fn debug_request(
        &mut self,
        gdb_request: &GdbRequestPayload,
        cpu_id: usize,
    ) -> Result<GdbResponsePayload> {
        use GdbRequestPayload::*;
        match gdb_request {
            SetSingleStep(single_step) => {
                self.set_guest_debug(cpu_id, &[], *single_step)
                    .map_err(Error::Debug)?;
            }
            SetHwBreakPoint(addrs) => {
                self.set_guest_debug(cpu_id, addrs, false)
                    .map_err(Error::Debug)?;
            }
            Pause => {
                self.debug_pause().map_err(Error::Debug)?;
            }
            Resume => {
                self.debug_resume().map_err(Error::Debug)?;
            }
            ReadRegs => {
                let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
            }
            WriteRegs(regs) => {
                self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
            }
            ReadMem(vaddr, len) => {
                let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
                let mem = self
                    .read_mem(&guest_memory, cpu_id, *vaddr, *len)
                    .map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::MemoryRegion(mem));
            }
            WriteMem(vaddr, data) => {
                let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
                self.write_mem(&guest_memory, cpu_id, vaddr, data)
                    .map_err(Error::Debug)?;
            }
            ActiveVcpus => {
                let active_vcpus = self.active_vcpus();
                return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
            }
        }
        Ok(GdbResponsePayload::CommandComplete)
    }

    /// Builds the `DumpState` describing the coredump written to
    /// `destination_url`.
    ///
    /// The ELF program header count is one plus the number of guest RAM
    /// mappings. The destination file is created with `create_new`, so an
    /// existing file is reported as an error.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` if there are too many guest
    /// RAM mappings or if the coredump file cannot be created.
    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn get_dump_state(
        &mut self,
        destination_url: &str,
    ) -> std::result::Result<DumpState, GuestDebuggableError> {
        let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
        let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
        let mut elf_phdr_num = 1;
        let elf_sh_info = 0;
        let coredump_file_path = url_to_file(destination_url)?;
        let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();

        // Report an unsupported layout to the caller instead of aborting the
        // whole VMM.
        if mapping_num >= UINT16_MAX - 2 {
            return Err(GuestDebuggableError::Coredump(anyhow!(
                "mapping num beyond 65535 not supported"
            )));
        }
        // The guard above guarantees the value fits in a u16.
        elf_phdr_num += mapping_num as u16;

        let coredump_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(coredump_file_path)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
        let mem_data = self
            .memory_manager
            .lock()
            .unwrap()
            .coredump_memory_regions(mem_offset);

        Ok(DumpState {
            elf_note_size,
            elf_phdr_num,
            elf_sh_info,
            mem_offset,
            mem_info: Some(mem_data),
            file: Some(coredump_file),
        })
    }

    /// Computes the file offset at which guest memory starts in the coredump:
    /// the ELF header, then the notes, then `phdr_num` program headers.
    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
        size_of::<elf::Elf64_Ehdr>() as u64
            + note_size as u64
            + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
    }

    /// Writes the complete coredump (header, notes, load segments, per-CPU
    /// notes and guest memory) to `destination_url`.
    ///
    /// This performs no pause or resume itself; the caller is expected to
    /// have paused the VM already.
    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn coredump_paused_vm(
        &mut self,
        destination_url: &str,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let coredump_state = self.get_dump_state(destination_url)?;

        self.write_header(&coredump_state)?;
        self.write_note(&coredump_state)?;
        self.write_loads(&coredump_state)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_elf64_note(&coredump_state)?;
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_vmm_note(&coredump_state)?;

        self.memory_manager
            .lock()
            .unwrap()
            .coredump_iterate_save_mem(&coredump_state)?;

        Ok(())
    }
}

impl Pausable for Vm {
    /// Moves the VM to the `Paused` state.
    ///
    /// With KVM on x86_64 the VM clock is saved first. Pending virtio devices
    /// are then activated, and the CPU manager and device manager are paused
    /// in that order.
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "pausing");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
            clock.reset_flags();
            self.saved_clock = Some(clock);
        }

        // Before pausing the vCPUs activate any pending virtio devices that might
        // need activation between starting the pause (or e.g. a migration it's part of)
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    /// Moves the VM back to the `Running` state.
    ///
    /// The CPU manager is resumed first. With KVM on x86_64 any saved clock is
    /// then restored, and finally the device manager is resumed.
    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }
        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
        *state = new_state;
        event!("vm", "resumed");
        Ok(())
    }
}

/// VM-level data stored in a snapshot, serialized as JSON by
/// `Vm::snapshot`. Both fields exist only with KVM on x86_64.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub clock: Option<hypervisor::ClockData>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}

/// Identifier under which the VM snapshots itself.
pub const VM_SNAPSHOT_ID: &str = "vm";
impl Snapshottable for Vm {
    fn id(&self) -> String {
        VM_SNAPSHOT_ID.to_string()
    }

    /// Snapshots the VM together with its CPU, memory and device managers.
    ///
    /// # Errors
    ///
    /// Returns `MigratableError::Snapshot` if the VM uses TDX, if its state
    /// cannot be read, if it is not paused, or if the VM data cannot be
    /// generated or serialized. Errors from the managers' own snapshots are
    /// propagated unchanged.
    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        event!("vm", "snapshotting");

        #[cfg(feature = "tdx")]
        {
            if self.config.lock().unwrap().is_tdx_enabled() {
                return Err(MigratableError::Snapshot(anyhow!(
                    "Snapshot not possible with TDX VM"
                )));
            }
        }

        // Report an unreadable state to the caller rather than panicking.
        let current_state = self
            .get_state()
            .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get VM state: {:?}", e)))?;
        if current_state != VmState::Paused {
            return Err(MigratableError::Snapshot(anyhow!(
                "Trying to snapshot while VM is running"
            )));
        }

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let common_cpuid = {
            let amx = self.config.lock().unwrap().cpus.features.amx;
            let phys_bits = physical_bits(
                &self.hypervisor,
                self.config.lock().unwrap().cpus.max_phys_bits,
            );
            arch::generate_common_cpuid(
                &self.hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections: None,
                    phys_bits,
                    kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx: false,
                    amx,
                },
            )
            .map_err(|e| {
                MigratableError::Snapshot(anyhow!("Error generating common cpuid: {:?}", e))
            })?
        };

        let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            clock: self.saved_clock,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            common_cpuid,
        })
        .map_err(|e| MigratableError::Snapshot(e.into()))?;

        let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data));

        let (id, snapshot) = {
            let mut cpu_manager = self.cpu_manager.lock().unwrap();
            (cpu_manager.id(), cpu_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);
        let (id, snapshot) = {
            let mut memory_manager = self.memory_manager.lock().unwrap();
            (memory_manager.id(), memory_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);
        let (id, snapshot) = {
            let mut device_manager = self.device_manager.lock().unwrap();
            (device_manager.id(), device_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);

        event!("vm", "snapshotted");
        Ok(vm_snapshot)
    }
}

impl Transportable for Vm {
    /// Writes the VM configuration and the serialized `snapshot` into the
    /// directory designated by `destination_url`, then lets the memory manager
    /// send its own snapshot.
    ///
    /// Both files are created with `create_new`, so existing files are
    /// reported as errors.
    ///
    /// # Errors
    ///
    /// Returns `MigratableError::MigrateSend` on any I/O or serialization
    /// failure, or when `snapshot` carries no memory manager snapshot.
    fn send(
        &self,
        snapshot: &Snapshot,
        destination_url: &str,
    ) -> std::result::Result<(), MigratableError> {
        let mut snapshot_config_path = url_to_path(destination_url)?;
        snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);

        // Create the snapshot config file
        let mut snapshot_config_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(snapshot_config_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Serialize and write the snapshot config
        let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // `write` may perform a short write; `write_all` guarantees the whole
        // buffer reaches the file.
        snapshot_config_file
            .write_all(vm_config.as_bytes())
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let mut snapshot_state_path = url_to_path(destination_url)?;
        snapshot_state_path.push(SNAPSHOT_STATE_FILE);

        // Create the snapshot state file
        let mut snapshot_state_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(snapshot_state_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Serialize and write the snapshot state
        let vm_state =
            serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;

        snapshot_state_file
            .write_all(&vm_state)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Tell the memory manager to also send/write its own snapshot.
        let memory_manager_snapshot = snapshot
            .snapshots
            .get(MEMORY_MANAGER_SNAPSHOT_ID)
            .ok_or_else(|| {
                MigratableError::MigrateSend(anyhow!("Missing memory manager snapshot"))
            })?;
        self.memory_manager
            .lock()
            .unwrap()
            .send(memory_manager_snapshot, destination_url)?;

        Ok(())
    }
}

// Each migration step is forwarded to the memory manager first and then to the
// device manager.
impl Migratable for Vm {
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_dirty_log()?;
        self.device_manager.lock().unwrap().start_dirty_log()
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().stop_dirty_log()?;
        self.device_manager.lock().unwrap().stop_dirty_log()
    }

    /// Combines the dirty logs of the memory manager and the device manager
    /// into a single table.
    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        Ok(MemoryRangeTable::new_from_tables(vec![
            self.memory_manager.lock().unwrap().dirty_log()?,
            self.device_manager.lock().unwrap().dirty_log()?,
        ]))
    }

    fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_migration()?;
        self.device_manager.lock().unwrap().start_migration()
    }

    fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().complete_migration()?;
        self.device_manager.lock().unwrap().complete_migration()
    }
}

#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM if it is running, then records the `BreakPoint` state.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes the VM if it is currently in the `BreakPoint` state.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            // NOTE(review): a resume failure is reported through the `Pause`
            // variant; confirm whether `DebuggableError` has a dedicated
            // resume variant that should be used here instead.
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Returns the number of active vCPUs, or the number of boot vCPUs when
    /// none is active yet.
    fn active_vcpus(&self) -> usize {
        let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            self.cpu_manager.lock().unwrap().boot_vcpus() as usize
        }
    }
}

/// Largest value representable in 16 bits; used to bound the number of guest
/// RAM mappings a coredump can describe.
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = 65535;

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes a coredump of the guest to `destination_url`.
    ///
    /// A running VM is paused for the duration of the dump and resumed
    /// afterwards, even when the dump itself fails. A VM that is already
    /// paused is left paused.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` for a TDX VM, when the state
    /// cannot be read, or when the VM is neither running nor paused. Pause,
    /// dump and resume failures are propagated; a dump failure takes
    /// precedence over a resume failure.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        // Report an unreadable state to the caller rather than panicking.
        let current_state = self.get_state().map_err(|e| {
            GuestDebuggableError::Coredump(anyhow!("Could not get VM state: {:?}", e))
        })?;

        let resume = match current_state {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                true
            }
            VmState::Paused => false,
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        };

        let dump_result = self.coredump_paused_vm(destination_url);

        if resume {
            // Resume even if the dump failed, so that a failed coredump does
            // not leave a previously running VM paused.
            let resume_result = self.resume().map_err(GuestDebuggableError::Resume);
            return dump_result.and(resume_result);
        }

        dump_result
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::BreakPoint => {
                // Check the transitions from Breakpoint
                assert!(state.valid_transition(VmState::Created).is_ok());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    const LEN: u64 = 4096;

    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");
        assert!(create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .is_ok())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}