1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::{ 15 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 16 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 17 }; 18 use crate::config::{NumaConfig, PayloadConfig}; 19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 20 use crate::coredump::{ 21 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 22 }; 23 use crate::cpu; 24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair}; 25 use crate::device_tree::DeviceTree; 26 #[cfg(feature = "guest_debug")] 27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 28 #[cfg(feature = "igvm")] 29 use crate::igvm::igvm_loader; 30 use crate::memory_manager::{ 31 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 32 }; 33 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 34 use crate::migration::get_vm_snapshot; 35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 36 use crate::migration::url_to_file; 37 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 38 use crate::GuestMemoryMmap; 39 use crate::{ 40 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 41 }; 42 use anyhow::anyhow; 43 use arch::get_host_cpu_phys_bits; 44 #[cfg(target_arch = "x86_64")] 45 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 46 #[cfg(feature = "tdx")] 47 use arch::x86_64::tdx::TdvfSection; 48 use 
arch::EntryPoint; 49 #[cfg(target_arch = "aarch64")] 50 use arch::PciSpaceInfo; 51 use arch::{NumaNode, NumaNodes}; 52 #[cfg(target_arch = "aarch64")] 53 use devices::interrupt_controller; 54 use devices::AcpiNotificationFlags; 55 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 56 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 57 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 58 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 59 use hypervisor::{HypervisorVmError, VmOps}; 60 use libc::{termios, SIGWINCH}; 61 use linux_loader::cmdline::Cmdline; 62 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 63 use linux_loader::elf; 64 #[cfg(target_arch = "x86_64")] 65 use linux_loader::loader::bzimage::BzImage; 66 #[cfg(target_arch = "x86_64")] 67 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 68 #[cfg(target_arch = "aarch64")] 69 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 70 use linux_loader::loader::KernelLoader; 71 use seccompiler::SeccompAction; 72 use serde::{Deserialize, Serialize}; 73 use std::cmp; 74 use std::collections::BTreeMap; 75 use std::collections::HashMap; 76 use std::fs::{File, OpenOptions}; 77 use std::io::{self, Seek, SeekFrom, Write}; 78 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 79 use std::mem::size_of; 80 use std::num::Wrapping; 81 use std::ops::Deref; 82 use std::os::unix::net::UnixStream; 83 use std::sync::{Arc, Mutex, RwLock}; 84 use std::time::Instant; 85 use std::{result, str, thread}; 86 use thiserror::Error; 87 use tracer::trace_scoped; 88 use vm_device::Bus; 89 #[cfg(feature = "tdx")] 90 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile}; 91 use vm_memory::{ 92 Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile, 93 }; 94 use vm_migration::protocol::{Request, Response, Status}; 95 use vm_migration::{ 96 protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, 
Pausable, Snapshot, 97 SnapshotData, Snapshottable, Transportable, 98 }; 99 use vmm_sys_util::eventfd::EventFd; 100 use vmm_sys_util::sock_ctrl_msg::ScmSocket; 101 102 /// Errors associated with VM management 103 #[derive(Debug, Error)] 104 pub enum Error { 105 #[error("Cannot open kernel file: {0}")] 106 KernelFile(#[source] io::Error), 107 108 #[error("Cannot open initramfs file: {0}")] 109 InitramfsFile(#[source] io::Error), 110 111 #[error("Cannot load the kernel into memory: {0}")] 112 KernelLoad(#[source] linux_loader::loader::Error), 113 114 #[cfg(target_arch = "aarch64")] 115 #[error("Cannot load the UEFI binary in memory: {0:?}")] 116 UefiLoad(arch::aarch64::uefi::Error), 117 118 #[error("Cannot load the initramfs into memory")] 119 InitramfsLoad, 120 121 #[error("Cannot load the kernel command line in memory: {0}")] 122 LoadCmdLine(#[source] linux_loader::loader::Error), 123 124 #[error("Cannot modify the kernel command line: {0}")] 125 CmdLineInsertStr(#[source] linux_loader::cmdline::Error), 126 127 #[error("Cannot create the kernel command line: {0}")] 128 CmdLineCreate(#[source] linux_loader::cmdline::Error), 129 130 #[error("Cannot configure system: {0}")] 131 ConfigureSystem(#[source] arch::Error), 132 133 #[cfg(target_arch = "aarch64")] 134 #[error("Cannot enable interrupt controller: {0:?}")] 135 EnableInterruptController(interrupt_controller::Error), 136 137 #[error("VM state is poisoned")] 138 PoisonedState, 139 140 #[error("Error from device manager: {0:?}")] 141 DeviceManager(DeviceManagerError), 142 143 #[error("No device with id {0:?} to remove")] 144 NoDeviceToRemove(String), 145 146 #[error("Cannot spawn a signal handler thread: {0}")] 147 SignalHandlerSpawn(#[source] io::Error), 148 149 #[error("Failed to join on threads: {0:?}")] 150 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 151 152 #[error("VM config is missing")] 153 VmMissingConfig, 154 155 #[error("VM is not created")] 156 VmNotCreated, 157 158 
#[error("VM is already created")] 159 VmAlreadyCreated, 160 161 #[error("VM is not running")] 162 VmNotRunning, 163 164 #[error("Cannot clone EventFd: {0}")] 165 EventFdClone(#[source] io::Error), 166 167 #[error("invalid VM state transition: {0:?} to {1:?}")] 168 InvalidStateTransition(VmState, VmState), 169 170 #[error("Error from CPU manager: {0}")] 171 CpuManager(#[source] cpu::Error), 172 173 #[error("Cannot pause devices: {0}")] 174 PauseDevices(#[source] MigratableError), 175 176 #[error("Cannot resume devices: {0}")] 177 ResumeDevices(#[source] MigratableError), 178 179 #[error("Cannot pause CPUs: {0}")] 180 PauseCpus(#[source] MigratableError), 181 182 #[error("Cannot resume cpus: {0}")] 183 ResumeCpus(#[source] MigratableError), 184 185 #[error("Cannot pause VM: {0}")] 186 Pause(#[source] MigratableError), 187 188 #[error("Cannot resume VM: {0}")] 189 Resume(#[source] MigratableError), 190 191 #[error("Memory manager error: {0:?}")] 192 MemoryManager(MemoryManagerError), 193 194 #[error("Eventfd write error: {0}")] 195 EventfdError(#[source] std::io::Error), 196 197 #[error("Cannot snapshot VM: {0}")] 198 Snapshot(#[source] MigratableError), 199 200 #[error("Cannot restore VM: {0}")] 201 Restore(#[source] MigratableError), 202 203 #[error("Cannot send VM snapshot: {0}")] 204 SnapshotSend(#[source] MigratableError), 205 206 #[error("Invalid restore source URL")] 207 InvalidRestoreSourceUrl, 208 209 #[error("Failed to validate config: {0}")] 210 ConfigValidation(#[source] ValidationError), 211 212 #[error("Too many virtio-vsock devices")] 213 TooManyVsockDevices, 214 215 #[error("Failed serializing into JSON: {0}")] 216 SerializeJson(#[source] serde_json::Error), 217 218 #[error("Invalid NUMA configuration")] 219 InvalidNumaConfig, 220 221 #[error("Cannot create seccomp filter: {0}")] 222 CreateSeccompFilter(#[source] seccompiler::Error), 223 224 #[error("Cannot apply seccomp filter: {0}")] 225 ApplySeccompFilter(#[source] seccompiler::Error), 226 227 
#[error("Failed resizing a memory zone")] 228 ResizeZone, 229 230 #[error("Cannot activate virtio devices: {0:?}")] 231 ActivateVirtioDevices(DeviceManagerError), 232 233 #[error("Error triggering power button: {0:?}")] 234 PowerButton(DeviceManagerError), 235 236 #[error("Kernel lacks PVH header")] 237 KernelMissingPvhHeader, 238 239 #[error("Failed to allocate firmware RAM: {0:?}")] 240 AllocateFirmwareMemory(MemoryManagerError), 241 242 #[error("Error manipulating firmware file: {0}")] 243 FirmwareFile(#[source] std::io::Error), 244 245 #[error("Firmware too big")] 246 FirmwareTooLarge, 247 248 #[error("Failed to copy firmware to memory: {0}")] 249 FirmwareLoad(#[source] vm_memory::GuestMemoryError), 250 251 #[cfg(feature = "sev_snp")] 252 #[error("Error enabling SEV-SNP VM: {0}")] 253 InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError), 254 255 #[cfg(feature = "tdx")] 256 #[error("Error performing I/O on TDX firmware file: {0}")] 257 LoadTdvf(#[source] std::io::Error), 258 259 #[cfg(feature = "tdx")] 260 #[error("Error performing I/O on the TDX payload file: {0}")] 261 LoadPayload(#[source] std::io::Error), 262 263 #[cfg(feature = "tdx")] 264 #[error("Error parsing TDVF: {0}")] 265 ParseTdvf(#[source] arch::x86_64::tdx::TdvfError), 266 267 #[cfg(feature = "tdx")] 268 #[error("Error populating TDX HOB: {0}")] 269 PopulateHob(#[source] arch::x86_64::tdx::TdvfError), 270 271 #[cfg(feature = "tdx")] 272 #[error("Error allocating TDVF memory: {0:?}")] 273 AllocatingTdvfMemory(crate::memory_manager::Error), 274 275 #[cfg(feature = "tdx")] 276 #[error("Error enabling TDX VM: {0}")] 277 InitializeTdxVm(#[source] hypervisor::HypervisorVmError), 278 279 #[cfg(feature = "tdx")] 280 #[error("Error enabling TDX memory region: {0}")] 281 InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError), 282 283 #[cfg(feature = "tdx")] 284 #[error("Error finalizing TDX VM: {0}")] 285 FinalizeTdx(#[source] hypervisor::HypervisorVmError), 286 287 #[cfg(feature = 
"tdx")] 288 #[error("TDX firmware missing")] 289 TdxFirmwareMissing, 290 291 #[cfg(feature = "tdx")] 292 #[error("Invalid TDX payload type")] 293 InvalidPayloadType, 294 295 #[cfg(feature = "guest_debug")] 296 #[error("Error debugging VM: {0:?}")] 297 Debug(DebuggableError), 298 299 #[error("Error spawning kernel loading thread")] 300 KernelLoadThreadSpawn(std::io::Error), 301 302 #[error("Error joining kernel loading thread")] 303 KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 304 305 #[error("Payload configuration is not bootable")] 306 InvalidPayload, 307 308 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 309 #[error("Error coredumping VM: {0:?}")] 310 Coredump(GuestDebuggableError), 311 312 #[cfg(feature = "igvm")] 313 #[error("Cannot open igvm file: {0}")] 314 IgvmFile(#[source] io::Error), 315 316 #[cfg(feature = "igvm")] 317 #[error("Cannot load the igvm into memory: {0}")] 318 IgvmLoad(#[source] igvm_loader::Error), 319 } 320 pub type Result<T> = result::Result<T, Error>; 321 322 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] 323 pub enum VmState { 324 Created, 325 Running, 326 Shutdown, 327 Paused, 328 BreakPoint, 329 } 330 331 impl VmState { 332 fn valid_transition(self, new_state: VmState) -> Result<()> { 333 match self { 334 VmState::Created => match new_state { 335 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)), 336 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => { 337 Ok(()) 338 } 339 }, 340 341 VmState::Running => match new_state { 342 VmState::Created | VmState::Running => { 343 Err(Error::InvalidStateTransition(self, new_state)) 344 } 345 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()), 346 }, 347 348 VmState::Shutdown => match new_state { 349 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => { 350 Err(Error::InvalidStateTransition(self, new_state)) 351 } 352 VmState::Running 
=> Ok(()), 353 }, 354 355 VmState::Paused => match new_state { 356 VmState::Created | VmState::Paused | VmState::BreakPoint => { 357 Err(Error::InvalidStateTransition(self, new_state)) 358 } 359 VmState::Running | VmState::Shutdown => Ok(()), 360 }, 361 VmState::BreakPoint => match new_state { 362 VmState::Created | VmState::Running => Ok(()), 363 _ => Err(Error::InvalidStateTransition(self, new_state)), 364 }, 365 } 366 } 367 } 368 369 struct VmOpsHandler { 370 memory: GuestMemoryAtomic<GuestMemoryMmap>, 371 #[cfg(target_arch = "x86_64")] 372 io_bus: Arc<Bus>, 373 mmio_bus: Arc<Bus>, 374 } 375 376 impl VmOps for VmOpsHandler { 377 fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> { 378 self.memory 379 .memory() 380 .write(buf, GuestAddress(gpa)) 381 .map_err(|e| HypervisorVmError::GuestMemWrite(e.into())) 382 } 383 384 fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> { 385 self.memory 386 .memory() 387 .read(buf, GuestAddress(gpa)) 388 .map_err(|e| HypervisorVmError::GuestMemRead(e.into())) 389 } 390 391 fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 392 if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) { 393 info!("Guest MMIO read to unregistered address 0x{:x}", gpa); 394 } 395 Ok(()) 396 } 397 398 fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 399 match self.mmio_bus.write(gpa, data) { 400 Err(vm_device::BusError::MissingAddressRange) => { 401 info!("Guest MMIO write to unregistered address 0x{:x}", gpa); 402 } 403 Ok(Some(barrier)) => { 404 info!("Waiting for barrier"); 405 barrier.wait(); 406 info!("Barrier released"); 407 } 408 _ => {} 409 }; 410 Ok(()) 411 } 412 413 #[cfg(target_arch = "x86_64")] 414 fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 415 if let Err(vm_device::BusError::MissingAddressRange) = 
self.io_bus.read(port, data) { 416 info!("Guest PIO read to unregistered address 0x{:x}", port); 417 } 418 Ok(()) 419 } 420 421 #[cfg(target_arch = "x86_64")] 422 fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 423 match self.io_bus.write(port, data) { 424 Err(vm_device::BusError::MissingAddressRange) => { 425 info!("Guest PIO write to unregistered address 0x{:x}", port); 426 } 427 Ok(Some(barrier)) => { 428 info!("Waiting for barrier"); 429 barrier.wait(); 430 info!("Barrier released"); 431 } 432 _ => {} 433 }; 434 Ok(()) 435 } 436 } 437 438 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 { 439 let host_phys_bits = get_host_cpu_phys_bits(hypervisor); 440 441 cmp::min(host_phys_bits, max_phys_bits) 442 } 443 444 pub struct Vm { 445 #[cfg(feature = "tdx")] 446 kernel: Option<File>, 447 initramfs: Option<File>, 448 threads: Vec<thread::JoinHandle<()>>, 449 device_manager: Arc<Mutex<DeviceManager>>, 450 config: Arc<Mutex<VmConfig>>, 451 state: RwLock<VmState>, 452 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 453 memory_manager: Arc<Mutex<MemoryManager>>, 454 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 455 // The hypervisor abstracted virtual machine. 
456 vm: Arc<dyn hypervisor::Vm>, 457 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 458 saved_clock: Option<hypervisor::ClockData>, 459 numa_nodes: NumaNodes, 460 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 461 hypervisor: Arc<dyn hypervisor::Hypervisor>, 462 stop_on_boot: bool, 463 load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>, 464 } 465 466 impl Vm { 467 pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH]; 468 469 #[allow(clippy::too_many_arguments)] 470 pub fn new_from_memory_manager( 471 config: Arc<Mutex<VmConfig>>, 472 memory_manager: Arc<Mutex<MemoryManager>>, 473 vm: Arc<dyn hypervisor::Vm>, 474 exit_evt: EventFd, 475 reset_evt: EventFd, 476 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 477 seccomp_action: &SeccompAction, 478 hypervisor: Arc<dyn hypervisor::Hypervisor>, 479 activate_evt: EventFd, 480 timestamp: Instant, 481 serial_pty: Option<PtyPair>, 482 console_pty: Option<PtyPair>, 483 debug_console_pty: Option<PtyPair>, 484 console_resize_pipe: Option<File>, 485 original_termios: Arc<Mutex<Option<termios>>>, 486 snapshot: Option<Snapshot>, 487 ) -> Result<Self> { 488 trace_scoped!("Vm::new_from_memory_manager"); 489 490 let boot_id_list = config 491 .lock() 492 .unwrap() 493 .validate() 494 .map_err(Error::ConfigValidation)?; 495 496 #[cfg(not(feature = "igvm"))] 497 let load_payload_handle = if snapshot.is_none() { 498 Self::load_payload_async(&memory_manager, &config)? 499 } else { 500 None 501 }; 502 503 info!("Booting VM from config: {:?}", &config); 504 505 // Create NUMA nodes based on NumaConfig. 
506 let numa_nodes = 507 Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?; 508 509 #[cfg(feature = "tdx")] 510 let tdx_enabled = config.lock().unwrap().is_tdx_enabled(); 511 #[cfg(feature = "sev_snp")] 512 let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled(); 513 #[cfg(feature = "tdx")] 514 let force_iommu = tdx_enabled; 515 #[cfg(feature = "sev_snp")] 516 let force_iommu = sev_snp_enabled; 517 #[cfg(not(any(feature = "tdx", feature = "sev_snp")))] 518 let force_iommu = false; 519 520 #[cfg(feature = "guest_debug")] 521 let stop_on_boot = config.lock().unwrap().gdb; 522 #[cfg(not(feature = "guest_debug"))] 523 let stop_on_boot = false; 524 525 let memory = memory_manager.lock().unwrap().guest_memory(); 526 #[cfg(target_arch = "x86_64")] 527 let io_bus = Arc::new(Bus::new()); 528 let mmio_bus = Arc::new(Bus::new()); 529 530 let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler { 531 memory, 532 #[cfg(target_arch = "x86_64")] 533 io_bus: io_bus.clone(), 534 mmio_bus: mmio_bus.clone(), 535 }); 536 537 let cpus_config = { &config.lock().unwrap().cpus.clone() }; 538 let cpu_manager = cpu::CpuManager::new( 539 cpus_config, 540 vm.clone(), 541 exit_evt.try_clone().map_err(Error::EventFdClone)?, 542 reset_evt.try_clone().map_err(Error::EventFdClone)?, 543 #[cfg(feature = "guest_debug")] 544 vm_debug_evt, 545 &hypervisor, 546 seccomp_action.clone(), 547 vm_ops, 548 #[cfg(feature = "tdx")] 549 tdx_enabled, 550 &numa_nodes, 551 #[cfg(feature = "sev_snp")] 552 sev_snp_enabled, 553 ) 554 .map_err(Error::CpuManager)?; 555 556 #[cfg(target_arch = "x86_64")] 557 cpu_manager 558 .lock() 559 .unwrap() 560 .populate_cpuid( 561 &memory_manager, 562 &hypervisor, 563 #[cfg(feature = "tdx")] 564 tdx_enabled, 565 ) 566 .map_err(Error::CpuManager)?; 567 568 // Loading the igvm file is pushed down here because 569 // igvm parser needs cpu_manager to retrieve cpuid leaf. 
570 // For the regular case, we can start loading early, but for 571 // igvm case we have to wait until cpu_manager is created. 572 // Currently, Microsoft Hypervisor does not provide any 573 // Hypervisor specific common cpuid, we need to call get_cpuid_values 574 // per cpuid through cpu_manager. 575 #[cfg(feature = "igvm")] 576 let load_payload_handle = if snapshot.is_none() { 577 Self::load_payload_async(&memory_manager, &config, &cpu_manager)? 578 } else { 579 None 580 }; 581 // The initial TDX configuration must be done before the vCPUs are 582 // created 583 #[cfg(feature = "tdx")] 584 if tdx_enabled { 585 let cpuid = cpu_manager.lock().unwrap().common_cpuid(); 586 let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32; 587 vm.tdx_init(&cpuid, max_vcpus) 588 .map_err(Error::InitializeTdxVm)?; 589 } 590 591 cpu_manager 592 .lock() 593 .unwrap() 594 .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID)) 595 .map_err(Error::CpuManager)?; 596 597 // This initial SEV-SNP configuration must be done immediately after 598 // vCPUs are created. As part of this initialization we are 599 // transitioning the guest into secure state. 
600 #[cfg(feature = "sev_snp")] 601 if sev_snp_enabled { 602 vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?; 603 } 604 605 #[cfg(feature = "tdx")] 606 let dynamic = !tdx_enabled; 607 #[cfg(not(feature = "tdx"))] 608 let dynamic = true; 609 610 let device_manager = DeviceManager::new( 611 #[cfg(target_arch = "x86_64")] 612 io_bus, 613 mmio_bus, 614 hypervisor.hypervisor_type(), 615 vm.clone(), 616 config.clone(), 617 memory_manager.clone(), 618 cpu_manager.clone(), 619 exit_evt.try_clone().map_err(Error::EventFdClone)?, 620 reset_evt, 621 seccomp_action.clone(), 622 numa_nodes.clone(), 623 &activate_evt, 624 force_iommu, 625 boot_id_list, 626 timestamp, 627 snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID), 628 dynamic, 629 ) 630 .map_err(Error::DeviceManager)?; 631 632 device_manager 633 .lock() 634 .unwrap() 635 .create_devices( 636 serial_pty, 637 console_pty, 638 debug_console_pty, 639 console_resize_pipe, 640 original_termios, 641 ) 642 .map_err(Error::DeviceManager)?; 643 644 #[cfg(feature = "tdx")] 645 let kernel = config 646 .lock() 647 .unwrap() 648 .payload 649 .as_ref() 650 .map(|p| p.kernel.as_ref().map(File::open)) 651 .unwrap_or_default() 652 .transpose() 653 .map_err(Error::KernelFile)?; 654 655 let initramfs = config 656 .lock() 657 .unwrap() 658 .payload 659 .as_ref() 660 .map(|p| p.initramfs.as_ref().map(File::open)) 661 .unwrap_or_default() 662 .transpose() 663 .map_err(Error::InitramfsFile)?; 664 665 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 666 let saved_clock = if let Some(snapshot) = snapshot.as_ref() { 667 let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?; 668 vm_snapshot.clock 669 } else { 670 None 671 }; 672 673 let vm_state = if snapshot.is_some() { 674 VmState::Paused 675 } else { 676 VmState::Created 677 }; 678 679 Ok(Vm { 680 #[cfg(feature = "tdx")] 681 kernel, 682 initramfs, 683 device_manager, 684 config, 685 threads: Vec::with_capacity(1), 686 state: RwLock::new(vm_state), 687 
cpu_manager, 688 memory_manager, 689 vm, 690 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 691 saved_clock, 692 numa_nodes, 693 hypervisor, 694 stop_on_boot, 695 load_payload_handle, 696 }) 697 } 698 699 fn create_numa_nodes( 700 configs: Option<Vec<NumaConfig>>, 701 memory_manager: &Arc<Mutex<MemoryManager>>, 702 ) -> Result<NumaNodes> { 703 let mm = memory_manager.lock().unwrap(); 704 let mm_zones = mm.memory_zones(); 705 let mut numa_nodes = BTreeMap::new(); 706 707 if let Some(configs) = &configs { 708 for config in configs.iter() { 709 if numa_nodes.contains_key(&config.guest_numa_id) { 710 error!("Can't define twice the same NUMA node"); 711 return Err(Error::InvalidNumaConfig); 712 } 713 714 let mut node = NumaNode::default(); 715 716 if let Some(memory_zones) = &config.memory_zones { 717 for memory_zone in memory_zones.iter() { 718 if let Some(mm_zone) = mm_zones.get(memory_zone) { 719 node.memory_regions.extend(mm_zone.regions().clone()); 720 if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() { 721 node.hotplug_regions.push(virtiomem_zone.region().clone()); 722 } 723 node.memory_zones.push(memory_zone.clone()); 724 } else { 725 error!("Unknown memory zone '{}'", memory_zone); 726 return Err(Error::InvalidNumaConfig); 727 } 728 } 729 } 730 731 if let Some(cpus) = &config.cpus { 732 node.cpus.extend(cpus); 733 } 734 735 if let Some(pci_segments) = &config.pci_segments { 736 node.pci_segments.extend(pci_segments); 737 } 738 739 if let Some(distances) = &config.distances { 740 for distance in distances.iter() { 741 let dest = distance.destination; 742 let dist = distance.distance; 743 744 if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) { 745 error!("Unknown destination NUMA node {}", dest); 746 return Err(Error::InvalidNumaConfig); 747 } 748 749 if node.distances.contains_key(&dest) { 750 error!("Destination NUMA node {} has been already set", dest); 751 return Err(Error::InvalidNumaConfig); 752 } 753 754 node.distances.insert(dest, dist); 
755 } 756 } 757 758 #[cfg(target_arch = "x86_64")] 759 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 760 if let Some(sgx_epc_region) = mm.sgx_epc_region() { 761 let mm_sections = sgx_epc_region.epc_sections(); 762 for sgx_epc_section in sgx_epc_sections.iter() { 763 if let Some(mm_section) = mm_sections.get(sgx_epc_section) { 764 node.sgx_epc_sections.push(mm_section.clone()); 765 } else { 766 error!("Unknown SGX EPC section '{}'", sgx_epc_section); 767 return Err(Error::InvalidNumaConfig); 768 } 769 } 770 } else { 771 error!("Missing SGX EPC region"); 772 return Err(Error::InvalidNumaConfig); 773 } 774 } 775 776 numa_nodes.insert(config.guest_numa_id, node); 777 } 778 } 779 780 Ok(numa_nodes) 781 } 782 783 #[allow(clippy::too_many_arguments)] 784 pub fn new( 785 vm_config: Arc<Mutex<VmConfig>>, 786 exit_evt: EventFd, 787 reset_evt: EventFd, 788 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 789 seccomp_action: &SeccompAction, 790 hypervisor: Arc<dyn hypervisor::Hypervisor>, 791 activate_evt: EventFd, 792 serial_pty: Option<PtyPair>, 793 console_pty: Option<PtyPair>, 794 debug_console_pty: Option<PtyPair>, 795 console_resize_pipe: Option<File>, 796 original_termios: Arc<Mutex<Option<termios>>>, 797 snapshot: Option<Snapshot>, 798 source_url: Option<&str>, 799 prefault: Option<bool>, 800 ) -> Result<Self> { 801 trace_scoped!("Vm::new"); 802 803 let timestamp = Instant::now(); 804 805 #[cfg(feature = "tdx")] 806 let tdx_enabled = if snapshot.is_some() { 807 false 808 } else { 809 vm_config.lock().unwrap().is_tdx_enabled() 810 }; 811 812 #[cfg(feature = "sev_snp")] 813 let sev_snp_enabled = if snapshot.is_some() { 814 false 815 } else { 816 vm_config.lock().unwrap().is_sev_snp_enabled() 817 }; 818 819 let vm = Self::create_hypervisor_vm( 820 &hypervisor, 821 #[cfg(feature = "tdx")] 822 tdx_enabled, 823 #[cfg(feature = "sev_snp")] 824 sev_snp_enabled, 825 )?; 826 827 let phys_bits = physical_bits(&hypervisor, 
vm_config.lock().unwrap().cpus.max_phys_bits); 828 829 let memory_manager = if let Some(snapshot) = 830 snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID) 831 { 832 MemoryManager::new_from_snapshot( 833 &snapshot, 834 vm.clone(), 835 &vm_config.lock().unwrap().memory.clone(), 836 source_url, 837 prefault.unwrap(), 838 phys_bits, 839 ) 840 .map_err(Error::MemoryManager)? 841 } else { 842 #[cfg(target_arch = "x86_64")] 843 let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone(); 844 845 MemoryManager::new( 846 vm.clone(), 847 &vm_config.lock().unwrap().memory.clone(), 848 None, 849 phys_bits, 850 #[cfg(feature = "tdx")] 851 tdx_enabled, 852 None, 853 None, 854 #[cfg(target_arch = "x86_64")] 855 sgx_epc_config, 856 ) 857 .map_err(Error::MemoryManager)? 858 }; 859 860 Vm::new_from_memory_manager( 861 vm_config, 862 memory_manager, 863 vm, 864 exit_evt, 865 reset_evt, 866 #[cfg(feature = "guest_debug")] 867 vm_debug_evt, 868 seccomp_action, 869 hypervisor, 870 activate_evt, 871 timestamp, 872 serial_pty, 873 console_pty, 874 debug_console_pty, 875 console_resize_pipe, 876 original_termios, 877 snapshot, 878 ) 879 } 880 881 pub fn create_hypervisor_vm( 882 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 883 #[cfg(feature = "tdx")] tdx_enabled: bool, 884 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 885 ) -> Result<Arc<dyn hypervisor::Vm>> { 886 hypervisor.check_required_extensions().unwrap(); 887 888 cfg_if::cfg_if! 
{ 889 if #[cfg(feature = "tdx")] { 890 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true 891 // Otherwise KVM_X86_LEGACY_VM: 0 892 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM 893 let vm = hypervisor 894 .create_vm_with_type(u64::from(tdx_enabled)) 895 .unwrap(); 896 } else if #[cfg(feature = "sev_snp")] { 897 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true 898 // Otherwise SEV_SNP_DISABLED: 0 899 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false 900 let vm = hypervisor 901 .create_vm_with_type(u64::from(sev_snp_enabled)) 902 .unwrap(); 903 } else { 904 let vm = hypervisor.create_vm().unwrap(); 905 } 906 } 907 908 #[cfg(target_arch = "x86_64")] 909 { 910 vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0) 911 .unwrap(); 912 vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap(); 913 vm.enable_split_irq().unwrap(); 914 } 915 916 Ok(vm) 917 } 918 919 fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> { 920 let initramfs = self.initramfs.as_mut().unwrap(); 921 let size: usize = initramfs 922 .seek(SeekFrom::End(0)) 923 .map_err(|_| Error::InitramfsLoad)? 
.try_into()
        .unwrap();
    initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;

    let address =
        arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
    let address = GuestAddress(address);

    guest_mem
        .read_volatile_from(address, initramfs, size)
        .map_err(|_| Error::InitramfsLoad)?;

    info!("Initramfs loaded: address = 0x{:x}", address.0);
    Ok(arch::InitramfsConfig { address, size })
}

/// Builds the kernel command line for `payload`.
///
/// The command line is capped at `arch::CMDLINE_MAX_SIZE`. The user supplied
/// `payload.cmdline` (if any) is inserted first; on aarch64 every entry
/// returned by the device manager's `cmdline_additions()` is appended.
///
/// # Errors
///
/// Returns `Error::CmdLineCreate` if the command line cannot be created and
/// `Error::CmdLineInsertStr` if any insertion fails.
pub fn generate_cmdline(
    payload: &PayloadConfig,
    #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
) -> Result<Cmdline> {
    let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
    if let Some(s) = payload.cmdline.as_ref() {
        cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
    }

    #[cfg(target_arch = "aarch64")]
    for entry in device_manager.lock().unwrap().cmdline_additions() {
        cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
    }
    Ok(cmdline)
}

/// Loads `firmware` into the UEFI flash memory at `arch::layout::UEFI_START`.
///
/// # Errors
///
/// Returns `Error::UefiLoad` if the loader fails.
#[cfg(target_arch = "aarch64")]
fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
    let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
    let mem = uefi_flash.memory();
    arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
        .map_err(Error::UefiLoad)?;
    Ok(())
}

/// Loads either a kernel or a firmware image (exactly one must be given) and
/// returns the resulting entry point.
///
/// A `kernel` is first tried as a PE image; if the PE magic number is invalid
/// the same file is loaded as a UEFI binary instead.
///
/// # Errors
///
/// Returns `Error::InvalidPayload` when both or neither of `firmware` and
/// `kernel` are provided, `Error::KernelLoad` for any other PE loading
/// failure, and propagates errors from `load_firmware`.
#[cfg(target_arch = "aarch64")]
fn load_kernel(
    firmware: Option<File>,
    kernel: Option<File>,
    memory_manager: Arc<Mutex<MemoryManager>>,
) -> Result<EntryPoint> {
    let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
    let mem = guest_memory.memory();
    let entry_addr = match (firmware, kernel) {
        (None, Some(mut kernel)) => {
            match linux_loader::loader::pe::PE::load(
                mem.deref(),
                Some(arch::layout::KERNEL_START),
                &mut kernel,
                None,
            ) {
                Ok(entry_addr) => entry_addr.kernel_load,
                // Try to load the binary as kernel PE file at first.
                // If failed, retry to load it as UEFI binary.
                // As the UEFI binary is formatless, it must be the last option to try.
                Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                    Self::load_firmware(&kernel, memory_manager)?;
                    arch::layout::UEFI_START
                }
                Err(e) => {
                    return Err(Error::KernelLoad(e));
                }
            }
        }
        (Some(firmware), None) => {
            Self::load_firmware(&firmware, memory_manager)?;
            arch::layout::UEFI_START
        }
        _ => return Err(Error::InvalidPayload),
    };

    Ok(EntryPoint { entry_addr })
}

/// Loads an IGVM file and derives the entry point from the loader result.
///
/// With the `sev_snp` feature and SEV-SNP enabled on the CPU manager, the
/// entry address is `res.vmsa_gpa`; otherwise it is `res.vmsa.rip`.
///
/// # Errors
///
/// Returns `Error::IgvmLoad` if the IGVM loader fails.
#[cfg(feature = "igvm")]
fn load_igvm(
    igvm: File,
    memory_manager: Arc<Mutex<MemoryManager>>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
) -> Result<EntryPoint> {
    let res = igvm_loader::load_igvm(&igvm, memory_manager, cpu_manager.clone(), "")
        .map_err(Error::IgvmLoad)?;

    cfg_if::cfg_if! {
        if #[cfg(feature = "sev_snp")] {
            let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
                EntryPoint {
                    entry_addr: vm_memory::GuestAddress(res.vmsa_gpa),
                    setup_header: None,
                }
            } else {
                EntryPoint {
                    entry_addr: vm_memory::GuestAddress(res.vmsa.rip),
                    setup_header: None,
                }
            };
        } else {
            let entry_point = EntryPoint {
                entry_addr: vm_memory::GuestAddress(res.vmsa.rip),
                setup_header: None,
            };
        }
    };
    Ok(entry_point)
}

/// Loads `kernel` into guest memory, trying the ELF loader first and the
/// bzImage loader second, and optionally writes `cmdline` at
/// `arch::layout::CMDLINE_START`.
///
/// # Errors
///
/// Returns `Error::KernelLoad` if both loaders fail, `Error::LoadCmdLine` if
/// the command line cannot be written, and `Error::KernelMissingPvhHeader`
/// if the loaded image exposes neither a PVH entry point nor a setup header.
#[cfg(target_arch = "x86_64")]
fn load_kernel(
    mut kernel: File,
    cmdline: Option<Cmdline>,
    memory_manager: Arc<Mutex<MemoryManager>>,
) -> Result<EntryPoint> {
    info!("Loading kernel");

    let mem = {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        guest_memory.memory()
    };

    // Try ELF binary with PVH boot.
    let entry_addr = linux_loader::loader::elf::Elf::load(
        mem.deref(),
        None,
        &mut kernel,
        Some(arch::layout::HIGH_RAM_START),
    )
    // Try loading kernel as bzImage.
    .or_else(|_| {
        BzImage::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        )
    })
    .map_err(Error::KernelLoad)?;

    if let Some(cmdline) = cmdline {
        linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
            .map_err(Error::LoadCmdLine)?;
    }

    if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
        // Use the PVH kernel entry point to boot the guest
        info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
        Ok(EntryPoint {
            entry_addr,
            setup_header: None,
        })
    } else if entry_addr.setup_header.is_some() {
        // Use the bzImage 32bit entry point to boot the guest
        info!(
            "bzImage kernel loaded: entry_addr = 0x{:x}",
            entry_addr.kernel_load.0
        );
        Ok(EntryPoint {
            entry_addr: entry_addr.kernel_load,
            setup_header: entry_addr.setup_header,
        })
    } else {
        Err(Error::KernelMissingPvhHeader)
    }
}

/// Opens and loads the payload described by `payload`.
///
/// With the `igvm` feature, an IGVM file takes precedence. Otherwise either a
/// firmware alone (no kernel, initramfs or cmdline) or a kernel (with a
/// generated command line) is accepted.
///
/// # Errors
///
/// Returns `Error::InvalidPayload` for any other combination, a file error
/// (`Error::IgvmFile`, `Error::FirmwareFile`, `Error::KernelFile`) if the
/// file cannot be opened, and propagates loader errors.
#[cfg(target_arch = "x86_64")]
fn load_payload(
    payload: &PayloadConfig,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
) -> Result<EntryPoint> {
    trace_scoped!("load_payload");
    #[cfg(feature = "igvm")]
    if let Some(_igvm_file) = &payload.igvm {
        let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
        return Self::load_igvm(igvm, memory_manager, cpu_manager);
    }
    match (
        &payload.firmware,
        &payload.kernel,
        &payload.initramfs,
        &payload.cmdline,
    ) {
        (Some(firmware), None, None, None) => {
            let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
            Self::load_kernel(firmware, None, memory_manager)
        }
        (None, Some(kernel), _, _) => {
            let kernel = File::open(kernel).map_err(Error::KernelFile)?;
            let cmdline = Self::generate_cmdline(payload)?;
            Self::load_kernel(kernel, Some(cmdline), memory_manager)
        }
        _ => Err(Error::InvalidPayload),
    }
}

/// Opens and loads the payload described by `payload`: exactly one of
/// firmware or kernel must be set.
///
/// # Errors
///
/// Returns `Error::InvalidPayload` when both or neither are set,
/// `Error::FirmwareFile` / `Error::KernelFile` if the file cannot be opened,
/// and propagates loader errors.
#[cfg(target_arch = "aarch64")]
fn load_payload(
    payload: &PayloadConfig,
    memory_manager: Arc<Mutex<MemoryManager>>,
) -> Result<EntryPoint> {
    match (&payload.firmware, &payload.kernel) {
        (Some(firmware), None) => {
            let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
            Self::load_kernel(Some(firmware), None, memory_manager)
        }
        (None, Some(kernel)) => {
            let kernel = File::open(kernel).map_err(Error::KernelFile)?;
            Self::load_kernel(None, Some(kernel), memory_manager)
        }
        _ => Err(Error::InvalidPayload),
    }
}

/// Spawns a `payload_loader` thread running `load_payload` on a clone of the
/// configured payload.
///
/// Returns `Ok(None)` when no payload is configured or, with the `tdx`
/// feature, when TDX is enabled.
///
/// # Errors
///
/// Returns `Error::KernelLoadThreadSpawn` if the thread cannot be spawned.
fn load_payload_async(
    memory_manager: &Arc<Mutex<MemoryManager>>,
    config: &Arc<Mutex<VmConfig>>,
    #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
    // Kernel with TDX is loaded in a different manner
    #[cfg(feature = "tdx")]
    if config.lock().unwrap().is_tdx_enabled() {
        return Ok(None);
    }

    config
        .lock()
        .unwrap()
        .payload
        .as_ref()
        .map(|payload| {
            let memory_manager = memory_manager.clone();
            let payload = payload.clone();
            #[cfg(feature = "igvm")]
            let cpu_manager = cpu_manager.clone();

            std::thread::Builder::new()
                .name("payload_loader".into())
                .spawn(move || {
                    Self::load_payload(
                        &payload,
                        memory_manager,
                        #[cfg(feature = "igvm")]
                        cpu_manager,
                    )
                })
                .map_err(Error::KernelLoadThreadSpawn)
        })
        .transpose()
}

/// Gathers the boot parameters (initramfs, vCPU count and topology, RSDP
/// address, SGX EPC region and the platform serial number / UUID / OEM
/// strings) and hands them to `arch::configure_system`.
///
/// # Errors
///
/// Propagates initramfs loading errors and returns `Error::ConfigureSystem`
/// if the architecture specific configuration fails.
#[cfg(target_arch = "x86_64")]
fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
    trace_scoped!("configure_system");
    info!("Configuring system");
    let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

    let initramfs_config = match self.initramfs {
        Some(_) => Some(self.load_initramfs(&mem)?),
        None => None,
    };

    let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
    let rsdp_addr = Some(rsdp_addr);
    let sgx_epc_region = self
        .memory_manager
        .lock()
        .unwrap()
        .sgx_epc_region()
        .as_ref()
        .cloned();

    let serial_number = self
        .config
        .lock()
        .unwrap()
        .platform
        .as_ref()
        .and_then(|p| p.serial_number.clone());

    let uuid = self
        .config
        .lock()
        .unwrap()
        .platform
        .as_ref()
        .and_then(|p| p.uuid.clone());

    let oem_strings = self
        .config
        .lock()
        .unwrap()
        .platform
        .as_ref()
        .and_then(|p| p.oem_strings.clone());

    let oem_strings = oem_strings
        .as_deref()
        .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());

    let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();

    arch::configure_system(
        &mem,
        arch::layout::CMDLINE_START,
        arch::layout::CMDLINE_MAX_SIZE,
        &initramfs_config,
        boot_vcpus,
        entry_addr.setup_header,
        rsdp_addr,
        sgx_epc_region,
        serial_number.as_deref(),
        uuid.as_deref(),
        oem_strings.as_deref(),
        topology,
    )
    .map_err(Error::ConfigureSystem)?;
    Ok(())
}

/// Gathers the boot parameters (command line, vCPU MPIDRs and topology,
/// device info, initramfs, PCI segment layout, virtio-iommu BDF, vGIC and PMU
/// support) and hands them to `arch::configure_system`.
///
/// Both arguments are unused on aarch64.
///
/// # Errors
///
/// Propagates command line and initramfs errors and returns
/// `Error::ConfigureSystem` if retrieving the vGIC, initialising the PMU or
/// the architecture specific configuration fails.
#[cfg(target_arch = "aarch64")]
fn configure_system(
    &mut self,
    _rsdp_addr: GuestAddress,
    _entry_addr: EntryPoint,
) -> Result<()> {
    let cmdline = Self::generate_cmdline(
        self.config.lock().unwrap().payload.as_ref().unwrap(),
        &self.device_manager,
    )?;
    let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
    let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
    let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
    let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
    let initramfs_config = match self.initramfs {
        Some(_) => Some(self.load_initramfs(&mem)?),
        None => None,
    };

    let device_info = &self
        .device_manager
        .lock()
        .unwrap()
        .get_device_info()
        .clone();

    for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
        let pci_space = PciSpaceInfo {
            pci_segment_id: pci_segment.id,
            mmio_config_address: pci_segment.mmio_config_address,
            pci_device_space_start: pci_segment.start_of_mem64_area,
            pci_device_space_size: pci_segment.end_of_mem64_area
                - pci_segment.start_of_mem64_area
                + 1,
        };
        pci_space_info.push(pci_space);
    }

    let virtio_iommu_bdf = self
        .device_manager
        .lock()
        .unwrap()
        .iommu_attached_devices()
        .as_ref()
        .map(|(v, _)| *v);

    let vgic = self
        .device_manager
        .lock()
        .unwrap()
        .get_interrupt_controller()
        .unwrap()
        .lock()
        .unwrap()
        .get_vgic()
        .map_err(|_| {
            Error::ConfigureSystem(arch::Error::PlatformSpecific(
                arch::aarch64::Error::SetupGic,
            ))
        })?;

    // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
    let pmu_supported = self
        .cpu_manager
        .lock()
        .unwrap()
        .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
        .map_err(|_| {
            Error::ConfigureSystem(arch::Error::PlatformSpecific(
                arch::aarch64::Error::VcpuInitPmu,
            ))
        })?;

    arch::configure_system(
        &mem,
        cmdline.as_cstring().unwrap().to_str().unwrap(),
        vcpu_mpidrs,
        vcpu_topology,
        device_info,
        &initramfs_config,
        &pci_space_info,
        virtio_iommu_bdf.map(|bdf| bdf.into()),
        &vgic,
        &self.numa_nodes,
        pmu_supported,
    )
    .map_err(Error::ConfigureSystem)?;

    Ok(())
}

/// Returns the device manager's serial PTY pair, if any.
pub fn serial_pty(&self) -> Option<PtyPair> {
    self.device_manager.lock().unwrap().serial_pty()
}

/// Returns the device manager's console PTY pair, if any.
pub fn console_pty(&self) -> Option<PtyPair> {
    self.device_manager.lock().unwrap().console_pty()
}

/// Returns the device manager's debug console PTY pair, if any.
pub fn debug_console_pty(&self) -> Option<PtyPair> {
    self.device_manager.lock().unwrap().debug_console_pty()
}

/// Returns the device manager's console resize pipe, if any.
pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
    self.device_manager.lock().unwrap().console_resize_pipe()
}

/// Transitions the VM to `VmState::Shutdown`: resumes the device manager,
/// shuts down the CPU manager and joins every thread in `self.threads`.
///
/// # Errors
///
/// Returns `Error::PoisonedState` if the state lock cannot be acquired, an
/// error if the transition is not valid, and `Error::Resume`,
/// `Error::CpuManager` or `Error::ThreadCleanup` if the respective step
/// fails. The state is only updated once every step succeeded.
pub fn shutdown(&mut self) -> Result<()> {
    let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
    let new_state = VmState::Shutdown;

    state.valid_transition(new_state)?;

    // Wake up the DeviceManager threads so they will get terminated cleanly
    self.device_manager
        .lock()
        .unwrap()
        .resume()
        .map_err(Error::Resume)?;

    self.cpu_manager
        .lock()
        .unwrap()
        .shutdown()
        .map_err(Error::CpuManager)?;

    // Wait for all the threads to finish
    for thread in self.threads.drain(..) {
        thread.join().map_err(Error::ThreadCleanup)?
    }
    *state = new_state;

    event!("vm", "shutdown");

    Ok(())
}

/// Resizes the vCPU count, the memory size and/or the balloon size, and
/// mirrors each applied change into the VM configuration.
///
/// Each `None` argument leaves the corresponding resource untouched.
///
/// # Errors
///
/// Returns `Error::CpuManager`, `Error::MemoryManager` or
/// `Error::DeviceManager` when the corresponding component fails.
pub fn resize(
    &mut self,
    desired_vcpus: Option<u8>,
    desired_memory: Option<u64>,
    desired_balloon: Option<u64>,
) -> Result<()> {
    event!("vm", "resizing");

    if let Some(desired_vcpus) = desired_vcpus {
        // Only notify the guest when the CPU manager reports a change.
        if self
            .cpu_manager
            .lock()
            .unwrap()
            .resize(desired_vcpus)
            .map_err(Error::CpuManager)?
        {
            self.device_manager
                .lock()
                .unwrap()
                .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                .map_err(Error::DeviceManager)?;
        }
        self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
    }

    if let Some(desired_memory) = desired_memory {
        let new_region = self
            .memory_manager
            .lock()
            .unwrap()
            .resize(desired_memory)
            .map_err(Error::MemoryManager)?;

        let memory_config = &mut self.config.lock().unwrap().memory;

        if let Some(new_region) = &new_region {
            self.device_manager
                .lock()
                .unwrap()
                .update_memory(new_region)
                .map_err(Error::DeviceManager)?;

            match memory_config.hotplug_method {
                HotplugMethod::Acpi => {
                    self.device_manager
                        .lock()
                        .unwrap()
                        .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                        .map_err(Error::DeviceManager)?;
                }
                HotplugMethod::VirtioMem => {}
            }
        }

        // We update the VM config regardless of the actual guest resize
        // operation result (happened or not), so that if the VM reboots
        // it will be running with the last configure memory size.
        match memory_config.hotplug_method {
            HotplugMethod::Acpi => memory_config.size = desired_memory,
            HotplugMethod::VirtioMem => {
                if desired_memory > memory_config.size {
                    memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                } else {
                    memory_config.hotplugged_size = None;
                }
            }
        }
    }

    if let Some(desired_balloon) = desired_balloon {
        self.device_manager
            .lock()
            .unwrap()
            .resize_balloon(desired_balloon)
            .map_err(Error::DeviceManager)?;

        // Update the configuration value for the balloon size to ensure
        // a reboot would use the right value.
        if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
            balloon_config.size = desired_balloon;
        }
    }

    event!("vm", "resized");

    Ok(())
}

/// Resizes the memory zone `id` so that it totals `desired_memory` bytes and
/// records the hotplugged amount in the zone configuration.
///
/// # Errors
///
/// Returns `Error::ResizeZone` if the zone does not exist or if
/// `desired_memory` is smaller than the zone's boot size, and
/// `Error::MemoryManager` if the memory manager rejects the resize.
pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
    let memory_config = &mut self.config.lock().unwrap().memory;

    if let Some(zones) = &mut memory_config.zones {
        for zone in zones.iter_mut() {
            if zone.id == id {
                if desired_memory >= zone.size {
                    // The memory manager is given the amount to hotplug on
                    // top of the boot size, not the total.
                    let hotplugged_size = desired_memory - zone.size;
                    self.memory_manager
                        .lock()
                        .unwrap()
                        .resize_zone(&id, hotplugged_size)
                        .map_err(Error::MemoryManager)?;
                    // We update the memory zone config regardless of the
                    // actual 'resize-zone' operation result (happened or
                    // not), so that if the VM reboots it will be running
                    // with the last configured memory zone size.
                    zone.hotplugged_size = Some(hotplugged_size);

                    return Ok(());
                } else {
                    error!(
                        "Invalid to ask less ({}) than boot RAM ({}) for \
                        this memory zone",
                        desired_memory, zone.size,
                    );
                    return Err(Error::ResizeZone);
                }
            }
        }
    }

    error!("Could not find the memory zone {} for the resize", id);
    Err(Error::ResizeZone)
}

/// Hotplugs a device, records it in the VM configuration and notifies the
/// guest that the PCI devices changed.
///
/// # Errors
///
/// Returns `Error::DeviceManager` if adding the device or the notification
/// fails.
pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_device(&mut device_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.devices, device_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a user device, records it in the VM configuration and notifies
/// the guest that the PCI devices changed.
///
/// # Errors
///
/// Returns `Error::DeviceManager` if adding the device or the notification
/// fails.
pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_user_device(&mut device_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.user_devices, device_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Removes the device `id`, drops it from the VM configuration and notifies
/// the guest that the PCI devices changed.
///
/// # Errors
///
/// Returns `Error::DeviceManager` if the removal or the notification fails.
pub fn remove_device(&mut self, id: String) -> Result<()> {
    self.device_manager
        .lock()
        .unwrap()
        .remove_device(id.clone())
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by removing the device. This is important to
    // ensure the device would not be created in case of a reboot.
    self.config.lock().unwrap().remove_device(&id);

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;
    Ok(())
}

/// Hotplugs a disk, records it in the VM configuration and notifies the
/// guest that the PCI devices changed.
///
/// # Errors
///
/// Returns `Error::DeviceManager` if adding the disk or the notification
/// fails.
pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_disk(&mut disk_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.disks, disk_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a filesystem device, records it in the VM configuration and
/// notifies the guest that the PCI devices changed.
///
/// # Errors
///
/// Returns `Error::DeviceManager` if adding the device or the notification
/// fails.
pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_fs(&mut fs_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.fs, fs_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a pmem device, records it in the VM configuration and notifies
/// the guest that the PCI devices changed.
///
/// # Errors
///
/// Returns `Error::DeviceManager` if adding the device or the notification
/// fails.
pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_pmem(&mut pmem_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.pmem, pmem_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a network device, records it in the VM configuration and
/// notifies the guest that the PCI devices changed.
///
/// # Errors
///
/// Returns `Error::DeviceManager` if adding the device or the notification
/// fails.
pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_net(&mut net_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.net, net_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a vDPA device, records it in the VM configuration and notifies
/// the guest that the PCI devices changed.
///
/// # Errors
///
/// Returns `Error::DeviceManager` if adding the device or the notification
/// fails.
pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_vdpa(&mut vdpa_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.vdpa, vdpa_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a vsock device, stores it as the VM configuration's single vsock
/// entry and notifies the guest that the PCI devices changed.
///
/// # Errors
///
/// Returns `Error::DeviceManager` if adding the device or the notification
/// fails.
pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_vsock(&mut vsock_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        config.vsock = Some(vsock_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Returns the counters reported by the device manager. Always `Ok`.
pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
    Ok(self.device_manager.lock().unwrap().counters())
}

/// Opens the configured firmware file and parses its TDVF sections.
///
/// # Errors
///
/// Returns `Error::TdxFirmwareMissing` if no firmware is configured,
/// `Error::LoadTdvf` if the file cannot be opened and `Error::ParseTdvf` if
/// parsing fails.
///
/// # Panics
///
/// Panics if no payload is configured.
#[cfg(feature = "tdx")]
fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
    use arch::x86_64::tdx::*;

    let firmware_path = self
        .config
        .lock()
        .unwrap()
        .payload
        .as_ref()
        .unwrap()
        .firmware
        .clone()
        .ok_or(Error::TdxFirmwareMissing)?;
    // The TDVF file contains a table of section as well as code
    let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

    // For all the sections allocate some RAM backing them
    parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
}

/// Builds the list of `(start, size, is_ram)` resources obtained by
/// interleaving the guest RAM regions with the given TDVF sections.
///
/// Sections are consumed with `pop()`, i.e. from the end of
/// `sorted_sections`; the caller in this file passes them sorted by address
/// and then reversed, so the lowest address is popped first.
#[cfg(feature = "tdx")]
fn hob_memory_resources(
    mut sorted_sections: Vec<TdvfSection>,
    guest_memory: &GuestMemoryMmap,
) -> Vec<(u64, u64, bool)> {
    let mut list = Vec::new();

    let mut current_section = sorted_sections.pop();

    // RAM regions interleaved with TDVF sections
    let mut next_start_addr = 0;
    for region in guest_memory.iter() {
        let region_start = region.start_addr().0;
        let region_end = region.last_addr().0;
        if region_start > next_start_addr {
            next_start_addr = region_start;
        }

        loop {
            let (start, size, ram) = if let Some(section) = &current_section {
                if section.address <= next_start_addr {
                    (section.address, section.size, false)
                } else {
                    // RAM up to the next section or the end of the region,
                    // whichever comes first.
                    let last_addr = std::cmp::min(section.address - 1, region_end);
                    (next_start_addr, last_addr - next_start_addr + 1, true)
                }
            } else {
                (next_start_addr, region_end - next_start_addr + 1, true)
            };

            list.push((start, size, ram));

            if !ram {
                current_section = sorted_sections.pop();
            }

            next_start_addr = start + size;

            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            if next_start_addr > region_end {
                break;
            }
        }
    }

    // Once all the interleaved sections have been processed, let's simply
    // pull the remaining ones.
    if let Some(section) = current_section {
        list.push((section.address, section.size, false));
    }
    while let Some(section) = sorted_sections.pop() {
        list.push((section.address, section.size, false));
    }

    list
}

/// Allocates RAM for the TDVF sections that fall outside guest RAM, copies
/// the firmware, payload and payload parameters into guest memory and
/// generates the HOB (memory resources, MMIO resources, ACPI tables and
/// payload info).
///
/// Returns the guest address of the `TdHob` section.
///
/// # Errors
///
/// Returns `Error::AllocatingTdvfMemory`, `Error::TdxFirmwareMissing`,
/// `Error::LoadTdvf`, `Error::LoadPayload`, `Error::InvalidPayloadType` or
/// `Error::PopulateHob` depending on the failing step.
///
/// # Panics
///
/// Panics if no payload is configured, if `sections` contains no `TdHob`
/// section, or if copying data into guest memory fails.
#[cfg(feature = "tdx")]
fn populate_tdx_sections(
    &mut self,
    sections: &[TdvfSection],
    guid_found: bool,
) -> Result<Option<u64>> {
    use arch::x86_64::tdx::*;
    // Get the memory end *before* we start adding TDVF ram regions
    let boot_guest_memory = self
        .memory_manager
        .lock()
        .as_ref()
        .unwrap()
        .boot_guest_memory();
    for section in sections {
        // No need to allocate if the section falls within guest RAM ranges
        if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
            info!(
                "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                section
            );
            continue;
        }

        info!("Allocating TDVF Section: {:x?}", section);
        self.memory_manager
            .lock()
            .unwrap()
            .add_ram_region(GuestAddress(section.address), section.size as usize)
            .map_err(Error::AllocatingTdvfMemory)?;
    }

    // The TDVF file contains a table of section as well as code
    let firmware_path = self
        .config
        .lock()
        .unwrap()
        .payload
        .as_ref()
        .unwrap()
        .firmware
        .clone()
        .ok_or(Error::TdxFirmwareMissing)?;
    let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

    // The guest memory at this point now has all the required regions so it
    // is safe to copy from the TDVF file into it.
    let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
    let mem = guest_memory.memory();
    let mut payload_info = None;
    let mut hob_offset = None;
    for section in sections {
        info!("Populating TDVF Section: {:x?}", section);
        match section.r#type {
            TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                info!("Copying section to guest memory");
                firmware_file
                    .seek(SeekFrom::Start(section.data_offset as u64))
                    .map_err(Error::LoadTdvf)?;
                mem.read_volatile_from(
                    GuestAddress(section.address),
                    &mut firmware_file,
                    section.data_size as usize,
                )
                .unwrap();
            }
            TdvfSectionType::TdHob => {
                hob_offset = Some(section.address);
            }
            TdvfSectionType::Payload => {
                info!("Copying payload to guest memory");
                if let Some(payload_file) = self.kernel.as_mut() {
                    let payload_size = payload_file
                        .seek(SeekFrom::End(0))
                        .map_err(Error::LoadPayload)?;

                    // Read the setup header from offset 0x1f1 and validate
                    // its magic, version and load flags below.
                    payload_file
                        .seek(SeekFrom::Start(0x1f1))
                        .map_err(Error::LoadPayload)?;

                    let mut payload_header = linux_loader::bootparam::setup_header::default();
                    payload_file
                        .read_volatile(&mut payload_header.as_bytes())
                        .unwrap();

                    if payload_header.header != 0x5372_6448 {
                        return Err(Error::InvalidPayloadType);
                    }

                    if (payload_header.version < 0x0200)
                        || ((payload_header.loadflags & 0x1) == 0x0)
                    {
                        return Err(Error::InvalidPayloadType);
                    }

                    payload_file.rewind().map_err(Error::LoadPayload)?;
                    mem.read_volatile_from(
                        GuestAddress(section.address),
                        payload_file,
                        payload_size as usize,
                    )
                    .unwrap();

                    // Create the payload info that will be inserted into
                    // the HOB.
                    payload_info = Some(PayloadInfo {
                        image_type: PayloadImageType::BzImage,
                        entry_point: section.address,
                    });
                }
            }
            TdvfSectionType::PayloadParam => {
                info!("Copying payload parameters to guest memory");
                let cmdline = Self::generate_cmdline(
                    self.config.lock().unwrap().payload.as_ref().unwrap(),
                )?;
                mem.write_slice(
                    cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                    GuestAddress(section.address),
                )
                .unwrap();
            }
            _ => {}
        }
    }

    // Generate HOB
    let mut hob = TdHob::start(hob_offset.unwrap());

    let mut sorted_sections = sections.to_vec();
    sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

    // Sorted by descending address so that `pop()` yields the lowest first.
    sorted_sections.sort_by_key(|section| section.address);
    sorted_sections.reverse();

    for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
        hob.add_memory_resource(&mem, start, size, ram, guid_found)
            .map_err(Error::PopulateHob)?;
    }

    // MMIO regions
    hob.add_mmio_resource(
        &mem,
        arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        arch::layout::APIC_START.raw_value()
            - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
    )
    .map_err(Error::PopulateHob)?;
    let start_of_device_area = self
        .memory_manager
        .lock()
        .unwrap()
        .start_of_device_area()
        .raw_value();
    let end_of_device_area = self
        .memory_manager
        .lock()
        .unwrap()
        .end_of_device_area()
        .raw_value();
    hob.add_mmio_resource(
        &mem,
        start_of_device_area,
        end_of_device_area - start_of_device_area,
    )
    .map_err(Error::PopulateHob)?;

    // Loop over the ACPI tables and copy them to the HOB.

    for acpi_table in crate::acpi::create_acpi_tables_tdx(
        &self.device_manager,
        &self.cpu_manager,
        &self.memory_manager,
        &self.numa_nodes,
    ) {
        hob.add_acpi_table(&mem, acpi_table.as_slice())
            .map_err(Error::PopulateHob)?;
    }

    // If a payload info has been created, let's insert it into the HOB.
    if let Some(payload_info) = payload_info {
        hob.add_payload(&mem, payload_info)
            .map_err(Error::PopulateHob)?;
    }

    hob.finish(&mem).map_err(Error::PopulateHob)?;

    Ok(hob_offset)
}

/// Registers every TDVF section with the hypervisor through
/// `tdx_init_memory_region`, passing the host address backing the section.
///
/// # Errors
///
/// Returns `Error::InitializeTdxMemoryRegion` if the hypervisor call fails.
///
/// # Panics
///
/// Panics if a section address has no host mapping in guest memory.
#[cfg(feature = "tdx")]
fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
    let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
    let mem = guest_memory.memory();

    for section in sections {
        self.vm
            .tdx_init_memory_region(
                mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
                section.address,
                section.size,
                /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
                section.attributes == 1,
            )
            .map_err(Error::InitializeTdxMemoryRegion)?;
    }

    Ok(())
}

// Creates ACPI tables
// In case of TDX being used, this is a no-op since the tables will be
// created and passed when populating the HOB.
2010 2011 fn create_acpi_tables(&self) -> Option<GuestAddress> { 2012 #[cfg(feature = "tdx")] 2013 if self.config.lock().unwrap().is_tdx_enabled() { 2014 return None; 2015 } 2016 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 2017 let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); 2018 let rsdp_addr = crate::acpi::create_acpi_tables( 2019 &mem, 2020 &self.device_manager, 2021 &self.cpu_manager, 2022 &self.memory_manager, 2023 &self.numa_nodes, 2024 tpm_enabled, 2025 ); 2026 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 2027 2028 Some(rsdp_addr) 2029 } 2030 2031 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 2032 trace_scoped!("entry_point"); 2033 2034 self.load_payload_handle 2035 .take() 2036 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 2037 .transpose() 2038 } 2039 2040 pub fn boot(&mut self) -> Result<()> { 2041 trace_scoped!("Vm::boot"); 2042 info!("Booting VM"); 2043 event!("vm", "booting"); 2044 let current_state = self.get_state()?; 2045 if current_state == VmState::Paused { 2046 return self.resume().map_err(Error::Resume); 2047 } 2048 2049 let new_state = if self.stop_on_boot { 2050 VmState::BreakPoint 2051 } else { 2052 VmState::Running 2053 }; 2054 current_state.valid_transition(new_state)?; 2055 2056 // Do earlier to parallelise with loading kernel 2057 #[cfg(target_arch = "x86_64")] 2058 cfg_if::cfg_if! { 2059 if #[cfg(feature = "sev_snp")] { 2060 let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled(); 2061 let rsdp_addr = if sev_snp_enabled { 2062 // In case of SEV-SNP guest ACPI tables are provided via 2063 // IGVM. So skip the creation of ACPI tables and set the 2064 // rsdp addr to None. 2065 None 2066 } else { 2067 self.create_acpi_tables() 2068 }; 2069 } else { 2070 let rsdp_addr = self.create_acpi_tables(); 2071 } 2072 } 2073 2074 // Load kernel synchronously or if asynchronous then wait for load to 2075 // finish. 
2076 let entry_point = self.entry_point()?; 2077 2078 #[cfg(feature = "tdx")] 2079 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2080 2081 // Configure the vcpus that have been created 2082 let vcpus = self.cpu_manager.lock().unwrap().vcpus(); 2083 for vcpu in vcpus { 2084 let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2085 let boot_setup = entry_point.map(|e| (e, guest_memory)); 2086 self.cpu_manager 2087 .lock() 2088 .unwrap() 2089 .configure_vcpu(vcpu, boot_setup) 2090 .map_err(Error::CpuManager)?; 2091 } 2092 2093 #[cfg(feature = "tdx")] 2094 let (sections, guid_found) = if tdx_enabled { 2095 self.extract_tdvf_sections()? 2096 } else { 2097 (Vec::new(), false) 2098 }; 2099 2100 // Configuring the TDX regions requires that the vCPUs are created. 2101 #[cfg(feature = "tdx")] 2102 let hob_address = if tdx_enabled { 2103 // TDX sections are written to memory. 2104 self.populate_tdx_sections(§ions, guid_found)? 2105 } else { 2106 None 2107 }; 2108 2109 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 2110 // available after they are configured 2111 #[cfg(target_arch = "aarch64")] 2112 let rsdp_addr = self.create_acpi_tables(); 2113 2114 // Configure shared state based on loaded kernel 2115 entry_point 2116 .map(|entry_point| { 2117 // Safe to unwrap rsdp_addr as we know it can't be None when 2118 // the entry_point is Some. 2119 self.configure_system(rsdp_addr.unwrap(), entry_point) 2120 }) 2121 .transpose()?; 2122 2123 #[cfg(target_arch = "x86_64")] 2124 // Note: For x86, always call this function before invoking start boot vcpus. 2125 // Otherwise guest would fail to boot because we haven't created the 2126 // userspace mappings to update the hypervisor about the memory mappings. 2127 // These mappings must be created before we start the vCPU threads for 2128 // the very first time. 
2129 self.memory_manager 2130 .lock() 2131 .unwrap() 2132 .allocate_address_space() 2133 .map_err(Error::MemoryManager)?; 2134 2135 #[cfg(feature = "tdx")] 2136 if let Some(hob_address) = hob_address { 2137 // With the HOB address extracted the vCPUs can have 2138 // their TDX state configured. 2139 self.cpu_manager 2140 .lock() 2141 .unwrap() 2142 .initialize_tdx(hob_address) 2143 .map_err(Error::CpuManager)?; 2144 // Let the hypervisor know which memory ranges are shared with the 2145 // guest. This prevents the guest from ignoring/discarding memory 2146 // regions provided by the host. 2147 self.init_tdx_memory(§ions)?; 2148 // With TDX memory and CPU state configured TDX setup is complete 2149 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2150 } 2151 2152 self.cpu_manager 2153 .lock() 2154 .unwrap() 2155 .start_boot_vcpus(new_state == VmState::BreakPoint) 2156 .map_err(Error::CpuManager)?; 2157 2158 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2159 *state = new_state; 2160 event!("vm", "booted"); 2161 Ok(()) 2162 } 2163 2164 pub fn restore(&mut self) -> Result<()> { 2165 event!("vm", "restoring"); 2166 2167 #[cfg(target_arch = "x86_64")] 2168 // Note: For x86, always call this function before invoking start boot vcpus. 2169 // Otherwise guest would fail to boot because we haven't created the 2170 // userspace mappings to update the hypervisor about the memory mappings. 2171 // These mappings must be created before we start the vCPU threads for 2172 // the very first time for the restored VM. 2173 self.memory_manager 2174 .lock() 2175 .unwrap() 2176 .allocate_address_space() 2177 .map_err(Error::MemoryManager)?; 2178 2179 // Now we can start all vCPUs from here. 2180 self.cpu_manager 2181 .lock() 2182 .unwrap() 2183 .start_restored_vcpus() 2184 .map_err(Error::CpuManager)?; 2185 2186 event!("vm", "restored"); 2187 Ok(()) 2188 } 2189 2190 /// Gets a thread-safe reference counted pointer to the VM configuration. 
pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
        Arc::clone(&self.config)
    }

    /// Get the VM state.
    ///
    /// Returns `Error::PoisonedState` if the state lock cannot be acquired for
    /// reading without blocking.
    pub fn get_state(&self) -> Result<VmState> {
        self.state
            .try_read()
            .map_err(|_| Error::PoisonedState)
            .map(|state| *state)
    }

    /// Gets the actual size of the balloon.
    pub fn balloon_size(&self) -> u64 {
        self.device_manager.lock().unwrap().balloon_size()
    }

    /// Sends every memory slot file descriptor over `socket`.
    ///
    /// For each `(slot, fd)` pair drained from the memory manager a
    /// `memory_fd` request is written, the slot number is sent together with
    /// the descriptor, and the peer's response is read back.
    ///
    /// # Errors
    ///
    /// Returns `MigratableError::MigrateSend` if a request or descriptor
    /// cannot be sent, or if the peer answers with a status other than
    /// `Status::Ok` (in which case an `abandon` request is sent first).
    pub fn send_memory_fds(
        &mut self,
        socket: &mut UnixStream,
    ) -> std::result::Result<(), MigratableError> {
        for (slot, fd) in self
            .memory_manager
            .lock()
            .unwrap()
            .memory_slot_fds()
            .drain()
        {
            Request::memory_fd(std::mem::size_of_val(&slot) as u64)
                .write_to(socket)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
                })?;
            socket
                .send_with_fd(&slot.to_le_bytes()[..], fd)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
                })?;

            let res = Response::read_from(socket)?;
            if res.status() != Status::Ok {
                warn!("Error during memory fd migration");
                Request::abandon().write_to(socket)?;
                // Best effort: the reply to the abandon request is not needed.
                Response::read_from(socket).ok();
                return Err(MigratableError::MigrateSend(anyhow!(
                    "Error during memory fd migration"
                )));
            }
        }

        Ok(())
    }

    /// Writes the guest memory described by `ranges` to `fd`.
    ///
    /// # Errors
    ///
    /// Returns `MigratableError::MigrateSend` if a write fails or makes no
    /// progress before the whole range has been transferred.
    pub fn send_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: WriteVolatile,
    {
        let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't write
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of write_all_to() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = mem
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        fd,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| {
                        MigratableError::MigrateSend(anyhow!(
                            "Error transferring memory to socket: {}",
                            e
                        ))
                    })?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }

                // A zero-length write with data still pending would otherwise
                // make this loop spin forever.
                if bytes_written == 0 {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Error transferring memory to socket: no progress at {:#x}",
                        range.gpa + offset
                    )));
                }
            }
        }

        Ok(())
    }

    /// Returns the memory range table reported by the memory manager.
    pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        self.memory_manager
            .lock()
            .unwrap()
            .memory_range_table(false)
    }

    /// Returns the device tree held by the device manager.
    pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
        self.device_manager.lock().unwrap().device_tree()
    }

    /// Asks the device manager to activate its virtio devices.
    pub fn activate_virtio_devices(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .activate_virtio_devices()
            .map_err(Error::ActivateVirtioDevices)
    }

    /// Notifies the guest that the power button was pressed.
    #[cfg(target_arch = "x86_64")]
    pub fn power_button(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .notify_power_button()
            .map_err(Error::PowerButton)
    }

    /// Notifies the guest that the power button was pressed.
    #[cfg(target_arch = "aarch64")]
    pub fn power_button(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .notify_power_button()
            .map_err(Error::PowerButton)
    }

    /// Returns the memory manager's snapshot data.
    pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
        self.memory_manager.lock().unwrap().snapshot_data()
    }

    /// Dispatches a GDB request for vCPU `cpu_id` and builds its response.
    ///
    /// Requests that produce data (`ReadRegs`, `ReadMem`, `ActiveVcpus`)
    /// return a dedicated payload; every other request returns
    /// `GdbResponsePayload::CommandComplete`. Failures are wrapped in
    /// `Error::Debug`.
    #[cfg(feature = "guest_debug")]
    pub fn debug_request(
        &mut self,
        gdb_request: &GdbRequestPayload,
        cpu_id: usize,
    ) -> Result<GdbResponsePayload> {
        use GdbRequestPayload::*;
        match gdb_request {
            SetSingleStep(single_step) => {
                self.set_guest_debug(cpu_id, &[], *single_step)
                    .map_err(Error::Debug)?;
            }
            SetHwBreakPoint(addrs) => {
                self.set_guest_debug(cpu_id, addrs, false)
                    .map_err(Error::Debug)?;
            }
            Pause => {
                self.debug_pause().map_err(Error::Debug)?;
            }
            Resume => {
                self.debug_resume().map_err(Error::Debug)?;
            }
            ReadRegs => {
                let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
            }
            WriteRegs(regs) => {
                self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
            }
            ReadMem(vaddr, len) => {
                let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
                let mem = self
                    .read_mem(&guest_memory, cpu_id, *vaddr, *len)
                    .map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::MemoryRegion(mem));
            }
            WriteMem(vaddr, data) => {
                let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
                self.write_mem(&guest_memory, cpu_id, vaddr, data)
                    .map_err(Error::Debug)?;
            }
            ActiveVcpus => {
                let active_vcpus = self.active_vcpus();
                return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
            }
        }
        Ok(GdbResponsePayload::CommandComplete)
    }

    /// Builds the `DumpState` used to write a coredump to `destination_url`.
    ///
    /// The destination file is opened with `create_new`, so it must not
    /// already exist.
    ///
    /// # Errors
    ///
    /// Returns an error if the URL cannot be converted to a file path, if the
    /// number of guest RAM mappings is too large for the program header count,
    /// or if the destination file cannot be created.
    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn get_dump_state(
        &mut self,
        destination_url: &str,
    ) -> std::result::Result<DumpState, GuestDebuggableError> {
        let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
        let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
        let elf_sh_info = 0;
        let coredump_file_path = url_to_file(destination_url)?;
        let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();

        // Report an oversized mapping count as an error instead of panicking:
        // this function already returns a `Result`.
        if mapping_num >= UINT16_MAX - 2 {
            return Err(GuestDebuggableError::Coredump(anyhow!(
                "mapping num beyond 65535 not supported"
            )));
        }
        // The count starts at 1 and grows by one per guest RAM mapping.
        let elf_phdr_num = 1 + mapping_num as u16;

        let coredump_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(coredump_file_path)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
        let mem_data = self
            .memory_manager
            .lock()
            .unwrap()
            .coredump_memory_regions(mem_offset);

        Ok(DumpState {
            elf_note_size,
            elf_phdr_num,
            elf_sh_info,
            mem_offset,
            mem_info: Some(mem_data),
            file: Some(coredump_file),
        })
    }

    /// Returns the file offset at which guest memory starts in the coredump:
    /// the ELF header size, plus the note size, plus `phdr_num` program
    /// headers.
    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
        size_of::<elf::Elf64_Ehdr>() as u64
            + note_size as u64
            + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
    }

    /// Writes the whole coredump (headers, notes, loads, per-CPU notes and
    /// guest memory) to `destination_url`.
    ///
    /// Kept separate from `coredump()` so the caller can resume the VM
    /// whether or not any of these steps fails.
    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn coredump_to_file(
        &mut self,
        destination_url: &str,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let coredump_state = self.get_dump_state(destination_url)?;

        self.write_header(&coredump_state)?;
        self.write_note(&coredump_state)?;
        self.write_loads(&coredump_state)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_elf64_note(&coredump_state)?;
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_vmm_note(&coredump_state)?;

        self.memory_manager
            .lock()
            .unwrap()
            .coredump_iterate_save_mem(&coredump_state)?;

        Ok(())
    }
}

impl Pausable for Vm {
    /// Pauses the VM and moves it to `VmState::Paused`.
    ///
    /// On KVM/x86_64 builds the VM clock is saved first. Pending virtio
    /// devices are activated, then the vCPUs and the devices are paused.
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "pausing");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
            clock.reset_flags();
            self.saved_clock = Some(clock);
        }

        // Before pausing the vCPUs activate any pending virtio devices that might
        // need activation between starting the pause (or e.g. a migration it's part of)
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    /// Resumes the VM and moves it to `VmState::Running`.
    ///
    /// The vCPUs are resumed first; on KVM/x86_64 builds a previously saved
    /// clock is then restored, and finally the devices are resumed.
    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }
        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
        *state = new_state;
        event!("vm", "resumed");
        Ok(())
    }
}

/// VM-level data serialized into a snapshot.
///
/// Both fields only exist on KVM/x86_64 builds.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub clock: Option<hypervisor::ClockData>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}

/// Identifier under which the VM snapshot is stored.
pub const VM_SNAPSHOT_ID: &str = "vm";

impl Snapshottable for Vm {
    fn id(&self) -> String {
        VM_SNAPSHOT_ID.to_string()
    }

    /// Snapshots the VM together with its CPU, memory and device managers.
    ///
    /// # Errors
    ///
    /// Returns `MigratableError::Snapshot` if TDX is enabled, if the VM state
    /// cannot be read, if the VM is not paused, or if the VM-level data cannot
    /// be generated or serialized. Errors from the managers' own `snapshot()`
    /// calls are propagated unchanged.
    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        event!("vm", "snapshotting");

        #[cfg(feature = "tdx")]
        {
            if self.config.lock().unwrap().is_tdx_enabled() {
                return Err(MigratableError::Snapshot(anyhow!(
                    "Snapshot not possible with TDX VM"
                )));
            }
        }

        // Do not panic if the state lock is contended or poisoned.
        let current_state = self
            .get_state()
            .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get VM state: {:?}", e)))?;
        if current_state != VmState::Paused {
            return Err(MigratableError::Snapshot(anyhow!(
                "Trying to snapshot while VM is running"
            )));
        }

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let common_cpuid = {
            // Read all CPU settings under a single lock so they come from the
            // same view of the configuration.
            let (amx, max_phys_bits, kvm_hyperv) = {
                let config = self.config.lock().unwrap();
                (
                    config.cpus.features.amx,
                    config.cpus.max_phys_bits,
                    config.cpus.kvm_hyperv,
                )
            };
            let phys_bits = physical_bits(&self.hypervisor, max_phys_bits);
            arch::generate_common_cpuid(
                &self.hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections: None,
                    phys_bits,
                    kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx: false,
                    amx,
                },
            )
            .map_err(|e| {
                MigratableError::Snapshot(anyhow!("Error generating common cpuid: {:?}", e))
            })?
        };

        let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            clock: self.saved_clock,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            common_cpuid,
        })
        .map_err(|e| MigratableError::Snapshot(e.into()))?;

        let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data));

        let (id, snapshot) = {
            let mut cpu_manager = self.cpu_manager.lock().unwrap();
            (cpu_manager.id(), cpu_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);
        let (id, snapshot) = {
            let mut memory_manager = self.memory_manager.lock().unwrap();
            (memory_manager.id(), memory_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);
        let (id, snapshot) = {
            let mut device_manager = self.device_manager.lock().unwrap();
            (device_manager.id(), device_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);

        event!("vm", "snapshotted");
        Ok(vm_snapshot)
    }
}

impl Transportable for Vm {
    /// Writes the VM configuration and the snapshot state as JSON files under
    /// `destination_url`, then asks the memory manager to send its own
    /// snapshot.
    ///
    /// Both files are opened with `create_new`, so they must not already
    /// exist.
    fn send(
        &self,
        snapshot: &Snapshot,
        destination_url: &str,
    ) -> std::result::Result<(), MigratableError> {
        let mut snapshot_config_path = url_to_path(destination_url)?;
        snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);

        // Create the snapshot config file
        let mut snapshot_config_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(snapshot_config_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Serialize and write the snapshot config
        let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // `write_all` is required: a plain `write` may only write part of the
        // buffer and would silently truncate the file.
        snapshot_config_file
            .write_all(vm_config.as_bytes())
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let mut snapshot_state_path = url_to_path(destination_url)?;
        snapshot_state_path.push(SNAPSHOT_STATE_FILE);

        // Create the snapshot state file
        let mut snapshot_state_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(snapshot_state_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Serialize and write the snapshot state
        let vm_state =
            serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;

        snapshot_state_file
            .write_all(&vm_state)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Tell the memory manager to also send/write its own snapshot.
        if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
            self.memory_manager
                .lock()
                .unwrap()
                .send(memory_manager_snapshot, destination_url)?;
        } else {
            return Err(MigratableError::MigrateSend(anyhow!(
                "Missing memory manager snapshot"
            )));
        }

        Ok(())
    }
}

// Every operation is forwarded to the memory manager first, then to the
// device manager.
impl Migratable for Vm {
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_dirty_log()?;
        self.device_manager.lock().unwrap().start_dirty_log()
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().stop_dirty_log()?;
        self.device_manager.lock().unwrap().stop_dirty_log()
    }

    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        Ok(MemoryRangeTable::new_from_tables(vec![
            self.memory_manager.lock().unwrap().dirty_log()?,
            self.device_manager.lock().unwrap().dirty_log()?,
        ]))
    }

    fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_migration()?;
        self.device_manager.lock().unwrap().start_migration()
    }

    fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().complete_migration()?;
        self.device_manager.lock().unwrap().complete_migration()
    }
}

#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM if it is running, then marks it as `VmState::BreakPoint`.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        // The read guard is a temporary of the condition, so it is released
        // before `pause()` takes the write lock.
        if *self
            .state
            .read()
            .map_err(|_| DebuggableError::PoisonedState)?
            == VmState::Running
        {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes the VM if it is currently in `VmState::BreakPoint`.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self
            .state
            .read()
            .map_err(|_| DebuggableError::PoisonedState)?
            == VmState::BreakPoint
        {
            // NOTE(review): a resume failure is reported through the `Pause`
            // variant; switch to a `Resume` variant if `DebuggableError` has
            // one - confirm against its definition.
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Returns the number of active vCPUs, or the number of boot vCPUs when
    /// none is active yet.
    fn active_vcpus(&self) -> usize {
        // Take the lock once so both values come from the same state.
        let cpu_manager = self.cpu_manager.lock().unwrap();
        let active_vcpus = cpu_manager.active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            cpu_manager.boot_vcpus() as usize
        }
    }
}

/// Largest value representable in 16 bits, as a `u32`.
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = u16::MAX as u32;

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes a coredump of the guest to `destination_url`.
    ///
    /// A running VM is paused for the duration of the dump and resumed
    /// afterwards, whether or not the dump succeeded. A paused VM stays
    /// paused. Any other state is rejected.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` for TDX VMs, when the VM state
    /// cannot be read, or when the VM is neither running nor paused. If both
    /// the dump and the subsequent resume fail, the dump error is returned.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        // Do not panic if the state lock is contended or poisoned.
        let current_state = self.get_state().map_err(|e| {
            GuestDebuggableError::Coredump(anyhow!("Could not get VM state: {:?}", e))
        })?;

        let resume = match current_state {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                true
            }
            VmState::Paused => false,
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        };

        let dump_result = self.coredump_to_file(destination_url);

        if resume {
            // Resume even when the dump failed, so a VM that was running
            // before the request is not left paused.
            let resume_result = self.resume().map_err(GuestDebuggableError::Resume);
            return dump_result.and(resume_result);
        }

        dump_result
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::BreakPoint => {
                // Check the transitions from Breakpoint
                assert!(state.valid_transition(VmState::Created).is_ok());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    const LEN: u64 = 4096;

    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");
        assert!(create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .is_ok())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}