1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::{ 15 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 16 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 17 }; 18 use crate::config::{NumaConfig, PayloadConfig}; 19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 20 use crate::coredump::{ 21 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 22 }; 23 use crate::cpu; 24 use crate::device_manager::{Console, DeviceManager, DeviceManagerError, PtyPair}; 25 use crate::device_tree::DeviceTree; 26 #[cfg(feature = "guest_debug")] 27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 28 use crate::memory_manager::{ 29 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 30 }; 31 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 32 use crate::migration::get_vm_snapshot; 33 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 34 use crate::migration::url_to_file; 35 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 36 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 37 use crate::GuestMemoryMmap; 38 use crate::{ 39 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 40 }; 41 use anyhow::anyhow; 42 use arch::get_host_cpu_phys_bits; 43 #[cfg(target_arch = "x86_64")] 44 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 45 #[cfg(feature = "tdx")] 46 use arch::x86_64::tdx::TdvfSection; 47 
use arch::EntryPoint; 48 #[cfg(target_arch = "aarch64")] 49 use arch::PciSpaceInfo; 50 use arch::{NumaNode, NumaNodes}; 51 #[cfg(target_arch = "aarch64")] 52 use devices::interrupt_controller; 53 use devices::AcpiNotificationFlags; 54 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 55 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 56 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 57 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 58 use hypervisor::{HypervisorVmError, VmOps}; 59 use linux_loader::cmdline::Cmdline; 60 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 61 use linux_loader::elf; 62 #[cfg(target_arch = "x86_64")] 63 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 64 #[cfg(target_arch = "aarch64")] 65 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 66 use linux_loader::loader::KernelLoader; 67 use seccompiler::{apply_filter, SeccompAction}; 68 use serde::{Deserialize, Serialize}; 69 use signal_hook::{consts::SIGWINCH, iterator::backend::Handle, iterator::Signals}; 70 use std::cmp; 71 use std::collections::BTreeMap; 72 use std::collections::HashMap; 73 use std::convert::TryInto; 74 use std::fs::{File, OpenOptions}; 75 use std::io::{self, Seek, SeekFrom, Write}; 76 #[cfg(feature = "tdx")] 77 use std::mem; 78 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 79 use std::mem::size_of; 80 use std::num::Wrapping; 81 use std::ops::Deref; 82 use std::os::unix::net::UnixStream; 83 use std::panic::AssertUnwindSafe; 84 use std::sync::{Arc, Mutex, RwLock}; 85 use std::time::Instant; 86 use std::{result, str, thread}; 87 use thiserror::Error; 88 use tracer::trace_scoped; 89 use vm_device::Bus; 90 #[cfg(feature = "tdx")] 91 use vm_memory::{Address, ByteValued, GuestMemory, GuestMemoryRegion}; 92 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic}; 93 use vm_migration::protocol::{Request, Response, Status}; 94 use vm_migration::{ 95 
    protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
    SnapshotData, Snapshottable, Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::unblock_signal;
use vmm_sys_util::sock_ctrl_msg::ScmSocket;
use vmm_sys_util::terminal::Terminal;

/// Errors associated with VM management
#[derive(Debug, Error)]
pub enum Error {
    #[error("Cannot open kernel file: {0}")]
    KernelFile(#[source] io::Error),

    #[error("Cannot open initramfs file: {0}")]
    InitramfsFile(#[source] io::Error),

    #[error("Cannot load the kernel into memory: {0}")]
    KernelLoad(#[source] linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot load the UEFI binary in memory: {0:?}")]
    UefiLoad(arch::aarch64::uefi::Error),

    #[error("Cannot load the initramfs into memory")]
    InitramfsLoad,

    #[error("Cannot load the kernel command line in memory: {0}")]
    LoadCmdLine(#[source] linux_loader::loader::Error),

    #[error("Cannot modify the kernel command line: {0}")]
    CmdLineInsertStr(#[source] linux_loader::cmdline::Error),

    #[error("Cannot create the kernel command line: {0}")]
    CmdLineCreate(#[source] linux_loader::cmdline::Error),

    #[error("Cannot configure system: {0}")]
    ConfigureSystem(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot enable interrupt controller: {0:?}")]
    EnableInterruptController(interrupt_controller::Error),

    #[error("VM state is poisoned")]
    PoisonedState,

    #[error("Error from device manager: {0:?}")]
    DeviceManager(DeviceManagerError),

    #[error("Cannot setup terminal in raw mode: {0}")]
    SetTerminalRaw(#[source] vmm_sys_util::errno::Error),

    #[error("Cannot setup terminal in canonical mode.: {0}")]
    SetTerminalCanon(#[source] vmm_sys_util::errno::Error),

    #[error("Cannot spawn a signal handler thread: {0}")]
    SignalHandlerSpawn(#[source] io::Error),

    #[error("Failed to join on threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("VM config is missing")]
    VmMissingConfig,

    #[error("VM is not created")]
    VmNotCreated,

    #[error("VM is already created")]
    VmAlreadyCreated,

    #[error("VM is not running")]
    VmNotRunning,

    #[error("Cannot clone EventFd: {0}")]
    EventFdClone(#[source] io::Error),

    #[error("invalid VM state transition: {0:?} to {1:?}")]
    InvalidStateTransition(VmState, VmState),

    #[error("Error from CPU manager: {0}")]
    CpuManager(#[source] cpu::Error),

    #[error("Cannot pause devices: {0}")]
    PauseDevices(#[source] MigratableError),

    #[error("Cannot resume devices: {0}")]
    ResumeDevices(#[source] MigratableError),

    #[error("Cannot pause CPUs: {0}")]
    PauseCpus(#[source] MigratableError),

    #[error("Cannot resume cpus: {0}")]
    ResumeCpus(#[source] MigratableError),

    #[error("Cannot pause VM: {0}")]
    Pause(#[source] MigratableError),

    #[error("Cannot resume VM: {0}")]
    Resume(#[source] MigratableError),

    #[error("Memory manager error: {0:?}")]
    MemoryManager(MemoryManagerError),

    #[error("Eventfd write error: {0}")]
    EventfdError(#[source] std::io::Error),

    #[error("Cannot snapshot VM: {0}")]
    Snapshot(#[source] MigratableError),

    #[error("Cannot restore VM: {0}")]
    Restore(#[source] MigratableError),

    #[error("Cannot send VM snapshot: {0}")]
    SnapshotSend(#[source] MigratableError),

    #[error("Invalid restore source URL")]
    InvalidRestoreSourceUrl,

    #[error("Failed to validate config: {0}")]
    ConfigValidation(#[source] ValidationError),

    #[error("Too many virtio-vsock devices")]
    TooManyVsockDevices,

    #[error("Failed serializing into JSON: {0}")]
    SerializeJson(#[source] serde_json::Error),

    #[error("Invalid NUMA configuration")]
    InvalidNumaConfig,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Failed resizing a memory zone")]
    ResizeZone,

    #[error("Cannot activate virtio devices: {0:?}")]
    ActivateVirtioDevices(DeviceManagerError),

    #[error("Error triggering power button: {0:?}")]
    PowerButton(DeviceManagerError),

    #[error("Kernel lacks PVH header")]
    KernelMissingPvhHeader,

    #[error("Failed to allocate firmware RAM: {0:?}")]
    AllocateFirmwareMemory(MemoryManagerError),

    #[error("Error manipulating firmware file: {0}")]
    FirmwareFile(#[source] std::io::Error),

    #[error("Firmware too big")]
    FirmwareTooLarge,

    #[error("Failed to copy firmware to memory: {0}")]
    FirmwareLoad(#[source] vm_memory::GuestMemoryError),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on TDX firmware file: {0}")]
    LoadTdvf(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on the TDX payload file: {0}")]
    LoadPayload(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error parsing TDVF: {0}")]
    ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error populating TDX HOB: {0}")]
    PopulateHob(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error allocating TDVF memory: {0:?}")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX VM: {0}")]
    InitializeTdxVm(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX memory region: {0}")]
    InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error finalizing TDX VM: {0}")]
    FinalizeTdx(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("TDX firmware missing")]
    TdxFirmwareMissing,

    #[cfg(feature = "tdx")]
    #[error("Invalid TDX payload type")]
    InvalidPayloadType,

    #[cfg(feature = "guest_debug")]
    #[error("Error debugging VM: {0:?}")]
    Debug(DebuggableError),

    #[error("Error spawning kernel loading thread")]
    KernelLoadThreadSpawn(std::io::Error),

    #[error("Error joining kernel loading thread")]
    KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Payload configuration is not bootable")]
    InvalidPayload,

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    #[error("Error coredumping VM: {0:?}")]
    Coredump(GuestDebuggableError),
}
// Convenient alias for results produced by VM management code.
pub type Result<T> = result::Result<T, Error>;

/// Life-cycle states a VM can be in. Legal transitions between them are
/// enforced by `VmState::valid_transition` below.
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
    BreakPoint,
}

impl VmState {
    // Returns Ok(()) when moving from `self` to `new_state` is a legal
    // transition, and Error::InvalidStateTransition otherwise. Kept
    // exhaustive per source state so adding a variant forces an update here.
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Paused | VmState::BreakPoint => Ok(()),
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
            VmState::BreakPoint => match new_state {
                VmState::Created | VmState::Running => Ok(()),
                _ => Err(Error::InvalidStateTransition(self, new_state)),
            },
        }
    }
}

// Gives the hypervisor layer access to guest memory and to the MMIO/PIO
// buses via the `VmOps` trait implemented below.
struct VmOpsHandler {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    // Port I/O only exists on x86_64.
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
}

impl VmOps for VmOpsHandler {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        // Accesses to unregistered addresses are logged and otherwise ignored.
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            // The device returned a barrier: block until it is released.
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            info!("Guest PIO read to unregistered address 0x{:x}",
port);
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            // The device returned a barrier: block until it is released.
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}

/// Returns the number of guest physical address bits to use: the requested
/// maximum, capped by what the host CPU reports.
pub fn physical_bits(max_phys_bits: u8) -> u8 {
    let host_phys_bits = get_host_cpu_phys_bits();

    cmp::min(host_phys_bits, max_phys_bits)
}

pub struct Vm {
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    initramfs: Option<File>,
    // Worker threads joined on teardown (e.g. the signal handler thread).
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    // Whether stdin is a TTY (determines terminal raw/canonical handling).
    on_tty: bool,
    signals: Option<Handle>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,
    numa_nodes: NumaNodes,
    seccomp_action: SeccompAction,
    exit_evt: EventFd,
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    stop_on_boot: bool,
    // Handle of the thread loading the boot payload in the background.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}

impl Vm {
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];

    /// Assembles a `Vm` on top of an already constructed `MemoryManager`:
    /// validates the config, creates the NUMA topology, the CPU manager
    /// (and boot vCPUs) and the device manager. When `snapshot` is `Some`,
    /// the per-component state is restored from it and the VM starts Paused.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        timestamp: Instant,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new_from_memory_manager");

        let boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        // The payload is only loaded (asynchronously) on a cold boot; on
        // restore the guest memory content comes from the snapshot.
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(&memory_manager, &config)?
        } else {
            None
        };

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        // TDX forces the use of an IOMMU for all devices.
        #[cfg(feature = "tdx")]
        let force_iommu = tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        #[cfg(feature = "guest_debug")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "guest_debug"))]
        let stop_on_boot = false;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::new(Bus::new());
        let mmio_bus = Arc::new(Bus::new());

        // Shared by the vCPU threads (via CpuManager) and the DeviceManager.
        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus: io_bus.clone(),
            mmio_bus: mmio_bus.clone(),
        });

        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            vm.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt.try_clone().map_err(Error::EventFdClone)?,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            &hypervisor,
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
        )
        .map_err(Error::CpuManager)?;

        #[cfg(target_arch = "x86_64")]
        cpu_manager
            .lock()
            .unwrap()
            .populate_cpuid(
                &memory_manager,
                &hypervisor,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CpuManager)?;

        // The initial TDX configuration must be done before the vCPUs are
        // created
        #[cfg(feature = "tdx")]
        if tdx_enabled {
            let cpuid = cpu_manager.lock().unwrap().common_cpuid();
            let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
            vm.tdx_init(&cpuid, max_vcpus)
                .map_err(Error::InitializeTdxVm)?;
        }

        cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
            .map_err(Error::CpuManager)?;

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        let device_manager = DeviceManager::new(
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            hypervisor.hypervisor_type(),
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            cpu_manager.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            boot_id_list,
            timestamp,
            snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
            dynamic,
        )
        .map_err(Error::DeviceManager)?;

        device_manager
            .lock()
            .unwrap()
            .create_devices(serial_pty, console_pty, console_resize_pipe)
            .map_err(Error::DeviceManager)?;

        // SAFETY: trivially safe
        let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO) } != 0;

        // Open (but do not yet load) the payload files referenced by the
        // config; `unwrap_or_default()` maps a missing payload to None.
        #[cfg(feature = "tdx")]
        let kernel = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.kernel.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.initramfs.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::InitramfsFile)?;

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
            let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
            vm_snapshot.clock
        } else {
            None
        };

        // A restored VM starts out Paused; a freshly built one is Created.
        let vm_state = if snapshot.is_some() {
            VmState::Paused
        } else {
            VmState::Created
        };

        Ok(Vm {
            #[cfg(feature = "tdx")]
            kernel,
            initramfs,
            device_manager,
            config,
            on_tty,
            threads: Vec::with_capacity(1),
            signals: None,
            state: RwLock::new(vm_state),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock,
            numa_nodes,
            seccomp_action: seccomp_action.clone(),
            exit_evt,
            hypervisor,
            stop_on_boot,
            load_payload_handle,
        })
    }

    // Builds the NumaNodes map from the optional NumaConfig list, resolving
    // memory zone names (and, on x86_64, SGX EPC section names) against the
    // MemoryManager. Rejects duplicate node ids, unknown zones/sections,
    // distances to undeclared nodes and duplicate distance entries.
    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Can't define twice the same NUMA node");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            // A virtio-mem zone contributes a hotpluggable region.
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        // The destination must be one of the declared nodes.
                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has been already set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

    /// Creates the hypervisor VM and a `MemoryManager` — either fresh, or
    /// restored from the memory-manager snapshot when one is present — and
    /// then defers to `new_from_memory_manager`.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        vm_config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
        snapshot: Option<Snapshot>,
        source_url: Option<&str>,
        prefault: Option<bool>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new");

        let timestamp = Instant::now();

        // TDX does not apply when restoring from a snapshot.
        #[cfg(feature = "tdx")]
        let tdx_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_tdx_enabled()
        };

        let vm = Self::create_hypervisor_vm(
            &hypervisor,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        )?;

        let phys_bits = physical_bits(vm_config.lock().unwrap().cpus.max_phys_bits);

        let memory_manager = if let Some(snapshot) =
            snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
        {
            // NOTE(review): `prefault` is unwrapped here, so it is assumed
            // to always be Some on the restore path — confirm against the
            // restore caller.
            MemoryManager::new_from_snapshot(
                &snapshot,
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                source_url,
                prefault.unwrap(),
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            #[cfg(target_arch = "x86_64")]
            let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();

            MemoryManager::new(
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                None,
                phys_bits,
                #[cfg(feature = "tdx")]
                tdx_enabled,
                None,
                None,
                #[cfg(target_arch = "x86_64")]
                sgx_epc_config,
            )
            .map_err(Error::MemoryManager)?
        };

        Vm::new_from_memory_manager(
            vm_config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            timestamp,
            serial_pty,
            console_pty,
            console_resize_pipe,
            snapshot,
        )
    }

    /// Creates the raw hypervisor VM object and applies the x86_64-specific
    /// KVM setup (identity map, TSS address, split IRQ chip).
    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        hypervisor.check_required_extensions().unwrap();

        #[cfg(feature = "tdx")]
        let vm = hypervisor
            .create_vm_with_type(if tdx_enabled {
                2 // KVM_X86_TDX_VM
            } else {
                0 // KVM_X86_LEGACY_VM
            })
            .unwrap();
        #[cfg(not(feature = "tdx"))]
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }

    // Copies the initramfs file into guest memory and returns where it was
    // placed. Assumes `self.initramfs` is Some — callers check first.
    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs = self.initramfs.as_ref().unwrap();
        // Determine the file size by seeking to the end, then rewind.
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    /// Builds the kernel command line from the payload config and, on
    /// aarch64, appends the additions contributed by the device manager.
    pub fn generate_cmdline(
        payload: &PayloadConfig,
        #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
    ) -> Result<Cmdline> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
        if let Some(s) = payload.cmdline.as_ref() {
            cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
        }

        #[cfg(target_arch = "aarch64")]
        for entry in device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        Ok(cmdline)
    }

    // Copies the UEFI firmware file into the UEFI flash region.
    #[cfg(target_arch = "aarch64")]
    fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
        let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
        let mem = uefi_flash.memory();
        arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
            .map_err(Error::UefiLoad)?;
        Ok(())
    }

    // Loads either a PE kernel (at KERNEL_START) or a UEFI firmware image
    // (at UEFI_START) and returns the resulting entry point.
    #[cfg(target_arch = "aarch64")]
    fn load_kernel(
        firmware: Option<File>,
        kernel: Option<File>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let entry_addr = match (firmware, kernel) {
            (None, Some(mut kernel)) => {
                match linux_loader::loader::pe::PE::load(
                    mem.deref(),
                    Some(arch::layout::KERNEL_START),
                    &mut kernel,
                    None,
                ) {
                    Ok(entry_addr) => entry_addr.kernel_load,
                    // Try to load the binary as kernel PE file at first.
                    // If failed, retry to load it as UEFI binary.
                    // As the UEFI binary is formatless, it must be the last option to try.
                    Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                        Self::load_firmware(&kernel, memory_manager)?;
                        arch::layout::UEFI_START
                    }
                    Err(e) => {
                        return Err(Error::KernelLoad(e));
                    }
                }
            }
            (Some(firmware), None) => {
                Self::load_firmware(&firmware, memory_manager)?;
                arch::layout::UEFI_START
            }
            _ => return Err(Error::InvalidPayload),
        };

        Ok(EntryPoint { entry_addr })
    }

    // Loads an ELF kernel (and optional command line) into guest memory.
    // Only kernels exposing a PVH entry point are accepted.
    #[cfg(target_arch = "x86_64")]
    fn load_kernel(
        mut kernel: File,
        cmdline: Option<Cmdline>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        info!("Loading kernel");

        let mem = {
            let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
            guest_memory.memory()
        };
        let entry_addr = linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        )
        .map_err(Error::KernelLoad)?;

        if let Some(cmdline) = cmdline {
            linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
                .map_err(Error::LoadCmdLine)?;
        }

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest
            info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint {
                entry_addr: Some(entry_addr),
            })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

    // Dispatches on the payload config: either firmware alone (no initramfs
    // or cmdline allowed alongside it) or a kernel with optional extras.
    #[cfg(target_arch = "x86_64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        trace_scoped!("load_payload");
        match (
            &payload.firmware,
            &payload.kernel,
            &payload.initramfs,
            &payload.cmdline,
        ) {
            (Some(firmware), None, None, None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(firmware, None, memory_manager)
            }
            (None, Some(kernel), _, _) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                let cmdline = Self::generate_cmdline(payload)?;
                Self::load_kernel(kernel, Some(cmdline), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    // Dispatches on the payload config: exactly one of firmware or kernel
    // must be provided.
    #[cfg(target_arch = "aarch64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        match (&payload.firmware, &payload.kernel) {
            (Some(firmware), None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(Some(firmware), None, memory_manager)
            }
            (None, Some(kernel)) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                Self::load_kernel(None, Some(kernel), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    // Spawns a "payload_loader" thread that loads the payload into guest
    // memory. Returns None when there is no payload, or when TDX is enabled.
    fn load_payload_async(
        memory_manager: &Arc<Mutex<MemoryManager>>,
        config: &Arc<Mutex<VmConfig>>,
    ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
        // Kernel with TDX is loaded in a different manner
        #[cfg(feature = "tdx")]
        if config.lock().unwrap().is_tdx_enabled() {
            return Ok(None);
        }

        config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|payload| {
                let memory_manager = memory_manager.clone();
                let payload = payload.clone();

                std::thread::Builder::new()
                    .name("payload_loader".into())
                    .spawn(move || Self::load_payload(&payload, memory_manager))
                    .map_err(Error::KernelLoadThreadSpawn)
            })
            .transpose()
    }

    // Performs the arch-level system configuration (initramfs placement,
    // RSDP, SGX EPC region, SMBIOS identity fields) once memory and devices
    // are in place.
    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
        trace_scoped!("configure_system");
        info!("Configuring system");
        let mem =
self.memory_manager.lock().unwrap().boot_guest_memory();

let initramfs_config = match self.initramfs {
    Some(_) => Some(self.load_initramfs(&mem)?),
    None => None,
};

let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
let rsdp_addr = Some(rsdp_addr);
let sgx_epc_region = self
    .memory_manager
    .lock()
    .unwrap()
    .sgx_epc_region()
    .as_ref()
    .cloned();

// Take the config lock once for all platform-derived fields instead of
// re-acquiring it three times in a row.
let (serial_number, uuid, oem_strings) = {
    let config = self.config.lock().unwrap();
    let platform = config.platform.as_ref();
    (
        platform.and_then(|p| p.serial_number.clone()),
        platform.and_then(|p| p.uuid.clone()),
        platform.and_then(|p| p.oem_strings.clone()),
    )
};

// Re-borrow the owned strings as `&str` slices for the arch layer.
let oem_strings = oem_strings
    .as_deref()
    .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());

arch::configure_system(
    &mem,
    arch::layout::CMDLINE_START,
    &initramfs_config,
    boot_vcpus,
    rsdp_addr,
    sgx_epc_region,
    serial_number.as_deref(),
    uuid.as_deref(),
    oem_strings.as_deref(),
)
.map_err(Error::ConfigureSystem)?;
Ok(())
}

/// aarch64 variant: gathers the FDT inputs (mpidrs, topology, PCI space,
/// GIC, NUMA, PMU) and hands them to `arch::configure_system`.
#[cfg(target_arch = "aarch64")]
fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
    let cmdline = Self::generate_cmdline(
        self.config.lock().unwrap().payload.as_ref().unwrap(),
        &self.device_manager,
    )?;
    let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
    let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
    let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
    let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
    let initramfs_config = match self.initramfs {
        Some(_) => Some(self.load_initramfs(&mem)?),
        None => None,
    };

    let
device_info = &self
    .device_manager
    .lock()
    .unwrap()
    .get_device_info()
    .clone();

// Describe every PCI segment (ECAM base and device window) for the FDT.
for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
    pci_space_info.push(PciSpaceInfo {
        pci_segment_id: pci_segment.id,
        mmio_config_address: pci_segment.mmio_config_address,
        pci_device_space_start: pci_segment.start_of_device_area,
        pci_device_space_size: pci_segment.end_of_device_area
            - pci_segment.start_of_device_area
            + 1,
    });
}

// BDF of the device the virtio-iommu is attached to, if one exists.
let virtio_iommu_bdf = self
    .device_manager
    .lock()
    .unwrap()
    .iommu_attached_devices()
    .as_ref()
    .map(|(v, _)| *v);

let vgic = self
    .device_manager
    .lock()
    .unwrap()
    .get_interrupt_controller()
    .unwrap()
    .lock()
    .unwrap()
    .get_vgic()
    .map_err(|_| {
        Error::ConfigureSystem(arch::Error::PlatformSpecific(
            arch::aarch64::Error::SetupGic,
        ))
    })?;

// PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
let pmu_supported = self
    .cpu_manager
    .lock()
    .unwrap()
    .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
    .map_err(|_| {
        Error::ConfigureSystem(arch::Error::PlatformSpecific(
            arch::aarch64::Error::VcpuInitPmu,
        ))
    })?;

arch::configure_system(
    &mem,
    cmdline.as_cstring().unwrap().to_str().unwrap(),
    vcpu_mpidrs,
    vcpu_topology,
    device_info,
    &initramfs_config,
    &pci_space_info,
    virtio_iommu_bdf.map(|bdf| bdf.into()),
    &vgic,
    &self.numa_nodes,
    pmu_supported,
)
.map_err(Error::ConfigureSystem)?;

Ok(())
}

/// Pseudo-terminal pair backing the serial device, if any.
pub fn serial_pty(&self) -> Option<PtyPair> {
    self.device_manager.lock().unwrap().serial_pty()
}

/// Pseudo-terminal pair backing the virtio-console device, if any.
pub fn console_pty(&self) -> Option<PtyPair> {
    self.device_manager.lock().unwrap().console_pty()
}

/// Pipe used to forward console resize events, if any.
pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
    self.device_manager.lock().unwrap().console_resize_pipe()
}

/// Tear the VM down: restore the terminal, stop the signal-handler
/// thread, wake and stop the device/vCPU threads, then join them all.
pub fn shutdown(&mut self) -> Result<()> {
    let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
    let new_state = VmState::Shutdown;

    state.valid_transition(new_state)?;

    if self.on_tty {
        // Don't forget to set the terminal in canonical mode
        // before exiting.
        io::stdin()
            .lock()
            .set_canon_mode()
            .map_err(Error::SetTerminalCanon)?;
    }

    // Trigger the termination of the signal_handler thread
    if let Some(signals) = self.signals.take() {
        signals.close();
    }

    // Wake up the DeviceManager threads so they will get terminated cleanly
    self.device_manager
        .lock()
        .unwrap()
        .resume()
        .map_err(Error::Resume)?;

    self.cpu_manager
        .lock()
        .unwrap()
        .shutdown()
        .map_err(Error::CpuManager)?;

    // Wait for all the threads to finish
    for thread in self.threads.drain(..)
{ 1247 thread.join().map_err(Error::ThreadCleanup)? 1248 } 1249 *state = new_state; 1250 1251 event!("vm", "shutdown"); 1252 1253 Ok(()) 1254 } 1255 1256 pub fn resize( 1257 &mut self, 1258 desired_vcpus: Option<u8>, 1259 desired_memory: Option<u64>, 1260 desired_balloon: Option<u64>, 1261 ) -> Result<()> { 1262 event!("vm", "resizing"); 1263 1264 if let Some(desired_vcpus) = desired_vcpus { 1265 if self 1266 .cpu_manager 1267 .lock() 1268 .unwrap() 1269 .resize(desired_vcpus) 1270 .map_err(Error::CpuManager)? 1271 { 1272 self.device_manager 1273 .lock() 1274 .unwrap() 1275 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1276 .map_err(Error::DeviceManager)?; 1277 } 1278 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1279 } 1280 1281 if let Some(desired_memory) = desired_memory { 1282 let new_region = self 1283 .memory_manager 1284 .lock() 1285 .unwrap() 1286 .resize(desired_memory) 1287 .map_err(Error::MemoryManager)?; 1288 1289 let mut memory_config = &mut self.config.lock().unwrap().memory; 1290 1291 if let Some(new_region) = &new_region { 1292 self.device_manager 1293 .lock() 1294 .unwrap() 1295 .update_memory(new_region) 1296 .map_err(Error::DeviceManager)?; 1297 1298 match memory_config.hotplug_method { 1299 HotplugMethod::Acpi => { 1300 self.device_manager 1301 .lock() 1302 .unwrap() 1303 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1304 .map_err(Error::DeviceManager)?; 1305 } 1306 HotplugMethod::VirtioMem => {} 1307 } 1308 } 1309 1310 // We update the VM config regardless of the actual guest resize 1311 // operation result (happened or not), so that if the VM reboots 1312 // it will be running with the last configure memory size. 
match memory_config.hotplug_method {
    HotplugMethod::Acpi => memory_config.size = desired_memory,
    HotplugMethod::VirtioMem => {
        if desired_memory > memory_config.size {
            memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
        } else {
            memory_config.hotplugged_size = None;
        }
    }
}
}

if let Some(desired_balloon) = desired_balloon {
    self.device_manager
        .lock()
        .unwrap()
        .resize_balloon(desired_balloon)
        .map_err(Error::DeviceManager)?;

    // Update the configuration value for the balloon size to ensure
    // a reboot would use the right value.
    if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
        balloon_config.size = desired_balloon;
    }
}

event!("vm", "resized");

Ok(())
}

/// Hot-resize a single memory zone to `desired_memory` bytes.
///
/// Only growing (or keeping) the zone relative to its boot size is
/// supported; asking for less than the boot size fails with
/// `Error::ResizeZone`.
pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
    let memory_config = &mut self.config.lock().unwrap().memory;

    if let Some(zones) = &mut memory_config.zones {
        for zone in zones.iter_mut() {
            if zone.id == id {
                if desired_memory >= zone.size {
                    let hotplugged_size = desired_memory - zone.size;
                    self.memory_manager
                        .lock()
                        .unwrap()
                        // Reuse the delta computed above instead of
                        // recomputing `desired_memory - zone.size`.
                        .resize_zone(&id, hotplugged_size)
                        .map_err(Error::MemoryManager)?;
                    // We update the memory zone config regardless of the
                    // actual 'resize-zone' operation result (happened or
                    // not), so that if the VM reboots it will be running
                    // with the last configured memory zone size.
zone.hotplugged_size = Some(hotplugged_size);

return Ok(());
} else {
    error!(
        "Invalid to ask less ({}) than boot RAM ({}) for \
        this memory zone",
        desired_memory, zone.size,
    );
    return Err(Error::ResizeZone);
}
}
}
}

error!("Could not find the memory zone {} for the resize", id);
Err(Error::ResizeZone)
}

/// Hotplug a VFIO device and persist it in the VM configuration.
pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_device(&mut device_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.devices, device_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplug a vfio-user device and persist it in the VM configuration.
pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_user_device(&mut device_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
{
    let mut config = self.config.lock().unwrap();
    add_to_config(&mut config.user_devices, device_cfg);
}

self.device_manager
    .lock()
    .unwrap()
    .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
    .map_err(Error::DeviceManager)?;

Ok(pci_device_info)
}

/// Hot-unplug the device with the given `id` and drop it from every
/// relevant device list of the persisted VM configuration.
pub fn remove_device(&mut self, id: String) -> Result<()> {
    self.device_manager
        .lock()
        .unwrap()
        .remove_device(id.clone())
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by removing the device. This is important to
    // ensure the device would not be created in case of a reboot.
    let mut config = self.config.lock().unwrap();

    // Remove if VFIO device
    if let Some(devices) = config.devices.as_mut() {
        devices.retain(|dev| dev.id.as_ref() != Some(&id));
    }

    // Remove if VFIO user device
    if let Some(user_devices) = config.user_devices.as_mut() {
        user_devices.retain(|dev| dev.id.as_ref() != Some(&id));
    }

    // Remove if disk device
    if let Some(disks) = config.disks.as_mut() {
        disks.retain(|dev| dev.id.as_ref() != Some(&id));
    }

    // Remove if fs device
    if let Some(fs) = config.fs.as_mut() {
        fs.retain(|dev| dev.id.as_ref() != Some(&id));
    }

    // Remove if net device
    if let Some(net) = config.net.as_mut() {
        net.retain(|dev| dev.id.as_ref() != Some(&id));
    }

    // Remove if pmem device
    if let Some(pmem) = config.pmem.as_mut() {
        pmem.retain(|dev| dev.id.as_ref() != Some(&id));
    }

    // Remove if vDPA device
    if let Some(vdpa) = config.vdpa.as_mut() {
        vdpa.retain(|dev| dev.id.as_ref() != Some(&id));
    }

    // Remove if vsock device (there is at most one)
    if let Some(vsock) = config.vsock.as_ref() {
        if vsock.id.as_ref() == Some(&id) {
            config.vsock = None;
        }
    }

    self.device_manager
        .lock()
        .unwrap()
.notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1485 .map_err(Error::DeviceManager)?; 1486 Ok(()) 1487 } 1488 1489 pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> { 1490 let pci_device_info = self 1491 .device_manager 1492 .lock() 1493 .unwrap() 1494 .add_disk(&mut disk_cfg) 1495 .map_err(Error::DeviceManager)?; 1496 1497 // Update VmConfig by adding the new device. This is important to 1498 // ensure the device would be created in case of a reboot. 1499 { 1500 let mut config = self.config.lock().unwrap(); 1501 add_to_config(&mut config.disks, disk_cfg); 1502 } 1503 1504 self.device_manager 1505 .lock() 1506 .unwrap() 1507 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1508 .map_err(Error::DeviceManager)?; 1509 1510 Ok(pci_device_info) 1511 } 1512 1513 pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> { 1514 let pci_device_info = self 1515 .device_manager 1516 .lock() 1517 .unwrap() 1518 .add_fs(&mut fs_cfg) 1519 .map_err(Error::DeviceManager)?; 1520 1521 // Update VmConfig by adding the new device. This is important to 1522 // ensure the device would be created in case of a reboot. 1523 { 1524 let mut config = self.config.lock().unwrap(); 1525 add_to_config(&mut config.fs, fs_cfg); 1526 } 1527 1528 self.device_manager 1529 .lock() 1530 .unwrap() 1531 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1532 .map_err(Error::DeviceManager)?; 1533 1534 Ok(pci_device_info) 1535 } 1536 1537 pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> { 1538 let pci_device_info = self 1539 .device_manager 1540 .lock() 1541 .unwrap() 1542 .add_pmem(&mut pmem_cfg) 1543 .map_err(Error::DeviceManager)?; 1544 1545 // Update VmConfig by adding the new device. This is important to 1546 // ensure the device would be created in case of a reboot. 
{
    let mut config = self.config.lock().unwrap();
    add_to_config(&mut config.pmem, pmem_cfg);
}

self.device_manager
    .lock()
    .unwrap()
    .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
    .map_err(Error::DeviceManager)?;

Ok(pci_device_info)
}

/// Hotplug a network device and persist it in the VM configuration.
pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_net(&mut net_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.net, net_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplug a vDPA device and persist it in the VM configuration.
pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_vdpa(&mut vdpa_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.vdpa, vdpa_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplug the (single) vsock device and persist it in the VM
/// configuration.
pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_vsock(&mut vsock_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device.
This is important to 1618 // ensure the device would be created in case of a reboot. 1619 { 1620 let mut config = self.config.lock().unwrap(); 1621 config.vsock = Some(vsock_cfg); 1622 } 1623 1624 self.device_manager 1625 .lock() 1626 .unwrap() 1627 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1628 .map_err(Error::DeviceManager)?; 1629 1630 Ok(pci_device_info) 1631 } 1632 1633 pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> { 1634 Ok(self.device_manager.lock().unwrap().counters()) 1635 } 1636 1637 fn signal_handler(mut signals: Signals, console_input_clone: Arc<Console>) { 1638 for sig in &Vm::HANDLED_SIGNALS { 1639 unblock_signal(*sig).unwrap(); 1640 } 1641 1642 for signal in signals.forever() { 1643 if signal == SIGWINCH { 1644 console_input_clone.update_console_size(); 1645 } 1646 } 1647 } 1648 1649 #[cfg(feature = "tdx")] 1650 fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> { 1651 use arch::x86_64::tdx::*; 1652 1653 let firmware_path = self 1654 .config 1655 .lock() 1656 .unwrap() 1657 .payload 1658 .as_ref() 1659 .unwrap() 1660 .firmware 1661 .clone() 1662 .ok_or(Error::TdxFirmwareMissing)?; 1663 // The TDVF file contains a table of section as well as code 1664 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1665 1666 // For all the sections allocate some RAM backing them 1667 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf) 1668 } 1669 1670 #[cfg(feature = "tdx")] 1671 fn hob_memory_resources( 1672 mut sorted_sections: Vec<TdvfSection>, 1673 guest_memory: &GuestMemoryMmap, 1674 ) -> Vec<(u64, u64, bool)> { 1675 let mut list = Vec::new(); 1676 1677 let mut current_section = sorted_sections.pop(); 1678 1679 // RAM regions interleaved with TDVF sections 1680 let mut next_start_addr = 0; 1681 for region in guest_memory.iter() { 1682 let region_start = region.start_addr().0; 1683 let region_end = region.last_addr().0; 1684 if region_start > 
next_start_addr { 1685 next_start_addr = region_start; 1686 } 1687 1688 loop { 1689 let (start, size, ram) = if let Some(section) = ¤t_section { 1690 if section.address <= next_start_addr { 1691 (section.address, section.size, false) 1692 } else { 1693 let last_addr = std::cmp::min(section.address - 1, region_end); 1694 (next_start_addr, last_addr - next_start_addr + 1, true) 1695 } 1696 } else { 1697 (next_start_addr, region_end - next_start_addr + 1, true) 1698 }; 1699 1700 list.push((start, size, ram)); 1701 1702 if !ram { 1703 current_section = sorted_sections.pop(); 1704 } 1705 1706 next_start_addr = start + size; 1707 1708 if region_start > next_start_addr { 1709 next_start_addr = region_start; 1710 } 1711 1712 if next_start_addr > region_end { 1713 break; 1714 } 1715 } 1716 } 1717 1718 // Once all the interleaved sections have been processed, let's simply 1719 // pull the remaining ones. 1720 if let Some(section) = current_section { 1721 list.push((section.address, section.size, false)); 1722 } 1723 while let Some(section) = sorted_sections.pop() { 1724 list.push((section.address, section.size, false)); 1725 } 1726 1727 list 1728 } 1729 1730 #[cfg(feature = "tdx")] 1731 fn populate_tdx_sections( 1732 &mut self, 1733 sections: &[TdvfSection], 1734 guid_found: bool, 1735 ) -> Result<Option<u64>> { 1736 use arch::x86_64::tdx::*; 1737 // Get the memory end *before* we start adding TDVF ram regions 1738 let boot_guest_memory = self 1739 .memory_manager 1740 .lock() 1741 .as_ref() 1742 .unwrap() 1743 .boot_guest_memory(); 1744 for section in sections { 1745 // No need to allocate if the section falls within guest RAM ranges 1746 if boot_guest_memory.address_in_range(GuestAddress(section.address)) { 1747 info!( 1748 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM", 1749 section 1750 ); 1751 continue; 1752 } 1753 1754 info!("Allocating TDVF Section: {:x?}", section); 1755 self.memory_manager 1756 .lock() 1757 .unwrap() 1758 
.add_ram_region(GuestAddress(section.address), section.size as usize)
.map_err(Error::AllocatingTdvfMemory)?;
}

// The TDVF file contains a table of section as well as code
let firmware_path = self
    .config
    .lock()
    .unwrap()
    .payload
    .as_ref()
    .unwrap()
    .firmware
    .clone()
    .ok_or(Error::TdxFirmwareMissing)?;
let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

// The guest memory at this point now has all the required regions so it
// is safe to copy from the TDVF file into it.
let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
let mem = guest_memory.memory();
let mut payload_info = None;
let mut hob_offset = None;
for section in sections {
    info!("Populating TDVF Section: {:x?}", section);
    match section.r#type {
        TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
            info!("Copying section to guest memory");
            firmware_file
                .seek(SeekFrom::Start(section.data_offset as u64))
                .map_err(Error::LoadTdvf)?;
            mem.read_from(
                GuestAddress(section.address),
                &mut firmware_file,
                section.data_size as usize,
            )
            .unwrap();
        }
        TdvfSectionType::TdHob => {
            hob_offset = Some(section.address);
        }
        TdvfSectionType::Payload => {
            info!("Copying payload to guest memory");
            if let Some(payload_file) = self.kernel.as_mut() {
                let payload_size = payload_file
                    .seek(SeekFrom::End(0))
                    .map_err(Error::LoadPayload)?;

                // 0x1f1 is the offset of the Linux boot protocol
                // setup_header inside a bzImage.
                payload_file
                    .seek(SeekFrom::Start(0x1f1))
                    .map_err(Error::LoadPayload)?;

                let mut payload_header = linux_loader::bootparam::setup_header::default();
                payload_header
                    .as_bytes()
                    .read_from(
                        0,
                        payload_file,
                        mem::size_of::<linux_loader::bootparam::setup_header>(),
                    )
                    .unwrap();

                // 0x5372_6448 is the "HdrS" boot protocol magic.
                if payload_header.header != 0x5372_6448 {
                    return
Err(Error::InvalidPayloadType);
}

// Require boot protocol >= 2.00 and a relocatable/loaded-high
// capable image (LOADED_HIGH flag).
if (payload_header.version < 0x0200)
    || ((payload_header.loadflags & 0x1) == 0x0)
{
    return Err(Error::InvalidPayloadType);
}

payload_file.rewind().map_err(Error::LoadPayload)?;
mem.read_from(
    GuestAddress(section.address),
    payload_file,
    payload_size as usize,
)
.unwrap();

// Create the payload info that will be inserted into
// the HOB.
payload_info = Some(PayloadInfo {
    image_type: PayloadImageType::BzImage,
    entry_point: section.address,
});
}
}
TdvfSectionType::PayloadParam => {
    info!("Copying payload parameters to guest memory");
    let cmdline = Self::generate_cmdline(
        self.config.lock().unwrap().payload.as_ref().unwrap(),
    )?;
    mem.write_slice(
        cmdline.as_cstring().unwrap().as_bytes_with_nul(),
        GuestAddress(section.address),
    )
    .unwrap();
}
_ => {}
}
}

// Generate HOB
let mut hob = TdHob::start(hob_offset.unwrap());

// Keep only the TempMem sections, ordered by descending address so
// `hob_memory_resources` can pop them lowest-address-first.
let mut sorted_sections = sections.to_vec();
sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

sorted_sections.sort_by_key(|section| section.address);
sorted_sections.reverse();

for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
    hob.add_memory_resource(&mem, start, size, ram, guid_found)
        .map_err(Error::PopulateHob)?;
}

// MMIO regions
hob.add_mmio_resource(
    &mem,
    arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
    arch::layout::APIC_START.raw_value()
        - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
)
.map_err(Error::PopulateHob)?;
let start_of_device_area = self
    .memory_manager
    .lock()
    .unwrap()
    .start_of_device_area()
    .raw_value();
let end_of_device_area = self
    .memory_manager
    .lock()
    .unwrap()
1893 .end_of_device_area() 1894 .raw_value(); 1895 hob.add_mmio_resource( 1896 &mem, 1897 start_of_device_area, 1898 end_of_device_area - start_of_device_area, 1899 ) 1900 .map_err(Error::PopulateHob)?; 1901 1902 // Loop over the ACPI tables and copy them to the HOB. 1903 1904 for acpi_table in crate::acpi::create_acpi_tables_tdx( 1905 &self.device_manager, 1906 &self.cpu_manager, 1907 &self.memory_manager, 1908 &self.numa_nodes, 1909 ) { 1910 hob.add_acpi_table(&mem, acpi_table.as_slice()) 1911 .map_err(Error::PopulateHob)?; 1912 } 1913 1914 // If a payload info has been created, let's insert it into the HOB. 1915 if let Some(payload_info) = payload_info { 1916 hob.add_payload(&mem, payload_info) 1917 .map_err(Error::PopulateHob)?; 1918 } 1919 1920 hob.finish(&mem).map_err(Error::PopulateHob)?; 1921 1922 Ok(hob_offset) 1923 } 1924 1925 #[cfg(feature = "tdx")] 1926 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 1927 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1928 let mem = guest_memory.memory(); 1929 1930 for section in sections { 1931 self.vm 1932 .tdx_init_memory_region( 1933 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 1934 section.address, 1935 section.size, 1936 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 1937 section.attributes == 1, 1938 ) 1939 .map_err(Error::InitializeTdxMemoryRegion)?; 1940 } 1941 1942 Ok(()) 1943 } 1944 1945 fn setup_signal_handler(&mut self) -> Result<()> { 1946 let console = self.device_manager.lock().unwrap().console().clone(); 1947 let signals = Signals::new(Vm::HANDLED_SIGNALS); 1948 match signals { 1949 Ok(signals) => { 1950 self.signals = Some(signals.handle()); 1951 let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?; 1952 let signal_handler_seccomp_filter = get_seccomp_filter( 1953 &self.seccomp_action, 1954 Thread::SignalHandler, 1955 self.hypervisor.hypervisor_type(), 1956 ) 1957 .map_err(Error::CreateSeccompFilter)?; 1958 
self.threads.push(
    thread::Builder::new()
        .name("vm_signal_handler".to_string())
        .spawn(move || {
            if !signal_handler_seccomp_filter.is_empty() {
                if let Err(e) = apply_filter(&signal_handler_seccomp_filter)
                    .map_err(Error::ApplySeccompFilter)
                {
                    error!("Error applying seccomp filter: {:?}", e);
                    exit_evt.write(1).ok();
                    return;
                }
            }
            std::panic::catch_unwind(AssertUnwindSafe(|| {
                Vm::signal_handler(signals, console);
            }))
            .map_err(|_| {
                // Fixed log-message typo: "thead" -> "thread".
                error!("signal_handler thread panicked");
                exit_evt.write(1).ok()
            })
            .ok();
        })
        .map_err(Error::SignalHandlerSpawn)?,
);
}
Err(e) => error!("Signal not found {}", e),
}
Ok(())
}

/// Put stdin into raw mode while the VM owns the terminal.
fn setup_tty(&self) -> Result<()> {
    if self.on_tty {
        io::stdin()
            .lock()
            .set_raw_mode()
            .map_err(Error::SetTerminalRaw)?;
    }

    Ok(())
}

// Creates ACPI tables
// In case of TDX being used, this is a no-op since the tables will be
// created and passed when populating the HOB.
fn create_acpi_tables(&self) -> Option<GuestAddress> {
    #[cfg(feature = "tdx")]
    if self.config.lock().unwrap().is_tdx_enabled() {
        return None;
    }
    let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
    let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
    let rsdp_addr = crate::acpi::create_acpi_tables(
        &mem,
        &self.device_manager,
        &self.cpu_manager,
        &self.memory_manager,
        &self.numa_nodes,
        tpm_enabled,
    );
    info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);

    Some(rsdp_addr)
}

/// Wait for the asynchronous payload loader (if any) and return the
/// resulting guest entry point.
fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
    trace_scoped!("entry_point");

    self.load_payload_handle
        .take()
        .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
.transpose()
}

/// Boot the VM (or resume it when currently `Paused`): create ACPI
/// tables, install signal/tty handling, wait for the payload, configure
/// the vCPUs and system tables (plus TDX setup when enabled) and start
/// the boot vCPUs.
pub fn boot(&mut self) -> Result<()> {
    trace_scoped!("Vm::boot");
    info!("Booting VM");
    event!("vm", "booting");
    let current_state = self.get_state()?;
    if current_state == VmState::Paused {
        return self.resume().map_err(Error::Resume);
    }

    let new_state = if self.stop_on_boot {
        VmState::BreakPoint
    } else {
        VmState::Running
    };
    current_state.valid_transition(new_state)?;

    // Do earlier to parallelise with loading kernel
    #[cfg(target_arch = "x86_64")]
    let rsdp_addr = self.create_acpi_tables();

    self.setup_signal_handler()?;
    self.setup_tty()?;

    // Load kernel synchronously or if asynchronous then wait for load to
    // finish.
    let entry_point = self.entry_point()?;

    #[cfg(feature = "tdx")]
    let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

    // Configure the vcpus that have been created
    let vcpus = self.cpu_manager.lock().unwrap().vcpus();
    for vcpu in vcpus {
        let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let boot_setup = entry_point.map(|e| (e, guest_memory));
        self.cpu_manager
            .lock()
            .unwrap()
            .configure_vcpu(vcpu, boot_setup)
            .map_err(Error::CpuManager)?;
    }

    #[cfg(feature = "tdx")]
    let (sections, guid_found) = if tdx_enabled {
        self.extract_tdvf_sections()?
    } else {
        (Vec::new(), false)
    };

    // Configuring the TDX regions requires that the vCPUs are created.
    #[cfg(feature = "tdx")]
    let hob_address = if tdx_enabled {
        // TDX sections are written to memory.
        self.populate_tdx_sections(&sections, guid_found)?
} else {
    None
};

// On aarch64 the ACPI tables depend on the vCPU mpidr which is only
// available after they are configured
#[cfg(target_arch = "aarch64")]
let rsdp_addr = self.create_acpi_tables();

// Configure shared state based on loaded kernel
entry_point
    .map(|_| {
        // Safe to unwrap rsdp_addr as we know it can't be None when
        // the entry_point is Some.
        self.configure_system(rsdp_addr.unwrap())
    })
    .transpose()?;

#[cfg(feature = "tdx")]
if let Some(hob_address) = hob_address {
    // With the HOB address extracted the vCPUs can have
    // their TDX state configured.
    self.cpu_manager
        .lock()
        .unwrap()
        .initialize_tdx(hob_address)
        .map_err(Error::CpuManager)?;
    // Let the hypervisor know which memory ranges are shared with the
    // guest. This prevents the guest from ignoring/discarding memory
    // regions provided by the host.
    self.init_tdx_memory(&sections)?;
    // With TDX memory and CPU state configured TDX setup is complete
    self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
}

self.cpu_manager
    .lock()
    .unwrap()
    .start_boot_vcpus(new_state == VmState::BreakPoint)
    .map_err(Error::CpuManager)?;

let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
*state = new_state;
event!("vm", "booted");
Ok(())
}

/// Resume execution after a snapshot restore: start the restored vCPUs
/// and re-install terminal and signal handling.
pub fn restore(&mut self) -> Result<()> {
    event!("vm", "restoring");

    // Now we can start all vCPUs from here.
    self.cpu_manager
        .lock()
        .unwrap()
        .start_restored_vcpus()
        .map_err(Error::CpuManager)?;

    self.setup_signal_handler()?;
    self.setup_tty()?;

    event!("vm", "restored");
    Ok(())
}

/// Gets a thread-safe reference counted pointer to the VM configuration.
2151 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2152 Arc::clone(&self.config) 2153 } 2154 2155 /// Get the VM state. Returns an error if the state is poisoned. 2156 pub fn get_state(&self) -> Result<VmState> { 2157 self.state 2158 .try_read() 2159 .map_err(|_| Error::PoisonedState) 2160 .map(|state| *state) 2161 } 2162 2163 /// Gets the actual size of the balloon. 2164 pub fn balloon_size(&self) -> u64 { 2165 self.device_manager.lock().unwrap().balloon_size() 2166 } 2167 2168 pub fn send_memory_fds( 2169 &mut self, 2170 socket: &mut UnixStream, 2171 ) -> std::result::Result<(), MigratableError> { 2172 for (slot, fd) in self 2173 .memory_manager 2174 .lock() 2175 .unwrap() 2176 .memory_slot_fds() 2177 .drain() 2178 { 2179 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2180 .write_to(socket) 2181 .map_err(|e| { 2182 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2183 })?; 2184 socket 2185 .send_with_fd(&slot.to_le_bytes()[..], fd) 2186 .map_err(|e| { 2187 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2188 })?; 2189 2190 let res = Response::read_from(socket)?; 2191 if res.status() != Status::Ok { 2192 warn!("Error during memory fd migration"); 2193 Request::abandon().write_to(socket)?; 2194 Response::read_from(socket).ok(); 2195 return Err(MigratableError::MigrateSend(anyhow!( 2196 "Error during memory fd migration" 2197 ))); 2198 } 2199 } 2200 2201 Ok(()) 2202 } 2203 2204 pub fn send_memory_regions<F>( 2205 &mut self, 2206 ranges: &MemoryRangeTable, 2207 fd: &mut F, 2208 ) -> std::result::Result<(), MigratableError> 2209 where 2210 F: Write, 2211 { 2212 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2213 let mem = guest_memory.memory(); 2214 2215 for range in ranges.regions() { 2216 let mut offset: u64 = 0; 2217 // Here we are manually handling the retry in case we can't the 2218 // whole region at once because we can't use the implementation 2219 // from 
vm-memory::GuestMemory of write_all_to() as it is not 2220 // following the correct behavior. For more info about this issue 2221 // see: https://github.com/rust-vmm/vm-memory/issues/174 2222 loop { 2223 let bytes_written = mem 2224 .write_to( 2225 GuestAddress(range.gpa + offset), 2226 fd, 2227 (range.length - offset) as usize, 2228 ) 2229 .map_err(|e| { 2230 MigratableError::MigrateSend(anyhow!( 2231 "Error transferring memory to socket: {}", 2232 e 2233 )) 2234 })?; 2235 offset += bytes_written as u64; 2236 2237 if offset == range.length { 2238 break; 2239 } 2240 } 2241 } 2242 2243 Ok(()) 2244 } 2245 2246 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2247 self.memory_manager 2248 .lock() 2249 .unwrap() 2250 .memory_range_table(false) 2251 } 2252 2253 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2254 self.device_manager.lock().unwrap().device_tree() 2255 } 2256 2257 pub fn activate_virtio_devices(&self) -> Result<()> { 2258 self.device_manager 2259 .lock() 2260 .unwrap() 2261 .activate_virtio_devices() 2262 .map_err(Error::ActivateVirtioDevices) 2263 } 2264 2265 #[cfg(target_arch = "x86_64")] 2266 pub fn power_button(&self) -> Result<()> { 2267 return self 2268 .device_manager 2269 .lock() 2270 .unwrap() 2271 .notify_power_button() 2272 .map_err(Error::PowerButton); 2273 } 2274 2275 #[cfg(target_arch = "aarch64")] 2276 pub fn power_button(&self) -> Result<()> { 2277 self.device_manager 2278 .lock() 2279 .unwrap() 2280 .notify_power_button() 2281 .map_err(Error::PowerButton) 2282 } 2283 2284 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2285 self.memory_manager.lock().unwrap().snapshot_data() 2286 } 2287 2288 #[cfg(feature = "guest_debug")] 2289 pub fn debug_request( 2290 &mut self, 2291 gdb_request: &GdbRequestPayload, 2292 cpu_id: usize, 2293 ) -> Result<GdbResponsePayload> { 2294 use GdbRequestPayload::*; 2295 match gdb_request { 2296 SetSingleStep(single_step) => { 2297 
self.set_guest_debug(cpu_id, &[], *single_step) 2298 .map_err(Error::Debug)?; 2299 } 2300 SetHwBreakPoint(addrs) => { 2301 self.set_guest_debug(cpu_id, addrs, false) 2302 .map_err(Error::Debug)?; 2303 } 2304 Pause => { 2305 self.debug_pause().map_err(Error::Debug)?; 2306 } 2307 Resume => { 2308 self.debug_resume().map_err(Error::Debug)?; 2309 } 2310 ReadRegs => { 2311 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?; 2312 return Ok(GdbResponsePayload::RegValues(Box::new(regs))); 2313 } 2314 WriteRegs(regs) => { 2315 self.write_regs(cpu_id, regs).map_err(Error::Debug)?; 2316 } 2317 ReadMem(vaddr, len) => { 2318 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2319 let mem = self 2320 .read_mem(&guest_memory, cpu_id, *vaddr, *len) 2321 .map_err(Error::Debug)?; 2322 return Ok(GdbResponsePayload::MemoryRegion(mem)); 2323 } 2324 WriteMem(vaddr, data) => { 2325 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2326 self.write_mem(&guest_memory, cpu_id, vaddr, data) 2327 .map_err(Error::Debug)?; 2328 } 2329 ActiveVcpus => { 2330 let active_vcpus = self.active_vcpus(); 2331 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus)); 2332 } 2333 } 2334 Ok(GdbResponsePayload::CommandComplete) 2335 } 2336 2337 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2338 fn get_dump_state( 2339 &mut self, 2340 destination_url: &str, 2341 ) -> std::result::Result<DumpState, GuestDebuggableError> { 2342 let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32; 2343 let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize; 2344 let mut elf_phdr_num = 1; 2345 let elf_sh_info = 0; 2346 let coredump_file_path = url_to_file(destination_url)?; 2347 let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings(); 2348 2349 if mapping_num < UINT16_MAX - 2 { 2350 elf_phdr_num += mapping_num as u16; 2351 } else { 2352 panic!("mapping num beyond 65535 not supported"); 
2353 } 2354 let coredump_file = OpenOptions::new() 2355 .read(true) 2356 .write(true) 2357 .create_new(true) 2358 .open(coredump_file_path) 2359 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2360 2361 let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size); 2362 let mem_data = self 2363 .memory_manager 2364 .lock() 2365 .unwrap() 2366 .coredump_memory_regions(mem_offset); 2367 2368 Ok(DumpState { 2369 elf_note_size, 2370 elf_phdr_num, 2371 elf_sh_info, 2372 mem_offset, 2373 mem_info: Some(mem_data), 2374 file: Some(coredump_file), 2375 }) 2376 } 2377 2378 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2379 fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 { 2380 size_of::<elf::Elf64_Ehdr>() as u64 2381 + note_size as u64 2382 + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64 2383 } 2384 } 2385 2386 impl Pausable for Vm { 2387 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2388 event!("vm", "pausing"); 2389 let mut state = self 2390 .state 2391 .try_write() 2392 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?; 2393 let new_state = VmState::Paused; 2394 2395 state 2396 .valid_transition(new_state) 2397 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?; 2398 2399 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2400 { 2401 let mut clock = self 2402 .vm 2403 .get_clock() 2404 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?; 2405 clock.reset_flags(); 2406 self.saved_clock = Some(clock); 2407 } 2408 2409 // Before pausing the vCPUs activate any pending virtio devices that might 2410 // need activation between starting the pause (or e.g. 
        // a migration it's part of)
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            // Restore the KVM clock captured in pause() so guest time does
            // not drift across a pause/resume cycle.
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }
        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
        *state = new_state;
        event!("vm", "resumed");
        Ok(())
    }
}

// VM-level snapshot payload; the clock and CPUID fields only exist on
// KVM/x86_64 builds.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub clock: Option<hypervisor::ClockData>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}

pub const VM_SNAPSHOT_ID: &str = "vm";
impl Snapshottable for Vm {
    fn id(&self) -> String {
        VM_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        event!("vm", "snapshotting");

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        #[cfg(feature = "tdx")]
        {
            // Snapshotting a TDX guest is not supported.
            if tdx_enabled {
                return Err(MigratableError::Snapshot(anyhow!(
                    "Snapshot not possible with TDX VM"
                )));
            }
        }

        // Snapshots may only be taken of a paused VM.
        let current_state = self.get_state().unwrap();
        if current_state != VmState::Paused {
            return Err(MigratableError::Snapshot(anyhow!(
                "Trying to snapshot while VM is running"
            )));
        }

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let common_cpuid = {
            let phys_bits = physical_bits(self.config.lock().unwrap().cpus.max_phys_bits);
            arch::generate_common_cpuid(
                &self.hypervisor,
                None,
                None,
                phys_bits,
                self.config.lock().unwrap().cpus.kvm_hyperv,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(|e| {
                // NOTE(review): this wraps the failure in MigrateReceive even
                // though we are snapshotting — looks copy-pasted; the
                // Snapshot variant would seem to fit better. Confirm intent.
                MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
            })?
2505 }; 2506 2507 let vm_snapshot_data = serde_json::to_vec(&VmSnapshot { 2508 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2509 clock: self.saved_clock, 2510 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2511 common_cpuid, 2512 }) 2513 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2514 2515 let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data)); 2516 2517 let (id, snapshot) = { 2518 let mut cpu_manager = self.cpu_manager.lock().unwrap(); 2519 (cpu_manager.id(), cpu_manager.snapshot()?) 2520 }; 2521 vm_snapshot.add_snapshot(id, snapshot); 2522 let (id, snapshot) = { 2523 let mut memory_manager = self.memory_manager.lock().unwrap(); 2524 (memory_manager.id(), memory_manager.snapshot()?) 2525 }; 2526 vm_snapshot.add_snapshot(id, snapshot); 2527 let (id, snapshot) = { 2528 let mut device_manager = self.device_manager.lock().unwrap(); 2529 (device_manager.id(), device_manager.snapshot()?) 2530 }; 2531 vm_snapshot.add_snapshot(id, snapshot); 2532 2533 event!("vm", "snapshotted"); 2534 Ok(vm_snapshot) 2535 } 2536 } 2537 2538 impl Transportable for Vm { 2539 fn send( 2540 &self, 2541 snapshot: &Snapshot, 2542 destination_url: &str, 2543 ) -> std::result::Result<(), MigratableError> { 2544 let mut snapshot_config_path = url_to_path(destination_url)?; 2545 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2546 2547 // Create the snapshot config file 2548 let mut snapshot_config_file = OpenOptions::new() 2549 .read(true) 2550 .write(true) 2551 .create_new(true) 2552 .open(snapshot_config_path) 2553 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2554 2555 // Serialize and write the snapshot config 2556 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2557 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2558 2559 snapshot_config_file 2560 .write(vm_config.as_bytes()) 2561 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2562 2563 let mut snapshot_state_path = url_to_path(destination_url)?; 
2564 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2565 2566 // Create the snapshot state file 2567 let mut snapshot_state_file = OpenOptions::new() 2568 .read(true) 2569 .write(true) 2570 .create_new(true) 2571 .open(snapshot_state_path) 2572 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2573 2574 // Serialize and write the snapshot state 2575 let vm_state = 2576 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2577 2578 snapshot_state_file 2579 .write(&vm_state) 2580 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2581 2582 // Tell the memory manager to also send/write its own snapshot. 2583 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2584 self.memory_manager 2585 .lock() 2586 .unwrap() 2587 .send(&memory_manager_snapshot.clone(), destination_url)?; 2588 } else { 2589 return Err(MigratableError::Restore(anyhow!( 2590 "Missing memory manager snapshot" 2591 ))); 2592 } 2593 2594 Ok(()) 2595 } 2596 } 2597 2598 impl Migratable for Vm { 2599 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2600 self.memory_manager.lock().unwrap().start_dirty_log()?; 2601 self.device_manager.lock().unwrap().start_dirty_log() 2602 } 2603 2604 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2605 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2606 self.device_manager.lock().unwrap().stop_dirty_log() 2607 } 2608 2609 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2610 Ok(MemoryRangeTable::new_from_tables(vec![ 2611 self.memory_manager.lock().unwrap().dirty_log()?, 2612 self.device_manager.lock().unwrap().dirty_log()?, 2613 ])) 2614 } 2615 2616 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2617 self.memory_manager.lock().unwrap().start_migration()?; 2618 self.device_manager.lock().unwrap().start_migration() 2619 } 2620 2621 fn complete_migration(&mut self) -> 
        std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().complete_migration()?;
        self.device_manager.lock().unwrap().complete_migration()
    }
}

#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        // Pause first if running, then mark the VM as stopped at a breakpoint.
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        // NOTE(review): a resume failure is wrapped as DebuggableError::Pause
        // here — looks copy-pasted from debug_pause(); confirm intent.
        if *self.state.read().unwrap() == VmState::BreakPoint {
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    fn active_vcpus(&self) -> usize {
        let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            self.cpu_manager.lock().unwrap().boot_vcpus() as usize
        }
    }
}

// Upper bound on the ELF program header count (stored in a u16).
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = 65535;

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            // Coredumps of TDX guests are rejected outright.
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        // Coredumps may only be taken of a paused VM.
        let current_state = self.get_state().unwrap();
        if current_state != VmState::Paused {
            return Err(GuestDebuggableError::Coredump(anyhow!(
                "Trying to coredump while VM is running"
            )));
        }

        let coredump_state = self.get_dump_state(destination_url)?;

        // ELF header, notes and load segments first...
        self.write_header(&coredump_state)?;
        self.write_note(&coredump_state)?;
        self.write_loads(&coredump_state)?;

        // ...then per-CPU state notes...
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_elf64_note(&coredump_state)?;
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_vmm_note(&coredump_state)?;

        // ...and finally the guest memory contents.
        self.memory_manager
            .lock()
            .unwrap()
            .coredump_iterate_save_mem(&coredump_state)
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;
    // Exhaustively checks which transitions out of `state` are accepted and
    // which are rejected by VmState::valid_transition().
    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::BreakPoint => {
                // Check the transitions from Breakpoint
                assert!(state.valid_transition(VmState::Created).is_ok());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    // Checks the (start, size, is_ram) resource list that hob_memory_resources()
    // derives from TDVF sections overlapping guest RAM ranges.
    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use
    arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    const LEN: u64 = 4096;

    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        // Three MMIO devices (serial, virtio, RTC) laid out back to back.
        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");
        assert!(create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .is_ok())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor.
    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Flat real-mode segment setup with the entry point at the load address.
    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    // rax=2, rbx=3: the guest code adds them and '0', printing '5'.
    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    // Run until the guest executes HLT, echoing any port I/O it performs.
    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}