1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::{ 15 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 16 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 17 }; 18 use crate::config::{NumaConfig, PayloadConfig}; 19 #[cfg(feature = "guest_debug")] 20 use crate::coredump::{ 21 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 22 }; 23 use crate::cpu; 24 use crate::device_manager::{Console, DeviceManager, DeviceManagerError, PtyPair}; 25 use crate::device_tree::DeviceTree; 26 #[cfg(feature = "guest_debug")] 27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 28 use crate::memory_manager::{ 29 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 30 }; 31 #[cfg(feature = "guest_debug")] 32 use crate::migration::url_to_file; 33 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 34 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 35 use crate::GuestMemoryMmap; 36 use crate::{ 37 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 38 }; 39 use anyhow::anyhow; 40 use arch::get_host_cpu_phys_bits; 41 #[cfg(target_arch = "x86_64")] 42 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 43 #[cfg(feature = "tdx")] 44 use arch::x86_64::tdx::TdvfSection; 45 use arch::EntryPoint; 46 #[cfg(target_arch = "aarch64")] 47 use arch::PciSpaceInfo; 48 use arch::{NumaNode, NumaNodes}; 49 #[cfg(target_arch = "aarch64")] 50 
use devices::gic::GIC_V3_ITS_SNAPSHOT_ID; 51 #[cfg(target_arch = "aarch64")] 52 use devices::interrupt_controller; 53 use devices::AcpiNotificationFlags; 54 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 55 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 56 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 57 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 58 use hypervisor::{HypervisorVmError, VmOps}; 59 use linux_loader::cmdline::Cmdline; 60 #[cfg(feature = "guest_debug")] 61 use linux_loader::elf; 62 #[cfg(target_arch = "x86_64")] 63 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 64 #[cfg(target_arch = "aarch64")] 65 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 66 use linux_loader::loader::KernelLoader; 67 use seccompiler::{apply_filter, SeccompAction}; 68 use serde::{Deserialize, Serialize}; 69 use signal_hook::{consts::SIGWINCH, iterator::backend::Handle, iterator::Signals}; 70 use std::cmp; 71 use std::collections::BTreeMap; 72 use std::collections::HashMap; 73 use std::convert::TryInto; 74 use std::fs::{File, OpenOptions}; 75 use std::io::{self, Seek, SeekFrom, Write}; 76 #[cfg(feature = "tdx")] 77 use std::mem; 78 #[cfg(feature = "guest_debug")] 79 use std::mem::size_of; 80 use std::num::Wrapping; 81 use std::ops::Deref; 82 use std::os::unix::net::UnixStream; 83 use std::panic::AssertUnwindSafe; 84 use std::sync::{Arc, Mutex, RwLock}; 85 use std::time::Instant; 86 use std::{result, str, thread}; 87 use thiserror::Error; 88 use tracer::trace_scoped; 89 use vm_device::Bus; 90 #[cfg(feature = "tdx")] 91 use vm_memory::{Address, ByteValued, GuestMemory, GuestMemoryRegion}; 92 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic}; 93 use vm_migration::protocol::{Request, Response, Status}; 94 use vm_migration::{ 95 protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, 96 SnapshotDataSection, Snapshottable, 
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::unblock_signal;
use vmm_sys_util::sock_ctrl_msg::ScmSocket;
use vmm_sys_util::terminal::Terminal;

/// Errors associated with VM management
#[derive(Debug, Error)]
pub enum Error {
    #[error("Cannot open kernel file: {0}")]
    KernelFile(#[source] io::Error),

    #[error("Cannot open initramfs file: {0}")]
    InitramfsFile(#[source] io::Error),

    #[error("Cannot load the kernel into memory: {0}")]
    KernelLoad(#[source] linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot load the UEFI binary in memory: {0:?}")]
    UefiLoad(arch::aarch64::uefi::Error),

    #[error("Cannot load the initramfs into memory")]
    InitramfsLoad,

    #[error("Cannot load the kernel command line in memory: {0}")]
    LoadCmdLine(#[source] linux_loader::loader::Error),

    #[error("Cannot modify the kernel command line: {0}")]
    CmdLineInsertStr(#[source] linux_loader::cmdline::Error),

    #[error("Cannot create the kernel command line: {0}")]
    CmdLineCreate(#[source] linux_loader::cmdline::Error),

    #[error("Cannot configure system: {0}")]
    ConfigureSystem(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot enable interrupt controller: {0:?}")]
    EnableInterruptController(interrupt_controller::Error),

    #[error("VM state is poisoned")]
    PoisonedState,

    #[error("Error from device manager: {0:?}")]
    DeviceManager(DeviceManagerError),

    #[error("Cannot setup terminal in raw mode: {0}")]
    SetTerminalRaw(#[source] vmm_sys_util::errno::Error),

    #[error("Cannot setup terminal in canonical mode.: {0}")]
    SetTerminalCanon(#[source] vmm_sys_util::errno::Error),

    #[error("Cannot spawn a signal handler thread: {0}")]
    SignalHandlerSpawn(#[source] io::Error),

    #[error("Failed to join on threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("VM config is missing")]
    VmMissingConfig,

    #[error("VM is not created")]
    VmNotCreated,

    #[error("VM is already created")]
    VmAlreadyCreated,

    #[error("VM is not running")]
    VmNotRunning,

    #[error("Cannot clone EventFd: {0}")]
    EventFdClone(#[source] io::Error),

    #[error("invalid VM state transition: {0:?} to {1:?}")]
    InvalidStateTransition(VmState, VmState),

    #[error("Error from CPU manager: {0}")]
    CpuManager(#[source] cpu::Error),

    #[error("Cannot pause devices: {0}")]
    PauseDevices(#[source] MigratableError),

    #[error("Cannot resume devices: {0}")]
    ResumeDevices(#[source] MigratableError),

    #[error("Cannot pause CPUs: {0}")]
    PauseCpus(#[source] MigratableError),

    #[error("Cannot resume cpus: {0}")]
    ResumeCpus(#[source] MigratableError),

    #[error("Cannot pause VM: {0}")]
    Pause(#[source] MigratableError),

    #[error("Cannot resume VM: {0}")]
    Resume(#[source] MigratableError),

    #[error("Memory manager error: {0:?}")]
    MemoryManager(MemoryManagerError),

    #[error("Eventfd write error: {0}")]
    EventfdError(#[source] std::io::Error),

    #[error("Cannot snapshot VM: {0}")]
    Snapshot(#[source] MigratableError),

    #[error("Cannot restore VM: {0}")]
    Restore(#[source] MigratableError),

    #[error("Cannot send VM snapshot: {0}")]
    SnapshotSend(#[source] MigratableError),

    #[error("Invalid restore source URL")]
    InvalidRestoreSourceUrl,

    #[error("Failed to validate config: {0}")]
    ConfigValidation(#[source] ValidationError),

    #[error("Too many virtio-vsock devices")]
    TooManyVsockDevices,

    #[error("Failed serializing into JSON: {0}")]
    SerializeJson(#[source] serde_json::Error),

    #[error("Invalid NUMA configuration")]
    InvalidNumaConfig,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Failed resizing a memory zone")]
    ResizeZone,

    #[error("Cannot activate virtio devices: {0:?}")]
    ActivateVirtioDevices(DeviceManagerError),

    #[error("Error triggering power button: {0:?}")]
    PowerButton(DeviceManagerError),

    #[error("Kernel lacks PVH header")]
    KernelMissingPvhHeader,

    #[error("Failed to allocate firmware RAM: {0:?}")]
    AllocateFirmwareMemory(MemoryManagerError),

    #[error("Error manipulating firmware file: {0}")]
    FirmwareFile(#[source] std::io::Error),

    #[error("Firmware too big")]
    FirmwareTooLarge,

    #[error("Failed to copy firmware to memory: {0}")]
    FirmwareLoad(#[source] vm_memory::GuestMemoryError),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on TDX firmware file: {0}")]
    LoadTdvf(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on the TDX payload file: {0}")]
    LoadPayload(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error parsing TDVF: {0}")]
    ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error populating TDX HOB: {0}")]
    PopulateHob(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error allocating TDVF memory: {0:?}")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX VM: {0}")]
    InitializeTdxVm(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX memory region: {0}")]
    InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error finalizing TDX VM: {0}")]
    FinalizeTdx(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("TDX firmware missing")]
    TdxFirmwareMissing,

    #[cfg(feature = "tdx")]
    #[error("Invalid TDX payload type")]
    InvalidPayloadType,

    #[cfg(feature = "guest_debug")]
    #[error("Error debugging VM: {0:?}")]
    Debug(DebuggableError),

    #[error("Error spawning kernel loading thread")]
    KernelLoadThreadSpawn(std::io::Error),

    #[error("Error joining kernel loading thread")]
    KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Payload configuration is not bootable")]
    InvalidPayload,

    #[cfg(feature = "guest_debug")]
    #[error("Error coredumping VM: {0:?}")]
    Coredump(GuestDebuggableError),
}
/// Convenience alias for results produced by VM management operations.
pub type Result<T> = result::Result<T, Error>;

/// Lifecycle state of the virtual machine.
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
    BreakPoint,
}

impl VmState {
    // Checks whether a transition from `self` to `new_state` is allowed by
    // the VM lifecycle state machine; returns
    // `Error::InvalidStateTransition` otherwise.
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Paused | VmState::BreakPoint => Ok(()),
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
            VmState::BreakPoint => match new_state {
                VmState::Created | VmState::Running => Ok(()),
                _ => Err(Error::InvalidStateTransition(self, new_state)),
            },
        }
    }
}

// Gives the hypervisor access to guest memory and the I/O buses so vCPU
// exits (MMIO/PIO accesses) can be serviced.
struct VmOpsHandler {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
}

impl VmOps for VmOpsHandler {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        // Accesses to unregistered addresses are logged, not fatal.
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            Ok(Some(barrier)) => {
                // The device returned a barrier; block until it is released.
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            info!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}

/// Returns the number of physical address bits to use for the guest: the
/// requested maximum, capped by what the host CPU actually supports.
pub fn physical_bits(max_phys_bits: u8) -> u8 {
    let host_phys_bits = get_host_cpu_phys_bits();

    cmp::min(host_phys_bits, max_phys_bits)
}

pub struct Vm {
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    on_tty: bool,
    signals: Option<Handle>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,
    numa_nodes: NumaNodes,
    seccomp_action: SeccompAction,
    exit_evt: EventFd,
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    stop_on_boot: bool,
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}

impl Vm {
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];

    /// Builds a `Vm` around an already-constructed `MemoryManager`: validates
    /// the config, kicks off asynchronous payload loading (unless restoring),
    /// resolves NUMA topology, and creates the CPU and device managers.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        restoring: bool,
        timestamp: Instant,
        snapshot: Option<&Snapshot>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new_from_memory_manager");

        let boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        // Loading the payload is only needed for a fresh boot; a restored VM
        // already has it in guest memory.
        let load_payload_handle = if !restoring {
            Self::load_payload_async(&memory_manager, &config)?
        } else {
            None
        };

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        #[cfg(feature = "tdx")]
        let force_iommu = tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        #[cfg(feature = "guest_debug")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "guest_debug"))]
        let stop_on_boot = false;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::new(Bus::new());
        let mmio_bus = Arc::new(Bus::new());

        // Handler giving vCPUs access to guest memory and the I/O buses.
        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus: io_bus.clone(),
            mmio_bus: mmio_bus.clone(),
        });

        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            &memory_manager,
            vm.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt.try_clone().map_err(Error::EventFdClone)?,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            hypervisor.clone(),
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
        )
        .map_err(Error::CpuManager)?;

        cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus()
            .map_err(Error::CpuManager)?;

        // With TDX enabled, dynamic (hotplug) behavior is disabled.
        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        let device_manager = DeviceManager::new(
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            hypervisor.hypervisor_type(),
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            cpu_manager.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            restoring,
            boot_id_list,
            timestamp,
            snapshot_from_id(snapshot, DEVICE_MANAGER_SNAPSHOT_ID),
            dynamic,
        )
        .map_err(Error::DeviceManager)?;

        // SAFETY: trivially safe
        let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO) } != 0;

        // Open the payload files referenced by the config so they can be
        // loaded later; `None` entries simply propagate through.
        #[cfg(feature = "tdx")]
        let kernel = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.kernel.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.initramfs.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::InitramfsFile)?;

        Ok(Vm {
            #[cfg(feature = "tdx")]
            kernel,
            initramfs,
            device_manager,
            config,
            on_tty,
            threads: Vec::with_capacity(1),
            signals: None,
            state: RwLock::new(VmState::Created),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock: None,
            numa_nodes,
            seccomp_action: seccomp_action.clone(),
            exit_evt,
            hypervisor,
            stop_on_boot,
            load_payload_handle,
        })
    }

    // Builds the NumaNodes map from the user-provided NumaConfig, resolving
    // memory zones, vCPU lists, inter-node distances and (x86_64) SGX EPC
    // sections; any reference to an unknown entity is an InvalidNumaConfig.
    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                // Each guest NUMA node id may only be defined once.
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Can't define twice the same NUMA node");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        // Distances must point at a node declared in the
                        // same configuration, and only once per destination.
                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has been already set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

    /// Creates a brand-new `Vm`: creates the hypervisor VM object and the
    /// `MemoryManager` from scratch, then delegates to
    /// `new_from_memory_manager` and asks the device manager to create all
    /// devices.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new");

        let timestamp = Instant::now();

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();

        let vm = Self::create_hypervisor_vm(
            &hypervisor,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        )?;

        let phys_bits = physical_bits(config.lock().unwrap().cpus.max_phys_bits);

        #[cfg(target_arch = "x86_64")]
        let sgx_epc_config = config.lock().unwrap().sgx_epc.clone();

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            None,
            phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            None,
            None,
            #[cfg(target_arch = "x86_64")]
            sgx_epc_config,
        )
        .map_err(Error::MemoryManager)?;

        let new_vm = Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            false,
            timestamp,
            None,
        )?;

        // The device manager must create the devices from here as it is part
        // of the regular code path creating everything from scratch.
        new_vm
            .device_manager
            .lock()
            .unwrap()
            .create_devices(serial_pty, console_pty, console_resize_pipe)
            .map_err(Error::DeviceManager)?;
        Ok(new_vm)
    }

    /// Restores a `Vm` from a snapshot: rebuilds the `MemoryManager` from the
    /// memory-manager snapshot data (optionally prefaulting pages from
    /// `source_url`) and then constructs the VM in restoring mode.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        vm_config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        source_url: Option<&str>,
        prefault: bool,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        let timestamp = Instant::now();

        let vm = Self::create_hypervisor_vm(
            &hypervisor,
            #[cfg(feature = "tdx")]
            false,
        )?;

        let memory_manager = if let Some(memory_manager_snapshot) =
            snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
        {
            let phys_bits = physical_bits(vm_config.lock().unwrap().cpus.max_phys_bits);
            MemoryManager::new_from_snapshot(
                memory_manager_snapshot,
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                source_url,
                prefault,
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            return Err(Error::Restore(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            ))));
        };

        Vm::new_from_memory_manager(
            vm_config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            true,
            timestamp,
            Some(snapshot),
        )
    }

    /// Creates the hypervisor-level VM object and performs the arch-specific
    /// base setup (identity map, TSS, split IRQ chip on x86_64).
    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        hypervisor.check_required_extensions().unwrap();

        #[cfg(feature = "tdx")]
        let vm = hypervisor
            .create_vm_with_type(if tdx_enabled {
                2 // KVM_X86_TDX_VM
            } else {
                0 // KVM_X86_LEGACY_VM
            })
            .unwrap();
        #[cfg(not(feature = "tdx"))]
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }

    // Copies the initramfs file into guest memory at the arch-chosen load
    // address and returns where/how much was loaded. Panics if no initramfs
    // was configured (callers check `self.initramfs` first).
    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs = self.initramfs.as_ref().unwrap();
        // Determine the file size by seeking to the end, then rewind.
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs
            .seek(SeekFrom::Start(0))
            .map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    // Builds the kernel command line from the payload config; on aarch64 the
    // device manager may append extra entries.
    fn generate_cmdline(
        payload: &PayloadConfig,
        #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
    ) -> Result<Cmdline> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
        if let Some(s) = payload.cmdline.as_ref() {
            cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
        }

        #[cfg(target_arch = "aarch64")]
        for entry in device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        Ok(cmdline)
    }

    // Copies the (formatless) UEFI firmware binary into the UEFI flash region.
    #[cfg(target_arch = "aarch64")]
    fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
        let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
        let mem = uefi_flash.memory();
        arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
            .map_err(Error::UefiLoad)?;
        Ok(())
    }

    // Loads either a PE kernel image or a UEFI firmware binary into guest
    // memory and returns the resulting entry point.
    #[cfg(target_arch = "aarch64")]
    fn load_kernel(
        firmware: Option<File>,
        kernel: Option<File>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let entry_addr = match (firmware, kernel) {
            (None, Some(mut kernel)) => {
                match linux_loader::loader::pe::PE::load(
                    mem.deref(),
                    Some(arch::layout::KERNEL_START),
                    &mut kernel,
                    None,
                ) {
                    Ok(entry_addr) =>
entry_addr.kernel_load,
                    // Try to load the binary as kernel PE file at first.
                    // If failed, retry to load it as UEFI binary.
                    // As the UEFI binary is formatless, it must be the last option to try.
                    Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                        Self::load_firmware(&kernel, memory_manager)?;
                        arch::layout::UEFI_START
                    }
                    Err(e) => {
                        return Err(Error::KernelLoad(e));
                    }
                }
            }
            (Some(firmware), None) => {
                Self::load_firmware(&firmware, memory_manager)?;
                arch::layout::UEFI_START
            }
            _ => return Err(Error::InvalidPayload),
        };

        Ok(EntryPoint { entry_addr })
    }

    // Loads an ELF kernel into guest memory (optionally writing the command
    // line) and returns the PVH entry point; kernels without a PVH header
    // are rejected.
    #[cfg(target_arch = "x86_64")]
    fn load_kernel(
        mut kernel: File,
        cmdline: Option<Cmdline>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        info!("Loading kernel");

        let mem = {
            let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
            guest_memory.memory()
        };
        let entry_addr = linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        )
        .map_err(Error::KernelLoad)?;

        if let Some(cmdline) = cmdline {
            linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
                .map_err(Error::LoadCmdLine)?;
        }

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest
            info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint {
                entry_addr: Some(entry_addr),
            })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

    // Chooses what to load based on the payload config: a firmware alone, or
    // a kernel (with optional initramfs/cmdline). Any other combination is
    // not bootable.
    #[cfg(target_arch = "x86_64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        trace_scoped!("load_payload");
        match (
            &payload.firmware,
            &payload.kernel,
            &payload.initramfs,
            &payload.cmdline,
        ) {
            (Some(firmware), None, None, None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(firmware, None, memory_manager)
            }
            (None, Some(kernel), _, _) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                let cmdline = Self::generate_cmdline(payload)?;
                Self::load_kernel(kernel, Some(cmdline), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    // aarch64 variant: exactly one of firmware or kernel must be provided.
    #[cfg(target_arch = "aarch64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        match (&payload.firmware, &payload.kernel) {
            (Some(firmware), None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(Some(firmware), None, memory_manager)
            }
            (None, Some(kernel)) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                Self::load_kernel(None, Some(kernel), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    // Spawns a background thread loading the payload so boot can proceed in
    // parallel; returns None when there is no payload (or when TDX handles
    // loading itself).
    fn load_payload_async(
        memory_manager: &Arc<Mutex<MemoryManager>>,
        config: &Arc<Mutex<VmConfig>>,
    ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
        // Kernel with TDX is loaded in a different manner
        #[cfg(feature = "tdx")]
        if config.lock().unwrap().is_tdx_enabled() {
            return Ok(None);
        }

        config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|payload| {
                let memory_manager = memory_manager.clone();
                let payload = payload.clone();

                std::thread::Builder::new()
                    .name("payload_loader".into())
                    .spawn(move || Self::load_payload(&payload, memory_manager))
                    .map_err(Error::KernelLoadThreadSpawn)
            })
            .transpose()
    }

    // Performs the arch-level boot-time system configuration (initramfs,
    // RSDP, SGX EPC, SMBIOS-related identifiers from the platform config).
    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
        trace_scoped!("configure_system");
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
        let rsdp_addr = Some(rsdp_addr);
        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        let serial_number = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.serial_number.clone());

        let uuid = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.uuid.clone());

        let oem_strings = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.oem_strings.clone());

        // Re-borrow the owned strings as a Vec<&str> for the arch call.
        let oem_strings = oem_strings
            .as_deref()
            .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            &initramfs_config,
            boot_vcpus,
            rsdp_addr,
            sgx_epc_region,
            serial_number.as_deref(),
            uuid.as_deref(),
            oem_strings.as_deref(),
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
        let cmdline = Self::generate_cmdline(
            self.config.lock().unwrap().payload.as_ref().unwrap(),
            &self.device_manager,
        )?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };
1132 let device_info = &self 1133 .device_manager 1134 .lock() 1135 .unwrap() 1136 .get_device_info() 1137 .clone(); 1138 1139 for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() { 1140 let pci_space = PciSpaceInfo { 1141 pci_segment_id: pci_segment.id, 1142 mmio_config_address: pci_segment.mmio_config_address, 1143 pci_device_space_start: pci_segment.start_of_device_area, 1144 pci_device_space_size: pci_segment.end_of_device_area 1145 - pci_segment.start_of_device_area 1146 + 1, 1147 }; 1148 pci_space_info.push(pci_space); 1149 } 1150 1151 let virtio_iommu_bdf = self 1152 .device_manager 1153 .lock() 1154 .unwrap() 1155 .iommu_attached_devices() 1156 .as_ref() 1157 .map(|(v, _)| *v); 1158 1159 let vgic = self 1160 .device_manager 1161 .lock() 1162 .unwrap() 1163 .get_interrupt_controller() 1164 .unwrap() 1165 .lock() 1166 .unwrap() 1167 .get_vgic() 1168 .map_err(|_| { 1169 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1170 arch::aarch64::Error::SetupGic, 1171 )) 1172 })?; 1173 1174 // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number. 
// Probe/enable the PMU on the vCPUs; the resulting boolean is forwarded to
// the arch layer so the guest only sees a PMU when the host provides one.
// Failure is surfaced as a platform-specific VcpuInitPmu error.
let pmu_supported = self
    .cpu_manager
    .lock()
    .unwrap()
    .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
    .map_err(|_| {
        Error::ConfigureSystem(arch::Error::PlatformSpecific(
            arch::aarch64::Error::VcpuInitPmu,
        ))
    })?;

// Delegate final platform configuration (cmdline, MPIDRs, topology, device
// info, initramfs, PCI spaces, vGIC, NUMA layout, PMU availability) to the
// arch crate.
arch::configure_system(
    &mem,
    cmdline.as_cstring().unwrap().to_str().unwrap(),
    vcpu_mpidrs,
    vcpu_topology,
    device_info,
    &initramfs_config,
    &pci_space_info,
    virtio_iommu_bdf.map(|bdf| bdf.into()),
    &vgic,
    &self.numa_nodes,
    pmu_supported,
)
.map_err(Error::ConfigureSystem)?;

Ok(())
}

/// PTY pair backing the serial device, if the serial console uses one.
pub fn serial_pty(&self) -> Option<PtyPair> {
    self.device_manager.lock().unwrap().serial_pty()
}

/// PTY pair backing the virtio-console, if it uses one.
pub fn console_pty(&self) -> Option<PtyPair> {
    self.device_manager.lock().unwrap().console_pty()
}

/// File used to forward terminal resize events to the console device.
pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
    self.device_manager.lock().unwrap().console_resize_pipe()
}

/// Shuts the VM down after validating the state transition: restores the
/// terminal to canonical mode, stops the signal-handler thread, resumes
/// DeviceManager threads so they can terminate cleanly, stops the vCPUs
/// and joins all worker threads before committing the Shutdown state.
pub fn shutdown(&mut self) -> Result<()> {
    let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
    let new_state = VmState::Shutdown;

    state.valid_transition(new_state)?;

    if self.on_tty {
        // Don't forget to set the terminal in canonical mode
        // before to exit.
        io::stdin()
            .lock()
            .set_canon_mode()
            .map_err(Error::SetTerminalCanon)?;
    }

    // Trigger the termination of the signal_handler thread
    if let Some(signals) = self.signals.take() {
        signals.close();
    }

    // Wake up the DeviceManager threads so they will get terminated cleanly
    self.device_manager
        .lock()
        .unwrap()
        .resume()
        .map_err(Error::Resume)?;

    self.cpu_manager
        .lock()
        .unwrap()
        .shutdown()
        .map_err(Error::CpuManager)?;

    // Wait for all the threads to finish
    for thread in self.threads.drain(..)
{
    thread.join().map_err(Error::ThreadCleanup)?
}
*state = new_state;

event!("vm", "shutdown");

Ok(())
}

/// Hot-resizes the VM: vCPU count, RAM size and/or balloon size.
///
/// Each `desired_*` argument is optional; `None` leaves that dimension
/// untouched. The persisted `VmConfig` is updated as well so a subsequent
/// reboot comes back with the requested sizes.
pub fn resize(
    &mut self,
    desired_vcpus: Option<u8>,
    desired_memory: Option<u64>,
    desired_balloon: Option<u64>,
) -> Result<()> {
    event!("vm", "resizing");

    if let Some(desired_vcpus) = desired_vcpus {
        // `CpuManager::resize` returns true when the vCPU set actually
        // changed, in which case the guest is notified via ACPI.
        if self
            .cpu_manager
            .lock()
            .unwrap()
            .resize(desired_vcpus)
            .map_err(Error::CpuManager)?
        {
            self.device_manager
                .lock()
                .unwrap()
                .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                .map_err(Error::DeviceManager)?;
        }
        self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
    }

    if let Some(desired_memory) = desired_memory {
        let new_region = self
            .memory_manager
            .lock()
            .unwrap()
            .resize(desired_memory)
            .map_err(Error::MemoryManager)?;

        // NOTE(review): the `mut` on this binding is redundant — the value
        // is never reassigned, only mutated through the `&mut` reference.
        let mut memory_config = &mut self.config.lock().unwrap().memory;

        if let Some(new_region) = &new_region {
            self.device_manager
                .lock()
                .unwrap()
                .update_memory(new_region)
                .map_err(Error::DeviceManager)?;

            match memory_config.hotplug_method {
                HotplugMethod::Acpi => {
                    self.device_manager
                        .lock()
                        .unwrap()
                        .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                        .map_err(Error::DeviceManager)?;
                }
                // virtio-mem needs no ACPI notification here.
                HotplugMethod::VirtioMem => {}
            }
        }

        // We update the VM config regardless of the actual guest resize
        // operation result (happened or not), so that if the VM reboots
        // it will be running with the last configured memory size.
        match memory_config.hotplug_method {
            HotplugMethod::Acpi => memory_config.size = desired_memory,
            HotplugMethod::VirtioMem => {
                if desired_memory > memory_config.size {
                    memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                } else {
                    memory_config.hotplugged_size = None;
                }
            }
        }
    }

    if let Some(desired_balloon) = desired_balloon {
        self.device_manager
            .lock()
            .unwrap()
            .resize_balloon(desired_balloon)
            .map_err(Error::DeviceManager)?;

        // Update the configuration value for the balloon size to ensure
        // a reboot would use the right value.
        if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
            balloon_config.size = desired_balloon;
        }
    }

    event!("vm", "resized");

    Ok(())
}

/// Hot-resizes a single memory zone identified by `id`.
///
/// Only growing (or keeping) the zone relative to its boot size is allowed;
/// asking for less than the boot size fails with `Error::ResizeZone`, as
/// does an unknown zone id.
pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
    let memory_config = &mut self.config.lock().unwrap().memory;

    if let Some(zones) = &mut memory_config.zones {
        for zone in zones.iter_mut() {
            if zone.id == id {
                if desired_memory >= zone.size {
                    let hotplugged_size = desired_memory - zone.size;
                    self.memory_manager
                        .lock()
                        .unwrap()
                        .resize_zone(&id, desired_memory - zone.size)
                        .map_err(Error::MemoryManager)?;
                    // We update the memory zone config regardless of the
                    // actual 'resize-zone' operation result (happened or
                    // not), so that if the VM reboots it will be running
                    // with the last configured memory zone size.
1365 zone.hotplugged_size = Some(hotplugged_size); 1366 1367 return Ok(()); 1368 } else { 1369 error!( 1370 "Invalid to ask less ({}) than boot RAM ({}) for \ 1371 this memory zone", 1372 desired_memory, zone.size, 1373 ); 1374 return Err(Error::ResizeZone); 1375 } 1376 } 1377 } 1378 } 1379 1380 error!("Could not find the memory zone {} for the resize", id); 1381 Err(Error::ResizeZone) 1382 } 1383 1384 pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1385 let pci_device_info = self 1386 .device_manager 1387 .lock() 1388 .unwrap() 1389 .add_device(&mut device_cfg) 1390 .map_err(Error::DeviceManager)?; 1391 1392 // Update VmConfig by adding the new device. This is important to 1393 // ensure the device would be created in case of a reboot. 1394 { 1395 let mut config = self.config.lock().unwrap(); 1396 add_to_config(&mut config.devices, device_cfg); 1397 } 1398 1399 self.device_manager 1400 .lock() 1401 .unwrap() 1402 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1403 .map_err(Error::DeviceManager)?; 1404 1405 Ok(pci_device_info) 1406 } 1407 1408 pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> { 1409 let pci_device_info = self 1410 .device_manager 1411 .lock() 1412 .unwrap() 1413 .add_user_device(&mut device_cfg) 1414 .map_err(Error::DeviceManager)?; 1415 1416 // Update VmConfig by adding the new device. This is important to 1417 // ensure the device would be created in case of a reboot. 
1418 { 1419 let mut config = self.config.lock().unwrap(); 1420 add_to_config(&mut config.user_devices, device_cfg); 1421 } 1422 1423 self.device_manager 1424 .lock() 1425 .unwrap() 1426 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1427 .map_err(Error::DeviceManager)?; 1428 1429 Ok(pci_device_info) 1430 } 1431 1432 pub fn remove_device(&mut self, id: String) -> Result<()> { 1433 self.device_manager 1434 .lock() 1435 .unwrap() 1436 .remove_device(id.clone()) 1437 .map_err(Error::DeviceManager)?; 1438 1439 // Update VmConfig by removing the device. This is important to 1440 // ensure the device would not be created in case of a reboot. 1441 let mut config = self.config.lock().unwrap(); 1442 1443 // Remove if VFIO device 1444 if let Some(devices) = config.devices.as_mut() { 1445 devices.retain(|dev| dev.id.as_ref() != Some(&id)); 1446 } 1447 1448 // Remove if VFIO user device 1449 if let Some(user_devices) = config.user_devices.as_mut() { 1450 user_devices.retain(|dev| dev.id.as_ref() != Some(&id)); 1451 } 1452 1453 // Remove if disk device 1454 if let Some(disks) = config.disks.as_mut() { 1455 disks.retain(|dev| dev.id.as_ref() != Some(&id)); 1456 } 1457 1458 // Remove if fs device 1459 if let Some(fs) = config.fs.as_mut() { 1460 fs.retain(|dev| dev.id.as_ref() != Some(&id)); 1461 } 1462 1463 // Remove if net device 1464 if let Some(net) = config.net.as_mut() { 1465 net.retain(|dev| dev.id.as_ref() != Some(&id)); 1466 } 1467 1468 // Remove if pmem device 1469 if let Some(pmem) = config.pmem.as_mut() { 1470 pmem.retain(|dev| dev.id.as_ref() != Some(&id)); 1471 } 1472 1473 // Remove if vDPA device 1474 if let Some(vdpa) = config.vdpa.as_mut() { 1475 vdpa.retain(|dev| dev.id.as_ref() != Some(&id)); 1476 } 1477 1478 // Remove if vsock device 1479 if let Some(vsock) = config.vsock.as_ref() { 1480 if vsock.id.as_ref() == Some(&id) { 1481 config.vsock = None; 1482 } 1483 } 1484 1485 self.device_manager 1486 .lock() 1487 .unwrap() 1488 
.notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
.map_err(Error::DeviceManager)?;
Ok(())
}

/// Hotplugs a disk device and records it in the VM config so it is
/// re-created on reboot.
pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_disk(&mut disk_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        // Scope the config lock so it is dropped before notifying the guest.
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.disks, disk_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a virtio-fs device and records it in the VM config.
pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_fs(&mut fs_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.fs, fs_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a pmem device and records it in the VM config.
pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_pmem(&mut pmem_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
{
    // Scope the config lock so it is dropped before notifying the guest.
    let mut config = self.config.lock().unwrap();
    add_to_config(&mut config.pmem, pmem_cfg);
}

self.device_manager
    .lock()
    .unwrap()
    .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
    .map_err(Error::DeviceManager)?;

Ok(pci_device_info)
}

/// Hotplugs a network device and records it in the VM config so it is
/// re-created on reboot.
pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_net(&mut net_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.net, net_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a vDPA device and records it in the VM config.
pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_vdpa(&mut vdpa_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.vdpa, vdpa_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs the vsock device and records it in the VM config. A VM has at
/// most one vsock device, so the config slot is simply replaced.
pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_vsock(&mut vsock_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device.
This is important to 1622 // ensure the device would be created in case of a reboot. 1623 { 1624 let mut config = self.config.lock().unwrap(); 1625 config.vsock = Some(vsock_cfg); 1626 } 1627 1628 self.device_manager 1629 .lock() 1630 .unwrap() 1631 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1632 .map_err(Error::DeviceManager)?; 1633 1634 Ok(pci_device_info) 1635 } 1636 1637 pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> { 1638 Ok(self.device_manager.lock().unwrap().counters()) 1639 } 1640 1641 fn signal_handler(mut signals: Signals, console_input_clone: Arc<Console>) { 1642 for sig in &Vm::HANDLED_SIGNALS { 1643 unblock_signal(*sig).unwrap(); 1644 } 1645 1646 for signal in signals.forever() { 1647 if signal == SIGWINCH { 1648 console_input_clone.update_console_size(); 1649 } 1650 } 1651 } 1652 1653 #[cfg(feature = "tdx")] 1654 fn init_tdx(&mut self) -> Result<()> { 1655 let cpuid = self.cpu_manager.lock().unwrap().common_cpuid(); 1656 let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32; 1657 self.vm 1658 .tdx_init(&cpuid, max_vcpus) 1659 .map_err(Error::InitializeTdxVm)?; 1660 Ok(()) 1661 } 1662 1663 #[cfg(feature = "tdx")] 1664 fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> { 1665 use arch::x86_64::tdx::*; 1666 1667 let firmware_path = self 1668 .config 1669 .lock() 1670 .unwrap() 1671 .payload 1672 .as_ref() 1673 .unwrap() 1674 .firmware 1675 .clone() 1676 .ok_or(Error::TdxFirmwareMissing)?; 1677 // The TDVF file contains a table of section as well as code 1678 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1679 1680 // For all the sections allocate some RAM backing them 1681 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf) 1682 } 1683 1684 #[cfg(feature = "tdx")] 1685 fn hob_memory_resources( 1686 mut sorted_sections: Vec<TdvfSection>, 1687 guest_memory: &GuestMemoryMmap, 1688 ) -> Vec<(u64, u64, bool)> { 1689 let mut list = 
Vec::new(); 1690 1691 let mut current_section = sorted_sections.pop(); 1692 1693 // RAM regions interleaved with TDVF sections 1694 let mut next_start_addr = 0; 1695 for region in guest_memory.iter() { 1696 let region_start = region.start_addr().0; 1697 let region_end = region.last_addr().0; 1698 if region_start > next_start_addr { 1699 next_start_addr = region_start; 1700 } 1701 1702 loop { 1703 let (start, size, ram) = if let Some(section) = ¤t_section { 1704 if section.address <= next_start_addr { 1705 (section.address, section.size, false) 1706 } else { 1707 let last_addr = std::cmp::min(section.address - 1, region_end); 1708 (next_start_addr, last_addr - next_start_addr + 1, true) 1709 } 1710 } else { 1711 (next_start_addr, region_end - next_start_addr + 1, true) 1712 }; 1713 1714 list.push((start, size, ram)); 1715 1716 if !ram { 1717 current_section = sorted_sections.pop(); 1718 } 1719 1720 next_start_addr = start + size; 1721 1722 if region_start > next_start_addr { 1723 next_start_addr = region_start; 1724 } 1725 1726 if next_start_addr > region_end { 1727 break; 1728 } 1729 } 1730 } 1731 1732 // Once all the interleaved sections have been processed, let's simply 1733 // pull the remaining ones. 
1734 if let Some(section) = current_section { 1735 list.push((section.address, section.size, false)); 1736 } 1737 while let Some(section) = sorted_sections.pop() { 1738 list.push((section.address, section.size, false)); 1739 } 1740 1741 list 1742 } 1743 1744 #[cfg(feature = "tdx")] 1745 fn populate_tdx_sections(&mut self, sections: &[TdvfSection]) -> Result<Option<u64>> { 1746 use arch::x86_64::tdx::*; 1747 // Get the memory end *before* we start adding TDVF ram regions 1748 let boot_guest_memory = self 1749 .memory_manager 1750 .lock() 1751 .as_ref() 1752 .unwrap() 1753 .boot_guest_memory(); 1754 for section in sections { 1755 // No need to allocate if the section falls within guest RAM ranges 1756 if boot_guest_memory.address_in_range(GuestAddress(section.address)) { 1757 info!( 1758 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM", 1759 section 1760 ); 1761 continue; 1762 } 1763 1764 info!("Allocating TDVF Section: {:x?}", section); 1765 self.memory_manager 1766 .lock() 1767 .unwrap() 1768 .add_ram_region(GuestAddress(section.address), section.size as usize) 1769 .map_err(Error::AllocatingTdvfMemory)?; 1770 } 1771 1772 // The TDVF file contains a table of section as well as code 1773 let firmware_path = self 1774 .config 1775 .lock() 1776 .unwrap() 1777 .payload 1778 .as_ref() 1779 .unwrap() 1780 .firmware 1781 .clone() 1782 .ok_or(Error::TdxFirmwareMissing)?; 1783 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1784 1785 // The guest memory at this point now has all the required regions so it 1786 // is safe to copy from the TDVF file into it. 
1787 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1788 let mem = guest_memory.memory(); 1789 let mut payload_info = None; 1790 let mut hob_offset = None; 1791 for section in sections { 1792 info!("Populating TDVF Section: {:x?}", section); 1793 match section.r#type { 1794 TdvfSectionType::Bfv | TdvfSectionType::Cfv => { 1795 info!("Copying section to guest memory"); 1796 firmware_file 1797 .seek(SeekFrom::Start(section.data_offset as u64)) 1798 .map_err(Error::LoadTdvf)?; 1799 mem.read_from( 1800 GuestAddress(section.address), 1801 &mut firmware_file, 1802 section.data_size as usize, 1803 ) 1804 .unwrap(); 1805 } 1806 TdvfSectionType::TdHob => { 1807 hob_offset = Some(section.address); 1808 } 1809 TdvfSectionType::Payload => { 1810 info!("Copying payload to guest memory"); 1811 if let Some(payload_file) = self.kernel.as_mut() { 1812 let payload_size = payload_file 1813 .seek(SeekFrom::End(0)) 1814 .map_err(Error::LoadPayload)?; 1815 1816 payload_file 1817 .seek(SeekFrom::Start(0x1f1)) 1818 .map_err(Error::LoadPayload)?; 1819 1820 let mut payload_header = linux_loader::bootparam::setup_header::default(); 1821 payload_header 1822 .as_bytes() 1823 .read_from( 1824 0, 1825 payload_file, 1826 mem::size_of::<linux_loader::bootparam::setup_header>(), 1827 ) 1828 .unwrap(); 1829 1830 if payload_header.header != 0x5372_6448 { 1831 return Err(Error::InvalidPayloadType); 1832 } 1833 1834 if (payload_header.version < 0x0200) 1835 || ((payload_header.loadflags & 0x1) == 0x0) 1836 { 1837 return Err(Error::InvalidPayloadType); 1838 } 1839 1840 payload_file 1841 .seek(SeekFrom::Start(0)) 1842 .map_err(Error::LoadPayload)?; 1843 mem.read_from( 1844 GuestAddress(section.address), 1845 payload_file, 1846 payload_size as usize, 1847 ) 1848 .unwrap(); 1849 1850 // Create the payload info that will be inserted into 1851 // the HOB. 
1852 payload_info = Some(PayloadInfo { 1853 image_type: PayloadImageType::BzImage, 1854 entry_point: section.address, 1855 }); 1856 } 1857 } 1858 TdvfSectionType::PayloadParam => { 1859 info!("Copying payload parameters to guest memory"); 1860 let cmdline = Self::generate_cmdline( 1861 self.config.lock().unwrap().payload.as_ref().unwrap(), 1862 )?; 1863 mem.write_slice( 1864 cmdline.as_cstring().unwrap().as_bytes_with_nul(), 1865 GuestAddress(section.address), 1866 ) 1867 .unwrap(); 1868 } 1869 _ => {} 1870 } 1871 } 1872 1873 // Generate HOB 1874 let mut hob = TdHob::start(hob_offset.unwrap()); 1875 1876 let mut sorted_sections = sections.to_vec(); 1877 sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem)); 1878 1879 sorted_sections.sort_by_key(|section| section.address); 1880 sorted_sections.reverse(); 1881 1882 for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) { 1883 hob.add_memory_resource(&mem, start, size, ram) 1884 .map_err(Error::PopulateHob)?; 1885 } 1886 1887 // MMIO regions 1888 hob.add_mmio_resource( 1889 &mem, 1890 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1891 arch::layout::APIC_START.raw_value() 1892 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1893 ) 1894 .map_err(Error::PopulateHob)?; 1895 let start_of_device_area = self 1896 .memory_manager 1897 .lock() 1898 .unwrap() 1899 .start_of_device_area() 1900 .raw_value(); 1901 let end_of_device_area = self 1902 .memory_manager 1903 .lock() 1904 .unwrap() 1905 .end_of_device_area() 1906 .raw_value(); 1907 hob.add_mmio_resource( 1908 &mem, 1909 start_of_device_area, 1910 end_of_device_area - start_of_device_area, 1911 ) 1912 .map_err(Error::PopulateHob)?; 1913 1914 // Loop over the ACPI tables and copy them to the HOB. 
1915 1916 for acpi_table in crate::acpi::create_acpi_tables_tdx( 1917 &self.device_manager, 1918 &self.cpu_manager, 1919 &self.memory_manager, 1920 &self.numa_nodes, 1921 ) { 1922 hob.add_acpi_table(&mem, acpi_table.as_slice()) 1923 .map_err(Error::PopulateHob)?; 1924 } 1925 1926 // If a payload info has been created, let's insert it into the HOB. 1927 if let Some(payload_info) = payload_info { 1928 hob.add_payload(&mem, payload_info) 1929 .map_err(Error::PopulateHob)?; 1930 } 1931 1932 hob.finish(&mem).map_err(Error::PopulateHob)?; 1933 1934 Ok(hob_offset) 1935 } 1936 1937 #[cfg(feature = "tdx")] 1938 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 1939 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1940 let mem = guest_memory.memory(); 1941 1942 for section in sections { 1943 self.vm 1944 .tdx_init_memory_region( 1945 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 1946 section.address, 1947 section.size, 1948 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 1949 section.attributes == 1, 1950 ) 1951 .map_err(Error::InitializeTdxMemoryRegion)?; 1952 } 1953 1954 Ok(()) 1955 } 1956 1957 fn setup_signal_handler(&mut self) -> Result<()> { 1958 let console = self.device_manager.lock().unwrap().console().clone(); 1959 let signals = Signals::new(Vm::HANDLED_SIGNALS); 1960 match signals { 1961 Ok(signals) => { 1962 self.signals = Some(signals.handle()); 1963 let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?; 1964 let signal_handler_seccomp_filter = get_seccomp_filter( 1965 &self.seccomp_action, 1966 Thread::SignalHandler, 1967 self.hypervisor.hypervisor_type(), 1968 ) 1969 .map_err(Error::CreateSeccompFilter)?; 1970 self.threads.push( 1971 thread::Builder::new() 1972 .name("vm_signal_handler".to_string()) 1973 .spawn(move || { 1974 if !signal_handler_seccomp_filter.is_empty() { 1975 if let Err(e) = apply_filter(&signal_handler_seccomp_filter) 1976 .map_err(Error::ApplySeccompFilter) 
{
    error!("Error applying seccomp filter: {:?}", e);
    exit_evt.write(1).ok();
    return;
}
}
// Catch panics so a crashing signal-handler thread reports VM exit
// (via exit_evt) instead of silently disappearing.
std::panic::catch_unwind(AssertUnwindSafe(|| {
    Vm::signal_handler(signals, console);
}))
.map_err(|_| {
    // NOTE(review): "thead" is a typo for "thread" in this runtime log
    // message; intentionally left unchanged here.
    error!("signal_handler thead panicked");
    exit_evt.write(1).ok()
})
.ok();
})
.map_err(Error::SignalHandlerSpawn)?,
);
}
Err(e) => error!("Signal not found {}", e),
}
Ok(())
}

/// Puts stdin into raw mode when the VM console is attached to a TTY.
/// `shutdown()` is responsible for restoring canonical mode.
fn setup_tty(&self) -> Result<()> {
    if self.on_tty {
        io::stdin()
            .lock()
            .set_raw_mode()
            .map_err(Error::SetTerminalRaw)?;
    }

    Ok(())
}

// Creates ACPI tables
// In case of TDX being used, this is a no-op since the tables will be
// created and passed when populating the HOB.

fn create_acpi_tables(&self) -> Option<GuestAddress> {
    #[cfg(feature = "tdx")]
    if self.config.lock().unwrap().is_tdx_enabled() {
        return None;
    }
    let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
    let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
    let rsdp_addr = crate::acpi::create_acpi_tables(
        &mem,
        &self.device_manager,
        &self.cpu_manager,
        &self.memory_manager,
        &self.numa_nodes,
        tpm_enabled,
    );
    info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);

    Some(rsdp_addr)
}

/// Joins the asynchronous payload-loader thread (if one was spawned) and
/// returns the guest entry point it produced. The join handle is taken,
/// so this is a one-shot operation.
fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
    trace_scoped!("entry_point");

    self.load_payload_handle
        .take()
        .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
2041 .transpose() 2042 } 2043 2044 pub fn boot(&mut self) -> Result<()> { 2045 trace_scoped!("Vm::boot"); 2046 info!("Booting VM"); 2047 event!("vm", "booting"); 2048 let current_state = self.get_state()?; 2049 if current_state == VmState::Paused { 2050 return self.resume().map_err(Error::Resume); 2051 } 2052 2053 let new_state = if self.stop_on_boot { 2054 VmState::BreakPoint 2055 } else { 2056 VmState::Running 2057 }; 2058 current_state.valid_transition(new_state)?; 2059 2060 // Do earlier to parallelise with loading kernel 2061 #[cfg(target_arch = "x86_64")] 2062 let rsdp_addr = self.create_acpi_tables(); 2063 2064 self.setup_signal_handler()?; 2065 self.setup_tty()?; 2066 2067 // Load kernel synchronously or if asynchronous then wait for load to 2068 // finish. 2069 let entry_point = self.entry_point()?; 2070 2071 #[cfg(feature = "tdx")] 2072 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2073 2074 // The initial TDX configuration must be done before the vCPUs are 2075 // created 2076 #[cfg(feature = "tdx")] 2077 if tdx_enabled { 2078 self.init_tdx()?; 2079 } 2080 2081 // Configure the vcpus that have been created 2082 let vcpus = self.cpu_manager.lock().unwrap().vcpus(); 2083 for vcpu in vcpus { 2084 self.cpu_manager 2085 .lock() 2086 .unwrap() 2087 .configure_vcpu(vcpu, entry_point, None) 2088 .map_err(Error::CpuManager)?; 2089 } 2090 2091 #[cfg(feature = "tdx")] 2092 let sections = if tdx_enabled { 2093 self.extract_tdvf_sections()? 2094 } else { 2095 Vec::new() 2096 }; 2097 2098 // Configuring the TDX regions requires that the vCPUs are created. 2099 #[cfg(feature = "tdx")] 2100 let hob_address = if tdx_enabled { 2101 // TDX sections are written to memory. 2102 self.populate_tdx_sections(§ions)? 
2103 } else { 2104 None 2105 }; 2106 2107 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 2108 // available after they are configured 2109 #[cfg(target_arch = "aarch64")] 2110 let rsdp_addr = self.create_acpi_tables(); 2111 2112 // Configure shared state based on loaded kernel 2113 entry_point 2114 .map(|_| { 2115 // Safe to unwrap rsdp_addr as we know it can't be None when 2116 // the entry_point is Some. 2117 self.configure_system(rsdp_addr.unwrap()) 2118 }) 2119 .transpose()?; 2120 2121 #[cfg(feature = "tdx")] 2122 if let Some(hob_address) = hob_address { 2123 // With the HOB address extracted the vCPUs can have 2124 // their TDX state configured. 2125 self.cpu_manager 2126 .lock() 2127 .unwrap() 2128 .initialize_tdx(hob_address) 2129 .map_err(Error::CpuManager)?; 2130 // Let the hypervisor know which memory ranges are shared with the 2131 // guest. This prevents the guest from ignoring/discarding memory 2132 // regions provided by the host. 2133 self.init_tdx_memory(§ions)?; 2134 // With TDX memory and CPU state configured TDX setup is complete 2135 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2136 } 2137 2138 self.cpu_manager 2139 .lock() 2140 .unwrap() 2141 .start_boot_vcpus(new_state == VmState::BreakPoint) 2142 .map_err(Error::CpuManager)?; 2143 2144 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2145 *state = new_state; 2146 event!("vm", "booted"); 2147 Ok(()) 2148 } 2149 2150 /// Gets a thread-safe reference counted pointer to the VM configuration. 2151 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2152 Arc::clone(&self.config) 2153 } 2154 2155 /// Get the VM state. Returns an error if the state is poisoned. 
2156 pub fn get_state(&self) -> Result<VmState> { 2157 self.state 2158 .try_read() 2159 .map_err(|_| Error::PoisonedState) 2160 .map(|state| *state) 2161 } 2162 2163 /// Load saved clock from snapshot 2164 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2165 pub fn load_clock_from_snapshot( 2166 &mut self, 2167 snapshot: &Snapshot, 2168 ) -> Result<Option<hypervisor::ClockData>> { 2169 use crate::migration::get_vm_snapshot; 2170 let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?; 2171 self.saved_clock = vm_snapshot.clock; 2172 Ok(self.saved_clock) 2173 } 2174 2175 #[cfg(target_arch = "aarch64")] 2176 /// Add the vGIC section to the VM snapshot. 2177 fn add_vgic_snapshot_section( 2178 &self, 2179 vm_snapshot: &mut Snapshot, 2180 ) -> std::result::Result<(), MigratableError> { 2181 let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states(); 2182 self.device_manager 2183 .lock() 2184 .unwrap() 2185 .get_interrupt_controller() 2186 .unwrap() 2187 .lock() 2188 .unwrap() 2189 .set_gicr_typers(&saved_vcpu_states); 2190 2191 vm_snapshot.add_snapshot( 2192 self.device_manager 2193 .lock() 2194 .unwrap() 2195 .get_interrupt_controller() 2196 .unwrap() 2197 .lock() 2198 .unwrap() 2199 .snapshot()?, 2200 ); 2201 2202 Ok(()) 2203 } 2204 2205 #[cfg(target_arch = "aarch64")] 2206 /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing. 2207 fn restore_vgic_and_enable_interrupt( 2208 &self, 2209 vm_snapshot: &Snapshot, 2210 ) -> std::result::Result<(), MigratableError> { 2211 let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states(); 2212 2213 // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number. 2214 self.cpu_manager 2215 .lock() 2216 .unwrap() 2217 .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16) 2218 .map_err(|e| MigratableError::Restore(anyhow!("Error init PMU: {:?}", e)))?; 2219 2220 // Here we prepare the GICR_TYPER registers from the restored vCPU states. 
2221 self.device_manager 2222 .lock() 2223 .unwrap() 2224 .get_interrupt_controller() 2225 .unwrap() 2226 .lock() 2227 .unwrap() 2228 .set_gicr_typers(&saved_vcpu_states); 2229 2230 // Restore GIC states. 2231 if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) { 2232 self.device_manager 2233 .lock() 2234 .unwrap() 2235 .get_interrupt_controller() 2236 .unwrap() 2237 .lock() 2238 .unwrap() 2239 .restore(*gicv3_its_snapshot.clone())?; 2240 } else { 2241 return Err(MigratableError::Restore(anyhow!( 2242 "Missing GicV3Its snapshot" 2243 ))); 2244 } 2245 2246 Ok(()) 2247 } 2248 2249 /// Gets the actual size of the balloon. 2250 pub fn balloon_size(&self) -> u64 { 2251 self.device_manager.lock().unwrap().balloon_size() 2252 } 2253 2254 pub fn send_memory_fds( 2255 &mut self, 2256 socket: &mut UnixStream, 2257 ) -> std::result::Result<(), MigratableError> { 2258 for (slot, fd) in self 2259 .memory_manager 2260 .lock() 2261 .unwrap() 2262 .memory_slot_fds() 2263 .drain() 2264 { 2265 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2266 .write_to(socket) 2267 .map_err(|e| { 2268 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2269 })?; 2270 socket 2271 .send_with_fd(&slot.to_le_bytes()[..], fd) 2272 .map_err(|e| { 2273 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2274 })?; 2275 2276 let res = Response::read_from(socket)?; 2277 if res.status() != Status::Ok { 2278 warn!("Error during memory fd migration"); 2279 Request::abandon().write_to(socket)?; 2280 Response::read_from(socket).ok(); 2281 return Err(MigratableError::MigrateSend(anyhow!( 2282 "Error during memory fd migration" 2283 ))); 2284 } 2285 } 2286 2287 Ok(()) 2288 } 2289 2290 pub fn send_memory_regions<F>( 2291 &mut self, 2292 ranges: &MemoryRangeTable, 2293 fd: &mut F, 2294 ) -> std::result::Result<(), MigratableError> 2295 where 2296 F: Write, 2297 { 2298 let guest_memory = 
self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2299 let mem = guest_memory.memory(); 2300 2301 for range in ranges.regions() { 2302 let mut offset: u64 = 0; 2303 // Here we are manually handling the retry in case we can't the 2304 // whole region at once because we can't use the implementation 2305 // from vm-memory::GuestMemory of write_all_to() as it is not 2306 // following the correct behavior. For more info about this issue 2307 // see: https://github.com/rust-vmm/vm-memory/issues/174 2308 loop { 2309 let bytes_written = mem 2310 .write_to( 2311 GuestAddress(range.gpa + offset), 2312 fd, 2313 (range.length - offset) as usize, 2314 ) 2315 .map_err(|e| { 2316 MigratableError::MigrateSend(anyhow!( 2317 "Error transferring memory to socket: {}", 2318 e 2319 )) 2320 })?; 2321 offset += bytes_written as u64; 2322 2323 if offset == range.length { 2324 break; 2325 } 2326 } 2327 } 2328 2329 Ok(()) 2330 } 2331 2332 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2333 self.memory_manager 2334 .lock() 2335 .unwrap() 2336 .memory_range_table(false) 2337 } 2338 2339 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2340 self.device_manager.lock().unwrap().device_tree() 2341 } 2342 2343 pub fn activate_virtio_devices(&self) -> Result<()> { 2344 self.device_manager 2345 .lock() 2346 .unwrap() 2347 .activate_virtio_devices() 2348 .map_err(Error::ActivateVirtioDevices) 2349 } 2350 2351 #[cfg(target_arch = "x86_64")] 2352 pub fn power_button(&self) -> Result<()> { 2353 return self 2354 .device_manager 2355 .lock() 2356 .unwrap() 2357 .notify_power_button() 2358 .map_err(Error::PowerButton); 2359 } 2360 2361 #[cfg(target_arch = "aarch64")] 2362 pub fn power_button(&self) -> Result<()> { 2363 self.device_manager 2364 .lock() 2365 .unwrap() 2366 .notify_power_button() 2367 .map_err(Error::PowerButton) 2368 } 2369 2370 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2371 
self.memory_manager.lock().unwrap().snapshot_data() 2372 } 2373 2374 #[cfg(feature = "guest_debug")] 2375 pub fn debug_request( 2376 &mut self, 2377 gdb_request: &GdbRequestPayload, 2378 cpu_id: usize, 2379 ) -> Result<GdbResponsePayload> { 2380 use GdbRequestPayload::*; 2381 match gdb_request { 2382 SetSingleStep(single_step) => { 2383 self.set_guest_debug(cpu_id, &[], *single_step) 2384 .map_err(Error::Debug)?; 2385 } 2386 SetHwBreakPoint(addrs) => { 2387 self.set_guest_debug(cpu_id, addrs, false) 2388 .map_err(Error::Debug)?; 2389 } 2390 Pause => { 2391 self.debug_pause().map_err(Error::Debug)?; 2392 } 2393 Resume => { 2394 self.debug_resume().map_err(Error::Debug)?; 2395 } 2396 ReadRegs => { 2397 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?; 2398 return Ok(GdbResponsePayload::RegValues(Box::new(regs))); 2399 } 2400 WriteRegs(regs) => { 2401 self.write_regs(cpu_id, regs).map_err(Error::Debug)?; 2402 } 2403 ReadMem(vaddr, len) => { 2404 let mem = self.read_mem(cpu_id, *vaddr, *len).map_err(Error::Debug)?; 2405 return Ok(GdbResponsePayload::MemoryRegion(mem)); 2406 } 2407 WriteMem(vaddr, data) => { 2408 self.write_mem(cpu_id, vaddr, data).map_err(Error::Debug)?; 2409 } 2410 ActiveVcpus => { 2411 let active_vcpus = self.active_vcpus(); 2412 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus)); 2413 } 2414 } 2415 Ok(GdbResponsePayload::CommandComplete) 2416 } 2417 2418 #[cfg(feature = "guest_debug")] 2419 fn get_dump_state( 2420 &mut self, 2421 destination_url: &str, 2422 ) -> std::result::Result<DumpState, GuestDebuggableError> { 2423 let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32; 2424 let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize; 2425 let mut elf_phdr_num = 1; 2426 let elf_sh_info = 0; 2427 let coredump_file_path = url_to_file(destination_url)?; 2428 let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings(); 2429 2430 if mapping_num < UINT16_MAX - 2 { 2431 elf_phdr_num += 
mapping_num as u16; 2432 } else { 2433 panic!("mapping num beyond 65535 not supported"); 2434 } 2435 let coredump_file = OpenOptions::new() 2436 .read(true) 2437 .write(true) 2438 .create_new(true) 2439 .open(coredump_file_path) 2440 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2441 2442 let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size); 2443 let mem_data = self 2444 .memory_manager 2445 .lock() 2446 .unwrap() 2447 .coredump_memory_regions(mem_offset); 2448 2449 Ok(DumpState { 2450 elf_note_size, 2451 elf_phdr_num, 2452 elf_sh_info, 2453 mem_offset, 2454 mem_info: Some(mem_data), 2455 file: Some(coredump_file), 2456 }) 2457 } 2458 2459 #[cfg(feature = "guest_debug")] 2460 fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 { 2461 size_of::<elf::Elf64_Ehdr>() as u64 2462 + note_size as u64 2463 + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64 2464 } 2465 } 2466 2467 impl Pausable for Vm { 2468 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2469 event!("vm", "pausing"); 2470 let mut state = self 2471 .state 2472 .try_write() 2473 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?; 2474 let new_state = VmState::Paused; 2475 2476 state 2477 .valid_transition(new_state) 2478 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?; 2479 2480 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2481 { 2482 let mut clock = self 2483 .vm 2484 .get_clock() 2485 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?; 2486 clock.reset_flags(); 2487 self.saved_clock = Some(clock); 2488 } 2489 2490 // Before pausing the vCPUs activate any pending virtio devices that might 2491 // need activation between starting the pause (or e.g. 
a migration it's part of) 2492 self.activate_virtio_devices().map_err(|e| { 2493 MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e)) 2494 })?; 2495 2496 self.cpu_manager.lock().unwrap().pause()?; 2497 self.device_manager.lock().unwrap().pause()?; 2498 2499 *state = new_state; 2500 2501 event!("vm", "paused"); 2502 Ok(()) 2503 } 2504 2505 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2506 event!("vm", "resuming"); 2507 let mut state = self 2508 .state 2509 .try_write() 2510 .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?; 2511 let new_state = VmState::Running; 2512 2513 state 2514 .valid_transition(new_state) 2515 .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?; 2516 2517 self.cpu_manager.lock().unwrap().resume()?; 2518 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2519 { 2520 if let Some(clock) = &self.saved_clock { 2521 self.vm.set_clock(clock).map_err(|e| { 2522 MigratableError::Resume(anyhow!("Could not set VM clock: {}", e)) 2523 })?; 2524 } 2525 } 2526 self.device_manager.lock().unwrap().resume()?; 2527 2528 // And we're back to the Running state. 
2529 *state = new_state; 2530 event!("vm", "resumed"); 2531 Ok(()) 2532 } 2533 } 2534 2535 #[derive(Serialize, Deserialize)] 2536 pub struct VmSnapshot { 2537 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2538 pub clock: Option<hypervisor::ClockData>, 2539 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2540 pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>, 2541 } 2542 2543 pub const VM_SNAPSHOT_ID: &str = "vm"; 2544 impl Snapshottable for Vm { 2545 fn id(&self) -> String { 2546 VM_SNAPSHOT_ID.to_string() 2547 } 2548 2549 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2550 event!("vm", "snapshotting"); 2551 2552 #[cfg(feature = "tdx")] 2553 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2554 2555 #[cfg(feature = "tdx")] 2556 { 2557 if tdx_enabled { 2558 return Err(MigratableError::Snapshot(anyhow!( 2559 "Snapshot not possible with TDX VM" 2560 ))); 2561 } 2562 } 2563 2564 let current_state = self.get_state().unwrap(); 2565 if current_state != VmState::Paused { 2566 return Err(MigratableError::Snapshot(anyhow!( 2567 "Trying to snapshot while VM is running" 2568 ))); 2569 } 2570 2571 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2572 let common_cpuid = { 2573 let phys_bits = physical_bits(self.config.lock().unwrap().cpus.max_phys_bits); 2574 arch::generate_common_cpuid( 2575 self.hypervisor.clone(), 2576 None, 2577 None, 2578 phys_bits, 2579 self.config.lock().unwrap().cpus.kvm_hyperv, 2580 #[cfg(feature = "tdx")] 2581 tdx_enabled, 2582 ) 2583 .map_err(|e| { 2584 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e)) 2585 })? 
2586 }; 2587 2588 let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID); 2589 let vm_snapshot_data = serde_json::to_vec(&VmSnapshot { 2590 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2591 clock: self.saved_clock, 2592 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2593 common_cpuid, 2594 }) 2595 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2596 2597 vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?); 2598 vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?); 2599 2600 #[cfg(target_arch = "aarch64")] 2601 self.add_vgic_snapshot_section(&mut vm_snapshot) 2602 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2603 2604 vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?); 2605 vm_snapshot.add_data_section(SnapshotDataSection { 2606 id: format!("{}-section", VM_SNAPSHOT_ID), 2607 snapshot: vm_snapshot_data, 2608 }); 2609 2610 event!("vm", "snapshotted"); 2611 Ok(vm_snapshot) 2612 } 2613 2614 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 2615 event!("vm", "restoring"); 2616 2617 let current_state = self 2618 .get_state() 2619 .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?; 2620 let new_state = VmState::Paused; 2621 current_state.valid_transition(new_state).map_err(|e| { 2622 MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e)) 2623 })?; 2624 2625 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2626 self.load_clock_from_snapshot(&snapshot) 2627 .map_err(|e| MigratableError::Restore(anyhow!("Error restoring clock: {:?}", e)))?; 2628 2629 if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) { 2630 self.device_manager 2631 .lock() 2632 .unwrap() 2633 .restore(*device_manager_snapshot.clone())?; 2634 } else { 2635 return Err(MigratableError::Restore(anyhow!( 2636 "Missing device manager snapshot" 2637 ))); 2638 } 2639 2640 if let 
Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) { 2641 self.cpu_manager 2642 .lock() 2643 .unwrap() 2644 .restore(*cpu_manager_snapshot.clone())?; 2645 } else { 2646 return Err(MigratableError::Restore(anyhow!( 2647 "Missing CPU manager snapshot" 2648 ))); 2649 } 2650 2651 #[cfg(target_arch = "aarch64")] 2652 self.restore_vgic_and_enable_interrupt(&snapshot)?; 2653 2654 if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) { 2655 self.device_manager 2656 .lock() 2657 .unwrap() 2658 .restore_devices(*device_manager_snapshot.clone())?; 2659 } else { 2660 return Err(MigratableError::Restore(anyhow!( 2661 "Missing device manager snapshot" 2662 ))); 2663 } 2664 2665 // Now we can start all vCPUs from here. 2666 self.cpu_manager 2667 .lock() 2668 .unwrap() 2669 .start_restored_vcpus() 2670 .map_err(|e| { 2671 MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e)) 2672 })?; 2673 2674 self.setup_signal_handler().map_err(|e| { 2675 MigratableError::Restore(anyhow!("Could not setup signal handler: {:#?}", e)) 2676 })?; 2677 self.setup_tty() 2678 .map_err(|e| MigratableError::Restore(anyhow!("Could not setup tty: {:#?}", e)))?; 2679 2680 let mut state = self 2681 .state 2682 .try_write() 2683 .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?; 2684 *state = new_state; 2685 2686 event!("vm", "restored"); 2687 Ok(()) 2688 } 2689 } 2690 2691 impl Transportable for Vm { 2692 fn send( 2693 &self, 2694 snapshot: &Snapshot, 2695 destination_url: &str, 2696 ) -> std::result::Result<(), MigratableError> { 2697 let mut snapshot_config_path = url_to_path(destination_url)?; 2698 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2699 2700 // Create the snapshot config file 2701 let mut snapshot_config_file = OpenOptions::new() 2702 .read(true) 2703 .write(true) 2704 .create_new(true) 2705 .open(snapshot_config_path) 2706 .map_err(|e| 
MigratableError::MigrateSend(e.into()))?; 2707 2708 // Serialize and write the snapshot config 2709 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2710 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2711 2712 snapshot_config_file 2713 .write(vm_config.as_bytes()) 2714 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2715 2716 let mut snapshot_state_path = url_to_path(destination_url)?; 2717 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2718 2719 // Create the snapshot state file 2720 let mut snapshot_state_file = OpenOptions::new() 2721 .read(true) 2722 .write(true) 2723 .create_new(true) 2724 .open(snapshot_state_path) 2725 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2726 2727 // Serialize and write the snapshot state 2728 let vm_state = 2729 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2730 2731 snapshot_state_file 2732 .write(&vm_state) 2733 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2734 2735 // Tell the memory manager to also send/write its own snapshot. 
2736 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2737 self.memory_manager 2738 .lock() 2739 .unwrap() 2740 .send(&memory_manager_snapshot.clone(), destination_url)?; 2741 } else { 2742 return Err(MigratableError::Restore(anyhow!( 2743 "Missing memory manager snapshot" 2744 ))); 2745 } 2746 2747 Ok(()) 2748 } 2749 } 2750 2751 impl Migratable for Vm { 2752 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2753 self.memory_manager.lock().unwrap().start_dirty_log()?; 2754 self.device_manager.lock().unwrap().start_dirty_log() 2755 } 2756 2757 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2758 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2759 self.device_manager.lock().unwrap().stop_dirty_log() 2760 } 2761 2762 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2763 Ok(MemoryRangeTable::new_from_tables(vec![ 2764 self.memory_manager.lock().unwrap().dirty_log()?, 2765 self.device_manager.lock().unwrap().dirty_log()?, 2766 ])) 2767 } 2768 2769 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2770 self.memory_manager.lock().unwrap().start_migration()?; 2771 self.device_manager.lock().unwrap().start_migration() 2772 } 2773 2774 fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { 2775 self.memory_manager.lock().unwrap().complete_migration()?; 2776 self.device_manager.lock().unwrap().complete_migration() 2777 } 2778 } 2779 2780 #[cfg(feature = "guest_debug")] 2781 impl Debuggable for Vm { 2782 fn set_guest_debug( 2783 &self, 2784 cpu_id: usize, 2785 addrs: &[GuestAddress], 2786 singlestep: bool, 2787 ) -> std::result::Result<(), DebuggableError> { 2788 self.cpu_manager 2789 .lock() 2790 .unwrap() 2791 .set_guest_debug(cpu_id, addrs, singlestep) 2792 } 2793 2794 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2795 if *self.state.read().unwrap() == 
VmState::Running { 2796 self.pause().map_err(DebuggableError::Pause)?; 2797 } 2798 2799 let mut state = self 2800 .state 2801 .try_write() 2802 .map_err(|_| DebuggableError::PoisonedState)?; 2803 *state = VmState::BreakPoint; 2804 Ok(()) 2805 } 2806 2807 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2808 if *self.state.read().unwrap() == VmState::BreakPoint { 2809 self.resume().map_err(DebuggableError::Pause)?; 2810 } 2811 2812 Ok(()) 2813 } 2814 2815 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2816 self.cpu_manager.lock().unwrap().read_regs(cpu_id) 2817 } 2818 2819 fn write_regs( 2820 &self, 2821 cpu_id: usize, 2822 regs: &CoreRegs, 2823 ) -> std::result::Result<(), DebuggableError> { 2824 self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs) 2825 } 2826 2827 fn read_mem( 2828 &self, 2829 cpu_id: usize, 2830 vaddr: GuestAddress, 2831 len: usize, 2832 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2833 self.cpu_manager 2834 .lock() 2835 .unwrap() 2836 .read_mem(cpu_id, vaddr, len) 2837 } 2838 2839 fn write_mem( 2840 &self, 2841 cpu_id: usize, 2842 vaddr: &GuestAddress, 2843 data: &[u8], 2844 ) -> std::result::Result<(), DebuggableError> { 2845 self.cpu_manager 2846 .lock() 2847 .unwrap() 2848 .write_mem(cpu_id, vaddr, data) 2849 } 2850 2851 fn active_vcpus(&self) -> usize { 2852 let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus(); 2853 if active_vcpus > 0 { 2854 active_vcpus 2855 } else { 2856 // The VM is not booted yet. Report boot_vcpus() instead. 
2857 self.cpu_manager.lock().unwrap().boot_vcpus() as usize 2858 } 2859 } 2860 } 2861 2862 #[cfg(feature = "guest_debug")] 2863 pub const UINT16_MAX: u32 = 65535; 2864 2865 #[cfg(feature = "guest_debug")] 2866 impl Elf64Writable for Vm {} 2867 2868 #[cfg(feature = "guest_debug")] 2869 impl GuestDebuggable for Vm { 2870 fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> { 2871 event!("vm", "coredumping"); 2872 2873 #[cfg(feature = "tdx")] 2874 { 2875 if let Some(ref platform) = self.config.lock().unwrap().platform { 2876 if platform.tdx { 2877 return Err(GuestDebuggableError::Coredump(anyhow!( 2878 "Coredump not possible with TDX VM" 2879 ))); 2880 } 2881 } 2882 } 2883 2884 let current_state = self.get_state().unwrap(); 2885 if current_state != VmState::Paused { 2886 return Err(GuestDebuggableError::Coredump(anyhow!( 2887 "Trying to coredump while VM is running" 2888 ))); 2889 } 2890 2891 let coredump_state = self.get_dump_state(destination_url)?; 2892 2893 self.write_header(&coredump_state)?; 2894 self.write_note(&coredump_state)?; 2895 self.write_loads(&coredump_state)?; 2896 2897 self.cpu_manager 2898 .lock() 2899 .unwrap() 2900 .cpu_write_elf64_note(&coredump_state)?; 2901 self.cpu_manager 2902 .lock() 2903 .unwrap() 2904 .cpu_write_vmm_note(&coredump_state)?; 2905 2906 self.memory_manager 2907 .lock() 2908 .unwrap() 2909 .coredump_iterate_save_mem(&coredump_state) 2910 } 2911 } 2912 2913 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2914 #[cfg(test)] 2915 mod tests { 2916 use super::*; 2917 2918 fn test_vm_state_transitions(state: VmState) { 2919 match state { 2920 VmState::Created => { 2921 // Check the transitions from Created 2922 assert!(state.valid_transition(VmState::Created).is_err()); 2923 assert!(state.valid_transition(VmState::Running).is_ok()); 2924 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2925 assert!(state.valid_transition(VmState::Paused).is_ok()); 2926 
assert!(state.valid_transition(VmState::BreakPoint).is_ok()); 2927 } 2928 VmState::Running => { 2929 // Check the transitions from Running 2930 assert!(state.valid_transition(VmState::Created).is_err()); 2931 assert!(state.valid_transition(VmState::Running).is_err()); 2932 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2933 assert!(state.valid_transition(VmState::Paused).is_ok()); 2934 assert!(state.valid_transition(VmState::BreakPoint).is_ok()); 2935 } 2936 VmState::Shutdown => { 2937 // Check the transitions from Shutdown 2938 assert!(state.valid_transition(VmState::Created).is_err()); 2939 assert!(state.valid_transition(VmState::Running).is_ok()); 2940 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2941 assert!(state.valid_transition(VmState::Paused).is_err()); 2942 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2943 } 2944 VmState::Paused => { 2945 // Check the transitions from Paused 2946 assert!(state.valid_transition(VmState::Created).is_err()); 2947 assert!(state.valid_transition(VmState::Running).is_ok()); 2948 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2949 assert!(state.valid_transition(VmState::Paused).is_err()); 2950 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2951 } 2952 VmState::BreakPoint => { 2953 // Check the transitions from Breakpoint 2954 assert!(state.valid_transition(VmState::Created).is_ok()); 2955 assert!(state.valid_transition(VmState::Running).is_ok()); 2956 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2957 assert!(state.valid_transition(VmState::Paused).is_err()); 2958 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2959 } 2960 } 2961 } 2962 2963 #[test] 2964 fn test_vm_created_transitions() { 2965 test_vm_state_transitions(VmState::Created); 2966 } 2967 2968 #[test] 2969 fn test_vm_running_transitions() { 2970 test_vm_state_transitions(VmState::Running); 2971 } 2972 2973 #[test] 2974 fn test_vm_shutdown_transitions() { 2975 
test_vm_state_transitions(VmState::Shutdown); 2976 } 2977 2978 #[test] 2979 fn test_vm_paused_transitions() { 2980 test_vm_state_transitions(VmState::Paused); 2981 } 2982 2983 #[cfg(feature = "tdx")] 2984 #[test] 2985 fn test_hob_memory_resources() { 2986 // Case 1: Two TDVF sections in the middle of the RAM 2987 let sections = vec![ 2988 TdvfSection { 2989 address: 0xc000, 2990 size: 0x1000, 2991 ..Default::default() 2992 }, 2993 TdvfSection { 2994 address: 0x1000, 2995 size: 0x4000, 2996 ..Default::default() 2997 }, 2998 ]; 2999 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)]; 3000 let expected = vec![ 3001 (0, 0x1000, true), 3002 (0x1000, 0x4000, false), 3003 (0x5000, 0x7000, true), 3004 (0xc000, 0x1000, false), 3005 (0xd000, 0x0fff_3000, true), 3006 ]; 3007 assert_eq!( 3008 expected, 3009 Vm::hob_memory_resources( 3010 sections, 3011 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3012 ) 3013 ); 3014 3015 // Case 2: Two TDVF sections with no conflict with the RAM 3016 let sections = vec![ 3017 TdvfSection { 3018 address: 0x1000_1000, 3019 size: 0x1000, 3020 ..Default::default() 3021 }, 3022 TdvfSection { 3023 address: 0, 3024 size: 0x1000, 3025 ..Default::default() 3026 }, 3027 ]; 3028 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3029 let expected = vec![ 3030 (0, 0x1000, false), 3031 (0x1000, 0x1000_0000, true), 3032 (0x1000_1000, 0x1000, false), 3033 ]; 3034 assert_eq!( 3035 expected, 3036 Vm::hob_memory_resources( 3037 sections, 3038 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3039 ) 3040 ); 3041 3042 // Case 3: Two TDVF sections with partial conflicts with the RAM 3043 let sections = vec![ 3044 TdvfSection { 3045 address: 0x1000_0000, 3046 size: 0x2000, 3047 ..Default::default() 3048 }, 3049 TdvfSection { 3050 address: 0, 3051 size: 0x2000, 3052 ..Default::default() 3053 }, 3054 ]; 3055 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 
0x1000_0000)]; 3056 let expected = vec![ 3057 (0, 0x2000, false), 3058 (0x2000, 0x0fff_e000, true), 3059 (0x1000_0000, 0x2000, false), 3060 ]; 3061 assert_eq!( 3062 expected, 3063 Vm::hob_memory_resources( 3064 sections, 3065 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3066 ) 3067 ); 3068 3069 // Case 4: Two TDVF sections with no conflict before the RAM and two 3070 // more additional sections with no conflict after the RAM. 3071 let sections = vec![ 3072 TdvfSection { 3073 address: 0x2000_1000, 3074 size: 0x1000, 3075 ..Default::default() 3076 }, 3077 TdvfSection { 3078 address: 0x2000_0000, 3079 size: 0x1000, 3080 ..Default::default() 3081 }, 3082 TdvfSection { 3083 address: 0x1000, 3084 size: 0x1000, 3085 ..Default::default() 3086 }, 3087 TdvfSection { 3088 address: 0, 3089 size: 0x1000, 3090 ..Default::default() 3091 }, 3092 ]; 3093 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)]; 3094 let expected = vec![ 3095 (0, 0x1000, false), 3096 (0x1000, 0x1000, false), 3097 (0x4000, 0x1000_0000, true), 3098 (0x2000_0000, 0x1000, false), 3099 (0x2000_1000, 0x1000, false), 3100 ]; 3101 assert_eq!( 3102 expected, 3103 Vm::hob_memory_resources( 3104 sections, 3105 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3106 ) 3107 ); 3108 3109 // Case 5: One TDVF section overriding the entire RAM 3110 let sections = vec![TdvfSection { 3111 address: 0, 3112 size: 0x2000_0000, 3113 ..Default::default() 3114 }]; 3115 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3116 let expected = vec![(0, 0x2000_0000, false)]; 3117 assert_eq!( 3118 expected, 3119 Vm::hob_memory_resources( 3120 sections, 3121 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3122 ) 3123 ); 3124 3125 // Case 6: Two TDVF sections with no conflict with 2 RAM regions 3126 let sections = vec![ 3127 TdvfSection { 3128 address: 0x1000_2000, 3129 size: 0x2000, 3130 ..Default::default() 3131 }, 3132 TdvfSection { 3133 
address: 0, 3134 size: 0x2000, 3135 ..Default::default() 3136 }, 3137 ]; 3138 let guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3139 (GuestAddress(0x2000), 0x1000_0000), 3140 (GuestAddress(0x1000_4000), 0x1000_0000), 3141 ]; 3142 let expected = vec![ 3143 (0, 0x2000, false), 3144 (0x2000, 0x1000_0000, true), 3145 (0x1000_2000, 0x2000, false), 3146 (0x1000_4000, 0x1000_0000, true), 3147 ]; 3148 assert_eq!( 3149 expected, 3150 Vm::hob_memory_resources( 3151 sections, 3152 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3153 ) 3154 ); 3155 3156 // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions 3157 let sections = vec![ 3158 TdvfSection { 3159 address: 0x1000_0000, 3160 size: 0x4000, 3161 ..Default::default() 3162 }, 3163 TdvfSection { 3164 address: 0, 3165 size: 0x4000, 3166 ..Default::default() 3167 }, 3168 ]; 3169 let guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3170 (GuestAddress(0x1000), 0x1000_0000), 3171 (GuestAddress(0x1000_3000), 0x1000_0000), 3172 ]; 3173 let expected = vec![ 3174 (0, 0x4000, false), 3175 (0x4000, 0x0fff_c000, true), 3176 (0x1000_0000, 0x4000, false), 3177 (0x1000_4000, 0x0fff_f000, true), 3178 ]; 3179 assert_eq!( 3180 expected, 3181 Vm::hob_memory_resources( 3182 sections, 3183 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3184 ) 3185 ); 3186 } 3187 } 3188 3189 #[cfg(target_arch = "aarch64")] 3190 #[cfg(test)] 3191 mod tests { 3192 use super::*; 3193 use crate::GuestMemoryMmap; 3194 use arch::aarch64::fdt::create_fdt; 3195 use arch::aarch64::layout; 3196 use arch::{DeviceType, MmioDeviceInfo}; 3197 use devices::gic::Gic; 3198 3199 const LEN: u64 = 4096; 3200 3201 #[test] 3202 fn test_create_fdt_with_devices() { 3203 let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)]; 3204 let mem = GuestMemoryMmap::from_ranges(®ions).expect("Cannot initialize memory"); 3205 3206 let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [ 3207 ( 3208 (DeviceType::Serial, 
DeviceType::Serial.to_string()), 3209 MmioDeviceInfo { 3210 addr: 0x00, 3211 len: LEN, 3212 irq: 33, 3213 }, 3214 ), 3215 ( 3216 (DeviceType::Virtio(1), "virtio".to_string()), 3217 MmioDeviceInfo { 3218 addr: LEN, 3219 len: LEN, 3220 irq: 34, 3221 }, 3222 ), 3223 ( 3224 (DeviceType::Rtc, "rtc".to_string()), 3225 MmioDeviceInfo { 3226 addr: 2 * LEN, 3227 len: LEN, 3228 irq: 35, 3229 }, 3230 ), 3231 ] 3232 .iter() 3233 .cloned() 3234 .collect(); 3235 3236 let hv = hypervisor::new().unwrap(); 3237 let vm = hv.create_vm().unwrap(); 3238 let gic = vm 3239 .create_vgic(Gic::create_default_config(1)) 3240 .expect("Cannot create gic"); 3241 assert!(create_fdt( 3242 &mem, 3243 "console=tty0", 3244 vec![0], 3245 Some((0, 0, 0)), 3246 &dev_info, 3247 &gic, 3248 &None, 3249 &Vec::new(), 3250 &BTreeMap::new(), 3251 None, 3252 true, 3253 ) 3254 .is_ok()) 3255 } 3256 } 3257 3258 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 3259 #[test] 3260 pub fn test_vm() { 3261 use hypervisor::VmExit; 3262 use vm_memory::{Address, GuestMemory, GuestMemoryRegion}; 3263 // This example based on https://lwn.net/Articles/658511/ 3264 let code = [ 3265 0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */ 3266 0x00, 0xd8, /* add %bl, %al */ 3267 0x04, b'0', /* add $'0', %al */ 3268 0xee, /* out %al, (%dx) */ 3269 0xb0, b'\n', /* mov $'\n', %al */ 3270 0xee, /* out %al, (%dx) */ 3271 0xf4, /* hlt */ 3272 ]; 3273 3274 let mem_size = 0x1000; 3275 let load_addr = GuestAddress(0x1000); 3276 let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap(); 3277 3278 let hv = hypervisor::new().unwrap(); 3279 let vm = hv.create_vm().expect("new VM creation failed"); 3280 3281 for (index, region) in mem.iter().enumerate() { 3282 let mem_region = vm.make_user_memory_region( 3283 index as u32, 3284 region.start_addr().raw_value(), 3285 region.len(), 3286 region.as_ptr() as u64, 3287 false, 3288 false, 3289 ); 3290 3291 vm.create_user_memory_region(mem_region) 3292 .expect("Cannot configure guest memory"); 
3293 } 3294 mem.write_slice(&code, load_addr) 3295 .expect("Writing code to memory failed"); 3296 3297 let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed"); 3298 3299 let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed"); 3300 vcpu_sregs.cs.base = 0; 3301 vcpu_sregs.cs.selector = 0; 3302 vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed"); 3303 3304 let mut vcpu_regs = vcpu.get_regs().expect("get regs failed"); 3305 vcpu_regs.rip = 0x1000; 3306 vcpu_regs.rax = 2; 3307 vcpu_regs.rbx = 3; 3308 vcpu_regs.rflags = 2; 3309 vcpu.set_regs(&vcpu_regs).expect("set regs failed"); 3310 3311 loop { 3312 match vcpu.run().expect("run failed") { 3313 VmExit::IoOut(addr, data) => { 3314 println!( 3315 "IO out -- addr: {:#x} data [{:?}]", 3316 addr, 3317 str::from_utf8(data).unwrap() 3318 ); 3319 } 3320 VmExit::Reset => { 3321 println!("HLT"); 3322 break; 3323 } 3324 r => panic!("unexpected exit reason: {:?}", r), 3325 } 3326 } 3327 } 3328