1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::{ 15 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 16 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 17 }; 18 use crate::config::{NumaConfig, PayloadConfig}; 19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 20 use crate::coredump::{ 21 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 22 }; 23 use crate::cpu; 24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair}; 25 use crate::device_tree::DeviceTree; 26 #[cfg(feature = "guest_debug")] 27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 28 use crate::memory_manager::{ 29 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 30 }; 31 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 32 use crate::migration::get_vm_snapshot; 33 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 34 use crate::migration::url_to_file; 35 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 36 use crate::GuestMemoryMmap; 37 use crate::{ 38 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 39 }; 40 use anyhow::anyhow; 41 use arch::get_host_cpu_phys_bits; 42 #[cfg(target_arch = "x86_64")] 43 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 44 #[cfg(feature = "tdx")] 45 use arch::x86_64::tdx::TdvfSection; 46 use arch::EntryPoint; 47 #[cfg(target_arch = "aarch64")] 48 use 
arch::PciSpaceInfo; 49 use arch::{NumaNode, NumaNodes}; 50 #[cfg(target_arch = "aarch64")] 51 use devices::interrupt_controller; 52 use devices::AcpiNotificationFlags; 53 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 54 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 55 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 56 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 57 use hypervisor::{HypervisorVmError, VmOps}; 58 use libc::{termios, SIGWINCH}; 59 use linux_loader::cmdline::Cmdline; 60 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 61 use linux_loader::elf; 62 #[cfg(target_arch = "x86_64")] 63 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 64 #[cfg(target_arch = "aarch64")] 65 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 66 use linux_loader::loader::KernelLoader; 67 use seccompiler::SeccompAction; 68 use serde::{Deserialize, Serialize}; 69 use std::cmp; 70 use std::collections::BTreeMap; 71 use std::collections::HashMap; 72 use std::convert::TryInto; 73 use std::fs::{File, OpenOptions}; 74 use std::io::{self, Seek, SeekFrom, Write}; 75 #[cfg(feature = "tdx")] 76 use std::mem; 77 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 78 use std::mem::size_of; 79 use std::num::Wrapping; 80 use std::ops::Deref; 81 use std::os::unix::net::UnixStream; 82 use std::sync::{Arc, Mutex, RwLock}; 83 use std::time::Instant; 84 use std::{result, str, thread}; 85 use thiserror::Error; 86 use tracer::trace_scoped; 87 use vm_device::Bus; 88 #[cfg(feature = "tdx")] 89 use vm_memory::{Address, ByteValued, GuestMemory, GuestMemoryRegion}; 90 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic}; 91 use vm_migration::protocol::{Request, Response, Status}; 92 use vm_migration::{ 93 protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, 94 SnapshotData, Snapshottable, Transportable, 95 }; 96 use 
vmm_sys_util::eventfd::EventFd; 97 use vmm_sys_util::sock_ctrl_msg::ScmSocket; 98 99 /// Errors associated with VM management 100 #[derive(Debug, Error)] 101 pub enum Error { 102 #[error("Cannot open kernel file: {0}")] 103 KernelFile(#[source] io::Error), 104 105 #[error("Cannot open initramfs file: {0}")] 106 InitramfsFile(#[source] io::Error), 107 108 #[error("Cannot load the kernel into memory: {0}")] 109 KernelLoad(#[source] linux_loader::loader::Error), 110 111 #[cfg(target_arch = "aarch64")] 112 #[error("Cannot load the UEFI binary in memory: {0:?}")] 113 UefiLoad(arch::aarch64::uefi::Error), 114 115 #[error("Cannot load the initramfs into memory")] 116 InitramfsLoad, 117 118 #[error("Cannot load the kernel command line in memory: {0}")] 119 LoadCmdLine(#[source] linux_loader::loader::Error), 120 121 #[error("Cannot modify the kernel command line: {0}")] 122 CmdLineInsertStr(#[source] linux_loader::cmdline::Error), 123 124 #[error("Cannot create the kernel command line: {0}")] 125 CmdLineCreate(#[source] linux_loader::cmdline::Error), 126 127 #[error("Cannot configure system: {0}")] 128 ConfigureSystem(#[source] arch::Error), 129 130 #[cfg(target_arch = "aarch64")] 131 #[error("Cannot enable interrupt controller: {0:?}")] 132 EnableInterruptController(interrupt_controller::Error), 133 134 #[error("VM state is poisoned")] 135 PoisonedState, 136 137 #[error("Error from device manager: {0:?}")] 138 DeviceManager(DeviceManagerError), 139 140 #[error("No device with id {0:?} to remove")] 141 NoDeviceToRemove(String), 142 143 #[error("Cannot spawn a signal handler thread: {0}")] 144 SignalHandlerSpawn(#[source] io::Error), 145 146 #[error("Failed to join on threads: {0:?}")] 147 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 148 149 #[error("VM config is missing")] 150 VmMissingConfig, 151 152 #[error("VM is not created")] 153 VmNotCreated, 154 155 #[error("VM is already created")] 156 VmAlreadyCreated, 157 158 #[error("VM is not running")] 
159 VmNotRunning, 160 161 #[error("Cannot clone EventFd: {0}")] 162 EventFdClone(#[source] io::Error), 163 164 #[error("invalid VM state transition: {0:?} to {1:?}")] 165 InvalidStateTransition(VmState, VmState), 166 167 #[error("Error from CPU manager: {0}")] 168 CpuManager(#[source] cpu::Error), 169 170 #[error("Cannot pause devices: {0}")] 171 PauseDevices(#[source] MigratableError), 172 173 #[error("Cannot resume devices: {0}")] 174 ResumeDevices(#[source] MigratableError), 175 176 #[error("Cannot pause CPUs: {0}")] 177 PauseCpus(#[source] MigratableError), 178 179 #[error("Cannot resume cpus: {0}")] 180 ResumeCpus(#[source] MigratableError), 181 182 #[error("Cannot pause VM: {0}")] 183 Pause(#[source] MigratableError), 184 185 #[error("Cannot resume VM: {0}")] 186 Resume(#[source] MigratableError), 187 188 #[error("Memory manager error: {0:?}")] 189 MemoryManager(MemoryManagerError), 190 191 #[error("Eventfd write error: {0}")] 192 EventfdError(#[source] std::io::Error), 193 194 #[error("Cannot snapshot VM: {0}")] 195 Snapshot(#[source] MigratableError), 196 197 #[error("Cannot restore VM: {0}")] 198 Restore(#[source] MigratableError), 199 200 #[error("Cannot send VM snapshot: {0}")] 201 SnapshotSend(#[source] MigratableError), 202 203 #[error("Invalid restore source URL")] 204 InvalidRestoreSourceUrl, 205 206 #[error("Failed to validate config: {0}")] 207 ConfigValidation(#[source] ValidationError), 208 209 #[error("Too many virtio-vsock devices")] 210 TooManyVsockDevices, 211 212 #[error("Failed serializing into JSON: {0}")] 213 SerializeJson(#[source] serde_json::Error), 214 215 #[error("Invalid NUMA configuration")] 216 InvalidNumaConfig, 217 218 #[error("Cannot create seccomp filter: {0}")] 219 CreateSeccompFilter(#[source] seccompiler::Error), 220 221 #[error("Cannot apply seccomp filter: {0}")] 222 ApplySeccompFilter(#[source] seccompiler::Error), 223 224 #[error("Failed resizing a memory zone")] 225 ResizeZone, 226 227 #[error("Cannot activate virtio 
devices: {0:?}")] 228 ActivateVirtioDevices(DeviceManagerError), 229 230 #[error("Error triggering power button: {0:?}")] 231 PowerButton(DeviceManagerError), 232 233 #[error("Kernel lacks PVH header")] 234 KernelMissingPvhHeader, 235 236 #[error("Failed to allocate firmware RAM: {0:?}")] 237 AllocateFirmwareMemory(MemoryManagerError), 238 239 #[error("Error manipulating firmware file: {0}")] 240 FirmwareFile(#[source] std::io::Error), 241 242 #[error("Firmware too big")] 243 FirmwareTooLarge, 244 245 #[error("Failed to copy firmware to memory: {0}")] 246 FirmwareLoad(#[source] vm_memory::GuestMemoryError), 247 248 #[cfg(feature = "sev_snp")] 249 #[error("Error enabling SEV-SNP VM: {0}")] 250 InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError), 251 252 #[cfg(feature = "tdx")] 253 #[error("Error performing I/O on TDX firmware file: {0}")] 254 LoadTdvf(#[source] std::io::Error), 255 256 #[cfg(feature = "tdx")] 257 #[error("Error performing I/O on the TDX payload file: {0}")] 258 LoadPayload(#[source] std::io::Error), 259 260 #[cfg(feature = "tdx")] 261 #[error("Error parsing TDVF: {0}")] 262 ParseTdvf(#[source] arch::x86_64::tdx::TdvfError), 263 264 #[cfg(feature = "tdx")] 265 #[error("Error populating TDX HOB: {0}")] 266 PopulateHob(#[source] arch::x86_64::tdx::TdvfError), 267 268 #[cfg(feature = "tdx")] 269 #[error("Error allocating TDVF memory: {0:?}")] 270 AllocatingTdvfMemory(crate::memory_manager::Error), 271 272 #[cfg(feature = "tdx")] 273 #[error("Error enabling TDX VM: {0}")] 274 InitializeTdxVm(#[source] hypervisor::HypervisorVmError), 275 276 #[cfg(feature = "tdx")] 277 #[error("Error enabling TDX memory region: {0}")] 278 InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError), 279 280 #[cfg(feature = "tdx")] 281 #[error("Error finalizing TDX VM: {0}")] 282 FinalizeTdx(#[source] hypervisor::HypervisorVmError), 283 284 #[cfg(feature = "tdx")] 285 #[error("TDX firmware missing")] 286 TdxFirmwareMissing, 287 288 #[cfg(feature = "tdx")] 289 
#[error("Invalid TDX payload type")] 290 InvalidPayloadType, 291 292 #[cfg(feature = "guest_debug")] 293 #[error("Error debugging VM: {0:?}")] 294 Debug(DebuggableError), 295 296 #[error("Error spawning kernel loading thread")] 297 KernelLoadThreadSpawn(std::io::Error), 298 299 #[error("Error joining kernel loading thread")] 300 KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 301 302 #[error("Payload configuration is not bootable")] 303 InvalidPayload, 304 305 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 306 #[error("Error coredumping VM: {0:?}")] 307 Coredump(GuestDebuggableError), 308 } 309 pub type Result<T> = result::Result<T, Error>; 310 311 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] 312 pub enum VmState { 313 Created, 314 Running, 315 Shutdown, 316 Paused, 317 BreakPoint, 318 } 319 320 impl VmState { 321 fn valid_transition(self, new_state: VmState) -> Result<()> { 322 match self { 323 VmState::Created => match new_state { 324 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)), 325 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => { 326 Ok(()) 327 } 328 }, 329 330 VmState::Running => match new_state { 331 VmState::Created | VmState::Running => { 332 Err(Error::InvalidStateTransition(self, new_state)) 333 } 334 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()), 335 }, 336 337 VmState::Shutdown => match new_state { 338 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => { 339 Err(Error::InvalidStateTransition(self, new_state)) 340 } 341 VmState::Running => Ok(()), 342 }, 343 344 VmState::Paused => match new_state { 345 VmState::Created | VmState::Paused | VmState::BreakPoint => { 346 Err(Error::InvalidStateTransition(self, new_state)) 347 } 348 VmState::Running | VmState::Shutdown => Ok(()), 349 }, 350 VmState::BreakPoint => match new_state { 351 VmState::Created | VmState::Running => Ok(()), 
352 _ => Err(Error::InvalidStateTransition(self, new_state)), 353 }, 354 } 355 } 356 } 357 358 struct VmOpsHandler { 359 memory: GuestMemoryAtomic<GuestMemoryMmap>, 360 #[cfg(target_arch = "x86_64")] 361 io_bus: Arc<Bus>, 362 mmio_bus: Arc<Bus>, 363 } 364 365 impl VmOps for VmOpsHandler { 366 fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> { 367 self.memory 368 .memory() 369 .write(buf, GuestAddress(gpa)) 370 .map_err(|e| HypervisorVmError::GuestMemWrite(e.into())) 371 } 372 373 fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> { 374 self.memory 375 .memory() 376 .read(buf, GuestAddress(gpa)) 377 .map_err(|e| HypervisorVmError::GuestMemRead(e.into())) 378 } 379 380 fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 381 if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) { 382 info!("Guest MMIO read to unregistered address 0x{:x}", gpa); 383 } 384 Ok(()) 385 } 386 387 fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 388 match self.mmio_bus.write(gpa, data) { 389 Err(vm_device::BusError::MissingAddressRange) => { 390 info!("Guest MMIO write to unregistered address 0x{:x}", gpa); 391 } 392 Ok(Some(barrier)) => { 393 info!("Waiting for barrier"); 394 barrier.wait(); 395 info!("Barrier released"); 396 } 397 _ => {} 398 }; 399 Ok(()) 400 } 401 402 #[cfg(target_arch = "x86_64")] 403 fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 404 if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) { 405 info!("Guest PIO read to unregistered address 0x{:x}", port); 406 } 407 Ok(()) 408 } 409 410 #[cfg(target_arch = "x86_64")] 411 fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 412 match self.io_bus.write(port, data) { 413 Err(vm_device::BusError::MissingAddressRange) => { 414 
info!("Guest PIO write to unregistered address 0x{:x}", port); 415 } 416 Ok(Some(barrier)) => { 417 info!("Waiting for barrier"); 418 barrier.wait(); 419 info!("Barrier released"); 420 } 421 _ => {} 422 }; 423 Ok(()) 424 } 425 } 426 427 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 { 428 let host_phys_bits = get_host_cpu_phys_bits(hypervisor); 429 430 cmp::min(host_phys_bits, max_phys_bits) 431 } 432 433 pub struct Vm { 434 #[cfg(feature = "tdx")] 435 kernel: Option<File>, 436 initramfs: Option<File>, 437 threads: Vec<thread::JoinHandle<()>>, 438 device_manager: Arc<Mutex<DeviceManager>>, 439 config: Arc<Mutex<VmConfig>>, 440 state: RwLock<VmState>, 441 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 442 memory_manager: Arc<Mutex<MemoryManager>>, 443 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 444 // The hypervisor abstracted virtual machine. 445 vm: Arc<dyn hypervisor::Vm>, 446 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 447 saved_clock: Option<hypervisor::ClockData>, 448 numa_nodes: NumaNodes, 449 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 450 hypervisor: Arc<dyn hypervisor::Hypervisor>, 451 stop_on_boot: bool, 452 load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>, 453 } 454 455 impl Vm { 456 pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH]; 457 458 #[allow(clippy::too_many_arguments)] 459 pub fn new_from_memory_manager( 460 config: Arc<Mutex<VmConfig>>, 461 memory_manager: Arc<Mutex<MemoryManager>>, 462 vm: Arc<dyn hypervisor::Vm>, 463 exit_evt: EventFd, 464 reset_evt: EventFd, 465 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 466 seccomp_action: &SeccompAction, 467 hypervisor: Arc<dyn hypervisor::Hypervisor>, 468 activate_evt: EventFd, 469 timestamp: Instant, 470 serial_pty: Option<PtyPair>, 471 console_pty: Option<PtyPair>, 472 console_resize_pipe: Option<File>, 473 original_termios: 
Arc<Mutex<Option<termios>>>, 474 snapshot: Option<Snapshot>, 475 ) -> Result<Self> { 476 trace_scoped!("Vm::new_from_memory_manager"); 477 478 let boot_id_list = config 479 .lock() 480 .unwrap() 481 .validate() 482 .map_err(Error::ConfigValidation)?; 483 484 let load_payload_handle = if snapshot.is_none() { 485 Self::load_payload_async(&memory_manager, &config)? 486 } else { 487 None 488 }; 489 490 info!("Booting VM from config: {:?}", &config); 491 492 // Create NUMA nodes based on NumaConfig. 493 let numa_nodes = 494 Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?; 495 496 #[cfg(feature = "tdx")] 497 let tdx_enabled = config.lock().unwrap().is_tdx_enabled(); 498 #[cfg(feature = "sev_snp")] 499 let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled(); 500 #[cfg(feature = "tdx")] 501 let force_iommu = tdx_enabled; 502 #[cfg(not(feature = "tdx"))] 503 let force_iommu = false; 504 505 #[cfg(feature = "guest_debug")] 506 let stop_on_boot = config.lock().unwrap().gdb; 507 #[cfg(not(feature = "guest_debug"))] 508 let stop_on_boot = false; 509 510 let memory = memory_manager.lock().unwrap().guest_memory(); 511 #[cfg(target_arch = "x86_64")] 512 let io_bus = Arc::new(Bus::new()); 513 let mmio_bus = Arc::new(Bus::new()); 514 515 let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler { 516 memory, 517 #[cfg(target_arch = "x86_64")] 518 io_bus: io_bus.clone(), 519 mmio_bus: mmio_bus.clone(), 520 }); 521 522 let cpus_config = { &config.lock().unwrap().cpus.clone() }; 523 let cpu_manager = cpu::CpuManager::new( 524 cpus_config, 525 vm.clone(), 526 exit_evt.try_clone().map_err(Error::EventFdClone)?, 527 reset_evt.try_clone().map_err(Error::EventFdClone)?, 528 #[cfg(feature = "guest_debug")] 529 vm_debug_evt, 530 &hypervisor, 531 seccomp_action.clone(), 532 vm_ops, 533 #[cfg(feature = "tdx")] 534 tdx_enabled, 535 &numa_nodes, 536 ) 537 .map_err(Error::CpuManager)?; 538 539 #[cfg(target_arch = "x86_64")] 540 cpu_manager 541 .lock() 542 
.unwrap() 543 .populate_cpuid( 544 &memory_manager, 545 &hypervisor, 546 #[cfg(feature = "tdx")] 547 tdx_enabled, 548 ) 549 .map_err(Error::CpuManager)?; 550 551 // The initial TDX configuration must be done before the vCPUs are 552 // created 553 #[cfg(feature = "tdx")] 554 if tdx_enabled { 555 let cpuid = cpu_manager.lock().unwrap().common_cpuid(); 556 let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32; 557 vm.tdx_init(&cpuid, max_vcpus) 558 .map_err(Error::InitializeTdxVm)?; 559 } 560 561 cpu_manager 562 .lock() 563 .unwrap() 564 .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID)) 565 .map_err(Error::CpuManager)?; 566 567 // This initial SEV-SNP configuration must be done immediately after 568 // vCPUs are created. As part of this initialization we are 569 // transitioning the guest into secure state. 570 #[cfg(feature = "sev_snp")] 571 if sev_snp_enabled { 572 vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?; 573 } 574 575 #[cfg(feature = "tdx")] 576 let dynamic = !tdx_enabled; 577 #[cfg(not(feature = "tdx"))] 578 let dynamic = true; 579 580 let device_manager = DeviceManager::new( 581 #[cfg(target_arch = "x86_64")] 582 io_bus, 583 mmio_bus, 584 hypervisor.hypervisor_type(), 585 vm.clone(), 586 config.clone(), 587 memory_manager.clone(), 588 cpu_manager.clone(), 589 exit_evt.try_clone().map_err(Error::EventFdClone)?, 590 reset_evt, 591 seccomp_action.clone(), 592 numa_nodes.clone(), 593 &activate_evt, 594 force_iommu, 595 boot_id_list, 596 timestamp, 597 snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID), 598 dynamic, 599 ) 600 .map_err(Error::DeviceManager)?; 601 602 device_manager 603 .lock() 604 .unwrap() 605 .create_devices( 606 serial_pty, 607 console_pty, 608 console_resize_pipe, 609 original_termios, 610 ) 611 .map_err(Error::DeviceManager)?; 612 613 #[cfg(feature = "tdx")] 614 let kernel = config 615 .lock() 616 .unwrap() 617 .payload 618 .as_ref() 619 .map(|p| p.kernel.as_ref().map(File::open)) 
620 .unwrap_or_default() 621 .transpose() 622 .map_err(Error::KernelFile)?; 623 624 let initramfs = config 625 .lock() 626 .unwrap() 627 .payload 628 .as_ref() 629 .map(|p| p.initramfs.as_ref().map(File::open)) 630 .unwrap_or_default() 631 .transpose() 632 .map_err(Error::InitramfsFile)?; 633 634 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 635 let saved_clock = if let Some(snapshot) = snapshot.as_ref() { 636 let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?; 637 vm_snapshot.clock 638 } else { 639 None 640 }; 641 642 let vm_state = if snapshot.is_some() { 643 VmState::Paused 644 } else { 645 VmState::Created 646 }; 647 648 Ok(Vm { 649 #[cfg(feature = "tdx")] 650 kernel, 651 initramfs, 652 device_manager, 653 config, 654 threads: Vec::with_capacity(1), 655 state: RwLock::new(vm_state), 656 cpu_manager, 657 memory_manager, 658 vm, 659 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 660 saved_clock, 661 numa_nodes, 662 hypervisor, 663 stop_on_boot, 664 load_payload_handle, 665 }) 666 } 667 668 fn create_numa_nodes( 669 configs: Option<Vec<NumaConfig>>, 670 memory_manager: &Arc<Mutex<MemoryManager>>, 671 ) -> Result<NumaNodes> { 672 let mm = memory_manager.lock().unwrap(); 673 let mm_zones = mm.memory_zones(); 674 let mut numa_nodes = BTreeMap::new(); 675 676 if let Some(configs) = &configs { 677 for config in configs.iter() { 678 if numa_nodes.contains_key(&config.guest_numa_id) { 679 error!("Can't define twice the same NUMA node"); 680 return Err(Error::InvalidNumaConfig); 681 } 682 683 let mut node = NumaNode::default(); 684 685 if let Some(memory_zones) = &config.memory_zones { 686 for memory_zone in memory_zones.iter() { 687 if let Some(mm_zone) = mm_zones.get(memory_zone) { 688 node.memory_regions.extend(mm_zone.regions().clone()); 689 if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() { 690 node.hotplug_regions.push(virtiomem_zone.region().clone()); 691 } 692 node.memory_zones.push(memory_zone.clone()); 693 } else { 694 
error!("Unknown memory zone '{}'", memory_zone); 695 return Err(Error::InvalidNumaConfig); 696 } 697 } 698 } 699 700 if let Some(cpus) = &config.cpus { 701 node.cpus.extend(cpus); 702 } 703 704 if let Some(pci_segments) = &config.pci_segments { 705 node.pci_segments.extend(pci_segments); 706 } 707 708 if let Some(distances) = &config.distances { 709 for distance in distances.iter() { 710 let dest = distance.destination; 711 let dist = distance.distance; 712 713 if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) { 714 error!("Unknown destination NUMA node {}", dest); 715 return Err(Error::InvalidNumaConfig); 716 } 717 718 if node.distances.contains_key(&dest) { 719 error!("Destination NUMA node {} has been already set", dest); 720 return Err(Error::InvalidNumaConfig); 721 } 722 723 node.distances.insert(dest, dist); 724 } 725 } 726 727 #[cfg(target_arch = "x86_64")] 728 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 729 if let Some(sgx_epc_region) = mm.sgx_epc_region() { 730 let mm_sections = sgx_epc_region.epc_sections(); 731 for sgx_epc_section in sgx_epc_sections.iter() { 732 if let Some(mm_section) = mm_sections.get(sgx_epc_section) { 733 node.sgx_epc_sections.push(mm_section.clone()); 734 } else { 735 error!("Unknown SGX EPC section '{}'", sgx_epc_section); 736 return Err(Error::InvalidNumaConfig); 737 } 738 } 739 } else { 740 error!("Missing SGX EPC region"); 741 return Err(Error::InvalidNumaConfig); 742 } 743 } 744 745 numa_nodes.insert(config.guest_numa_id, node); 746 } 747 } 748 749 Ok(numa_nodes) 750 } 751 752 #[allow(clippy::too_many_arguments)] 753 pub fn new( 754 vm_config: Arc<Mutex<VmConfig>>, 755 exit_evt: EventFd, 756 reset_evt: EventFd, 757 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 758 seccomp_action: &SeccompAction, 759 hypervisor: Arc<dyn hypervisor::Hypervisor>, 760 activate_evt: EventFd, 761 serial_pty: Option<PtyPair>, 762 console_pty: Option<PtyPair>, 763 console_resize_pipe: Option<File>, 764 original_termios: 
Arc<Mutex<Option<termios>>>, 765 snapshot: Option<Snapshot>, 766 source_url: Option<&str>, 767 prefault: Option<bool>, 768 ) -> Result<Self> { 769 trace_scoped!("Vm::new"); 770 771 let timestamp = Instant::now(); 772 773 #[cfg(feature = "tdx")] 774 let tdx_enabled = if snapshot.is_some() { 775 false 776 } else { 777 vm_config.lock().unwrap().is_tdx_enabled() 778 }; 779 780 #[cfg(feature = "sev_snp")] 781 let sev_snp_enabled = if snapshot.is_some() { 782 false 783 } else { 784 vm_config.lock().unwrap().is_sev_snp_enabled() 785 }; 786 787 let vm = Self::create_hypervisor_vm( 788 &hypervisor, 789 #[cfg(feature = "tdx")] 790 tdx_enabled, 791 #[cfg(feature = "sev_snp")] 792 sev_snp_enabled, 793 )?; 794 795 let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); 796 797 let memory_manager = if let Some(snapshot) = 798 snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID) 799 { 800 MemoryManager::new_from_snapshot( 801 &snapshot, 802 vm.clone(), 803 &vm_config.lock().unwrap().memory.clone(), 804 source_url, 805 prefault.unwrap(), 806 phys_bits, 807 ) 808 .map_err(Error::MemoryManager)? 809 } else { 810 #[cfg(target_arch = "x86_64")] 811 let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone(); 812 813 MemoryManager::new( 814 vm.clone(), 815 &vm_config.lock().unwrap().memory.clone(), 816 None, 817 phys_bits, 818 #[cfg(feature = "tdx")] 819 tdx_enabled, 820 None, 821 None, 822 #[cfg(target_arch = "x86_64")] 823 sgx_epc_config, 824 ) 825 .map_err(Error::MemoryManager)? 
826 }; 827 828 Vm::new_from_memory_manager( 829 vm_config, 830 memory_manager, 831 vm, 832 exit_evt, 833 reset_evt, 834 #[cfg(feature = "guest_debug")] 835 vm_debug_evt, 836 seccomp_action, 837 hypervisor, 838 activate_evt, 839 timestamp, 840 serial_pty, 841 console_pty, 842 console_resize_pipe, 843 original_termios, 844 snapshot, 845 ) 846 } 847 848 pub fn create_hypervisor_vm( 849 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 850 #[cfg(feature = "tdx")] tdx_enabled: bool, 851 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 852 ) -> Result<Arc<dyn hypervisor::Vm>> { 853 hypervisor.check_required_extensions().unwrap(); 854 855 cfg_if::cfg_if! { 856 if #[cfg(feature = "tdx")] { 857 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true 858 // Otherwise KVM_X86_LEGACY_VM: 0 859 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM 860 let vm = hypervisor 861 .create_vm_with_type(u64::from(tdx_enabled)) 862 .unwrap(); 863 } else if #[cfg(feature = "sev_snp")] { 864 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true 865 // Otherwise SEV_SNP_DISABLED: 0 866 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false 867 let vm = hypervisor 868 .create_vm_with_type(u64::from(sev_snp_enabled)) 869 .unwrap(); 870 } else { 871 let vm = hypervisor.create_vm().unwrap(); 872 } 873 } 874 875 #[cfg(target_arch = "x86_64")] 876 { 877 vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0) 878 .unwrap(); 879 vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap(); 880 vm.enable_split_irq().unwrap(); 881 } 882 883 Ok(vm) 884 } 885 886 fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> { 887 let mut initramfs = self.initramfs.as_ref().unwrap(); 888 let size: usize = initramfs 889 .seek(SeekFrom::End(0)) 890 .map_err(|_| Error::InitramfsLoad)? 
891 .try_into() 892 .unwrap(); 893 initramfs.rewind().map_err(|_| Error::InitramfsLoad)?; 894 895 let address = 896 arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?; 897 let address = GuestAddress(address); 898 899 guest_mem 900 .read_from(address, &mut initramfs, size) 901 .map_err(|_| Error::InitramfsLoad)?; 902 903 info!("Initramfs loaded: address = 0x{:x}", address.0); 904 Ok(arch::InitramfsConfig { address, size }) 905 } 906 907 pub fn generate_cmdline( 908 payload: &PayloadConfig, 909 #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>, 910 ) -> Result<Cmdline> { 911 let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?; 912 if let Some(s) = payload.cmdline.as_ref() { 913 cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?; 914 } 915 916 #[cfg(target_arch = "aarch64")] 917 for entry in device_manager.lock().unwrap().cmdline_additions() { 918 cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?; 919 } 920 Ok(cmdline) 921 } 922 923 #[cfg(target_arch = "aarch64")] 924 fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> { 925 let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash(); 926 let mem = uefi_flash.memory(); 927 arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware) 928 .map_err(Error::UefiLoad)?; 929 Ok(()) 930 } 931 932 #[cfg(target_arch = "aarch64")] 933 fn load_kernel( 934 firmware: Option<File>, 935 kernel: Option<File>, 936 memory_manager: Arc<Mutex<MemoryManager>>, 937 ) -> Result<EntryPoint> { 938 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 939 let mem = guest_memory.memory(); 940 let entry_addr = match (firmware, kernel) { 941 (None, Some(mut kernel)) => { 942 match linux_loader::loader::pe::PE::load( 943 mem.deref(), 944 Some(arch::layout::KERNEL_START), 945 &mut kernel, 946 None, 947 ) { 948 Ok(entry_addr) => entry_addr.kernel_load, 
949 // Try to load the binary as kernel PE file at first. 950 // If failed, retry to load it as UEFI binary. 951 // As the UEFI binary is formatless, it must be the last option to try. 952 Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => { 953 Self::load_firmware(&kernel, memory_manager)?; 954 arch::layout::UEFI_START 955 } 956 Err(e) => { 957 return Err(Error::KernelLoad(e)); 958 } 959 } 960 } 961 (Some(firmware), None) => { 962 Self::load_firmware(&firmware, memory_manager)?; 963 arch::layout::UEFI_START 964 } 965 _ => return Err(Error::InvalidPayload), 966 }; 967 968 Ok(EntryPoint { entry_addr }) 969 } 970 971 #[cfg(target_arch = "x86_64")] 972 fn load_kernel( 973 mut kernel: File, 974 cmdline: Option<Cmdline>, 975 memory_manager: Arc<Mutex<MemoryManager>>, 976 ) -> Result<EntryPoint> { 977 info!("Loading kernel"); 978 979 let mem = { 980 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 981 guest_memory.memory() 982 }; 983 let entry_addr = linux_loader::loader::elf::Elf::load( 984 mem.deref(), 985 None, 986 &mut kernel, 987 Some(arch::layout::HIGH_RAM_START), 988 ) 989 .map_err(Error::KernelLoad)?; 990 991 if let Some(cmdline) = cmdline { 992 linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline) 993 .map_err(Error::LoadCmdLine)?; 994 } 995 996 if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap { 997 // Use the PVH kernel entry point to boot the guest 998 info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0); 999 Ok(EntryPoint { entry_addr }) 1000 } else { 1001 Err(Error::KernelMissingPvhHeader) 1002 } 1003 } 1004 1005 #[cfg(target_arch = "x86_64")] 1006 fn load_payload( 1007 payload: &PayloadConfig, 1008 memory_manager: Arc<Mutex<MemoryManager>>, 1009 ) -> Result<EntryPoint> { 1010 trace_scoped!("load_payload"); 1011 match ( 1012 &payload.firmware, 1013 &payload.kernel, 1014 &payload.initramfs, 1015 &payload.cmdline, 1016 ) { 1017 (Some(firmware), None, None, None) => { 1018 
let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; 1019 Self::load_kernel(firmware, None, memory_manager) 1020 } 1021 (None, Some(kernel), _, _) => { 1022 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1023 let cmdline = Self::generate_cmdline(payload)?; 1024 Self::load_kernel(kernel, Some(cmdline), memory_manager) 1025 } 1026 _ => Err(Error::InvalidPayload), 1027 } 1028 } 1029 1030 #[cfg(target_arch = "aarch64")] 1031 fn load_payload( 1032 payload: &PayloadConfig, 1033 memory_manager: Arc<Mutex<MemoryManager>>, 1034 ) -> Result<EntryPoint> { 1035 match (&payload.firmware, &payload.kernel) { 1036 (Some(firmware), None) => { 1037 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; 1038 Self::load_kernel(Some(firmware), None, memory_manager) 1039 } 1040 (None, Some(kernel)) => { 1041 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1042 Self::load_kernel(None, Some(kernel), memory_manager) 1043 } 1044 _ => Err(Error::InvalidPayload), 1045 } 1046 } 1047 1048 fn load_payload_async( 1049 memory_manager: &Arc<Mutex<MemoryManager>>, 1050 config: &Arc<Mutex<VmConfig>>, 1051 ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> { 1052 // Kernel with TDX is loaded in a different manner 1053 #[cfg(feature = "tdx")] 1054 if config.lock().unwrap().is_tdx_enabled() { 1055 return Ok(None); 1056 } 1057 1058 config 1059 .lock() 1060 .unwrap() 1061 .payload 1062 .as_ref() 1063 .map(|payload| { 1064 let memory_manager = memory_manager.clone(); 1065 let payload = payload.clone(); 1066 1067 std::thread::Builder::new() 1068 .name("payload_loader".into()) 1069 .spawn(move || Self::load_payload(&payload, memory_manager)) 1070 .map_err(Error::KernelLoadThreadSpawn) 1071 }) 1072 .transpose() 1073 } 1074 1075 #[cfg(target_arch = "x86_64")] 1076 fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> { 1077 trace_scoped!("configure_system"); 1078 info!("Configuring system"); 1079 let mem = 
self.memory_manager.lock().unwrap().boot_guest_memory(); 1080 1081 let initramfs_config = match self.initramfs { 1082 Some(_) => Some(self.load_initramfs(&mem)?), 1083 None => None, 1084 }; 1085 1086 let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); 1087 let rsdp_addr = Some(rsdp_addr); 1088 let sgx_epc_region = self 1089 .memory_manager 1090 .lock() 1091 .unwrap() 1092 .sgx_epc_region() 1093 .as_ref() 1094 .cloned(); 1095 1096 let serial_number = self 1097 .config 1098 .lock() 1099 .unwrap() 1100 .platform 1101 .as_ref() 1102 .and_then(|p| p.serial_number.clone()); 1103 1104 let uuid = self 1105 .config 1106 .lock() 1107 .unwrap() 1108 .platform 1109 .as_ref() 1110 .and_then(|p| p.uuid.clone()); 1111 1112 let oem_strings = self 1113 .config 1114 .lock() 1115 .unwrap() 1116 .platform 1117 .as_ref() 1118 .and_then(|p| p.oem_strings.clone()); 1119 1120 let oem_strings = oem_strings 1121 .as_deref() 1122 .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>()); 1123 1124 arch::configure_system( 1125 &mem, 1126 arch::layout::CMDLINE_START, 1127 &initramfs_config, 1128 boot_vcpus, 1129 rsdp_addr, 1130 sgx_epc_region, 1131 serial_number.as_deref(), 1132 uuid.as_deref(), 1133 oem_strings.as_deref(), 1134 ) 1135 .map_err(Error::ConfigureSystem)?; 1136 Ok(()) 1137 } 1138 1139 #[cfg(target_arch = "aarch64")] 1140 fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> { 1141 let cmdline = Self::generate_cmdline( 1142 self.config.lock().unwrap().payload.as_ref().unwrap(), 1143 &self.device_manager, 1144 )?; 1145 let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs(); 1146 let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1147 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1148 let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new(); 1149 let initramfs_config = match self.initramfs { 1150 Some(_) => Some(self.load_initramfs(&mem)?), 1151 None => None, 1152 }; 1153 1154 let 
device_info = &self 1155 .device_manager 1156 .lock() 1157 .unwrap() 1158 .get_device_info() 1159 .clone(); 1160 1161 for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() { 1162 let pci_space = PciSpaceInfo { 1163 pci_segment_id: pci_segment.id, 1164 mmio_config_address: pci_segment.mmio_config_address, 1165 pci_device_space_start: pci_segment.start_of_device_area, 1166 pci_device_space_size: pci_segment.end_of_device_area 1167 - pci_segment.start_of_device_area 1168 + 1, 1169 }; 1170 pci_space_info.push(pci_space); 1171 } 1172 1173 let virtio_iommu_bdf = self 1174 .device_manager 1175 .lock() 1176 .unwrap() 1177 .iommu_attached_devices() 1178 .as_ref() 1179 .map(|(v, _)| *v); 1180 1181 let vgic = self 1182 .device_manager 1183 .lock() 1184 .unwrap() 1185 .get_interrupt_controller() 1186 .unwrap() 1187 .lock() 1188 .unwrap() 1189 .get_vgic() 1190 .map_err(|_| { 1191 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1192 arch::aarch64::Error::SetupGic, 1193 )) 1194 })?; 1195 1196 // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number. 
1197 let pmu_supported = self 1198 .cpu_manager 1199 .lock() 1200 .unwrap() 1201 .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16) 1202 .map_err(|_| { 1203 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1204 arch::aarch64::Error::VcpuInitPmu, 1205 )) 1206 })?; 1207 1208 arch::configure_system( 1209 &mem, 1210 cmdline.as_cstring().unwrap().to_str().unwrap(), 1211 vcpu_mpidrs, 1212 vcpu_topology, 1213 device_info, 1214 &initramfs_config, 1215 &pci_space_info, 1216 virtio_iommu_bdf.map(|bdf| bdf.into()), 1217 &vgic, 1218 &self.numa_nodes, 1219 pmu_supported, 1220 ) 1221 .map_err(Error::ConfigureSystem)?; 1222 1223 Ok(()) 1224 } 1225 1226 pub fn serial_pty(&self) -> Option<PtyPair> { 1227 self.device_manager.lock().unwrap().serial_pty() 1228 } 1229 1230 pub fn console_pty(&self) -> Option<PtyPair> { 1231 self.device_manager.lock().unwrap().console_pty() 1232 } 1233 1234 pub fn console_resize_pipe(&self) -> Option<Arc<File>> { 1235 self.device_manager.lock().unwrap().console_resize_pipe() 1236 } 1237 1238 pub fn shutdown(&mut self) -> Result<()> { 1239 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 1240 let new_state = VmState::Shutdown; 1241 1242 state.valid_transition(new_state)?; 1243 1244 // Wake up the DeviceManager threads so they will get terminated cleanly 1245 self.device_manager 1246 .lock() 1247 .unwrap() 1248 .resume() 1249 .map_err(Error::Resume)?; 1250 1251 self.cpu_manager 1252 .lock() 1253 .unwrap() 1254 .shutdown() 1255 .map_err(Error::CpuManager)?; 1256 1257 // Wait for all the threads to finish 1258 for thread in self.threads.drain(..) { 1259 thread.join().map_err(Error::ThreadCleanup)? 
1260 } 1261 *state = new_state; 1262 1263 event!("vm", "shutdown"); 1264 1265 Ok(()) 1266 } 1267 1268 pub fn resize( 1269 &mut self, 1270 desired_vcpus: Option<u8>, 1271 desired_memory: Option<u64>, 1272 desired_balloon: Option<u64>, 1273 ) -> Result<()> { 1274 event!("vm", "resizing"); 1275 1276 if let Some(desired_vcpus) = desired_vcpus { 1277 if self 1278 .cpu_manager 1279 .lock() 1280 .unwrap() 1281 .resize(desired_vcpus) 1282 .map_err(Error::CpuManager)? 1283 { 1284 self.device_manager 1285 .lock() 1286 .unwrap() 1287 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1288 .map_err(Error::DeviceManager)?; 1289 } 1290 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1291 } 1292 1293 if let Some(desired_memory) = desired_memory { 1294 let new_region = self 1295 .memory_manager 1296 .lock() 1297 .unwrap() 1298 .resize(desired_memory) 1299 .map_err(Error::MemoryManager)?; 1300 1301 let memory_config = &mut self.config.lock().unwrap().memory; 1302 1303 if let Some(new_region) = &new_region { 1304 self.device_manager 1305 .lock() 1306 .unwrap() 1307 .update_memory(new_region) 1308 .map_err(Error::DeviceManager)?; 1309 1310 match memory_config.hotplug_method { 1311 HotplugMethod::Acpi => { 1312 self.device_manager 1313 .lock() 1314 .unwrap() 1315 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1316 .map_err(Error::DeviceManager)?; 1317 } 1318 HotplugMethod::VirtioMem => {} 1319 } 1320 } 1321 1322 // We update the VM config regardless of the actual guest resize 1323 // operation result (happened or not), so that if the VM reboots 1324 // it will be running with the last configure memory size. 
1325 match memory_config.hotplug_method { 1326 HotplugMethod::Acpi => memory_config.size = desired_memory, 1327 HotplugMethod::VirtioMem => { 1328 if desired_memory > memory_config.size { 1329 memory_config.hotplugged_size = Some(desired_memory - memory_config.size); 1330 } else { 1331 memory_config.hotplugged_size = None; 1332 } 1333 } 1334 } 1335 } 1336 1337 if let Some(desired_balloon) = desired_balloon { 1338 self.device_manager 1339 .lock() 1340 .unwrap() 1341 .resize_balloon(desired_balloon) 1342 .map_err(Error::DeviceManager)?; 1343 1344 // Update the configuration value for the balloon size to ensure 1345 // a reboot would use the right value. 1346 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon { 1347 balloon_config.size = desired_balloon; 1348 } 1349 } 1350 1351 event!("vm", "resized"); 1352 1353 Ok(()) 1354 } 1355 1356 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { 1357 let memory_config = &mut self.config.lock().unwrap().memory; 1358 1359 if let Some(zones) = &mut memory_config.zones { 1360 for zone in zones.iter_mut() { 1361 if zone.id == id { 1362 if desired_memory >= zone.size { 1363 let hotplugged_size = desired_memory - zone.size; 1364 self.memory_manager 1365 .lock() 1366 .unwrap() 1367 .resize_zone(&id, desired_memory - zone.size) 1368 .map_err(Error::MemoryManager)?; 1369 // We update the memory zone config regardless of the 1370 // actual 'resize-zone' operation result (happened or 1371 // not), so that if the VM reboots it will be running 1372 // with the last configured memory zone size. 
1373 zone.hotplugged_size = Some(hotplugged_size); 1374 1375 return Ok(()); 1376 } else { 1377 error!( 1378 "Invalid to ask less ({}) than boot RAM ({}) for \ 1379 this memory zone", 1380 desired_memory, zone.size, 1381 ); 1382 return Err(Error::ResizeZone); 1383 } 1384 } 1385 } 1386 } 1387 1388 error!("Could not find the memory zone {} for the resize", id); 1389 Err(Error::ResizeZone) 1390 } 1391 1392 pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1393 let pci_device_info = self 1394 .device_manager 1395 .lock() 1396 .unwrap() 1397 .add_device(&mut device_cfg) 1398 .map_err(Error::DeviceManager)?; 1399 1400 // Update VmConfig by adding the new device. This is important to 1401 // ensure the device would be created in case of a reboot. 1402 { 1403 let mut config = self.config.lock().unwrap(); 1404 add_to_config(&mut config.devices, device_cfg); 1405 } 1406 1407 self.device_manager 1408 .lock() 1409 .unwrap() 1410 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1411 .map_err(Error::DeviceManager)?; 1412 1413 Ok(pci_device_info) 1414 } 1415 1416 pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> { 1417 let pci_device_info = self 1418 .device_manager 1419 .lock() 1420 .unwrap() 1421 .add_user_device(&mut device_cfg) 1422 .map_err(Error::DeviceManager)?; 1423 1424 // Update VmConfig by adding the new device. This is important to 1425 // ensure the device would be created in case of a reboot. 
1426 { 1427 let mut config = self.config.lock().unwrap(); 1428 add_to_config(&mut config.user_devices, device_cfg); 1429 } 1430 1431 self.device_manager 1432 .lock() 1433 .unwrap() 1434 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1435 .map_err(Error::DeviceManager)?; 1436 1437 Ok(pci_device_info) 1438 } 1439 1440 pub fn remove_device(&mut self, id: String) -> Result<()> { 1441 self.device_manager 1442 .lock() 1443 .unwrap() 1444 .remove_device(id.clone()) 1445 .map_err(Error::DeviceManager)?; 1446 1447 // Update VmConfig by removing the device. This is important to 1448 // ensure the device would not be created in case of a reboot. 1449 self.config.lock().unwrap().remove_device(&id); 1450 1451 self.device_manager 1452 .lock() 1453 .unwrap() 1454 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1455 .map_err(Error::DeviceManager)?; 1456 Ok(()) 1457 } 1458 1459 pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> { 1460 let pci_device_info = self 1461 .device_manager 1462 .lock() 1463 .unwrap() 1464 .add_disk(&mut disk_cfg) 1465 .map_err(Error::DeviceManager)?; 1466 1467 // Update VmConfig by adding the new device. This is important to 1468 // ensure the device would be created in case of a reboot. 1469 { 1470 let mut config = self.config.lock().unwrap(); 1471 add_to_config(&mut config.disks, disk_cfg); 1472 } 1473 1474 self.device_manager 1475 .lock() 1476 .unwrap() 1477 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1478 .map_err(Error::DeviceManager)?; 1479 1480 Ok(pci_device_info) 1481 } 1482 1483 pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> { 1484 let pci_device_info = self 1485 .device_manager 1486 .lock() 1487 .unwrap() 1488 .add_fs(&mut fs_cfg) 1489 .map_err(Error::DeviceManager)?; 1490 1491 // Update VmConfig by adding the new device. This is important to 1492 // ensure the device would be created in case of a reboot. 
1493 { 1494 let mut config = self.config.lock().unwrap(); 1495 add_to_config(&mut config.fs, fs_cfg); 1496 } 1497 1498 self.device_manager 1499 .lock() 1500 .unwrap() 1501 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1502 .map_err(Error::DeviceManager)?; 1503 1504 Ok(pci_device_info) 1505 } 1506 1507 pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> { 1508 let pci_device_info = self 1509 .device_manager 1510 .lock() 1511 .unwrap() 1512 .add_pmem(&mut pmem_cfg) 1513 .map_err(Error::DeviceManager)?; 1514 1515 // Update VmConfig by adding the new device. This is important to 1516 // ensure the device would be created in case of a reboot. 1517 { 1518 let mut config = self.config.lock().unwrap(); 1519 add_to_config(&mut config.pmem, pmem_cfg); 1520 } 1521 1522 self.device_manager 1523 .lock() 1524 .unwrap() 1525 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1526 .map_err(Error::DeviceManager)?; 1527 1528 Ok(pci_device_info) 1529 } 1530 1531 pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> { 1532 let pci_device_info = self 1533 .device_manager 1534 .lock() 1535 .unwrap() 1536 .add_net(&mut net_cfg) 1537 .map_err(Error::DeviceManager)?; 1538 1539 // Update VmConfig by adding the new device. This is important to 1540 // ensure the device would be created in case of a reboot. 1541 { 1542 let mut config = self.config.lock().unwrap(); 1543 add_to_config(&mut config.net, net_cfg); 1544 } 1545 1546 self.device_manager 1547 .lock() 1548 .unwrap() 1549 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1550 .map_err(Error::DeviceManager)?; 1551 1552 Ok(pci_device_info) 1553 } 1554 1555 pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> { 1556 let pci_device_info = self 1557 .device_manager 1558 .lock() 1559 .unwrap() 1560 .add_vdpa(&mut vdpa_cfg) 1561 .map_err(Error::DeviceManager)?; 1562 1563 // Update VmConfig by adding the new device. 
This is important to 1564 // ensure the device would be created in case of a reboot. 1565 { 1566 let mut config = self.config.lock().unwrap(); 1567 add_to_config(&mut config.vdpa, vdpa_cfg); 1568 } 1569 1570 self.device_manager 1571 .lock() 1572 .unwrap() 1573 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1574 .map_err(Error::DeviceManager)?; 1575 1576 Ok(pci_device_info) 1577 } 1578 1579 pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> { 1580 let pci_device_info = self 1581 .device_manager 1582 .lock() 1583 .unwrap() 1584 .add_vsock(&mut vsock_cfg) 1585 .map_err(Error::DeviceManager)?; 1586 1587 // Update VmConfig by adding the new device. This is important to 1588 // ensure the device would be created in case of a reboot. 1589 { 1590 let mut config = self.config.lock().unwrap(); 1591 config.vsock = Some(vsock_cfg); 1592 } 1593 1594 self.device_manager 1595 .lock() 1596 .unwrap() 1597 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1598 .map_err(Error::DeviceManager)?; 1599 1600 Ok(pci_device_info) 1601 } 1602 1603 pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> { 1604 Ok(self.device_manager.lock().unwrap().counters()) 1605 } 1606 1607 #[cfg(feature = "tdx")] 1608 fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> { 1609 use arch::x86_64::tdx::*; 1610 1611 let firmware_path = self 1612 .config 1613 .lock() 1614 .unwrap() 1615 .payload 1616 .as_ref() 1617 .unwrap() 1618 .firmware 1619 .clone() 1620 .ok_or(Error::TdxFirmwareMissing)?; 1621 // The TDVF file contains a table of section as well as code 1622 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1623 1624 // For all the sections allocate some RAM backing them 1625 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf) 1626 } 1627 1628 #[cfg(feature = "tdx")] 1629 fn hob_memory_resources( 1630 mut sorted_sections: Vec<TdvfSection>, 1631 guest_memory: 
&GuestMemoryMmap, 1632 ) -> Vec<(u64, u64, bool)> { 1633 let mut list = Vec::new(); 1634 1635 let mut current_section = sorted_sections.pop(); 1636 1637 // RAM regions interleaved with TDVF sections 1638 let mut next_start_addr = 0; 1639 for region in guest_memory.iter() { 1640 let region_start = region.start_addr().0; 1641 let region_end = region.last_addr().0; 1642 if region_start > next_start_addr { 1643 next_start_addr = region_start; 1644 } 1645 1646 loop { 1647 let (start, size, ram) = if let Some(section) = ¤t_section { 1648 if section.address <= next_start_addr { 1649 (section.address, section.size, false) 1650 } else { 1651 let last_addr = std::cmp::min(section.address - 1, region_end); 1652 (next_start_addr, last_addr - next_start_addr + 1, true) 1653 } 1654 } else { 1655 (next_start_addr, region_end - next_start_addr + 1, true) 1656 }; 1657 1658 list.push((start, size, ram)); 1659 1660 if !ram { 1661 current_section = sorted_sections.pop(); 1662 } 1663 1664 next_start_addr = start + size; 1665 1666 if region_start > next_start_addr { 1667 next_start_addr = region_start; 1668 } 1669 1670 if next_start_addr > region_end { 1671 break; 1672 } 1673 } 1674 } 1675 1676 // Once all the interleaved sections have been processed, let's simply 1677 // pull the remaining ones. 
1678 if let Some(section) = current_section { 1679 list.push((section.address, section.size, false)); 1680 } 1681 while let Some(section) = sorted_sections.pop() { 1682 list.push((section.address, section.size, false)); 1683 } 1684 1685 list 1686 } 1687 1688 #[cfg(feature = "tdx")] 1689 fn populate_tdx_sections( 1690 &mut self, 1691 sections: &[TdvfSection], 1692 guid_found: bool, 1693 ) -> Result<Option<u64>> { 1694 use arch::x86_64::tdx::*; 1695 // Get the memory end *before* we start adding TDVF ram regions 1696 let boot_guest_memory = self 1697 .memory_manager 1698 .lock() 1699 .as_ref() 1700 .unwrap() 1701 .boot_guest_memory(); 1702 for section in sections { 1703 // No need to allocate if the section falls within guest RAM ranges 1704 if boot_guest_memory.address_in_range(GuestAddress(section.address)) { 1705 info!( 1706 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM", 1707 section 1708 ); 1709 continue; 1710 } 1711 1712 info!("Allocating TDVF Section: {:x?}", section); 1713 self.memory_manager 1714 .lock() 1715 .unwrap() 1716 .add_ram_region(GuestAddress(section.address), section.size as usize) 1717 .map_err(Error::AllocatingTdvfMemory)?; 1718 } 1719 1720 // The TDVF file contains a table of section as well as code 1721 let firmware_path = self 1722 .config 1723 .lock() 1724 .unwrap() 1725 .payload 1726 .as_ref() 1727 .unwrap() 1728 .firmware 1729 .clone() 1730 .ok_or(Error::TdxFirmwareMissing)?; 1731 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1732 1733 // The guest memory at this point now has all the required regions so it 1734 // is safe to copy from the TDVF file into it. 
1735 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1736 let mem = guest_memory.memory(); 1737 let mut payload_info = None; 1738 let mut hob_offset = None; 1739 for section in sections { 1740 info!("Populating TDVF Section: {:x?}", section); 1741 match section.r#type { 1742 TdvfSectionType::Bfv | TdvfSectionType::Cfv => { 1743 info!("Copying section to guest memory"); 1744 firmware_file 1745 .seek(SeekFrom::Start(section.data_offset as u64)) 1746 .map_err(Error::LoadTdvf)?; 1747 mem.read_from( 1748 GuestAddress(section.address), 1749 &mut firmware_file, 1750 section.data_size as usize, 1751 ) 1752 .unwrap(); 1753 } 1754 TdvfSectionType::TdHob => { 1755 hob_offset = Some(section.address); 1756 } 1757 TdvfSectionType::Payload => { 1758 info!("Copying payload to guest memory"); 1759 if let Some(payload_file) = self.kernel.as_mut() { 1760 let payload_size = payload_file 1761 .seek(SeekFrom::End(0)) 1762 .map_err(Error::LoadPayload)?; 1763 1764 payload_file 1765 .seek(SeekFrom::Start(0x1f1)) 1766 .map_err(Error::LoadPayload)?; 1767 1768 let mut payload_header = linux_loader::bootparam::setup_header::default(); 1769 payload_header 1770 .as_bytes() 1771 .read_from( 1772 0, 1773 payload_file, 1774 mem::size_of::<linux_loader::bootparam::setup_header>(), 1775 ) 1776 .unwrap(); 1777 1778 if payload_header.header != 0x5372_6448 { 1779 return Err(Error::InvalidPayloadType); 1780 } 1781 1782 if (payload_header.version < 0x0200) 1783 || ((payload_header.loadflags & 0x1) == 0x0) 1784 { 1785 return Err(Error::InvalidPayloadType); 1786 } 1787 1788 payload_file.rewind().map_err(Error::LoadPayload)?; 1789 mem.read_from( 1790 GuestAddress(section.address), 1791 payload_file, 1792 payload_size as usize, 1793 ) 1794 .unwrap(); 1795 1796 // Create the payload info that will be inserted into 1797 // the HOB. 
1798 payload_info = Some(PayloadInfo { 1799 image_type: PayloadImageType::BzImage, 1800 entry_point: section.address, 1801 }); 1802 } 1803 } 1804 TdvfSectionType::PayloadParam => { 1805 info!("Copying payload parameters to guest memory"); 1806 let cmdline = Self::generate_cmdline( 1807 self.config.lock().unwrap().payload.as_ref().unwrap(), 1808 )?; 1809 mem.write_slice( 1810 cmdline.as_cstring().unwrap().as_bytes_with_nul(), 1811 GuestAddress(section.address), 1812 ) 1813 .unwrap(); 1814 } 1815 _ => {} 1816 } 1817 } 1818 1819 // Generate HOB 1820 let mut hob = TdHob::start(hob_offset.unwrap()); 1821 1822 let mut sorted_sections = sections.to_vec(); 1823 sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem)); 1824 1825 sorted_sections.sort_by_key(|section| section.address); 1826 sorted_sections.reverse(); 1827 1828 for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) { 1829 hob.add_memory_resource(&mem, start, size, ram, guid_found) 1830 .map_err(Error::PopulateHob)?; 1831 } 1832 1833 // MMIO regions 1834 hob.add_mmio_resource( 1835 &mem, 1836 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1837 arch::layout::APIC_START.raw_value() 1838 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1839 ) 1840 .map_err(Error::PopulateHob)?; 1841 let start_of_device_area = self 1842 .memory_manager 1843 .lock() 1844 .unwrap() 1845 .start_of_device_area() 1846 .raw_value(); 1847 let end_of_device_area = self 1848 .memory_manager 1849 .lock() 1850 .unwrap() 1851 .end_of_device_area() 1852 .raw_value(); 1853 hob.add_mmio_resource( 1854 &mem, 1855 start_of_device_area, 1856 end_of_device_area - start_of_device_area, 1857 ) 1858 .map_err(Error::PopulateHob)?; 1859 1860 // Loop over the ACPI tables and copy them to the HOB. 
1861 1862 for acpi_table in crate::acpi::create_acpi_tables_tdx( 1863 &self.device_manager, 1864 &self.cpu_manager, 1865 &self.memory_manager, 1866 &self.numa_nodes, 1867 ) { 1868 hob.add_acpi_table(&mem, acpi_table.as_slice()) 1869 .map_err(Error::PopulateHob)?; 1870 } 1871 1872 // If a payload info has been created, let's insert it into the HOB. 1873 if let Some(payload_info) = payload_info { 1874 hob.add_payload(&mem, payload_info) 1875 .map_err(Error::PopulateHob)?; 1876 } 1877 1878 hob.finish(&mem).map_err(Error::PopulateHob)?; 1879 1880 Ok(hob_offset) 1881 } 1882 1883 #[cfg(feature = "tdx")] 1884 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 1885 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1886 let mem = guest_memory.memory(); 1887 1888 for section in sections { 1889 self.vm 1890 .tdx_init_memory_region( 1891 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 1892 section.address, 1893 section.size, 1894 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 1895 section.attributes == 1, 1896 ) 1897 .map_err(Error::InitializeTdxMemoryRegion)?; 1898 } 1899 1900 Ok(()) 1901 } 1902 1903 // Creates ACPI tables 1904 // In case of TDX being used, this is a no-op since the tables will be 1905 // created and passed when populating the HOB. 
1906 1907 fn create_acpi_tables(&self) -> Option<GuestAddress> { 1908 #[cfg(feature = "tdx")] 1909 if self.config.lock().unwrap().is_tdx_enabled() { 1910 return None; 1911 } 1912 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 1913 let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); 1914 let rsdp_addr = crate::acpi::create_acpi_tables( 1915 &mem, 1916 &self.device_manager, 1917 &self.cpu_manager, 1918 &self.memory_manager, 1919 &self.numa_nodes, 1920 tpm_enabled, 1921 ); 1922 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 1923 1924 Some(rsdp_addr) 1925 } 1926 1927 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 1928 trace_scoped!("entry_point"); 1929 1930 self.load_payload_handle 1931 .take() 1932 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 1933 .transpose() 1934 } 1935 1936 pub fn boot(&mut self) -> Result<()> { 1937 trace_scoped!("Vm::boot"); 1938 info!("Booting VM"); 1939 event!("vm", "booting"); 1940 let current_state = self.get_state()?; 1941 if current_state == VmState::Paused { 1942 return self.resume().map_err(Error::Resume); 1943 } 1944 1945 let new_state = if self.stop_on_boot { 1946 VmState::BreakPoint 1947 } else { 1948 VmState::Running 1949 }; 1950 current_state.valid_transition(new_state)?; 1951 1952 // Do earlier to parallelise with loading kernel 1953 #[cfg(target_arch = "x86_64")] 1954 let rsdp_addr = self.create_acpi_tables(); 1955 1956 // Load kernel synchronously or if asynchronous then wait for load to 1957 // finish. 
1958 let entry_point = self.entry_point()?; 1959 1960 #[cfg(feature = "tdx")] 1961 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 1962 1963 // Configure the vcpus that have been created 1964 let vcpus = self.cpu_manager.lock().unwrap().vcpus(); 1965 for vcpu in vcpus { 1966 let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1967 let boot_setup = entry_point.map(|e| (e, guest_memory)); 1968 self.cpu_manager 1969 .lock() 1970 .unwrap() 1971 .configure_vcpu(vcpu, boot_setup) 1972 .map_err(Error::CpuManager)?; 1973 } 1974 1975 #[cfg(feature = "tdx")] 1976 let (sections, guid_found) = if tdx_enabled { 1977 self.extract_tdvf_sections()? 1978 } else { 1979 (Vec::new(), false) 1980 }; 1981 1982 // Configuring the TDX regions requires that the vCPUs are created. 1983 #[cfg(feature = "tdx")] 1984 let hob_address = if tdx_enabled { 1985 // TDX sections are written to memory. 1986 self.populate_tdx_sections(§ions, guid_found)? 1987 } else { 1988 None 1989 }; 1990 1991 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 1992 // available after they are configured 1993 #[cfg(target_arch = "aarch64")] 1994 let rsdp_addr = self.create_acpi_tables(); 1995 1996 // Configure shared state based on loaded kernel 1997 entry_point 1998 .map(|_| { 1999 // Safe to unwrap rsdp_addr as we know it can't be None when 2000 // the entry_point is Some. 2001 self.configure_system(rsdp_addr.unwrap()) 2002 }) 2003 .transpose()?; 2004 2005 #[cfg(target_arch = "x86_64")] 2006 // Note: For x86, always call this function before invoking start boot vcpus. 2007 // Otherwise guest would fail to boot because we haven't created the 2008 // userspace mappings to update the hypervisor about the memory mappings. 2009 // These mappings must be created before we start the vCPU threads for 2010 // the very first time. 
2011 self.memory_manager 2012 .lock() 2013 .unwrap() 2014 .allocate_address_space() 2015 .map_err(Error::MemoryManager)?; 2016 2017 #[cfg(feature = "tdx")] 2018 if let Some(hob_address) = hob_address { 2019 // With the HOB address extracted the vCPUs can have 2020 // their TDX state configured. 2021 self.cpu_manager 2022 .lock() 2023 .unwrap() 2024 .initialize_tdx(hob_address) 2025 .map_err(Error::CpuManager)?; 2026 // Let the hypervisor know which memory ranges are shared with the 2027 // guest. This prevents the guest from ignoring/discarding memory 2028 // regions provided by the host. 2029 self.init_tdx_memory(§ions)?; 2030 // With TDX memory and CPU state configured TDX setup is complete 2031 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2032 } 2033 2034 self.cpu_manager 2035 .lock() 2036 .unwrap() 2037 .start_boot_vcpus(new_state == VmState::BreakPoint) 2038 .map_err(Error::CpuManager)?; 2039 2040 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2041 *state = new_state; 2042 event!("vm", "booted"); 2043 Ok(()) 2044 } 2045 2046 pub fn restore(&mut self) -> Result<()> { 2047 event!("vm", "restoring"); 2048 2049 #[cfg(target_arch = "x86_64")] 2050 // Note: For x86, always call this function before invoking start boot vcpus. 2051 // Otherwise guest would fail to boot because we haven't created the 2052 // userspace mappings to update the hypervisor about the memory mappings. 2053 // These mappings must be created before we start the vCPU threads for 2054 // the very first time for the restored VM. 2055 self.memory_manager 2056 .lock() 2057 .unwrap() 2058 .allocate_address_space() 2059 .map_err(Error::MemoryManager)?; 2060 2061 // Now we can start all vCPUs from here. 2062 self.cpu_manager 2063 .lock() 2064 .unwrap() 2065 .start_restored_vcpus() 2066 .map_err(Error::CpuManager)?; 2067 2068 event!("vm", "restored"); 2069 Ok(()) 2070 } 2071 2072 /// Gets a thread-safe reference counted pointer to the VM configuration. 
2073 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2074 Arc::clone(&self.config) 2075 } 2076 2077 /// Get the VM state. Returns an error if the state is poisoned. 2078 pub fn get_state(&self) -> Result<VmState> { 2079 self.state 2080 .try_read() 2081 .map_err(|_| Error::PoisonedState) 2082 .map(|state| *state) 2083 } 2084 2085 /// Gets the actual size of the balloon. 2086 pub fn balloon_size(&self) -> u64 { 2087 self.device_manager.lock().unwrap().balloon_size() 2088 } 2089 2090 pub fn send_memory_fds( 2091 &mut self, 2092 socket: &mut UnixStream, 2093 ) -> std::result::Result<(), MigratableError> { 2094 for (slot, fd) in self 2095 .memory_manager 2096 .lock() 2097 .unwrap() 2098 .memory_slot_fds() 2099 .drain() 2100 { 2101 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2102 .write_to(socket) 2103 .map_err(|e| { 2104 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2105 })?; 2106 socket 2107 .send_with_fd(&slot.to_le_bytes()[..], fd) 2108 .map_err(|e| { 2109 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2110 })?; 2111 2112 let res = Response::read_from(socket)?; 2113 if res.status() != Status::Ok { 2114 warn!("Error during memory fd migration"); 2115 Request::abandon().write_to(socket)?; 2116 Response::read_from(socket).ok(); 2117 return Err(MigratableError::MigrateSend(anyhow!( 2118 "Error during memory fd migration" 2119 ))); 2120 } 2121 } 2122 2123 Ok(()) 2124 } 2125 2126 pub fn send_memory_regions<F>( 2127 &mut self, 2128 ranges: &MemoryRangeTable, 2129 fd: &mut F, 2130 ) -> std::result::Result<(), MigratableError> 2131 where 2132 F: Write, 2133 { 2134 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2135 let mem = guest_memory.memory(); 2136 2137 for range in ranges.regions() { 2138 let mut offset: u64 = 0; 2139 // Here we are manually handling the retry in case we can't the 2140 // whole region at once because we can't use the implementation 2141 // from 
vm-memory::GuestMemory of write_all_to() as it is not 2142 // following the correct behavior. For more info about this issue 2143 // see: https://github.com/rust-vmm/vm-memory/issues/174 2144 loop { 2145 let bytes_written = mem 2146 .write_to( 2147 GuestAddress(range.gpa + offset), 2148 fd, 2149 (range.length - offset) as usize, 2150 ) 2151 .map_err(|e| { 2152 MigratableError::MigrateSend(anyhow!( 2153 "Error transferring memory to socket: {}", 2154 e 2155 )) 2156 })?; 2157 offset += bytes_written as u64; 2158 2159 if offset == range.length { 2160 break; 2161 } 2162 } 2163 } 2164 2165 Ok(()) 2166 } 2167 2168 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2169 self.memory_manager 2170 .lock() 2171 .unwrap() 2172 .memory_range_table(false) 2173 } 2174 2175 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2176 self.device_manager.lock().unwrap().device_tree() 2177 } 2178 2179 pub fn activate_virtio_devices(&self) -> Result<()> { 2180 self.device_manager 2181 .lock() 2182 .unwrap() 2183 .activate_virtio_devices() 2184 .map_err(Error::ActivateVirtioDevices) 2185 } 2186 2187 #[cfg(target_arch = "x86_64")] 2188 pub fn power_button(&self) -> Result<()> { 2189 return self 2190 .device_manager 2191 .lock() 2192 .unwrap() 2193 .notify_power_button() 2194 .map_err(Error::PowerButton); 2195 } 2196 2197 #[cfg(target_arch = "aarch64")] 2198 pub fn power_button(&self) -> Result<()> { 2199 self.device_manager 2200 .lock() 2201 .unwrap() 2202 .notify_power_button() 2203 .map_err(Error::PowerButton) 2204 } 2205 2206 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2207 self.memory_manager.lock().unwrap().snapshot_data() 2208 } 2209 2210 #[cfg(feature = "guest_debug")] 2211 pub fn debug_request( 2212 &mut self, 2213 gdb_request: &GdbRequestPayload, 2214 cpu_id: usize, 2215 ) -> Result<GdbResponsePayload> { 2216 use GdbRequestPayload::*; 2217 match gdb_request { 2218 SetSingleStep(single_step) => { 2219 
self.set_guest_debug(cpu_id, &[], *single_step) 2220 .map_err(Error::Debug)?; 2221 } 2222 SetHwBreakPoint(addrs) => { 2223 self.set_guest_debug(cpu_id, addrs, false) 2224 .map_err(Error::Debug)?; 2225 } 2226 Pause => { 2227 self.debug_pause().map_err(Error::Debug)?; 2228 } 2229 Resume => { 2230 self.debug_resume().map_err(Error::Debug)?; 2231 } 2232 ReadRegs => { 2233 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?; 2234 return Ok(GdbResponsePayload::RegValues(Box::new(regs))); 2235 } 2236 WriteRegs(regs) => { 2237 self.write_regs(cpu_id, regs).map_err(Error::Debug)?; 2238 } 2239 ReadMem(vaddr, len) => { 2240 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2241 let mem = self 2242 .read_mem(&guest_memory, cpu_id, *vaddr, *len) 2243 .map_err(Error::Debug)?; 2244 return Ok(GdbResponsePayload::MemoryRegion(mem)); 2245 } 2246 WriteMem(vaddr, data) => { 2247 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2248 self.write_mem(&guest_memory, cpu_id, vaddr, data) 2249 .map_err(Error::Debug)?; 2250 } 2251 ActiveVcpus => { 2252 let active_vcpus = self.active_vcpus(); 2253 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus)); 2254 } 2255 } 2256 Ok(GdbResponsePayload::CommandComplete) 2257 } 2258 2259 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2260 fn get_dump_state( 2261 &mut self, 2262 destination_url: &str, 2263 ) -> std::result::Result<DumpState, GuestDebuggableError> { 2264 let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32; 2265 let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize; 2266 let mut elf_phdr_num = 1; 2267 let elf_sh_info = 0; 2268 let coredump_file_path = url_to_file(destination_url)?; 2269 let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings(); 2270 2271 if mapping_num < UINT16_MAX - 2 { 2272 elf_phdr_num += mapping_num as u16; 2273 } else { 2274 panic!("mapping num beyond 65535 not supported"); 
        }

        // Refuse to overwrite an existing file: create_new() fails if the
        // destination already exists.
        let coredump_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(coredump_file_path)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
        let mem_data = self
            .memory_manager
            .lock()
            .unwrap()
            .coredump_memory_regions(mem_offset);

        Ok(DumpState {
            elf_note_size,
            elf_phdr_num,
            elf_sh_info,
            mem_offset,
            mem_info: Some(mem_data),
            file: Some(coredump_file),
        })
    }

    /// Offset of the memory contents inside the coredump file: ELF header,
    /// then the notes, then one program header per mapping.
    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
        size_of::<elf::Elf64_Ehdr>() as u64
            + note_size as u64
            + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
    }
}

impl Pausable for Vm {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "pausing");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        // Validate the transition before touching any subsystem.
        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        // Save the guest clock (with flags cleared) so resume() can restore it.
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
            clock.reset_flags();
            self.saved_clock = Some(clock);
        }

        // Before pausing the vCPUs activate any pending virtio devices that
        // might need activation between starting the pause (or e.g. a
        // migration it's part of)
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        // vCPUs first, then devices.
        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;
        // Restore the guest clock saved by pause(), if any.
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }
        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
        *state = new_state;
        event!("vm", "resumed");
        Ok(())
    }
}

/// Serialized portion of the VM's own state stored under the "vm" snapshot id.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    // Guest clock captured by Pausable::pause() (KVM/x86_64 only).
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub clock: Option<hypervisor::ClockData>,
    // Common CPUID entries, regenerated at snapshot time.
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}

pub const VM_SNAPSHOT_ID: &str = "vm";

impl Snapshottable for Vm {
    fn id(&self) -> String {
        VM_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        event!("vm", "snapshotting");

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        // Snapshotting a TDX guest is not supported.
        #[cfg(feature = "tdx")]
        {
            if tdx_enabled {
                return Err(MigratableError::Snapshot(anyhow!(
                    "Snapshot not possible with TDX VM"
                )));
            }
        }

        // The VM must have been paused before it can be snapshotted.
        let current_state = self.get_state().unwrap();
        if current_state != VmState::Paused {
            return Err(MigratableError::Snapshot(anyhow!(
                "Trying to snapshot while VM is running"
            )));
        }

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let common_cpuid = {
            let amx = self.config.lock().unwrap().cpus.features.amx;
            let phys_bits = physical_bits(
                &self.hypervisor,
                self.config.lock().unwrap().cpus.max_phys_bits,
            );
            arch::generate_common_cpuid(
                &self.hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections: None,
                    phys_bits,
                    kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx: tdx_enabled,
                    amx,
                },
            )
            .map_err(|e| {
                MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
            })?
2433 }; 2434 2435 let vm_snapshot_data = serde_json::to_vec(&VmSnapshot { 2436 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2437 clock: self.saved_clock, 2438 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2439 common_cpuid, 2440 }) 2441 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2442 2443 let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data)); 2444 2445 let (id, snapshot) = { 2446 let mut cpu_manager = self.cpu_manager.lock().unwrap(); 2447 (cpu_manager.id(), cpu_manager.snapshot()?) 2448 }; 2449 vm_snapshot.add_snapshot(id, snapshot); 2450 let (id, snapshot) = { 2451 let mut memory_manager = self.memory_manager.lock().unwrap(); 2452 (memory_manager.id(), memory_manager.snapshot()?) 2453 }; 2454 vm_snapshot.add_snapshot(id, snapshot); 2455 let (id, snapshot) = { 2456 let mut device_manager = self.device_manager.lock().unwrap(); 2457 (device_manager.id(), device_manager.snapshot()?) 2458 }; 2459 vm_snapshot.add_snapshot(id, snapshot); 2460 2461 event!("vm", "snapshotted"); 2462 Ok(vm_snapshot) 2463 } 2464 } 2465 2466 impl Transportable for Vm { 2467 fn send( 2468 &self, 2469 snapshot: &Snapshot, 2470 destination_url: &str, 2471 ) -> std::result::Result<(), MigratableError> { 2472 let mut snapshot_config_path = url_to_path(destination_url)?; 2473 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2474 2475 // Create the snapshot config file 2476 let mut snapshot_config_file = OpenOptions::new() 2477 .read(true) 2478 .write(true) 2479 .create_new(true) 2480 .open(snapshot_config_path) 2481 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2482 2483 // Serialize and write the snapshot config 2484 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2485 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2486 2487 snapshot_config_file 2488 .write(vm_config.as_bytes()) 2489 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2490 2491 let mut snapshot_state_path = url_to_path(destination_url)?; 
2492 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2493 2494 // Create the snapshot state file 2495 let mut snapshot_state_file = OpenOptions::new() 2496 .read(true) 2497 .write(true) 2498 .create_new(true) 2499 .open(snapshot_state_path) 2500 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2501 2502 // Serialize and write the snapshot state 2503 let vm_state = 2504 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2505 2506 snapshot_state_file 2507 .write(&vm_state) 2508 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2509 2510 // Tell the memory manager to also send/write its own snapshot. 2511 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2512 self.memory_manager 2513 .lock() 2514 .unwrap() 2515 .send(&memory_manager_snapshot.clone(), destination_url)?; 2516 } else { 2517 return Err(MigratableError::Restore(anyhow!( 2518 "Missing memory manager snapshot" 2519 ))); 2520 } 2521 2522 Ok(()) 2523 } 2524 } 2525 2526 impl Migratable for Vm { 2527 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2528 self.memory_manager.lock().unwrap().start_dirty_log()?; 2529 self.device_manager.lock().unwrap().start_dirty_log() 2530 } 2531 2532 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2533 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2534 self.device_manager.lock().unwrap().stop_dirty_log() 2535 } 2536 2537 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2538 Ok(MemoryRangeTable::new_from_tables(vec![ 2539 self.memory_manager.lock().unwrap().dirty_log()?, 2540 self.device_manager.lock().unwrap().dirty_log()?, 2541 ])) 2542 } 2543 2544 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2545 self.memory_manager.lock().unwrap().start_migration()?; 2546 self.device_manager.lock().unwrap().start_migration() 2547 } 2548 2549 fn complete_migration(&mut self) -> 
std::result::Result<(), MigratableError> { 2550 self.memory_manager.lock().unwrap().complete_migration()?; 2551 self.device_manager.lock().unwrap().complete_migration() 2552 } 2553 } 2554 2555 #[cfg(feature = "guest_debug")] 2556 impl Debuggable for Vm { 2557 fn set_guest_debug( 2558 &self, 2559 cpu_id: usize, 2560 addrs: &[GuestAddress], 2561 singlestep: bool, 2562 ) -> std::result::Result<(), DebuggableError> { 2563 self.cpu_manager 2564 .lock() 2565 .unwrap() 2566 .set_guest_debug(cpu_id, addrs, singlestep) 2567 } 2568 2569 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2570 if *self.state.read().unwrap() == VmState::Running { 2571 self.pause().map_err(DebuggableError::Pause)?; 2572 } 2573 2574 let mut state = self 2575 .state 2576 .try_write() 2577 .map_err(|_| DebuggableError::PoisonedState)?; 2578 *state = VmState::BreakPoint; 2579 Ok(()) 2580 } 2581 2582 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2583 if *self.state.read().unwrap() == VmState::BreakPoint { 2584 self.resume().map_err(DebuggableError::Pause)?; 2585 } 2586 2587 Ok(()) 2588 } 2589 2590 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2591 self.cpu_manager.lock().unwrap().read_regs(cpu_id) 2592 } 2593 2594 fn write_regs( 2595 &self, 2596 cpu_id: usize, 2597 regs: &CoreRegs, 2598 ) -> std::result::Result<(), DebuggableError> { 2599 self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs) 2600 } 2601 2602 fn read_mem( 2603 &self, 2604 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2605 cpu_id: usize, 2606 vaddr: GuestAddress, 2607 len: usize, 2608 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2609 self.cpu_manager 2610 .lock() 2611 .unwrap() 2612 .read_mem(guest_memory, cpu_id, vaddr, len) 2613 } 2614 2615 fn write_mem( 2616 &self, 2617 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2618 cpu_id: usize, 2619 vaddr: &GuestAddress, 2620 data: &[u8], 2621 ) -> std::result::Result<(), 
DebuggableError> { 2622 self.cpu_manager 2623 .lock() 2624 .unwrap() 2625 .write_mem(guest_memory, cpu_id, vaddr, data) 2626 } 2627 2628 fn active_vcpus(&self) -> usize { 2629 let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus(); 2630 if active_vcpus > 0 { 2631 active_vcpus 2632 } else { 2633 // The VM is not booted yet. Report boot_vcpus() instead. 2634 self.cpu_manager.lock().unwrap().boot_vcpus() as usize 2635 } 2636 } 2637 } 2638 2639 #[cfg(feature = "guest_debug")] 2640 pub const UINT16_MAX: u32 = 65535; 2641 2642 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2643 impl Elf64Writable for Vm {} 2644 2645 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2646 impl GuestDebuggable for Vm { 2647 fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> { 2648 event!("vm", "coredumping"); 2649 2650 let mut resume = false; 2651 2652 #[cfg(feature = "tdx")] 2653 { 2654 if let Some(ref platform) = self.config.lock().unwrap().platform { 2655 if platform.tdx { 2656 return Err(GuestDebuggableError::Coredump(anyhow!( 2657 "Coredump not possible with TDX VM" 2658 ))); 2659 } 2660 } 2661 } 2662 2663 match self.get_state().unwrap() { 2664 VmState::Running => { 2665 self.pause().map_err(GuestDebuggableError::Pause)?; 2666 resume = true; 2667 } 2668 VmState::Paused => {} 2669 _ => { 2670 return Err(GuestDebuggableError::Coredump(anyhow!( 2671 "Trying to coredump while VM is not running or paused" 2672 ))); 2673 } 2674 } 2675 2676 let coredump_state = self.get_dump_state(destination_url)?; 2677 2678 self.write_header(&coredump_state)?; 2679 self.write_note(&coredump_state)?; 2680 self.write_loads(&coredump_state)?; 2681 2682 self.cpu_manager 2683 .lock() 2684 .unwrap() 2685 .cpu_write_elf64_note(&coredump_state)?; 2686 self.cpu_manager 2687 .lock() 2688 .unwrap() 2689 .cpu_write_vmm_note(&coredump_state)?; 2690 2691 self.memory_manager 2692 .lock() 2693 .unwrap() 2694 
.coredump_iterate_save_mem(&coredump_state)?; 2695 2696 if resume { 2697 self.resume().map_err(GuestDebuggableError::Resume)?; 2698 } 2699 2700 Ok(()) 2701 } 2702 } 2703 2704 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2705 #[cfg(test)] 2706 mod tests { 2707 use super::*; 2708 2709 fn test_vm_state_transitions(state: VmState) { 2710 match state { 2711 VmState::Created => { 2712 // Check the transitions from Created 2713 assert!(state.valid_transition(VmState::Created).is_err()); 2714 assert!(state.valid_transition(VmState::Running).is_ok()); 2715 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2716 assert!(state.valid_transition(VmState::Paused).is_ok()); 2717 assert!(state.valid_transition(VmState::BreakPoint).is_ok()); 2718 } 2719 VmState::Running => { 2720 // Check the transitions from Running 2721 assert!(state.valid_transition(VmState::Created).is_err()); 2722 assert!(state.valid_transition(VmState::Running).is_err()); 2723 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2724 assert!(state.valid_transition(VmState::Paused).is_ok()); 2725 assert!(state.valid_transition(VmState::BreakPoint).is_ok()); 2726 } 2727 VmState::Shutdown => { 2728 // Check the transitions from Shutdown 2729 assert!(state.valid_transition(VmState::Created).is_err()); 2730 assert!(state.valid_transition(VmState::Running).is_ok()); 2731 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2732 assert!(state.valid_transition(VmState::Paused).is_err()); 2733 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2734 } 2735 VmState::Paused => { 2736 // Check the transitions from Paused 2737 assert!(state.valid_transition(VmState::Created).is_err()); 2738 assert!(state.valid_transition(VmState::Running).is_ok()); 2739 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2740 assert!(state.valid_transition(VmState::Paused).is_err()); 2741 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2742 } 2743 VmState::BreakPoint => { 
                // Check the transitions from Breakpoint
                assert!(state.valid_transition(VmState::Created).is_ok());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        // Expected triples appear to be (address, size, is_ram) -- the `true`
        // entries line up with the RAM ranges in each case below.
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    const LEN: u64 = 4096;

    #[test]
    fn test_create_fdt_with_devices() {
        // One RAM region big enough to hold the FDT.
        let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        // Three MMIO devices (serial, virtio, rtc) at consecutive LEN-sized
        // windows with distinct IRQ lines.
        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");
        assert!(create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .is_ok())
    }
}

// Smoke test: run a tiny real-mode program in a KVM guest and check it
// performs the expected port I/O and halts.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor.
    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Zero the code segment base/selector so RIP addresses the code directly.
    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    // Bit 1 of RFLAGS is architecturally reserved and must be set.
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    // Run until the guest executes HLT (reported here as VmExit::Reset),
    // printing each port write along the way.
    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}