1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::{ 15 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 16 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 17 }; 18 use crate::config::{NumaConfig, PayloadConfig}; 19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 20 use crate::coredump::{ 21 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 22 }; 23 use crate::cpu; 24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair}; 25 use crate::device_tree::DeviceTree; 26 #[cfg(feature = "guest_debug")] 27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 28 use crate::memory_manager::{ 29 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 30 }; 31 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 32 use crate::migration::get_vm_snapshot; 33 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 34 use crate::migration::url_to_file; 35 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 36 use crate::GuestMemoryMmap; 37 use crate::{ 38 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 39 }; 40 use anyhow::anyhow; 41 use arch::get_host_cpu_phys_bits; 42 #[cfg(target_arch = "x86_64")] 43 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 44 #[cfg(feature = "tdx")] 45 use arch::x86_64::tdx::TdvfSection; 46 use arch::EntryPoint; 47 #[cfg(target_arch = "aarch64")] 48 use 
arch::PciSpaceInfo; 49 use arch::{NumaNode, NumaNodes}; 50 #[cfg(target_arch = "aarch64")] 51 use devices::interrupt_controller; 52 use devices::AcpiNotificationFlags; 53 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 54 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 55 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 56 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 57 use hypervisor::{HypervisorVmError, VmOps}; 58 use libc::{termios, SIGWINCH}; 59 use linux_loader::cmdline::Cmdline; 60 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 61 use linux_loader::elf; 62 #[cfg(target_arch = "x86_64")] 63 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 64 #[cfg(target_arch = "aarch64")] 65 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 66 use linux_loader::loader::KernelLoader; 67 use seccompiler::SeccompAction; 68 use serde::{Deserialize, Serialize}; 69 use std::cmp; 70 use std::collections::BTreeMap; 71 use std::collections::HashMap; 72 use std::convert::TryInto; 73 use std::fs::{File, OpenOptions}; 74 use std::io::{self, Seek, SeekFrom, Write}; 75 #[cfg(feature = "tdx")] 76 use std::mem; 77 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 78 use std::mem::size_of; 79 use std::num::Wrapping; 80 use std::ops::Deref; 81 use std::os::unix::net::UnixStream; 82 use std::sync::{Arc, Mutex, RwLock}; 83 use std::time::Instant; 84 use std::{result, str, thread}; 85 use thiserror::Error; 86 use tracer::trace_scoped; 87 use vm_device::Bus; 88 #[cfg(feature = "tdx")] 89 use vm_memory::{Address, ByteValued, GuestMemory, GuestMemoryRegion}; 90 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic}; 91 use vm_migration::protocol::{Request, Response, Status}; 92 use vm_migration::{ 93 protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, 94 SnapshotData, Snapshottable, Transportable, 95 }; 96 use 
vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::sock_ctrl_msg::ScmSocket;

/// Errors associated with VM management
#[derive(Debug, Error)]
pub enum Error {
    #[error("Cannot open kernel file: {0}")]
    KernelFile(#[source] io::Error),

    #[error("Cannot open initramfs file: {0}")]
    InitramfsFile(#[source] io::Error),

    #[error("Cannot load the kernel into memory: {0}")]
    KernelLoad(#[source] linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot load the UEFI binary in memory: {0:?}")]
    UefiLoad(arch::aarch64::uefi::Error),

    #[error("Cannot load the initramfs into memory")]
    InitramfsLoad,

    #[error("Cannot load the kernel command line in memory: {0}")]
    LoadCmdLine(#[source] linux_loader::loader::Error),

    #[error("Cannot modify the kernel command line: {0}")]
    CmdLineInsertStr(#[source] linux_loader::cmdline::Error),

    #[error("Cannot create the kernel command line: {0}")]
    CmdLineCreate(#[source] linux_loader::cmdline::Error),

    #[error("Cannot configure system: {0}")]
    ConfigureSystem(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot enable interrupt controller: {0:?}")]
    EnableInterruptController(interrupt_controller::Error),

    // A lock (RwLock/Mutex) guarding VM state was poisoned by a panicking holder.
    #[error("VM state is poisoned")]
    PoisonedState,

    #[error("Error from device manager: {0:?}")]
    DeviceManager(DeviceManagerError),

    #[error("No device with id {0:?} to remove")]
    NoDeviceToRemove(String),

    #[error("Cannot spawn a signal handler thread: {0}")]
    SignalHandlerSpawn(#[source] io::Error),

    // Payload carried by JoinHandle::join() on failure; boxed Any per std's API.
    #[error("Failed to join on threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("VM config is missing")]
    VmMissingConfig,

    #[error("VM is not created")]
    VmNotCreated,

    #[error("VM is already created")]
    VmAlreadyCreated,

    #[error("VM is not running")]
    VmNotRunning,

    #[error("Cannot clone EventFd: {0}")]
    EventFdClone(#[source] io::Error),

    // Carries the (from, to) pair rejected by VmState::valid_transition().
    #[error("invalid VM state transition: {0:?} to {1:?}")]
    InvalidStateTransition(VmState, VmState),

    #[error("Error from CPU manager: {0}")]
    CpuManager(#[source] cpu::Error),

    #[error("Cannot pause devices: {0}")]
    PauseDevices(#[source] MigratableError),

    #[error("Cannot resume devices: {0}")]
    ResumeDevices(#[source] MigratableError),

    #[error("Cannot pause CPUs: {0}")]
    PauseCpus(#[source] MigratableError),

    #[error("Cannot resume cpus: {0}")]
    ResumeCpus(#[source] MigratableError),

    #[error("Cannot pause VM: {0}")]
    Pause(#[source] MigratableError),

    #[error("Cannot resume VM: {0}")]
    Resume(#[source] MigratableError),

    #[error("Memory manager error: {0:?}")]
    MemoryManager(MemoryManagerError),

    #[error("Eventfd write error: {0}")]
    EventfdError(#[source] std::io::Error),

    #[error("Cannot snapshot VM: {0}")]
    Snapshot(#[source] MigratableError),

    #[error("Cannot restore VM: {0}")]
    Restore(#[source] MigratableError),

    #[error("Cannot send VM snapshot: {0}")]
    SnapshotSend(#[source] MigratableError),

    #[error("Invalid restore source URL")]
    InvalidRestoreSourceUrl,

    #[error("Failed to validate config: {0}")]
    ConfigValidation(#[source] ValidationError),

    #[error("Too many virtio-vsock devices")]
    TooManyVsockDevices,

    #[error("Failed serializing into JSON: {0}")]
    SerializeJson(#[source] serde_json::Error),

    #[error("Invalid NUMA configuration")]
    InvalidNumaConfig,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Failed resizing a memory zone")]
    ResizeZone,

    #[error("Cannot activate virtio devices: {0:?}")]
    ActivateVirtioDevices(DeviceManagerError),

    #[error("Error triggering power button: {0:?}")]
    PowerButton(DeviceManagerError),

    // Direct kernel boot on x86_64 requires a PVH entry point (see load_kernel).
    #[error("Kernel lacks PVH header")]
    KernelMissingPvhHeader,

    #[error("Failed to allocate firmware RAM: {0:?}")]
    AllocateFirmwareMemory(MemoryManagerError),

    #[error("Error manipulating firmware file: {0}")]
    FirmwareFile(#[source] std::io::Error),

    #[error("Firmware too big")]
    FirmwareTooLarge,

    #[error("Failed to copy firmware to memory: {0}")]
    FirmwareLoad(#[source] vm_memory::GuestMemoryError),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on TDX firmware file: {0}")]
    LoadTdvf(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on the TDX payload file: {0}")]
    LoadPayload(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error parsing TDVF: {0}")]
    ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error populating TDX HOB: {0}")]
    PopulateHob(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error allocating TDVF memory: {0:?}")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX VM: {0}")]
    InitializeTdxVm(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX memory region: {0}")]
    InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error finalizing TDX VM: {0}")]
    FinalizeTdx(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("TDX firmware missing")]
    TdxFirmwareMissing,

    #[cfg(feature = "tdx")]
    #[error("Invalid TDX payload type")]
    InvalidPayloadType,

    #[cfg(feature = "guest_debug")]
    #[error("Error debugging VM: {0:?}")]
    Debug(DebuggableError),

    #[error("Error spawning kernel loading thread")]
    KernelLoadThreadSpawn(std::io::Error),

    #[error("Error joining kernel loading thread")]
    KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Payload configuration is not bootable")]
    InvalidPayload,

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    #[error("Error coredumping VM: {0:?}")]
    Coredump(GuestDebuggableError),
}
pub type Result<T> = result::Result<T, Error>;

/// Lifecycle state of a VM. Serialized as part of snapshots, so variant
/// names are part of the on-disk/API format.
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
    BreakPoint,
}

impl VmState {
    /// Checks whether moving from `self` to `new_state` is a legal lifecycle
    /// transition, returning `Error::InvalidStateTransition` otherwise.
    /// Self-transitions are rejected for every state except via the
    /// BreakPoint arm's catch-all (which also rejects them).
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
                VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
                    Ok(())
                }
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                // A shut-down VM may only be (re)started.
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
            VmState::BreakPoint => match new_state {
                VmState::Created | VmState::Running => Ok(()),
                _ => Err(Error::InvalidStateTransition(self, new_state)),
            },
        }
    }
}

/// Routes guest memory and bus accesses on behalf of the vCPU threads.
/// Handed to the CPU manager as the `VmOps` trait object.
struct VmOpsHandler {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    // Port I/O only exists on x86_64.
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
}

impl VmOps for VmOpsHandler {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        // Reads from unmapped MMIO are logged but not fatal; `data` is left as-is.
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            // Some devices return a barrier so the vCPU blocks until the
            // device thread has observed the write.
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            info!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            // Same barrier protocol as mmio_write.
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}

/// Returns the usable guest physical address width: the configured maximum,
/// clamped to what the host CPU actually supports.
pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
    let host_phys_bits = get_host_cpu_phys_bits(hypervisor);

    cmp::min(host_phys_bits, max_phys_bits)
}

/// A fully assembled virtual machine: hypervisor VM handle plus the CPU,
/// memory and device managers that drive it.
pub struct Vm {
    // Kept open so the TDX boot path can read the payload later.
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    // Guest clock captured at snapshot time; restored on resume (KVM/x86_64 only).
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,
    numa_nodes: NumaNodes,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    // True when a debugger should gate the boot (gdb support).
    stop_on_boot: bool,
    // Join handle for the background payload-loading thread, if one was spawned.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}

impl Vm {
    // Signals the VM's signal-handler thread cares about (terminal resize).
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];

    /// Builds a `Vm` on top of an already-created hypervisor VM and
    /// `MemoryManager`. Used both for cold boot (`snapshot == None`) and
    /// restore (`snapshot == Some`): on restore, payload loading is skipped
    /// and the VM starts out `Paused`.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        timestamp: Instant,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new_from_memory_manager");

        // Validation also yields the set of device ids present at boot.
        let boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        // Kick off payload loading in the background only for a fresh boot;
        // a restored VM already has its memory contents.
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(&memory_manager, &config)?
        } else {
            None
        };

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        // TDX requires the IOMMU; otherwise it is only used when requested.
        #[cfg(feature = "tdx")]
        let force_iommu = tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        #[cfg(feature = "guest_debug")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "guest_debug"))]
        let stop_on_boot = false;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::new(Bus::new());
        let mmio_bus = Arc::new(Bus::new());

        // Shared with the vCPU threads so they can service guest accesses.
        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus: io_bus.clone(),
            mmio_bus: mmio_bus.clone(),
        });

        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            vm.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt.try_clone().map_err(Error::EventFdClone)?,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            &hypervisor,
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
        )
        .map_err(Error::CpuManager)?;

        #[cfg(target_arch = "x86_64")]
        cpu_manager
            .lock()
            .unwrap()
            .populate_cpuid(
                &memory_manager,
                &hypervisor,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CpuManager)?;

        // The initial TDX configuration must be done before the vCPUs are
        // created
        #[cfg(feature = "tdx")]
        if tdx_enabled {
            let cpuid = cpu_manager.lock().unwrap().common_cpuid();
            let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
            vm.tdx_init(&cpuid, max_vcpus)
                .map_err(Error::InitializeTdxVm)?;
        }

        cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
            .map_err(Error::CpuManager)?;

        // Hotplug ("dynamic" devices) is not supported under TDX.
        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        let device_manager = DeviceManager::new(
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            hypervisor.hypervisor_type(),
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            cpu_manager.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            boot_id_list,
            timestamp,
            snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
            dynamic,
        )
        .map_err(Error::DeviceManager)?;

        device_manager
            .lock()
            .unwrap()
            .create_devices(
                serial_pty,
                console_pty,
                console_resize_pipe,
                original_termios,
            )
            .map_err(Error::DeviceManager)?;

        // TDX boots the kernel through the firmware, so keep the file handle.
        #[cfg(feature = "tdx")]
        let kernel = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.kernel.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.initramfs.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::InitramfsFile)?;

        // Recover the guest clock from the snapshot so time resumes correctly.
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
            let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
            vm_snapshot.clock
        } else {
            None
        };

        // A restored VM stays paused until explicitly resumed.
        let vm_state = if snapshot.is_some() {
            VmState::Paused
        } else {
            VmState::Created
        };

        Ok(Vm {
            #[cfg(feature = "tdx")]
            kernel,
            initramfs,
            device_manager,
            config,
            threads: Vec::with_capacity(1),
            state: RwLock::new(vm_state),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock,
            numa_nodes,
            hypervisor,
            stop_on_boot,
            load_payload_handle,
        })
    }

    /// Translates the user-provided `NumaConfig` list into the internal
    /// `NumaNodes` map, resolving memory-zone and SGX-section names against
    /// the memory manager and validating ids, distances and references.
    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                // Each guest NUMA node id may be defined only once.
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Can't define twice the same NUMA node");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            // virtio-mem zones contribute a hotpluggable region.
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        // Distances may only target nodes defined in this config.
                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has been already set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

    /// Top-level constructor: creates the hypervisor VM and the memory
    /// manager (from scratch or from a snapshot), then delegates to
    /// `new_from_memory_manager`. `source_url`/`prefault` are only
    /// meaningful when restoring from a snapshot.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        vm_config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
        source_url: Option<&str>,
        prefault: Option<bool>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new");

        let timestamp = Instant::now();

        // Confidential-compute modes are not carried through a restore.
        #[cfg(feature = "tdx")]
        let tdx_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_tdx_enabled()
        };

        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_sev_snp_enabled()
        };

        let vm = Self::create_hypervisor_vm(
            &hypervisor,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        )?;

        let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);

        let memory_manager = if let Some(snapshot) =
            snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
        {
            // NOTE(review): `prefault.unwrap()` assumes callers always pass
            // Some(_) alongside a memory-manager snapshot — confirm at call sites.
            MemoryManager::new_from_snapshot(
                &snapshot,
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                source_url,
                prefault.unwrap(),
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            #[cfg(target_arch = "x86_64")]
            let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();

            MemoryManager::new(
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                None,
                phys_bits,
                #[cfg(feature = "tdx")]
                tdx_enabled,
                None,
                None,
                #[cfg(target_arch = "x86_64")]
                sgx_epc_config,
            )
            .map_err(Error::MemoryManager)?
        };

        Vm::new_from_memory_manager(
            vm_config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            timestamp,
            serial_pty,
            console_pty,
            console_resize_pipe,
            original_termios,
            snapshot,
        )
    }

    /// Creates the raw hypervisor VM object, selecting the VM type for
    /// TDX/SEV-SNP builds, and applies the fixed x86_64 KVM setup
    /// (identity map, TSS address, split IRQ chip).
    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        hypervisor.check_required_extensions().unwrap();

        cfg_if::cfg_if! {
            if #[cfg(feature = "tdx")] {
                let vm = hypervisor
                    .create_vm_with_type(if tdx_enabled {
                        1 // KVM_X86_TDX_VM
                    } else {
                        0 // KVM_X86_LEGACY_VM
                    })
                    .unwrap();
            } else if #[cfg(feature = "sev_snp")] {
                let vm = hypervisor
                    .create_vm_with_type(if sev_snp_enabled {
                        1 // SEV_SNP_ENABLED
                    } else {
                        0 // SEV_SNP_DISABLED
                    })
                    .unwrap();
            } else {
                let vm = hypervisor.create_vm().unwrap();
            }
        }

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }

    /// Copies the initramfs file into guest memory at the arch-chosen load
    /// address and returns where/how big it is for the boot configuration.
    /// Precondition: `self.initramfs` is `Some` (callers check first).
    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs = self.initramfs.as_ref().unwrap();
        // Seek to the end to learn the file size, then rewind for the copy.
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    /// Builds the kernel command line from the payload config; on aarch64
    /// the device manager appends its own device-specific entries.
    pub fn generate_cmdline(
        payload: &PayloadConfig,
        #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
    ) -> Result<Cmdline> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
        if let Some(s) = payload.cmdline.as_ref() {
            cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
        }

        #[cfg(target_arch = "aarch64")]
        for entry in device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        Ok(cmdline)
    }

    /// Copies a UEFI firmware image into the dedicated UEFI flash region.
    #[cfg(target_arch = "aarch64")]
    fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
        let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
        let mem = uefi_flash.memory();
        arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
            .map_err(Error::UefiLoad)?;
        Ok(())
    }

    /// Loads either a PE kernel image or a UEFI firmware blob into guest
    /// memory and returns the resulting entry point. Exactly one of
    /// `firmware`/`kernel` must be `Some`, otherwise `Error::InvalidPayload`.
    #[cfg(target_arch = "aarch64")]
    fn load_kernel(
        firmware: Option<File>,
        kernel: Option<File>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let entry_addr = match (firmware, kernel) {
            (None, Some(mut kernel)) => {
                match linux_loader::loader::pe::PE::load(
                    mem.deref(),
                    Some(arch::layout::KERNEL_START),
                    &mut kernel,
                    None,
                ) {
                    Ok(entry_addr) => entry_addr.kernel_load,
                    // Try to load the binary as kernel PE file at first.
                    // If failed, retry to load it as UEFI binary.
                    // As the UEFI binary is formatless, it must be the last option to try.
                    Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                        Self::load_firmware(&kernel, memory_manager)?;
                        arch::layout::UEFI_START
                    }
                    Err(e) => {
                        return Err(Error::KernelLoad(e));
                    }
                }
            }
            (Some(firmware), None) => {
                Self::load_firmware(&firmware, memory_manager)?;
                arch::layout::UEFI_START
            }
            _ => return Err(Error::InvalidPayload),
        };

        Ok(EntryPoint { entry_addr })
    }

    /// Loads an ELF kernel (and optional command line) into guest memory.
    /// Only PVH-capable kernels are accepted: the legacy entry point is
    /// rejected with `Error::KernelMissingPvhHeader`.
    #[cfg(target_arch = "x86_64")]
    fn load_kernel(
        mut kernel: File,
        cmdline: Option<Cmdline>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        info!("Loading kernel");

        let mem = {
            let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
            guest_memory.memory()
        };
        let entry_addr = linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        )
        .map_err(Error::KernelLoad)?;

        if let Some(cmdline) = cmdline {
            linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
                .map_err(Error::LoadCmdLine)?;
        }

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest
            info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint {
                entry_addr: Some(entry_addr),
            })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

    /// Resolves the payload config into a loaded guest image: either a
    /// bare firmware (no kernel/initramfs/cmdline allowed alongside it) or
    /// a kernel with optional initramfs/cmdline.
    #[cfg(target_arch = "x86_64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        trace_scoped!("load_payload");
        match (
            &payload.firmware,
            &payload.kernel,
            &payload.initramfs,
            &payload.cmdline,
        ) {
            (Some(firmware), None, None, None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(firmware, None, memory_manager)
            }
            (None, Some(kernel), _, _) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                let cmdline = Self::generate_cmdline(payload)?;
                Self::load_kernel(kernel, Some(cmdline), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    /// aarch64 variant: accepts exactly one of firmware or kernel; the
    /// command line is consumed later via the device tree, not here.
    #[cfg(target_arch = "aarch64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        match (&payload.firmware, &payload.kernel) {
            (Some(firmware), None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(Some(firmware), None, memory_manager)
            }
            (None, Some(kernel)) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                Self::load_kernel(None, Some(kernel), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    /// Spawns a background thread that loads the payload, so device setup
    /// can proceed in parallel. Returns `None` when there is no payload or
    /// when TDX handles the loading itself.
    fn load_payload_async(
        memory_manager: &Arc<Mutex<MemoryManager>>,
        config: &Arc<Mutex<VmConfig>>,
    ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
        // Kernel with TDX is loaded in a different manner
        #[cfg(feature = "tdx")]
        if config.lock().unwrap().is_tdx_enabled() {
            return Ok(None);
        }

        config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|payload| {
                let memory_manager = memory_manager.clone();
                let payload = payload.clone();

                std::thread::Builder::new()
                    .name("payload_loader".into())
                    .spawn(move || Self::load_payload(&payload, memory_manager))
                    .map_err(Error::KernelLoadThreadSpawn)
            })
            .transpose()
    }

    /// Writes the boot-time system tables (ACPI RSDP, SMBIOS data, initramfs
    /// location, etc.) into guest memory via `arch::configure_system`.
    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
        trace_scoped!("configure_system");
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };
1072 let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); 1073 let rsdp_addr = Some(rsdp_addr); 1074 let sgx_epc_region = self 1075 .memory_manager 1076 .lock() 1077 .unwrap() 1078 .sgx_epc_region() 1079 .as_ref() 1080 .cloned(); 1081 1082 let serial_number = self 1083 .config 1084 .lock() 1085 .unwrap() 1086 .platform 1087 .as_ref() 1088 .and_then(|p| p.serial_number.clone()); 1089 1090 let uuid = self 1091 .config 1092 .lock() 1093 .unwrap() 1094 .platform 1095 .as_ref() 1096 .and_then(|p| p.uuid.clone()); 1097 1098 let oem_strings = self 1099 .config 1100 .lock() 1101 .unwrap() 1102 .platform 1103 .as_ref() 1104 .and_then(|p| p.oem_strings.clone()); 1105 1106 let oem_strings = oem_strings 1107 .as_deref() 1108 .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>()); 1109 1110 arch::configure_system( 1111 &mem, 1112 arch::layout::CMDLINE_START, 1113 &initramfs_config, 1114 boot_vcpus, 1115 rsdp_addr, 1116 sgx_epc_region, 1117 serial_number.as_deref(), 1118 uuid.as_deref(), 1119 oem_strings.as_deref(), 1120 ) 1121 .map_err(Error::ConfigureSystem)?; 1122 Ok(()) 1123 } 1124 1125 #[cfg(target_arch = "aarch64")] 1126 fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> { 1127 let cmdline = Self::generate_cmdline( 1128 self.config.lock().unwrap().payload.as_ref().unwrap(), 1129 &self.device_manager, 1130 )?; 1131 let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs(); 1132 let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1133 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1134 let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new(); 1135 let initramfs_config = match self.initramfs { 1136 Some(_) => Some(self.load_initramfs(&mem)?), 1137 None => None, 1138 }; 1139 1140 let device_info = &self 1141 .device_manager 1142 .lock() 1143 .unwrap() 1144 .get_device_info() 1145 .clone(); 1146 1147 for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() { 1148 
let pci_space = PciSpaceInfo { 1149 pci_segment_id: pci_segment.id, 1150 mmio_config_address: pci_segment.mmio_config_address, 1151 pci_device_space_start: pci_segment.start_of_device_area, 1152 pci_device_space_size: pci_segment.end_of_device_area 1153 - pci_segment.start_of_device_area 1154 + 1, 1155 }; 1156 pci_space_info.push(pci_space); 1157 } 1158 1159 let virtio_iommu_bdf = self 1160 .device_manager 1161 .lock() 1162 .unwrap() 1163 .iommu_attached_devices() 1164 .as_ref() 1165 .map(|(v, _)| *v); 1166 1167 let vgic = self 1168 .device_manager 1169 .lock() 1170 .unwrap() 1171 .get_interrupt_controller() 1172 .unwrap() 1173 .lock() 1174 .unwrap() 1175 .get_vgic() 1176 .map_err(|_| { 1177 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1178 arch::aarch64::Error::SetupGic, 1179 )) 1180 })?; 1181 1182 // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number. 1183 let pmu_supported = self 1184 .cpu_manager 1185 .lock() 1186 .unwrap() 1187 .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16) 1188 .map_err(|_| { 1189 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1190 arch::aarch64::Error::VcpuInitPmu, 1191 )) 1192 })?; 1193 1194 arch::configure_system( 1195 &mem, 1196 cmdline.as_cstring().unwrap().to_str().unwrap(), 1197 vcpu_mpidrs, 1198 vcpu_topology, 1199 device_info, 1200 &initramfs_config, 1201 &pci_space_info, 1202 virtio_iommu_bdf.map(|bdf| bdf.into()), 1203 &vgic, 1204 &self.numa_nodes, 1205 pmu_supported, 1206 ) 1207 .map_err(Error::ConfigureSystem)?; 1208 1209 Ok(()) 1210 } 1211 1212 pub fn serial_pty(&self) -> Option<PtyPair> { 1213 self.device_manager.lock().unwrap().serial_pty() 1214 } 1215 1216 pub fn console_pty(&self) -> Option<PtyPair> { 1217 self.device_manager.lock().unwrap().console_pty() 1218 } 1219 1220 pub fn console_resize_pipe(&self) -> Option<Arc<File>> { 1221 self.device_manager.lock().unwrap().console_resize_pipe() 1222 } 1223 1224 pub fn shutdown(&mut self) -> Result<()> { 1225 let mut state = 
self.state.try_write().map_err(|_| Error::PoisonedState)?; 1226 let new_state = VmState::Shutdown; 1227 1228 state.valid_transition(new_state)?; 1229 1230 // Wake up the DeviceManager threads so they will get terminated cleanly 1231 self.device_manager 1232 .lock() 1233 .unwrap() 1234 .resume() 1235 .map_err(Error::Resume)?; 1236 1237 self.cpu_manager 1238 .lock() 1239 .unwrap() 1240 .shutdown() 1241 .map_err(Error::CpuManager)?; 1242 1243 // Wait for all the threads to finish 1244 for thread in self.threads.drain(..) { 1245 thread.join().map_err(Error::ThreadCleanup)? 1246 } 1247 *state = new_state; 1248 1249 event!("vm", "shutdown"); 1250 1251 Ok(()) 1252 } 1253 1254 pub fn resize( 1255 &mut self, 1256 desired_vcpus: Option<u8>, 1257 desired_memory: Option<u64>, 1258 desired_balloon: Option<u64>, 1259 ) -> Result<()> { 1260 event!("vm", "resizing"); 1261 1262 if let Some(desired_vcpus) = desired_vcpus { 1263 if self 1264 .cpu_manager 1265 .lock() 1266 .unwrap() 1267 .resize(desired_vcpus) 1268 .map_err(Error::CpuManager)? 
1269 { 1270 self.device_manager 1271 .lock() 1272 .unwrap() 1273 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1274 .map_err(Error::DeviceManager)?; 1275 } 1276 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1277 } 1278 1279 if let Some(desired_memory) = desired_memory { 1280 let new_region = self 1281 .memory_manager 1282 .lock() 1283 .unwrap() 1284 .resize(desired_memory) 1285 .map_err(Error::MemoryManager)?; 1286 1287 let memory_config = &mut self.config.lock().unwrap().memory; 1288 1289 if let Some(new_region) = &new_region { 1290 self.device_manager 1291 .lock() 1292 .unwrap() 1293 .update_memory(new_region) 1294 .map_err(Error::DeviceManager)?; 1295 1296 match memory_config.hotplug_method { 1297 HotplugMethod::Acpi => { 1298 self.device_manager 1299 .lock() 1300 .unwrap() 1301 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1302 .map_err(Error::DeviceManager)?; 1303 } 1304 HotplugMethod::VirtioMem => {} 1305 } 1306 } 1307 1308 // We update the VM config regardless of the actual guest resize 1309 // operation result (happened or not), so that if the VM reboots 1310 // it will be running with the last configure memory size. 1311 match memory_config.hotplug_method { 1312 HotplugMethod::Acpi => memory_config.size = desired_memory, 1313 HotplugMethod::VirtioMem => { 1314 if desired_memory > memory_config.size { 1315 memory_config.hotplugged_size = Some(desired_memory - memory_config.size); 1316 } else { 1317 memory_config.hotplugged_size = None; 1318 } 1319 } 1320 } 1321 } 1322 1323 if let Some(desired_balloon) = desired_balloon { 1324 self.device_manager 1325 .lock() 1326 .unwrap() 1327 .resize_balloon(desired_balloon) 1328 .map_err(Error::DeviceManager)?; 1329 1330 // Update the configuration value for the balloon size to ensure 1331 // a reboot would use the right value. 
1332 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon { 1333 balloon_config.size = desired_balloon; 1334 } 1335 } 1336 1337 event!("vm", "resized"); 1338 1339 Ok(()) 1340 } 1341 1342 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { 1343 let memory_config = &mut self.config.lock().unwrap().memory; 1344 1345 if let Some(zones) = &mut memory_config.zones { 1346 for zone in zones.iter_mut() { 1347 if zone.id == id { 1348 if desired_memory >= zone.size { 1349 let hotplugged_size = desired_memory - zone.size; 1350 self.memory_manager 1351 .lock() 1352 .unwrap() 1353 .resize_zone(&id, desired_memory - zone.size) 1354 .map_err(Error::MemoryManager)?; 1355 // We update the memory zone config regardless of the 1356 // actual 'resize-zone' operation result (happened or 1357 // not), so that if the VM reboots it will be running 1358 // with the last configured memory zone size. 1359 zone.hotplugged_size = Some(hotplugged_size); 1360 1361 return Ok(()); 1362 } else { 1363 error!( 1364 "Invalid to ask less ({}) than boot RAM ({}) for \ 1365 this memory zone", 1366 desired_memory, zone.size, 1367 ); 1368 return Err(Error::ResizeZone); 1369 } 1370 } 1371 } 1372 } 1373 1374 error!("Could not find the memory zone {} for the resize", id); 1375 Err(Error::ResizeZone) 1376 } 1377 1378 pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1379 let pci_device_info = self 1380 .device_manager 1381 .lock() 1382 .unwrap() 1383 .add_device(&mut device_cfg) 1384 .map_err(Error::DeviceManager)?; 1385 1386 // Update VmConfig by adding the new device. This is important to 1387 // ensure the device would be created in case of a reboot. 
1388 { 1389 let mut config = self.config.lock().unwrap(); 1390 add_to_config(&mut config.devices, device_cfg); 1391 } 1392 1393 self.device_manager 1394 .lock() 1395 .unwrap() 1396 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1397 .map_err(Error::DeviceManager)?; 1398 1399 Ok(pci_device_info) 1400 } 1401 1402 pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> { 1403 let pci_device_info = self 1404 .device_manager 1405 .lock() 1406 .unwrap() 1407 .add_user_device(&mut device_cfg) 1408 .map_err(Error::DeviceManager)?; 1409 1410 // Update VmConfig by adding the new device. This is important to 1411 // ensure the device would be created in case of a reboot. 1412 { 1413 let mut config = self.config.lock().unwrap(); 1414 add_to_config(&mut config.user_devices, device_cfg); 1415 } 1416 1417 self.device_manager 1418 .lock() 1419 .unwrap() 1420 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1421 .map_err(Error::DeviceManager)?; 1422 1423 Ok(pci_device_info) 1424 } 1425 1426 pub fn remove_device(&mut self, id: String) -> Result<()> { 1427 self.device_manager 1428 .lock() 1429 .unwrap() 1430 .remove_device(id.clone()) 1431 .map_err(Error::DeviceManager)?; 1432 1433 // Update VmConfig by removing the device. This is important to 1434 // ensure the device would not be created in case of a reboot. 1435 self.config.lock().unwrap().remove_device(&id); 1436 1437 self.device_manager 1438 .lock() 1439 .unwrap() 1440 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1441 .map_err(Error::DeviceManager)?; 1442 Ok(()) 1443 } 1444 1445 pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> { 1446 let pci_device_info = self 1447 .device_manager 1448 .lock() 1449 .unwrap() 1450 .add_disk(&mut disk_cfg) 1451 .map_err(Error::DeviceManager)?; 1452 1453 // Update VmConfig by adding the new device. This is important to 1454 // ensure the device would be created in case of a reboot. 
1455 { 1456 let mut config = self.config.lock().unwrap(); 1457 add_to_config(&mut config.disks, disk_cfg); 1458 } 1459 1460 self.device_manager 1461 .lock() 1462 .unwrap() 1463 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1464 .map_err(Error::DeviceManager)?; 1465 1466 Ok(pci_device_info) 1467 } 1468 1469 pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> { 1470 let pci_device_info = self 1471 .device_manager 1472 .lock() 1473 .unwrap() 1474 .add_fs(&mut fs_cfg) 1475 .map_err(Error::DeviceManager)?; 1476 1477 // Update VmConfig by adding the new device. This is important to 1478 // ensure the device would be created in case of a reboot. 1479 { 1480 let mut config = self.config.lock().unwrap(); 1481 add_to_config(&mut config.fs, fs_cfg); 1482 } 1483 1484 self.device_manager 1485 .lock() 1486 .unwrap() 1487 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1488 .map_err(Error::DeviceManager)?; 1489 1490 Ok(pci_device_info) 1491 } 1492 1493 pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> { 1494 let pci_device_info = self 1495 .device_manager 1496 .lock() 1497 .unwrap() 1498 .add_pmem(&mut pmem_cfg) 1499 .map_err(Error::DeviceManager)?; 1500 1501 // Update VmConfig by adding the new device. This is important to 1502 // ensure the device would be created in case of a reboot. 1503 { 1504 let mut config = self.config.lock().unwrap(); 1505 add_to_config(&mut config.pmem, pmem_cfg); 1506 } 1507 1508 self.device_manager 1509 .lock() 1510 .unwrap() 1511 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1512 .map_err(Error::DeviceManager)?; 1513 1514 Ok(pci_device_info) 1515 } 1516 1517 pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> { 1518 let pci_device_info = self 1519 .device_manager 1520 .lock() 1521 .unwrap() 1522 .add_net(&mut net_cfg) 1523 .map_err(Error::DeviceManager)?; 1524 1525 // Update VmConfig by adding the new device. 
This is important to 1526 // ensure the device would be created in case of a reboot. 1527 { 1528 let mut config = self.config.lock().unwrap(); 1529 add_to_config(&mut config.net, net_cfg); 1530 } 1531 1532 self.device_manager 1533 .lock() 1534 .unwrap() 1535 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1536 .map_err(Error::DeviceManager)?; 1537 1538 Ok(pci_device_info) 1539 } 1540 1541 pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> { 1542 let pci_device_info = self 1543 .device_manager 1544 .lock() 1545 .unwrap() 1546 .add_vdpa(&mut vdpa_cfg) 1547 .map_err(Error::DeviceManager)?; 1548 1549 // Update VmConfig by adding the new device. This is important to 1550 // ensure the device would be created in case of a reboot. 1551 { 1552 let mut config = self.config.lock().unwrap(); 1553 add_to_config(&mut config.vdpa, vdpa_cfg); 1554 } 1555 1556 self.device_manager 1557 .lock() 1558 .unwrap() 1559 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1560 .map_err(Error::DeviceManager)?; 1561 1562 Ok(pci_device_info) 1563 } 1564 1565 pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> { 1566 let pci_device_info = self 1567 .device_manager 1568 .lock() 1569 .unwrap() 1570 .add_vsock(&mut vsock_cfg) 1571 .map_err(Error::DeviceManager)?; 1572 1573 // Update VmConfig by adding the new device. This is important to 1574 // ensure the device would be created in case of a reboot. 
1575 { 1576 let mut config = self.config.lock().unwrap(); 1577 config.vsock = Some(vsock_cfg); 1578 } 1579 1580 self.device_manager 1581 .lock() 1582 .unwrap() 1583 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1584 .map_err(Error::DeviceManager)?; 1585 1586 Ok(pci_device_info) 1587 } 1588 1589 pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> { 1590 Ok(self.device_manager.lock().unwrap().counters()) 1591 } 1592 1593 #[cfg(feature = "tdx")] 1594 fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> { 1595 use arch::x86_64::tdx::*; 1596 1597 let firmware_path = self 1598 .config 1599 .lock() 1600 .unwrap() 1601 .payload 1602 .as_ref() 1603 .unwrap() 1604 .firmware 1605 .clone() 1606 .ok_or(Error::TdxFirmwareMissing)?; 1607 // The TDVF file contains a table of section as well as code 1608 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1609 1610 // For all the sections allocate some RAM backing them 1611 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf) 1612 } 1613 1614 #[cfg(feature = "tdx")] 1615 fn hob_memory_resources( 1616 mut sorted_sections: Vec<TdvfSection>, 1617 guest_memory: &GuestMemoryMmap, 1618 ) -> Vec<(u64, u64, bool)> { 1619 let mut list = Vec::new(); 1620 1621 let mut current_section = sorted_sections.pop(); 1622 1623 // RAM regions interleaved with TDVF sections 1624 let mut next_start_addr = 0; 1625 for region in guest_memory.iter() { 1626 let region_start = region.start_addr().0; 1627 let region_end = region.last_addr().0; 1628 if region_start > next_start_addr { 1629 next_start_addr = region_start; 1630 } 1631 1632 loop { 1633 let (start, size, ram) = if let Some(section) = ¤t_section { 1634 if section.address <= next_start_addr { 1635 (section.address, section.size, false) 1636 } else { 1637 let last_addr = std::cmp::min(section.address - 1, region_end); 1638 (next_start_addr, last_addr - next_start_addr + 1, true) 1639 } 1640 } 
else { 1641 (next_start_addr, region_end - next_start_addr + 1, true) 1642 }; 1643 1644 list.push((start, size, ram)); 1645 1646 if !ram { 1647 current_section = sorted_sections.pop(); 1648 } 1649 1650 next_start_addr = start + size; 1651 1652 if region_start > next_start_addr { 1653 next_start_addr = region_start; 1654 } 1655 1656 if next_start_addr > region_end { 1657 break; 1658 } 1659 } 1660 } 1661 1662 // Once all the interleaved sections have been processed, let's simply 1663 // pull the remaining ones. 1664 if let Some(section) = current_section { 1665 list.push((section.address, section.size, false)); 1666 } 1667 while let Some(section) = sorted_sections.pop() { 1668 list.push((section.address, section.size, false)); 1669 } 1670 1671 list 1672 } 1673 1674 #[cfg(feature = "tdx")] 1675 fn populate_tdx_sections( 1676 &mut self, 1677 sections: &[TdvfSection], 1678 guid_found: bool, 1679 ) -> Result<Option<u64>> { 1680 use arch::x86_64::tdx::*; 1681 // Get the memory end *before* we start adding TDVF ram regions 1682 let boot_guest_memory = self 1683 .memory_manager 1684 .lock() 1685 .as_ref() 1686 .unwrap() 1687 .boot_guest_memory(); 1688 for section in sections { 1689 // No need to allocate if the section falls within guest RAM ranges 1690 if boot_guest_memory.address_in_range(GuestAddress(section.address)) { 1691 info!( 1692 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM", 1693 section 1694 ); 1695 continue; 1696 } 1697 1698 info!("Allocating TDVF Section: {:x?}", section); 1699 self.memory_manager 1700 .lock() 1701 .unwrap() 1702 .add_ram_region(GuestAddress(section.address), section.size as usize) 1703 .map_err(Error::AllocatingTdvfMemory)?; 1704 } 1705 1706 // The TDVF file contains a table of section as well as code 1707 let firmware_path = self 1708 .config 1709 .lock() 1710 .unwrap() 1711 .payload 1712 .as_ref() 1713 .unwrap() 1714 .firmware 1715 .clone() 1716 .ok_or(Error::TdxFirmwareMissing)?; 1717 let mut firmware_file = 
File::open(firmware_path).map_err(Error::LoadTdvf)?; 1718 1719 // The guest memory at this point now has all the required regions so it 1720 // is safe to copy from the TDVF file into it. 1721 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1722 let mem = guest_memory.memory(); 1723 let mut payload_info = None; 1724 let mut hob_offset = None; 1725 for section in sections { 1726 info!("Populating TDVF Section: {:x?}", section); 1727 match section.r#type { 1728 TdvfSectionType::Bfv | TdvfSectionType::Cfv => { 1729 info!("Copying section to guest memory"); 1730 firmware_file 1731 .seek(SeekFrom::Start(section.data_offset as u64)) 1732 .map_err(Error::LoadTdvf)?; 1733 mem.read_from( 1734 GuestAddress(section.address), 1735 &mut firmware_file, 1736 section.data_size as usize, 1737 ) 1738 .unwrap(); 1739 } 1740 TdvfSectionType::TdHob => { 1741 hob_offset = Some(section.address); 1742 } 1743 TdvfSectionType::Payload => { 1744 info!("Copying payload to guest memory"); 1745 if let Some(payload_file) = self.kernel.as_mut() { 1746 let payload_size = payload_file 1747 .seek(SeekFrom::End(0)) 1748 .map_err(Error::LoadPayload)?; 1749 1750 payload_file 1751 .seek(SeekFrom::Start(0x1f1)) 1752 .map_err(Error::LoadPayload)?; 1753 1754 let mut payload_header = linux_loader::bootparam::setup_header::default(); 1755 payload_header 1756 .as_bytes() 1757 .read_from( 1758 0, 1759 payload_file, 1760 mem::size_of::<linux_loader::bootparam::setup_header>(), 1761 ) 1762 .unwrap(); 1763 1764 if payload_header.header != 0x5372_6448 { 1765 return Err(Error::InvalidPayloadType); 1766 } 1767 1768 if (payload_header.version < 0x0200) 1769 || ((payload_header.loadflags & 0x1) == 0x0) 1770 { 1771 return Err(Error::InvalidPayloadType); 1772 } 1773 1774 payload_file.rewind().map_err(Error::LoadPayload)?; 1775 mem.read_from( 1776 GuestAddress(section.address), 1777 payload_file, 1778 payload_size as usize, 1779 ) 1780 .unwrap(); 1781 1782 // Create the payload info that 
will be inserted into 1783 // the HOB. 1784 payload_info = Some(PayloadInfo { 1785 image_type: PayloadImageType::BzImage, 1786 entry_point: section.address, 1787 }); 1788 } 1789 } 1790 TdvfSectionType::PayloadParam => { 1791 info!("Copying payload parameters to guest memory"); 1792 let cmdline = Self::generate_cmdline( 1793 self.config.lock().unwrap().payload.as_ref().unwrap(), 1794 )?; 1795 mem.write_slice( 1796 cmdline.as_cstring().unwrap().as_bytes_with_nul(), 1797 GuestAddress(section.address), 1798 ) 1799 .unwrap(); 1800 } 1801 _ => {} 1802 } 1803 } 1804 1805 // Generate HOB 1806 let mut hob = TdHob::start(hob_offset.unwrap()); 1807 1808 let mut sorted_sections = sections.to_vec(); 1809 sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem)); 1810 1811 sorted_sections.sort_by_key(|section| section.address); 1812 sorted_sections.reverse(); 1813 1814 for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) { 1815 hob.add_memory_resource(&mem, start, size, ram, guid_found) 1816 .map_err(Error::PopulateHob)?; 1817 } 1818 1819 // MMIO regions 1820 hob.add_mmio_resource( 1821 &mem, 1822 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1823 arch::layout::APIC_START.raw_value() 1824 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1825 ) 1826 .map_err(Error::PopulateHob)?; 1827 let start_of_device_area = self 1828 .memory_manager 1829 .lock() 1830 .unwrap() 1831 .start_of_device_area() 1832 .raw_value(); 1833 let end_of_device_area = self 1834 .memory_manager 1835 .lock() 1836 .unwrap() 1837 .end_of_device_area() 1838 .raw_value(); 1839 hob.add_mmio_resource( 1840 &mem, 1841 start_of_device_area, 1842 end_of_device_area - start_of_device_area, 1843 ) 1844 .map_err(Error::PopulateHob)?; 1845 1846 // Loop over the ACPI tables and copy them to the HOB. 
1847 1848 for acpi_table in crate::acpi::create_acpi_tables_tdx( 1849 &self.device_manager, 1850 &self.cpu_manager, 1851 &self.memory_manager, 1852 &self.numa_nodes, 1853 ) { 1854 hob.add_acpi_table(&mem, acpi_table.as_slice()) 1855 .map_err(Error::PopulateHob)?; 1856 } 1857 1858 // If a payload info has been created, let's insert it into the HOB. 1859 if let Some(payload_info) = payload_info { 1860 hob.add_payload(&mem, payload_info) 1861 .map_err(Error::PopulateHob)?; 1862 } 1863 1864 hob.finish(&mem).map_err(Error::PopulateHob)?; 1865 1866 Ok(hob_offset) 1867 } 1868 1869 #[cfg(feature = "tdx")] 1870 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 1871 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1872 let mem = guest_memory.memory(); 1873 1874 for section in sections { 1875 self.vm 1876 .tdx_init_memory_region( 1877 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 1878 section.address, 1879 section.size, 1880 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 1881 section.attributes == 1, 1882 ) 1883 .map_err(Error::InitializeTdxMemoryRegion)?; 1884 } 1885 1886 Ok(()) 1887 } 1888 1889 // Creates ACPI tables 1890 // In case of TDX being used, this is a no-op since the tables will be 1891 // created and passed when populating the HOB. 
1892 1893 fn create_acpi_tables(&self) -> Option<GuestAddress> { 1894 #[cfg(feature = "tdx")] 1895 if self.config.lock().unwrap().is_tdx_enabled() { 1896 return None; 1897 } 1898 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 1899 let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); 1900 let rsdp_addr = crate::acpi::create_acpi_tables( 1901 &mem, 1902 &self.device_manager, 1903 &self.cpu_manager, 1904 &self.memory_manager, 1905 &self.numa_nodes, 1906 tpm_enabled, 1907 ); 1908 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 1909 1910 Some(rsdp_addr) 1911 } 1912 1913 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 1914 trace_scoped!("entry_point"); 1915 1916 self.load_payload_handle 1917 .take() 1918 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 1919 .transpose() 1920 } 1921 1922 pub fn boot(&mut self) -> Result<()> { 1923 trace_scoped!("Vm::boot"); 1924 info!("Booting VM"); 1925 event!("vm", "booting"); 1926 let current_state = self.get_state()?; 1927 if current_state == VmState::Paused { 1928 return self.resume().map_err(Error::Resume); 1929 } 1930 1931 let new_state = if self.stop_on_boot { 1932 VmState::BreakPoint 1933 } else { 1934 VmState::Running 1935 }; 1936 current_state.valid_transition(new_state)?; 1937 1938 // Do earlier to parallelise with loading kernel 1939 #[cfg(target_arch = "x86_64")] 1940 let rsdp_addr = self.create_acpi_tables(); 1941 1942 // Load kernel synchronously or if asynchronous then wait for load to 1943 // finish. 
1944 let entry_point = self.entry_point()?; 1945 1946 #[cfg(feature = "tdx")] 1947 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 1948 1949 // Configure the vcpus that have been created 1950 let vcpus = self.cpu_manager.lock().unwrap().vcpus(); 1951 for vcpu in vcpus { 1952 let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1953 let boot_setup = entry_point.map(|e| (e, guest_memory)); 1954 self.cpu_manager 1955 .lock() 1956 .unwrap() 1957 .configure_vcpu(vcpu, boot_setup) 1958 .map_err(Error::CpuManager)?; 1959 } 1960 1961 #[cfg(feature = "tdx")] 1962 let (sections, guid_found) = if tdx_enabled { 1963 self.extract_tdvf_sections()? 1964 } else { 1965 (Vec::new(), false) 1966 }; 1967 1968 // Configuring the TDX regions requires that the vCPUs are created. 1969 #[cfg(feature = "tdx")] 1970 let hob_address = if tdx_enabled { 1971 // TDX sections are written to memory. 1972 self.populate_tdx_sections(§ions, guid_found)? 1973 } else { 1974 None 1975 }; 1976 1977 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 1978 // available after they are configured 1979 #[cfg(target_arch = "aarch64")] 1980 let rsdp_addr = self.create_acpi_tables(); 1981 1982 // Configure shared state based on loaded kernel 1983 entry_point 1984 .map(|_| { 1985 // Safe to unwrap rsdp_addr as we know it can't be None when 1986 // the entry_point is Some. 1987 self.configure_system(rsdp_addr.unwrap()) 1988 }) 1989 .transpose()?; 1990 1991 #[cfg(target_arch = "x86_64")] 1992 // Note: For x86, always call this function before invoking start boot vcpus. 1993 // Otherwise guest would fail to boot because we haven't created the 1994 // userspace mappings to update the hypervisor about the memory mappings. 1995 // These mappings must be created before we start the vCPU threads for 1996 // the very first time. 
1997 self.memory_manager 1998 .lock() 1999 .unwrap() 2000 .allocate_address_space() 2001 .map_err(Error::MemoryManager)?; 2002 2003 #[cfg(feature = "tdx")] 2004 if let Some(hob_address) = hob_address { 2005 // With the HOB address extracted the vCPUs can have 2006 // their TDX state configured. 2007 self.cpu_manager 2008 .lock() 2009 .unwrap() 2010 .initialize_tdx(hob_address) 2011 .map_err(Error::CpuManager)?; 2012 // Let the hypervisor know which memory ranges are shared with the 2013 // guest. This prevents the guest from ignoring/discarding memory 2014 // regions provided by the host. 2015 self.init_tdx_memory(§ions)?; 2016 // With TDX memory and CPU state configured TDX setup is complete 2017 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2018 } 2019 2020 self.cpu_manager 2021 .lock() 2022 .unwrap() 2023 .start_boot_vcpus(new_state == VmState::BreakPoint) 2024 .map_err(Error::CpuManager)?; 2025 2026 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2027 *state = new_state; 2028 event!("vm", "booted"); 2029 Ok(()) 2030 } 2031 2032 pub fn restore(&mut self) -> Result<()> { 2033 event!("vm", "restoring"); 2034 2035 #[cfg(target_arch = "x86_64")] 2036 // Note: For x86, always call this function before invoking start boot vcpus. 2037 // Otherwise guest would fail to boot because we haven't created the 2038 // userspace mappings to update the hypervisor about the memory mappings. 2039 // These mappings must be created before we start the vCPU threads for 2040 // the very first time for the restored VM. 2041 self.memory_manager 2042 .lock() 2043 .unwrap() 2044 .allocate_address_space() 2045 .map_err(Error::MemoryManager)?; 2046 2047 // Now we can start all vCPUs from here. 2048 self.cpu_manager 2049 .lock() 2050 .unwrap() 2051 .start_restored_vcpus() 2052 .map_err(Error::CpuManager)?; 2053 2054 event!("vm", "restored"); 2055 Ok(()) 2056 } 2057 2058 /// Gets a thread-safe reference counted pointer to the VM configuration. 
2059 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2060 Arc::clone(&self.config) 2061 } 2062 2063 /// Get the VM state. Returns an error if the state is poisoned. 2064 pub fn get_state(&self) -> Result<VmState> { 2065 self.state 2066 .try_read() 2067 .map_err(|_| Error::PoisonedState) 2068 .map(|state| *state) 2069 } 2070 2071 /// Gets the actual size of the balloon. 2072 pub fn balloon_size(&self) -> u64 { 2073 self.device_manager.lock().unwrap().balloon_size() 2074 } 2075 2076 pub fn send_memory_fds( 2077 &mut self, 2078 socket: &mut UnixStream, 2079 ) -> std::result::Result<(), MigratableError> { 2080 for (slot, fd) in self 2081 .memory_manager 2082 .lock() 2083 .unwrap() 2084 .memory_slot_fds() 2085 .drain() 2086 { 2087 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2088 .write_to(socket) 2089 .map_err(|e| { 2090 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2091 })?; 2092 socket 2093 .send_with_fd(&slot.to_le_bytes()[..], fd) 2094 .map_err(|e| { 2095 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2096 })?; 2097 2098 let res = Response::read_from(socket)?; 2099 if res.status() != Status::Ok { 2100 warn!("Error during memory fd migration"); 2101 Request::abandon().write_to(socket)?; 2102 Response::read_from(socket).ok(); 2103 return Err(MigratableError::MigrateSend(anyhow!( 2104 "Error during memory fd migration" 2105 ))); 2106 } 2107 } 2108 2109 Ok(()) 2110 } 2111 2112 pub fn send_memory_regions<F>( 2113 &mut self, 2114 ranges: &MemoryRangeTable, 2115 fd: &mut F, 2116 ) -> std::result::Result<(), MigratableError> 2117 where 2118 F: Write, 2119 { 2120 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2121 let mem = guest_memory.memory(); 2122 2123 for range in ranges.regions() { 2124 let mut offset: u64 = 0; 2125 // Here we are manually handling the retry in case we can't the 2126 // whole region at once because we can't use the implementation 2127 // from 
vm-memory::GuestMemory of write_all_to() as it is not 2128 // following the correct behavior. For more info about this issue 2129 // see: https://github.com/rust-vmm/vm-memory/issues/174 2130 loop { 2131 let bytes_written = mem 2132 .write_to( 2133 GuestAddress(range.gpa + offset), 2134 fd, 2135 (range.length - offset) as usize, 2136 ) 2137 .map_err(|e| { 2138 MigratableError::MigrateSend(anyhow!( 2139 "Error transferring memory to socket: {}", 2140 e 2141 )) 2142 })?; 2143 offset += bytes_written as u64; 2144 2145 if offset == range.length { 2146 break; 2147 } 2148 } 2149 } 2150 2151 Ok(()) 2152 } 2153 2154 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2155 self.memory_manager 2156 .lock() 2157 .unwrap() 2158 .memory_range_table(false) 2159 } 2160 2161 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2162 self.device_manager.lock().unwrap().device_tree() 2163 } 2164 2165 pub fn activate_virtio_devices(&self) -> Result<()> { 2166 self.device_manager 2167 .lock() 2168 .unwrap() 2169 .activate_virtio_devices() 2170 .map_err(Error::ActivateVirtioDevices) 2171 } 2172 2173 #[cfg(target_arch = "x86_64")] 2174 pub fn power_button(&self) -> Result<()> { 2175 return self 2176 .device_manager 2177 .lock() 2178 .unwrap() 2179 .notify_power_button() 2180 .map_err(Error::PowerButton); 2181 } 2182 2183 #[cfg(target_arch = "aarch64")] 2184 pub fn power_button(&self) -> Result<()> { 2185 self.device_manager 2186 .lock() 2187 .unwrap() 2188 .notify_power_button() 2189 .map_err(Error::PowerButton) 2190 } 2191 2192 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2193 self.memory_manager.lock().unwrap().snapshot_data() 2194 } 2195 2196 #[cfg(feature = "guest_debug")] 2197 pub fn debug_request( 2198 &mut self, 2199 gdb_request: &GdbRequestPayload, 2200 cpu_id: usize, 2201 ) -> Result<GdbResponsePayload> { 2202 use GdbRequestPayload::*; 2203 match gdb_request { 2204 SetSingleStep(single_step) => { 2205 
                // Single-step toggling passes an empty breakpoint list so
                // existing hardware breakpoints are cleared.
                self.set_guest_debug(cpu_id, &[], *single_step)
                    .map_err(Error::Debug)?;
            }
            SetHwBreakPoint(addrs) => {
                self.set_guest_debug(cpu_id, addrs, false)
                    .map_err(Error::Debug)?;
            }
            Pause => {
                self.debug_pause().map_err(Error::Debug)?;
            }
            Resume => {
                self.debug_resume().map_err(Error::Debug)?;
            }
            ReadRegs => {
                let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
            }
            WriteRegs(regs) => {
                self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
            }
            ReadMem(vaddr, len) => {
                let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
                let mem = self
                    .read_mem(&guest_memory, cpu_id, *vaddr, *len)
                    .map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::MemoryRegion(mem));
            }
            WriteMem(vaddr, data) => {
                let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
                self.write_mem(&guest_memory, cpu_id, vaddr, data)
                    .map_err(Error::Debug)?;
            }
            ActiveVcpus => {
                let active_vcpus = self.active_vcpus();
                return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
            }
        }
        // Arms that early-return above carry payloads; everything else acks.
        Ok(GdbResponsePayload::CommandComplete)
    }

    /// Assembles the state needed to write a guest coredump to
    /// `destination_url`: ELF sizing (note size, program-header count),
    /// the destination file, and the memory layout of the dump.
    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn get_dump_state(
        &mut self,
        destination_url: &str,
    ) -> std::result::Result<DumpState, GuestDebuggableError> {
        let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
        let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
        // One program header for the note segment, plus one per RAM mapping.
        let mut elf_phdr_num = 1;
        let elf_sh_info = 0;
        let coredump_file_path = url_to_file(destination_url)?;
        let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();

        // The ELF phdr count is a u16; bail out if it cannot fit.
        if mapping_num < UINT16_MAX - 2 {
            elf_phdr_num += mapping_num as u16;
        } else {
            panic!("mapping num beyond 65535 not supported");
        }
        // create_new(true) refuses to clobber an existing dump file.
        let coredump_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(coredump_file_path)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
        let mem_data = self
            .memory_manager
            .lock()
            .unwrap()
            .coredump_memory_regions(mem_offset);

        Ok(DumpState {
            elf_note_size,
            elf_phdr_num,
            elf_sh_info,
            mem_offset,
            mem_info: Some(mem_data),
            file: Some(coredump_file),
        })
    }

    /// Byte offset at which memory contents start in the coredump:
    /// ELF header + note segment + all program headers.
    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
        size_of::<elf::Elf64_Ehdr>() as u64
            + note_size as u64
            + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
    }
}

impl Pausable for Vm {
    /// Pauses the VM: validates the state transition, saves the KVM clock
    /// (x86_64), activates pending virtio devices, then pauses vCPUs and
    /// devices before committing the Paused state.
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "pausing");
        // Hold the state write lock across the whole pause so the transition
        // is atomic with respect to other state readers.
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            // Capture the guest clock so resume()/restore can re-inject it.
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
            clock.reset_flags();
            self.saved_clock = Some(clock);
        }

        // Before pausing the vCPUs activate any pending virtio devices that might
        // need activation between starting the pause (or e.g.
        // a migration it's part of)
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        // Pause vCPUs first, then devices, so no device sees further guest
        // activity while quiescing.
        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    /// Resumes a paused VM: validates the transition, restarts vCPUs,
    /// restores the saved guest clock (x86_64), then resumes devices.
    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            // Re-inject the clock captured at pause time, if any, so guest
            // timekeeping stays consistent across the pause.
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }
        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
2356 *state = new_state; 2357 event!("vm", "resumed"); 2358 Ok(()) 2359 } 2360 } 2361 2362 #[derive(Serialize, Deserialize)] 2363 pub struct VmSnapshot { 2364 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2365 pub clock: Option<hypervisor::ClockData>, 2366 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2367 pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>, 2368 } 2369 2370 pub const VM_SNAPSHOT_ID: &str = "vm"; 2371 impl Snapshottable for Vm { 2372 fn id(&self) -> String { 2373 VM_SNAPSHOT_ID.to_string() 2374 } 2375 2376 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2377 event!("vm", "snapshotting"); 2378 2379 #[cfg(feature = "tdx")] 2380 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2381 2382 #[cfg(feature = "tdx")] 2383 { 2384 if tdx_enabled { 2385 return Err(MigratableError::Snapshot(anyhow!( 2386 "Snapshot not possible with TDX VM" 2387 ))); 2388 } 2389 } 2390 2391 let current_state = self.get_state().unwrap(); 2392 if current_state != VmState::Paused { 2393 return Err(MigratableError::Snapshot(anyhow!( 2394 "Trying to snapshot while VM is running" 2395 ))); 2396 } 2397 2398 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2399 let common_cpuid = { 2400 let phys_bits = physical_bits( 2401 &self.hypervisor, 2402 self.config.lock().unwrap().cpus.max_phys_bits, 2403 ); 2404 arch::generate_common_cpuid( 2405 &self.hypervisor, 2406 None, 2407 None, 2408 phys_bits, 2409 self.config.lock().unwrap().cpus.kvm_hyperv, 2410 #[cfg(feature = "tdx")] 2411 tdx_enabled, 2412 ) 2413 .map_err(|e| { 2414 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e)) 2415 })? 
2416 }; 2417 2418 let vm_snapshot_data = serde_json::to_vec(&VmSnapshot { 2419 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2420 clock: self.saved_clock, 2421 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2422 common_cpuid, 2423 }) 2424 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2425 2426 let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data)); 2427 2428 let (id, snapshot) = { 2429 let mut cpu_manager = self.cpu_manager.lock().unwrap(); 2430 (cpu_manager.id(), cpu_manager.snapshot()?) 2431 }; 2432 vm_snapshot.add_snapshot(id, snapshot); 2433 let (id, snapshot) = { 2434 let mut memory_manager = self.memory_manager.lock().unwrap(); 2435 (memory_manager.id(), memory_manager.snapshot()?) 2436 }; 2437 vm_snapshot.add_snapshot(id, snapshot); 2438 let (id, snapshot) = { 2439 let mut device_manager = self.device_manager.lock().unwrap(); 2440 (device_manager.id(), device_manager.snapshot()?) 2441 }; 2442 vm_snapshot.add_snapshot(id, snapshot); 2443 2444 event!("vm", "snapshotted"); 2445 Ok(vm_snapshot) 2446 } 2447 } 2448 2449 impl Transportable for Vm { 2450 fn send( 2451 &self, 2452 snapshot: &Snapshot, 2453 destination_url: &str, 2454 ) -> std::result::Result<(), MigratableError> { 2455 let mut snapshot_config_path = url_to_path(destination_url)?; 2456 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2457 2458 // Create the snapshot config file 2459 let mut snapshot_config_file = OpenOptions::new() 2460 .read(true) 2461 .write(true) 2462 .create_new(true) 2463 .open(snapshot_config_path) 2464 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2465 2466 // Serialize and write the snapshot config 2467 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2468 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2469 2470 snapshot_config_file 2471 .write(vm_config.as_bytes()) 2472 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2473 2474 let mut snapshot_state_path = url_to_path(destination_url)?; 
2475 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2476 2477 // Create the snapshot state file 2478 let mut snapshot_state_file = OpenOptions::new() 2479 .read(true) 2480 .write(true) 2481 .create_new(true) 2482 .open(snapshot_state_path) 2483 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2484 2485 // Serialize and write the snapshot state 2486 let vm_state = 2487 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2488 2489 snapshot_state_file 2490 .write(&vm_state) 2491 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2492 2493 // Tell the memory manager to also send/write its own snapshot. 2494 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2495 self.memory_manager 2496 .lock() 2497 .unwrap() 2498 .send(&memory_manager_snapshot.clone(), destination_url)?; 2499 } else { 2500 return Err(MigratableError::Restore(anyhow!( 2501 "Missing memory manager snapshot" 2502 ))); 2503 } 2504 2505 Ok(()) 2506 } 2507 } 2508 2509 impl Migratable for Vm { 2510 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2511 self.memory_manager.lock().unwrap().start_dirty_log()?; 2512 self.device_manager.lock().unwrap().start_dirty_log() 2513 } 2514 2515 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2516 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2517 self.device_manager.lock().unwrap().stop_dirty_log() 2518 } 2519 2520 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2521 Ok(MemoryRangeTable::new_from_tables(vec![ 2522 self.memory_manager.lock().unwrap().dirty_log()?, 2523 self.device_manager.lock().unwrap().dirty_log()?, 2524 ])) 2525 } 2526 2527 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2528 self.memory_manager.lock().unwrap().start_migration()?; 2529 self.device_manager.lock().unwrap().start_migration() 2530 } 2531 2532 fn complete_migration(&mut self) -> 
std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().complete_migration()?;
        self.device_manager.lock().unwrap().complete_migration()
    }
}

#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Sets hardware breakpoints and/or single-step mode on vCPU `cpu_id`.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM for the debugger and records the BreakPoint state.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        // Overwrite Paused with BreakPoint so a later debug_resume() knows
        // this pause came from the debugger.
        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes a VM that was stopped at a debugger breakpoint; a no-op in
    /// any other state.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            // NOTE(review): resume failure is mapped to DebuggableError::Pause;
            // presumably the error enum has no dedicated Resume variant —
            // confirm against gdb.rs.
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    /// Reads `len` bytes of guest memory at virtual address `vaddr`, using
    /// vCPU `cpu_id` for address translation.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    /// Writes `data` into guest memory at virtual address `vaddr`.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(),
 DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    fn active_vcpus(&self) -> usize {
        let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            self.cpu_manager.lock().unwrap().boot_vcpus() as usize
        }
    }
}

// Upper bound for the u16 ELF program-header count used by the coredump code.
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = 65535;

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes an ELF coredump of the guest to `destination_url`. The VM is
    /// paused for the duration and resumed afterwards if it was running.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        let mut resume = false;

        #[cfg(feature = "tdx")]
        {
            // TDX guest memory is not readable by the host, so no coredump.
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        match self.get_state().unwrap() {
            VmState::Running => {
                // Pause for a consistent dump; remember to resume afterwards.
                self.pause().map_err(GuestDebuggableError::Pause)?;
                resume = true;
            }
            VmState::Paused => {}
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        }

        let coredump_state = self.get_dump_state(destination_url)?;

        // ELF layout: header, note segment, program headers...
        self.write_header(&coredump_state)?;
        self.write_note(&coredump_state)?;
        self.write_loads(&coredump_state)?;

        // ...per-vCPU register notes and VMM-specific notes...
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_elf64_note(&coredump_state)?;
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_vmm_note(&coredump_state)?;

        // ...and finally the guest memory contents.
        self.memory_manager
            .lock()
            .unwrap()
            .coredump_iterate_save_mem(&coredump_state)?;

        if resume {
            self.resume().map_err(GuestDebuggableError::Resume)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    // Exhaustively checks which state transitions are permitted out of a
    // given starting state; shared by the per-state #[test] functions below.
    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::BreakPoint => {
                // Check the transitions from Breakpoint
                assert!(state.valid_transition(VmState::Created).is_ok());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    // Verifies Vm::hob_memory_resources() splits guest address space into
    // (start, size, is_ram) triples: RAM regions minus TDVF sections, with
    // the sections themselves reported as non-RAM resources.
    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    const LEN: u64 = 4096;

    // Builds an FDT for a minimal serial + virtio + RTC MMIO layout and
    // checks that generation succeeds.
    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");
        assert!(create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .is_ok())
    }
}

// Smoke test: runs a tiny real-mode guest that adds two registers and prints
// the digit plus a newline over port 0x3f8, then halts.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor.
    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Flat real-mode segments: base/selector zero so RIP addresses linearly.
    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    // Run until the `hlt` instruction surfaces as a Reset exit.
    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}