// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use std::cmp;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::fs::{File, OpenOptions};
use std::io::{self, Seek, SeekFrom, Write};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::num::Wrapping;
use std::ops::Deref;
use std::os::unix::net::UnixStream;
use std::sync::{Arc, Mutex, RwLock};
use std::time::Instant;
use std::{result, str, thread};

use anyhow::anyhow;
use arch::get_host_cpu_phys_bits;
#[cfg(target_arch = "x86_64")]
use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
#[cfg(feature = "tdx")]
use arch::x86_64::tdx::TdvfSection;
use arch::EntryPoint;
#[cfg(target_arch = "aarch64")]
use arch::PciSpaceInfo;
use arch::{NumaNode, NumaNodes};
#[cfg(target_arch = "aarch64")]
use devices::interrupt_controller;
use devices::AcpiNotificationFlags;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
use hypervisor::{HypervisorVmError, VmOps};
use libc::{termios, SIGWINCH};
use linux_loader::cmdline::Cmdline;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::bzimage::BzImage;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
#[cfg(target_arch = "aarch64")]
use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
use linux_loader::loader::KernelLoader;
use seccompiler::SeccompAction;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::Bus;
#[cfg(feature = "tdx")]
use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
use vm_memory::{
    Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
};
use vm_migration::protocol::{Request, Response};
use vm_migration::{
    protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
    Snapshottable, Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::sock_ctrl_msg::ScmSocket;

use crate::config::{
    add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
    UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
};
use crate::config::{NumaConfig, PayloadConfig};
use crate::console_devices::{ConsoleDeviceError, ConsoleInfo};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
};
use crate::cpu;
use crate::device_manager::{DeviceManager, DeviceManagerError};
use crate::device_tree::DeviceTree;
#[cfg(feature = "guest_debug")]
use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
#[cfg(feature = "igvm")]
use crate::igvm::igvm_loader;
use crate::landlock::LandlockError;
use crate::memory_manager::{
    Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
};
#[cfg(target_arch = "x86_64")]
use crate::migration::get_vm_snapshot;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::migration::url_to_file;
use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
use crate::GuestMemoryMmap;
use crate::{
    PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
};

/// Errors associated with VM management
#[derive(Debug, Error)]
pub enum Error {
    #[error("Cannot open kernel file: {0}")]
    KernelFile(#[source] io::Error),

    #[error("Cannot open initramfs file: {0}")]
    InitramfsFile(#[source] io::Error),

    #[error("Cannot load the kernel into memory: {0}")]
    KernelLoad(#[source] linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot load the UEFI binary in memory: {0:?}")]
    UefiLoad(arch::aarch64::uefi::Error),

    #[error("Cannot load the initramfs into memory")]
    InitramfsLoad,

    #[error("Cannot load the kernel command line in memory: {0}")]
    LoadCmdLine(#[source] linux_loader::loader::Error),

    #[error("Failed to apply landlock config during vm_create: {0}")]
    ApplyLandlock(#[source] LandlockError),

    #[error("Cannot modify the kernel command line: {0}")]
    CmdLineInsertStr(#[source] linux_loader::cmdline::Error),

    #[error("Cannot create the kernel command line: {0}")]
    CmdLineCreate(#[source] linux_loader::cmdline::Error),

    #[error("Cannot configure system: {0}")]
    ConfigureSystem(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot enable interrupt controller: {0:?}")]
    EnableInterruptController(interrupt_controller::Error),

    #[error("VM state is poisoned")]
    PoisonedState,

    #[error("Error from device manager: {0:?}")]
    DeviceManager(DeviceManagerError),

    #[error("No device with id {0:?} to remove")]
    NoDeviceToRemove(String),

    #[error("Cannot spawn a signal handler thread: {0}")]
    SignalHandlerSpawn(#[source] io::Error),

    #[error("Failed to join on threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("VM config is missing")]
    VmMissingConfig,

    #[error("VM is not created")]
    VmNotCreated,

    #[error("VM is already created")]
    VmAlreadyCreated,

    #[error("VM is not running")]
    VmNotRunning,

    #[error("Cannot clone EventFd: {0}")]
    EventFdClone(#[source] io::Error),

    #[error("Invalid VM state transition: {0:?} to {1:?}")]
    InvalidStateTransition(VmState, VmState),

    #[error("Error from CPU manager: {0}")]
    CpuManager(#[source] cpu::Error),

    #[error("Cannot pause devices: {0}")]
    PauseDevices(#[source] MigratableError),

    #[error("Cannot resume devices: {0}")]
    ResumeDevices(#[source] MigratableError),

    #[error("Cannot pause CPUs: {0}")]
    PauseCpus(#[source] MigratableError),

    #[error("Cannot resume CPUs: {0}")]
    ResumeCpus(#[source] MigratableError),

    #[error("Cannot pause VM: {0}")]
    Pause(#[source] MigratableError),

    #[error("Cannot resume VM: {0}")]
    Resume(#[source] MigratableError),

    #[error("Memory manager error: {0:?}")]
    MemoryManager(MemoryManagerError),

    #[error("Eventfd write error: {0}")]
    EventfdError(#[source] std::io::Error),

    #[error("Cannot snapshot VM: {0}")]
    Snapshot(#[source] MigratableError),

    #[error("Cannot restore VM: {0}")]
    Restore(#[source] MigratableError),

    #[error("Cannot send VM snapshot: {0}")]
    SnapshotSend(#[source] MigratableError),

    #[error("Invalid restore source URL")]
    InvalidRestoreSourceUrl,

    #[error("Failed to validate config: {0}")]
    ConfigValidation(#[source] ValidationError),

    #[error("Too many virtio-vsock devices")]
    TooManyVsockDevices,

    #[error("Failed serializing into JSON: {0}")]
    SerializeJson(#[source] serde_json::Error),

    #[error("Invalid NUMA configuration")]
    InvalidNumaConfig,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Failed resizing a memory zone")]
    ResizeZone,

    #[error("Cannot activate virtio devices: {0:?}")]
    ActivateVirtioDevices(DeviceManagerError),

    #[error("Error triggering power button: {0:?}")]
    PowerButton(DeviceManagerError),

    #[error("Kernel lacks PVH header")]
    KernelMissingPvhHeader,

    #[error("Failed to allocate firmware RAM: {0:?}")]
    AllocateFirmwareMemory(MemoryManagerError),

    #[error("Error manipulating firmware file: {0}")]
    FirmwareFile(#[source] std::io::Error),

    #[error("Firmware too big")]
    FirmwareTooLarge,

    #[error("Failed to copy firmware to memory: {0}")]
    FirmwareLoad(#[source] vm_memory::GuestMemoryError),

    #[cfg(feature = "sev_snp")]
    #[error("Error enabling SEV-SNP VM: {0}")]
    InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on TDX firmware file: {0}")]
    LoadTdvf(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on the TDX payload file: {0}")]
    LoadPayload(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error parsing TDVF: {0}")]
    ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error populating TDX HOB: {0}")]
    PopulateHob(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error allocating TDVF memory: {0:?}")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX VM: {0}")]
    InitializeTdxVm(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX memory region: {0}")]
    InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error finalizing TDX VM: {0}")]
    FinalizeTdx(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("TDX firmware missing")]
    TdxFirmwareMissing,

    #[cfg(feature = "tdx")]
    #[error("Invalid TDX payload type")]
    InvalidPayloadType,

    #[cfg(feature = "guest_debug")]
    #[error("Error debugging VM: {0:?}")]
    Debug(DebuggableError),

    #[error("Error spawning kernel loading thread")]
    KernelLoadThreadSpawn(std::io::Error),

    #[error("Error joining kernel loading thread")]
    KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Payload configuration is not bootable")]
    InvalidPayload,

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    #[error("Error coredumping VM: {0:?}")]
    Coredump(GuestDebuggableError),

    #[cfg(feature = "igvm")]
    #[error("Cannot open igvm file: {0}")]
    IgvmFile(#[source] io::Error),

    #[cfg(feature = "igvm")]
    #[error("Cannot load the igvm into memory: {0}")]
    IgvmLoad(#[source] igvm_loader::Error),

    #[error("Error injecting NMI")]
    ErrorNmi,

    #[error("Error resuming the VM: {0}")]
    ResumeVm(#[source] hypervisor::HypervisorVmError),

    #[error("Error creating console devices")]
    CreateConsoleDevices(ConsoleDeviceError),
}

pub type Result<T> = result::Result<T, Error>;

#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
    BreakPoint,
}

impl VmState {
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
                VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
                    Ok(())
                }
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
            VmState::BreakPoint => match new_state {
                VmState::Created | VmState::Running => Ok(()),
                _ => Err(Error::InvalidStateTransition(self, new_state)),
            },
        }
    }
}

struct VmOpsHandler {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
}

impl VmOps for VmOpsHandler {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            info!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}
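/// Clamp the requested guest physical address width to what the host CPU
/// supports. A minimal sketch of the intent (`hv` stands for any
/// `Arc<dyn hypervisor::Hypervisor>`; the values are hypothetical):
///
/// ```ignore
/// // Host reports 46 physical bits, user asked for 48: the guest gets 46.
/// assert_eq!(physical_bits(&hv, 48), 46);
/// ```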
pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
    let host_phys_bits = get_host_cpu_phys_bits(hypervisor);

    cmp::min(host_phys_bits, max_phys_bits)
}

pub struct Vm {
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
    numa_nodes: NumaNodes,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    stop_on_boot: bool,
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}
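// The VM lifecycle follows the state machine encoded in
// `VmState::valid_transition()` above. An illustrative (non-exhaustive)
// sketch of the common paths:
//
//   Created --boot()--> Running --pause()--> Paused --resume()--> Running
//   Running --shutdown()--> Shutdown --boot()--> Running
//
// `BreakPoint` is entered instead of `Running` when booting with the
// `stop_on_boot` (gdb) flag set.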
impl Vm {
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];

    #[allow(clippy::too_many_arguments)]
    pub fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        timestamp: Instant,
        console_info: Option<ConsoleInfo>,
        console_resize_pipe: Option<Arc<File>>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new_from_memory_manager");

        let boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        #[cfg(not(feature = "igvm"))]
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(&memory_manager, &config)?
        } else {
            None
        };

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
        #[cfg(feature = "tdx")]
        let force_iommu = tdx_enabled;
        #[cfg(feature = "sev_snp")]
        let force_iommu = sev_snp_enabled;
        #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
        let force_iommu = false;

        #[cfg(feature = "guest_debug")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "guest_debug"))]
        let stop_on_boot = false;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::new(Bus::new());
        let mmio_bus = Arc::new(Bus::new());

        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus: io_bus.clone(),
            mmio_bus: mmio_bus.clone(),
        });

        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            vm.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt.try_clone().map_err(Error::EventFdClone)?,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            &hypervisor,
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        )
        .map_err(Error::CpuManager)?;

        #[cfg(target_arch = "x86_64")]
        cpu_manager
            .lock()
            .unwrap()
            .populate_cpuid(
                &memory_manager,
                &hypervisor,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CpuManager)?;

        // Loading the IGVM file is pushed down here because the IGVM parser
        // needs the cpu_manager to retrieve CPUID leaves. In the regular case
        // loading can start early, but in the IGVM case it has to wait until
        // the cpu_manager has been created. Since Microsoft Hypervisor does
        // not currently provide a hypervisor-specific common CPUID, each leaf
        // must be queried via get_cpuid_values through the cpu_manager.
        #[cfg(feature = "igvm")]
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(
                &memory_manager,
                &config,
                &cpu_manager,
                #[cfg(feature = "sev_snp")]
                sev_snp_enabled,
            )?
        } else {
            None
        };
        // The initial TDX configuration must be done before the vCPUs are
        // created.
        #[cfg(feature = "tdx")]
        if tdx_enabled {
            let cpuid = cpu_manager.lock().unwrap().common_cpuid();
            let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
            vm.tdx_init(&cpuid, max_vcpus)
                .map_err(Error::InitializeTdxVm)?;
        }

        cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
            .map_err(Error::CpuManager)?;

        // This initial SEV-SNP configuration must be done immediately after
        // vCPUs are created. As part of this initialization we are
        // transitioning the guest into secure state.
        #[cfg(feature = "sev_snp")]
        if sev_snp_enabled {
            vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
        }

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        let device_manager = DeviceManager::new(
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            cpu_manager.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            boot_id_list,
            timestamp,
            snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
            dynamic,
        )
        .map_err(Error::DeviceManager)?;

        device_manager
            .lock()
            .unwrap()
            .create_devices(console_info, console_resize_pipe, original_termios)
            .map_err(Error::DeviceManager)?;

        #[cfg(feature = "tdx")]
        let kernel = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.kernel.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.initramfs.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::InitramfsFile)?;

        #[cfg(target_arch = "x86_64")]
        let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
            let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
            vm_snapshot.clock
        } else {
            None
        };

        let vm_state = if snapshot.is_some() {
            VmState::Paused
        } else {
            VmState::Created
        };

        Ok(Vm {
            #[cfg(feature = "tdx")]
            kernel,
            initramfs,
            device_manager,
            config,
            threads: Vec::with_capacity(1),
            state: RwLock::new(vm_state),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(target_arch = "x86_64")]
            saved_clock,
            numa_nodes,
            hypervisor,
            stop_on_boot,
            load_payload_handle,
        })
    }
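    /// Build the guest NUMA topology from the user-provided `NumaConfig`
    /// entries, resolving memory zone names against the `MemoryManager`.
    ///
    /// An illustrative (hypothetical) configuration, to show the mapping:
    ///
    /// ```ignore
    /// // --numa guest_numa_id=0,cpus=[0-1],memory_zones=mem0
    /// // produces one NumaNode whose memory_regions come from zone "mem0"
    /// // and whose cpus set is {0, 1}.
    /// ```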
error!("Unknown destination NUMA node {}", dest); 759 return Err(Error::InvalidNumaConfig); 760 } 761 762 if node.distances.contains_key(&dest) { 763 error!("Destination NUMA node {} has been already set", dest); 764 return Err(Error::InvalidNumaConfig); 765 } 766 767 node.distances.insert(dest, dist); 768 } 769 } 770 771 #[cfg(target_arch = "x86_64")] 772 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 773 if let Some(sgx_epc_region) = mm.sgx_epc_region() { 774 let mm_sections = sgx_epc_region.epc_sections(); 775 for sgx_epc_section in sgx_epc_sections.iter() { 776 if let Some(mm_section) = mm_sections.get(sgx_epc_section) { 777 node.sgx_epc_sections.push(mm_section.clone()); 778 } else { 779 error!("Unknown SGX EPC section '{}'", sgx_epc_section); 780 return Err(Error::InvalidNumaConfig); 781 } 782 } 783 } else { 784 error!("Missing SGX EPC region"); 785 return Err(Error::InvalidNumaConfig); 786 } 787 } 788 789 numa_nodes.insert(config.guest_numa_id, node); 790 } 791 } 792 793 Ok(numa_nodes) 794 } 795 796 #[allow(clippy::too_many_arguments)] 797 pub fn new( 798 vm_config: Arc<Mutex<VmConfig>>, 799 exit_evt: EventFd, 800 reset_evt: EventFd, 801 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 802 seccomp_action: &SeccompAction, 803 hypervisor: Arc<dyn hypervisor::Hypervisor>, 804 activate_evt: EventFd, 805 console_info: Option<ConsoleInfo>, 806 console_resize_pipe: Option<Arc<File>>, 807 original_termios: Arc<Mutex<Option<termios>>>, 808 snapshot: Option<Snapshot>, 809 source_url: Option<&str>, 810 prefault: Option<bool>, 811 ) -> Result<Self> { 812 trace_scoped!("Vm::new"); 813 814 let timestamp = Instant::now(); 815 816 #[cfg(feature = "tdx")] 817 let tdx_enabled = if snapshot.is_some() { 818 false 819 } else { 820 vm_config.lock().unwrap().is_tdx_enabled() 821 }; 822 823 #[cfg(feature = "sev_snp")] 824 let sev_snp_enabled = if snapshot.is_some() { 825 false 826 } else { 827 vm_config.lock().unwrap().is_sev_snp_enabled() 828 }; 829 830 let vm = Self::create_hypervisor_vm( 831 &hypervisor, 832 #[cfg(feature = "tdx")] 833 tdx_enabled, 834 #[cfg(feature = "sev_snp")] 835 sev_snp_enabled, 836 )?; 837 838 let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); 839 840 let memory_manager = if let Some(snapshot) = 841 snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID) 842 { 843 MemoryManager::new_from_snapshot( 844 &snapshot, 845 vm.clone(), 846 &vm_config.lock().unwrap().memory.clone(), 847 source_url, 848 prefault.unwrap(), 849 phys_bits, 850 ) 851 .map_err(Error::MemoryManager)? 852 } else { 853 #[cfg(target_arch = "x86_64")] 854 let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone(); 855 856 MemoryManager::new( 857 vm.clone(), 858 &vm_config.lock().unwrap().memory.clone(), 859 None, 860 phys_bits, 861 #[cfg(feature = "tdx")] 862 tdx_enabled, 863 None, 864 None, 865 #[cfg(target_arch = "x86_64")] 866 sgx_epc_config, 867 ) 868 .map_err(Error::MemoryManager)? 
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        vm_config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        console_info: Option<ConsoleInfo>,
        console_resize_pipe: Option<Arc<File>>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
        source_url: Option<&str>,
        prefault: Option<bool>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new");

        let timestamp = Instant::now();

        #[cfg(feature = "tdx")]
        let tdx_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_tdx_enabled()
        };

        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_sev_snp_enabled()
        };

        let vm = Self::create_hypervisor_vm(
            &hypervisor,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        )?;

        let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);

        let memory_manager = if let Some(snapshot) =
            snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
        {
            MemoryManager::new_from_snapshot(
                &snapshot,
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                source_url,
                prefault.unwrap(),
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            #[cfg(target_arch = "x86_64")]
            let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();

            MemoryManager::new(
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                None,
                phys_bits,
                #[cfg(feature = "tdx")]
                tdx_enabled,
                None,
                None,
                #[cfg(target_arch = "x86_64")]
                sgx_epc_config,
            )
            .map_err(Error::MemoryManager)?
        };

        Vm::new_from_memory_manager(
            vm_config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            timestamp,
            console_info,
            console_resize_pipe,
            original_termios,
            snapshot,
        )
    }

    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        hypervisor.check_required_extensions().unwrap();

        cfg_if::cfg_if! {
            if #[cfg(feature = "tdx")] {
                // The VM type maps the tdx_enabled flag directly:
                // KVM_X86_TDX_VM (1) when true, KVM_X86_LEGACY_VM (0) otherwise.
                let vm = hypervisor
                    .create_vm_with_type(u64::from(tdx_enabled))
                    .unwrap();
            } else if #[cfg(feature = "sev_snp")] {
                // The VM type maps the sev_snp_enabled flag directly:
                // SEV_SNP_ENABLED (1) when true, SEV_SNP_DISABLED (0) otherwise.
                let vm = hypervisor
                    .create_vm_with_type(u64::from(sev_snp_enabled))
                    .unwrap();
            } else {
                let vm = hypervisor.create_vm().unwrap();
            }
        }

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }
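    /// Load the initramfs into guest memory. The load address is computed by
    /// `arch::initramfs_load_addr()` from the initramfs size, so the file is
    /// measured first (seek to end), then rewound and copied in one volatile
    /// read.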
    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let initramfs = self.initramfs.as_mut().unwrap();
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_volatile_from(address, initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    pub fn generate_cmdline(
        payload: &PayloadConfig,
        #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
    ) -> Result<Cmdline> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
        if let Some(s) = payload.cmdline.as_ref() {
            cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
        }

        #[cfg(target_arch = "aarch64")]
        for entry in device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        Ok(cmdline)
    }

    #[cfg(target_arch = "aarch64")]
    fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
        let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
        let mem = uefi_flash.memory();
        arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
            .map_err(Error::UefiLoad)?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn load_kernel(
        firmware: Option<File>,
        kernel: Option<File>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let entry_addr = match (firmware, kernel) {
            (None, Some(mut kernel)) => {
                match linux_loader::loader::pe::PE::load(
                    mem.deref(),
                    Some(arch::layout::KERNEL_START),
                    &mut kernel,
                    None,
                ) {
                    Ok(entry_addr) => entry_addr.kernel_load,
                    // Try to load the binary as a kernel PE file first. If
                    // that fails, retry it as a UEFI binary. Since the UEFI
                    // binary is formatless, it must be the last option tried.
                    Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                        Self::load_firmware(&kernel, memory_manager)?;
                        arch::layout::UEFI_START
                    }
                    Err(e) => {
                        return Err(Error::KernelLoad(e));
                    }
                }
            }
            (Some(firmware), None) => {
                Self::load_firmware(&firmware, memory_manager)?;
                arch::layout::UEFI_START
            }
            _ => return Err(Error::InvalidPayload),
        };

        Ok(EntryPoint { entry_addr })
    }
    #[cfg(feature = "igvm")]
    fn load_igvm(
        igvm: File,
        memory_manager: Arc<Mutex<MemoryManager>>,
        cpu_manager: Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] host_data: &Option<String>,
    ) -> Result<EntryPoint> {
        let res = igvm_loader::load_igvm(
            &igvm,
            memory_manager,
            cpu_manager.clone(),
            "",
            #[cfg(feature = "sev_snp")]
            host_data,
        )
        .map_err(Error::IgvmLoad)?;

        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
                    EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
                } else {
                    EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
                };
            } else {
                let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
            }
        };
        Ok(entry_point)
    }

    #[cfg(target_arch = "x86_64")]
    fn load_kernel(
        mut kernel: File,
        cmdline: Option<Cmdline>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        info!("Loading kernel");

        let mem = {
            let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
            guest_memory.memory()
        };

        // Try ELF binary with PVH boot.
        let entry_addr = linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        )
        // Try loading kernel as bzImage.
        .or_else(|_| {
            BzImage::load(
                mem.deref(),
                None,
                &mut kernel,
                Some(arch::layout::HIGH_RAM_START),
            )
        })
        .map_err(Error::KernelLoad)?;

        if let Some(cmdline) = cmdline {
            linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
                .map_err(Error::LoadCmdLine)?;
        }

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest
            info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint {
                entry_addr,
                setup_header: None,
            })
        } else if entry_addr.setup_header.is_some() {
            // Use the bzImage 32bit entry point to boot the guest
            info!(
                "bzImage kernel loaded: entry_addr = 0x{:x}",
                entry_addr.kernel_load.0
            );
            Ok(EntryPoint {
                entry_addr: entry_addr.kernel_load,
                setup_header: entry_addr.setup_header,
            })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }
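    /// Resolve the configured payload into an entry point. Precedence is:
    /// an IGVM file (when the feature is enabled), then a standalone
    /// firmware, then a kernel (optionally with initramfs and command
    /// line); anything else is rejected as `Error::InvalidPayload`.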
    #[cfg(target_arch = "x86_64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
        #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<EntryPoint> {
        trace_scoped!("load_payload");
        #[cfg(feature = "igvm")]
        {
            if let Some(_igvm_file) = &payload.igvm {
                let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
                #[cfg(feature = "sev_snp")]
                if sev_snp_enabled {
                    return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
                }
                #[cfg(not(feature = "sev_snp"))]
                return Self::load_igvm(igvm, memory_manager, cpu_manager);
            }
        }
        match (
            &payload.firmware,
            &payload.kernel,
            &payload.initramfs,
            &payload.cmdline,
        ) {
            (Some(firmware), None, None, None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(firmware, None, memory_manager)
            }
            (None, Some(kernel), _, _) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                let cmdline = Self::generate_cmdline(payload)?;
                Self::load_kernel(kernel, Some(cmdline), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    #[cfg(target_arch = "aarch64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        match (&payload.firmware, &payload.kernel) {
            (Some(firmware), None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(Some(firmware), None, memory_manager)
            }
            (None, Some(kernel)) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                Self::load_kernel(None, Some(kernel), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    fn load_payload_async(
        memory_manager: &Arc<Mutex<MemoryManager>>,
        config: &Arc<Mutex<VmConfig>>,
        #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
        // Kernel with TDX is loaded in a different manner
        #[cfg(feature = "tdx")]
        if config.lock().unwrap().is_tdx_enabled() {
            return Ok(None);
        }

        config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|payload| {
                let memory_manager = memory_manager.clone();
                let payload = payload.clone();
                #[cfg(feature = "igvm")]
                let cpu_manager = cpu_manager.clone();

                std::thread::Builder::new()
                    .name("payload_loader".into())
                    .spawn(move || {
                        Self::load_payload(
                            &payload,
                            memory_manager,
                            #[cfg(feature = "igvm")]
                            cpu_manager,
                            #[cfg(feature = "sev_snp")]
                            sev_snp_enabled,
                        )
                    })
                    .map_err(Error::KernelLoadThreadSpawn)
            })
            .transpose()
    }
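    // Note on the boot flow: the "payload_loader" thread spawned above runs
    // concurrently with device and ACPI setup; `Vm::entry_point()` joins it
    // during `boot()` and the resulting `EntryPoint` feeds
    // `configure_system()` below.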
    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
        trace_scoped!("configure_system");
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
        let rsdp_addr = Some(rsdp_addr);
        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        let serial_number = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.serial_number.clone());

        let uuid = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.uuid.clone());

        let oem_strings = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.oem_strings.clone());

        let oem_strings = oem_strings
            .as_deref()
            .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());

        let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            arch::layout::CMDLINE_MAX_SIZE,
            &initramfs_config,
            boot_vcpus,
            entry_addr.setup_header,
            rsdp_addr,
            sgx_epc_region,
            serial_number.as_deref(),
            uuid.as_deref(),
            oem_strings.as_deref(),
            topology,
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn configure_system(
        &mut self,
        _rsdp_addr: GuestAddress,
        _entry_addr: EntryPoint,
    ) -> Result<()> {
        let cmdline = Self::generate_cmdline(
            self.config.lock().unwrap().payload.as_ref().unwrap(),
            &self.device_manager,
        )?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
            let pci_space = PciSpaceInfo {
                pci_segment_id: pci_segment.id,
                mmio_config_address: pci_segment.mmio_config_address,
                pci_device_space_start: pci_segment.start_of_mem64_area,
                pci_device_space_size: pci_segment.end_of_mem64_area
                    - pci_segment.start_of_mem64_area
                    + 1,
            };
            pci_space_info.push(pci_space);
        }

        let virtio_iommu_bdf = self
            .device_manager
            .lock()
            .unwrap()
            .iommu_attached_devices()
            .as_ref()
            .map(|(v, _)| *v);

        let vgic = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .get_vgic()
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::SetupGic,
                ))
            })?;

        // The PMU interrupt is a PPI, so 16 must be added to get the real IRQ number.
        let pmu_supported = self
            .cpu_manager
            .lock()
            .unwrap()
            .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::VcpuInitPmu,
                ))
            })?;

        arch::configure_system(
            &mem,
            cmdline.as_cstring().unwrap().to_str().unwrap(),
            vcpu_mpidrs,
            vcpu_topology,
            device_info,
            &initramfs_config,
            &pci_space_info,
            virtio_iommu_bdf.map(|bdf| bdf.into()),
            &vgic,
            &self.numa_nodes,
            pmu_supported,
        )
        .map_err(Error::ConfigureSystem)?;

        Ok(())
    }

    pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
        self.device_manager.lock().unwrap().console_resize_pipe()
    }

    pub fn shutdown(&mut self) -> Result<()> {
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        let new_state = VmState::Shutdown;

        state.valid_transition(new_state)?;

        // Wake up the DeviceManager threads so they will get terminated cleanly
        self.device_manager
            .lock()
            .unwrap()
            .resume()
            .map_err(Error::Resume)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .shutdown()
            .map_err(Error::CpuManager)?;

        // Wait for all the threads to finish
        for thread in self.threads.drain(..) {
            thread.join().map_err(Error::ThreadCleanup)?
        }
        *state = new_state;

        Ok(())
    }
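    /// Hotplug (or unplug) vCPUs, memory, and/or the balloon target. A
    /// hypothetical sketch of how a caller might grow a guest, assuming a
    /// `vm: &mut Vm` that is already running:
    ///
    /// ```ignore
    /// // 8 vCPUs, 8 GiB of RAM, leave the balloon untouched.
    /// vm.resize(Some(8), Some(8 << 30), None)?;
    /// ```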
    pub fn resize(
        &mut self,
        desired_vcpus: Option<u8>,
        desired_memory: Option<u64>,
        desired_balloon: Option<u64>,
    ) -> Result<()> {
        event!("vm", "resizing");

        if let Some(desired_vcpus) = desired_vcpus {
            if self
                .cpu_manager
                .lock()
                .unwrap()
                .resize(desired_vcpus)
                .map_err(Error::CpuManager)?
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
        }

        if let Some(desired_memory) = desired_memory {
            let new_region = self
                .memory_manager
                .lock()
                .unwrap()
                .resize(desired_memory)
                .map_err(Error::MemoryManager)?;

            let memory_config = &mut self.config.lock().unwrap().memory;

            if let Some(new_region) = &new_region {
                self.device_manager
                    .lock()
                    .unwrap()
                    .update_memory(new_region)
                    .map_err(Error::DeviceManager)?;

                match memory_config.hotplug_method {
                    HotplugMethod::Acpi => {
                        self.device_manager
                            .lock()
                            .unwrap()
                            .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                            .map_err(Error::DeviceManager)?;
                    }
                    HotplugMethod::VirtioMem => {}
                }
            }

            // We update the VM config regardless of the actual guest resize
            // operation result (happened or not), so that if the VM reboots
            // it will be running with the last configured memory size.
            match memory_config.hotplug_method {
                HotplugMethod::Acpi => memory_config.size = desired_memory,
                HotplugMethod::VirtioMem => {
                    if desired_memory > memory_config.size {
                        memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                    } else {
                        memory_config.hotplugged_size = None;
                    }
                }
            }
        }

        if let Some(desired_balloon) = desired_balloon {
            self.device_manager
                .lock()
                .unwrap()
                .resize_balloon(desired_balloon)
                .map_err(Error::DeviceManager)?;

            // Update the configuration value for the balloon size to ensure
            // a reboot would use the right value.
            if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
                balloon_config.size = desired_balloon;
            }
        }

        event!("vm", "resized");

        Ok(())
    }

    pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
        let memory_config = &mut self.config.lock().unwrap().memory;

        if let Some(zones) = &mut memory_config.zones {
            for zone in zones.iter_mut() {
                if zone.id == id {
                    if desired_memory >= zone.size {
                        let hotplugged_size = desired_memory - zone.size;
                        self.memory_manager
                            .lock()
                            .unwrap()
                            .resize_zone(&id, desired_memory - zone.size)
                            .map_err(Error::MemoryManager)?;
                        // We update the memory zone config regardless of the
                        // actual 'resize-zone' operation result (happened or
                        // not), so that if the VM reboots it will be running
                        // with the last configured memory zone size.
                        zone.hotplugged_size = Some(hotplugged_size);

                        return Ok(());
                    } else {
                        error!(
                            "Invalid to ask for less ({}) than the boot RAM ({}) for \
                             this memory zone",
                            desired_memory, zone.size,
                        );
                        return Err(Error::ResizeZone);
                    }
                }
            }
        }

        error!("Could not find the memory zone {} for the resize", id);
        Err(Error::ResizeZone)
    }
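    /// Hotplug a generic (VFIO) device. All of the `add_*` helpers below
    /// follow the same pattern: ask the `DeviceManager` to create the device,
    /// persist the new config entry so a reboot recreates it, then fire an
    /// ACPI `PCI_DEVICES_CHANGED` notification. A hypothetical usage sketch:
    ///
    /// ```ignore
    /// let pci_info = vm.add_device(device_cfg)?;
    /// ```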
    pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_device(&mut device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_user_device(&mut device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.user_devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn remove_device(&mut self, id: String) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .remove_device(id.clone())
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by removing the device. This is important to
        // ensure the device would not be created in case of a reboot.
        self.config.lock().unwrap().remove_device(&id);

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;
        Ok(())
    }

    pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_disk(&mut disk_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.disks, disk_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_fs(&mut fs_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.fs, fs_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_pmem(&mut pmem_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.pmem, pmem_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }
    pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_net(&mut net_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.net, net_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vdpa(&mut vdpa_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.vdpa, vdpa_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }
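    /// Hotplug the virtio-vsock device. Unlike the list-based `add_*` helpers
    /// above, at most one vsock device is supported, so the configuration
    /// stores it as `Some(vsock_cfg)` instead of appending to a list.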
    pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vsock(&mut vsock_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            config.vsock = Some(vsock_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
        Ok(self.device_manager.lock().unwrap().counters())
    }

    #[cfg(feature = "tdx")]
    fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
        use arch::x86_64::tdx::*;

        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        // The TDVF file contains a table of sections as well as code
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // For all the sections allocate some RAM backing them
        parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
    }

    #[cfg(feature = "tdx")]
    fn hob_memory_resources(
        mut sorted_sections: Vec<TdvfSection>,
        guest_memory: &GuestMemoryMmap,
    ) -> Vec<(u64, u64, bool)> {
        let mut list = Vec::new();

        let mut current_section = sorted_sections.pop();

        // RAM regions interleaved with TDVF sections
        let mut next_start_addr = 0;
        for region in guest_memory.iter() {
            let region_start = region.start_addr().0;
            let region_end = region.last_addr().0;
            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            loop {
                let (start, size, ram) = if let Some(section) = &current_section {
                    if section.address <= next_start_addr {
                        (section.address, section.size, false)
                    } else {
                        let last_addr = std::cmp::min(section.address - 1, region_end);
                        (next_start_addr, last_addr - next_start_addr + 1, true)
                    }
                } else {
                    (next_start_addr, region_end - next_start_addr + 1, true)
                };

                list.push((start, size, ram));

                if !ram {
                    current_section = sorted_sections.pop();
                }

                next_start_addr = start + size;

                if region_start > next_start_addr {
                    next_start_addr = region_start;
                }

                if next_start_addr > region_end {
                    break;
                }
            }
        }

        // Once all the interleaved sections have been processed, let's simply
        // pull the remaining ones.
        if let Some(section) = current_section {
            list.push((section.address, section.size, false));
        }
        while let Some(section) = sorted_sections.pop() {
            list.push((section.address, section.size, false));
        }

        list
    }
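    // A worked example of the interleaving above, with hypothetical
    // addresses: one RAM region [0x0, 0x8000_0000) and one TempMem TDVF
    // section at [0x1000_0000, 0x1010_0000) would yield
    //   (0x0,         0x1000_0000, true)   // RAM below the section
    //   (0x1000_0000, 0x10_0000,   false)  // the TDVF section itself
    //   (0x1010_0000, 0x6ff0_0000, true)   // RAM above the section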
    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(
        &mut self,
        sections: &[TdvfSection],
        guid_found: bool,
    ) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of sections as well as code
        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // At this point the guest memory has all the required regions, so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut payload_info = None;
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    mem.read_volatile_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    if let Some(payload_file) = self.kernel.as_mut() {
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_file
                            .read_volatile(&mut payload_header.as_bytes())
                            .unwrap();

                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        payload_file.rewind().map_err(Error::LoadPayload)?;
                        mem.read_volatile_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    info!("Copying payload parameters to guest memory");
                    let cmdline = Self::generate_cmdline(
                        self.config.lock().unwrap().payload.as_ref().unwrap(),
                    )?;
                    mem.write_slice(
                        cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                        GuestAddress(section.address),
                    )
                    .unwrap();
                }
                _ => {}
            }
        }

        // Generate HOB
        let mut hob = TdHob::start(hob_offset.unwrap());

        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();

        for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
            hob.add_memory_resource(&mem, start, size, ram, guid_found)
                .map_err(Error::PopulateHob)?;
        }

        // MMIO regions
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.
        for acpi_table in crate::acpi::create_acpi_tables_tdx(
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        ) {
            hob.add_acpi_table(&mem, acpi_table.as_slice())
                .map_err(Error::PopulateHob)?;
        }

        // If a payload info has been created, let's insert it into the HOB.
        if let Some(payload_info) = payload_info {
            hob.add_payload(&mem, payload_info)
                .map_err(Error::PopulateHob)?;
        }

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    info!("Copying payload parameters to guest memory");
                    let cmdline = Self::generate_cmdline(
                        self.config.lock().unwrap().payload.as_ref().unwrap(),
                    )?;
                    mem.write_slice(
                        cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                        GuestAddress(section.address),
                    )
                    .unwrap();
                }
                _ => {}
            }
        }

        // Generate HOB
        let mut hob = TdHob::start(hob_offset.unwrap());

        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();

        for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
            hob.add_memory_resource(&mem, start, size, ram, guid_found)
                .map_err(Error::PopulateHob)?;
        }

        // MMIO regions
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.
        for acpi_table in crate::acpi::create_acpi_tables_tdx(
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        ) {
            hob.add_acpi_table(&mem, acpi_table.as_slice())
                .map_err(Error::PopulateHob)?;
        }

        // If a payload info has been created, let's insert it into the HOB.
        if let Some(payload_info) = payload_info {
            hob.add_payload(&mem, payload_info)
                .map_err(Error::PopulateHob)?;
        }

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }

    #[cfg(feature = "tdx")]
    fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();

        for section in sections {
            self.vm
                .tdx_init_memory_region(
                    mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
                    section.address,
                    section.size,
                    /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
                    section.attributes == 1,
                )
                .map_err(Error::InitializeTdxMemoryRegion)?;
        }

        Ok(())
    }

    // Creates ACPI tables.
    // When TDX is used, this is a no-op since the tables are created and
    // passed to the guest when populating the HOB.
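    // Returns the guest address of the RSDP on success; boot() later hands
    // this address to configure_system() so the guest can discover the tables.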
    fn create_acpi_tables(&self) -> Option<GuestAddress> {
        #[cfg(feature = "tdx")]
        if self.config.lock().unwrap().is_tdx_enabled() {
            return None;
        }
        let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
        let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
        let rsdp_addr = crate::acpi::create_acpi_tables(
            &mem,
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
            tpm_enabled,
        );
        info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);

        Some(rsdp_addr)
    }

    fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
        trace_scoped!("entry_point");

        self.load_payload_handle
            .take()
            .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
            .transpose()
    }

    pub fn boot(&mut self) -> Result<()> {
        trace_scoped!("Vm::boot");
        let current_state = self.get_state()?;
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        let new_state = if self.stop_on_boot {
            VmState::BreakPoint
        } else {
            VmState::Running
        };
        current_state.valid_transition(new_state)?;

        // Do this early so it can run in parallel with loading the kernel.
        #[cfg(target_arch = "x86_64")]
        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
                let rsdp_addr = if sev_snp_enabled {
                    // For a SEV-SNP guest the ACPI tables are provided via
                    // IGVM, so skip creating them and leave the RSDP address
                    // set to None.
                    None
                } else {
                    self.create_acpi_tables()
                };
            } else {
                let rsdp_addr = self.create_acpi_tables();
            }
        }

        // Load the kernel synchronously, or if it is being loaded
        // asynchronously, wait for the load to finish.
        let entry_point = self.entry_point()?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        // Configure the vcpus that have been created
        let vcpus = self.cpu_manager.lock().unwrap().vcpus();
        for vcpu in vcpus {
            let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
            let boot_setup = entry_point.map(|e| (e, guest_memory));
            self.cpu_manager
                .lock()
                .unwrap()
                .configure_vcpu(vcpu, boot_setup)
                .map_err(Error::CpuManager)?;
        }

        #[cfg(feature = "tdx")]
        let (sections, guid_found) = if tdx_enabled {
            self.extract_tdvf_sections()?
        } else {
            (Vec::new(), false)
        };

        // Configuring the TDX regions requires that the vCPUs are created.
        #[cfg(feature = "tdx")]
        let hob_address = if tdx_enabled {
            // TDX sections are written to memory.
            self.populate_tdx_sections(&sections, guid_found)?
        } else {
            None
        };

        // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
        // available after they are configured
        #[cfg(target_arch = "aarch64")]
        let rsdp_addr = self.create_acpi_tables();

        // Configure shared state based on loaded kernel
        entry_point
            .map(|entry_point| {
                // Safe to unwrap rsdp_addr as we know it can't be None when
                // the entry_point is Some.
                self.configure_system(rsdp_addr.unwrap(), entry_point)
            })
            .transpose()?;
        #[cfg(target_arch = "x86_64")]
        // Note: For x86, always call this function before invoking start_boot_vcpus().
        // Otherwise the guest would fail to boot because we haven't created the
        // userspace mappings needed to update the hypervisor about the memory
        // mappings. These mappings must be created before we start the vCPU
        // threads for the very first time.
        self.memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            // Let the hypervisor know which memory ranges are shared with the
            // guest. This prevents the guest from ignoring/discarding memory
            // regions provided by the host.
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured, TDX setup is complete.
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        // Resume the VM for MSHV
        if current_state == VmState::Created {
            self.vm.resume().map_err(Error::ResumeVm)?;
        }

        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus(new_state == VmState::BreakPoint)
            .map_err(Error::CpuManager)?;

        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        Ok(())
    }

    pub fn restore(&mut self) -> Result<()> {
        event!("vm", "restoring");

        #[cfg(target_arch = "x86_64")]
        // Note: For x86, always call this function before starting the restored
        // vCPUs. Otherwise the guest would fail to boot because we haven't
        // created the userspace mappings needed to update the hypervisor about
        // the memory mappings. These mappings must be created before we start
        // the vCPU threads for the very first time for the restored VM.
        self.memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        // Now we can start all vCPUs from here.
        self.cpu_manager
            .lock()
            .unwrap()
            .start_restored_vcpus()
            .map_err(Error::CpuManager)?;

        event!("vm", "restored");
        Ok(())
    }

    /// Gets a thread-safe reference-counted pointer to the VM configuration.
    pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
        Arc::clone(&self.config)
    }

    /// Gets the VM state. Returns an error if the state is poisoned.
    pub fn get_state(&self) -> Result<VmState> {
        self.state
            .try_read()
            .map_err(|_| Error::PoisonedState)
            .map(|state| *state)
    }

    /// Gets the actual size of the balloon.
    pub fn balloon_size(&self) -> u64 {
        self.device_manager.lock().unwrap().balloon_size()
    }

    pub fn send_memory_fds(
        &mut self,
        socket: &mut UnixStream,
    ) -> std::result::Result<(), MigratableError> {
        for (slot, fd) in self
            .memory_manager
            .lock()
            .unwrap()
            .memory_slot_fds()
            .drain()
        {
            Request::memory_fd(std::mem::size_of_val(&slot) as u64)
                .write_to(socket)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
                })?;
            socket
                .send_with_fd(&slot.to_le_bytes()[..], fd)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
                })?;

            Response::read_from(socket)?.ok_or_abandon(
                socket,
                MigratableError::MigrateSend(anyhow!("Error during memory fd migration")),
            )?;
        }

        Ok(())
    }

    pub fn send_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: WriteVolatile,
    {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't write
            // the whole region at once, because we can't use the
            // write_all_to() implementation from vm-memory::GuestMemory as it
            // does not follow the correct behavior. For more info about this
            // issue see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = mem
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        fd,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| {
                        MigratableError::MigrateSend(anyhow!(
                            "Error transferring memory to socket: {}",
                            e
                        ))
                    })?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }

    pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        self.memory_manager
            .lock()
            .unwrap()
            .memory_range_table(false)
    }

    pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
        self.device_manager.lock().unwrap().device_tree()
    }

    pub fn activate_virtio_devices(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .activate_virtio_devices()
            .map_err(Error::ActivateVirtioDevices)
    }

    #[cfg(target_arch = "x86_64")]
    pub fn power_button(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .notify_power_button()
            .map_err(Error::PowerButton)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn power_button(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .notify_power_button()
            .map_err(Error::PowerButton)
    }

    pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
        self.memory_manager.lock().unwrap().snapshot_data()
    }

    #[cfg(feature = "guest_debug")]
    pub fn debug_request(
        &mut self,
        gdb_request: &GdbRequestPayload,
        cpu_id: usize,
    ) -> Result<GdbResponsePayload> {
        use GdbRequestPayload::*;
        match gdb_request {
            SetSingleStep(single_step) => {
                self.set_guest_debug(cpu_id, &[], *single_step)
                    .map_err(Error::Debug)?;
            }
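            // Program the requested addresses as hardware breakpoints, with
            // single-stepping disabled.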
            SetHwBreakPoint(addrs) => {
                self.set_guest_debug(cpu_id, addrs, false)
                    .map_err(Error::Debug)?;
            }
            Pause => {
                self.debug_pause().map_err(Error::Debug)?;
            }
            Resume => {
                self.debug_resume().map_err(Error::Debug)?;
            }
            ReadRegs => {
                let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
            }
            WriteRegs(regs) => {
                self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
            }
            ReadMem(vaddr, len) => {
                let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
                let mem = self
                    .read_mem(&guest_memory, cpu_id, *vaddr, *len)
                    .map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::MemoryRegion(mem));
            }
            WriteMem(vaddr, data) => {
                let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
                self.write_mem(&guest_memory, cpu_id, vaddr, data)
                    .map_err(Error::Debug)?;
            }
            ActiveVcpus => {
                let active_vcpus = self.active_vcpus();
                return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
            }
        }
        Ok(GdbResponsePayload::CommandComplete)
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn get_dump_state(
        &mut self,
        destination_url: &str,
    ) -> std::result::Result<DumpState, GuestDebuggableError> {
        let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
        let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
        let mut elf_phdr_num = 1;
        let elf_sh_info = 0;
        let coredump_file_path = url_to_file(destination_url)?;
        let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();

        if mapping_num < UINT16_MAX - 2 {
            elf_phdr_num += mapping_num as u16;
        } else {
            panic!("mapping num beyond 65535 not supported");
        }
        let coredump_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(coredump_file_path)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
        let mem_data = self
            .memory_manager
            .lock()
            .unwrap()
            .coredump_memory_regions(mem_offset);

        Ok(DumpState {
            elf_note_size,
            elf_phdr_num,
            elf_sh_info,
            mem_offset,
            mem_info: Some(mem_data),
            file: Some(coredump_file),
        })
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
        // The memory content follows the ELF header, the notes and the
        // program headers in the coredump file.
        size_of::<elf::Elf64_Ehdr>() as u64
            + note_size as u64
            + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
    }

    pub fn nmi(&self) -> Result<()> {
        self.cpu_manager
            .lock()
            .unwrap()
            .nmi()
            .map_err(|_| Error::ErrorNmi)
    }
}

impl Pausable for Vm {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "pausing");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        #[cfg(target_arch = "x86_64")]
        {
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
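            // Save the guest clock (with its flags cleared) so resume() and
            // snapshot() can later restore the guest's view of time.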
            clock.reset_flags();
            self.saved_clock = Some(clock);
        }

        // Before pausing the vCPUs, activate any virtio devices that are still
        // pending activation, e.g. ones that became ready between the start of
        // the pause (or of a migration it is part of) and now.
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        self.vm
            .pause()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?;

        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        let current_state = self.get_state().unwrap();
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;
        #[cfg(target_arch = "x86_64")]
        {
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }

        if current_state == VmState::Paused {
            self.vm
                .resume()
                .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?;
        }

        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
        *state = new_state;
        event!("vm", "resumed");
        Ok(())
    }
}

#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    #[cfg(target_arch = "x86_64")]
    pub clock: Option<hypervisor::ClockData>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}

pub const VM_SNAPSHOT_ID: &str = "vm";

impl Snapshottable for Vm {
    fn id(&self) -> String {
        VM_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        event!("vm", "snapshotting");

        #[cfg(feature = "tdx")]
        {
            if self.config.lock().unwrap().is_tdx_enabled() {
                return Err(MigratableError::Snapshot(anyhow!(
                    "Snapshot not possible with TDX VM"
                )));
            }
        }

        let current_state = self.get_state().unwrap();
        if current_state != VmState::Paused {
            return Err(MigratableError::Snapshot(anyhow!(
                "Trying to snapshot while VM is running"
            )));
        }

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let common_cpuid = {
            let amx = self.config.lock().unwrap().cpus.features.amx;
            let phys_bits = physical_bits(
                &self.hypervisor,
                self.config.lock().unwrap().cpus.max_phys_bits,
            );
            arch::generate_common_cpuid(
                &self.hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections: None,
                    phys_bits,
                    kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx: false,
                    amx,
                },
            )
            .map_err(|e| {
                MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
            })?
        };

        let vm_snapshot_state = VmSnapshot {
            #[cfg(target_arch = "x86_64")]
            clock: self.saved_clock,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            common_cpuid,
        };

        let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?;

        let (id, snapshot) = {
            let mut cpu_manager = self.cpu_manager.lock().unwrap();
            (cpu_manager.id(), cpu_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);
        let (id, snapshot) = {
            let mut memory_manager = self.memory_manager.lock().unwrap();
            (memory_manager.id(), memory_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);
        let (id, snapshot) = {
            let mut device_manager = self.device_manager.lock().unwrap();
            (device_manager.id(), device_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);

        event!("vm", "snapshotted");
        Ok(vm_snapshot)
    }
}

impl Transportable for Vm {
    fn send(
        &self,
        snapshot: &Snapshot,
        destination_url: &str,
    ) -> std::result::Result<(), MigratableError> {
        let mut snapshot_config_path = url_to_path(destination_url)?;
        snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);

        // Create the snapshot config file
        let mut snapshot_config_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(snapshot_config_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Serialize and write the snapshot config
        let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        snapshot_config_file
            .write_all(vm_config.as_bytes())
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let mut snapshot_state_path = url_to_path(destination_url)?;
        snapshot_state_path.push(SNAPSHOT_STATE_FILE);

        // Create the snapshot state file
        let mut snapshot_state_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(snapshot_state_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Serialize and write the snapshot state
        let vm_state =
            serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;

        snapshot_state_file
            .write_all(&vm_state)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Tell the memory manager to also send/write its own snapshot.
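        // The guest RAM content itself is transported by the memory manager
        // rather than being embedded in the JSON state written above.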
        if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
            self.memory_manager
                .lock()
                .unwrap()
                .send(&memory_manager_snapshot.clone(), destination_url)?;
        } else {
            return Err(MigratableError::MigrateSend(anyhow!(
                "Missing memory manager snapshot"
            )));
        }

        Ok(())
    }
}

impl Migratable for Vm {
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_dirty_log()?;
        self.device_manager.lock().unwrap().start_dirty_log()
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().stop_dirty_log()?;
        self.device_manager.lock().unwrap().stop_dirty_log()
    }

    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        Ok(MemoryRangeTable::new_from_tables(vec![
            self.memory_manager.lock().unwrap().dirty_log()?,
            self.device_manager.lock().unwrap().dirty_log()?,
        ]))
    }

    fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_migration()?;
        self.device_manager.lock().unwrap().start_migration()
    }

    fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().complete_migration()?;
        self.device_manager.lock().unwrap().complete_migration()
    }
}

#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    fn active_vcpus(&self) -> usize {
        let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            self.cpu_manager.lock().unwrap().boot_vcpus() as usize
        }
    }
}

#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = 65535;

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        let mut resume = false;

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        match self.get_state().unwrap() {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                resume = true;
            }
            VmState::Paused => {}
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        }

        let coredump_state = self.get_dump_state(destination_url)?;

        self.write_header(&coredump_state)?;
        self.write_note(&coredump_state)?;
        self.write_loads(&coredump_state)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_elf64_note(&coredump_state)?;
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_vmm_note(&coredump_state)?;

        self.memory_manager
            .lock()
            .unwrap()
            .coredump_iterate_save_mem(&coredump_state)?;

        if resume {
            self.resume().map_err(GuestDebuggableError::Resume)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::BreakPoint => {
                // Check the transitions from BreakPoint
                assert!(state.valid_transition(VmState::Created).is_ok());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section covering the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    use super::*;

    const LEN: u64 = 4096;

    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");
        assert!(create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .is_ok())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example is based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    // Point the instruction pointer at the guest address where the code was loaded.
    vcpu_regs.set_rip(0x1000);
    vcpu_regs.set_rax(2);
    vcpu_regs.set_rbx(3);
    vcpu_regs.set_rflags(2);
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    loop {
        match vcpu.run().expect("run failed") {
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            VmExit::Ignore => {}
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}