1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use std::collections::{BTreeMap, HashMap}; 15 use std::fs::{File, OpenOptions}; 16 use std::io::{self, Seek, SeekFrom, Write}; 17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 18 use std::mem::size_of; 19 use std::num::Wrapping; 20 use std::ops::Deref; 21 use std::os::unix::net::UnixStream; 22 use std::sync::{Arc, Mutex, RwLock}; 23 use std::time::Instant; 24 use std::{cmp, result, str, thread}; 25 26 use anyhow::anyhow; 27 #[cfg(target_arch = "x86_64")] 28 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 29 #[cfg(feature = "tdx")] 30 use arch::x86_64::tdx::TdvfSection; 31 #[cfg(target_arch = "aarch64")] 32 use arch::PciSpaceInfo; 33 use arch::{get_host_cpu_phys_bits, EntryPoint, NumaNode, NumaNodes}; 34 #[cfg(target_arch = "aarch64")] 35 use devices::interrupt_controller; 36 use devices::AcpiNotificationFlags; 37 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 38 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 39 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 40 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 41 use hypervisor::{HypervisorVmError, VmOps}; 42 use libc::{termios, SIGWINCH}; 43 use linux_loader::cmdline::Cmdline; 44 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 45 use linux_loader::elf; 46 #[cfg(target_arch = "x86_64")] 47 use linux_loader::loader::bzimage::BzImage; 48 #[cfg(target_arch = "x86_64")] 49 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 50 #[cfg(target_arch = "aarch64")] 51 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 52 use linux_loader::loader::KernelLoader; 53 use seccompiler::SeccompAction; 54 use serde::{Deserialize, Serialize}; 55 use thiserror::Error; 56 use tracer::trace_scoped; 57 use vm_device::Bus; 58 #[cfg(feature = "tdx")] 59 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile}; 60 use vm_memory::{ 61 Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile, 62 }; 63 use vm_migration::protocol::{MemoryRangeTable, Request, Response}; 64 use vm_migration::{ 65 snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, 66 }; 67 use vmm_sys_util::eventfd::EventFd; 68 use vmm_sys_util::sock_ctrl_msg::ScmSocket; 69 70 use crate::config::{add_to_config, ValidationError}; 71 use crate::console_devices::{ConsoleDeviceError, ConsoleInfo}; 72 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 73 use crate::coredump::{ 74 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 75 }; 76 use crate::device_manager::{DeviceManager, DeviceManagerError}; 77 use crate::device_tree::DeviceTree; 78 #[cfg(feature = "guest_debug")] 79 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 80 #[cfg(feature = "igvm")] 81 use crate::igvm::igvm_loader; 82 use crate::landlock::LandlockError; 83 use crate::memory_manager::{ 84 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 85 }; 86 #[cfg(target_arch = "x86_64")] 87 use crate::migration::get_vm_snapshot; 88 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 89 use crate::migration::url_to_file; 90 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 91 use crate::vm_config::{ 92 DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, NumaConfig, PayloadConfig, 93 PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, 94 }; 95 use crate::{ 96 cpu, GuestMemoryMmap, PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, 97 MEMORY_MANAGER_SNAPSHOT_ID, 98 }; 99 100 /// Errors associated with VM management 101 #[derive(Debug, Error)] 102 pub enum Error { 103 #[error("Cannot open kernel file: {0}")] 104 KernelFile(#[source] io::Error), 105 106 #[error("Cannot open initramfs file: {0}")] 107 InitramfsFile(#[source] io::Error), 108 109 #[error("Cannot load the kernel into memory: {0}")] 110 KernelLoad(#[source] linux_loader::loader::Error), 111 112 #[cfg(target_arch = "aarch64")] 113 #[error("Cannot load the UEFI binary in memory: {0:?}")] 114 UefiLoad(arch::aarch64::uefi::Error), 115 116 #[error("Cannot load the initramfs into memory")] 117 InitramfsLoad, 118 119 #[error("Cannot load the kernel command line in memory: {0}")] 120 LoadCmdLine(#[source] linux_loader::loader::Error), 121 122 #[error("Failed to apply landlock config during vm_create: {0}")] 123 ApplyLandlock(#[source] LandlockError), 124 125 #[error("Cannot modify the kernel command line: {0}")] 126 CmdLineInsertStr(#[source] linux_loader::cmdline::Error), 127 128 #[error("Cannot create the kernel command line: {0}")] 129 CmdLineCreate(#[source] linux_loader::cmdline::Error), 130 131 #[error("Cannot configure system: {0}")] 132 ConfigureSystem(#[source] arch::Error), 133 134 #[cfg(target_arch = "aarch64")] 135 #[error("Cannot enable interrupt controller: {0:?}")] 136 EnableInterruptController(interrupt_controller::Error), 137 138 #[error("VM state is poisoned")] 139 PoisonedState, 140 141 #[error("Error from device manager: {0:?}")] 142 DeviceManager(DeviceManagerError), 143 144 #[error("No device with id {0:?} to remove")] 145 NoDeviceToRemove(String), 146 147 #[error("Cannot spawn a signal handler thread: {0}")] 148 SignalHandlerSpawn(#[source] io::Error), 149 150 #[error("Failed to join on threads: {0:?}")] 151 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 152 153 #[error("VM config is missing")] 154 VmMissingConfig, 155 156 #[error("VM is not created")] 157 VmNotCreated, 158 159 #[error("VM is already created")] 160 VmAlreadyCreated, 161 162 #[error("VM is not running")] 163 VmNotRunning, 164 165 #[error("Cannot clone EventFd: {0}")] 166 EventFdClone(#[source] io::Error), 167 168 #[error("invalid VM state transition: {0:?} to {1:?}")] 169 InvalidStateTransition(VmState, VmState), 170 171 #[error("Error from CPU manager: {0}")] 172 CpuManager(#[source] cpu::Error), 173 174 #[error("Cannot pause devices: {0}")] 175 PauseDevices(#[source] MigratableError), 176 177 #[error("Cannot resume devices: {0}")] 178 ResumeDevices(#[source] MigratableError), 179 180 #[error("Cannot pause CPUs: {0}")] 181 PauseCpus(#[source] MigratableError), 182 183 #[error("Cannot resume cpus: {0}")] 184 ResumeCpus(#[source] MigratableError), 185 186 #[error("Cannot pause VM: {0}")] 187 Pause(#[source] MigratableError), 188 189 #[error("Cannot resume VM: {0}")] 190 Resume(#[source] MigratableError), 191 192 #[error("Memory manager error: {0:?}")] 193 MemoryManager(MemoryManagerError), 194 195 #[error("Eventfd write error: {0}")] 196 EventfdError(#[source] std::io::Error), 197 198 #[error("Cannot snapshot VM: {0}")] 199 Snapshot(#[source] MigratableError), 200 201 #[error("Cannot restore VM: {0}")] 202 Restore(#[source] MigratableError), 203 204 #[error("Cannot send VM snapshot: {0}")] 205 SnapshotSend(#[source] MigratableError), 206 207 #[error("Invalid restore source URL")] 208 InvalidRestoreSourceUrl, 209 210 #[error("Failed to validate config: {0}")] 211 ConfigValidation(#[source] ValidationError), 212 213 #[error("Too many virtio-vsock devices")] 214 TooManyVsockDevices, 215 216 #[error("Failed serializing into JSON: {0}")] 217 SerializeJson(#[source] serde_json::Error), 218 219 #[error("Invalid NUMA configuration")] 220 InvalidNumaConfig, 221 222 #[error("Cannot create seccomp filter: {0}")] 223 CreateSeccompFilter(#[source] seccompiler::Error), 224 225 #[error("Cannot apply seccomp filter: {0}")] 226 ApplySeccompFilter(#[source] seccompiler::Error), 227 228 #[error("Failed resizing a memory zone")] 229 ResizeZone, 230 231 #[error("Cannot activate virtio devices: {0:?}")] 232 ActivateVirtioDevices(DeviceManagerError), 233 234 #[error("Error triggering power button: {0:?}")] 235 PowerButton(DeviceManagerError), 236 237 #[error("Kernel lacks PVH header")] 238 KernelMissingPvhHeader, 239 240 #[error("Failed to allocate firmware RAM: {0:?}")] 241 AllocateFirmwareMemory(MemoryManagerError), 242 243 #[error("Error manipulating firmware file: {0}")] 244 FirmwareFile(#[source] std::io::Error), 245 246 #[error("Firmware too big")] 247 FirmwareTooLarge, 248 249 #[error("Failed to copy firmware to memory: {0}")] 250 FirmwareLoad(#[source] vm_memory::GuestMemoryError), 251 252 #[cfg(feature = "sev_snp")] 253 #[error("Error enabling SEV-SNP VM: {0}")] 254 InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError), 255 256 #[cfg(feature = "tdx")] 257 #[error("Error performing I/O on TDX firmware file: {0}")] 258 LoadTdvf(#[source] std::io::Error), 259 260 #[cfg(feature = "tdx")] 261 #[error("Error performing I/O on the TDX payload file: {0}")] 262 LoadPayload(#[source] std::io::Error), 263 264 #[cfg(feature = "tdx")] 265 #[error("Error parsing TDVF: {0}")] 266 ParseTdvf(#[source] arch::x86_64::tdx::TdvfError), 267 268 #[cfg(feature = "tdx")] 269 #[error("Error populating TDX HOB: {0}")] 270 PopulateHob(#[source] arch::x86_64::tdx::TdvfError), 271 272 #[cfg(feature = "tdx")] 273 #[error("Error allocating TDVF memory: {0:?}")] 274 AllocatingTdvfMemory(crate::memory_manager::Error), 275 276 #[cfg(feature = "tdx")] 277 #[error("Error enabling TDX VM: {0}")] 278 InitializeTdxVm(#[source] hypervisor::HypervisorVmError), 279 280 #[cfg(feature = "tdx")] 281 #[error("Error enabling TDX memory region: {0}")] 282 InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError), 283 284 #[cfg(feature = "tdx")] 285 #[error("Error finalizing TDX VM: {0}")] 286 FinalizeTdx(#[source] hypervisor::HypervisorVmError), 287 288 #[cfg(feature = "tdx")] 289 #[error("TDX firmware missing")] 290 TdxFirmwareMissing, 291 292 #[cfg(feature = "tdx")] 293 #[error("Invalid TDX payload type")] 294 InvalidPayloadType, 295 296 #[cfg(feature = "guest_debug")] 297 #[error("Error debugging VM: {0:?}")] 298 Debug(DebuggableError), 299 300 #[error("Error spawning kernel loading thread")] 301 KernelLoadThreadSpawn(std::io::Error), 302 303 #[error("Error joining kernel loading thread")] 304 KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 305 306 #[error("Payload configuration is not bootable")] 307 InvalidPayload, 308 309 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 310 #[error("Error coredumping VM: {0:?}")] 311 Coredump(GuestDebuggableError), 312 313 #[cfg(feature = "igvm")] 314 #[error("Cannot open igvm file: {0}")] 315 IgvmFile(#[source] io::Error), 316 317 #[cfg(feature = "igvm")] 318 #[error("Cannot load the igvm into memory: {0}")] 319 IgvmLoad(#[source] igvm_loader::Error), 320 321 #[error("Error injecting NMI")] 322 ErrorNmi, 323 324 #[error("Error resuming the VM: {0}")] 325 ResumeVm(#[source] hypervisor::HypervisorVmError), 326 327 #[error("Error creating console devices")] 328 CreateConsoleDevices(ConsoleDeviceError), 329 } 330 pub type Result<T> = result::Result<T, Error>; 331 332 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] 333 pub enum VmState { 334 Created, 335 Running, 336 Shutdown, 337 Paused, 338 BreakPoint, 339 } 340 341 impl VmState { 342 fn valid_transition(self, new_state: VmState) -> Result<()> { 343 match self { 344 VmState::Created => match new_state { 345 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)), 346 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => { 347 Ok(()) 348 } 349 }, 350 351 VmState::Running => match new_state { 352 VmState::Created | VmState::Running => { 353 Err(Error::InvalidStateTransition(self, new_state)) 354 } 355 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()), 356 }, 357 358 VmState::Shutdown => match new_state { 359 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => { 360 Err(Error::InvalidStateTransition(self, new_state)) 361 } 362 VmState::Running => Ok(()), 363 }, 364 365 VmState::Paused => match new_state { 366 VmState::Created | VmState::Paused | VmState::BreakPoint => { 367 Err(Error::InvalidStateTransition(self, new_state)) 368 } 369 VmState::Running | VmState::Shutdown => Ok(()), 370 }, 371 VmState::BreakPoint => match new_state { 372 VmState::Created | VmState::Running => Ok(()), 373 _ => Err(Error::InvalidStateTransition(self, new_state)), 374 }, 375 } 376 } 377 } 378 379 struct VmOpsHandler { 380 memory: GuestMemoryAtomic<GuestMemoryMmap>, 381 #[cfg(target_arch = "x86_64")] 382 io_bus: Arc<Bus>, 383 mmio_bus: Arc<Bus>, 384 } 385 386 impl VmOps for VmOpsHandler { 387 fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> { 388 self.memory 389 .memory() 390 .write(buf, GuestAddress(gpa)) 391 .map_err(|e| HypervisorVmError::GuestMemWrite(e.into())) 392 } 393 394 fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> { 395 self.memory 396 .memory() 397 .read(buf, GuestAddress(gpa)) 398 .map_err(|e| HypervisorVmError::GuestMemRead(e.into())) 399 } 400 401 fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 402 if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) { 403 info!("Guest MMIO read to unregistered address 0x{:x}", gpa); 404 } 405 Ok(()) 406 } 407 408 fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 409 match self.mmio_bus.write(gpa, data) { 410 Err(vm_device::BusError::MissingAddressRange) => { 411 info!("Guest MMIO write to unregistered address 0x{:x}", gpa); 412 } 413 Ok(Some(barrier)) => { 414 info!("Waiting for barrier"); 415 barrier.wait(); 416 info!("Barrier released"); 417 } 418 _ => {} 419 }; 420 Ok(()) 421 } 422 423 #[cfg(target_arch = "x86_64")] 424 fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 425 if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) { 426 info!("Guest PIO read to unregistered address 0x{:x}", port); 427 } 428 Ok(()) 429 } 430 431 #[cfg(target_arch = "x86_64")] 432 fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 433 match self.io_bus.write(port, data) { 434 Err(vm_device::BusError::MissingAddressRange) => { 435 info!("Guest PIO write to unregistered address 0x{:x}", port); 436 } 437 Ok(Some(barrier)) => { 438 info!("Waiting for barrier"); 439 barrier.wait(); 440 info!("Barrier released"); 441 } 442 _ => {} 443 }; 444 Ok(()) 445 } 446 } 447 448 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 { 449 let host_phys_bits = get_host_cpu_phys_bits(hypervisor); 450 451 cmp::min(host_phys_bits, max_phys_bits) 452 } 453 454 pub struct Vm { 455 #[cfg(feature = "tdx")] 456 kernel: Option<File>, 457 initramfs: Option<File>, 458 threads: Vec<thread::JoinHandle<()>>, 459 device_manager: Arc<Mutex<DeviceManager>>, 460 config: Arc<Mutex<VmConfig>>, 461 state: RwLock<VmState>, 462 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 463 memory_manager: Arc<Mutex<MemoryManager>>, 464 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 465 // The hypervisor abstracted virtual machine. 466 vm: Arc<dyn hypervisor::Vm>, 467 #[cfg(target_arch = "x86_64")] 468 saved_clock: Option<hypervisor::ClockData>, 469 numa_nodes: NumaNodes, 470 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 471 hypervisor: Arc<dyn hypervisor::Hypervisor>, 472 stop_on_boot: bool, 473 load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>, 474 } 475 476 impl Vm { 477 pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH]; 478 479 #[allow(clippy::too_many_arguments)] 480 pub fn new_from_memory_manager( 481 config: Arc<Mutex<VmConfig>>, 482 memory_manager: Arc<Mutex<MemoryManager>>, 483 vm: Arc<dyn hypervisor::Vm>, 484 exit_evt: EventFd, 485 reset_evt: EventFd, 486 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 487 seccomp_action: &SeccompAction, 488 hypervisor: Arc<dyn hypervisor::Hypervisor>, 489 activate_evt: EventFd, 490 timestamp: Instant, 491 console_info: Option<ConsoleInfo>, 492 console_resize_pipe: Option<Arc<File>>, 493 original_termios: Arc<Mutex<Option<termios>>>, 494 snapshot: Option<Snapshot>, 495 ) -> Result<Self> { 496 trace_scoped!("Vm::new_from_memory_manager"); 497 498 let boot_id_list = config 499 .lock() 500 .unwrap() 501 .validate() 502 .map_err(Error::ConfigValidation)?; 503 504 #[cfg(not(feature = "igvm"))] 505 let load_payload_handle = if snapshot.is_none() { 506 Self::load_payload_async(&memory_manager, &config)? 507 } else { 508 None 509 }; 510 511 info!("Booting VM from config: {:?}", &config); 512 513 // Create NUMA nodes based on NumaConfig. 514 let numa_nodes = 515 Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?; 516 517 #[cfg(feature = "tdx")] 518 let tdx_enabled = config.lock().unwrap().is_tdx_enabled(); 519 #[cfg(feature = "sev_snp")] 520 let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled(); 521 #[cfg(feature = "tdx")] 522 let force_iommu = tdx_enabled; 523 #[cfg(feature = "sev_snp")] 524 let force_iommu = sev_snp_enabled; 525 #[cfg(not(any(feature = "tdx", feature = "sev_snp")))] 526 let force_iommu = false; 527 528 #[cfg(feature = "guest_debug")] 529 let stop_on_boot = config.lock().unwrap().gdb; 530 #[cfg(not(feature = "guest_debug"))] 531 let stop_on_boot = false; 532 533 let memory = memory_manager.lock().unwrap().guest_memory(); 534 #[cfg(target_arch = "x86_64")] 535 let io_bus = Arc::new(Bus::new()); 536 let mmio_bus = Arc::new(Bus::new()); 537 538 let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler { 539 memory, 540 #[cfg(target_arch = "x86_64")] 541 io_bus: io_bus.clone(), 542 mmio_bus: mmio_bus.clone(), 543 }); 544 545 let cpus_config = { &config.lock().unwrap().cpus.clone() }; 546 let cpu_manager = cpu::CpuManager::new( 547 cpus_config, 548 vm.clone(), 549 exit_evt.try_clone().map_err(Error::EventFdClone)?, 550 reset_evt.try_clone().map_err(Error::EventFdClone)?, 551 #[cfg(feature = "guest_debug")] 552 vm_debug_evt, 553 &hypervisor, 554 seccomp_action.clone(), 555 vm_ops, 556 #[cfg(feature = "tdx")] 557 tdx_enabled, 558 &numa_nodes, 559 #[cfg(feature = "sev_snp")] 560 sev_snp_enabled, 561 ) 562 .map_err(Error::CpuManager)?; 563 564 #[cfg(target_arch = "x86_64")] 565 cpu_manager 566 .lock() 567 .unwrap() 568 .populate_cpuid( 569 &memory_manager, 570 &hypervisor, 571 #[cfg(feature = "tdx")] 572 tdx_enabled, 573 ) 574 .map_err(Error::CpuManager)?; 575 576 // Loading the igvm file is pushed down here because 577 // igvm parser needs cpu_manager to retrieve cpuid leaf. 578 // For the regular case, we can start loading early, but for 579 // igvm case we have to wait until cpu_manager is created. 580 // Currently, Microsoft Hypervisor does not provide any 581 // Hypervisor specific common cpuid, we need to call get_cpuid_values 582 // per cpuid through cpu_manager. 583 #[cfg(feature = "igvm")] 584 let load_payload_handle = if snapshot.is_none() { 585 Self::load_payload_async( 586 &memory_manager, 587 &config, 588 &cpu_manager, 589 #[cfg(feature = "sev_snp")] 590 sev_snp_enabled, 591 )? 592 } else { 593 None 594 }; 595 // The initial TDX configuration must be done before the vCPUs are 596 // created 597 #[cfg(feature = "tdx")] 598 if tdx_enabled { 599 let cpuid = cpu_manager.lock().unwrap().common_cpuid(); 600 let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32; 601 vm.tdx_init(&cpuid, max_vcpus) 602 .map_err(Error::InitializeTdxVm)?; 603 } 604 605 cpu_manager 606 .lock() 607 .unwrap() 608 .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID)) 609 .map_err(Error::CpuManager)?; 610 611 // This initial SEV-SNP configuration must be done immediately after 612 // vCPUs are created. As part of this initialization we are 613 // transitioning the guest into secure state. 614 #[cfg(feature = "sev_snp")] 615 if sev_snp_enabled { 616 vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?; 617 } 618 619 #[cfg(feature = "tdx")] 620 let dynamic = !tdx_enabled; 621 #[cfg(not(feature = "tdx"))] 622 let dynamic = true; 623 624 let device_manager = DeviceManager::new( 625 #[cfg(target_arch = "x86_64")] 626 io_bus, 627 mmio_bus, 628 vm.clone(), 629 config.clone(), 630 memory_manager.clone(), 631 cpu_manager.clone(), 632 exit_evt.try_clone().map_err(Error::EventFdClone)?, 633 reset_evt, 634 seccomp_action.clone(), 635 numa_nodes.clone(), 636 &activate_evt, 637 force_iommu, 638 boot_id_list, 639 timestamp, 640 snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID), 641 dynamic, 642 ) 643 .map_err(Error::DeviceManager)?; 644 645 device_manager 646 .lock() 647 .unwrap() 648 .create_devices(console_info, console_resize_pipe, original_termios) 649 .map_err(Error::DeviceManager)?; 650 651 #[cfg(feature = "tdx")] 652 let kernel = config 653 .lock() 654 .unwrap() 655 .payload 656 .as_ref() 657 .map(|p| p.kernel.as_ref().map(File::open)) 658 .unwrap_or_default() 659 .transpose() 660 .map_err(Error::KernelFile)?; 661 662 let initramfs = config 663 .lock() 664 .unwrap() 665 .payload 666 .as_ref() 667 .map(|p| p.initramfs.as_ref().map(File::open)) 668 .unwrap_or_default() 669 .transpose() 670 .map_err(Error::InitramfsFile)?; 671 672 #[cfg(target_arch = "x86_64")] 673 let saved_clock = if let Some(snapshot) = snapshot.as_ref() { 674 let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?; 675 vm_snapshot.clock 676 } else { 677 None 678 }; 679 680 let vm_state = if snapshot.is_some() { 681 VmState::Paused 682 } else { 683 VmState::Created 684 }; 685 686 Ok(Vm { 687 #[cfg(feature = "tdx")] 688 kernel, 689 initramfs, 690 device_manager, 691 config, 692 threads: Vec::with_capacity(1), 693 state: RwLock::new(vm_state), 694 cpu_manager, 695 memory_manager, 696 vm, 697 #[cfg(target_arch = "x86_64")] 698 saved_clock, 699 numa_nodes, 700 hypervisor, 701 stop_on_boot, 702 load_payload_handle, 703 }) 704 } 705 706 fn create_numa_nodes( 707 configs: Option<Vec<NumaConfig>>, 708 memory_manager: &Arc<Mutex<MemoryManager>>, 709 ) -> Result<NumaNodes> { 710 let mm = memory_manager.lock().unwrap(); 711 let mm_zones = mm.memory_zones(); 712 let mut numa_nodes = BTreeMap::new(); 713 714 if let Some(configs) = &configs { 715 for config in configs.iter() { 716 if numa_nodes.contains_key(&config.guest_numa_id) { 717 error!("Can't define twice the same NUMA node"); 718 return Err(Error::InvalidNumaConfig); 719 } 720 721 let mut node = NumaNode::default(); 722 723 if let Some(memory_zones) = &config.memory_zones { 724 for memory_zone in memory_zones.iter() { 725 if let Some(mm_zone) = mm_zones.get(memory_zone) { 726 node.memory_regions.extend(mm_zone.regions().clone()); 727 if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() { 728 node.hotplug_regions.push(virtiomem_zone.region().clone()); 729 } 730 node.memory_zones.push(memory_zone.clone()); 731 } else { 732 error!("Unknown memory zone '{}'", memory_zone); 733 return Err(Error::InvalidNumaConfig); 734 } 735 } 736 } 737 738 if let Some(cpus) = &config.cpus { 739 node.cpus.extend(cpus); 740 } 741 742 if let Some(pci_segments) = &config.pci_segments { 743 node.pci_segments.extend(pci_segments); 744 } 745 746 if let Some(distances) = &config.distances { 747 for distance in distances.iter() { 748 let dest = distance.destination; 749 let dist = distance.distance; 750 751 if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) { 752 error!("Unknown destination NUMA node {}", dest); 753 return Err(Error::InvalidNumaConfig); 754 } 755 756 if node.distances.contains_key(&dest) { 757 error!("Destination NUMA node {} has been already set", dest); 758 return Err(Error::InvalidNumaConfig); 759 } 760 761 node.distances.insert(dest, dist); 762 } 763 } 764 765 #[cfg(target_arch = "x86_64")] 766 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 767 if let Some(sgx_epc_region) = mm.sgx_epc_region() { 768 let mm_sections = sgx_epc_region.epc_sections(); 769 for sgx_epc_section in sgx_epc_sections.iter() { 770 if let Some(mm_section) = mm_sections.get(sgx_epc_section) { 771 node.sgx_epc_sections.push(mm_section.clone()); 772 } else { 773 error!("Unknown SGX EPC section '{}'", sgx_epc_section); 774 return Err(Error::InvalidNumaConfig); 775 } 776 } 777 } else { 778 error!("Missing SGX EPC region"); 779 return Err(Error::InvalidNumaConfig); 780 } 781 } 782 783 numa_nodes.insert(config.guest_numa_id, node); 784 } 785 } 786 787 Ok(numa_nodes) 788 } 789 790 #[allow(clippy::too_many_arguments)] 791 pub fn new( 792 vm_config: Arc<Mutex<VmConfig>>, 793 exit_evt: EventFd, 794 reset_evt: EventFd, 795 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 796 seccomp_action: &SeccompAction, 797 hypervisor: Arc<dyn hypervisor::Hypervisor>, 798 activate_evt: EventFd, 799 console_info: Option<ConsoleInfo>, 800 console_resize_pipe: Option<Arc<File>>, 801 original_termios: Arc<Mutex<Option<termios>>>, 802 snapshot: Option<Snapshot>, 803 source_url: Option<&str>, 804 prefault: Option<bool>, 805 ) -> Result<Self> { 806 trace_scoped!("Vm::new"); 807 808 let timestamp = Instant::now(); 809 810 #[cfg(feature = "tdx")] 811 let tdx_enabled = if snapshot.is_some() { 812 false 813 } else { 814 vm_config.lock().unwrap().is_tdx_enabled() 815 }; 816 817 #[cfg(feature = "sev_snp")] 818 let sev_snp_enabled = if snapshot.is_some() { 819 false 820 } else { 821 vm_config.lock().unwrap().is_sev_snp_enabled() 822 }; 823 824 let vm = Self::create_hypervisor_vm( 825 &hypervisor, 826 #[cfg(feature = "tdx")] 827 tdx_enabled, 828 #[cfg(feature = "sev_snp")] 829 sev_snp_enabled, 830 )?; 831 832 let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); 833 834 let memory_manager = if let Some(snapshot) = 835 snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID) 836 { 837 MemoryManager::new_from_snapshot( 838 &snapshot, 839 vm.clone(), 840 &vm_config.lock().unwrap().memory.clone(), 841 source_url, 842 prefault.unwrap(), 843 phys_bits, 844 ) 845 .map_err(Error::MemoryManager)? 846 } else { 847 #[cfg(target_arch = "x86_64")] 848 let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone(); 849 850 MemoryManager::new( 851 vm.clone(), 852 &vm_config.lock().unwrap().memory.clone(), 853 None, 854 phys_bits, 855 #[cfg(feature = "tdx")] 856 tdx_enabled, 857 None, 858 None, 859 #[cfg(target_arch = "x86_64")] 860 sgx_epc_config, 861 ) 862 .map_err(Error::MemoryManager)? 863 }; 864 865 Vm::new_from_memory_manager( 866 vm_config, 867 memory_manager, 868 vm, 869 exit_evt, 870 reset_evt, 871 #[cfg(feature = "guest_debug")] 872 vm_debug_evt, 873 seccomp_action, 874 hypervisor, 875 activate_evt, 876 timestamp, 877 console_info, 878 console_resize_pipe, 879 original_termios, 880 snapshot, 881 ) 882 } 883 884 pub fn create_hypervisor_vm( 885 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 886 #[cfg(feature = "tdx")] tdx_enabled: bool, 887 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 888 ) -> Result<Arc<dyn hypervisor::Vm>> { 889 hypervisor.check_required_extensions().unwrap(); 890 891 cfg_if::cfg_if! { 892 if #[cfg(feature = "tdx")] { 893 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true 894 // Otherwise KVM_X86_LEGACY_VM: 0 895 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM 896 let vm = hypervisor 897 .create_vm_with_type(u64::from(tdx_enabled)) 898 .unwrap(); 899 } else if #[cfg(feature = "sev_snp")] { 900 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true 901 // Otherwise SEV_SNP_DISABLED: 0 902 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false 903 let vm = hypervisor 904 .create_vm_with_type(u64::from(sev_snp_enabled)) 905 .unwrap(); 906 } else { 907 let vm = hypervisor.create_vm().unwrap(); 908 } 909 } 910 911 #[cfg(target_arch = "x86_64")] 912 { 913 vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0) 914 .unwrap(); 915 vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap(); 916 vm.enable_split_irq().unwrap(); 917 } 918 919 Ok(vm) 920 } 921 922 fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> { 923 let initramfs = self.initramfs.as_mut().unwrap(); 924 let size: usize = initramfs 925 .seek(SeekFrom::End(0)) 926 .map_err(|_| Error::InitramfsLoad)? 927 .try_into() 928 .unwrap(); 929 initramfs.rewind().map_err(|_| Error::InitramfsLoad)?; 930 931 let address = 932 arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?; 933 let address = GuestAddress(address); 934 935 guest_mem 936 .read_volatile_from(address, initramfs, size) 937 .map_err(|_| Error::InitramfsLoad)?; 938 939 info!("Initramfs loaded: address = 0x{:x}", address.0); 940 Ok(arch::InitramfsConfig { address, size }) 941 } 942 943 pub fn generate_cmdline( 944 payload: &PayloadConfig, 945 #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>, 946 ) -> Result<Cmdline> { 947 let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?; 948 if let Some(s) = payload.cmdline.as_ref() { 949 cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?; 950 } 951 952 #[cfg(target_arch = "aarch64")] 953 for entry in device_manager.lock().unwrap().cmdline_additions() { 954 cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?; 955 } 956 Ok(cmdline) 957 } 958 959 #[cfg(target_arch = "aarch64")] 960 fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> { 961 let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash(); 962 let mem = uefi_flash.memory(); 963 arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware) 964 .map_err(Error::UefiLoad)?; 965 Ok(()) 966 } 967 968 #[cfg(target_arch = "aarch64")] 969 fn load_kernel( 970 firmware: Option<File>, 971 kernel: Option<File>, 972 memory_manager: Arc<Mutex<MemoryManager>>, 973 ) -> Result<EntryPoint> { 974 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 975 let mem = guest_memory.memory(); 976 let entry_addr = match (firmware, kernel) { 977 (None, Some(mut kernel)) => { 978 match linux_loader::loader::pe::PE::load( 979 mem.deref(), 980 Some(arch::layout::KERNEL_START), 981 &mut kernel, 982 None, 983 ) { 984 Ok(entry_addr) => entry_addr.kernel_load, 985 // Try to load the binary as kernel PE file at first. 986 // If failed, retry to load it as UEFI binary. 987 // As the UEFI binary is formatless, it must be the last option to try. 988 Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => { 989 Self::load_firmware(&kernel, memory_manager)?; 990 arch::layout::UEFI_START 991 } 992 Err(e) => { 993 return Err(Error::KernelLoad(e)); 994 } 995 } 996 } 997 (Some(firmware), None) => { 998 Self::load_firmware(&firmware, memory_manager)?; 999 arch::layout::UEFI_START 1000 } 1001 _ => return Err(Error::InvalidPayload), 1002 }; 1003 1004 Ok(EntryPoint { entry_addr }) 1005 } 1006 1007 #[cfg(feature = "igvm")] 1008 fn load_igvm( 1009 igvm: File, 1010 memory_manager: Arc<Mutex<MemoryManager>>, 1011 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 1012 #[cfg(feature = "sev_snp")] host_data: &Option<String>, 1013 ) -> Result<EntryPoint> { 1014 let res = igvm_loader::load_igvm( 1015 &igvm, 1016 memory_manager, 1017 cpu_manager.clone(), 1018 "", 1019 #[cfg(feature = "sev_snp")] 1020 host_data, 1021 ) 1022 .map_err(Error::IgvmLoad)?; 1023 1024 cfg_if::cfg_if! { 1025 if #[cfg(feature = "sev_snp")] { 1026 let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() { 1027 EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None } 1028 } else { 1029 EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None } 1030 }; 1031 } else { 1032 let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }; 1033 } 1034 }; 1035 Ok(entry_point) 1036 } 1037 1038 #[cfg(target_arch = "x86_64")] 1039 fn load_kernel( 1040 mut kernel: File, 1041 cmdline: Option<Cmdline>, 1042 memory_manager: Arc<Mutex<MemoryManager>>, 1043 ) -> Result<EntryPoint> { 1044 info!("Loading kernel"); 1045 1046 let mem = { 1047 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 1048 guest_memory.memory() 1049 }; 1050 1051 // Try ELF binary with PVH boot. 1052 let entry_addr = linux_loader::loader::elf::Elf::load( 1053 mem.deref(), 1054 None, 1055 &mut kernel, 1056 Some(arch::layout::HIGH_RAM_START), 1057 ) 1058 // Try loading kernel as bzImage. 1059 .or_else(|_| { 1060 BzImage::load( 1061 mem.deref(), 1062 None, 1063 &mut kernel, 1064 Some(arch::layout::HIGH_RAM_START), 1065 ) 1066 }) 1067 .map_err(Error::KernelLoad)?; 1068 1069 if let Some(cmdline) = cmdline { 1070 linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline) 1071 .map_err(Error::LoadCmdLine)?; 1072 } 1073 1074 if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap { 1075 // Use the PVH kernel entry point to boot the guest 1076 info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0); 1077 Ok(EntryPoint { 1078 entry_addr, 1079 setup_header: None, 1080 }) 1081 } else if entry_addr.setup_header.is_some() { 1082 // Use the bzImage 32bit entry point to boot the guest 1083 info!( 1084 "bzImage kernel loaded: entry_addr = 0x{:x}", 1085 entry_addr.kernel_load.0 1086 ); 1087 Ok(EntryPoint { 1088 entry_addr: entry_addr.kernel_load, 1089 setup_header: entry_addr.setup_header, 1090 }) 1091 } else { 1092 Err(Error::KernelMissingPvhHeader) 1093 } 1094 } 1095 1096 #[cfg(target_arch = "x86_64")] 1097 fn load_payload( 1098 payload: &PayloadConfig, 1099 memory_manager: Arc<Mutex<MemoryManager>>, 1100 #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>, 1101 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 1102 ) -> Result<EntryPoint> { 1103 trace_scoped!("load_payload"); 1104 #[cfg(feature = "igvm")] 1105 { 1106 if let Some(_igvm_file) = &payload.igvm { 1107 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?; 1108 #[cfg(feature = "sev_snp")] 1109 if sev_snp_enabled { 1110 return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data); 1111 } 1112 #[cfg(not(feature = "sev_snp"))] 1113 return Self::load_igvm(igvm, memory_manager, cpu_manager); 1114 } 1115 } 1116 match ( 1117 &payload.firmware, 1118 &payload.kernel, 1119 &payload.initramfs, 1120 &payload.cmdline, 1121 ) { 1122 (Some(firmware), None, None, None) => { 1123 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; 1124 Self::load_kernel(firmware, None, memory_manager) 1125 } 1126 (None, Some(kernel), _, _) => { 1127 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1128 let cmdline = Self::generate_cmdline(payload)?; 1129 Self::load_kernel(kernel, Some(cmdline), memory_manager) 1130 } 1131 _ => Err(Error::InvalidPayload), 1132 } 1133 } 1134 1135 #[cfg(target_arch = "aarch64")] 1136 fn load_payload( 1137 payload: &PayloadConfig, 1138 memory_manager: Arc<Mutex<MemoryManager>>, 1139 ) -> Result<EntryPoint> { 1140 match (&payload.firmware, &payload.kernel) { 1141 (Some(firmware), None) => { 1142 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; 1143 Self::load_kernel(Some(firmware), None, memory_manager) 1144 } 1145 (None, Some(kernel)) => { 1146 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1147 Self::load_kernel(None, Some(kernel), memory_manager) 1148 } 1149 _ => Err(Error::InvalidPayload), 1150 } 1151 } 1152 1153 fn load_payload_async( 1154 memory_manager: &Arc<Mutex<MemoryManager>>, 1155 config: &Arc<Mutex<VmConfig>>, 1156 #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>, 1157 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 1158 ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> { 1159 // Kernel with TDX is loaded in a different manner 1160 #[cfg(feature = "tdx")] 1161 if config.lock().unwrap().is_tdx_enabled() { 1162 return Ok(None); 1163 } 1164 1165 config 1166 .lock() 1167 .unwrap() 1168 .payload 1169 .as_ref() 1170 .map(|payload| { 1171 let memory_manager = memory_manager.clone(); 1172 let payload = payload.clone(); 1173 #[cfg(feature = "igvm")] 1174 let cpu_manager = cpu_manager.clone(); 1175 1176 std::thread::Builder::new() 1177 .name("payload_loader".into()) 1178 .spawn(move || { 1179 Self::load_payload( 1180 &payload, 1181 memory_manager, 1182 #[cfg(feature = "igvm")] 1183 cpu_manager, 1184 #[cfg(feature = "sev_snp")] 1185 sev_snp_enabled, 1186 ) 1187 }) 1188 .map_err(Error::KernelLoadThreadSpawn) 1189 }) 1190 .transpose() 1191 } 1192 1193 #[cfg(target_arch = "x86_64")] 1194 fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> { 1195 trace_scoped!("configure_system"); 1196 info!("Configuring system"); 1197 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1198 1199 let initramfs_config = match self.initramfs { 1200 Some(_) => Some(self.load_initramfs(&mem)?), 1201 None => None, 1202 }; 1203 1204 let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); 1205 let rsdp_addr = Some(rsdp_addr); 1206 let sgx_epc_region = self 1207 .memory_manager 1208 .lock() 1209 .unwrap() 1210 .sgx_epc_region() 1211 .as_ref() 1212 .cloned(); 1213 1214 let serial_number = self 1215 .config 1216 .lock() 1217 .unwrap() 1218 .platform 1219 .as_ref() 1220 .and_then(|p| p.serial_number.clone()); 1221 1222 let uuid = self 1223 .config 1224 .lock() 1225 .unwrap() 1226 .platform 1227 .as_ref() 1228 .and_then(|p| p.uuid.clone()); 1229 1230 let oem_strings = self 1231 .config 1232 .lock() 1233 .unwrap() 1234 .platform 1235 .as_ref() 1236 .and_then(|p| p.oem_strings.clone()); 1237 1238 let oem_strings = oem_strings 1239 .as_deref() 1240 .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>()); 1241 1242 let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1243 1244 arch::configure_system( 1245 &mem, 1246 arch::layout::CMDLINE_START, 1247 arch::layout::CMDLINE_MAX_SIZE, 1248 &initramfs_config, 1249 boot_vcpus, 1250 entry_addr.setup_header, 1251 rsdp_addr, 1252 sgx_epc_region, 1253 serial_number.as_deref(), 1254 uuid.as_deref(), 1255 oem_strings.as_deref(), 1256 topology, 1257 ) 1258 .map_err(Error::ConfigureSystem)?; 1259 Ok(()) 1260 } 1261 1262 #[cfg(target_arch = "aarch64")] 1263 fn configure_system( 1264 &mut self, 1265 _rsdp_addr: GuestAddress, 1266 _entry_addr: EntryPoint, 1267 ) -> Result<()> { 1268 let cmdline = Self::generate_cmdline( 1269 self.config.lock().unwrap().payload.as_ref().unwrap(), 1270 &self.device_manager, 1271 )?; 1272 let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs(); 1273 let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1274 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1275 let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new(); 1276 let initramfs_config = match self.initramfs { 1277 Some(_) => Some(self.load_initramfs(&mem)?), 1278 None => None, 1279 }; 1280 1281 let device_info = &self 1282 .device_manager 1283 .lock() 1284 .unwrap() 1285 .get_device_info() 1286 .clone(); 1287 1288 for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() { 1289 let pci_space = PciSpaceInfo { 1290 pci_segment_id: pci_segment.id, 1291 mmio_config_address: pci_segment.mmio_config_address, 1292 pci_device_space_start: pci_segment.start_of_mem64_area, 1293 pci_device_space_size: pci_segment.end_of_mem64_area 1294 - pci_segment.start_of_mem64_area 1295 + 1, 1296 }; 1297 pci_space_info.push(pci_space); 1298 } 1299 1300 let virtio_iommu_bdf = self 1301 .device_manager 1302 .lock() 1303 .unwrap() 1304 .iommu_attached_devices() 1305 .as_ref() 1306 .map(|(v, _)| *v); 1307 1308 let vgic = self 1309 .device_manager 1310 .lock() 1311 .unwrap() 1312 .get_interrupt_controller() 1313 .unwrap() 1314 .lock() 1315 .unwrap() 1316 .get_vgic() 1317 .map_err(|_| { 1318 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1319 arch::aarch64::Error::SetupGic, 1320 )) 1321 })?; 1322 1323 // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number. 1324 let pmu_supported = self 1325 .cpu_manager 1326 .lock() 1327 .unwrap() 1328 .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16) 1329 .map_err(|_| { 1330 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1331 arch::aarch64::Error::VcpuInitPmu, 1332 )) 1333 })?; 1334 1335 arch::configure_system( 1336 &mem, 1337 cmdline.as_cstring().unwrap().to_str().unwrap(), 1338 vcpu_mpidrs, 1339 vcpu_topology, 1340 device_info, 1341 &initramfs_config, 1342 &pci_space_info, 1343 virtio_iommu_bdf.map(|bdf| bdf.into()), 1344 &vgic, 1345 &self.numa_nodes, 1346 pmu_supported, 1347 ) 1348 .map_err(Error::ConfigureSystem)?; 1349 1350 Ok(()) 1351 } 1352 1353 pub fn console_resize_pipe(&self) -> Option<Arc<File>> { 1354 self.device_manager.lock().unwrap().console_resize_pipe() 1355 } 1356 1357 pub fn shutdown(&mut self) -> Result<()> { 1358 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 1359 let new_state = VmState::Shutdown; 1360 1361 state.valid_transition(new_state)?; 1362 1363 // Wake up the DeviceManager threads so they will get terminated cleanly 1364 self.device_manager 1365 .lock() 1366 .unwrap() 1367 .resume() 1368 .map_err(Error::Resume)?; 1369 1370 self.cpu_manager 1371 .lock() 1372 .unwrap() 1373 .shutdown() 1374 .map_err(Error::CpuManager)?; 1375 1376 // Wait for all the threads to finish 1377 for thread in self.threads.drain(..) { 1378 thread.join().map_err(Error::ThreadCleanup)? 1379 } 1380 *state = new_state; 1381 1382 Ok(()) 1383 } 1384 1385 pub fn resize( 1386 &mut self, 1387 desired_vcpus: Option<u8>, 1388 desired_memory: Option<u64>, 1389 desired_balloon: Option<u64>, 1390 ) -> Result<()> { 1391 event!("vm", "resizing"); 1392 1393 if let Some(desired_vcpus) = desired_vcpus { 1394 if self 1395 .cpu_manager 1396 .lock() 1397 .unwrap() 1398 .resize(desired_vcpus) 1399 .map_err(Error::CpuManager)? 1400 { 1401 self.device_manager 1402 .lock() 1403 .unwrap() 1404 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1405 .map_err(Error::DeviceManager)?; 1406 } 1407 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1408 } 1409 1410 if let Some(desired_memory) = desired_memory { 1411 let new_region = self 1412 .memory_manager 1413 .lock() 1414 .unwrap() 1415 .resize(desired_memory) 1416 .map_err(Error::MemoryManager)?; 1417 1418 let memory_config = &mut self.config.lock().unwrap().memory; 1419 1420 if let Some(new_region) = &new_region { 1421 self.device_manager 1422 .lock() 1423 .unwrap() 1424 .update_memory(new_region) 1425 .map_err(Error::DeviceManager)?; 1426 1427 match memory_config.hotplug_method { 1428 HotplugMethod::Acpi => { 1429 self.device_manager 1430 .lock() 1431 .unwrap() 1432 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1433 .map_err(Error::DeviceManager)?; 1434 } 1435 HotplugMethod::VirtioMem => {} 1436 } 1437 } 1438 1439 // We update the VM config regardless of the actual guest resize 1440 // operation result (happened or not), so that if the VM reboots 1441 // it will be running with the last configure memory size. 1442 match memory_config.hotplug_method { 1443 HotplugMethod::Acpi => memory_config.size = desired_memory, 1444 HotplugMethod::VirtioMem => { 1445 if desired_memory > memory_config.size { 1446 memory_config.hotplugged_size = Some(desired_memory - memory_config.size); 1447 } else { 1448 memory_config.hotplugged_size = None; 1449 } 1450 } 1451 } 1452 } 1453 1454 if let Some(desired_balloon) = desired_balloon { 1455 self.device_manager 1456 .lock() 1457 .unwrap() 1458 .resize_balloon(desired_balloon) 1459 .map_err(Error::DeviceManager)?; 1460 1461 // Update the configuration value for the balloon size to ensure 1462 // a reboot would use the right value. 1463 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon { 1464 balloon_config.size = desired_balloon; 1465 } 1466 } 1467 1468 event!("vm", "resized"); 1469 1470 Ok(()) 1471 } 1472 1473 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { 1474 let memory_config = &mut self.config.lock().unwrap().memory; 1475 1476 if let Some(zones) = &mut memory_config.zones { 1477 for zone in zones.iter_mut() { 1478 if zone.id == id { 1479 if desired_memory >= zone.size { 1480 let hotplugged_size = desired_memory - zone.size; 1481 self.memory_manager 1482 .lock() 1483 .unwrap() 1484 .resize_zone(&id, desired_memory - zone.size) 1485 .map_err(Error::MemoryManager)?; 1486 // We update the memory zone config regardless of the 1487 // actual 'resize-zone' operation result (happened or 1488 // not), so that if the VM reboots it will be running 1489 // with the last configured memory zone size. 1490 zone.hotplugged_size = Some(hotplugged_size); 1491 1492 return Ok(()); 1493 } else { 1494 error!( 1495 "Invalid to ask less ({}) than boot RAM ({}) for \ 1496 this memory zone", 1497 desired_memory, zone.size, 1498 ); 1499 return Err(Error::ResizeZone); 1500 } 1501 } 1502 } 1503 } 1504 1505 error!("Could not find the memory zone {} for the resize", id); 1506 Err(Error::ResizeZone) 1507 } 1508 1509 pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1510 let pci_device_info = self 1511 .device_manager 1512 .lock() 1513 .unwrap() 1514 .add_device(&mut device_cfg) 1515 .map_err(Error::DeviceManager)?; 1516 1517 // Update VmConfig by adding the new device. This is important to 1518 // ensure the device would be created in case of a reboot. 1519 { 1520 let mut config = self.config.lock().unwrap(); 1521 add_to_config(&mut config.devices, device_cfg); 1522 } 1523 1524 self.device_manager 1525 .lock() 1526 .unwrap() 1527 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1528 .map_err(Error::DeviceManager)?; 1529 1530 Ok(pci_device_info) 1531 } 1532 1533 pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> { 1534 let pci_device_info = self 1535 .device_manager 1536 .lock() 1537 .unwrap() 1538 .add_user_device(&mut device_cfg) 1539 .map_err(Error::DeviceManager)?; 1540 1541 // Update VmConfig by adding the new device. This is important to 1542 // ensure the device would be created in case of a reboot. 1543 { 1544 let mut config = self.config.lock().unwrap(); 1545 add_to_config(&mut config.user_devices, device_cfg); 1546 } 1547 1548 self.device_manager 1549 .lock() 1550 .unwrap() 1551 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1552 .map_err(Error::DeviceManager)?; 1553 1554 Ok(pci_device_info) 1555 } 1556 1557 pub fn remove_device(&mut self, id: String) -> Result<()> { 1558 self.device_manager 1559 .lock() 1560 .unwrap() 1561 .remove_device(id.clone()) 1562 .map_err(Error::DeviceManager)?; 1563 1564 // Update VmConfig by removing the device. This is important to 1565 // ensure the device would not be created in case of a reboot. 1566 self.config.lock().unwrap().remove_device(&id); 1567 1568 self.device_manager 1569 .lock() 1570 .unwrap() 1571 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1572 .map_err(Error::DeviceManager)?; 1573 Ok(()) 1574 } 1575 1576 pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> { 1577 let pci_device_info = self 1578 .device_manager 1579 .lock() 1580 .unwrap() 1581 .add_disk(&mut disk_cfg) 1582 .map_err(Error::DeviceManager)?; 1583 1584 // Update VmConfig by adding the new device. This is important to 1585 // ensure the device would be created in case of a reboot. 1586 { 1587 let mut config = self.config.lock().unwrap(); 1588 add_to_config(&mut config.disks, disk_cfg); 1589 } 1590 1591 self.device_manager 1592 .lock() 1593 .unwrap() 1594 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1595 .map_err(Error::DeviceManager)?; 1596 1597 Ok(pci_device_info) 1598 } 1599 1600 pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> { 1601 let pci_device_info = self 1602 .device_manager 1603 .lock() 1604 .unwrap() 1605 .add_fs(&mut fs_cfg) 1606 .map_err(Error::DeviceManager)?; 1607 1608 // Update VmConfig by adding the new device. This is important to 1609 // ensure the device would be created in case of a reboot. 1610 { 1611 let mut config = self.config.lock().unwrap(); 1612 add_to_config(&mut config.fs, fs_cfg); 1613 } 1614 1615 self.device_manager 1616 .lock() 1617 .unwrap() 1618 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1619 .map_err(Error::DeviceManager)?; 1620 1621 Ok(pci_device_info) 1622 } 1623 1624 pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> { 1625 let pci_device_info = self 1626 .device_manager 1627 .lock() 1628 .unwrap() 1629 .add_pmem(&mut pmem_cfg) 1630 .map_err(Error::DeviceManager)?; 1631 1632 // Update VmConfig by adding the new device. This is important to 1633 // ensure the device would be created in case of a reboot. 1634 { 1635 let mut config = self.config.lock().unwrap(); 1636 add_to_config(&mut config.pmem, pmem_cfg); 1637 } 1638 1639 self.device_manager 1640 .lock() 1641 .unwrap() 1642 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1643 .map_err(Error::DeviceManager)?; 1644 1645 Ok(pci_device_info) 1646 } 1647 1648 pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> { 1649 let pci_device_info = self 1650 .device_manager 1651 .lock() 1652 .unwrap() 1653 .add_net(&mut net_cfg) 1654 .map_err(Error::DeviceManager)?; 1655 1656 // Update VmConfig by adding the new device. This is important to 1657 // ensure the device would be created in case of a reboot. 1658 { 1659 let mut config = self.config.lock().unwrap(); 1660 add_to_config(&mut config.net, net_cfg); 1661 } 1662 1663 self.device_manager 1664 .lock() 1665 .unwrap() 1666 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1667 .map_err(Error::DeviceManager)?; 1668 1669 Ok(pci_device_info) 1670 } 1671 1672 pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> { 1673 let pci_device_info = self 1674 .device_manager 1675 .lock() 1676 .unwrap() 1677 .add_vdpa(&mut vdpa_cfg) 1678 .map_err(Error::DeviceManager)?; 1679 1680 // Update VmConfig by adding the new device. This is important to 1681 // ensure the device would be created in case of a reboot. 1682 { 1683 let mut config = self.config.lock().unwrap(); 1684 add_to_config(&mut config.vdpa, vdpa_cfg); 1685 } 1686 1687 self.device_manager 1688 .lock() 1689 .unwrap() 1690 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1691 .map_err(Error::DeviceManager)?; 1692 1693 Ok(pci_device_info) 1694 } 1695 1696 pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> { 1697 let pci_device_info = self 1698 .device_manager 1699 .lock() 1700 .unwrap() 1701 .add_vsock(&mut vsock_cfg) 1702 .map_err(Error::DeviceManager)?; 1703 1704 // Update VmConfig by adding the new device. This is important to 1705 // ensure the device would be created in case of a reboot. 1706 { 1707 let mut config = self.config.lock().unwrap(); 1708 config.vsock = Some(vsock_cfg); 1709 } 1710 1711 self.device_manager 1712 .lock() 1713 .unwrap() 1714 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1715 .map_err(Error::DeviceManager)?; 1716 1717 Ok(pci_device_info) 1718 } 1719 1720 pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> { 1721 Ok(self.device_manager.lock().unwrap().counters()) 1722 } 1723 1724 #[cfg(feature = "tdx")] 1725 fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> { 1726 use arch::x86_64::tdx::*; 1727 1728 let firmware_path = self 1729 .config 1730 .lock() 1731 .unwrap() 1732 .payload 1733 .as_ref() 1734 .unwrap() 1735 .firmware 1736 .clone() 1737 .ok_or(Error::TdxFirmwareMissing)?; 1738 // The TDVF file contains a table of section as well as code 1739 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1740 1741 // For all the sections allocate some RAM backing them 1742 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf) 1743 } 1744 1745 #[cfg(feature = "tdx")] 1746 fn hob_memory_resources( 1747 mut sorted_sections: Vec<TdvfSection>, 1748 guest_memory: &GuestMemoryMmap, 1749 ) -> Vec<(u64, u64, bool)> { 1750 let mut list = Vec::new(); 1751 1752 let mut current_section = sorted_sections.pop(); 1753 1754 // RAM regions interleaved with TDVF sections 1755 let mut next_start_addr = 0; 1756 for region in guest_memory.iter() { 1757 let region_start = region.start_addr().0; 1758 let region_end = region.last_addr().0; 1759 if region_start > next_start_addr { 1760 next_start_addr = region_start; 1761 } 1762 1763 loop { 1764 let (start, size, ram) = if let Some(section) = ¤t_section { 1765 if section.address <= next_start_addr { 1766 (section.address, section.size, false) 1767 } else { 1768 let last_addr = std::cmp::min(section.address - 1, region_end); 1769 (next_start_addr, last_addr - next_start_addr + 1, true) 1770 } 1771 } else { 1772 (next_start_addr, region_end - next_start_addr + 1, true) 1773 }; 1774 1775 list.push((start, size, ram)); 1776 1777 if !ram { 1778 current_section = sorted_sections.pop(); 1779 } 1780 1781 next_start_addr = start + size; 1782 1783 if region_start > next_start_addr { 1784 next_start_addr = region_start; 1785 } 1786 1787 if next_start_addr > region_end { 1788 break; 1789 } 1790 } 1791 } 1792 1793 // Once all the interleaved sections have been processed, let's simply 1794 // pull the remaining ones. 1795 if let Some(section) = current_section { 1796 list.push((section.address, section.size, false)); 1797 } 1798 while let Some(section) = sorted_sections.pop() { 1799 list.push((section.address, section.size, false)); 1800 } 1801 1802 list 1803 } 1804 1805 #[cfg(feature = "tdx")] 1806 fn populate_tdx_sections( 1807 &mut self, 1808 sections: &[TdvfSection], 1809 guid_found: bool, 1810 ) -> Result<Option<u64>> { 1811 use arch::x86_64::tdx::*; 1812 // Get the memory end *before* we start adding TDVF ram regions 1813 let boot_guest_memory = self 1814 .memory_manager 1815 .lock() 1816 .as_ref() 1817 .unwrap() 1818 .boot_guest_memory(); 1819 for section in sections { 1820 // No need to allocate if the section falls within guest RAM ranges 1821 if boot_guest_memory.address_in_range(GuestAddress(section.address)) { 1822 info!( 1823 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM", 1824 section 1825 ); 1826 continue; 1827 } 1828 1829 info!("Allocating TDVF Section: {:x?}", section); 1830 self.memory_manager 1831 .lock() 1832 .unwrap() 1833 .add_ram_region(GuestAddress(section.address), section.size as usize) 1834 .map_err(Error::AllocatingTdvfMemory)?; 1835 } 1836 1837 // The TDVF file contains a table of section as well as code 1838 let firmware_path = self 1839 .config 1840 .lock() 1841 .unwrap() 1842 .payload 1843 .as_ref() 1844 .unwrap() 1845 .firmware 1846 .clone() 1847 .ok_or(Error::TdxFirmwareMissing)?; 1848 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1849 1850 // The guest memory at this point now has all the required regions so it 1851 // is safe to copy from the TDVF file into it. 1852 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1853 let mem = guest_memory.memory(); 1854 let mut payload_info = None; 1855 let mut hob_offset = None; 1856 for section in sections { 1857 info!("Populating TDVF Section: {:x?}", section); 1858 match section.r#type { 1859 TdvfSectionType::Bfv | TdvfSectionType::Cfv => { 1860 info!("Copying section to guest memory"); 1861 firmware_file 1862 .seek(SeekFrom::Start(section.data_offset as u64)) 1863 .map_err(Error::LoadTdvf)?; 1864 mem.read_volatile_from( 1865 GuestAddress(section.address), 1866 &mut firmware_file, 1867 section.data_size as usize, 1868 ) 1869 .unwrap(); 1870 } 1871 TdvfSectionType::TdHob => { 1872 hob_offset = Some(section.address); 1873 } 1874 TdvfSectionType::Payload => { 1875 info!("Copying payload to guest memory"); 1876 if let Some(payload_file) = self.kernel.as_mut() { 1877 let payload_size = payload_file 1878 .seek(SeekFrom::End(0)) 1879 .map_err(Error::LoadPayload)?; 1880 1881 payload_file 1882 .seek(SeekFrom::Start(0x1f1)) 1883 .map_err(Error::LoadPayload)?; 1884 1885 let mut payload_header = linux_loader::bootparam::setup_header::default(); 1886 payload_file 1887 .read_volatile(&mut payload_header.as_bytes()) 1888 .unwrap(); 1889 1890 if payload_header.header != 0x5372_6448 { 1891 return Err(Error::InvalidPayloadType); 1892 } 1893 1894 if (payload_header.version < 0x0200) 1895 || ((payload_header.loadflags & 0x1) == 0x0) 1896 { 1897 return Err(Error::InvalidPayloadType); 1898 } 1899 1900 payload_file.rewind().map_err(Error::LoadPayload)?; 1901 mem.read_volatile_from( 1902 GuestAddress(section.address), 1903 payload_file, 1904 payload_size as usize, 1905 ) 1906 .unwrap(); 1907 1908 // Create the payload info that will be inserted into 1909 // the HOB. 1910 payload_info = Some(PayloadInfo { 1911 image_type: PayloadImageType::BzImage, 1912 entry_point: section.address, 1913 }); 1914 } 1915 } 1916 TdvfSectionType::PayloadParam => { 1917 info!("Copying payload parameters to guest memory"); 1918 let cmdline = Self::generate_cmdline( 1919 self.config.lock().unwrap().payload.as_ref().unwrap(), 1920 )?; 1921 mem.write_slice( 1922 cmdline.as_cstring().unwrap().as_bytes_with_nul(), 1923 GuestAddress(section.address), 1924 ) 1925 .unwrap(); 1926 } 1927 _ => {} 1928 } 1929 } 1930 1931 // Generate HOB 1932 let mut hob = TdHob::start(hob_offset.unwrap()); 1933 1934 let mut sorted_sections = sections.to_vec(); 1935 sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem)); 1936 1937 sorted_sections.sort_by_key(|section| section.address); 1938 sorted_sections.reverse(); 1939 1940 for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) { 1941 hob.add_memory_resource(&mem, start, size, ram, guid_found) 1942 .map_err(Error::PopulateHob)?; 1943 } 1944 1945 // MMIO regions 1946 hob.add_mmio_resource( 1947 &mem, 1948 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1949 arch::layout::APIC_START.raw_value() 1950 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1951 ) 1952 .map_err(Error::PopulateHob)?; 1953 let start_of_device_area = self 1954 .memory_manager 1955 .lock() 1956 .unwrap() 1957 .start_of_device_area() 1958 .raw_value(); 1959 let end_of_device_area = self 1960 .memory_manager 1961 .lock() 1962 .unwrap() 1963 .end_of_device_area() 1964 .raw_value(); 1965 hob.add_mmio_resource( 1966 &mem, 1967 start_of_device_area, 1968 end_of_device_area - start_of_device_area, 1969 ) 1970 .map_err(Error::PopulateHob)?; 1971 1972 // Loop over the ACPI tables and copy them to the HOB. 1973 1974 for acpi_table in crate::acpi::create_acpi_tables_tdx( 1975 &self.device_manager, 1976 &self.cpu_manager, 1977 &self.memory_manager, 1978 &self.numa_nodes, 1979 ) { 1980 hob.add_acpi_table(&mem, acpi_table.as_slice()) 1981 .map_err(Error::PopulateHob)?; 1982 } 1983 1984 // If a payload info has been created, let's insert it into the HOB. 1985 if let Some(payload_info) = payload_info { 1986 hob.add_payload(&mem, payload_info) 1987 .map_err(Error::PopulateHob)?; 1988 } 1989 1990 hob.finish(&mem).map_err(Error::PopulateHob)?; 1991 1992 Ok(hob_offset) 1993 } 1994 1995 #[cfg(feature = "tdx")] 1996 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 1997 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1998 let mem = guest_memory.memory(); 1999 2000 for section in sections { 2001 self.vm 2002 .tdx_init_memory_region( 2003 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 2004 section.address, 2005 section.size, 2006 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 2007 section.attributes == 1, 2008 ) 2009 .map_err(Error::InitializeTdxMemoryRegion)?; 2010 } 2011 2012 Ok(()) 2013 } 2014 2015 // Creates ACPI tables 2016 // In case of TDX being used, this is a no-op since the tables will be 2017 // created and passed when populating the HOB. 2018 2019 fn create_acpi_tables(&self) -> Option<GuestAddress> { 2020 #[cfg(feature = "tdx")] 2021 if self.config.lock().unwrap().is_tdx_enabled() { 2022 return None; 2023 } 2024 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 2025 let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); 2026 let rsdp_addr = crate::acpi::create_acpi_tables( 2027 &mem, 2028 &self.device_manager, 2029 &self.cpu_manager, 2030 &self.memory_manager, 2031 &self.numa_nodes, 2032 tpm_enabled, 2033 ); 2034 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 2035 2036 Some(rsdp_addr) 2037 } 2038 2039 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 2040 trace_scoped!("entry_point"); 2041 2042 self.load_payload_handle 2043 .take() 2044 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 2045 .transpose() 2046 } 2047 2048 pub fn boot(&mut self) -> Result<()> { 2049 trace_scoped!("Vm::boot"); 2050 let current_state = self.get_state()?; 2051 if current_state == VmState::Paused { 2052 return self.resume().map_err(Error::Resume); 2053 } 2054 2055 let new_state = if self.stop_on_boot { 2056 VmState::BreakPoint 2057 } else { 2058 VmState::Running 2059 }; 2060 current_state.valid_transition(new_state)?; 2061 2062 // Do earlier to parallelise with loading kernel 2063 #[cfg(target_arch = "x86_64")] 2064 cfg_if::cfg_if! { 2065 if #[cfg(feature = "sev_snp")] { 2066 let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled(); 2067 let rsdp_addr = if sev_snp_enabled { 2068 // In case of SEV-SNP guest ACPI tables are provided via 2069 // IGVM. So skip the creation of ACPI tables and set the 2070 // rsdp addr to None. 2071 None 2072 } else { 2073 self.create_acpi_tables() 2074 }; 2075 } else { 2076 let rsdp_addr = self.create_acpi_tables(); 2077 } 2078 } 2079 2080 // Load kernel synchronously or if asynchronous then wait for load to 2081 // finish. 2082 let entry_point = self.entry_point()?; 2083 2084 #[cfg(feature = "tdx")] 2085 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2086 2087 // Configure the vcpus that have been created 2088 let vcpus = self.cpu_manager.lock().unwrap().vcpus(); 2089 for vcpu in vcpus { 2090 let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2091 let boot_setup = entry_point.map(|e| (e, guest_memory)); 2092 self.cpu_manager 2093 .lock() 2094 .unwrap() 2095 .configure_vcpu(vcpu, boot_setup) 2096 .map_err(Error::CpuManager)?; 2097 } 2098 2099 #[cfg(feature = "tdx")] 2100 let (sections, guid_found) = if tdx_enabled { 2101 self.extract_tdvf_sections()? 2102 } else { 2103 (Vec::new(), false) 2104 }; 2105 2106 // Configuring the TDX regions requires that the vCPUs are created. 2107 #[cfg(feature = "tdx")] 2108 let hob_address = if tdx_enabled { 2109 // TDX sections are written to memory. 2110 self.populate_tdx_sections(§ions, guid_found)? 2111 } else { 2112 None 2113 }; 2114 2115 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 2116 // available after they are configured 2117 #[cfg(target_arch = "aarch64")] 2118 let rsdp_addr = self.create_acpi_tables(); 2119 2120 // Configure shared state based on loaded kernel 2121 entry_point 2122 .map(|entry_point| { 2123 // Safe to unwrap rsdp_addr as we know it can't be None when 2124 // the entry_point is Some. 2125 self.configure_system(rsdp_addr.unwrap(), entry_point) 2126 }) 2127 .transpose()?; 2128 2129 #[cfg(target_arch = "x86_64")] 2130 // Note: For x86, always call this function before invoking start boot vcpus. 2131 // Otherwise guest would fail to boot because we haven't created the 2132 // userspace mappings to update the hypervisor about the memory mappings. 2133 // These mappings must be created before we start the vCPU threads for 2134 // the very first time. 2135 self.memory_manager 2136 .lock() 2137 .unwrap() 2138 .allocate_address_space() 2139 .map_err(Error::MemoryManager)?; 2140 2141 #[cfg(feature = "tdx")] 2142 if let Some(hob_address) = hob_address { 2143 // With the HOB address extracted the vCPUs can have 2144 // their TDX state configured. 2145 self.cpu_manager 2146 .lock() 2147 .unwrap() 2148 .initialize_tdx(hob_address) 2149 .map_err(Error::CpuManager)?; 2150 // Let the hypervisor know which memory ranges are shared with the 2151 // guest. This prevents the guest from ignoring/discarding memory 2152 // regions provided by the host. 2153 self.init_tdx_memory(§ions)?; 2154 // With TDX memory and CPU state configured TDX setup is complete 2155 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2156 } 2157 2158 // Resume the vm for MSHV 2159 if current_state == VmState::Created { 2160 self.vm.resume().map_err(Error::ResumeVm)?; 2161 } 2162 2163 self.cpu_manager 2164 .lock() 2165 .unwrap() 2166 .start_boot_vcpus(new_state == VmState::BreakPoint) 2167 .map_err(Error::CpuManager)?; 2168 2169 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2170 *state = new_state; 2171 Ok(()) 2172 } 2173 2174 pub fn restore(&mut self) -> Result<()> { 2175 event!("vm", "restoring"); 2176 2177 #[cfg(target_arch = "x86_64")] 2178 // Note: For x86, always call this function before invoking start boot vcpus. 2179 // Otherwise guest would fail to boot because we haven't created the 2180 // userspace mappings to update the hypervisor about the memory mappings. 2181 // These mappings must be created before we start the vCPU threads for 2182 // the very first time for the restored VM. 2183 self.memory_manager 2184 .lock() 2185 .unwrap() 2186 .allocate_address_space() 2187 .map_err(Error::MemoryManager)?; 2188 2189 // Now we can start all vCPUs from here. 2190 self.cpu_manager 2191 .lock() 2192 .unwrap() 2193 .start_restored_vcpus() 2194 .map_err(Error::CpuManager)?; 2195 2196 event!("vm", "restored"); 2197 Ok(()) 2198 } 2199 2200 /// Gets a thread-safe reference counted pointer to the VM configuration. 2201 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2202 Arc::clone(&self.config) 2203 } 2204 2205 /// Get the VM state. Returns an error if the state is poisoned. 2206 pub fn get_state(&self) -> Result<VmState> { 2207 self.state 2208 .try_read() 2209 .map_err(|_| Error::PoisonedState) 2210 .map(|state| *state) 2211 } 2212 2213 /// Gets the actual size of the balloon. 2214 pub fn balloon_size(&self) -> u64 { 2215 self.device_manager.lock().unwrap().balloon_size() 2216 } 2217 2218 pub fn send_memory_fds( 2219 &mut self, 2220 socket: &mut UnixStream, 2221 ) -> std::result::Result<(), MigratableError> { 2222 for (slot, fd) in self 2223 .memory_manager 2224 .lock() 2225 .unwrap() 2226 .memory_slot_fds() 2227 .drain() 2228 { 2229 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2230 .write_to(socket) 2231 .map_err(|e| { 2232 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2233 })?; 2234 socket 2235 .send_with_fd(&slot.to_le_bytes()[..], fd) 2236 .map_err(|e| { 2237 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2238 })?; 2239 2240 Response::read_from(socket)?.ok_or_abandon( 2241 socket, 2242 MigratableError::MigrateSend(anyhow!("Error during memory fd migration")), 2243 )?; 2244 } 2245 2246 Ok(()) 2247 } 2248 2249 pub fn send_memory_regions<F>( 2250 &mut self, 2251 ranges: &MemoryRangeTable, 2252 fd: &mut F, 2253 ) -> std::result::Result<(), MigratableError> 2254 where 2255 F: WriteVolatile, 2256 { 2257 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2258 let mem = guest_memory.memory(); 2259 2260 for range in ranges.regions() { 2261 let mut offset: u64 = 0; 2262 // Here we are manually handling the retry in case we can't the 2263 // whole region at once because we can't use the implementation 2264 // from vm-memory::GuestMemory of write_all_to() as it is not 2265 // following the correct behavior. For more info about this issue 2266 // see: https://github.com/rust-vmm/vm-memory/issues/174 2267 loop { 2268 let bytes_written = mem 2269 .write_volatile_to( 2270 GuestAddress(range.gpa + offset), 2271 fd, 2272 (range.length - offset) as usize, 2273 ) 2274 .map_err(|e| { 2275 MigratableError::MigrateSend(anyhow!( 2276 "Error transferring memory to socket: {}", 2277 e 2278 )) 2279 })?; 2280 offset += bytes_written as u64; 2281 2282 if offset == range.length { 2283 break; 2284 } 2285 } 2286 } 2287 2288 Ok(()) 2289 } 2290 2291 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2292 self.memory_manager 2293 .lock() 2294 .unwrap() 2295 .memory_range_table(false) 2296 } 2297 2298 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2299 self.device_manager.lock().unwrap().device_tree() 2300 } 2301 2302 pub fn activate_virtio_devices(&self) -> Result<()> { 2303 self.device_manager 2304 .lock() 2305 .unwrap() 2306 .activate_virtio_devices() 2307 .map_err(Error::ActivateVirtioDevices) 2308 } 2309 2310 #[cfg(target_arch = "x86_64")] 2311 pub fn power_button(&self) -> Result<()> { 2312 return self 2313 .device_manager 2314 .lock() 2315 .unwrap() 2316 .notify_power_button() 2317 .map_err(Error::PowerButton); 2318 } 2319 2320 #[cfg(target_arch = "aarch64")] 2321 pub fn power_button(&self) -> Result<()> { 2322 self.device_manager 2323 .lock() 2324 .unwrap() 2325 .notify_power_button() 2326 .map_err(Error::PowerButton) 2327 } 2328 2329 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2330 self.memory_manager.lock().unwrap().snapshot_data() 2331 } 2332 2333 #[cfg(feature = "guest_debug")] 2334 pub fn debug_request( 2335 &mut self, 2336 gdb_request: &GdbRequestPayload, 2337 cpu_id: usize, 2338 ) -> Result<GdbResponsePayload> { 2339 use GdbRequestPayload::*; 2340 match gdb_request { 2341 SetSingleStep(single_step) => { 2342 self.set_guest_debug(cpu_id, &[], *single_step) 2343 .map_err(Error::Debug)?; 2344 } 2345 SetHwBreakPoint(addrs) => { 2346 self.set_guest_debug(cpu_id, addrs, false) 2347 .map_err(Error::Debug)?; 2348 } 2349 Pause => { 2350 self.debug_pause().map_err(Error::Debug)?; 2351 } 2352 Resume => { 2353 self.debug_resume().map_err(Error::Debug)?; 2354 } 2355 ReadRegs => { 2356 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?; 2357 return Ok(GdbResponsePayload::RegValues(Box::new(regs))); 2358 } 2359 WriteRegs(regs) => { 2360 self.write_regs(cpu_id, regs).map_err(Error::Debug)?; 2361 } 2362 ReadMem(vaddr, len) => { 2363 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2364 let mem = self 2365 .read_mem(&guest_memory, cpu_id, *vaddr, *len) 2366 .map_err(Error::Debug)?; 2367 return Ok(GdbResponsePayload::MemoryRegion(mem)); 2368 } 2369 WriteMem(vaddr, data) => { 2370 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2371 self.write_mem(&guest_memory, cpu_id, vaddr, data) 2372 .map_err(Error::Debug)?; 2373 } 2374 ActiveVcpus => { 2375 let active_vcpus = self.active_vcpus(); 2376 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus)); 2377 } 2378 } 2379 Ok(GdbResponsePayload::CommandComplete) 2380 } 2381 2382 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2383 fn get_dump_state( 2384 &mut self, 2385 destination_url: &str, 2386 ) -> std::result::Result<DumpState, GuestDebuggableError> { 2387 let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32; 2388 let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize; 2389 let mut elf_phdr_num = 1; 2390 let elf_sh_info = 0; 2391 let coredump_file_path = url_to_file(destination_url)?; 2392 let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings(); 2393 2394 if mapping_num < UINT16_MAX - 2 { 2395 elf_phdr_num += mapping_num as u16; 2396 } else { 2397 panic!("mapping num beyond 65535 not supported"); 2398 } 2399 let coredump_file = OpenOptions::new() 2400 .read(true) 2401 .write(true) 2402 .create_new(true) 2403 .open(coredump_file_path) 2404 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2405 2406 let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size); 2407 let mem_data = self 2408 .memory_manager 2409 .lock() 2410 .unwrap() 2411 .coredump_memory_regions(mem_offset); 2412 2413 Ok(DumpState { 2414 elf_note_size, 2415 elf_phdr_num, 2416 elf_sh_info, 2417 mem_offset, 2418 mem_info: Some(mem_data), 2419 file: Some(coredump_file), 2420 }) 2421 } 2422 2423 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2424 fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 { 2425 size_of::<elf::Elf64_Ehdr>() as u64 2426 + note_size as u64 2427 + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64 2428 } 2429 2430 pub fn nmi(&self) -> Result<()> { 2431 return self 2432 .cpu_manager 2433 .lock() 2434 .unwrap() 2435 .nmi() 2436 .map_err(|_| Error::ErrorNmi); 2437 } 2438 } 2439 2440 impl Pausable for Vm { 2441 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2442 event!("vm", "pausing"); 2443 let mut state = self 2444 .state 2445 .try_write() 2446 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?; 2447 let new_state = VmState::Paused; 2448 2449 state 2450 .valid_transition(new_state) 2451 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?; 2452 2453 #[cfg(target_arch = "x86_64")] 2454 { 2455 let mut clock = self 2456 .vm 2457 .get_clock() 2458 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?; 2459 clock.reset_flags(); 2460 self.saved_clock = Some(clock); 2461 } 2462 2463 // Before pausing the vCPUs activate any pending virtio devices that might 2464 // need activation between starting the pause (or e.g. a migration it's part of) 2465 self.activate_virtio_devices().map_err(|e| { 2466 MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e)) 2467 })?; 2468 2469 self.cpu_manager.lock().unwrap().pause()?; 2470 self.device_manager.lock().unwrap().pause()?; 2471 2472 self.vm 2473 .pause() 2474 .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?; 2475 2476 *state = new_state; 2477 2478 event!("vm", "paused"); 2479 Ok(()) 2480 } 2481 2482 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2483 event!("vm", "resuming"); 2484 let current_state = self.get_state().unwrap(); 2485 let mut state = self 2486 .state 2487 .try_write() 2488 .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?; 2489 let new_state = VmState::Running; 2490 2491 state 2492 .valid_transition(new_state) 2493 .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?; 2494 2495 self.cpu_manager.lock().unwrap().resume()?; 2496 #[cfg(target_arch = "x86_64")] 2497 { 2498 if let Some(clock) = &self.saved_clock { 2499 self.vm.set_clock(clock).map_err(|e| { 2500 MigratableError::Resume(anyhow!("Could not set VM clock: {}", e)) 2501 })?; 2502 } 2503 } 2504 2505 if current_state == VmState::Paused { 2506 self.vm 2507 .resume() 2508 .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?; 2509 } 2510 2511 self.device_manager.lock().unwrap().resume()?; 2512 2513 // And we're back to the Running state. 2514 *state = new_state; 2515 event!("vm", "resumed"); 2516 Ok(()) 2517 } 2518 } 2519 2520 #[derive(Serialize, Deserialize)] 2521 pub struct VmSnapshot { 2522 #[cfg(target_arch = "x86_64")] 2523 pub clock: Option<hypervisor::ClockData>, 2524 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2525 pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>, 2526 } 2527 2528 pub const VM_SNAPSHOT_ID: &str = "vm"; 2529 impl Snapshottable for Vm { 2530 fn id(&self) -> String { 2531 VM_SNAPSHOT_ID.to_string() 2532 } 2533 2534 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2535 event!("vm", "snapshotting"); 2536 2537 #[cfg(feature = "tdx")] 2538 { 2539 if self.config.lock().unwrap().is_tdx_enabled() { 2540 return Err(MigratableError::Snapshot(anyhow!( 2541 "Snapshot not possible with TDX VM" 2542 ))); 2543 } 2544 } 2545 2546 let current_state = self.get_state().unwrap(); 2547 if current_state != VmState::Paused { 2548 return Err(MigratableError::Snapshot(anyhow!( 2549 "Trying to snapshot while VM is running" 2550 ))); 2551 } 2552 2553 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2554 let common_cpuid = { 2555 let amx = self.config.lock().unwrap().cpus.features.amx; 2556 let phys_bits = physical_bits( 2557 &self.hypervisor, 2558 self.config.lock().unwrap().cpus.max_phys_bits, 2559 ); 2560 arch::generate_common_cpuid( 2561 &self.hypervisor, 2562 &arch::CpuidConfig { 2563 sgx_epc_sections: None, 2564 phys_bits, 2565 kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv, 2566 #[cfg(feature = "tdx")] 2567 tdx: false, 2568 amx, 2569 }, 2570 ) 2571 .map_err(|e| { 2572 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e)) 2573 })? 2574 }; 2575 2576 let vm_snapshot_state = VmSnapshot { 2577 #[cfg(target_arch = "x86_64")] 2578 clock: self.saved_clock, 2579 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2580 common_cpuid, 2581 }; 2582 2583 let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?; 2584 2585 let (id, snapshot) = { 2586 let mut cpu_manager = self.cpu_manager.lock().unwrap(); 2587 (cpu_manager.id(), cpu_manager.snapshot()?) 2588 }; 2589 vm_snapshot.add_snapshot(id, snapshot); 2590 let (id, snapshot) = { 2591 let mut memory_manager = self.memory_manager.lock().unwrap(); 2592 (memory_manager.id(), memory_manager.snapshot()?) 2593 }; 2594 vm_snapshot.add_snapshot(id, snapshot); 2595 let (id, snapshot) = { 2596 let mut device_manager = self.device_manager.lock().unwrap(); 2597 (device_manager.id(), device_manager.snapshot()?) 2598 }; 2599 vm_snapshot.add_snapshot(id, snapshot); 2600 2601 event!("vm", "snapshotted"); 2602 Ok(vm_snapshot) 2603 } 2604 } 2605 2606 impl Transportable for Vm { 2607 fn send( 2608 &self, 2609 snapshot: &Snapshot, 2610 destination_url: &str, 2611 ) -> std::result::Result<(), MigratableError> { 2612 let mut snapshot_config_path = url_to_path(destination_url)?; 2613 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2614 2615 // Create the snapshot config file 2616 let mut snapshot_config_file = OpenOptions::new() 2617 .read(true) 2618 .write(true) 2619 .create_new(true) 2620 .open(snapshot_config_path) 2621 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2622 2623 // Serialize and write the snapshot config 2624 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2625 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2626 2627 snapshot_config_file 2628 .write(vm_config.as_bytes()) 2629 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2630 2631 let mut snapshot_state_path = url_to_path(destination_url)?; 2632 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2633 2634 // Create the snapshot state file 2635 let mut snapshot_state_file = OpenOptions::new() 2636 .read(true) 2637 .write(true) 2638 .create_new(true) 2639 .open(snapshot_state_path) 2640 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2641 2642 // Serialize and write the snapshot state 2643 let vm_state = 2644 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2645 2646 snapshot_state_file 2647 .write(&vm_state) 2648 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2649 2650 // Tell the memory manager to also send/write its own snapshot. 2651 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2652 self.memory_manager 2653 .lock() 2654 .unwrap() 2655 .send(&memory_manager_snapshot.clone(), destination_url)?; 2656 } else { 2657 return Err(MigratableError::Restore(anyhow!( 2658 "Missing memory manager snapshot" 2659 ))); 2660 } 2661 2662 Ok(()) 2663 } 2664 } 2665 2666 impl Migratable for Vm { 2667 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2668 self.memory_manager.lock().unwrap().start_dirty_log()?; 2669 self.device_manager.lock().unwrap().start_dirty_log() 2670 } 2671 2672 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2673 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2674 self.device_manager.lock().unwrap().stop_dirty_log() 2675 } 2676 2677 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2678 Ok(MemoryRangeTable::new_from_tables(vec![ 2679 self.memory_manager.lock().unwrap().dirty_log()?, 2680 self.device_manager.lock().unwrap().dirty_log()?, 2681 ])) 2682 } 2683 2684 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2685 self.memory_manager.lock().unwrap().start_migration()?; 2686 self.device_manager.lock().unwrap().start_migration() 2687 } 2688 2689 fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { 2690 self.memory_manager.lock().unwrap().complete_migration()?; 2691 self.device_manager.lock().unwrap().complete_migration() 2692 } 2693 } 2694 2695 #[cfg(feature = "guest_debug")] 2696 impl Debuggable for Vm { 2697 fn set_guest_debug( 2698 &self, 2699 cpu_id: usize, 2700 addrs: &[GuestAddress], 2701 singlestep: bool, 2702 ) -> std::result::Result<(), DebuggableError> { 2703 self.cpu_manager 2704 .lock() 2705 .unwrap() 2706 .set_guest_debug(cpu_id, addrs, singlestep) 2707 } 2708 2709 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2710 if *self.state.read().unwrap() == VmState::Running { 2711 self.pause().map_err(DebuggableError::Pause)?; 2712 } 2713 2714 let mut state = self 2715 .state 2716 .try_write() 2717 .map_err(|_| DebuggableError::PoisonedState)?; 2718 *state = VmState::BreakPoint; 2719 Ok(()) 2720 } 2721 2722 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2723 if *self.state.read().unwrap() == VmState::BreakPoint { 2724 self.resume().map_err(DebuggableError::Pause)?; 2725 } 2726 2727 Ok(()) 2728 } 2729 2730 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2731 self.cpu_manager.lock().unwrap().read_regs(cpu_id) 2732 } 2733 2734 fn write_regs( 2735 &self, 2736 cpu_id: usize, 2737 regs: &CoreRegs, 2738 ) -> std::result::Result<(), DebuggableError> { 2739 self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs) 2740 } 2741 2742 fn read_mem( 2743 &self, 2744 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2745 cpu_id: usize, 2746 vaddr: GuestAddress, 2747 len: usize, 2748 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2749 self.cpu_manager 2750 .lock() 2751 .unwrap() 2752 .read_mem(guest_memory, cpu_id, vaddr, len) 2753 } 2754 2755 fn write_mem( 2756 &self, 2757 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2758 cpu_id: usize, 2759 vaddr: &GuestAddress, 2760 data: &[u8], 2761 ) -> std::result::Result<(), DebuggableError> { 2762 self.cpu_manager 2763 .lock() 2764 .unwrap() 2765 .write_mem(guest_memory, cpu_id, vaddr, data) 2766 } 2767 2768 fn active_vcpus(&self) -> usize { 2769 let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus(); 2770 if active_vcpus > 0 { 2771 active_vcpus 2772 } else { 2773 // The VM is not booted yet. Report boot_vcpus() instead. 2774 self.cpu_manager.lock().unwrap().boot_vcpus() as usize 2775 } 2776 } 2777 } 2778 2779 #[cfg(feature = "guest_debug")] 2780 pub const UINT16_MAX: u32 = 65535; 2781 2782 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2783 impl Elf64Writable for Vm {} 2784 2785 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2786 impl GuestDebuggable for Vm { 2787 fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> { 2788 event!("vm", "coredumping"); 2789 2790 let mut resume = false; 2791 2792 #[cfg(feature = "tdx")] 2793 { 2794 if let Some(ref platform) = self.config.lock().unwrap().platform { 2795 if platform.tdx { 2796 return Err(GuestDebuggableError::Coredump(anyhow!( 2797 "Coredump not possible with TDX VM" 2798 ))); 2799 } 2800 } 2801 } 2802 2803 match self.get_state().unwrap() { 2804 VmState::Running => { 2805 self.pause().map_err(GuestDebuggableError::Pause)?; 2806 resume = true; 2807 } 2808 VmState::Paused => {} 2809 _ => { 2810 return Err(GuestDebuggableError::Coredump(anyhow!( 2811 "Trying to coredump while VM is not running or paused" 2812 ))); 2813 } 2814 } 2815 2816 let coredump_state = self.get_dump_state(destination_url)?; 2817 2818 self.write_header(&coredump_state)?; 2819 self.write_note(&coredump_state)?; 2820 self.write_loads(&coredump_state)?; 2821 2822 self.cpu_manager 2823 .lock() 2824 .unwrap() 2825 .cpu_write_elf64_note(&coredump_state)?; 2826 self.cpu_manager 2827 .lock() 2828 .unwrap() 2829 .cpu_write_vmm_note(&coredump_state)?; 2830 2831 self.memory_manager 2832 .lock() 2833 .unwrap() 2834 .coredump_iterate_save_mem(&coredump_state)?; 2835 2836 if resume { 2837 self.resume().map_err(GuestDebuggableError::Resume)?; 2838 } 2839 2840 Ok(()) 2841 } 2842 } 2843 2844 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2845 #[cfg(test)] 2846 mod tests { 2847 use super::*; 2848 2849 fn test_vm_state_transitions(state: VmState) { 2850 match state { 2851 VmState::Created => { 2852 // Check the transitions from Created 2853 state.valid_transition(VmState::Created).unwrap_err(); 2854 state.valid_transition(VmState::Running).unwrap(); 2855 state.valid_transition(VmState::Shutdown).unwrap(); 2856 state.valid_transition(VmState::Paused).unwrap(); 2857 state.valid_transition(VmState::BreakPoint).unwrap(); 2858 } 2859 VmState::Running => { 2860 // Check the transitions from Running 2861 state.valid_transition(VmState::Created).unwrap_err(); 2862 state.valid_transition(VmState::Running).unwrap_err(); 2863 state.valid_transition(VmState::Shutdown).unwrap(); 2864 state.valid_transition(VmState::Paused).unwrap(); 2865 state.valid_transition(VmState::BreakPoint).unwrap(); 2866 } 2867 VmState::Shutdown => { 2868 // Check the transitions from Shutdown 2869 state.valid_transition(VmState::Created).unwrap_err(); 2870 state.valid_transition(VmState::Running).unwrap(); 2871 state.valid_transition(VmState::Shutdown).unwrap_err(); 2872 state.valid_transition(VmState::Paused).unwrap_err(); 2873 state.valid_transition(VmState::BreakPoint).unwrap_err(); 2874 } 2875 VmState::Paused => { 2876 // Check the transitions from Paused 2877 state.valid_transition(VmState::Created).unwrap_err(); 2878 state.valid_transition(VmState::Running).unwrap(); 2879 state.valid_transition(VmState::Shutdown).unwrap(); 2880 state.valid_transition(VmState::Paused).unwrap_err(); 2881 state.valid_transition(VmState::BreakPoint).unwrap_err(); 2882 } 2883 VmState::BreakPoint => { 2884 // Check the transitions from Breakpoint 2885 state.valid_transition(VmState::Created).unwrap(); 2886 state.valid_transition(VmState::Running).unwrap(); 2887 state.valid_transition(VmState::Shutdown).unwrap_err(); 2888 state.valid_transition(VmState::Paused).unwrap_err(); 2889 state.valid_transition(VmState::BreakPoint).unwrap_err(); 2890 } 2891 } 2892 } 2893 2894 #[test] 2895 fn test_vm_created_transitions() { 2896 test_vm_state_transitions(VmState::Created); 2897 } 2898 2899 #[test] 2900 fn test_vm_running_transitions() { 2901 test_vm_state_transitions(VmState::Running); 2902 } 2903 2904 #[test] 2905 fn test_vm_shutdown_transitions() { 2906 test_vm_state_transitions(VmState::Shutdown); 2907 } 2908 2909 #[test] 2910 fn test_vm_paused_transitions() { 2911 test_vm_state_transitions(VmState::Paused); 2912 } 2913 2914 #[cfg(feature = "tdx")] 2915 #[test] 2916 fn test_hob_memory_resources() { 2917 // Case 1: Two TDVF sections in the middle of the RAM 2918 let sections = vec![ 2919 TdvfSection { 2920 address: 0xc000, 2921 size: 0x1000, 2922 ..Default::default() 2923 }, 2924 TdvfSection { 2925 address: 0x1000, 2926 size: 0x4000, 2927 ..Default::default() 2928 }, 2929 ]; 2930 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)]; 2931 let expected = vec![ 2932 (0, 0x1000, true), 2933 (0x1000, 0x4000, false), 2934 (0x5000, 0x7000, true), 2935 (0xc000, 0x1000, false), 2936 (0xd000, 0x0fff_3000, true), 2937 ]; 2938 assert_eq!( 2939 expected, 2940 Vm::hob_memory_resources( 2941 sections, 2942 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 2943 ) 2944 ); 2945 2946 // Case 2: Two TDVF sections with no conflict with the RAM 2947 let sections = vec![ 2948 TdvfSection { 2949 address: 0x1000_1000, 2950 size: 0x1000, 2951 ..Default::default() 2952 }, 2953 TdvfSection { 2954 address: 0, 2955 size: 0x1000, 2956 ..Default::default() 2957 }, 2958 ]; 2959 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 2960 let expected = vec![ 2961 (0, 0x1000, false), 2962 (0x1000, 0x1000_0000, true), 2963 (0x1000_1000, 0x1000, false), 2964 ]; 2965 assert_eq!( 2966 expected, 2967 Vm::hob_memory_resources( 2968 sections, 2969 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 2970 ) 2971 ); 2972 2973 // Case 3: Two TDVF sections with partial conflicts with the RAM 2974 let sections = vec![ 2975 TdvfSection { 2976 address: 0x1000_0000, 2977 size: 0x2000, 2978 ..Default::default() 2979 }, 2980 TdvfSection { 2981 address: 0, 2982 size: 0x2000, 2983 ..Default::default() 2984 }, 2985 ]; 2986 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 2987 let expected = vec![ 2988 (0, 0x2000, false), 2989 (0x2000, 0x0fff_e000, true), 2990 (0x1000_0000, 0x2000, false), 2991 ]; 2992 assert_eq!( 2993 expected, 2994 Vm::hob_memory_resources( 2995 sections, 2996 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 2997 ) 2998 ); 2999 3000 // Case 4: Two TDVF sections with no conflict before the RAM and two 3001 // more additional sections with no conflict after the RAM. 3002 let sections = vec![ 3003 TdvfSection { 3004 address: 0x2000_1000, 3005 size: 0x1000, 3006 ..Default::default() 3007 }, 3008 TdvfSection { 3009 address: 0x2000_0000, 3010 size: 0x1000, 3011 ..Default::default() 3012 }, 3013 TdvfSection { 3014 address: 0x1000, 3015 size: 0x1000, 3016 ..Default::default() 3017 }, 3018 TdvfSection { 3019 address: 0, 3020 size: 0x1000, 3021 ..Default::default() 3022 }, 3023 ]; 3024 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)]; 3025 let expected = vec![ 3026 (0, 0x1000, false), 3027 (0x1000, 0x1000, false), 3028 (0x4000, 0x1000_0000, true), 3029 (0x2000_0000, 0x1000, false), 3030 (0x2000_1000, 0x1000, false), 3031 ]; 3032 assert_eq!( 3033 expected, 3034 Vm::hob_memory_resources( 3035 sections, 3036 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3037 ) 3038 ); 3039 3040 // Case 5: One TDVF section overriding the entire RAM 3041 let sections = vec![TdvfSection { 3042 address: 0, 3043 size: 0x2000_0000, 3044 ..Default::default() 3045 }]; 3046 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3047 let expected = vec![(0, 0x2000_0000, false)]; 3048 assert_eq!( 3049 expected, 3050 Vm::hob_memory_resources( 3051 sections, 3052 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3053 ) 3054 ); 3055 3056 // Case 6: Two TDVF sections with no conflict with 2 RAM regions 3057 let sections = vec![ 3058 TdvfSection { 3059 address: 0x1000_2000, 3060 size: 0x2000, 3061 ..Default::default() 3062 }, 3063 TdvfSection { 3064 address: 0, 3065 size: 0x2000, 3066 ..Default::default() 3067 }, 3068 ]; 3069 let guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3070 (GuestAddress(0x2000), 0x1000_0000), 3071 (GuestAddress(0x1000_4000), 0x1000_0000), 3072 ]; 3073 let expected = vec![ 3074 (0, 0x2000, false), 3075 (0x2000, 0x1000_0000, true), 3076 (0x1000_2000, 0x2000, false), 3077 (0x1000_4000, 0x1000_0000, true), 3078 ]; 3079 assert_eq!( 3080 expected, 3081 Vm::hob_memory_resources( 3082 sections, 3083 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3084 ) 3085 ); 3086 3087 // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions 3088 let sections = vec![ 3089 TdvfSection { 3090 address: 0x1000_0000, 3091 size: 0x4000, 3092 ..Default::default() 3093 }, 3094 TdvfSection { 3095 address: 0, 3096 size: 0x4000, 3097 ..Default::default() 3098 }, 3099 ]; 3100 let guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3101 (GuestAddress(0x1000), 0x1000_0000), 3102 (GuestAddress(0x1000_3000), 0x1000_0000), 3103 ]; 3104 let expected = vec![ 3105 (0, 0x4000, false), 3106 (0x4000, 0x0fff_c000, true), 3107 (0x1000_0000, 0x4000, false), 3108 (0x1000_4000, 0x0fff_f000, true), 3109 ]; 3110 assert_eq!( 3111 expected, 3112 Vm::hob_memory_resources( 3113 sections, 3114 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3115 ) 3116 ); 3117 } 3118 } 3119 3120 #[cfg(target_arch = "aarch64")] 3121 #[cfg(test)] 3122 mod tests { 3123 use arch::aarch64::fdt::create_fdt; 3124 use arch::aarch64::layout; 3125 use arch::{DeviceType, MmioDeviceInfo}; 3126 use devices::gic::Gic; 3127 3128 use super::*; 3129 3130 const LEN: u64 = 4096; 3131 3132 #[test] 3133 fn test_create_fdt_with_devices() { 3134 let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)]; 3135 let mem = GuestMemoryMmap::from_ranges(®ions).expect("Cannot initialize memory"); 3136 3137 let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [ 3138 ( 3139 (DeviceType::Serial, DeviceType::Serial.to_string()), 3140 MmioDeviceInfo { 3141 addr: 0x00, 3142 len: LEN, 3143 irq: 33, 3144 }, 3145 ), 3146 ( 3147 (DeviceType::Virtio(1), "virtio".to_string()), 3148 MmioDeviceInfo { 3149 addr: LEN, 3150 len: LEN, 3151 irq: 34, 3152 }, 3153 ), 3154 ( 3155 (DeviceType::Rtc, "rtc".to_string()), 3156 MmioDeviceInfo { 3157 addr: 2 * LEN, 3158 len: LEN, 3159 irq: 35, 3160 }, 3161 ), 3162 ] 3163 .iter() 3164 .cloned() 3165 .collect(); 3166 3167 let hv = hypervisor::new().unwrap(); 3168 let vm = hv.create_vm().unwrap(); 3169 let gic = vm 3170 .create_vgic(Gic::create_default_config(1)) 3171 .expect("Cannot create gic"); 3172 create_fdt( 3173 &mem, 3174 "console=tty0", 3175 vec![0], 3176 Some((0, 0, 0)), 3177 &dev_info, 3178 &gic, 3179 &None, 3180 &Vec::new(), 3181 &BTreeMap::new(), 3182 None, 3183 true, 3184 ) 3185 .unwrap(); 3186 } 3187 } 3188 3189 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 3190 #[test] 3191 pub fn test_vm() { 3192 use hypervisor::VmExit; 3193 use vm_memory::{Address, GuestMemory, GuestMemoryRegion}; 3194 // This example based on https://lwn.net/Articles/658511/ 3195 let code = [ 3196 0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */ 3197 0x00, 0xd8, /* add %bl, %al */ 3198 0x04, b'0', /* add $'0', %al */ 3199 0xee, /* out %al, (%dx) */ 3200 0xb0, b'\n', /* mov $'\n', %al */ 3201 0xee, /* out %al, (%dx) */ 3202 0xf4, /* hlt */ 3203 ]; 3204 3205 let mem_size = 0x1000; 3206 let load_addr = GuestAddress(0x1000); 3207 let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap(); 3208 3209 let hv = hypervisor::new().unwrap(); 3210 let vm = hv.create_vm().expect("new VM creation failed"); 3211 3212 for (index, region) in mem.iter().enumerate() { 3213 let mem_region = vm.make_user_memory_region( 3214 index as u32, 3215 region.start_addr().raw_value(), 3216 region.len(), 3217 region.as_ptr() as u64, 3218 false, 3219 false, 3220 ); 3221 3222 vm.create_user_memory_region(mem_region) 3223 .expect("Cannot configure guest memory"); 3224 } 3225 mem.write_slice(&code, load_addr) 3226 .expect("Writing code to memory failed"); 3227 3228 let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed"); 3229 3230 let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed"); 3231 vcpu_sregs.cs.base = 0; 3232 vcpu_sregs.cs.selector = 0; 3233 vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed"); 3234 3235 let mut vcpu_regs = vcpu.get_regs().expect("get regs failed"); 3236 vcpu_regs.set_rip(0x1000); 3237 vcpu_regs.set_rax(2); 3238 vcpu_regs.set_rbx(3); 3239 vcpu_regs.set_rflags(2); 3240 vcpu.set_regs(&vcpu_regs).expect("set regs failed"); 3241 3242 loop { 3243 match vcpu.run().expect("run failed") { 3244 VmExit::Reset => { 3245 println!("HLT"); 3246 break; 3247 } 3248 VmExit::Ignore => {} 3249 r => panic!("unexpected exit reason: {r:?}"), 3250 } 3251 } 3252 } 3253