1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use std::collections::{BTreeMap, HashMap}; 15 use std::fs::{File, OpenOptions}; 16 use std::io::{self, Seek, SeekFrom, Write}; 17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 18 use std::mem::size_of; 19 use std::num::Wrapping; 20 use std::ops::Deref; 21 use std::os::unix::net::UnixStream; 22 use std::sync::{Arc, Mutex, RwLock}; 23 use std::time::Instant; 24 use std::{cmp, result, str, thread}; 25 26 use anyhow::anyhow; 27 #[cfg(target_arch = "x86_64")] 28 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 29 #[cfg(feature = "tdx")] 30 use arch::x86_64::tdx::TdvfSection; 31 #[cfg(target_arch = "aarch64")] 32 use arch::PciSpaceInfo; 33 use arch::{get_host_cpu_phys_bits, EntryPoint, NumaNode, NumaNodes}; 34 #[cfg(target_arch = "aarch64")] 35 use devices::interrupt_controller; 36 use devices::AcpiNotificationFlags; 37 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 38 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 39 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 40 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 41 use hypervisor::{HypervisorVmError, VmOps}; 42 use libc::{termios, SIGWINCH}; 43 use linux_loader::cmdline::Cmdline; 44 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 45 use linux_loader::elf; 46 #[cfg(target_arch = "x86_64")] 47 use linux_loader::loader::bzimage::BzImage; 48 #[cfg(target_arch = "x86_64")] 49 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 50 #[cfg(target_arch = "aarch64")] 51 use 
linux_loader::loader::pe::Error::InvalidImageMagicNumber; 52 use linux_loader::loader::KernelLoader; 53 use seccompiler::SeccompAction; 54 use serde::{Deserialize, Serialize}; 55 use thiserror::Error; 56 use tracer::trace_scoped; 57 use vm_device::Bus; 58 #[cfg(feature = "tdx")] 59 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile}; 60 use vm_memory::{ 61 Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile, 62 }; 63 use vm_migration::protocol::{MemoryRangeTable, Request, Response}; 64 use vm_migration::{ 65 snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, 66 }; 67 use vmm_sys_util::eventfd::EventFd; 68 use vmm_sys_util::sock_ctrl_msg::ScmSocket; 69 70 use crate::config::{add_to_config, ValidationError}; 71 use crate::console_devices::{ConsoleDeviceError, ConsoleInfo}; 72 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 73 use crate::coredump::{ 74 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 75 }; 76 use crate::device_manager::{DeviceManager, DeviceManagerError}; 77 use crate::device_tree::DeviceTree; 78 #[cfg(feature = "guest_debug")] 79 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 80 #[cfg(feature = "igvm")] 81 use crate::igvm::igvm_loader; 82 use crate::landlock::LandlockError; 83 use crate::memory_manager::{ 84 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 85 }; 86 #[cfg(target_arch = "x86_64")] 87 use crate::migration::get_vm_snapshot; 88 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 89 use crate::migration::url_to_file; 90 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 91 use crate::vm_config::{ 92 DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, NumaConfig, PayloadConfig, 93 PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, 94 }; 95 use crate::{ 96 cpu, 
GuestMemoryMmap, PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, 97 MEMORY_MANAGER_SNAPSHOT_ID, 98 }; 99 100 /// Errors associated with VM management 101 #[derive(Debug, Error)] 102 pub enum Error { 103 #[error("Cannot open kernel file: {0}")] 104 KernelFile(#[source] io::Error), 105 106 #[error("Cannot open initramfs file: {0}")] 107 InitramfsFile(#[source] io::Error), 108 109 #[error("Cannot load the kernel into memory: {0}")] 110 KernelLoad(#[source] linux_loader::loader::Error), 111 112 #[cfg(target_arch = "aarch64")] 113 #[error("Cannot load the UEFI binary in memory: {0:?}")] 114 UefiLoad(arch::aarch64::uefi::Error), 115 116 #[error("Cannot load the initramfs into memory")] 117 InitramfsLoad, 118 119 #[error("Cannot load the kernel command line in memory: {0}")] 120 LoadCmdLine(#[source] linux_loader::loader::Error), 121 122 #[error("Failed to apply landlock config during vm_create: {0}")] 123 ApplyLandlock(#[source] LandlockError), 124 125 #[error("Cannot modify the kernel command line: {0}")] 126 CmdLineInsertStr(#[source] linux_loader::cmdline::Error), 127 128 #[error("Cannot create the kernel command line: {0}")] 129 CmdLineCreate(#[source] linux_loader::cmdline::Error), 130 131 #[error("Cannot configure system: {0}")] 132 ConfigureSystem(#[source] arch::Error), 133 134 #[cfg(target_arch = "aarch64")] 135 #[error("Cannot enable interrupt controller: {0:?}")] 136 EnableInterruptController(interrupt_controller::Error), 137 138 #[error("VM state is poisoned")] 139 PoisonedState, 140 141 #[error("Error from device manager: {0:?}")] 142 DeviceManager(DeviceManagerError), 143 144 #[error("No device with id {0:?} to remove")] 145 NoDeviceToRemove(String), 146 147 #[error("Cannot spawn a signal handler thread: {0}")] 148 SignalHandlerSpawn(#[source] io::Error), 149 150 #[error("Failed to join on threads: {0:?}")] 151 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 152 153 #[error("VM config is missing")] 154 
VmMissingConfig, 155 156 #[error("VM is not created")] 157 VmNotCreated, 158 159 #[error("VM is already created")] 160 VmAlreadyCreated, 161 162 #[error("VM is not running")] 163 VmNotRunning, 164 165 #[error("Cannot clone EventFd: {0}")] 166 EventFdClone(#[source] io::Error), 167 168 #[error("invalid VM state transition: {0:?} to {1:?}")] 169 InvalidStateTransition(VmState, VmState), 170 171 #[error("Error from CPU manager: {0}")] 172 CpuManager(#[source] cpu::Error), 173 174 #[error("Cannot pause devices: {0}")] 175 PauseDevices(#[source] MigratableError), 176 177 #[error("Cannot resume devices: {0}")] 178 ResumeDevices(#[source] MigratableError), 179 180 #[error("Cannot pause CPUs: {0}")] 181 PauseCpus(#[source] MigratableError), 182 183 #[error("Cannot resume cpus: {0}")] 184 ResumeCpus(#[source] MigratableError), 185 186 #[error("Cannot pause VM: {0}")] 187 Pause(#[source] MigratableError), 188 189 #[error("Cannot resume VM: {0}")] 190 Resume(#[source] MigratableError), 191 192 #[error("Memory manager error: {0:?}")] 193 MemoryManager(MemoryManagerError), 194 195 #[error("Eventfd write error: {0}")] 196 EventfdError(#[source] std::io::Error), 197 198 #[error("Cannot snapshot VM: {0}")] 199 Snapshot(#[source] MigratableError), 200 201 #[error("Cannot restore VM: {0}")] 202 Restore(#[source] MigratableError), 203 204 #[error("Cannot send VM snapshot: {0}")] 205 SnapshotSend(#[source] MigratableError), 206 207 #[error("Invalid restore source URL")] 208 InvalidRestoreSourceUrl, 209 210 #[error("Failed to validate config: {0}")] 211 ConfigValidation(#[source] ValidationError), 212 213 #[error("Too many virtio-vsock devices")] 214 TooManyVsockDevices, 215 216 #[error("Failed serializing into JSON: {0}")] 217 SerializeJson(#[source] serde_json::Error), 218 219 #[error("Invalid NUMA configuration")] 220 InvalidNumaConfig, 221 222 #[error("Cannot create seccomp filter: {0}")] 223 CreateSeccompFilter(#[source] seccompiler::Error), 224 225 #[error("Cannot apply seccomp 
filter: {0}")] 226 ApplySeccompFilter(#[source] seccompiler::Error), 227 228 #[error("Failed resizing a memory zone")] 229 ResizeZone, 230 231 #[error("Cannot activate virtio devices: {0:?}")] 232 ActivateVirtioDevices(DeviceManagerError), 233 234 #[error("Error triggering power button: {0:?}")] 235 PowerButton(DeviceManagerError), 236 237 #[error("Kernel lacks PVH header")] 238 KernelMissingPvhHeader, 239 240 #[error("Failed to allocate firmware RAM: {0:?}")] 241 AllocateFirmwareMemory(MemoryManagerError), 242 243 #[error("Error manipulating firmware file: {0}")] 244 FirmwareFile(#[source] std::io::Error), 245 246 #[error("Firmware too big")] 247 FirmwareTooLarge, 248 249 #[error("Failed to copy firmware to memory: {0}")] 250 FirmwareLoad(#[source] vm_memory::GuestMemoryError), 251 252 #[cfg(feature = "sev_snp")] 253 #[error("Error enabling SEV-SNP VM: {0}")] 254 InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError), 255 256 #[cfg(feature = "tdx")] 257 #[error("Error performing I/O on TDX firmware file: {0}")] 258 LoadTdvf(#[source] std::io::Error), 259 260 #[cfg(feature = "tdx")] 261 #[error("Error performing I/O on the TDX payload file: {0}")] 262 LoadPayload(#[source] std::io::Error), 263 264 #[cfg(feature = "tdx")] 265 #[error("Error parsing TDVF: {0}")] 266 ParseTdvf(#[source] arch::x86_64::tdx::TdvfError), 267 268 #[cfg(feature = "tdx")] 269 #[error("Error populating TDX HOB: {0}")] 270 PopulateHob(#[source] arch::x86_64::tdx::TdvfError), 271 272 #[cfg(feature = "tdx")] 273 #[error("Error allocating TDVF memory: {0:?}")] 274 AllocatingTdvfMemory(crate::memory_manager::Error), 275 276 #[cfg(feature = "tdx")] 277 #[error("Error enabling TDX VM: {0}")] 278 InitializeTdxVm(#[source] hypervisor::HypervisorVmError), 279 280 #[cfg(feature = "tdx")] 281 #[error("Error enabling TDX memory region: {0}")] 282 InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError), 283 284 #[cfg(feature = "tdx")] 285 #[error("Error finalizing TDX VM: {0}")] 286 
FinalizeTdx(#[source] hypervisor::HypervisorVmError), 287 288 #[cfg(feature = "tdx")] 289 #[error("TDX firmware missing")] 290 TdxFirmwareMissing, 291 292 #[cfg(feature = "tdx")] 293 #[error("Invalid TDX payload type")] 294 InvalidPayloadType, 295 296 #[cfg(feature = "guest_debug")] 297 #[error("Error debugging VM: {0:?}")] 298 Debug(DebuggableError), 299 300 #[error("Error spawning kernel loading thread")] 301 KernelLoadThreadSpawn(std::io::Error), 302 303 #[error("Error joining kernel loading thread")] 304 KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 305 306 #[error("Payload configuration is not bootable")] 307 InvalidPayload, 308 309 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 310 #[error("Error coredumping VM: {0:?}")] 311 Coredump(GuestDebuggableError), 312 313 #[cfg(feature = "igvm")] 314 #[error("Cannot open igvm file: {0}")] 315 IgvmFile(#[source] io::Error), 316 317 #[cfg(feature = "igvm")] 318 #[error("Cannot load the igvm into memory: {0}")] 319 IgvmLoad(#[source] igvm_loader::Error), 320 321 #[error("Error injecting NMI")] 322 ErrorNmi, 323 324 #[error("Error resuming the VM: {0}")] 325 ResumeVm(#[source] hypervisor::HypervisorVmError), 326 327 #[error("Error creating console devices")] 328 CreateConsoleDevices(ConsoleDeviceError), 329 } 330 pub type Result<T> = result::Result<T, Error>; 331 332 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] 333 pub enum VmState { 334 Created, 335 Running, 336 Shutdown, 337 Paused, 338 BreakPoint, 339 } 340 341 impl VmState { 342 fn valid_transition(self, new_state: VmState) -> Result<()> { 343 match self { 344 VmState::Created => match new_state { 345 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)), 346 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => { 347 Ok(()) 348 } 349 }, 350 351 VmState::Running => match new_state { 352 VmState::Created | VmState::Running => { 353 
Err(Error::InvalidStateTransition(self, new_state)) 354 } 355 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()), 356 }, 357 358 VmState::Shutdown => match new_state { 359 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => { 360 Err(Error::InvalidStateTransition(self, new_state)) 361 } 362 VmState::Running => Ok(()), 363 }, 364 365 VmState::Paused => match new_state { 366 VmState::Created | VmState::Paused | VmState::BreakPoint => { 367 Err(Error::InvalidStateTransition(self, new_state)) 368 } 369 VmState::Running | VmState::Shutdown => Ok(()), 370 }, 371 VmState::BreakPoint => match new_state { 372 VmState::Created | VmState::Running => Ok(()), 373 _ => Err(Error::InvalidStateTransition(self, new_state)), 374 }, 375 } 376 } 377 } 378 379 struct VmOpsHandler { 380 memory: GuestMemoryAtomic<GuestMemoryMmap>, 381 #[cfg(target_arch = "x86_64")] 382 io_bus: Arc<Bus>, 383 mmio_bus: Arc<Bus>, 384 } 385 386 impl VmOps for VmOpsHandler { 387 fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> { 388 self.memory 389 .memory() 390 .write(buf, GuestAddress(gpa)) 391 .map_err(|e| HypervisorVmError::GuestMemWrite(e.into())) 392 } 393 394 fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> { 395 self.memory 396 .memory() 397 .read(buf, GuestAddress(gpa)) 398 .map_err(|e| HypervisorVmError::GuestMemRead(e.into())) 399 } 400 401 fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 402 if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) { 403 info!("Guest MMIO read to unregistered address 0x{:x}", gpa); 404 } 405 Ok(()) 406 } 407 408 fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 409 match self.mmio_bus.write(gpa, data) { 410 Err(vm_device::BusError::MissingAddressRange) => { 411 info!("Guest MMIO write to unregistered address 0x{:x}", gpa); 412 } 
413 Ok(Some(barrier)) => { 414 info!("Waiting for barrier"); 415 barrier.wait(); 416 info!("Barrier released"); 417 } 418 _ => {} 419 }; 420 Ok(()) 421 } 422 423 #[cfg(target_arch = "x86_64")] 424 fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 425 if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) { 426 info!("Guest PIO read to unregistered address 0x{:x}", port); 427 } 428 Ok(()) 429 } 430 431 #[cfg(target_arch = "x86_64")] 432 fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 433 match self.io_bus.write(port, data) { 434 Err(vm_device::BusError::MissingAddressRange) => { 435 info!("Guest PIO write to unregistered address 0x{:x}", port); 436 } 437 Ok(Some(barrier)) => { 438 info!("Waiting for barrier"); 439 barrier.wait(); 440 info!("Barrier released"); 441 } 442 _ => {} 443 }; 444 Ok(()) 445 } 446 } 447 448 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 { 449 let host_phys_bits = get_host_cpu_phys_bits(hypervisor); 450 451 cmp::min(host_phys_bits, max_phys_bits) 452 } 453 454 pub struct Vm { 455 #[cfg(feature = "tdx")] 456 kernel: Option<File>, 457 initramfs: Option<File>, 458 threads: Vec<thread::JoinHandle<()>>, 459 device_manager: Arc<Mutex<DeviceManager>>, 460 config: Arc<Mutex<VmConfig>>, 461 state: RwLock<VmState>, 462 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 463 memory_manager: Arc<Mutex<MemoryManager>>, 464 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 465 // The hypervisor abstracted virtual machine. 
466 vm: Arc<dyn hypervisor::Vm>, 467 #[cfg(target_arch = "x86_64")] 468 saved_clock: Option<hypervisor::ClockData>, 469 numa_nodes: NumaNodes, 470 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 471 hypervisor: Arc<dyn hypervisor::Hypervisor>, 472 stop_on_boot: bool, 473 load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>, 474 } 475 476 impl Vm { 477 pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH]; 478 479 #[allow(clippy::too_many_arguments)] 480 pub fn new_from_memory_manager( 481 config: Arc<Mutex<VmConfig>>, 482 memory_manager: Arc<Mutex<MemoryManager>>, 483 vm: Arc<dyn hypervisor::Vm>, 484 exit_evt: EventFd, 485 reset_evt: EventFd, 486 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 487 seccomp_action: &SeccompAction, 488 hypervisor: Arc<dyn hypervisor::Hypervisor>, 489 activate_evt: EventFd, 490 timestamp: Instant, 491 console_info: Option<ConsoleInfo>, 492 console_resize_pipe: Option<Arc<File>>, 493 original_termios: Arc<Mutex<Option<termios>>>, 494 snapshot: Option<Snapshot>, 495 ) -> Result<Self> { 496 trace_scoped!("Vm::new_from_memory_manager"); 497 498 let boot_id_list = config 499 .lock() 500 .unwrap() 501 .validate() 502 .map_err(Error::ConfigValidation)?; 503 504 #[cfg(not(feature = "igvm"))] 505 let load_payload_handle = if snapshot.is_none() { 506 Self::load_payload_async(&memory_manager, &config)? 507 } else { 508 None 509 }; 510 511 info!("Booting VM from config: {:?}", &config); 512 513 // Create NUMA nodes based on NumaConfig. 
514 let numa_nodes = 515 Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?; 516 517 #[cfg(feature = "tdx")] 518 let tdx_enabled = config.lock().unwrap().is_tdx_enabled(); 519 #[cfg(feature = "sev_snp")] 520 let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled(); 521 #[cfg(feature = "tdx")] 522 let force_iommu = tdx_enabled; 523 #[cfg(feature = "sev_snp")] 524 let force_iommu = sev_snp_enabled; 525 #[cfg(not(any(feature = "tdx", feature = "sev_snp")))] 526 let force_iommu = false; 527 528 #[cfg(feature = "guest_debug")] 529 let stop_on_boot = config.lock().unwrap().gdb; 530 #[cfg(not(feature = "guest_debug"))] 531 let stop_on_boot = false; 532 533 let memory = memory_manager.lock().unwrap().guest_memory(); 534 let io_bus = Arc::new(Bus::new()); 535 let mmio_bus = Arc::new(Bus::new()); 536 537 let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler { 538 memory, 539 #[cfg(target_arch = "x86_64")] 540 io_bus: io_bus.clone(), 541 mmio_bus: mmio_bus.clone(), 542 }); 543 544 let cpus_config = { &config.lock().unwrap().cpus.clone() }; 545 let cpu_manager = cpu::CpuManager::new( 546 cpus_config, 547 vm.clone(), 548 exit_evt.try_clone().map_err(Error::EventFdClone)?, 549 reset_evt.try_clone().map_err(Error::EventFdClone)?, 550 #[cfg(feature = "guest_debug")] 551 vm_debug_evt, 552 &hypervisor, 553 seccomp_action.clone(), 554 vm_ops, 555 #[cfg(feature = "tdx")] 556 tdx_enabled, 557 &numa_nodes, 558 #[cfg(feature = "sev_snp")] 559 sev_snp_enabled, 560 ) 561 .map_err(Error::CpuManager)?; 562 563 #[cfg(target_arch = "x86_64")] 564 cpu_manager 565 .lock() 566 .unwrap() 567 .populate_cpuid( 568 &memory_manager, 569 &hypervisor, 570 #[cfg(feature = "tdx")] 571 tdx_enabled, 572 ) 573 .map_err(Error::CpuManager)?; 574 575 // Loading the igvm file is pushed down here because 576 // igvm parser needs cpu_manager to retrieve cpuid leaf. 
577 // For the regular case, we can start loading early, but for 578 // igvm case we have to wait until cpu_manager is created. 579 // Currently, Microsoft Hypervisor does not provide any 580 // Hypervisor specific common cpuid, we need to call get_cpuid_values 581 // per cpuid through cpu_manager. 582 #[cfg(feature = "igvm")] 583 let load_payload_handle = if snapshot.is_none() { 584 Self::load_payload_async( 585 &memory_manager, 586 &config, 587 &cpu_manager, 588 #[cfg(feature = "sev_snp")] 589 sev_snp_enabled, 590 )? 591 } else { 592 None 593 }; 594 // The initial TDX configuration must be done before the vCPUs are 595 // created 596 #[cfg(feature = "tdx")] 597 if tdx_enabled { 598 let cpuid = cpu_manager.lock().unwrap().common_cpuid(); 599 let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32; 600 vm.tdx_init(&cpuid, max_vcpus) 601 .map_err(Error::InitializeTdxVm)?; 602 } 603 604 cpu_manager 605 .lock() 606 .unwrap() 607 .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID)) 608 .map_err(Error::CpuManager)?; 609 610 // This initial SEV-SNP configuration must be done immediately after 611 // vCPUs are created. As part of this initialization we are 612 // transitioning the guest into secure state. 
613 #[cfg(feature = "sev_snp")] 614 if sev_snp_enabled { 615 vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?; 616 } 617 618 #[cfg(feature = "tdx")] 619 let dynamic = !tdx_enabled; 620 #[cfg(not(feature = "tdx"))] 621 let dynamic = true; 622 623 let device_manager = DeviceManager::new( 624 io_bus, 625 mmio_bus, 626 vm.clone(), 627 config.clone(), 628 memory_manager.clone(), 629 cpu_manager.clone(), 630 exit_evt.try_clone().map_err(Error::EventFdClone)?, 631 reset_evt, 632 seccomp_action.clone(), 633 numa_nodes.clone(), 634 &activate_evt, 635 force_iommu, 636 boot_id_list, 637 timestamp, 638 snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID), 639 dynamic, 640 ) 641 .map_err(Error::DeviceManager)?; 642 643 device_manager 644 .lock() 645 .unwrap() 646 .create_devices(console_info, console_resize_pipe, original_termios) 647 .map_err(Error::DeviceManager)?; 648 649 #[cfg(feature = "tdx")] 650 let kernel = config 651 .lock() 652 .unwrap() 653 .payload 654 .as_ref() 655 .map(|p| p.kernel.as_ref().map(File::open)) 656 .unwrap_or_default() 657 .transpose() 658 .map_err(Error::KernelFile)?; 659 660 let initramfs = config 661 .lock() 662 .unwrap() 663 .payload 664 .as_ref() 665 .map(|p| p.initramfs.as_ref().map(File::open)) 666 .unwrap_or_default() 667 .transpose() 668 .map_err(Error::InitramfsFile)?; 669 670 #[cfg(target_arch = "x86_64")] 671 let saved_clock = if let Some(snapshot) = snapshot.as_ref() { 672 let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?; 673 vm_snapshot.clock 674 } else { 675 None 676 }; 677 678 let vm_state = if snapshot.is_some() { 679 VmState::Paused 680 } else { 681 VmState::Created 682 }; 683 684 Ok(Vm { 685 #[cfg(feature = "tdx")] 686 kernel, 687 initramfs, 688 device_manager, 689 config, 690 threads: Vec::with_capacity(1), 691 state: RwLock::new(vm_state), 692 cpu_manager, 693 memory_manager, 694 vm, 695 #[cfg(target_arch = "x86_64")] 696 saved_clock, 697 numa_nodes, 698 hypervisor, 699 stop_on_boot, 700 
load_payload_handle, 701 }) 702 } 703 704 fn create_numa_nodes( 705 configs: Option<Vec<NumaConfig>>, 706 memory_manager: &Arc<Mutex<MemoryManager>>, 707 ) -> Result<NumaNodes> { 708 let mm = memory_manager.lock().unwrap(); 709 let mm_zones = mm.memory_zones(); 710 let mut numa_nodes = BTreeMap::new(); 711 712 if let Some(configs) = &configs { 713 for config in configs.iter() { 714 if numa_nodes.contains_key(&config.guest_numa_id) { 715 error!("Can't define twice the same NUMA node"); 716 return Err(Error::InvalidNumaConfig); 717 } 718 719 let mut node = NumaNode::default(); 720 721 if let Some(memory_zones) = &config.memory_zones { 722 for memory_zone in memory_zones.iter() { 723 if let Some(mm_zone) = mm_zones.get(memory_zone) { 724 node.memory_regions.extend(mm_zone.regions().clone()); 725 if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() { 726 node.hotplug_regions.push(virtiomem_zone.region().clone()); 727 } 728 node.memory_zones.push(memory_zone.clone()); 729 } else { 730 error!("Unknown memory zone '{}'", memory_zone); 731 return Err(Error::InvalidNumaConfig); 732 } 733 } 734 } 735 736 if let Some(cpus) = &config.cpus { 737 node.cpus.extend(cpus); 738 } 739 740 if let Some(pci_segments) = &config.pci_segments { 741 node.pci_segments.extend(pci_segments); 742 } 743 744 if let Some(distances) = &config.distances { 745 for distance in distances.iter() { 746 let dest = distance.destination; 747 let dist = distance.distance; 748 749 if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) { 750 error!("Unknown destination NUMA node {}", dest); 751 return Err(Error::InvalidNumaConfig); 752 } 753 754 if node.distances.contains_key(&dest) { 755 error!("Destination NUMA node {} has been already set", dest); 756 return Err(Error::InvalidNumaConfig); 757 } 758 759 node.distances.insert(dest, dist); 760 } 761 } 762 763 #[cfg(target_arch = "x86_64")] 764 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 765 if let Some(sgx_epc_region) = mm.sgx_epc_region() { 
766 let mm_sections = sgx_epc_region.epc_sections(); 767 for sgx_epc_section in sgx_epc_sections.iter() { 768 if let Some(mm_section) = mm_sections.get(sgx_epc_section) { 769 node.sgx_epc_sections.push(mm_section.clone()); 770 } else { 771 error!("Unknown SGX EPC section '{}'", sgx_epc_section); 772 return Err(Error::InvalidNumaConfig); 773 } 774 } 775 } else { 776 error!("Missing SGX EPC region"); 777 return Err(Error::InvalidNumaConfig); 778 } 779 } 780 781 numa_nodes.insert(config.guest_numa_id, node); 782 } 783 } 784 785 Ok(numa_nodes) 786 } 787 788 #[allow(clippy::too_many_arguments)] 789 pub fn new( 790 vm_config: Arc<Mutex<VmConfig>>, 791 exit_evt: EventFd, 792 reset_evt: EventFd, 793 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 794 seccomp_action: &SeccompAction, 795 hypervisor: Arc<dyn hypervisor::Hypervisor>, 796 activate_evt: EventFd, 797 console_info: Option<ConsoleInfo>, 798 console_resize_pipe: Option<Arc<File>>, 799 original_termios: Arc<Mutex<Option<termios>>>, 800 snapshot: Option<Snapshot>, 801 source_url: Option<&str>, 802 prefault: Option<bool>, 803 ) -> Result<Self> { 804 trace_scoped!("Vm::new"); 805 806 let timestamp = Instant::now(); 807 808 #[cfg(feature = "tdx")] 809 let tdx_enabled = if snapshot.is_some() { 810 false 811 } else { 812 vm_config.lock().unwrap().is_tdx_enabled() 813 }; 814 815 #[cfg(feature = "sev_snp")] 816 let sev_snp_enabled = if snapshot.is_some() { 817 false 818 } else { 819 vm_config.lock().unwrap().is_sev_snp_enabled() 820 }; 821 822 let vm = Self::create_hypervisor_vm( 823 &hypervisor, 824 #[cfg(feature = "tdx")] 825 tdx_enabled, 826 #[cfg(feature = "sev_snp")] 827 sev_snp_enabled, 828 #[cfg(feature = "sev_snp")] 829 vm_config.lock().unwrap().memory.total_size(), 830 )?; 831 832 let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); 833 834 let memory_manager = if let Some(snapshot) = 835 snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID) 836 { 837 
MemoryManager::new_from_snapshot( 838 &snapshot, 839 vm.clone(), 840 &vm_config.lock().unwrap().memory.clone(), 841 source_url, 842 prefault.unwrap(), 843 phys_bits, 844 ) 845 .map_err(Error::MemoryManager)? 846 } else { 847 #[cfg(target_arch = "x86_64")] 848 let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone(); 849 850 MemoryManager::new( 851 vm.clone(), 852 &vm_config.lock().unwrap().memory.clone(), 853 None, 854 phys_bits, 855 #[cfg(feature = "tdx")] 856 tdx_enabled, 857 None, 858 None, 859 #[cfg(target_arch = "x86_64")] 860 sgx_epc_config, 861 ) 862 .map_err(Error::MemoryManager)? 863 }; 864 865 Vm::new_from_memory_manager( 866 vm_config, 867 memory_manager, 868 vm, 869 exit_evt, 870 reset_evt, 871 #[cfg(feature = "guest_debug")] 872 vm_debug_evt, 873 seccomp_action, 874 hypervisor, 875 activate_evt, 876 timestamp, 877 console_info, 878 console_resize_pipe, 879 original_termios, 880 snapshot, 881 ) 882 } 883 884 pub fn create_hypervisor_vm( 885 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 886 #[cfg(feature = "tdx")] tdx_enabled: bool, 887 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 888 #[cfg(feature = "sev_snp")] mem_size: u64, 889 ) -> Result<Arc<dyn hypervisor::Vm>> { 890 hypervisor.check_required_extensions().unwrap(); 891 892 cfg_if::cfg_if! 
{ 893 if #[cfg(feature = "tdx")] { 894 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true 895 // Otherwise KVM_X86_LEGACY_VM: 0 896 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM 897 let vm = hypervisor 898 .create_vm_with_type(u64::from(tdx_enabled)) 899 .unwrap(); 900 } else if #[cfg(feature = "sev_snp")] { 901 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true 902 // Otherwise SEV_SNP_DISABLED: 0 903 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false 904 let vm = hypervisor 905 .create_vm_with_type_and_memory(u64::from(sev_snp_enabled), mem_size) 906 .unwrap(); 907 } else { 908 let vm = hypervisor.create_vm().unwrap(); 909 } 910 } 911 912 #[cfg(target_arch = "x86_64")] 913 { 914 vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0) 915 .unwrap(); 916 vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap(); 917 vm.enable_split_irq().unwrap(); 918 } 919 920 Ok(vm) 921 } 922 923 fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> { 924 let initramfs = self.initramfs.as_mut().unwrap(); 925 let size: usize = initramfs 926 .seek(SeekFrom::End(0)) 927 .map_err(|_| Error::InitramfsLoad)? 
928 .try_into() 929 .unwrap(); 930 initramfs.rewind().map_err(|_| Error::InitramfsLoad)?; 931 932 let address = 933 arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?; 934 let address = GuestAddress(address); 935 936 guest_mem 937 .read_volatile_from(address, initramfs, size) 938 .map_err(|_| Error::InitramfsLoad)?; 939 940 info!("Initramfs loaded: address = 0x{:x}", address.0); 941 Ok(arch::InitramfsConfig { address, size }) 942 } 943 944 pub fn generate_cmdline( 945 payload: &PayloadConfig, 946 #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>, 947 ) -> Result<Cmdline> { 948 let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?; 949 if let Some(s) = payload.cmdline.as_ref() { 950 cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?; 951 } 952 953 #[cfg(target_arch = "aarch64")] 954 for entry in device_manager.lock().unwrap().cmdline_additions() { 955 cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?; 956 } 957 Ok(cmdline) 958 } 959 960 #[cfg(target_arch = "aarch64")] 961 fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> { 962 let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash(); 963 let mem = uefi_flash.memory(); 964 arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware) 965 .map_err(Error::UefiLoad)?; 966 Ok(()) 967 } 968 969 #[cfg(target_arch = "aarch64")] 970 fn load_kernel( 971 firmware: Option<File>, 972 kernel: Option<File>, 973 memory_manager: Arc<Mutex<MemoryManager>>, 974 ) -> Result<EntryPoint> { 975 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 976 let mem = guest_memory.memory(); 977 let entry_addr = match (firmware, kernel) { 978 (None, Some(mut kernel)) => { 979 match linux_loader::loader::pe::PE::load( 980 mem.deref(), 981 Some(arch::layout::KERNEL_START), 982 &mut kernel, 983 None, 984 ) { 985 Ok(entry_addr) => 
/// Loads an IGVM file through `igvm_loader::load_igvm` and derives the guest
/// entry point from the loader's result.
///
/// With the `sev_snp` feature compiled in and SEV-SNP reported as enabled by
/// the CPU manager, the entry address is the loader's `vmsa_gpa`; in every
/// other case it is the `rip` field of the returned VMSA. No setup header is
/// attached to the entry point.
///
/// # Errors
///
/// Returns `Error::IgvmLoad` if the loader fails.
///
/// # Panics
///
/// With `sev_snp`, panics if the `cpu_manager` mutex is poisoned.
#[cfg(feature = "igvm")]
fn load_igvm(
    igvm: File,
    memory_manager: Arc<Mutex<MemoryManager>>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    #[cfg(feature = "sev_snp")] host_data: &Option<String>,
) -> Result<EntryPoint> {
    let loaded = igvm_loader::load_igvm(
        &igvm,
        memory_manager,
        cpu_manager.clone(),
        "",
        #[cfg(feature = "sev_snp")]
        host_data,
    )
    .map_err(Error::IgvmLoad)?;

    // Pick the raw entry address first, then wrap it once.
    #[cfg(feature = "sev_snp")]
    let raw_entry = if cpu_manager.lock().unwrap().sev_snp_enabled() {
        loaded.vmsa_gpa
    } else {
        loaded.vmsa.rip
    };
    #[cfg(not(feature = "sev_snp"))]
    let raw_entry = loaded.vmsa.rip;

    Ok(EntryPoint {
        entry_addr: vm_memory::GuestAddress(raw_entry),
        setup_header: None,
    })
}
1060 .or_else(|_| { 1061 BzImage::load( 1062 mem.deref(), 1063 None, 1064 &mut kernel, 1065 Some(arch::layout::HIGH_RAM_START), 1066 ) 1067 }) 1068 .map_err(Error::KernelLoad)?; 1069 1070 if let Some(cmdline) = cmdline { 1071 linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline) 1072 .map_err(Error::LoadCmdLine)?; 1073 } 1074 1075 if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap { 1076 // Use the PVH kernel entry point to boot the guest 1077 info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0); 1078 Ok(EntryPoint { 1079 entry_addr, 1080 setup_header: None, 1081 }) 1082 } else if entry_addr.setup_header.is_some() { 1083 // Use the bzImage 32bit entry point to boot the guest 1084 info!( 1085 "bzImage kernel loaded: entry_addr = 0x{:x}", 1086 entry_addr.kernel_load.0 1087 ); 1088 Ok(EntryPoint { 1089 entry_addr: entry_addr.kernel_load, 1090 setup_header: entry_addr.setup_header, 1091 }) 1092 } else { 1093 Err(Error::KernelMissingPvhHeader) 1094 } 1095 } 1096 1097 #[cfg(target_arch = "x86_64")] 1098 fn load_payload( 1099 payload: &PayloadConfig, 1100 memory_manager: Arc<Mutex<MemoryManager>>, 1101 #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>, 1102 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 1103 ) -> Result<EntryPoint> { 1104 trace_scoped!("load_payload"); 1105 #[cfg(feature = "igvm")] 1106 { 1107 if let Some(_igvm_file) = &payload.igvm { 1108 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?; 1109 #[cfg(feature = "sev_snp")] 1110 if sev_snp_enabled { 1111 return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data); 1112 } 1113 #[cfg(not(feature = "sev_snp"))] 1114 return Self::load_igvm(igvm, memory_manager, cpu_manager); 1115 } 1116 } 1117 match ( 1118 &payload.firmware, 1119 &payload.kernel, 1120 &payload.initramfs, 1121 &payload.cmdline, 1122 ) { 1123 (Some(firmware), None, None, None) => { 1124 let firmware = 
File::open(firmware).map_err(Error::FirmwareFile)?; 1125 Self::load_kernel(firmware, None, memory_manager) 1126 } 1127 (None, Some(kernel), _, _) => { 1128 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1129 let cmdline = Self::generate_cmdline(payload)?; 1130 Self::load_kernel(kernel, Some(cmdline), memory_manager) 1131 } 1132 _ => Err(Error::InvalidPayload), 1133 } 1134 } 1135 1136 #[cfg(target_arch = "aarch64")] 1137 fn load_payload( 1138 payload: &PayloadConfig, 1139 memory_manager: Arc<Mutex<MemoryManager>>, 1140 ) -> Result<EntryPoint> { 1141 match (&payload.firmware, &payload.kernel) { 1142 (Some(firmware), None) => { 1143 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; 1144 Self::load_kernel(Some(firmware), None, memory_manager) 1145 } 1146 (None, Some(kernel)) => { 1147 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1148 Self::load_kernel(None, Some(kernel), memory_manager) 1149 } 1150 _ => Err(Error::InvalidPayload), 1151 } 1152 } 1153 1154 fn load_payload_async( 1155 memory_manager: &Arc<Mutex<MemoryManager>>, 1156 config: &Arc<Mutex<VmConfig>>, 1157 #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>, 1158 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 1159 ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> { 1160 // Kernel with TDX is loaded in a different manner 1161 #[cfg(feature = "tdx")] 1162 if config.lock().unwrap().is_tdx_enabled() { 1163 return Ok(None); 1164 } 1165 1166 config 1167 .lock() 1168 .unwrap() 1169 .payload 1170 .as_ref() 1171 .map(|payload| { 1172 let memory_manager = memory_manager.clone(); 1173 let payload = payload.clone(); 1174 #[cfg(feature = "igvm")] 1175 let cpu_manager = cpu_manager.clone(); 1176 1177 std::thread::Builder::new() 1178 .name("payload_loader".into()) 1179 .spawn(move || { 1180 Self::load_payload( 1181 &payload, 1182 memory_manager, 1183 #[cfg(feature = "igvm")] 1184 cpu_manager, 1185 #[cfg(feature = "sev_snp")] 1186 sev_snp_enabled, 
1187 ) 1188 }) 1189 .map_err(Error::KernelLoadThreadSpawn) 1190 }) 1191 .transpose() 1192 } 1193 1194 #[cfg(target_arch = "x86_64")] 1195 fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> { 1196 trace_scoped!("configure_system"); 1197 info!("Configuring system"); 1198 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1199 1200 let initramfs_config = match self.initramfs { 1201 Some(_) => Some(self.load_initramfs(&mem)?), 1202 None => None, 1203 }; 1204 1205 let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); 1206 let rsdp_addr = Some(rsdp_addr); 1207 let sgx_epc_region = self 1208 .memory_manager 1209 .lock() 1210 .unwrap() 1211 .sgx_epc_region() 1212 .as_ref() 1213 .cloned(); 1214 1215 let serial_number = self 1216 .config 1217 .lock() 1218 .unwrap() 1219 .platform 1220 .as_ref() 1221 .and_then(|p| p.serial_number.clone()); 1222 1223 let uuid = self 1224 .config 1225 .lock() 1226 .unwrap() 1227 .platform 1228 .as_ref() 1229 .and_then(|p| p.uuid.clone()); 1230 1231 let oem_strings = self 1232 .config 1233 .lock() 1234 .unwrap() 1235 .platform 1236 .as_ref() 1237 .and_then(|p| p.oem_strings.clone()); 1238 1239 let oem_strings = oem_strings 1240 .as_deref() 1241 .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>()); 1242 1243 let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1244 1245 arch::configure_system( 1246 &mem, 1247 arch::layout::CMDLINE_START, 1248 arch::layout::CMDLINE_MAX_SIZE, 1249 &initramfs_config, 1250 boot_vcpus, 1251 entry_addr.setup_header, 1252 rsdp_addr, 1253 sgx_epc_region, 1254 serial_number.as_deref(), 1255 uuid.as_deref(), 1256 oem_strings.as_deref(), 1257 topology, 1258 ) 1259 .map_err(Error::ConfigureSystem)?; 1260 Ok(()) 1261 } 1262 1263 #[cfg(target_arch = "aarch64")] 1264 fn configure_system( 1265 &mut self, 1266 _rsdp_addr: GuestAddress, 1267 _entry_addr: EntryPoint, 1268 ) -> Result<()> { 1269 let cmdline = 
Self::generate_cmdline( 1270 self.config.lock().unwrap().payload.as_ref().unwrap(), 1271 &self.device_manager, 1272 )?; 1273 let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs(); 1274 let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1275 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1276 let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new(); 1277 let initramfs_config = match self.initramfs { 1278 Some(_) => Some(self.load_initramfs(&mem)?), 1279 None => None, 1280 }; 1281 1282 let device_info = &self 1283 .device_manager 1284 .lock() 1285 .unwrap() 1286 .get_device_info() 1287 .clone(); 1288 1289 for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() { 1290 let pci_space = PciSpaceInfo { 1291 pci_segment_id: pci_segment.id, 1292 mmio_config_address: pci_segment.mmio_config_address, 1293 pci_device_space_start: pci_segment.start_of_mem64_area, 1294 pci_device_space_size: pci_segment.end_of_mem64_area 1295 - pci_segment.start_of_mem64_area 1296 + 1, 1297 }; 1298 pci_space_info.push(pci_space); 1299 } 1300 1301 let virtio_iommu_bdf = self 1302 .device_manager 1303 .lock() 1304 .unwrap() 1305 .iommu_attached_devices() 1306 .as_ref() 1307 .map(|(v, _)| *v); 1308 1309 let vgic = self 1310 .device_manager 1311 .lock() 1312 .unwrap() 1313 .get_interrupt_controller() 1314 .unwrap() 1315 .lock() 1316 .unwrap() 1317 .get_vgic() 1318 .map_err(|_| { 1319 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1320 arch::aarch64::Error::SetupGic, 1321 )) 1322 })?; 1323 1324 // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number. 
1325 let pmu_supported = self 1326 .cpu_manager 1327 .lock() 1328 .unwrap() 1329 .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16) 1330 .map_err(|_| { 1331 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1332 arch::aarch64::Error::VcpuInitPmu, 1333 )) 1334 })?; 1335 1336 arch::configure_system( 1337 &mem, 1338 cmdline.as_cstring().unwrap().to_str().unwrap(), 1339 vcpu_mpidrs, 1340 vcpu_topology, 1341 device_info, 1342 &initramfs_config, 1343 &pci_space_info, 1344 virtio_iommu_bdf.map(|bdf| bdf.into()), 1345 &vgic, 1346 &self.numa_nodes, 1347 pmu_supported, 1348 ) 1349 .map_err(Error::ConfigureSystem)?; 1350 1351 Ok(()) 1352 } 1353 1354 pub fn console_resize_pipe(&self) -> Option<Arc<File>> { 1355 self.device_manager.lock().unwrap().console_resize_pipe() 1356 } 1357 1358 pub fn shutdown(&mut self) -> Result<()> { 1359 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 1360 let new_state = VmState::Shutdown; 1361 1362 state.valid_transition(new_state)?; 1363 1364 // Wake up the DeviceManager threads so they will get terminated cleanly 1365 self.device_manager 1366 .lock() 1367 .unwrap() 1368 .resume() 1369 .map_err(Error::Resume)?; 1370 1371 self.cpu_manager 1372 .lock() 1373 .unwrap() 1374 .shutdown() 1375 .map_err(Error::CpuManager)?; 1376 1377 // Wait for all the threads to finish 1378 for thread in self.threads.drain(..) { 1379 thread.join().map_err(Error::ThreadCleanup)? 1380 } 1381 *state = new_state; 1382 1383 Ok(()) 1384 } 1385 1386 pub fn resize( 1387 &mut self, 1388 desired_vcpus: Option<u8>, 1389 desired_memory: Option<u64>, 1390 desired_balloon: Option<u64>, 1391 ) -> Result<()> { 1392 event!("vm", "resizing"); 1393 1394 if let Some(desired_vcpus) = desired_vcpus { 1395 if self 1396 .cpu_manager 1397 .lock() 1398 .unwrap() 1399 .resize(desired_vcpus) 1400 .map_err(Error::CpuManager)? 
1401 { 1402 self.device_manager 1403 .lock() 1404 .unwrap() 1405 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1406 .map_err(Error::DeviceManager)?; 1407 } 1408 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1409 } 1410 1411 if let Some(desired_memory) = desired_memory { 1412 let new_region = self 1413 .memory_manager 1414 .lock() 1415 .unwrap() 1416 .resize(desired_memory) 1417 .map_err(Error::MemoryManager)?; 1418 1419 let memory_config = &mut self.config.lock().unwrap().memory; 1420 1421 if let Some(new_region) = &new_region { 1422 self.device_manager 1423 .lock() 1424 .unwrap() 1425 .update_memory(new_region) 1426 .map_err(Error::DeviceManager)?; 1427 1428 match memory_config.hotplug_method { 1429 HotplugMethod::Acpi => { 1430 self.device_manager 1431 .lock() 1432 .unwrap() 1433 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1434 .map_err(Error::DeviceManager)?; 1435 } 1436 HotplugMethod::VirtioMem => {} 1437 } 1438 } 1439 1440 // We update the VM config regardless of the actual guest resize 1441 // operation result (happened or not), so that if the VM reboots 1442 // it will be running with the last configure memory size. 1443 match memory_config.hotplug_method { 1444 HotplugMethod::Acpi => memory_config.size = desired_memory, 1445 HotplugMethod::VirtioMem => { 1446 if desired_memory > memory_config.size { 1447 memory_config.hotplugged_size = Some(desired_memory - memory_config.size); 1448 } else { 1449 memory_config.hotplugged_size = None; 1450 } 1451 } 1452 } 1453 } 1454 1455 if let Some(desired_balloon) = desired_balloon { 1456 self.device_manager 1457 .lock() 1458 .unwrap() 1459 .resize_balloon(desired_balloon) 1460 .map_err(Error::DeviceManager)?; 1461 1462 // Update the configuration value for the balloon size to ensure 1463 // a reboot would use the right value. 
1464 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon { 1465 balloon_config.size = desired_balloon; 1466 } 1467 } 1468 1469 event!("vm", "resized"); 1470 1471 Ok(()) 1472 } 1473 1474 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { 1475 let memory_config = &mut self.config.lock().unwrap().memory; 1476 1477 if let Some(zones) = &mut memory_config.zones { 1478 for zone in zones.iter_mut() { 1479 if zone.id == id { 1480 if desired_memory >= zone.size { 1481 let hotplugged_size = desired_memory - zone.size; 1482 self.memory_manager 1483 .lock() 1484 .unwrap() 1485 .resize_zone(&id, desired_memory - zone.size) 1486 .map_err(Error::MemoryManager)?; 1487 // We update the memory zone config regardless of the 1488 // actual 'resize-zone' operation result (happened or 1489 // not), so that if the VM reboots it will be running 1490 // with the last configured memory zone size. 1491 zone.hotplugged_size = Some(hotplugged_size); 1492 1493 return Ok(()); 1494 } else { 1495 error!( 1496 "Invalid to ask less ({}) than boot RAM ({}) for \ 1497 this memory zone", 1498 desired_memory, zone.size, 1499 ); 1500 return Err(Error::ResizeZone); 1501 } 1502 } 1503 } 1504 } 1505 1506 error!("Could not find the memory zone {} for the resize", id); 1507 Err(Error::ResizeZone) 1508 } 1509 1510 pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1511 let pci_device_info = self 1512 .device_manager 1513 .lock() 1514 .unwrap() 1515 .add_device(&mut device_cfg) 1516 .map_err(Error::DeviceManager)?; 1517 1518 // Update VmConfig by adding the new device. This is important to 1519 // ensure the device would be created in case of a reboot. 
1520 { 1521 let mut config = self.config.lock().unwrap(); 1522 add_to_config(&mut config.devices, device_cfg); 1523 } 1524 1525 self.device_manager 1526 .lock() 1527 .unwrap() 1528 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1529 .map_err(Error::DeviceManager)?; 1530 1531 Ok(pci_device_info) 1532 } 1533 1534 pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> { 1535 let pci_device_info = self 1536 .device_manager 1537 .lock() 1538 .unwrap() 1539 .add_user_device(&mut device_cfg) 1540 .map_err(Error::DeviceManager)?; 1541 1542 // Update VmConfig by adding the new device. This is important to 1543 // ensure the device would be created in case of a reboot. 1544 { 1545 let mut config = self.config.lock().unwrap(); 1546 add_to_config(&mut config.user_devices, device_cfg); 1547 } 1548 1549 self.device_manager 1550 .lock() 1551 .unwrap() 1552 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1553 .map_err(Error::DeviceManager)?; 1554 1555 Ok(pci_device_info) 1556 } 1557 1558 pub fn remove_device(&mut self, id: String) -> Result<()> { 1559 self.device_manager 1560 .lock() 1561 .unwrap() 1562 .remove_device(id.clone()) 1563 .map_err(Error::DeviceManager)?; 1564 1565 // Update VmConfig by removing the device. This is important to 1566 // ensure the device would not be created in case of a reboot. 1567 self.config.lock().unwrap().remove_device(&id); 1568 1569 self.device_manager 1570 .lock() 1571 .unwrap() 1572 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1573 .map_err(Error::DeviceManager)?; 1574 Ok(()) 1575 } 1576 1577 pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> { 1578 let pci_device_info = self 1579 .device_manager 1580 .lock() 1581 .unwrap() 1582 .add_disk(&mut disk_cfg) 1583 .map_err(Error::DeviceManager)?; 1584 1585 // Update VmConfig by adding the new device. This is important to 1586 // ensure the device would be created in case of a reboot. 
1587 { 1588 let mut config = self.config.lock().unwrap(); 1589 add_to_config(&mut config.disks, disk_cfg); 1590 } 1591 1592 self.device_manager 1593 .lock() 1594 .unwrap() 1595 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1596 .map_err(Error::DeviceManager)?; 1597 1598 Ok(pci_device_info) 1599 } 1600 1601 pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> { 1602 let pci_device_info = self 1603 .device_manager 1604 .lock() 1605 .unwrap() 1606 .add_fs(&mut fs_cfg) 1607 .map_err(Error::DeviceManager)?; 1608 1609 // Update VmConfig by adding the new device. This is important to 1610 // ensure the device would be created in case of a reboot. 1611 { 1612 let mut config = self.config.lock().unwrap(); 1613 add_to_config(&mut config.fs, fs_cfg); 1614 } 1615 1616 self.device_manager 1617 .lock() 1618 .unwrap() 1619 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1620 .map_err(Error::DeviceManager)?; 1621 1622 Ok(pci_device_info) 1623 } 1624 1625 pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> { 1626 let pci_device_info = self 1627 .device_manager 1628 .lock() 1629 .unwrap() 1630 .add_pmem(&mut pmem_cfg) 1631 .map_err(Error::DeviceManager)?; 1632 1633 // Update VmConfig by adding the new device. This is important to 1634 // ensure the device would be created in case of a reboot. 1635 { 1636 let mut config = self.config.lock().unwrap(); 1637 add_to_config(&mut config.pmem, pmem_cfg); 1638 } 1639 1640 self.device_manager 1641 .lock() 1642 .unwrap() 1643 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1644 .map_err(Error::DeviceManager)?; 1645 1646 Ok(pci_device_info) 1647 } 1648 1649 pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> { 1650 let pci_device_info = self 1651 .device_manager 1652 .lock() 1653 .unwrap() 1654 .add_net(&mut net_cfg) 1655 .map_err(Error::DeviceManager)?; 1656 1657 // Update VmConfig by adding the new device. 
This is important to 1658 // ensure the device would be created in case of a reboot. 1659 { 1660 let mut config = self.config.lock().unwrap(); 1661 add_to_config(&mut config.net, net_cfg); 1662 } 1663 1664 self.device_manager 1665 .lock() 1666 .unwrap() 1667 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1668 .map_err(Error::DeviceManager)?; 1669 1670 Ok(pci_device_info) 1671 } 1672 1673 pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> { 1674 let pci_device_info = self 1675 .device_manager 1676 .lock() 1677 .unwrap() 1678 .add_vdpa(&mut vdpa_cfg) 1679 .map_err(Error::DeviceManager)?; 1680 1681 // Update VmConfig by adding the new device. This is important to 1682 // ensure the device would be created in case of a reboot. 1683 { 1684 let mut config = self.config.lock().unwrap(); 1685 add_to_config(&mut config.vdpa, vdpa_cfg); 1686 } 1687 1688 self.device_manager 1689 .lock() 1690 .unwrap() 1691 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1692 .map_err(Error::DeviceManager)?; 1693 1694 Ok(pci_device_info) 1695 } 1696 1697 pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> { 1698 let pci_device_info = self 1699 .device_manager 1700 .lock() 1701 .unwrap() 1702 .add_vsock(&mut vsock_cfg) 1703 .map_err(Error::DeviceManager)?; 1704 1705 // Update VmConfig by adding the new device. This is important to 1706 // ensure the device would be created in case of a reboot. 
1707 { 1708 let mut config = self.config.lock().unwrap(); 1709 config.vsock = Some(vsock_cfg); 1710 } 1711 1712 self.device_manager 1713 .lock() 1714 .unwrap() 1715 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1716 .map_err(Error::DeviceManager)?; 1717 1718 Ok(pci_device_info) 1719 } 1720 1721 pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> { 1722 Ok(self.device_manager.lock().unwrap().counters()) 1723 } 1724 1725 #[cfg(feature = "tdx")] 1726 fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> { 1727 use arch::x86_64::tdx::*; 1728 1729 let firmware_path = self 1730 .config 1731 .lock() 1732 .unwrap() 1733 .payload 1734 .as_ref() 1735 .unwrap() 1736 .firmware 1737 .clone() 1738 .ok_or(Error::TdxFirmwareMissing)?; 1739 // The TDVF file contains a table of section as well as code 1740 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1741 1742 // For all the sections allocate some RAM backing them 1743 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf) 1744 } 1745 1746 #[cfg(feature = "tdx")] 1747 fn hob_memory_resources( 1748 mut sorted_sections: Vec<TdvfSection>, 1749 guest_memory: &GuestMemoryMmap, 1750 ) -> Vec<(u64, u64, bool)> { 1751 let mut list = Vec::new(); 1752 1753 let mut current_section = sorted_sections.pop(); 1754 1755 // RAM regions interleaved with TDVF sections 1756 let mut next_start_addr = 0; 1757 for region in guest_memory.iter() { 1758 let region_start = region.start_addr().0; 1759 let region_end = region.last_addr().0; 1760 if region_start > next_start_addr { 1761 next_start_addr = region_start; 1762 } 1763 1764 loop { 1765 let (start, size, ram) = if let Some(section) = ¤t_section { 1766 if section.address <= next_start_addr { 1767 (section.address, section.size, false) 1768 } else { 1769 let last_addr = std::cmp::min(section.address - 1, region_end); 1770 (next_start_addr, last_addr - next_start_addr + 1, true) 1771 } 1772 } 
else { 1773 (next_start_addr, region_end - next_start_addr + 1, true) 1774 }; 1775 1776 list.push((start, size, ram)); 1777 1778 if !ram { 1779 current_section = sorted_sections.pop(); 1780 } 1781 1782 next_start_addr = start + size; 1783 1784 if region_start > next_start_addr { 1785 next_start_addr = region_start; 1786 } 1787 1788 if next_start_addr > region_end { 1789 break; 1790 } 1791 } 1792 } 1793 1794 // Once all the interleaved sections have been processed, let's simply 1795 // pull the remaining ones. 1796 if let Some(section) = current_section { 1797 list.push((section.address, section.size, false)); 1798 } 1799 while let Some(section) = sorted_sections.pop() { 1800 list.push((section.address, section.size, false)); 1801 } 1802 1803 list 1804 } 1805 1806 #[cfg(feature = "tdx")] 1807 fn populate_tdx_sections( 1808 &mut self, 1809 sections: &[TdvfSection], 1810 guid_found: bool, 1811 ) -> Result<Option<u64>> { 1812 use arch::x86_64::tdx::*; 1813 // Get the memory end *before* we start adding TDVF ram regions 1814 let boot_guest_memory = self 1815 .memory_manager 1816 .lock() 1817 .as_ref() 1818 .unwrap() 1819 .boot_guest_memory(); 1820 for section in sections { 1821 // No need to allocate if the section falls within guest RAM ranges 1822 if boot_guest_memory.address_in_range(GuestAddress(section.address)) { 1823 info!( 1824 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM", 1825 section 1826 ); 1827 continue; 1828 } 1829 1830 info!("Allocating TDVF Section: {:x?}", section); 1831 self.memory_manager 1832 .lock() 1833 .unwrap() 1834 .add_ram_region(GuestAddress(section.address), section.size as usize) 1835 .map_err(Error::AllocatingTdvfMemory)?; 1836 } 1837 1838 // The TDVF file contains a table of section as well as code 1839 let firmware_path = self 1840 .config 1841 .lock() 1842 .unwrap() 1843 .payload 1844 .as_ref() 1845 .unwrap() 1846 .firmware 1847 .clone() 1848 .ok_or(Error::TdxFirmwareMissing)?; 1849 let mut firmware_file = 
File::open(firmware_path).map_err(Error::LoadTdvf)?; 1850 1851 // The guest memory at this point now has all the required regions so it 1852 // is safe to copy from the TDVF file into it. 1853 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1854 let mem = guest_memory.memory(); 1855 let mut payload_info = None; 1856 let mut hob_offset = None; 1857 for section in sections { 1858 info!("Populating TDVF Section: {:x?}", section); 1859 match section.r#type { 1860 TdvfSectionType::Bfv | TdvfSectionType::Cfv => { 1861 info!("Copying section to guest memory"); 1862 firmware_file 1863 .seek(SeekFrom::Start(section.data_offset as u64)) 1864 .map_err(Error::LoadTdvf)?; 1865 mem.read_volatile_from( 1866 GuestAddress(section.address), 1867 &mut firmware_file, 1868 section.data_size as usize, 1869 ) 1870 .unwrap(); 1871 } 1872 TdvfSectionType::TdHob => { 1873 hob_offset = Some(section.address); 1874 } 1875 TdvfSectionType::Payload => { 1876 info!("Copying payload to guest memory"); 1877 if let Some(payload_file) = self.kernel.as_mut() { 1878 let payload_size = payload_file 1879 .seek(SeekFrom::End(0)) 1880 .map_err(Error::LoadPayload)?; 1881 1882 payload_file 1883 .seek(SeekFrom::Start(0x1f1)) 1884 .map_err(Error::LoadPayload)?; 1885 1886 let mut payload_header = linux_loader::bootparam::setup_header::default(); 1887 payload_file 1888 .read_volatile(&mut payload_header.as_bytes()) 1889 .unwrap(); 1890 1891 if payload_header.header != 0x5372_6448 { 1892 return Err(Error::InvalidPayloadType); 1893 } 1894 1895 if (payload_header.version < 0x0200) 1896 || ((payload_header.loadflags & 0x1) == 0x0) 1897 { 1898 return Err(Error::InvalidPayloadType); 1899 } 1900 1901 payload_file.rewind().map_err(Error::LoadPayload)?; 1902 mem.read_volatile_from( 1903 GuestAddress(section.address), 1904 payload_file, 1905 payload_size as usize, 1906 ) 1907 .unwrap(); 1908 1909 // Create the payload info that will be inserted into 1910 // the HOB. 
1911 payload_info = Some(PayloadInfo { 1912 image_type: PayloadImageType::BzImage, 1913 entry_point: section.address, 1914 }); 1915 } 1916 } 1917 TdvfSectionType::PayloadParam => { 1918 info!("Copying payload parameters to guest memory"); 1919 let cmdline = Self::generate_cmdline( 1920 self.config.lock().unwrap().payload.as_ref().unwrap(), 1921 )?; 1922 mem.write_slice( 1923 cmdline.as_cstring().unwrap().as_bytes_with_nul(), 1924 GuestAddress(section.address), 1925 ) 1926 .unwrap(); 1927 } 1928 _ => {} 1929 } 1930 } 1931 1932 // Generate HOB 1933 let mut hob = TdHob::start(hob_offset.unwrap()); 1934 1935 let mut sorted_sections = sections.to_vec(); 1936 sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem)); 1937 1938 sorted_sections.sort_by_key(|section| section.address); 1939 sorted_sections.reverse(); 1940 1941 for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) { 1942 hob.add_memory_resource(&mem, start, size, ram, guid_found) 1943 .map_err(Error::PopulateHob)?; 1944 } 1945 1946 // MMIO regions 1947 hob.add_mmio_resource( 1948 &mem, 1949 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1950 arch::layout::APIC_START.raw_value() 1951 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1952 ) 1953 .map_err(Error::PopulateHob)?; 1954 let start_of_device_area = self 1955 .memory_manager 1956 .lock() 1957 .unwrap() 1958 .start_of_device_area() 1959 .raw_value(); 1960 let end_of_device_area = self 1961 .memory_manager 1962 .lock() 1963 .unwrap() 1964 .end_of_device_area() 1965 .raw_value(); 1966 hob.add_mmio_resource( 1967 &mem, 1968 start_of_device_area, 1969 end_of_device_area - start_of_device_area, 1970 ) 1971 .map_err(Error::PopulateHob)?; 1972 1973 // Loop over the ACPI tables and copy them to the HOB. 
1974 1975 for acpi_table in crate::acpi::create_acpi_tables_tdx( 1976 &self.device_manager, 1977 &self.cpu_manager, 1978 &self.memory_manager, 1979 &self.numa_nodes, 1980 ) { 1981 hob.add_acpi_table(&mem, acpi_table.as_slice()) 1982 .map_err(Error::PopulateHob)?; 1983 } 1984 1985 // If a payload info has been created, let's insert it into the HOB. 1986 if let Some(payload_info) = payload_info { 1987 hob.add_payload(&mem, payload_info) 1988 .map_err(Error::PopulateHob)?; 1989 } 1990 1991 hob.finish(&mem).map_err(Error::PopulateHob)?; 1992 1993 Ok(hob_offset) 1994 } 1995 1996 #[cfg(feature = "tdx")] 1997 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 1998 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1999 let mem = guest_memory.memory(); 2000 2001 for section in sections { 2002 self.vm 2003 .tdx_init_memory_region( 2004 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 2005 section.address, 2006 section.size, 2007 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 2008 section.attributes == 1, 2009 ) 2010 .map_err(Error::InitializeTdxMemoryRegion)?; 2011 } 2012 2013 Ok(()) 2014 } 2015 2016 // Creates ACPI tables 2017 // In case of TDX being used, this is a no-op since the tables will be 2018 // created and passed when populating the HOB. 
2019 2020 fn create_acpi_tables(&self) -> Option<GuestAddress> { 2021 #[cfg(feature = "tdx")] 2022 if self.config.lock().unwrap().is_tdx_enabled() { 2023 return None; 2024 } 2025 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 2026 let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); 2027 let rsdp_addr = crate::acpi::create_acpi_tables( 2028 &mem, 2029 &self.device_manager, 2030 &self.cpu_manager, 2031 &self.memory_manager, 2032 &self.numa_nodes, 2033 tpm_enabled, 2034 ); 2035 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 2036 2037 Some(rsdp_addr) 2038 } 2039 2040 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 2041 trace_scoped!("entry_point"); 2042 2043 self.load_payload_handle 2044 .take() 2045 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 2046 .transpose() 2047 } 2048 2049 pub fn boot(&mut self) -> Result<()> { 2050 trace_scoped!("Vm::boot"); 2051 let current_state = self.get_state()?; 2052 if current_state == VmState::Paused { 2053 return self.resume().map_err(Error::Resume); 2054 } 2055 2056 let new_state = if self.stop_on_boot { 2057 VmState::BreakPoint 2058 } else { 2059 VmState::Running 2060 }; 2061 current_state.valid_transition(new_state)?; 2062 2063 // Do earlier to parallelise with loading kernel 2064 #[cfg(target_arch = "x86_64")] 2065 cfg_if::cfg_if! { 2066 if #[cfg(feature = "sev_snp")] { 2067 let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled(); 2068 let rsdp_addr = if sev_snp_enabled { 2069 // In case of SEV-SNP guest ACPI tables are provided via 2070 // IGVM. So skip the creation of ACPI tables and set the 2071 // rsdp addr to None. 2072 None 2073 } else { 2074 self.create_acpi_tables() 2075 }; 2076 } else { 2077 let rsdp_addr = self.create_acpi_tables(); 2078 } 2079 } 2080 2081 // Load kernel synchronously or if asynchronous then wait for load to 2082 // finish. 
2083 let entry_point = self.entry_point()?; 2084 2085 #[cfg(feature = "tdx")] 2086 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2087 2088 // Configure the vcpus that have been created 2089 let vcpus = self.cpu_manager.lock().unwrap().vcpus(); 2090 for vcpu in vcpus { 2091 let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2092 let boot_setup = entry_point.map(|e| (e, guest_memory)); 2093 self.cpu_manager 2094 .lock() 2095 .unwrap() 2096 .configure_vcpu(vcpu, boot_setup) 2097 .map_err(Error::CpuManager)?; 2098 } 2099 2100 #[cfg(feature = "tdx")] 2101 let (sections, guid_found) = if tdx_enabled { 2102 self.extract_tdvf_sections()? 2103 } else { 2104 (Vec::new(), false) 2105 }; 2106 2107 // Configuring the TDX regions requires that the vCPUs are created. 2108 #[cfg(feature = "tdx")] 2109 let hob_address = if tdx_enabled { 2110 // TDX sections are written to memory. 2111 self.populate_tdx_sections(§ions, guid_found)? 2112 } else { 2113 None 2114 }; 2115 2116 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 2117 // available after they are configured 2118 #[cfg(target_arch = "aarch64")] 2119 let rsdp_addr = self.create_acpi_tables(); 2120 2121 // Configure shared state based on loaded kernel 2122 entry_point 2123 .map(|entry_point| { 2124 // Safe to unwrap rsdp_addr as we know it can't be None when 2125 // the entry_point is Some. 2126 self.configure_system(rsdp_addr.unwrap(), entry_point) 2127 }) 2128 .transpose()?; 2129 2130 #[cfg(target_arch = "x86_64")] 2131 // Note: For x86, always call this function before invoking start boot vcpus. 2132 // Otherwise guest would fail to boot because we haven't created the 2133 // userspace mappings to update the hypervisor about the memory mappings. 2134 // These mappings must be created before we start the vCPU threads for 2135 // the very first time. 
2136 self.memory_manager 2137 .lock() 2138 .unwrap() 2139 .allocate_address_space() 2140 .map_err(Error::MemoryManager)?; 2141 2142 #[cfg(feature = "tdx")] 2143 if let Some(hob_address) = hob_address { 2144 // With the HOB address extracted the vCPUs can have 2145 // their TDX state configured. 2146 self.cpu_manager 2147 .lock() 2148 .unwrap() 2149 .initialize_tdx(hob_address) 2150 .map_err(Error::CpuManager)?; 2151 // Let the hypervisor know which memory ranges are shared with the 2152 // guest. This prevents the guest from ignoring/discarding memory 2153 // regions provided by the host. 2154 self.init_tdx_memory(§ions)?; 2155 // With TDX memory and CPU state configured TDX setup is complete 2156 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2157 } 2158 2159 // Resume the vm for MSHV 2160 if current_state == VmState::Created { 2161 self.vm.resume().map_err(Error::ResumeVm)?; 2162 } 2163 2164 self.cpu_manager 2165 .lock() 2166 .unwrap() 2167 .start_boot_vcpus(new_state == VmState::BreakPoint) 2168 .map_err(Error::CpuManager)?; 2169 2170 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2171 *state = new_state; 2172 Ok(()) 2173 } 2174 2175 pub fn restore(&mut self) -> Result<()> { 2176 event!("vm", "restoring"); 2177 2178 #[cfg(target_arch = "x86_64")] 2179 // Note: For x86, always call this function before invoking start boot vcpus. 2180 // Otherwise guest would fail to boot because we haven't created the 2181 // userspace mappings to update the hypervisor about the memory mappings. 2182 // These mappings must be created before we start the vCPU threads for 2183 // the very first time for the restored VM. 2184 self.memory_manager 2185 .lock() 2186 .unwrap() 2187 .allocate_address_space() 2188 .map_err(Error::MemoryManager)?; 2189 2190 // Now we can start all vCPUs from here. 
2191 self.cpu_manager 2192 .lock() 2193 .unwrap() 2194 .start_restored_vcpus() 2195 .map_err(Error::CpuManager)?; 2196 2197 event!("vm", "restored"); 2198 Ok(()) 2199 } 2200 2201 /// Gets a thread-safe reference counted pointer to the VM configuration. 2202 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2203 Arc::clone(&self.config) 2204 } 2205 2206 /// Get the VM state. Returns an error if the state is poisoned. 2207 pub fn get_state(&self) -> Result<VmState> { 2208 self.state 2209 .try_read() 2210 .map_err(|_| Error::PoisonedState) 2211 .map(|state| *state) 2212 } 2213 2214 /// Gets the actual size of the balloon. 2215 pub fn balloon_size(&self) -> u64 { 2216 self.device_manager.lock().unwrap().balloon_size() 2217 } 2218 2219 pub fn send_memory_fds( 2220 &mut self, 2221 socket: &mut UnixStream, 2222 ) -> std::result::Result<(), MigratableError> { 2223 for (slot, fd) in self 2224 .memory_manager 2225 .lock() 2226 .unwrap() 2227 .memory_slot_fds() 2228 .drain() 2229 { 2230 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2231 .write_to(socket) 2232 .map_err(|e| { 2233 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2234 })?; 2235 socket 2236 .send_with_fd(&slot.to_le_bytes()[..], fd) 2237 .map_err(|e| { 2238 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2239 })?; 2240 2241 Response::read_from(socket)?.ok_or_abandon( 2242 socket, 2243 MigratableError::MigrateSend(anyhow!("Error during memory fd migration")), 2244 )?; 2245 } 2246 2247 Ok(()) 2248 } 2249 2250 pub fn send_memory_regions<F>( 2251 &mut self, 2252 ranges: &MemoryRangeTable, 2253 fd: &mut F, 2254 ) -> std::result::Result<(), MigratableError> 2255 where 2256 F: WriteVolatile, 2257 { 2258 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2259 let mem = guest_memory.memory(); 2260 2261 for range in ranges.regions() { 2262 let mut offset: u64 = 0; 2263 // Here we are manually handling the retry in case we can't 
the 2264 // whole region at once because we can't use the implementation 2265 // from vm-memory::GuestMemory of write_all_to() as it is not 2266 // following the correct behavior. For more info about this issue 2267 // see: https://github.com/rust-vmm/vm-memory/issues/174 2268 loop { 2269 let bytes_written = mem 2270 .write_volatile_to( 2271 GuestAddress(range.gpa + offset), 2272 fd, 2273 (range.length - offset) as usize, 2274 ) 2275 .map_err(|e| { 2276 MigratableError::MigrateSend(anyhow!( 2277 "Error transferring memory to socket: {}", 2278 e 2279 )) 2280 })?; 2281 offset += bytes_written as u64; 2282 2283 if offset == range.length { 2284 break; 2285 } 2286 } 2287 } 2288 2289 Ok(()) 2290 } 2291 2292 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2293 self.memory_manager 2294 .lock() 2295 .unwrap() 2296 .memory_range_table(false) 2297 } 2298 2299 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2300 self.device_manager.lock().unwrap().device_tree() 2301 } 2302 2303 pub fn activate_virtio_devices(&self) -> Result<()> { 2304 self.device_manager 2305 .lock() 2306 .unwrap() 2307 .activate_virtio_devices() 2308 .map_err(Error::ActivateVirtioDevices) 2309 } 2310 2311 #[cfg(target_arch = "x86_64")] 2312 pub fn power_button(&self) -> Result<()> { 2313 return self 2314 .device_manager 2315 .lock() 2316 .unwrap() 2317 .notify_power_button() 2318 .map_err(Error::PowerButton); 2319 } 2320 2321 #[cfg(target_arch = "aarch64")] 2322 pub fn power_button(&self) -> Result<()> { 2323 self.device_manager 2324 .lock() 2325 .unwrap() 2326 .notify_power_button() 2327 .map_err(Error::PowerButton) 2328 } 2329 2330 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2331 self.memory_manager.lock().unwrap().snapshot_data() 2332 } 2333 2334 #[cfg(feature = "guest_debug")] 2335 pub fn debug_request( 2336 &mut self, 2337 gdb_request: &GdbRequestPayload, 2338 cpu_id: usize, 2339 ) -> Result<GdbResponsePayload> { 2340 use 
GdbRequestPayload::*; 2341 match gdb_request { 2342 SetSingleStep(single_step) => { 2343 self.set_guest_debug(cpu_id, &[], *single_step) 2344 .map_err(Error::Debug)?; 2345 } 2346 SetHwBreakPoint(addrs) => { 2347 self.set_guest_debug(cpu_id, addrs, false) 2348 .map_err(Error::Debug)?; 2349 } 2350 Pause => { 2351 self.debug_pause().map_err(Error::Debug)?; 2352 } 2353 Resume => { 2354 self.debug_resume().map_err(Error::Debug)?; 2355 } 2356 ReadRegs => { 2357 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?; 2358 return Ok(GdbResponsePayload::RegValues(Box::new(regs))); 2359 } 2360 WriteRegs(regs) => { 2361 self.write_regs(cpu_id, regs).map_err(Error::Debug)?; 2362 } 2363 ReadMem(vaddr, len) => { 2364 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2365 let mem = self 2366 .read_mem(&guest_memory, cpu_id, *vaddr, *len) 2367 .map_err(Error::Debug)?; 2368 return Ok(GdbResponsePayload::MemoryRegion(mem)); 2369 } 2370 WriteMem(vaddr, data) => { 2371 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2372 self.write_mem(&guest_memory, cpu_id, vaddr, data) 2373 .map_err(Error::Debug)?; 2374 } 2375 ActiveVcpus => { 2376 let active_vcpus = self.active_vcpus(); 2377 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus)); 2378 } 2379 } 2380 Ok(GdbResponsePayload::CommandComplete) 2381 } 2382 2383 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2384 fn get_dump_state( 2385 &mut self, 2386 destination_url: &str, 2387 ) -> std::result::Result<DumpState, GuestDebuggableError> { 2388 let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32; 2389 let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize; 2390 let mut elf_phdr_num = 1; 2391 let elf_sh_info = 0; 2392 let coredump_file_path = url_to_file(destination_url)?; 2393 let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings(); 2394 2395 if mapping_num < UINT16_MAX - 2 { 2396 elf_phdr_num += 
mapping_num as u16; 2397 } else { 2398 panic!("mapping num beyond 65535 not supported"); 2399 } 2400 let coredump_file = OpenOptions::new() 2401 .read(true) 2402 .write(true) 2403 .create_new(true) 2404 .open(coredump_file_path) 2405 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2406 2407 let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size); 2408 let mem_data = self 2409 .memory_manager 2410 .lock() 2411 .unwrap() 2412 .coredump_memory_regions(mem_offset); 2413 2414 Ok(DumpState { 2415 elf_note_size, 2416 elf_phdr_num, 2417 elf_sh_info, 2418 mem_offset, 2419 mem_info: Some(mem_data), 2420 file: Some(coredump_file), 2421 }) 2422 } 2423 2424 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2425 fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 { 2426 size_of::<elf::Elf64_Ehdr>() as u64 2427 + note_size as u64 2428 + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64 2429 } 2430 2431 pub fn nmi(&self) -> Result<()> { 2432 return self 2433 .cpu_manager 2434 .lock() 2435 .unwrap() 2436 .nmi() 2437 .map_err(|_| Error::ErrorNmi); 2438 } 2439 } 2440 2441 impl Pausable for Vm { 2442 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2443 event!("vm", "pausing"); 2444 let mut state = self 2445 .state 2446 .try_write() 2447 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?; 2448 let new_state = VmState::Paused; 2449 2450 state 2451 .valid_transition(new_state) 2452 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?; 2453 2454 #[cfg(target_arch = "x86_64")] 2455 { 2456 let mut clock = self 2457 .vm 2458 .get_clock() 2459 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?; 2460 clock.reset_flags(); 2461 self.saved_clock = Some(clock); 2462 } 2463 2464 // Before pausing the vCPUs activate any pending virtio devices that might 2465 // need activation between starting the pause (or e.g. 
a migration it's part of) 2466 self.activate_virtio_devices().map_err(|e| { 2467 MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e)) 2468 })?; 2469 2470 self.cpu_manager.lock().unwrap().pause()?; 2471 self.device_manager.lock().unwrap().pause()?; 2472 2473 self.vm 2474 .pause() 2475 .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?; 2476 2477 *state = new_state; 2478 2479 event!("vm", "paused"); 2480 Ok(()) 2481 } 2482 2483 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2484 event!("vm", "resuming"); 2485 let current_state = self.get_state().unwrap(); 2486 let mut state = self 2487 .state 2488 .try_write() 2489 .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?; 2490 let new_state = VmState::Running; 2491 2492 state 2493 .valid_transition(new_state) 2494 .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?; 2495 2496 self.cpu_manager.lock().unwrap().resume()?; 2497 #[cfg(target_arch = "x86_64")] 2498 { 2499 if let Some(clock) = &self.saved_clock { 2500 self.vm.set_clock(clock).map_err(|e| { 2501 MigratableError::Resume(anyhow!("Could not set VM clock: {}", e)) 2502 })?; 2503 } 2504 } 2505 2506 if current_state == VmState::Paused { 2507 self.vm 2508 .resume() 2509 .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?; 2510 } 2511 2512 self.device_manager.lock().unwrap().resume()?; 2513 2514 // And we're back to the Running state. 
2515 *state = new_state; 2516 event!("vm", "resumed"); 2517 Ok(()) 2518 } 2519 } 2520 2521 #[derive(Serialize, Deserialize)] 2522 pub struct VmSnapshot { 2523 #[cfg(target_arch = "x86_64")] 2524 pub clock: Option<hypervisor::ClockData>, 2525 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2526 pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>, 2527 } 2528 2529 pub const VM_SNAPSHOT_ID: &str = "vm"; 2530 impl Snapshottable for Vm { 2531 fn id(&self) -> String { 2532 VM_SNAPSHOT_ID.to_string() 2533 } 2534 2535 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2536 event!("vm", "snapshotting"); 2537 2538 #[cfg(feature = "tdx")] 2539 { 2540 if self.config.lock().unwrap().is_tdx_enabled() { 2541 return Err(MigratableError::Snapshot(anyhow!( 2542 "Snapshot not possible with TDX VM" 2543 ))); 2544 } 2545 } 2546 2547 let current_state = self.get_state().unwrap(); 2548 if current_state != VmState::Paused { 2549 return Err(MigratableError::Snapshot(anyhow!( 2550 "Trying to snapshot while VM is running" 2551 ))); 2552 } 2553 2554 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2555 let common_cpuid = { 2556 let amx = self.config.lock().unwrap().cpus.features.amx; 2557 let phys_bits = physical_bits( 2558 &self.hypervisor, 2559 self.config.lock().unwrap().cpus.max_phys_bits, 2560 ); 2561 arch::generate_common_cpuid( 2562 &self.hypervisor, 2563 &arch::CpuidConfig { 2564 sgx_epc_sections: None, 2565 phys_bits, 2566 kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv, 2567 #[cfg(feature = "tdx")] 2568 tdx: false, 2569 amx, 2570 }, 2571 ) 2572 .map_err(|e| { 2573 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e)) 2574 })? 
2575 }; 2576 2577 let vm_snapshot_state = VmSnapshot { 2578 #[cfg(target_arch = "x86_64")] 2579 clock: self.saved_clock, 2580 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2581 common_cpuid, 2582 }; 2583 2584 let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?; 2585 2586 let (id, snapshot) = { 2587 let mut cpu_manager = self.cpu_manager.lock().unwrap(); 2588 (cpu_manager.id(), cpu_manager.snapshot()?) 2589 }; 2590 vm_snapshot.add_snapshot(id, snapshot); 2591 let (id, snapshot) = { 2592 let mut memory_manager = self.memory_manager.lock().unwrap(); 2593 (memory_manager.id(), memory_manager.snapshot()?) 2594 }; 2595 vm_snapshot.add_snapshot(id, snapshot); 2596 let (id, snapshot) = { 2597 let mut device_manager = self.device_manager.lock().unwrap(); 2598 (device_manager.id(), device_manager.snapshot()?) 2599 }; 2600 vm_snapshot.add_snapshot(id, snapshot); 2601 2602 event!("vm", "snapshotted"); 2603 Ok(vm_snapshot) 2604 } 2605 } 2606 2607 impl Transportable for Vm { 2608 fn send( 2609 &self, 2610 snapshot: &Snapshot, 2611 destination_url: &str, 2612 ) -> std::result::Result<(), MigratableError> { 2613 let mut snapshot_config_path = url_to_path(destination_url)?; 2614 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2615 2616 // Create the snapshot config file 2617 let mut snapshot_config_file = OpenOptions::new() 2618 .read(true) 2619 .write(true) 2620 .create_new(true) 2621 .open(snapshot_config_path) 2622 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2623 2624 // Serialize and write the snapshot config 2625 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2626 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2627 2628 snapshot_config_file 2629 .write(vm_config.as_bytes()) 2630 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2631 2632 let mut snapshot_state_path = url_to_path(destination_url)?; 2633 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2634 2635 // Create the snapshot state file 2636 let 
mut snapshot_state_file = OpenOptions::new() 2637 .read(true) 2638 .write(true) 2639 .create_new(true) 2640 .open(snapshot_state_path) 2641 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2642 2643 // Serialize and write the snapshot state 2644 let vm_state = 2645 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2646 2647 snapshot_state_file 2648 .write(&vm_state) 2649 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2650 2651 // Tell the memory manager to also send/write its own snapshot. 2652 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2653 self.memory_manager 2654 .lock() 2655 .unwrap() 2656 .send(&memory_manager_snapshot.clone(), destination_url)?; 2657 } else { 2658 return Err(MigratableError::Restore(anyhow!( 2659 "Missing memory manager snapshot" 2660 ))); 2661 } 2662 2663 Ok(()) 2664 } 2665 } 2666 2667 impl Migratable for Vm { 2668 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2669 self.memory_manager.lock().unwrap().start_dirty_log()?; 2670 self.device_manager.lock().unwrap().start_dirty_log() 2671 } 2672 2673 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2674 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2675 self.device_manager.lock().unwrap().stop_dirty_log() 2676 } 2677 2678 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2679 Ok(MemoryRangeTable::new_from_tables(vec![ 2680 self.memory_manager.lock().unwrap().dirty_log()?, 2681 self.device_manager.lock().unwrap().dirty_log()?, 2682 ])) 2683 } 2684 2685 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2686 self.memory_manager.lock().unwrap().start_migration()?; 2687 self.device_manager.lock().unwrap().start_migration() 2688 } 2689 2690 fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { 2691 self.memory_manager.lock().unwrap().complete_migration()?; 2692 
#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Forwards the hardware breakpoint / single-step setup for vCPU `cpu_id`
    /// to the CPU manager.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM if it is running, then marks it as `VmState::BreakPoint`.
    ///
    /// # Errors
    ///
    /// Returns `DebuggableError::Pause` if pausing fails and
    /// `DebuggableError::PoisonedState` if the state lock cannot be taken for
    /// writing.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        // The read guard is released at the end of this statement, so
        // `pause()` is free to take the write lock.
        let is_running = *self.state.read().unwrap() == VmState::Running;
        if is_running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        match self.state.try_write() {
            Ok(mut state) => {
                *state = VmState::BreakPoint;
                Ok(())
            }
            Err(_) => Err(DebuggableError::PoisonedState),
        }
    }

    /// Resumes the VM if it is currently stopped at a breakpoint.
    ///
    /// NOTE(review): a resume failure is reported through the
    /// `DebuggableError::Pause` variant; confirm that this is intended.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        // As in `debug_pause`, the read guard must be gone before `resume()`.
        let at_breakpoint = *self.state.read().unwrap() == VmState::BreakPoint;
        if at_breakpoint {
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    /// Reads the core registers of vCPU `cpu_id` through the CPU manager.
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    /// Writes `regs` to vCPU `cpu_id` through the CPU manager.
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    /// Reads `len` bytes of guest memory at `vaddr` on behalf of vCPU `cpu_id`.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    /// Writes `data` to guest memory at `vaddr` on behalf of vCPU `cpu_id`.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Returns the number of active vCPUs, or the boot vCPU count when none
    /// is active.
    fn active_vcpus(&self) -> usize {
        let running = self.cpu_manager.lock().unwrap().active_vcpus();
        if running == 0 {
            // The VM is not booted yet. Report boot_vcpus() instead.
            return self.cpu_manager.lock().unwrap().boot_vcpus() as usize;
        }
        running
    }
}

/// Largest value representable in 16 bits, kept as a `u32`.
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = u16::MAX as u32;

// `Vm` adds no methods of its own to this trait.
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}
// Unit tests for the VM state machine and, with the `tdx` feature, for the
// HOB memory resource computation. Only built for KVM on x86_64.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    // Checks every transition out of `state`: `unwrap()` marks a transition
    // that must be accepted, `unwrap_err()` one that must be rejected.
    //
    // Note: the `BreakPoint` arm below is not driven by any `#[test]` in this
    // module; only Created, Running, Shutdown and Paused are exercised.
    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                state.valid_transition(VmState::Created).unwrap_err();
                state.valid_transition(VmState::Running).unwrap();
                state.valid_transition(VmState::Shutdown).unwrap();
                state.valid_transition(VmState::Paused).unwrap();
                state.valid_transition(VmState::BreakPoint).unwrap();
            }
            VmState::Running => {
                // Check the transitions from Running
                state.valid_transition(VmState::Created).unwrap_err();
                state.valid_transition(VmState::Running).unwrap_err();
                state.valid_transition(VmState::Shutdown).unwrap();
                state.valid_transition(VmState::Paused).unwrap();
                state.valid_transition(VmState::BreakPoint).unwrap();
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                state.valid_transition(VmState::Created).unwrap_err();
                state.valid_transition(VmState::Running).unwrap();
                state.valid_transition(VmState::Shutdown).unwrap_err();
                state.valid_transition(VmState::Paused).unwrap_err();
                state.valid_transition(VmState::BreakPoint).unwrap_err();
            }
            VmState::Paused => {
                // Check the transitions from Paused
                state.valid_transition(VmState::Created).unwrap_err();
                state.valid_transition(VmState::Running).unwrap();
                state.valid_transition(VmState::Shutdown).unwrap();
                state.valid_transition(VmState::Paused).unwrap_err();
                state.valid_transition(VmState::BreakPoint).unwrap_err();
            }
            VmState::BreakPoint => {
                // Check the transitions from Breakpoint
                state.valid_transition(VmState::Created).unwrap();
                state.valid_transition(VmState::Running).unwrap();
                state.valid_transition(VmState::Shutdown).unwrap_err();
                state.valid_transition(VmState::Paused).unwrap_err();
                state.valid_transition(VmState::BreakPoint).unwrap_err();
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    // Exercises `Vm::hob_memory_resources` with TDVF sections laid over one
    // or two RAM regions.
    //
    // In every case the sections are supplied in descending address order
    // while the expected list is in ascending address order. Judging by the
    // data, each expected tuple is (start, size, flag), with the flag `true`
    // for plain RAM and `false` for a TDVF section — TODO confirm against
    // `hob_memory_resources`.
    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}
/// Smoke test: boots a tiny real-mode program on a freshly created VM.
///
/// The program is loaded at guest address 0x1000, a single vCPU is pointed at
/// it, and the vCPU is run until it reports `VmExit::Reset`. Any exit other
/// than `Reset` or `Ignore` fails the test.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};

    // This example based on https://lwn.net/Articles/658511/
    let guest_code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    // One 4 KiB region of guest memory, starting where the code is loaded.
    let region_size = 0x1000;
    let code_addr = GuestAddress(0x1000);
    let guest_mem = GuestMemoryMmap::from_ranges(&[(code_addr, region_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor.
    for (slot, region) in guest_mem.iter().enumerate() {
        let user_region = vm.make_user_memory_region(
            slot as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(user_region)
            .expect("Cannot configure guest memory");
    }
    guest_mem
        .write_slice(&guest_code, code_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Zero the code segment base and selector.
    let mut sregs = vcpu.get_sregs().expect("get sregs failed");
    sregs.cs.base = 0;
    sregs.cs.selector = 0;
    vcpu.set_sregs(&sregs).expect("set sregs failed");

    // Start executing at the load address with rax = 2 and rbx = 3.
    let mut regs = vcpu.get_regs().expect("get regs failed");
    regs.set_rip(0x1000);
    regs.set_rax(2);
    regs.set_rbx(3);
    regs.set_rflags(2);
    vcpu.set_regs(&regs).expect("set regs failed");

    loop {
        let exit = vcpu.run().expect("run failed");
        match exit {
            VmExit::Ignore => {}
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}