// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use crate::config::{
    add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig,
    UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig,
};
use crate::config::{NumaConfig, PayloadConfig};
use crate::console_devices::{ConsoleDeviceError, ConsoleInfo};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
};
use crate::cpu;
use crate::device_manager::{DeviceManager, DeviceManagerError};
use crate::device_tree::DeviceTree;
#[cfg(feature = "guest_debug")]
use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
#[cfg(feature = "igvm")]
use crate::igvm::igvm_loader;
use crate::landlock::LandlockError;
use crate::memory_manager::{
    Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
};
#[cfg(target_arch = "x86_64")]
use crate::migration::get_vm_snapshot;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::migration::url_to_file;
use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
use crate::GuestMemoryMmap;
use crate::{
    PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
};
use anyhow::anyhow;
use arch::get_host_cpu_phys_bits;
#[cfg(target_arch = "x86_64")]
use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
#[cfg(feature = "tdx")]
use arch::x86_64::tdx::TdvfSection;
use arch::EntryPoint;
#[cfg(target_arch = "aarch64")]
use arch::PciSpaceInfo;
use arch::{NumaNode, NumaNodes};
#[cfg(target_arch = "aarch64")]
use devices::interrupt_controller;
use devices::AcpiNotificationFlags;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
use hypervisor::{HypervisorVmError, VmOps};
use libc::{termios, SIGWINCH};
use linux_loader::cmdline::Cmdline;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::bzimage::BzImage;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
#[cfg(target_arch = "aarch64")]
use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
use linux_loader::loader::KernelLoader;
use seccompiler::SeccompAction;
use serde::{Deserialize, Serialize};
use std::cmp;
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::fs::{File, OpenOptions};
use std::io::{self, Seek, SeekFrom, Write};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::num::Wrapping;
use std::ops::Deref;
use std::os::unix::net::UnixStream;
use std::sync::{Arc, Mutex, RwLock};
use std::time::Instant;
use std::{result, str, thread};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::Bus;
#[cfg(feature = "tdx")]
use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
use vm_memory::{
    Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
};
use vm_migration::protocol::{Request, Response};
use vm_migration::{
    protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot,
    Snapshottable, Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::sock_ctrl_msg::ScmSocket;

/// Errors associated with VM management
#[derive(Debug, Error)]
pub enum Error {
    #[error("Cannot open kernel file: {0}")]
    KernelFile(#[source] io::Error),

    #[error("Cannot open initramfs file: {0}")]
    InitramfsFile(#[source] io::Error),

    #[error("Cannot load the kernel into memory: {0}")]
    KernelLoad(#[source] linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot load the UEFI binary in memory: {0:?}")]
    UefiLoad(arch::aarch64::uefi::Error),

    #[error("Cannot load the initramfs into memory")]
    InitramfsLoad,

    #[error("Cannot load the kernel command line in memory: {0}")]
    LoadCmdLine(#[source] linux_loader::cmdline::Error),

    #[error("Failed to apply landlock config during vm_create: {0}")]
    ApplyLandlock(#[source] LandlockError),

    #[error("Cannot modify the kernel command line: {0}")]
    CmdLineInsertStr(#[source] linux_loader::cmdline::Error),

    #[error("Cannot create the kernel command line: {0}")]
    CmdLineCreate(#[source] linux_loader::cmdline::Error),

    #[error("Cannot configure system: {0}")]
    ConfigureSystem(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot enable interrupt controller: {0:?}")]
    EnableInterruptController(interrupt_controller::Error),

    #[error("VM state is poisoned")]
    PoisonedState,

    #[error("Error from device manager: {0:?}")]
    DeviceManager(DeviceManagerError),

    #[error("No device with id {0:?} to remove")]
    NoDeviceToRemove(String),

    #[error("Cannot spawn a signal handler thread: {0}")]
    SignalHandlerSpawn(#[source] io::Error),

    #[error("Failed to join on threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("VM config is missing")]
    VmMissingConfig,

    #[error("VM is not created")]
    VmNotCreated,

    #[error("VM is already created")]
    VmAlreadyCreated,

    #[error("VM is not running")]
    VmNotRunning,

    #[error("Cannot clone EventFd: {0}")]
    EventFdClone(#[source] io::Error),

    #[error("Invalid VM state transition: {0:?} to {1:?}")]
    InvalidStateTransition(VmState, VmState),

    #[error("Error from CPU manager: {0}")]
    CpuManager(#[source] cpu::Error),

    #[error("Cannot pause devices: {0}")]
    PauseDevices(#[source] MigratableError),

    #[error("Cannot resume devices: {0}")]
    ResumeDevices(#[source] MigratableError),

    #[error("Cannot pause CPUs: {0}")]
    PauseCpus(#[source] MigratableError),

    #[error("Cannot resume CPUs: {0}")]
    ResumeCpus(#[source] MigratableError),

    #[error("Cannot pause VM: {0}")]
    Pause(#[source] MigratableError),

    #[error("Cannot resume VM: {0}")]
    Resume(#[source] MigratableError),

    #[error("Memory manager error: {0:?}")]
{0:?}")] 197 MemoryManager(MemoryManagerError), 198 199 #[error("Eventfd write error: {0}")] 200 EventfdError(#[source] std::io::Error), 201 202 #[error("Cannot snapshot VM: {0}")] 203 Snapshot(#[source] MigratableError), 204 205 #[error("Cannot restore VM: {0}")] 206 Restore(#[source] MigratableError), 207 208 #[error("Cannot send VM snapshot: {0}")] 209 SnapshotSend(#[source] MigratableError), 210 211 #[error("Invalid restore source URL")] 212 InvalidRestoreSourceUrl, 213 214 #[error("Failed to validate config: {0}")] 215 ConfigValidation(#[source] ValidationError), 216 217 #[error("Too many virtio-vsock devices")] 218 TooManyVsockDevices, 219 220 #[error("Failed serializing into JSON: {0}")] 221 SerializeJson(#[source] serde_json::Error), 222 223 #[error("Invalid NUMA configuration")] 224 InvalidNumaConfig, 225 226 #[error("Cannot create seccomp filter: {0}")] 227 CreateSeccompFilter(#[source] seccompiler::Error), 228 229 #[error("Cannot apply seccomp filter: {0}")] 230 ApplySeccompFilter(#[source] seccompiler::Error), 231 232 #[error("Failed resizing a memory zone")] 233 ResizeZone, 234 235 #[error("Cannot activate virtio devices: {0:?}")] 236 ActivateVirtioDevices(DeviceManagerError), 237 238 #[error("Error triggering power button: {0:?}")] 239 PowerButton(DeviceManagerError), 240 241 #[error("Kernel lacks PVH header")] 242 KernelMissingPvhHeader, 243 244 #[error("Failed to allocate firmware RAM: {0:?}")] 245 AllocateFirmwareMemory(MemoryManagerError), 246 247 #[error("Error manipulating firmware file: {0}")] 248 FirmwareFile(#[source] std::io::Error), 249 250 #[error("Firmware too big")] 251 FirmwareTooLarge, 252 253 #[error("Failed to copy firmware to memory: {0}")] 254 FirmwareLoad(#[source] vm_memory::GuestMemoryError), 255 256 #[cfg(feature = "sev_snp")] 257 #[error("Error enabling SEV-SNP VM: {0}")] 258 InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError), 259 260 #[cfg(feature = "tdx")] 261 #[error("Error performing I/O on TDX firmware file: {0}")] 262 LoadTdvf(#[source] std::io::Error), 263 264 #[cfg(feature = "tdx")] 265 #[error("Error performing I/O on the TDX payload file: {0}")] 266 LoadPayload(#[source] std::io::Error), 267 268 #[cfg(feature = "tdx")] 269 #[error("Error parsing TDVF: {0}")] 270 ParseTdvf(#[source] arch::x86_64::tdx::TdvfError), 271 272 #[cfg(feature = "tdx")] 273 #[error("Error populating TDX HOB: {0}")] 274 PopulateHob(#[source] arch::x86_64::tdx::TdvfError), 275 276 #[cfg(feature = "tdx")] 277 #[error("Error allocating TDVF memory: {0:?}")] 278 AllocatingTdvfMemory(crate::memory_manager::Error), 279 280 #[cfg(feature = "tdx")] 281 #[error("Error enabling TDX VM: {0}")] 282 InitializeTdxVm(#[source] hypervisor::HypervisorVmError), 283 284 #[cfg(feature = "tdx")] 285 #[error("Error enabling TDX memory region: {0}")] 286 InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError), 287 288 #[cfg(feature = "tdx")] 289 #[error("Error finalizing TDX VM: {0}")] 290 FinalizeTdx(#[source] hypervisor::HypervisorVmError), 291 292 #[cfg(feature = "tdx")] 293 #[error("TDX firmware missing")] 294 TdxFirmwareMissing, 295 296 #[cfg(feature = "tdx")] 297 #[error("Invalid TDX payload type")] 298 InvalidPayloadType, 299 300 #[cfg(feature = "guest_debug")] 301 #[error("Error debugging VM: {0:?}")] 302 Debug(DebuggableError), 303 304 #[error("Error spawning kernel loading thread")] 305 KernelLoadThreadSpawn(std::io::Error), 306 307 #[error("Error joining kernel loading thread")] 308 KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 

    #[error("Payload configuration is not bootable")]
    InvalidPayload,

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    #[error("Error coredumping VM: {0:?}")]
    Coredump(GuestDebuggableError),

    #[cfg(feature = "igvm")]
    #[error("Cannot open igvm file: {0}")]
    IgvmFile(#[source] io::Error),

    #[cfg(feature = "igvm")]
    #[error("Cannot load the igvm into memory: {0}")]
    IgvmLoad(#[source] igvm_loader::Error),

    #[error("Error injecting NMI")]
    ErrorNmi,

    #[error("Error resuming the VM: {0}")]
    ResumeVm(#[source] hypervisor::HypervisorVmError),

    #[error("Error creating console devices")]
    CreateConsoleDevices(ConsoleDeviceError),
}

pub type Result<T> = result::Result<T, Error>;

#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
    BreakPoint,
}

impl VmState {
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
                VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
                    Ok(())
                }
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
            VmState::BreakPoint => match new_state {
                VmState::Created | VmState::Running => Ok(()),
                _ => Err(Error::InvalidStateTransition(self, new_state)),
            },
        }
    }
}

struct VmOpsHandler {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
}

impl VmOps for VmOpsHandler {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

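    // Port I/O only exists on x86; as with MMIO, accesses to unregistered
    // ports are logged and otherwise ignored rather than faulting the guest.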
    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            info!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}

pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
    let host_phys_bits = get_host_cpu_phys_bits(hypervisor);

    cmp::min(host_phys_bits, max_phys_bits)
}

pub struct Vm {
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
    numa_nodes: NumaNodes,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    stop_on_boot: bool,
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}

impl Vm {
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];

    #[allow(clippy::too_many_arguments)]
    pub fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        timestamp: Instant,
        console_info: Option<ConsoleInfo>,
        console_resize_pipe: Option<File>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new_from_memory_manager");

        let boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        #[cfg(not(feature = "igvm"))]
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(&memory_manager, &config)?
        } else {
            None
        };

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
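        // Each guest NUMA node ties together memory zones, vCPUs, PCI
        // segments and inter-node distances (see create_numa_nodes() below).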
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
        #[cfg(feature = "tdx")]
        let force_iommu = tdx_enabled;
        #[cfg(feature = "sev_snp")]
        let force_iommu = sev_snp_enabled;
        #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
        let force_iommu = false;

        #[cfg(feature = "guest_debug")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "guest_debug"))]
        let stop_on_boot = false;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::new(Bus::new());
        let mmio_bus = Arc::new(Bus::new());

        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus: io_bus.clone(),
            mmio_bus: mmio_bus.clone(),
        });

        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            vm.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt.try_clone().map_err(Error::EventFdClone)?,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            &hypervisor,
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        )
        .map_err(Error::CpuManager)?;

        #[cfg(target_arch = "x86_64")]
        cpu_manager
            .lock()
            .unwrap()
            .populate_cpuid(
                &memory_manager,
                &hypervisor,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CpuManager)?;

        // Loading the igvm file is pushed down here because the igvm parser
        // needs cpu_manager to retrieve the cpuid leaves. In the regular case
        // we can start loading early, but in the igvm case we have to wait
        // until cpu_manager has been created. Currently, Microsoft Hypervisor
        // does not provide any hypervisor-specific common cpuid, so we need
        // to call get_cpuid_values per cpuid through cpu_manager.
        #[cfg(feature = "igvm")]
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(
                &memory_manager,
                &config,
                &cpu_manager,
                #[cfg(feature = "sev_snp")]
                sev_snp_enabled,
            )?
        } else {
            None
        };
        // The initial TDX configuration must be done before the vCPUs are
        // created.
        #[cfg(feature = "tdx")]
        if tdx_enabled {
            let cpuid = cpu_manager.lock().unwrap().common_cpuid();
            let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
            vm.tdx_init(&cpuid, max_vcpus)
                .map_err(Error::InitializeTdxVm)?;
        }

        cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
            .map_err(Error::CpuManager)?;

        // This initial SEV-SNP configuration must be done immediately after
        // vCPUs are created. As part of this initialization we are
        // transitioning the guest into secure state.
        #[cfg(feature = "sev_snp")]
        if sev_snp_enabled {
            vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
        }

        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        let device_manager = DeviceManager::new(
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            cpu_manager.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            boot_id_list,
            timestamp,
            snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
            dynamic,
        )
        .map_err(Error::DeviceManager)?;

        device_manager
            .lock()
            .unwrap()
            .create_devices(console_info, console_resize_pipe, original_termios)
            .map_err(Error::DeviceManager)?;

        #[cfg(feature = "tdx")]
        let kernel = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.kernel.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.initramfs.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::InitramfsFile)?;

        #[cfg(target_arch = "x86_64")]
        let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
            let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
            vm_snapshot.clock
        } else {
            None
        };

        let vm_state = if snapshot.is_some() {
            VmState::Paused
        } else {
            VmState::Created
        };

        Ok(Vm {
            #[cfg(feature = "tdx")]
            kernel,
            initramfs,
            device_manager,
            config,
            threads: Vec::with_capacity(1),
            state: RwLock::new(vm_state),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(target_arch = "x86_64")]
            saved_clock,
            numa_nodes,
            hypervisor,
            stop_on_boot,
            load_payload_handle,
        })
    }

    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Can't define the same NUMA node twice");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(pci_segments) = &config.pci_segments {
                    node.pci_segments.extend(pci_segments);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
error!("Unknown destination NUMA node {}", dest); 757 return Err(Error::InvalidNumaConfig); 758 } 759 760 if node.distances.contains_key(&dest) { 761 error!("Destination NUMA node {} has been already set", dest); 762 return Err(Error::InvalidNumaConfig); 763 } 764 765 node.distances.insert(dest, dist); 766 } 767 } 768 769 #[cfg(target_arch = "x86_64")] 770 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 771 if let Some(sgx_epc_region) = mm.sgx_epc_region() { 772 let mm_sections = sgx_epc_region.epc_sections(); 773 for sgx_epc_section in sgx_epc_sections.iter() { 774 if let Some(mm_section) = mm_sections.get(sgx_epc_section) { 775 node.sgx_epc_sections.push(mm_section.clone()); 776 } else { 777 error!("Unknown SGX EPC section '{}'", sgx_epc_section); 778 return Err(Error::InvalidNumaConfig); 779 } 780 } 781 } else { 782 error!("Missing SGX EPC region"); 783 return Err(Error::InvalidNumaConfig); 784 } 785 } 786 787 numa_nodes.insert(config.guest_numa_id, node); 788 } 789 } 790 791 Ok(numa_nodes) 792 } 793 794 #[allow(clippy::too_many_arguments)] 795 pub fn new( 796 vm_config: Arc<Mutex<VmConfig>>, 797 exit_evt: EventFd, 798 reset_evt: EventFd, 799 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 800 seccomp_action: &SeccompAction, 801 hypervisor: Arc<dyn hypervisor::Hypervisor>, 802 activate_evt: EventFd, 803 console_info: Option<ConsoleInfo>, 804 console_resize_pipe: Option<File>, 805 original_termios: Arc<Mutex<Option<termios>>>, 806 snapshot: Option<Snapshot>, 807 source_url: Option<&str>, 808 prefault: Option<bool>, 809 ) -> Result<Self> { 810 trace_scoped!("Vm::new"); 811 812 let timestamp = Instant::now(); 813 814 #[cfg(feature = "tdx")] 815 let tdx_enabled = if snapshot.is_some() { 816 false 817 } else { 818 vm_config.lock().unwrap().is_tdx_enabled() 819 }; 820 821 #[cfg(feature = "sev_snp")] 822 let sev_snp_enabled = if snapshot.is_some() { 823 false 824 } else { 825 vm_config.lock().unwrap().is_sev_snp_enabled() 826 }; 827 828 let vm = Self::create_hypervisor_vm( 829 &hypervisor, 830 #[cfg(feature = "tdx")] 831 tdx_enabled, 832 #[cfg(feature = "sev_snp")] 833 sev_snp_enabled, 834 )?; 835 836 let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); 837 838 let memory_manager = if let Some(snapshot) = 839 snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID) 840 { 841 MemoryManager::new_from_snapshot( 842 &snapshot, 843 vm.clone(), 844 &vm_config.lock().unwrap().memory.clone(), 845 source_url, 846 prefault.unwrap(), 847 phys_bits, 848 ) 849 .map_err(Error::MemoryManager)? 850 } else { 851 #[cfg(target_arch = "x86_64")] 852 let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone(); 853 854 MemoryManager::new( 855 vm.clone(), 856 &vm_config.lock().unwrap().memory.clone(), 857 None, 858 phys_bits, 859 #[cfg(feature = "tdx")] 860 tdx_enabled, 861 None, 862 None, 863 #[cfg(target_arch = "x86_64")] 864 sgx_epc_config, 865 ) 866 .map_err(Error::MemoryManager)? 
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        vm_config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        console_info: Option<ConsoleInfo>,
        console_resize_pipe: Option<File>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
        source_url: Option<&str>,
        prefault: Option<bool>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new");

        let timestamp = Instant::now();

        #[cfg(feature = "tdx")]
        let tdx_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_tdx_enabled()
        };

        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_sev_snp_enabled()
        };

        let vm = Self::create_hypervisor_vm(
            &hypervisor,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        )?;

        let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);

        let memory_manager = if let Some(snapshot) =
            snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
        {
            MemoryManager::new_from_snapshot(
                &snapshot,
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                source_url,
                prefault.unwrap(),
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            #[cfg(target_arch = "x86_64")]
            let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();

            MemoryManager::new(
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                None,
                phys_bits,
                #[cfg(feature = "tdx")]
                tdx_enabled,
                None,
                None,
                #[cfg(target_arch = "x86_64")]
                sgx_epc_config,
            )
            .map_err(Error::MemoryManager)?
        };

        Vm::new_from_memory_manager(
            vm_config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            timestamp,
            console_info,
            console_resize_pipe,
            original_termios,
            snapshot,
        )
    }

    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        hypervisor.check_required_extensions().unwrap();

        cfg_if::cfg_if! {
            if #[cfg(feature = "tdx")] {
                // tdx_enabled is mapped to the VM type: KVM_X86_TDX_VM (1)
                // when true, KVM_X86_LEGACY_VM (0) otherwise.
                let vm = hypervisor
                    .create_vm_with_type(u64::from(tdx_enabled))
                    .unwrap();
            } else if #[cfg(feature = "sev_snp")] {
                // sev_snp_enabled is mapped to the VM type: SEV_SNP_ENABLED
                // (1) when true, SEV_SNP_DISABLED (0) otherwise.
                let vm = hypervisor
                    .create_vm_with_type(u64::from(sev_snp_enabled))
                    .unwrap();
            } else {
                let vm = hypervisor.create_vm().unwrap();
            }
        }

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }

    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let initramfs = self.initramfs.as_mut().unwrap();
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_volatile_from(address, initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    pub fn generate_cmdline(
        payload: &PayloadConfig,
        #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
    ) -> Result<Cmdline> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
        if let Some(s) = payload.cmdline.as_ref() {
            cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
        }

        #[cfg(target_arch = "aarch64")]
        for entry in device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        Ok(cmdline)
    }

    #[cfg(target_arch = "aarch64")]
    fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
        let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
        let mem = uefi_flash.memory();
        arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
            .map_err(Error::UefiLoad)?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn load_kernel(
        firmware: Option<File>,
        kernel: Option<File>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let entry_addr = match (firmware, kernel) {
            (None, Some(mut kernel)) => {
                // Try to load the binary as a kernel PE file first. If that
                // fails, retry loading it as a UEFI binary. Since the UEFI
                // binary is formatless, it must be the last option to try.
                match linux_loader::loader::pe::PE::load(
                    mem.deref(),
                    Some(arch::layout::KERNEL_START),
                    &mut kernel,
                    None,
                ) {
                    Ok(entry_addr) => entry_addr.kernel_load,
                    Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                        Self::load_firmware(&kernel, memory_manager)?;
                        arch::layout::UEFI_START
                    }
                    Err(e) => {
                        return Err(Error::KernelLoad(e));
                    }
                }
            }
            (Some(firmware), None) => {
                Self::load_firmware(&firmware, memory_manager)?;
                arch::layout::UEFI_START
            }
            _ => return Err(Error::InvalidPayload),
        };

        Ok(EntryPoint { entry_addr })
    }

    #[cfg(feature = "igvm")]
    fn load_igvm(
        igvm: File,
        memory_manager: Arc<Mutex<MemoryManager>>,
        cpu_manager: Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] host_data: &Option<String>,
    ) -> Result<EntryPoint> {
        let res = igvm_loader::load_igvm(
            &igvm,
            memory_manager,
            cpu_manager.clone(),
            "",
            #[cfg(feature = "sev_snp")]
            host_data,
        )
        .map_err(Error::IgvmLoad)?;

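        // Pick the guest entry point from the IGVM parsing result: with
        // SEV-SNP the guest starts from the measured VMSA (vmsa_gpa),
        // otherwise from the RIP recorded in the VMSA.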
        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
                    EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
                } else {
                    EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
                };
            } else {
                let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
            }
        };
        Ok(entry_point)
    }

    #[cfg(target_arch = "x86_64")]
    fn load_kernel(
        mut kernel: File,
        cmdline: Option<Cmdline>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        info!("Loading kernel");

        let mem = {
            let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
            guest_memory.memory()
        };

        // Try ELF binary with PVH boot.
        let entry_addr = linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        )
        // Try loading kernel as bzImage.
        .or_else(|_| {
            BzImage::load(
                mem.deref(),
                None,
                &mut kernel,
                Some(arch::layout::HIGH_RAM_START),
            )
        })
        .map_err(Error::KernelLoad)?;

        if let Some(cmdline) = cmdline {
            linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
                .map_err(Error::LoadCmdLine)?;
        }

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest
            info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint {
                entry_addr,
                setup_header: None,
            })
        } else if entry_addr.setup_header.is_some() {
            // Use the bzImage 32bit entry point to boot the guest
            info!(
                "bzImage kernel loaded: entry_addr = 0x{:x}",
                entry_addr.kernel_load.0
            );
            Ok(EntryPoint {
                entry_addr: entry_addr.kernel_load,
                setup_header: entry_addr.setup_header,
            })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

    #[cfg(target_arch = "x86_64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
        #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<EntryPoint> {
        trace_scoped!("load_payload");
        #[cfg(feature = "igvm")]
        {
            if let Some(_igvm_file) = &payload.igvm {
                let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
                #[cfg(feature = "sev_snp")]
                if sev_snp_enabled {
                    return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
                }
                #[cfg(not(feature = "sev_snp"))]
                return Self::load_igvm(igvm, memory_manager, cpu_manager);
            }
        }
        match (
            &payload.firmware,
            &payload.kernel,
            &payload.initramfs,
            &payload.cmdline,
        ) {
            (Some(firmware), None, None, None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(firmware, None, memory_manager)
            }
            (None, Some(kernel), _, _) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                let cmdline = Self::generate_cmdline(payload)?;
                Self::load_kernel(kernel, Some(cmdline), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    #[cfg(target_arch = "aarch64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        match (&payload.firmware, &payload.kernel) {
            (Some(firmware), None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(Some(firmware), None, memory_manager)
            }
            (None, Some(kernel)) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                Self::load_kernel(None, Some(kernel), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    fn load_payload_async(
        memory_manager: &Arc<Mutex<MemoryManager>>,
        config: &Arc<Mutex<VmConfig>>,
        #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
        // The kernel is loaded in a different manner when TDX is enabled.
        #[cfg(feature = "tdx")]
        if config.lock().unwrap().is_tdx_enabled() {
            return Ok(None);
        }

        config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|payload| {
                let memory_manager = memory_manager.clone();
                let payload = payload.clone();
                #[cfg(feature = "igvm")]
                let cpu_manager = cpu_manager.clone();

                std::thread::Builder::new()
                    .name("payload_loader".into())
                    .spawn(move || {
                        Self::load_payload(
                            &payload,
                            memory_manager,
                            #[cfg(feature = "igvm")]
                            cpu_manager,
                            #[cfg(feature = "sev_snp")]
                            sev_snp_enabled,
                        )
                    })
                    .map_err(Error::KernelLoadThreadSpawn)
            })
            .transpose()
    }

    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
        trace_scoped!("configure_system");
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
        let rsdp_addr = Some(rsdp_addr);
        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        let serial_number = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.serial_number.clone());

        let uuid = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.uuid.clone());

        let oem_strings = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.oem_strings.clone());

        let oem_strings = oem_strings
            .as_deref()
            .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());

        let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            arch::layout::CMDLINE_MAX_SIZE,
            &initramfs_config,
            boot_vcpus,
            entry_addr.setup_header,
            rsdp_addr,
            sgx_epc_region,
            serial_number.as_deref(),
            uuid.as_deref(),
            oem_strings.as_deref(),
            topology,
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }

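    // On aarch64 the system description reaches the guest as an FDT built
    // from the live device model, so the command line, PCI topology, GIC
    // and PMU state are all gathered here at boot time.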
    #[cfg(target_arch = "aarch64")]
    fn configure_system(
        &mut self,
        _rsdp_addr: GuestAddress,
        _entry_addr: EntryPoint,
    ) -> Result<()> {
        let cmdline = Self::generate_cmdline(
            self.config.lock().unwrap().payload.as_ref().unwrap(),
            &self.device_manager,
        )?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
            let pci_space = PciSpaceInfo {
                pci_segment_id: pci_segment.id,
                mmio_config_address: pci_segment.mmio_config_address,
                pci_device_space_start: pci_segment.start_of_mem64_area,
                pci_device_space_size: pci_segment.end_of_mem64_area
                    - pci_segment.start_of_mem64_area
                    + 1,
            };
            pci_space_info.push(pci_space);
        }

        let virtio_iommu_bdf = self
            .device_manager
            .lock()
            .unwrap()
            .iommu_attached_devices()
            .as_ref()
            .map(|(v, _)| *v);

        let vgic = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .get_vgic()
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::SetupGic,
                ))
            })?;

        // The PMU interrupt is a PPI, so 16 needs to be added to obtain the
        // real IRQ number.
        let pmu_supported = self
            .cpu_manager
            .lock()
            .unwrap()
            .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::VcpuInitPmu,
                ))
            })?;

        arch::configure_system(
            &mem,
            cmdline.as_cstring().unwrap().to_str().unwrap(),
            vcpu_mpidrs,
            vcpu_topology,
            device_info,
            &initramfs_config,
            &pci_space_info,
            virtio_iommu_bdf.map(|bdf| bdf.into()),
            &vgic,
            &self.numa_nodes,
            pmu_supported,
        )
        .map_err(Error::ConfigureSystem)?;

        Ok(())
    }

    pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
        self.device_manager.lock().unwrap().console_resize_pipe()
    }

    pub fn shutdown(&mut self) -> Result<()> {
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        let new_state = VmState::Shutdown;

        state.valid_transition(new_state)?;

        // Wake up the DeviceManager threads so they will get terminated cleanly
        self.device_manager
            .lock()
            .unwrap()
            .resume()
            .map_err(Error::Resume)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .shutdown()
            .map_err(Error::CpuManager)?;

        // Wait for all the threads to finish
        for thread in self.threads.drain(..) {
            thread.join().map_err(Error::ThreadCleanup)?
        }
        *state = new_state;

        Ok(())
    }

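    /// Resize the VM at runtime: vCPU count, total memory and/or the balloon
    /// target. The stored VmConfig is updated as well, so the new sizes
    /// survive a reboot.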
    pub fn resize(
        &mut self,
        desired_vcpus: Option<u8>,
        desired_memory: Option<u64>,
        desired_balloon: Option<u64>,
    ) -> Result<()> {
        event!("vm", "resizing");

        if let Some(desired_vcpus) = desired_vcpus {
            if self
                .cpu_manager
                .lock()
                .unwrap()
                .resize(desired_vcpus)
                .map_err(Error::CpuManager)?
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
        }

        if let Some(desired_memory) = desired_memory {
            let new_region = self
                .memory_manager
                .lock()
                .unwrap()
                .resize(desired_memory)
                .map_err(Error::MemoryManager)?;

            let memory_config = &mut self.config.lock().unwrap().memory;

            if let Some(new_region) = &new_region {
                self.device_manager
                    .lock()
                    .unwrap()
                    .update_memory(new_region)
                    .map_err(Error::DeviceManager)?;

                match memory_config.hotplug_method {
                    HotplugMethod::Acpi => {
                        self.device_manager
                            .lock()
                            .unwrap()
                            .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                            .map_err(Error::DeviceManager)?;
                    }
                    HotplugMethod::VirtioMem => {}
                }
            }

            // We update the VM config regardless of the actual guest resize
            // operation result (happened or not), so that if the VM reboots
            // it will be running with the last configured memory size.
            match memory_config.hotplug_method {
                HotplugMethod::Acpi => memory_config.size = desired_memory,
                HotplugMethod::VirtioMem => {
                    if desired_memory > memory_config.size {
                        memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                    } else {
                        memory_config.hotplugged_size = None;
                    }
                }
            }
        }

        if let Some(desired_balloon) = desired_balloon {
            self.device_manager
                .lock()
                .unwrap()
                .resize_balloon(desired_balloon)
                .map_err(Error::DeviceManager)?;

            // Update the configuration value for the balloon size to ensure
            // a reboot would use the right value.
            if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
                balloon_config.size = desired_balloon;
            }
        }

        event!("vm", "resized");

        Ok(())
    }

    pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
        let memory_config = &mut self.config.lock().unwrap().memory;

        if let Some(zones) = &mut memory_config.zones {
            for zone in zones.iter_mut() {
                if zone.id == id {
                    if desired_memory >= zone.size {
                        let hotplugged_size = desired_memory - zone.size;
                        self.memory_manager
                            .lock()
                            .unwrap()
                            .resize_zone(&id, desired_memory - zone.size)
                            .map_err(Error::MemoryManager)?;
                        // We update the memory zone config regardless of the
                        // actual 'resize-zone' operation result (happened or
                        // not), so that if the VM reboots it will be running
                        // with the last configured memory zone size.
                        zone.hotplugged_size = Some(hotplugged_size);

                        return Ok(());
                    } else {
                        error!(
                            "Cannot ask for less memory ({}) than boot RAM ({}) for \
                            this memory zone",
                            desired_memory, zone.size,
                        );
                        return Err(Error::ResizeZone);
                    }
                }
            }
        }

        error!("Could not find the memory zone {} for the resize", id);
        Err(Error::ResizeZone)
    }

    pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_device(&mut device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_user_device(&mut device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.user_devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn remove_device(&mut self, id: String) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .remove_device(id.clone())
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by removing the device. This is important to
        // ensure the device would not be created in case of a reboot.
        self.config.lock().unwrap().remove_device(&id);

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;
        Ok(())
    }

    pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_disk(&mut disk_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.disks, disk_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_fs(&mut fs_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.fs, fs_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_pmem(&mut pmem_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.pmem, pmem_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_net(&mut net_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.net, net_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vdpa(&mut vdpa_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.vdpa, vdpa_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vsock(&mut vsock_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
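        // Note: vsock is a singleton device, which is why the config stores
        // an Option<VsockConfig> here instead of growing a list.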
        {
            let mut config = self.config.lock().unwrap();
            config.vsock = Some(vsock_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
        Ok(self.device_manager.lock().unwrap().counters())
    }

    #[cfg(feature = "tdx")]
    fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
        use arch::x86_64::tdx::*;

        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        // The TDVF file contains a table of sections as well as code
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // For all the sections allocate some RAM backing them
        parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
    }

    #[cfg(feature = "tdx")]
    fn hob_memory_resources(
        mut sorted_sections: Vec<TdvfSection>,
        guest_memory: &GuestMemoryMmap,
    ) -> Vec<(u64, u64, bool)> {
        let mut list = Vec::new();

        let mut current_section = sorted_sections.pop();

        // RAM regions interleaved with TDVF sections
        let mut next_start_addr = 0;
        for region in guest_memory.iter() {
            let region_start = region.start_addr().0;
            let region_end = region.last_addr().0;
            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            loop {
                let (start, size, ram) = if let Some(section) = &current_section {
                    if section.address <= next_start_addr {
                        (section.address, section.size, false)
                    } else {
                        let last_addr = std::cmp::min(section.address - 1, region_end);
                        (next_start_addr, last_addr - next_start_addr + 1, true)
                    }
                } else {
                    (next_start_addr, region_end - next_start_addr + 1, true)
                };

                list.push((start, size, ram));

                if !ram {
                    current_section = sorted_sections.pop();
                }

                next_start_addr = start + size;

                if region_start > next_start_addr {
                    next_start_addr = region_start;
                }

                if next_start_addr > region_end {
                    break;
                }
            }
        }

        // Once all the interleaved sections have been processed, let's simply
        // pull the remaining ones.
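        // Any sections left in the queue fall outside the guest RAM ranges
        // walked above, so they are emitted directly as non-RAM resources.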
        if let Some(section) = current_section {
            list.push((section.address, section.size, false));
        }
        while let Some(section) = sorted_sections.pop() {
            list.push((section.address, section.size, false));
        }

        list
    }

    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(
        &mut self,
        sections: &[TdvfSection],
        guid_found: bool,
    ) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of sections as well as code
        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut payload_info = None;
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    mem.read_volatile_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    if let Some(payload_file) = self.kernel.as_mut() {
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_file
                            .read_volatile(&mut payload_header.as_bytes())
                            .unwrap();

                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        payload_file.rewind().map_err(Error::LoadPayload)?;
                        mem.read_volatile_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
1914 payload_info = Some(PayloadInfo { 1915 image_type: PayloadImageType::BzImage, 1916 entry_point: section.address, 1917 }); 1918 } 1919 } 1920 TdvfSectionType::PayloadParam => { 1921 info!("Copying payload parameters to guest memory"); 1922 let cmdline = Self::generate_cmdline( 1923 self.config.lock().unwrap().payload.as_ref().unwrap(), 1924 )?; 1925 mem.write_slice( 1926 cmdline.as_cstring().unwrap().as_bytes_with_nul(), 1927 GuestAddress(section.address), 1928 ) 1929 .unwrap(); 1930 } 1931 _ => {} 1932 } 1933 } 1934 1935 // Generate HOB 1936 let mut hob = TdHob::start(hob_offset.unwrap()); 1937 // Keep only the TempMem sections, sorted so that pop() returns the lowest address first, since hob_memory_resources() consumes them in ascending order. 1938 let mut sorted_sections = sections.to_vec(); 1939 sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem)); 1940 1941 sorted_sections.sort_by_key(|section| section.address); 1942 sorted_sections.reverse(); 1943 1944 for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) { 1945 hob.add_memory_resource(&mem, start, size, ram, guid_found) 1946 .map_err(Error::PopulateHob)?; 1947 } 1948 1949 // MMIO regions 1950 hob.add_mmio_resource( 1951 &mem, 1952 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1953 arch::layout::APIC_START.raw_value() 1954 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1955 ) 1956 .map_err(Error::PopulateHob)?; 1957 let start_of_device_area = self 1958 .memory_manager 1959 .lock() 1960 .unwrap() 1961 .start_of_device_area() 1962 .raw_value(); 1963 let end_of_device_area = self 1964 .memory_manager 1965 .lock() 1966 .unwrap() 1967 .end_of_device_area() 1968 .raw_value(); 1969 hob.add_mmio_resource( 1970 &mem, 1971 start_of_device_area, 1972 end_of_device_area - start_of_device_area, 1973 ) 1974 .map_err(Error::PopulateHob)?; 1975 1976 // Loop over the ACPI tables and copy them to the HOB. 1977 1978 for acpi_table in crate::acpi::create_acpi_tables_tdx( 1979 &self.device_manager, 1980 &self.cpu_manager, 1981 &self.memory_manager, 1982 &self.numa_nodes, 1983 ) { 1984 hob.add_acpi_table(&mem, acpi_table.as_slice()) 1985 .map_err(Error::PopulateHob)?; 1986 } 1987 1988 // If a payload info has been created, let's insert it into the HOB. 1989 if let Some(payload_info) = payload_info { 1990 hob.add_payload(&mem, payload_info) 1991 .map_err(Error::PopulateHob)?; 1992 } 1993 1994 hob.finish(&mem).map_err(Error::PopulateHob)?; 1995 1996 Ok(hob_offset) 1997 } 1998 1999 #[cfg(feature = "tdx")] 2000 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 2001 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2002 let mem = guest_memory.memory(); 2003 2004 for section in sections { 2005 self.vm 2006 .tdx_init_memory_region( 2007 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 2008 section.address, 2009 section.size, 2010 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 2011 section.attributes == 1, 2012 ) 2013 .map_err(Error::InitializeTdxMemoryRegion)?; 2014 } 2015 2016 Ok(()) 2017 } 2018 2019 // Creates the ACPI tables. 2020 // When TDX is used this is a no-op, since the tables are created and 2021 // passed to the guest while populating the HOB.
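// Returns the guest address of the RSDP, which boot() later hands to
// configure_system() so the guest can locate the tables.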
2022 2023 fn create_acpi_tables(&self) -> Option<GuestAddress> { 2024 #[cfg(feature = "tdx")] 2025 if self.config.lock().unwrap().is_tdx_enabled() { 2026 return None; 2027 } 2028 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 2029 let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); 2030 let rsdp_addr = crate::acpi::create_acpi_tables( 2031 &mem, 2032 &self.device_manager, 2033 &self.cpu_manager, 2034 &self.memory_manager, 2035 &self.numa_nodes, 2036 tpm_enabled, 2037 ); 2038 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 2039 2040 Some(rsdp_addr) 2041 } 2042 2043 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 2044 trace_scoped!("entry_point"); 2045 2046 self.load_payload_handle 2047 .take() 2048 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 2049 .transpose() 2050 } 2051 2052 pub fn boot(&mut self) -> Result<()> { 2053 trace_scoped!("Vm::boot"); 2054 let current_state = self.get_state()?; 2055 if current_state == VmState::Paused { 2056 return self.resume().map_err(Error::Resume); 2057 } 2058 2059 let new_state = if self.stop_on_boot { 2060 VmState::BreakPoint 2061 } else { 2062 VmState::Running 2063 }; 2064 current_state.valid_transition(new_state)?; 2065 2066 // Do this early to parallelise it with loading the kernel 2067 #[cfg(target_arch = "x86_64")] 2068 cfg_if::cfg_if! { 2069 if #[cfg(feature = "sev_snp")] { 2070 let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled(); 2071 let rsdp_addr = if sev_snp_enabled { 2072 // For a SEV-SNP guest the ACPI tables are provided via 2073 // IGVM, so skip the creation of ACPI tables and set the 2074 // rsdp addr to None. 2075 None 2076 } else { 2077 self.create_acpi_tables() 2078 }; 2079 } else { 2080 let rsdp_addr = self.create_acpi_tables(); 2081 } 2082 } 2083 2084 // Load the kernel synchronously, or if it was loaded asynchronously, 2085 // wait for the load to finish. 2086 let entry_point = self.entry_point()?; 2087 2088 #[cfg(feature = "tdx")] 2089 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2090 2091 // Configure the vcpus that have been created 2092 let vcpus = self.cpu_manager.lock().unwrap().vcpus(); 2093 for vcpu in vcpus { 2094 let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2095 let boot_setup = entry_point.map(|e| (e, guest_memory)); 2096 self.cpu_manager 2097 .lock() 2098 .unwrap() 2099 .configure_vcpu(vcpu, boot_setup) 2100 .map_err(Error::CpuManager)?; 2101 } 2102 2103 #[cfg(feature = "tdx")] 2104 let (sections, guid_found) = if tdx_enabled { 2105 self.extract_tdvf_sections()? 2106 } else { 2107 (Vec::new(), false) 2108 }; 2109 2110 // Configuring the TDX regions requires that the vCPUs are created. 2111 #[cfg(feature = "tdx")] 2112 let hob_address = if tdx_enabled { 2113 // TDX sections are written to memory. 2114 self.populate_tdx_sections(&sections, guid_found)? 2115 } else { 2116 None 2117 }; 2118 2119 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 2120 // available after they are configured. 2121 #[cfg(target_arch = "aarch64")] 2122 let rsdp_addr = self.create_acpi_tables(); 2123 2124 // Configure shared state based on loaded kernel 2125 entry_point 2126 .map(|entry_point| { 2127 // Safe to unwrap rsdp_addr as we know it can't be None when 2128 // the entry_point is Some. 2129 self.configure_system(rsdp_addr.unwrap(), entry_point) 2130 }) 2131 .transpose()?; 2132 2133 #[cfg(target_arch = "x86_64")] 2134 // Note: For x86, always call this function before invoking start boot vcpus.
2135 // Otherwise the guest would fail to boot because we haven't created the 2136 // userspace mappings needed to update the hypervisor about guest memory. 2137 // These mappings must be created before we start the vCPU threads for 2138 // the very first time. 2139 self.memory_manager 2140 .lock() 2141 .unwrap() 2142 .allocate_address_space() 2143 .map_err(Error::MemoryManager)?; 2144 2145 #[cfg(feature = "tdx")] 2146 if let Some(hob_address) = hob_address { 2147 // With the HOB address extracted, the vCPUs can have 2148 // their TDX state configured. 2149 self.cpu_manager 2150 .lock() 2151 .unwrap() 2152 .initialize_tdx(hob_address) 2153 .map_err(Error::CpuManager)?; 2154 // Let the hypervisor know which memory ranges are shared with the 2155 // guest. This prevents the guest from ignoring/discarding memory 2156 // regions provided by the host. 2157 self.init_tdx_memory(&sections)?; 2158 // With TDX memory and CPU state configured, TDX setup is complete. 2159 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2160 } 2161 2162 // Resume the VM for MSHV 2163 if current_state == VmState::Created { 2164 self.vm.resume().map_err(Error::ResumeVm)?; 2165 } 2166 2167 self.cpu_manager 2168 .lock() 2169 .unwrap() 2170 .start_boot_vcpus(new_state == VmState::BreakPoint) 2171 .map_err(Error::CpuManager)?; 2172 2173 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2174 *state = new_state; 2175 Ok(()) 2176 } 2177 2178 pub fn restore(&mut self) -> Result<()> { 2179 event!("vm", "restoring"); 2180 2181 #[cfg(target_arch = "x86_64")] 2182 // Note: For x86, always call this function before invoking start boot vcpus. 2183 // Otherwise the guest would fail to boot because we haven't created the 2184 // userspace mappings needed to update the hypervisor about guest memory. 2185 // These mappings must be created before we start the vCPU threads for 2186 // the very first time for the restored VM. 2187 self.memory_manager 2188 .lock() 2189 .unwrap() 2190 .allocate_address_space() 2191 .map_err(Error::MemoryManager)?; 2192 2193 // Now we can start all vCPUs from here. 2194 self.cpu_manager 2195 .lock() 2196 .unwrap() 2197 .start_restored_vcpus() 2198 .map_err(Error::CpuManager)?; 2199 2200 event!("vm", "restored"); 2201 Ok(()) 2202 } 2203 2204 /// Gets a thread-safe reference counted pointer to the VM configuration. 2205 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2206 Arc::clone(&self.config) 2207 } 2208 2209 /// Gets the VM state. Returns an error if the state is poisoned. 2210 pub fn get_state(&self) -> Result<VmState> { 2211 self.state 2212 .try_read() 2213 .map_err(|_| Error::PoisonedState) 2214 .map(|state| *state) 2215 } 2216 2217 /// Gets the actual size of the balloon.
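/// The value is the guest-acknowledged ("actual") balloon size in bytes,
/// as reported by the device manager.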
2218 pub fn balloon_size(&self) -> u64 { 2219 self.device_manager.lock().unwrap().balloon_size() 2220 } 2221 2222 pub fn send_memory_fds( 2223 &mut self, 2224 socket: &mut UnixStream, 2225 ) -> std::result::Result<(), MigratableError> { 2226 for (slot, fd) in self 2227 .memory_manager 2228 .lock() 2229 .unwrap() 2230 .memory_slot_fds() 2231 .drain() 2232 { 2233 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2234 .write_to(socket) 2235 .map_err(|e| { 2236 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2237 })?; 2238 socket 2239 .send_with_fd(&slot.to_le_bytes()[..], fd) 2240 .map_err(|e| { 2241 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2242 })?; 2243 2244 Response::read_from(socket)?.ok_or_abandon( 2245 socket, 2246 MigratableError::MigrateSend(anyhow!("Error during memory fd migration")), 2247 )?; 2248 } 2249 2250 Ok(()) 2251 } 2252 2253 pub fn send_memory_regions<F>( 2254 &mut self, 2255 ranges: &MemoryRangeTable, 2256 fd: &mut F, 2257 ) -> std::result::Result<(), MigratableError> 2258 where 2259 F: WriteVolatile, 2260 { 2261 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2262 let mem = guest_memory.memory(); 2263 2264 for range in ranges.regions() { 2265 let mut offset: u64 = 0; 2266 // Here we manually handle the retry in case we can't write the 2267 // whole region at once, because we can't use the write_all_to() 2268 // implementation from vm_memory::GuestMemory as it does not 2269 // follow the correct behavior. For more info about this issue 2270 // see: https://github.com/rust-vmm/vm-memory/issues/174 2271 loop { 2272 let bytes_written = mem 2273 .write_volatile_to( 2274 GuestAddress(range.gpa + offset), 2275 fd, 2276 (range.length - offset) as usize, 2277 ) 2278 .map_err(|e| { 2279 MigratableError::MigrateSend(anyhow!( 2280 "Error transferring memory to socket: {}", 2281 e 2282 )) 2283 })?; 2284 offset += bytes_written as u64; 2285 2286 if offset == range.length { 2287 break; 2288 } 2289 } 2290 } 2291 2292 Ok(()) 2293 } 2294 2295 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2296 self.memory_manager 2297 .lock() 2298 .unwrap() 2299 .memory_range_table(false) 2300 } 2301 2302 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2303 self.device_manager.lock().unwrap().device_tree() 2304 } 2305 2306 pub fn activate_virtio_devices(&self) -> Result<()> { 2307 self.device_manager 2308 .lock() 2309 .unwrap() 2310 .activate_virtio_devices() 2311 .map_err(Error::ActivateVirtioDevices) 2312 } 2313 2314 #[cfg(target_arch = "x86_64")] 2315 pub fn power_button(&self) -> Result<()> { 2316 self 2317 .device_manager 2318 .lock() 2319 .unwrap() 2320 .notify_power_button() 2321 .map_err(Error::PowerButton) 2322 } 2323 2324 #[cfg(target_arch = "aarch64")] 2325 pub fn power_button(&self) -> Result<()> { 2326 self.device_manager 2327 .lock() 2328 .unwrap() 2329 .notify_power_button() 2330 .map_err(Error::PowerButton) 2331 } 2332 2333 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2334 self.memory_manager.lock().unwrap().snapshot_data() 2335 } 2336 2337 #[cfg(feature = "guest_debug")] 2338 pub fn debug_request( 2339 &mut self, 2340 gdb_request: &GdbRequestPayload, 2341 cpu_id: usize, 2342 ) -> Result<GdbResponsePayload> { 2343 use GdbRequestPayload::*; 2344 match gdb_request { 2345 SetSingleStep(single_step) => { 2346 self.set_guest_debug(cpu_id, &[], *single_step) 2347 .map_err(Error::Debug)?; 2348 } 2349
SetHwBreakPoint(addrs) => { 2350 self.set_guest_debug(cpu_id, addrs, false) 2351 .map_err(Error::Debug)?; 2352 } 2353 Pause => { 2354 self.debug_pause().map_err(Error::Debug)?; 2355 } 2356 Resume => { 2357 self.debug_resume().map_err(Error::Debug)?; 2358 } 2359 ReadRegs => { 2360 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?; 2361 return Ok(GdbResponsePayload::RegValues(Box::new(regs))); 2362 } 2363 WriteRegs(regs) => { 2364 self.write_regs(cpu_id, regs).map_err(Error::Debug)?; 2365 } 2366 ReadMem(vaddr, len) => { 2367 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2368 let mem = self 2369 .read_mem(&guest_memory, cpu_id, *vaddr, *len) 2370 .map_err(Error::Debug)?; 2371 return Ok(GdbResponsePayload::MemoryRegion(mem)); 2372 } 2373 WriteMem(vaddr, data) => { 2374 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2375 self.write_mem(&guest_memory, cpu_id, vaddr, data) 2376 .map_err(Error::Debug)?; 2377 } 2378 ActiveVcpus => { 2379 let active_vcpus = self.active_vcpus(); 2380 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus)); 2381 } 2382 } 2383 Ok(GdbResponsePayload::CommandComplete) 2384 } 2385 2386 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2387 fn get_dump_state( 2388 &mut self, 2389 destination_url: &str, 2390 ) -> std::result::Result<DumpState, GuestDebuggableError> { 2391 let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32; 2392 let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize; 2393 let mut elf_phdr_num = 1; 2394 let elf_sh_info = 0; 2395 let coredump_file_path = url_to_file(destination_url)?; 2396 let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings(); 2397 2398 if mapping_num < UINT16_MAX - 2 { 2399 elf_phdr_num += mapping_num as u16; 2400 } else { 2401 panic!("mapping num beyond 65535 not supported"); 2402 } 2403 let coredump_file = OpenOptions::new() 2404 .read(true) 2405 .write(true) 2406 .create_new(true) 2407 .open(coredump_file_path) 2408 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2409 2410 let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size); 2411 let mem_data = self 2412 .memory_manager 2413 .lock() 2414 .unwrap() 2415 .coredump_memory_regions(mem_offset); 2416 2417 Ok(DumpState { 2418 elf_note_size, 2419 elf_phdr_num, 2420 elf_sh_info, 2421 mem_offset, 2422 mem_info: Some(mem_data), 2423 file: Some(coredump_file), 2424 }) 2425 } 2426 2427 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2428 // The memory dump is laid out right after the ELF header, the notes and the program header table; with the standard ELF64 sizes this comes to 64 + note_size + 56 * phdr_num bytes. fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 { 2429 size_of::<elf::Elf64_Ehdr>() as u64 2430 + note_size as u64 2431 + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64 2432 } 2433 2434 pub fn nmi(&self) -> Result<()> { 2435 self 2436 .cpu_manager 2437 .lock() 2438 .unwrap() 2439 .nmi() 2440 .map_err(|_| Error::ErrorNmi) 2441 } 2442 } 2443 2444 impl Pausable for Vm { 2445 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2446 event!("vm", "pausing"); 2447 let mut state = self 2448 .state 2449 .try_write() 2450 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?; 2451 let new_state = VmState::Paused; 2452 2453 state 2454 .valid_transition(new_state) 2455 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?; 2456 2457 #[cfg(target_arch = "x86_64")] 2458 { 2459 let mut clock = self 2460 .vm 2461 .get_clock() 2462 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
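// Save the guest clock so that resume() and snapshot() can restore it.
// The flags field returned by get_clock() is cleared first, as it is not
// meant to be fed back to the hypervisor when the clock is set again.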
VM clock: {}", e)))?; 2463 clock.reset_flags(); 2464 self.saved_clock = Some(clock); 2465 } 2466 2467 // Before pausing the vCPUs activate any pending virtio devices that might 2468 // need activation between starting the pause (or e.g. a migration it's part of) 2469 self.activate_virtio_devices().map_err(|e| { 2470 MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e)) 2471 })?; 2472 2473 self.cpu_manager.lock().unwrap().pause()?; 2474 self.device_manager.lock().unwrap().pause()?; 2475 2476 self.vm 2477 .pause() 2478 .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?; 2479 2480 *state = new_state; 2481 2482 event!("vm", "paused"); 2483 Ok(()) 2484 } 2485 2486 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2487 event!("vm", "resuming"); 2488 let current_state = self.get_state().unwrap(); 2489 let mut state = self 2490 .state 2491 .try_write() 2492 .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?; 2493 let new_state = VmState::Running; 2494 2495 state 2496 .valid_transition(new_state) 2497 .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?; 2498 2499 self.cpu_manager.lock().unwrap().resume()?; 2500 #[cfg(target_arch = "x86_64")] 2501 { 2502 if let Some(clock) = &self.saved_clock { 2503 self.vm.set_clock(clock).map_err(|e| { 2504 MigratableError::Resume(anyhow!("Could not set VM clock: {}", e)) 2505 })?; 2506 } 2507 } 2508 2509 if current_state == VmState::Paused { 2510 self.vm 2511 .resume() 2512 .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?; 2513 } 2514 2515 self.device_manager.lock().unwrap().resume()?; 2516 2517 // And we're back to the Running state. 2518 *state = new_state; 2519 event!("vm", "resumed"); 2520 Ok(()) 2521 } 2522 } 2523 2524 #[derive(Serialize, Deserialize)] 2525 pub struct VmSnapshot { 2526 #[cfg(target_arch = "x86_64")] 2527 pub clock: Option<hypervisor::ClockData>, 2528 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2529 pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>, 2530 } 2531 2532 pub const VM_SNAPSHOT_ID: &str = "vm"; 2533 impl Snapshottable for Vm { 2534 fn id(&self) -> String { 2535 VM_SNAPSHOT_ID.to_string() 2536 } 2537 2538 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2539 event!("vm", "snapshotting"); 2540 2541 #[cfg(feature = "tdx")] 2542 { 2543 if self.config.lock().unwrap().is_tdx_enabled() { 2544 return Err(MigratableError::Snapshot(anyhow!( 2545 "Snapshot not possible with TDX VM" 2546 ))); 2547 } 2548 } 2549 2550 let current_state = self.get_state().unwrap(); 2551 if current_state != VmState::Paused { 2552 return Err(MigratableError::Snapshot(anyhow!( 2553 "Trying to snapshot while VM is running" 2554 ))); 2555 } 2556 2557 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2558 let common_cpuid = { 2559 let amx = self.config.lock().unwrap().cpus.features.amx; 2560 let phys_bits = physical_bits( 2561 &self.hypervisor, 2562 self.config.lock().unwrap().cpus.max_phys_bits, 2563 ); 2564 arch::generate_common_cpuid( 2565 &self.hypervisor, 2566 &arch::CpuidConfig { 2567 sgx_epc_sections: None, 2568 phys_bits, 2569 kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv, 2570 #[cfg(feature = "tdx")] 2571 tdx: false, 2572 amx, 2573 }, 2574 ) 2575 .map_err(|e| { 2576 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e)) 2577 })? 
2578 }; 2579 2580 let vm_snapshot_state = VmSnapshot { 2581 #[cfg(target_arch = "x86_64")] 2582 clock: self.saved_clock, 2583 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2584 common_cpuid, 2585 }; 2586 2587 let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?; 2588 2589 let (id, snapshot) = { 2590 let mut cpu_manager = self.cpu_manager.lock().unwrap(); 2591 (cpu_manager.id(), cpu_manager.snapshot()?) 2592 }; 2593 vm_snapshot.add_snapshot(id, snapshot); 2594 let (id, snapshot) = { 2595 let mut memory_manager = self.memory_manager.lock().unwrap(); 2596 (memory_manager.id(), memory_manager.snapshot()?) 2597 }; 2598 vm_snapshot.add_snapshot(id, snapshot); 2599 let (id, snapshot) = { 2600 let mut device_manager = self.device_manager.lock().unwrap(); 2601 (device_manager.id(), device_manager.snapshot()?) 2602 }; 2603 vm_snapshot.add_snapshot(id, snapshot); 2604 2605 event!("vm", "snapshotted"); 2606 Ok(vm_snapshot) 2607 } 2608 } 2609 2610 impl Transportable for Vm { 2611 fn send( 2612 &self, 2613 snapshot: &Snapshot, 2614 destination_url: &str, 2615 ) -> std::result::Result<(), MigratableError> { 2616 let mut snapshot_config_path = url_to_path(destination_url)?; 2617 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2618 2619 // Create the snapshot config file 2620 let mut snapshot_config_file = OpenOptions::new() 2621 .read(true) 2622 .write(true) 2623 .create_new(true) 2624 .open(snapshot_config_path) 2625 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2626 2627 // Serialize and write the snapshot config 2628 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2629 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2630 2631 snapshot_config_file 2632 .write_all(vm_config.as_bytes()) 2633 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2634 2635 let mut snapshot_state_path = url_to_path(destination_url)?; 2636 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2637 2638 // Create the snapshot state file 2639 let mut snapshot_state_file = OpenOptions::new() 2640 .read(true) 2641 .write(true) 2642 .create_new(true) 2643 .open(snapshot_state_path) 2644 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2645 2646 // Serialize and write the snapshot state 2647 let vm_state = 2648 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2649 2650 snapshot_state_file 2651 .write_all(&vm_state) 2652 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2653 2654 // Tell the memory manager to also send/write its own snapshot.
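// The guest RAM content itself is not part of the JSON state written
// above; the memory manager transports it separately to the same
// destination.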
2655 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2656 self.memory_manager 2657 .lock() 2658 .unwrap() 2659 .send(memory_manager_snapshot, destination_url)?; 2660 } else { 2661 return Err(MigratableError::MigrateSend(anyhow!( 2662 "Missing memory manager snapshot" 2663 ))); 2664 } 2665 2666 Ok(()) 2667 } 2668 } 2669 2670 impl Migratable for Vm { 2671 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2672 self.memory_manager.lock().unwrap().start_dirty_log()?; 2673 self.device_manager.lock().unwrap().start_dirty_log() 2674 } 2675 2676 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2677 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2678 self.device_manager.lock().unwrap().stop_dirty_log() 2679 } 2680 2681 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2682 Ok(MemoryRangeTable::new_from_tables(vec![ 2683 self.memory_manager.lock().unwrap().dirty_log()?, 2684 self.device_manager.lock().unwrap().dirty_log()?, 2685 ])) 2686 } 2687 2688 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2689 self.memory_manager.lock().unwrap().start_migration()?; 2690 self.device_manager.lock().unwrap().start_migration() 2691 } 2692 2693 fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { 2694 self.memory_manager.lock().unwrap().complete_migration()?; 2695 self.device_manager.lock().unwrap().complete_migration() 2696 } 2697 } 2698 2699 #[cfg(feature = "guest_debug")] 2700 impl Debuggable for Vm { 2701 fn set_guest_debug( 2702 &self, 2703 cpu_id: usize, 2704 addrs: &[GuestAddress], 2705 singlestep: bool, 2706 ) -> std::result::Result<(), DebuggableError> { 2707 self.cpu_manager 2708 .lock() 2709 .unwrap() 2710 .set_guest_debug(cpu_id, addrs, singlestep) 2711 } 2712 2713 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2714 if *self.state.read().unwrap() == VmState::Running { 2715 self.pause().map_err(DebuggableError::Pause)?; 2716 } 2717 2718 let mut state = self 2719 .state 2720 .try_write() 2721 .map_err(|_| DebuggableError::PoisonedState)?; 2722 *state = VmState::BreakPoint; 2723 Ok(()) 2724 } 2725 2726 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2727 if *self.state.read().unwrap() == VmState::BreakPoint { 2728 self.resume().map_err(DebuggableError::Pause)?; 2729 } 2730 2731 Ok(()) 2732 } 2733 2734 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2735 self.cpu_manager.lock().unwrap().read_regs(cpu_id) 2736 } 2737 2738 fn write_regs( 2739 &self, 2740 cpu_id: usize, 2741 regs: &CoreRegs, 2742 ) -> std::result::Result<(), DebuggableError> { 2743 self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs) 2744 } 2745 2746 fn read_mem( 2747 &self, 2748 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2749 cpu_id: usize, 2750 vaddr: GuestAddress, 2751 len: usize, 2752 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2753 self.cpu_manager 2754 .lock() 2755 .unwrap() 2756 .read_mem(guest_memory, cpu_id, vaddr, len) 2757 } 2758 2759 fn write_mem( 2760 &self, 2761 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2762 cpu_id: usize, 2763 vaddr: &GuestAddress, 2764 data: &[u8], 2765 ) -> std::result::Result<(), DebuggableError> { 2766 self.cpu_manager 2767 .lock() 2768 .unwrap() 2769 .write_mem(guest_memory, cpu_id, vaddr, data) 2770 } 2771 2772 fn active_vcpus(&self) -> usize { 2773 let active_vcpus =
self.cpu_manager.lock().unwrap().active_vcpus(); 2774 if active_vcpus > 0 { 2775 active_vcpus 2776 } else { 2777 // The VM is not booted yet. Report boot_vcpus() instead. 2778 self.cpu_manager.lock().unwrap().boot_vcpus() as usize 2779 } 2780 } 2781 } 2782 2783 #[cfg(feature = "guest_debug")] 2784 pub const UINT16_MAX: u32 = 65535; 2785 2786 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2787 impl Elf64Writable for Vm {} 2788 2789 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2790 impl GuestDebuggable for Vm { 2791 fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> { 2792 event!("vm", "coredumping"); 2793 2794 let mut resume = false; 2795 2796 #[cfg(feature = "tdx")] 2797 { 2798 if let Some(ref platform) = self.config.lock().unwrap().platform { 2799 if platform.tdx { 2800 return Err(GuestDebuggableError::Coredump(anyhow!( 2801 "Coredump not possible with TDX VM" 2802 ))); 2803 } 2804 } 2805 } 2806 2807 match self.get_state().unwrap() { 2808 VmState::Running => { 2809 self.pause().map_err(GuestDebuggableError::Pause)?; 2810 resume = true; 2811 } 2812 VmState::Paused => {} 2813 _ => { 2814 return Err(GuestDebuggableError::Coredump(anyhow!( 2815 "Trying to coredump while VM is not running or paused" 2816 ))); 2817 } 2818 } 2819 2820 let coredump_state = self.get_dump_state(destination_url)?; 2821 2822 self.write_header(&coredump_state)?; 2823 self.write_note(&coredump_state)?; 2824 self.write_loads(&coredump_state)?; 2825 2826 self.cpu_manager 2827 .lock() 2828 .unwrap() 2829 .cpu_write_elf64_note(&coredump_state)?; 2830 self.cpu_manager 2831 .lock() 2832 .unwrap() 2833 .cpu_write_vmm_note(&coredump_state)?; 2834 2835 self.memory_manager 2836 .lock() 2837 .unwrap() 2838 .coredump_iterate_save_mem(&coredump_state)?; 2839 2840 if resume { 2841 self.resume().map_err(GuestDebuggableError::Resume)?; 2842 } 2843 2844 Ok(()) 2845 } 2846 } 2847 2848 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2849 #[cfg(test)] 2850 mod tests { 2851 use super::*; 2852 2853 fn test_vm_state_transitions(state: VmState) { 2854 match state { 2855 VmState::Created => { 2856 // Check the transitions from Created 2857 assert!(state.valid_transition(VmState::Created).is_err()); 2858 assert!(state.valid_transition(VmState::Running).is_ok()); 2859 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2860 assert!(state.valid_transition(VmState::Paused).is_ok()); 2861 assert!(state.valid_transition(VmState::BreakPoint).is_ok()); 2862 } 2863 VmState::Running => { 2864 // Check the transitions from Running 2865 assert!(state.valid_transition(VmState::Created).is_err()); 2866 assert!(state.valid_transition(VmState::Running).is_err()); 2867 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2868 assert!(state.valid_transition(VmState::Paused).is_ok()); 2869 assert!(state.valid_transition(VmState::BreakPoint).is_ok()); 2870 } 2871 VmState::Shutdown => { 2872 // Check the transitions from Shutdown 2873 assert!(state.valid_transition(VmState::Created).is_err()); 2874 assert!(state.valid_transition(VmState::Running).is_ok()); 2875 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2876 assert!(state.valid_transition(VmState::Paused).is_err()); 2877 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2878 } 2879 VmState::Paused => { 2880 // Check the transitions from Paused 2881 assert!(state.valid_transition(VmState::Created).is_err()); 2882 assert!(state.valid_transition(VmState::Running).is_ok()); 2883 
assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2884 assert!(state.valid_transition(VmState::Paused).is_err()); 2885 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2886 } 2887 VmState::BreakPoint => { 2888 // Check the transitions from BreakPoint 2889 assert!(state.valid_transition(VmState::Created).is_ok()); 2890 assert!(state.valid_transition(VmState::Running).is_ok()); 2891 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2892 assert!(state.valid_transition(VmState::Paused).is_err()); 2893 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2894 } 2895 } 2896 } 2897 2898 #[test] 2899 fn test_vm_created_transitions() { 2900 test_vm_state_transitions(VmState::Created); 2901 } 2902 2903 #[test] 2904 fn test_vm_running_transitions() { 2905 test_vm_state_transitions(VmState::Running); 2906 } 2907 2908 #[test] 2909 fn test_vm_shutdown_transitions() { 2910 test_vm_state_transitions(VmState::Shutdown); 2911 } 2912 2913 #[test] 2914 fn test_vm_paused_transitions() { 2915 test_vm_state_transitions(VmState::Paused); 2916 } 2917 2918 #[cfg(feature = "tdx")] 2919 #[test] 2920 fn test_hob_memory_resources() { 2921 // Case 1: Two TDVF sections in the middle of the RAM 2922 let sections = vec![ 2923 TdvfSection { 2924 address: 0xc000, 2925 size: 0x1000, 2926 ..Default::default() 2927 }, 2928 TdvfSection { 2929 address: 0x1000, 2930 size: 0x4000, 2931 ..Default::default() 2932 }, 2933 ]; 2934 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)]; 2935 let expected = vec![ 2936 (0, 0x1000, true), 2937 (0x1000, 0x4000, false), 2938 (0x5000, 0x7000, true), 2939 (0xc000, 0x1000, false), 2940 (0xd000, 0x0fff_3000, true), 2941 ]; 2942 assert_eq!( 2943 expected, 2944 Vm::hob_memory_resources( 2945 sections, 2946 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 2947 ) 2948 ); 2949 2950 // Case 2: Two TDVF sections with no conflict with the RAM 2951 let sections = vec![ 2952 TdvfSection { 2953 address: 0x1000_1000, 2954 size: 0x1000, 2955 ..Default::default() 2956 }, 2957 TdvfSection { 2958 address: 0, 2959 size: 0x1000, 2960 ..Default::default() 2961 }, 2962 ]; 2963 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 2964 let expected = vec![ 2965 (0, 0x1000, false), 2966 (0x1000, 0x1000_0000, true), 2967 (0x1000_1000, 0x1000, false), 2968 ]; 2969 assert_eq!( 2970 expected, 2971 Vm::hob_memory_resources( 2972 sections, 2973 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 2974 ) 2975 ); 2976 2977 // Case 3: Two TDVF sections with partial conflicts with the RAM 2978 let sections = vec![ 2979 TdvfSection { 2980 address: 0x1000_0000, 2981 size: 0x2000, 2982 ..Default::default() 2983 }, 2984 TdvfSection { 2985 address: 0, 2986 size: 0x2000, 2987 ..Default::default() 2988 }, 2989 ]; 2990 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 2991 let expected = vec![ 2992 (0, 0x2000, false), 2993 (0x2000, 0x0fff_e000, true), 2994 (0x1000_0000, 0x2000, false), 2995 ]; 2996 assert_eq!( 2997 expected, 2998 Vm::hob_memory_resources( 2999 sections, 3000 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3001 ) 3002 ); 3003 3004 // Case 4: Two TDVF sections with no conflict before the RAM and two 3005 // more sections with no conflict after the RAM.
3006 let sections = vec![ 3007 TdvfSection { 3008 address: 0x2000_1000, 3009 size: 0x1000, 3010 ..Default::default() 3011 }, 3012 TdvfSection { 3013 address: 0x2000_0000, 3014 size: 0x1000, 3015 ..Default::default() 3016 }, 3017 TdvfSection { 3018 address: 0x1000, 3019 size: 0x1000, 3020 ..Default::default() 3021 }, 3022 TdvfSection { 3023 address: 0, 3024 size: 0x1000, 3025 ..Default::default() 3026 }, 3027 ]; 3028 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)]; 3029 let expected = vec![ 3030 (0, 0x1000, false), 3031 (0x1000, 0x1000, false), 3032 (0x4000, 0x1000_0000, true), 3033 (0x2000_0000, 0x1000, false), 3034 (0x2000_1000, 0x1000, false), 3035 ]; 3036 assert_eq!( 3037 expected, 3038 Vm::hob_memory_resources( 3039 sections, 3040 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3041 ) 3042 ); 3043 3044 // Case 5: One TDVF section overriding the entire RAM 3045 let sections = vec![TdvfSection { 3046 address: 0, 3047 size: 0x2000_0000, 3048 ..Default::default() 3049 }]; 3050 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3051 let expected = vec![(0, 0x2000_0000, false)]; 3052 assert_eq!( 3053 expected, 3054 Vm::hob_memory_resources( 3055 sections, 3056 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3057 ) 3058 ); 3059 3060 // Case 6: Two TDVF sections with no conflict with 2 RAM regions 3061 let sections = vec![ 3062 TdvfSection { 3063 address: 0x1000_2000, 3064 size: 0x2000, 3065 ..Default::default() 3066 }, 3067 TdvfSection { 3068 address: 0, 3069 size: 0x2000, 3070 ..Default::default() 3071 }, 3072 ]; 3073 let guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3074 (GuestAddress(0x2000), 0x1000_0000), 3075 (GuestAddress(0x1000_4000), 0x1000_0000), 3076 ]; 3077 let expected = vec![ 3078 (0, 0x2000, false), 3079 (0x2000, 0x1000_0000, true), 3080 (0x1000_2000, 0x2000, false), 3081 (0x1000_4000, 0x1000_0000, true), 3082 ]; 3083 assert_eq!( 3084 expected, 3085 Vm::hob_memory_resources( 3086 sections, 3087 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3088 ) 3089 ); 3090 3091 // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions 3092 let sections = vec![ 3093 TdvfSection { 3094 address: 0x1000_0000, 3095 size: 0x4000, 3096 ..Default::default() 3097 }, 3098 TdvfSection { 3099 address: 0, 3100 size: 0x4000, 3101 ..Default::default() 3102 }, 3103 ]; 3104 let guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3105 (GuestAddress(0x1000), 0x1000_0000), 3106 (GuestAddress(0x1000_3000), 0x1000_0000), 3107 ]; 3108 let expected = vec![ 3109 (0, 0x4000, false), 3110 (0x4000, 0x0fff_c000, true), 3111 (0x1000_0000, 0x4000, false), 3112 (0x1000_4000, 0x0fff_f000, true), 3113 ]; 3114 assert_eq!( 3115 expected, 3116 Vm::hob_memory_resources( 3117 sections, 3118 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3119 ) 3120 ); 3121 } 3122 } 3123 3124 #[cfg(target_arch = "aarch64")] 3125 #[cfg(test)] 3126 mod tests { 3127 use super::*; 3128 use arch::aarch64::fdt::create_fdt; 3129 use arch::aarch64::layout; 3130 use arch::{DeviceType, MmioDeviceInfo}; 3131 use devices::gic::Gic; 3132 3133 const LEN: u64 = 4096; 3134 3135 #[test] 3136 fn test_create_fdt_with_devices() { 3137 let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)]; 3138 let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory"); 3139 3140 let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [ 3141 ( 3142 (DeviceType::Serial,
DeviceType::Serial.to_string()), 3143 MmioDeviceInfo { 3144 addr: 0x00, 3145 len: LEN, 3146 irq: 33, 3147 }, 3148 ), 3149 ( 3150 (DeviceType::Virtio(1), "virtio".to_string()), 3151 MmioDeviceInfo { 3152 addr: LEN, 3153 len: LEN, 3154 irq: 34, 3155 }, 3156 ), 3157 ( 3158 (DeviceType::Rtc, "rtc".to_string()), 3159 MmioDeviceInfo { 3160 addr: 2 * LEN, 3161 len: LEN, 3162 irq: 35, 3163 }, 3164 ), 3165 ] 3166 .iter() 3167 .cloned() 3168 .collect(); 3169 3170 let hv = hypervisor::new().unwrap(); 3171 let vm = hv.create_vm().unwrap(); 3172 let gic = vm 3173 .create_vgic(Gic::create_default_config(1)) 3174 .expect("Cannot create gic"); 3175 assert!(create_fdt( 3176 &mem, 3177 "console=tty0", 3178 vec![0], 3179 Some((0, 0, 0)), 3180 &dev_info, 3181 &gic, 3182 &None, 3183 &Vec::new(), 3184 &BTreeMap::new(), 3185 None, 3186 true, 3187 ) 3188 .is_ok()) 3189 } 3190 } 3191 3192 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 3193 #[test] 3194 pub fn test_vm() { 3195 use hypervisor::VmExit; 3196 use vm_memory::{Address, GuestMemory, GuestMemoryRegion}; 3197 // This example is based on https://lwn.net/Articles/658511/ 3198 let code = [ 3199 0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */ 3200 0x00, 0xd8, /* add %bl, %al */ 3201 0x04, b'0', /* add $'0', %al */ 3202 0xee, /* out %al, (%dx) */ 3203 0xb0, b'\n', /* mov $'\n', %al */ 3204 0xee, /* out %al, (%dx) */ 3205 0xf4, /* hlt */ 3206 ]; 3207 3208 let mem_size = 0x1000; 3209 let load_addr = GuestAddress(0x1000); 3210 let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap(); 3211 3212 let hv = hypervisor::new().unwrap(); 3213 let vm = hv.create_vm().expect("new VM creation failed"); 3214 3215 for (index, region) in mem.iter().enumerate() { 3216 let mem_region = vm.make_user_memory_region( 3217 index as u32, 3218 region.start_addr().raw_value(), 3219 region.len(), 3220 region.as_ptr() as u64, 3221 false, 3222 false, 3223 ); 3224 3225 vm.create_user_memory_region(mem_region) 3226 .expect("Cannot configure guest memory"); 3227 } 3228 mem.write_slice(&code, load_addr) 3229 .expect("Writing code to memory failed"); 3230 3231 let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed"); 3232 3233 let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed"); 3234 vcpu_sregs.cs.base = 0; 3235 vcpu_sregs.cs.selector = 0; 3236 vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed"); 3237 3238 let mut vcpu_regs = vcpu.get_regs().expect("get regs failed"); 3239 vcpu_regs.rip = 0x1000; 3240 vcpu_regs.rax = 2; 3241 vcpu_regs.rbx = 3; 3242 vcpu_regs.rflags = 2; 3243 vcpu.set_regs(&vcpu_regs).expect("set regs failed"); 3244 3245 loop { 3246 match vcpu.run().expect("run failed") { 3247 VmExit::Reset => { 3248 println!("HLT"); 3249 break; 3250 } 3251 VmExit::Ignore => {} 3252 r => panic!("unexpected exit reason: {r:?}"), 3253 } 3254 } 3255 } 3256