1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::{ 15 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 16 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 17 }; 18 use crate::config::{NumaConfig, PayloadConfig}; 19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 20 use crate::coredump::{ 21 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 22 }; 23 use crate::cpu; 24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair}; 25 use crate::device_tree::DeviceTree; 26 #[cfg(feature = "guest_debug")] 27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 28 #[cfg(feature = "igvm")] 29 use crate::igvm::igvm_loader; 30 use crate::memory_manager::{ 31 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 32 }; 33 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 34 use crate::migration::get_vm_snapshot; 35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 36 use crate::migration::url_to_file; 37 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 38 use crate::GuestMemoryMmap; 39 use crate::{ 40 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 41 }; 42 use anyhow::anyhow; 43 use arch::get_host_cpu_phys_bits; 44 #[cfg(target_arch = "x86_64")] 45 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 46 #[cfg(feature = "tdx")] 47 use arch::x86_64::tdx::TdvfSection; 48 use arch::EntryPoint; 49 #[cfg(target_arch = "aarch64")] 50 use arch::PciSpaceInfo; 51 use arch::{NumaNode, NumaNodes}; 52 #[cfg(target_arch = "aarch64")] 53 use devices::interrupt_controller; 54 use devices::AcpiNotificationFlags; 55 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 56 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 57 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 58 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 59 use hypervisor::{HypervisorVmError, VmOps}; 60 use libc::{termios, SIGWINCH}; 61 use linux_loader::cmdline::Cmdline; 62 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 63 use linux_loader::elf; 64 #[cfg(target_arch = "x86_64")] 65 use linux_loader::loader::bzimage::BzImage; 66 #[cfg(target_arch = "x86_64")] 67 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 68 #[cfg(target_arch = "aarch64")] 69 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 70 use linux_loader::loader::KernelLoader; 71 use seccompiler::SeccompAction; 72 use serde::{Deserialize, Serialize}; 73 use std::cmp; 74 use std::collections::BTreeMap; 75 use std::collections::HashMap; 76 use std::fs::{File, OpenOptions}; 77 use std::io::{self, Seek, SeekFrom, Write}; 78 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 79 use std::mem::size_of; 80 use std::num::Wrapping; 81 use std::ops::Deref; 82 use std::os::unix::net::UnixStream; 83 use std::sync::{Arc, Mutex, RwLock}; 84 use std::time::Instant; 85 use std::{result, str, thread}; 86 use thiserror::Error; 87 use tracer::trace_scoped; 88 use vm_device::Bus; 89 #[cfg(feature = "tdx")] 90 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile}; 91 use vm_memory::{ 92 Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile, 93 }; 94 use vm_migration::protocol::{Request, Response, Status}; 95 use vm_migration::{ 96 protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, 97 SnapshotData, Snapshottable, Transportable, 98 }; 99 use vmm_sys_util::eventfd::EventFd; 100 use vmm_sys_util::sock_ctrl_msg::ScmSocket; 101 102 /// Errors associated with VM management 103 #[derive(Debug, Error)] 104 pub enum Error { 105 #[error("Cannot open kernel file: {0}")] 106 KernelFile(#[source] io::Error), 107 108 #[error("Cannot open initramfs file: {0}")] 109 InitramfsFile(#[source] io::Error), 110 111 #[error("Cannot load the kernel into memory: {0}")] 112 KernelLoad(#[source] linux_loader::loader::Error), 113 114 #[cfg(target_arch = "aarch64")] 115 #[error("Cannot load the UEFI binary in memory: {0:?}")] 116 UefiLoad(arch::aarch64::uefi::Error), 117 118 #[error("Cannot load the initramfs into memory")] 119 InitramfsLoad, 120 121 #[error("Cannot load the kernel command line in memory: {0}")] 122 LoadCmdLine(#[source] linux_loader::loader::Error), 123 124 #[error("Cannot modify the kernel command line: {0}")] 125 CmdLineInsertStr(#[source] linux_loader::cmdline::Error), 126 127 #[error("Cannot create the kernel command line: {0}")] 128 CmdLineCreate(#[source] linux_loader::cmdline::Error), 129 130 #[error("Cannot configure system: {0}")] 131 ConfigureSystem(#[source] arch::Error), 132 133 #[cfg(target_arch = "aarch64")] 134 #[error("Cannot enable interrupt controller: {0:?}")] 135 EnableInterruptController(interrupt_controller::Error), 136 137 #[error("VM state is poisoned")] 138 PoisonedState, 139 140 #[error("Error from device manager: {0:?}")] 141 DeviceManager(DeviceManagerError), 142 143 #[error("No device with id {0:?} to remove")] 144 NoDeviceToRemove(String), 145 146 #[error("Cannot spawn a signal handler thread: {0}")] 147 SignalHandlerSpawn(#[source] io::Error), 148 149 #[error("Failed to join on threads: {0:?}")] 150 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 151 152 #[error("VM config is missing")] 153 VmMissingConfig, 154 155 #[error("VM is not created")] 156 VmNotCreated, 157 158 #[error("VM is already created")] 159 VmAlreadyCreated, 160 161 #[error("VM is not running")] 162 VmNotRunning, 163 164 #[error("Cannot clone EventFd: {0}")] 165 EventFdClone(#[source] io::Error), 166 167 #[error("invalid VM state transition: {0:?} to {1:?}")] 168 InvalidStateTransition(VmState, VmState), 169 170 #[error("Error from CPU manager: {0}")] 171 CpuManager(#[source] cpu::Error), 172 173 #[error("Cannot pause devices: {0}")] 174 PauseDevices(#[source] MigratableError), 175 176 #[error("Cannot resume devices: {0}")] 177 ResumeDevices(#[source] MigratableError), 178 179 #[error("Cannot pause CPUs: {0}")] 180 PauseCpus(#[source] MigratableError), 181 182 #[error("Cannot resume cpus: {0}")] 183 ResumeCpus(#[source] MigratableError), 184 185 #[error("Cannot pause VM: {0}")] 186 Pause(#[source] MigratableError), 187 188 #[error("Cannot resume VM: {0}")] 189 Resume(#[source] MigratableError), 190 191 #[error("Memory manager error: {0:?}")] 192 MemoryManager(MemoryManagerError), 193 194 #[error("Eventfd write error: {0}")] 195 EventfdError(#[source] std::io::Error), 196 197 #[error("Cannot snapshot VM: {0}")] 198 Snapshot(#[source] MigratableError), 199 200 #[error("Cannot restore VM: {0}")] 201 Restore(#[source] MigratableError), 202 203 #[error("Cannot send VM snapshot: {0}")] 204 SnapshotSend(#[source] MigratableError), 205 206 #[error("Invalid restore source URL")] 207 InvalidRestoreSourceUrl, 208 209 #[error("Failed to validate config: {0}")] 210 ConfigValidation(#[source] ValidationError), 211 212 #[error("Too many virtio-vsock devices")] 213 TooManyVsockDevices, 214 215 #[error("Failed serializing into JSON: {0}")] 216 SerializeJson(#[source] serde_json::Error), 217 218 #[error("Invalid NUMA configuration")] 219 InvalidNumaConfig, 220 221 #[error("Cannot create seccomp filter: {0}")] 222 CreateSeccompFilter(#[source] seccompiler::Error), 223 224 #[error("Cannot apply seccomp filter: {0}")] 225 ApplySeccompFilter(#[source] seccompiler::Error), 226 227 #[error("Failed resizing a memory zone")] 228 ResizeZone, 229 230 #[error("Cannot activate virtio devices: {0:?}")] 231 ActivateVirtioDevices(DeviceManagerError), 232 233 #[error("Error triggering power button: {0:?}")] 234 PowerButton(DeviceManagerError), 235 236 #[error("Kernel lacks PVH header")] 237 KernelMissingPvhHeader, 238 239 #[error("Failed to allocate firmware RAM: {0:?}")] 240 AllocateFirmwareMemory(MemoryManagerError), 241 242 #[error("Error manipulating firmware file: {0}")] 243 FirmwareFile(#[source] std::io::Error), 244 245 #[error("Firmware too big")] 246 FirmwareTooLarge, 247 248 #[error("Failed to copy firmware to memory: {0}")] 249 FirmwareLoad(#[source] vm_memory::GuestMemoryError), 250 251 #[cfg(feature = "sev_snp")] 252 #[error("Error enabling SEV-SNP VM: {0}")] 253 InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError), 254 255 #[cfg(feature = "tdx")] 256 #[error("Error performing I/O on TDX firmware file: {0}")] 257 LoadTdvf(#[source] std::io::Error), 258 259 #[cfg(feature = "tdx")] 260 #[error("Error performing I/O on the TDX payload file: {0}")] 261 LoadPayload(#[source] std::io::Error), 262 263 #[cfg(feature = "tdx")] 264 #[error("Error parsing TDVF: {0}")] 265 ParseTdvf(#[source] arch::x86_64::tdx::TdvfError), 266 267 #[cfg(feature = "tdx")] 268 #[error("Error populating TDX HOB: {0}")] 269 PopulateHob(#[source] arch::x86_64::tdx::TdvfError), 270 271 #[cfg(feature = "tdx")] 272 #[error("Error allocating TDVF memory: {0:?}")] 273 AllocatingTdvfMemory(crate::memory_manager::Error), 274 275 #[cfg(feature = "tdx")] 276 #[error("Error enabling TDX VM: {0}")] 277 InitializeTdxVm(#[source] hypervisor::HypervisorVmError), 278 279 #[cfg(feature = "tdx")] 280 #[error("Error enabling TDX memory region: {0}")] 281 InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError), 282 283 #[cfg(feature = "tdx")] 284 #[error("Error finalizing TDX VM: {0}")] 285 FinalizeTdx(#[source] hypervisor::HypervisorVmError), 286 287 #[cfg(feature = "tdx")] 288 #[error("TDX firmware missing")] 289 TdxFirmwareMissing, 290 291 #[cfg(feature = "tdx")] 292 #[error("Invalid TDX payload type")] 293 InvalidPayloadType, 294 295 #[cfg(feature = "guest_debug")] 296 #[error("Error debugging VM: {0:?}")] 297 Debug(DebuggableError), 298 299 #[error("Error spawning kernel loading thread")] 300 KernelLoadThreadSpawn(std::io::Error), 301 302 #[error("Error joining kernel loading thread")] 303 KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 304 305 #[error("Payload configuration is not bootable")] 306 InvalidPayload, 307 308 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 309 #[error("Error coredumping VM: {0:?}")] 310 Coredump(GuestDebuggableError), 311 312 #[cfg(feature = "igvm")] 313 #[error("Cannot open igvm file: {0}")] 314 IgvmFile(#[source] io::Error), 315 316 #[cfg(feature = "igvm")] 317 #[error("Cannot load the igvm into memory: {0}")] 318 IgvmLoad(#[source] igvm_loader::Error), 319 320 #[error("Error injecting NMI")] 321 ErrorNmi, 322 } 323 pub type Result<T> = result::Result<T, Error>; 324 325 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] 326 pub enum VmState { 327 Created, 328 Running, 329 Shutdown, 330 Paused, 331 BreakPoint, 332 } 333 334 impl VmState { 335 fn valid_transition(self, new_state: VmState) -> Result<()> { 336 match self { 337 VmState::Created => match new_state { 338 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)), 339 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => { 340 Ok(()) 341 } 342 }, 343 344 VmState::Running => match new_state { 345 VmState::Created | VmState::Running => { 346 Err(Error::InvalidStateTransition(self, new_state)) 347 } 348 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()), 349 }, 350 351 VmState::Shutdown => match new_state { 352 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => { 353 Err(Error::InvalidStateTransition(self, new_state)) 354 } 355 VmState::Running => Ok(()), 356 }, 357 358 VmState::Paused => match new_state { 359 VmState::Created | VmState::Paused | VmState::BreakPoint => { 360 Err(Error::InvalidStateTransition(self, new_state)) 361 } 362 VmState::Running | VmState::Shutdown => Ok(()), 363 }, 364 VmState::BreakPoint => match new_state { 365 VmState::Created | VmState::Running => Ok(()), 366 _ => Err(Error::InvalidStateTransition(self, new_state)), 367 }, 368 } 369 } 370 } 371 372 struct VmOpsHandler { 373 memory: GuestMemoryAtomic<GuestMemoryMmap>, 374 #[cfg(target_arch = "x86_64")] 375 io_bus: Arc<Bus>, 376 mmio_bus: Arc<Bus>, 377 } 378 379 impl VmOps for VmOpsHandler { 380 fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> { 381 self.memory 382 .memory() 383 .write(buf, GuestAddress(gpa)) 384 .map_err(|e| HypervisorVmError::GuestMemWrite(e.into())) 385 } 386 387 fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> { 388 self.memory 389 .memory() 390 .read(buf, GuestAddress(gpa)) 391 .map_err(|e| HypervisorVmError::GuestMemRead(e.into())) 392 } 393 394 fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 395 if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) { 396 info!("Guest MMIO read to unregistered address 0x{:x}", gpa); 397 } 398 Ok(()) 399 } 400 401 fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 402 match self.mmio_bus.write(gpa, data) { 403 Err(vm_device::BusError::MissingAddressRange) => { 404 info!("Guest MMIO write to unregistered address 0x{:x}", gpa); 405 } 406 Ok(Some(barrier)) => { 407 info!("Waiting for barrier"); 408 barrier.wait(); 409 info!("Barrier released"); 410 } 411 _ => {} 412 }; 413 Ok(()) 414 } 415 416 #[cfg(target_arch = "x86_64")] 417 fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 418 if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) { 419 info!("Guest PIO read to unregistered address 0x{:x}", port); 420 } 421 Ok(()) 422 } 423 424 #[cfg(target_arch = "x86_64")] 425 fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 426 match self.io_bus.write(port, data) { 427 Err(vm_device::BusError::MissingAddressRange) => { 428 info!("Guest PIO write to unregistered address 0x{:x}", port); 429 } 430 Ok(Some(barrier)) => { 431 info!("Waiting for barrier"); 432 barrier.wait(); 433 info!("Barrier released"); 434 } 435 _ => {} 436 }; 437 Ok(()) 438 } 439 } 440 441 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 { 442 let host_phys_bits = get_host_cpu_phys_bits(hypervisor); 443 444 cmp::min(host_phys_bits, max_phys_bits) 445 } 446 447 pub struct Vm { 448 #[cfg(feature = "tdx")] 449 kernel: Option<File>, 450 initramfs: Option<File>, 451 threads: Vec<thread::JoinHandle<()>>, 452 device_manager: Arc<Mutex<DeviceManager>>, 453 config: Arc<Mutex<VmConfig>>, 454 state: RwLock<VmState>, 455 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 456 memory_manager: Arc<Mutex<MemoryManager>>, 457 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 458 // The hypervisor abstracted virtual machine. 459 vm: Arc<dyn hypervisor::Vm>, 460 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 461 saved_clock: Option<hypervisor::ClockData>, 462 numa_nodes: NumaNodes, 463 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 464 hypervisor: Arc<dyn hypervisor::Hypervisor>, 465 stop_on_boot: bool, 466 load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>, 467 } 468 469 impl Vm { 470 pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH]; 471 472 #[allow(clippy::too_many_arguments)] 473 pub fn new_from_memory_manager( 474 config: Arc<Mutex<VmConfig>>, 475 memory_manager: Arc<Mutex<MemoryManager>>, 476 vm: Arc<dyn hypervisor::Vm>, 477 exit_evt: EventFd, 478 reset_evt: EventFd, 479 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 480 seccomp_action: &SeccompAction, 481 hypervisor: Arc<dyn hypervisor::Hypervisor>, 482 activate_evt: EventFd, 483 timestamp: Instant, 484 serial_pty: Option<PtyPair>, 485 console_pty: Option<PtyPair>, 486 debug_console_pty: Option<PtyPair>, 487 console_resize_pipe: Option<File>, 488 original_termios: Arc<Mutex<Option<termios>>>, 489 snapshot: Option<Snapshot>, 490 ) -> Result<Self> { 491 trace_scoped!("Vm::new_from_memory_manager"); 492 493 let boot_id_list = config 494 .lock() 495 .unwrap() 496 .validate() 497 .map_err(Error::ConfigValidation)?; 498 499 #[cfg(not(feature = "igvm"))] 500 let load_payload_handle = if snapshot.is_none() { 501 Self::load_payload_async(&memory_manager, &config)? 502 } else { 503 None 504 }; 505 506 info!("Booting VM from config: {:?}", &config); 507 508 // Create NUMA nodes based on NumaConfig. 509 let numa_nodes = 510 Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?; 511 512 #[cfg(feature = "tdx")] 513 let tdx_enabled = config.lock().unwrap().is_tdx_enabled(); 514 #[cfg(feature = "sev_snp")] 515 let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled(); 516 #[cfg(feature = "tdx")] 517 let force_iommu = tdx_enabled; 518 #[cfg(feature = "sev_snp")] 519 let force_iommu = sev_snp_enabled; 520 #[cfg(not(any(feature = "tdx", feature = "sev_snp")))] 521 let force_iommu = false; 522 523 #[cfg(feature = "guest_debug")] 524 let stop_on_boot = config.lock().unwrap().gdb; 525 #[cfg(not(feature = "guest_debug"))] 526 let stop_on_boot = false; 527 528 let memory = memory_manager.lock().unwrap().guest_memory(); 529 #[cfg(target_arch = "x86_64")] 530 let io_bus = Arc::new(Bus::new()); 531 let mmio_bus = Arc::new(Bus::new()); 532 533 let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler { 534 memory, 535 #[cfg(target_arch = "x86_64")] 536 io_bus: io_bus.clone(), 537 mmio_bus: mmio_bus.clone(), 538 }); 539 540 let cpus_config = { &config.lock().unwrap().cpus.clone() }; 541 let cpu_manager = cpu::CpuManager::new( 542 cpus_config, 543 vm.clone(), 544 exit_evt.try_clone().map_err(Error::EventFdClone)?, 545 reset_evt.try_clone().map_err(Error::EventFdClone)?, 546 #[cfg(feature = "guest_debug")] 547 vm_debug_evt, 548 &hypervisor, 549 seccomp_action.clone(), 550 vm_ops, 551 #[cfg(feature = "tdx")] 552 tdx_enabled, 553 &numa_nodes, 554 #[cfg(feature = "sev_snp")] 555 sev_snp_enabled, 556 ) 557 .map_err(Error::CpuManager)?; 558 559 #[cfg(target_arch = "x86_64")] 560 cpu_manager 561 .lock() 562 .unwrap() 563 .populate_cpuid( 564 &memory_manager, 565 &hypervisor, 566 #[cfg(feature = "tdx")] 567 tdx_enabled, 568 ) 569 .map_err(Error::CpuManager)?; 570 571 // Loading the igvm file is pushed down here because 572 // igvm parser needs cpu_manager to retrieve cpuid leaf. 573 // For the regular case, we can start loading early, but for 574 // igvm case we have to wait until cpu_manager is created. 575 // Currently, Microsoft Hypervisor does not provide any 576 // Hypervisor specific common cpuid, we need to call get_cpuid_values 577 // per cpuid through cpu_manager. 578 #[cfg(feature = "igvm")] 579 let load_payload_handle = if snapshot.is_none() { 580 Self::load_payload_async( 581 &memory_manager, 582 &config, 583 &cpu_manager, 584 #[cfg(feature = "sev_snp")] 585 sev_snp_enabled, 586 )? 587 } else { 588 None 589 }; 590 // The initial TDX configuration must be done before the vCPUs are 591 // created 592 #[cfg(feature = "tdx")] 593 if tdx_enabled { 594 let cpuid = cpu_manager.lock().unwrap().common_cpuid(); 595 let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32; 596 vm.tdx_init(&cpuid, max_vcpus) 597 .map_err(Error::InitializeTdxVm)?; 598 } 599 600 cpu_manager 601 .lock() 602 .unwrap() 603 .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID)) 604 .map_err(Error::CpuManager)?; 605 606 // This initial SEV-SNP configuration must be done immediately after 607 // vCPUs are created. As part of this initialization we are 608 // transitioning the guest into secure state. 609 #[cfg(feature = "sev_snp")] 610 if sev_snp_enabled { 611 vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?; 612 } 613 614 #[cfg(feature = "tdx")] 615 let dynamic = !tdx_enabled; 616 #[cfg(not(feature = "tdx"))] 617 let dynamic = true; 618 619 let device_manager = DeviceManager::new( 620 #[cfg(target_arch = "x86_64")] 621 io_bus, 622 mmio_bus, 623 hypervisor.hypervisor_type(), 624 vm.clone(), 625 config.clone(), 626 memory_manager.clone(), 627 cpu_manager.clone(), 628 exit_evt.try_clone().map_err(Error::EventFdClone)?, 629 reset_evt, 630 seccomp_action.clone(), 631 numa_nodes.clone(), 632 &activate_evt, 633 force_iommu, 634 boot_id_list, 635 timestamp, 636 snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID), 637 dynamic, 638 ) 639 .map_err(Error::DeviceManager)?; 640 641 device_manager 642 .lock() 643 .unwrap() 644 .create_devices( 645 serial_pty, 646 console_pty, 647 debug_console_pty, 648 console_resize_pipe, 649 original_termios, 650 ) 651 .map_err(Error::DeviceManager)?; 652 653 #[cfg(feature = "tdx")] 654 let kernel = config 655 .lock() 656 .unwrap() 657 .payload 658 .as_ref() 659 .map(|p| p.kernel.as_ref().map(File::open)) 660 .unwrap_or_default() 661 .transpose() 662 .map_err(Error::KernelFile)?; 663 664 let initramfs = config 665 .lock() 666 .unwrap() 667 .payload 668 .as_ref() 669 .map(|p| p.initramfs.as_ref().map(File::open)) 670 .unwrap_or_default() 671 .transpose() 672 .map_err(Error::InitramfsFile)?; 673 674 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 675 let saved_clock = if let Some(snapshot) = snapshot.as_ref() { 676 let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?; 677 vm_snapshot.clock 678 } else { 679 None 680 }; 681 682 let vm_state = if snapshot.is_some() { 683 VmState::Paused 684 } else { 685 VmState::Created 686 }; 687 688 Ok(Vm { 689 #[cfg(feature = "tdx")] 690 kernel, 691 initramfs, 692 device_manager, 693 config, 694 threads: Vec::with_capacity(1), 695 state: RwLock::new(vm_state), 696 cpu_manager, 697 memory_manager, 698 vm, 699 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 700 saved_clock, 701 numa_nodes, 702 hypervisor, 703 stop_on_boot, 704 load_payload_handle, 705 }) 706 } 707 708 fn create_numa_nodes( 709 configs: Option<Vec<NumaConfig>>, 710 memory_manager: &Arc<Mutex<MemoryManager>>, 711 ) -> Result<NumaNodes> { 712 let mm = memory_manager.lock().unwrap(); 713 let mm_zones = mm.memory_zones(); 714 let mut numa_nodes = BTreeMap::new(); 715 716 if let Some(configs) = &configs { 717 for config in configs.iter() { 718 if numa_nodes.contains_key(&config.guest_numa_id) { 719 error!("Can't define twice the same NUMA node"); 720 return Err(Error::InvalidNumaConfig); 721 } 722 723 let mut node = NumaNode::default(); 724 725 if let Some(memory_zones) = &config.memory_zones { 726 for memory_zone in memory_zones.iter() { 727 if let Some(mm_zone) = mm_zones.get(memory_zone) { 728 node.memory_regions.extend(mm_zone.regions().clone()); 729 if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() { 730 node.hotplug_regions.push(virtiomem_zone.region().clone()); 731 } 732 node.memory_zones.push(memory_zone.clone()); 733 } else { 734 error!("Unknown memory zone '{}'", memory_zone); 735 return Err(Error::InvalidNumaConfig); 736 } 737 } 738 } 739 740 if let Some(cpus) = &config.cpus { 741 node.cpus.extend(cpus); 742 } 743 744 if let Some(pci_segments) = &config.pci_segments { 745 node.pci_segments.extend(pci_segments); 746 } 747 748 if let Some(distances) = &config.distances { 749 for distance in distances.iter() { 750 let dest = distance.destination; 751 let dist = distance.distance; 752 753 if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) { 754 error!("Unknown destination NUMA node {}", dest); 755 return Err(Error::InvalidNumaConfig); 756 } 757 758 if node.distances.contains_key(&dest) { 759 error!("Destination NUMA node {} has been already set", dest); 760 return Err(Error::InvalidNumaConfig); 761 } 762 763 node.distances.insert(dest, dist); 764 } 765 } 766 767 #[cfg(target_arch = "x86_64")] 768 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 769 if let Some(sgx_epc_region) = mm.sgx_epc_region() { 770 let mm_sections = sgx_epc_region.epc_sections(); 771 for sgx_epc_section in sgx_epc_sections.iter() { 772 if let Some(mm_section) = mm_sections.get(sgx_epc_section) { 773 node.sgx_epc_sections.push(mm_section.clone()); 774 } else { 775 error!("Unknown SGX EPC section '{}'", sgx_epc_section); 776 return Err(Error::InvalidNumaConfig); 777 } 778 } 779 } else { 780 error!("Missing SGX EPC region"); 781 return Err(Error::InvalidNumaConfig); 782 } 783 } 784 785 numa_nodes.insert(config.guest_numa_id, node); 786 } 787 } 788 789 Ok(numa_nodes) 790 } 791 792 #[allow(clippy::too_many_arguments)] 793 pub fn new( 794 vm_config: Arc<Mutex<VmConfig>>, 795 exit_evt: EventFd, 796 reset_evt: EventFd, 797 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 798 seccomp_action: &SeccompAction, 799 hypervisor: Arc<dyn hypervisor::Hypervisor>, 800 activate_evt: EventFd, 801 serial_pty: Option<PtyPair>, 802 console_pty: Option<PtyPair>, 803 debug_console_pty: Option<PtyPair>, 804 console_resize_pipe: Option<File>, 805 original_termios: Arc<Mutex<Option<termios>>>, 806 snapshot: Option<Snapshot>, 807 source_url: Option<&str>, 808 prefault: Option<bool>, 809 ) -> Result<Self> { 810 trace_scoped!("Vm::new"); 811 812 let timestamp = Instant::now(); 813 814 #[cfg(feature = "tdx")] 815 let tdx_enabled = if snapshot.is_some() { 816 false 817 } else { 818 vm_config.lock().unwrap().is_tdx_enabled() 819 }; 820 821 #[cfg(feature = "sev_snp")] 822 let sev_snp_enabled = if snapshot.is_some() { 823 false 824 } else { 825 vm_config.lock().unwrap().is_sev_snp_enabled() 826 }; 827 828 let vm = Self::create_hypervisor_vm( 829 &hypervisor, 830 #[cfg(feature = "tdx")] 831 tdx_enabled, 832 #[cfg(feature = "sev_snp")] 833 sev_snp_enabled, 834 )?; 835 836 let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); 837 838 let memory_manager = if let Some(snapshot) = 839 snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID) 840 { 841 MemoryManager::new_from_snapshot( 842 &snapshot, 843 vm.clone(), 844 &vm_config.lock().unwrap().memory.clone(), 845 source_url, 846 prefault.unwrap(), 847 phys_bits, 848 ) 849 .map_err(Error::MemoryManager)? 850 } else { 851 #[cfg(target_arch = "x86_64")] 852 let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone(); 853 854 MemoryManager::new( 855 vm.clone(), 856 &vm_config.lock().unwrap().memory.clone(), 857 None, 858 phys_bits, 859 #[cfg(feature = "tdx")] 860 tdx_enabled, 861 None, 862 None, 863 #[cfg(target_arch = "x86_64")] 864 sgx_epc_config, 865 ) 866 .map_err(Error::MemoryManager)? 867 }; 868 869 Vm::new_from_memory_manager( 870 vm_config, 871 memory_manager, 872 vm, 873 exit_evt, 874 reset_evt, 875 #[cfg(feature = "guest_debug")] 876 vm_debug_evt, 877 seccomp_action, 878 hypervisor, 879 activate_evt, 880 timestamp, 881 serial_pty, 882 console_pty, 883 debug_console_pty, 884 console_resize_pipe, 885 original_termios, 886 snapshot, 887 ) 888 } 889 890 pub fn create_hypervisor_vm( 891 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 892 #[cfg(feature = "tdx")] tdx_enabled: bool, 893 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 894 ) -> Result<Arc<dyn hypervisor::Vm>> { 895 hypervisor.check_required_extensions().unwrap(); 896 897 cfg_if::cfg_if! { 898 if #[cfg(feature = "tdx")] { 899 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true 900 // Otherwise KVM_X86_LEGACY_VM: 0 901 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM 902 let vm = hypervisor 903 .create_vm_with_type(u64::from(tdx_enabled)) 904 .unwrap(); 905 } else if #[cfg(feature = "sev_snp")] { 906 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true 907 // Otherwise SEV_SNP_DISABLED: 0 908 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false 909 let vm = hypervisor 910 .create_vm_with_type(u64::from(sev_snp_enabled)) 911 .unwrap(); 912 } else { 913 let vm = hypervisor.create_vm().unwrap(); 914 } 915 } 916 917 #[cfg(target_arch = "x86_64")] 918 { 919 vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0) 920 .unwrap(); 921 vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap(); 922 vm.enable_split_irq().unwrap(); 923 } 924 925 Ok(vm) 926 } 927 928 fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> { 929 let initramfs = self.initramfs.as_mut().unwrap(); 930 let size: usize = initramfs 931 .seek(SeekFrom::End(0)) 932 .map_err(|_| Error::InitramfsLoad)? 933 .try_into() 934 .unwrap(); 935 initramfs.rewind().map_err(|_| Error::InitramfsLoad)?; 936 937 let address = 938 arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?; 939 let address = GuestAddress(address); 940 941 guest_mem 942 .read_volatile_from(address, initramfs, size) 943 .map_err(|_| Error::InitramfsLoad)?; 944 945 info!("Initramfs loaded: address = 0x{:x}", address.0); 946 Ok(arch::InitramfsConfig { address, size }) 947 } 948 949 pub fn generate_cmdline( 950 payload: &PayloadConfig, 951 #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>, 952 ) -> Result<Cmdline> { 953 let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?; 954 if let Some(s) = payload.cmdline.as_ref() { 955 cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?; 956 } 957 958 #[cfg(target_arch = "aarch64")] 959 for entry in device_manager.lock().unwrap().cmdline_additions() { 960 cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?; 961 } 962 Ok(cmdline) 963 } 964 965 #[cfg(target_arch = "aarch64")] 966 fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> { 967 let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash(); 968 let mem = uefi_flash.memory(); 969 arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware) 970 .map_err(Error::UefiLoad)?; 971 Ok(()) 972 } 973 974 #[cfg(target_arch = "aarch64")] 975 fn load_kernel( 976 firmware: Option<File>, 977 kernel: Option<File>, 978 memory_manager: Arc<Mutex<MemoryManager>>, 979 ) -> Result<EntryPoint> { 980 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 981 let mem = guest_memory.memory(); 982 let entry_addr = match (firmware, kernel) { 983 (None, Some(mut kernel)) => { 984 match linux_loader::loader::pe::PE::load( 985 mem.deref(), 986 Some(arch::layout::KERNEL_START), 987 &mut kernel, 988 None, 989 ) { 990 Ok(entry_addr) => entry_addr.kernel_load, 991 // Try to load the binary as kernel PE file at first. 992 // If failed, retry to load it as UEFI binary. 993 // As the UEFI binary is formatless, it must be the last option to try. 994 Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => { 995 Self::load_firmware(&kernel, memory_manager)?; 996 arch::layout::UEFI_START 997 } 998 Err(e) => { 999 return Err(Error::KernelLoad(e)); 1000 } 1001 } 1002 } 1003 (Some(firmware), None) => { 1004 Self::load_firmware(&firmware, memory_manager)?; 1005 arch::layout::UEFI_START 1006 } 1007 _ => return Err(Error::InvalidPayload), 1008 }; 1009 1010 Ok(EntryPoint { entry_addr }) 1011 } 1012 1013 #[cfg(feature = "igvm")] 1014 fn load_igvm( 1015 igvm: File, 1016 memory_manager: Arc<Mutex<MemoryManager>>, 1017 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 1018 #[cfg(feature = "sev_snp")] host_data: &Option<String>, 1019 ) -> Result<EntryPoint> { 1020 let res = igvm_loader::load_igvm( 1021 &igvm, 1022 memory_manager, 1023 cpu_manager.clone(), 1024 "", 1025 #[cfg(feature = "sev_snp")] 1026 host_data, 1027 ) 1028 .map_err(Error::IgvmLoad)?; 1029 1030 cfg_if::cfg_if! { 1031 if #[cfg(feature = "sev_snp")] { 1032 let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() { 1033 EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None } 1034 } else { 1035 EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None } 1036 }; 1037 } else { 1038 let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }; 1039 } 1040 }; 1041 Ok(entry_point) 1042 } 1043 1044 #[cfg(target_arch = "x86_64")] 1045 fn load_kernel( 1046 mut kernel: File, 1047 cmdline: Option<Cmdline>, 1048 memory_manager: Arc<Mutex<MemoryManager>>, 1049 ) -> Result<EntryPoint> { 1050 info!("Loading kernel"); 1051 1052 let mem = { 1053 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 1054 guest_memory.memory() 1055 }; 1056 1057 // Try ELF binary with PVH boot. 1058 let entry_addr = linux_loader::loader::elf::Elf::load( 1059 mem.deref(), 1060 None, 1061 &mut kernel, 1062 Some(arch::layout::HIGH_RAM_START), 1063 ) 1064 // Try loading kernel as bzImage. 1065 .or_else(|_| { 1066 BzImage::load( 1067 mem.deref(), 1068 None, 1069 &mut kernel, 1070 Some(arch::layout::HIGH_RAM_START), 1071 ) 1072 }) 1073 .map_err(Error::KernelLoad)?; 1074 1075 if let Some(cmdline) = cmdline { 1076 linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline) 1077 .map_err(Error::LoadCmdLine)?; 1078 } 1079 1080 if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap { 1081 // Use the PVH kernel entry point to boot the guest 1082 info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0); 1083 Ok(EntryPoint { 1084 entry_addr, 1085 setup_header: None, 1086 }) 1087 } else if entry_addr.setup_header.is_some() { 1088 // Use the bzImage 32bit entry point to boot the guest 1089 info!( 1090 "bzImage kernel loaded: entry_addr = 0x{:x}", 1091 entry_addr.kernel_load.0 1092 ); 1093 Ok(EntryPoint { 1094 entry_addr: entry_addr.kernel_load, 1095 setup_header: entry_addr.setup_header, 1096 }) 1097 } else { 1098 Err(Error::KernelMissingPvhHeader) 1099 } 1100 } 1101 1102 #[cfg(target_arch = "x86_64")] 1103 fn load_payload( 1104 payload: &PayloadConfig, 1105 memory_manager: Arc<Mutex<MemoryManager>>, 1106 #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>, 1107 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 1108 ) -> Result<EntryPoint> { 1109 trace_scoped!("load_payload"); 1110 #[cfg(feature = "igvm")] 1111 { 1112 if let Some(_igvm_file) = &payload.igvm { 1113 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?; 1114 #[cfg(feature = "sev_snp")] 1115 if sev_snp_enabled { 1116 return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data); 1117 } 1118 #[cfg(not(feature = "sev_snp"))] 1119 return Self::load_igvm(igvm, memory_manager, cpu_manager); 1120 } 1121 } 1122 match ( 1123 &payload.firmware, 1124 &payload.kernel, 1125 &payload.initramfs, 1126 &payload.cmdline, 1127 ) { 1128 (Some(firmware), None, None, None) => { 1129 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; 1130 Self::load_kernel(firmware, None, memory_manager) 1131 } 1132 (None, Some(kernel), _, _) => { 1133 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1134 let cmdline = Self::generate_cmdline(payload)?; 1135 Self::load_kernel(kernel, Some(cmdline), memory_manager) 1136 } 1137 _ => Err(Error::InvalidPayload), 1138 } 1139 } 1140 1141 #[cfg(target_arch = "aarch64")] 1142 fn load_payload( 1143 payload: &PayloadConfig, 1144 memory_manager: Arc<Mutex<MemoryManager>>, 1145 ) -> Result<EntryPoint> { 1146 match (&payload.firmware, &payload.kernel) { 1147 (Some(firmware), None) => { 1148 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; 1149 Self::load_kernel(Some(firmware), None, memory_manager) 1150 } 1151 (None, Some(kernel)) => { 1152 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1153 Self::load_kernel(None, Some(kernel), memory_manager) 1154 } 1155 _ => Err(Error::InvalidPayload), 1156 } 1157 } 1158 1159 fn load_payload_async( 1160 memory_manager: &Arc<Mutex<MemoryManager>>, 1161 config: &Arc<Mutex<VmConfig>>, 1162 #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>, 1163 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 1164 ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> { 1165 // Kernel with TDX is loaded in a different manner 1166 #[cfg(feature = "tdx")] 1167 if config.lock().unwrap().is_tdx_enabled() { 1168 return Ok(None); 1169 } 1170 1171 config 1172 .lock() 1173 .unwrap() 1174 .payload 1175 .as_ref() 1176 .map(|payload| { 1177 let memory_manager = memory_manager.clone(); 1178 let payload = payload.clone(); 1179 #[cfg(feature = "igvm")] 1180 let cpu_manager = cpu_manager.clone(); 1181 1182 std::thread::Builder::new() 1183 .name("payload_loader".into()) 1184 .spawn(move || { 1185 Self::load_payload( 1186 &payload, 1187 memory_manager, 1188 #[cfg(feature = "igvm")] 1189 cpu_manager, 1190 #[cfg(feature = "sev_snp")] 1191 sev_snp_enabled, 1192 ) 1193 }) 1194 .map_err(Error::KernelLoadThreadSpawn) 1195 }) 1196 .transpose() 1197 } 1198 1199 #[cfg(target_arch = "x86_64")] 1200 fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> { 1201 trace_scoped!("configure_system"); 1202 info!("Configuring system"); 1203 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1204 1205 let initramfs_config = match self.initramfs { 1206 Some(_) => Some(self.load_initramfs(&mem)?), 1207 None => None, 1208 }; 1209 1210 let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); 1211 let rsdp_addr = Some(rsdp_addr); 1212 let sgx_epc_region = self 1213 .memory_manager 1214 .lock() 1215 .unwrap() 1216 .sgx_epc_region() 1217 .as_ref() 1218 .cloned(); 1219 1220 let serial_number = self 1221 .config 1222 .lock() 1223 .unwrap() 1224 .platform 1225 .as_ref() 1226 .and_then(|p| p.serial_number.clone()); 1227 1228 let uuid = self 1229 .config 1230 .lock() 1231 .unwrap() 1232 .platform 1233 .as_ref() 1234 .and_then(|p| p.uuid.clone()); 1235 1236 let oem_strings = self 1237 .config 1238 .lock() 1239 .unwrap() 1240 .platform 1241 .as_ref() 1242 .and_then(|p| p.oem_strings.clone()); 1243 1244 let oem_strings = oem_strings 1245 .as_deref() 1246 .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>()); 1247 1248 let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1249 1250 arch::configure_system( 1251 &mem, 1252 arch::layout::CMDLINE_START, 1253 arch::layout::CMDLINE_MAX_SIZE, 1254 &initramfs_config, 1255 boot_vcpus, 1256 entry_addr.setup_header, 1257 rsdp_addr, 1258 sgx_epc_region, 1259 serial_number.as_deref(), 1260 uuid.as_deref(), 1261 oem_strings.as_deref(), 1262 topology, 1263 ) 1264 .map_err(Error::ConfigureSystem)?; 1265 Ok(()) 1266 } 1267 1268 #[cfg(target_arch = "aarch64")] 1269 fn configure_system( 1270 &mut self, 1271 _rsdp_addr: GuestAddress, 1272 _entry_addr: EntryPoint, 1273 ) -> Result<()> { 1274 let cmdline = Self::generate_cmdline( 1275 self.config.lock().unwrap().payload.as_ref().unwrap(), 1276 &self.device_manager, 1277 )?; 1278 let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs(); 1279 let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1280 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1281 let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new(); 1282 let initramfs_config = match self.initramfs { 1283 Some(_) => Some(self.load_initramfs(&mem)?), 1284 None => None, 1285 }; 1286 1287 let device_info = &self 1288 .device_manager 1289 .lock() 1290 .unwrap() 1291 .get_device_info() 1292 .clone(); 1293 1294 for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() { 1295 let pci_space = PciSpaceInfo { 1296 pci_segment_id: pci_segment.id, 1297 mmio_config_address: pci_segment.mmio_config_address, 1298 pci_device_space_start: pci_segment.start_of_mem64_area, 1299 pci_device_space_size: pci_segment.end_of_mem64_area 1300 - pci_segment.start_of_mem64_area 1301 + 1, 1302 }; 1303 pci_space_info.push(pci_space); 1304 } 1305 1306 let virtio_iommu_bdf = self 1307 .device_manager 1308 .lock() 1309 .unwrap() 1310 .iommu_attached_devices() 1311 .as_ref() 1312 .map(|(v, _)| *v); 1313 1314 let vgic = self 1315 .device_manager 1316 .lock() 1317 .unwrap() 1318 .get_interrupt_controller() 1319 .unwrap() 1320 .lock() 1321 .unwrap() 1322 .get_vgic() 1323 .map_err(|_| { 1324 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1325 arch::aarch64::Error::SetupGic, 1326 )) 1327 })?; 1328 1329 // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number. 1330 let pmu_supported = self 1331 .cpu_manager 1332 .lock() 1333 .unwrap() 1334 .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16) 1335 .map_err(|_| { 1336 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1337 arch::aarch64::Error::VcpuInitPmu, 1338 )) 1339 })?; 1340 1341 arch::configure_system( 1342 &mem, 1343 cmdline.as_cstring().unwrap().to_str().unwrap(), 1344 vcpu_mpidrs, 1345 vcpu_topology, 1346 device_info, 1347 &initramfs_config, 1348 &pci_space_info, 1349 virtio_iommu_bdf.map(|bdf| bdf.into()), 1350 &vgic, 1351 &self.numa_nodes, 1352 pmu_supported, 1353 ) 1354 .map_err(Error::ConfigureSystem)?; 1355 1356 Ok(()) 1357 } 1358 1359 pub fn serial_pty(&self) -> Option<PtyPair> { 1360 self.device_manager.lock().unwrap().serial_pty() 1361 } 1362 1363 pub fn console_pty(&self) -> Option<PtyPair> { 1364 self.device_manager.lock().unwrap().console_pty() 1365 } 1366 1367 pub fn debug_console_pty(&self) -> Option<PtyPair> { 1368 self.device_manager.lock().unwrap().debug_console_pty() 1369 } 1370 1371 pub fn console_resize_pipe(&self) -> Option<Arc<File>> { 1372 self.device_manager.lock().unwrap().console_resize_pipe() 1373 } 1374 1375 pub fn shutdown(&mut self) -> Result<()> { 1376 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 1377 let new_state = VmState::Shutdown; 1378 1379 state.valid_transition(new_state)?; 1380 1381 // Wake up the DeviceManager threads so they will get terminated cleanly 1382 self.device_manager 1383 .lock() 1384 .unwrap() 1385 .resume() 1386 .map_err(Error::Resume)?; 1387 1388 self.cpu_manager 1389 .lock() 1390 .unwrap() 1391 .shutdown() 1392 .map_err(Error::CpuManager)?; 1393 1394 // Wait for all the threads to finish 1395 for thread in self.threads.drain(..) { 1396 thread.join().map_err(Error::ThreadCleanup)? 1397 } 1398 *state = new_state; 1399 1400 Ok(()) 1401 } 1402 1403 pub fn resize( 1404 &mut self, 1405 desired_vcpus: Option<u8>, 1406 desired_memory: Option<u64>, 1407 desired_balloon: Option<u64>, 1408 ) -> Result<()> { 1409 event!("vm", "resizing"); 1410 1411 if let Some(desired_vcpus) = desired_vcpus { 1412 if self 1413 .cpu_manager 1414 .lock() 1415 .unwrap() 1416 .resize(desired_vcpus) 1417 .map_err(Error::CpuManager)? 1418 { 1419 self.device_manager 1420 .lock() 1421 .unwrap() 1422 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1423 .map_err(Error::DeviceManager)?; 1424 } 1425 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1426 } 1427 1428 if let Some(desired_memory) = desired_memory { 1429 let new_region = self 1430 .memory_manager 1431 .lock() 1432 .unwrap() 1433 .resize(desired_memory) 1434 .map_err(Error::MemoryManager)?; 1435 1436 let memory_config = &mut self.config.lock().unwrap().memory; 1437 1438 if let Some(new_region) = &new_region { 1439 self.device_manager 1440 .lock() 1441 .unwrap() 1442 .update_memory(new_region) 1443 .map_err(Error::DeviceManager)?; 1444 1445 match memory_config.hotplug_method { 1446 HotplugMethod::Acpi => { 1447 self.device_manager 1448 .lock() 1449 .unwrap() 1450 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1451 .map_err(Error::DeviceManager)?; 1452 } 1453 HotplugMethod::VirtioMem => {} 1454 } 1455 } 1456 1457 // We update the VM config regardless of the actual guest resize 1458 // operation result (happened or not), so that if the VM reboots 1459 // it will be running with the last configure memory size. 1460 match memory_config.hotplug_method { 1461 HotplugMethod::Acpi => memory_config.size = desired_memory, 1462 HotplugMethod::VirtioMem => { 1463 if desired_memory > memory_config.size { 1464 memory_config.hotplugged_size = Some(desired_memory - memory_config.size); 1465 } else { 1466 memory_config.hotplugged_size = None; 1467 } 1468 } 1469 } 1470 } 1471 1472 if let Some(desired_balloon) = desired_balloon { 1473 self.device_manager 1474 .lock() 1475 .unwrap() 1476 .resize_balloon(desired_balloon) 1477 .map_err(Error::DeviceManager)?; 1478 1479 // Update the configuration value for the balloon size to ensure 1480 // a reboot would use the right value. 1481 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon { 1482 balloon_config.size = desired_balloon; 1483 } 1484 } 1485 1486 event!("vm", "resized"); 1487 1488 Ok(()) 1489 } 1490 1491 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { 1492 let memory_config = &mut self.config.lock().unwrap().memory; 1493 1494 if let Some(zones) = &mut memory_config.zones { 1495 for zone in zones.iter_mut() { 1496 if zone.id == id { 1497 if desired_memory >= zone.size { 1498 let hotplugged_size = desired_memory - zone.size; 1499 self.memory_manager 1500 .lock() 1501 .unwrap() 1502 .resize_zone(&id, desired_memory - zone.size) 1503 .map_err(Error::MemoryManager)?; 1504 // We update the memory zone config regardless of the 1505 // actual 'resize-zone' operation result (happened or 1506 // not), so that if the VM reboots it will be running 1507 // with the last configured memory zone size. 1508 zone.hotplugged_size = Some(hotplugged_size); 1509 1510 return Ok(()); 1511 } else { 1512 error!( 1513 "Invalid to ask less ({}) than boot RAM ({}) for \ 1514 this memory zone", 1515 desired_memory, zone.size, 1516 ); 1517 return Err(Error::ResizeZone); 1518 } 1519 } 1520 } 1521 } 1522 1523 error!("Could not find the memory zone {} for the resize", id); 1524 Err(Error::ResizeZone) 1525 } 1526 1527 pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1528 let pci_device_info = self 1529 .device_manager 1530 .lock() 1531 .unwrap() 1532 .add_device(&mut device_cfg) 1533 .map_err(Error::DeviceManager)?; 1534 1535 // Update VmConfig by adding the new device. This is important to 1536 // ensure the device would be created in case of a reboot. 1537 { 1538 let mut config = self.config.lock().unwrap(); 1539 add_to_config(&mut config.devices, device_cfg); 1540 } 1541 1542 self.device_manager 1543 .lock() 1544 .unwrap() 1545 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1546 .map_err(Error::DeviceManager)?; 1547 1548 Ok(pci_device_info) 1549 } 1550 1551 pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> { 1552 let pci_device_info = self 1553 .device_manager 1554 .lock() 1555 .unwrap() 1556 .add_user_device(&mut device_cfg) 1557 .map_err(Error::DeviceManager)?; 1558 1559 // Update VmConfig by adding the new device. This is important to 1560 // ensure the device would be created in case of a reboot. 1561 { 1562 let mut config = self.config.lock().unwrap(); 1563 add_to_config(&mut config.user_devices, device_cfg); 1564 } 1565 1566 self.device_manager 1567 .lock() 1568 .unwrap() 1569 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1570 .map_err(Error::DeviceManager)?; 1571 1572 Ok(pci_device_info) 1573 } 1574 1575 pub fn remove_device(&mut self, id: String) -> Result<()> { 1576 self.device_manager 1577 .lock() 1578 .unwrap() 1579 .remove_device(id.clone()) 1580 .map_err(Error::DeviceManager)?; 1581 1582 // Update VmConfig by removing the device. This is important to 1583 // ensure the device would not be created in case of a reboot. 1584 self.config.lock().unwrap().remove_device(&id); 1585 1586 self.device_manager 1587 .lock() 1588 .unwrap() 1589 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1590 .map_err(Error::DeviceManager)?; 1591 Ok(()) 1592 } 1593 1594 pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> { 1595 let pci_device_info = self 1596 .device_manager 1597 .lock() 1598 .unwrap() 1599 .add_disk(&mut disk_cfg) 1600 .map_err(Error::DeviceManager)?; 1601 1602 // Update VmConfig by adding the new device. This is important to 1603 // ensure the device would be created in case of a reboot. 1604 { 1605 let mut config = self.config.lock().unwrap(); 1606 add_to_config(&mut config.disks, disk_cfg); 1607 } 1608 1609 self.device_manager 1610 .lock() 1611 .unwrap() 1612 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1613 .map_err(Error::DeviceManager)?; 1614 1615 Ok(pci_device_info) 1616 } 1617 1618 pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> { 1619 let pci_device_info = self 1620 .device_manager 1621 .lock() 1622 .unwrap() 1623 .add_fs(&mut fs_cfg) 1624 .map_err(Error::DeviceManager)?; 1625 1626 // Update VmConfig by adding the new device. This is important to 1627 // ensure the device would be created in case of a reboot. 1628 { 1629 let mut config = self.config.lock().unwrap(); 1630 add_to_config(&mut config.fs, fs_cfg); 1631 } 1632 1633 self.device_manager 1634 .lock() 1635 .unwrap() 1636 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1637 .map_err(Error::DeviceManager)?; 1638 1639 Ok(pci_device_info) 1640 } 1641 1642 pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> { 1643 let pci_device_info = self 1644 .device_manager 1645 .lock() 1646 .unwrap() 1647 .add_pmem(&mut pmem_cfg) 1648 .map_err(Error::DeviceManager)?; 1649 1650 // Update VmConfig by adding the new device. This is important to 1651 // ensure the device would be created in case of a reboot. 1652 { 1653 let mut config = self.config.lock().unwrap(); 1654 add_to_config(&mut config.pmem, pmem_cfg); 1655 } 1656 1657 self.device_manager 1658 .lock() 1659 .unwrap() 1660 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1661 .map_err(Error::DeviceManager)?; 1662 1663 Ok(pci_device_info) 1664 } 1665 1666 pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> { 1667 let pci_device_info = self 1668 .device_manager 1669 .lock() 1670 .unwrap() 1671 .add_net(&mut net_cfg) 1672 .map_err(Error::DeviceManager)?; 1673 1674 // Update VmConfig by adding the new device. This is important to 1675 // ensure the device would be created in case of a reboot. 1676 { 1677 let mut config = self.config.lock().unwrap(); 1678 add_to_config(&mut config.net, net_cfg); 1679 } 1680 1681 self.device_manager 1682 .lock() 1683 .unwrap() 1684 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1685 .map_err(Error::DeviceManager)?; 1686 1687 Ok(pci_device_info) 1688 } 1689 1690 pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> { 1691 let pci_device_info = self 1692 .device_manager 1693 .lock() 1694 .unwrap() 1695 .add_vdpa(&mut vdpa_cfg) 1696 .map_err(Error::DeviceManager)?; 1697 1698 // Update VmConfig by adding the new device. This is important to 1699 // ensure the device would be created in case of a reboot. 1700 { 1701 let mut config = self.config.lock().unwrap(); 1702 add_to_config(&mut config.vdpa, vdpa_cfg); 1703 } 1704 1705 self.device_manager 1706 .lock() 1707 .unwrap() 1708 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1709 .map_err(Error::DeviceManager)?; 1710 1711 Ok(pci_device_info) 1712 } 1713 1714 pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> { 1715 let pci_device_info = self 1716 .device_manager 1717 .lock() 1718 .unwrap() 1719 .add_vsock(&mut vsock_cfg) 1720 .map_err(Error::DeviceManager)?; 1721 1722 // Update VmConfig by adding the new device. This is important to 1723 // ensure the device would be created in case of a reboot. 1724 { 1725 let mut config = self.config.lock().unwrap(); 1726 config.vsock = Some(vsock_cfg); 1727 } 1728 1729 self.device_manager 1730 .lock() 1731 .unwrap() 1732 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1733 .map_err(Error::DeviceManager)?; 1734 1735 Ok(pci_device_info) 1736 } 1737 1738 pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> { 1739 Ok(self.device_manager.lock().unwrap().counters()) 1740 } 1741 1742 #[cfg(feature = "tdx")] 1743 fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> { 1744 use arch::x86_64::tdx::*; 1745 1746 let firmware_path = self 1747 .config 1748 .lock() 1749 .unwrap() 1750 .payload 1751 .as_ref() 1752 .unwrap() 1753 .firmware 1754 .clone() 1755 .ok_or(Error::TdxFirmwareMissing)?; 1756 // The TDVF file contains a table of section as well as code 1757 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1758 1759 // For all the sections allocate some RAM backing them 1760 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf) 1761 } 1762 1763 #[cfg(feature = "tdx")] 1764 fn hob_memory_resources( 1765 mut sorted_sections: Vec<TdvfSection>, 1766 guest_memory: &GuestMemoryMmap, 1767 ) -> Vec<(u64, u64, bool)> { 1768 let mut list = Vec::new(); 1769 1770 let mut current_section = sorted_sections.pop(); 1771 1772 // RAM regions interleaved with TDVF sections 1773 let mut next_start_addr = 0; 1774 for region in guest_memory.iter() { 1775 let region_start = region.start_addr().0; 1776 let region_end = region.last_addr().0; 1777 if region_start > next_start_addr { 1778 next_start_addr = region_start; 1779 } 1780 1781 loop { 1782 let (start, size, ram) = if let Some(section) = ¤t_section { 1783 if section.address <= next_start_addr { 1784 (section.address, section.size, false) 1785 } else { 1786 let last_addr = std::cmp::min(section.address - 1, region_end); 1787 (next_start_addr, last_addr - next_start_addr + 1, true) 1788 } 1789 } else { 1790 (next_start_addr, region_end - next_start_addr + 1, true) 1791 }; 1792 1793 list.push((start, size, ram)); 1794 1795 if !ram { 1796 current_section = sorted_sections.pop(); 1797 } 1798 1799 next_start_addr = start + size; 1800 1801 if region_start > next_start_addr { 1802 next_start_addr = region_start; 1803 } 1804 1805 if next_start_addr > region_end { 1806 break; 1807 } 1808 } 1809 } 1810 1811 // Once all the interleaved sections have been processed, let's simply 1812 // pull the remaining ones. 1813 if let Some(section) = current_section { 1814 list.push((section.address, section.size, false)); 1815 } 1816 while let Some(section) = sorted_sections.pop() { 1817 list.push((section.address, section.size, false)); 1818 } 1819 1820 list 1821 } 1822 1823 #[cfg(feature = "tdx")] 1824 fn populate_tdx_sections( 1825 &mut self, 1826 sections: &[TdvfSection], 1827 guid_found: bool, 1828 ) -> Result<Option<u64>> { 1829 use arch::x86_64::tdx::*; 1830 // Get the memory end *before* we start adding TDVF ram regions 1831 let boot_guest_memory = self 1832 .memory_manager 1833 .lock() 1834 .as_ref() 1835 .unwrap() 1836 .boot_guest_memory(); 1837 for section in sections { 1838 // No need to allocate if the section falls within guest RAM ranges 1839 if boot_guest_memory.address_in_range(GuestAddress(section.address)) { 1840 info!( 1841 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM", 1842 section 1843 ); 1844 continue; 1845 } 1846 1847 info!("Allocating TDVF Section: {:x?}", section); 1848 self.memory_manager 1849 .lock() 1850 .unwrap() 1851 .add_ram_region(GuestAddress(section.address), section.size as usize) 1852 .map_err(Error::AllocatingTdvfMemory)?; 1853 } 1854 1855 // The TDVF file contains a table of section as well as code 1856 let firmware_path = self 1857 .config 1858 .lock() 1859 .unwrap() 1860 .payload 1861 .as_ref() 1862 .unwrap() 1863 .firmware 1864 .clone() 1865 .ok_or(Error::TdxFirmwareMissing)?; 1866 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1867 1868 // The guest memory at this point now has all the required regions so it 1869 // is safe to copy from the TDVF file into it. 1870 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1871 let mem = guest_memory.memory(); 1872 let mut payload_info = None; 1873 let mut hob_offset = None; 1874 for section in sections { 1875 info!("Populating TDVF Section: {:x?}", section); 1876 match section.r#type { 1877 TdvfSectionType::Bfv | TdvfSectionType::Cfv => { 1878 info!("Copying section to guest memory"); 1879 firmware_file 1880 .seek(SeekFrom::Start(section.data_offset as u64)) 1881 .map_err(Error::LoadTdvf)?; 1882 mem.read_volatile_from( 1883 GuestAddress(section.address), 1884 &mut firmware_file, 1885 section.data_size as usize, 1886 ) 1887 .unwrap(); 1888 } 1889 TdvfSectionType::TdHob => { 1890 hob_offset = Some(section.address); 1891 } 1892 TdvfSectionType::Payload => { 1893 info!("Copying payload to guest memory"); 1894 if let Some(payload_file) = self.kernel.as_mut() { 1895 let payload_size = payload_file 1896 .seek(SeekFrom::End(0)) 1897 .map_err(Error::LoadPayload)?; 1898 1899 payload_file 1900 .seek(SeekFrom::Start(0x1f1)) 1901 .map_err(Error::LoadPayload)?; 1902 1903 let mut payload_header = linux_loader::bootparam::setup_header::default(); 1904 payload_file 1905 .read_volatile(&mut payload_header.as_bytes()) 1906 .unwrap(); 1907 1908 if payload_header.header != 0x5372_6448 { 1909 return Err(Error::InvalidPayloadType); 1910 } 1911 1912 if (payload_header.version < 0x0200) 1913 || ((payload_header.loadflags & 0x1) == 0x0) 1914 { 1915 return Err(Error::InvalidPayloadType); 1916 } 1917 1918 payload_file.rewind().map_err(Error::LoadPayload)?; 1919 mem.read_volatile_from( 1920 GuestAddress(section.address), 1921 payload_file, 1922 payload_size as usize, 1923 ) 1924 .unwrap(); 1925 1926 // Create the payload info that will be inserted into 1927 // the HOB. 1928 payload_info = Some(PayloadInfo { 1929 image_type: PayloadImageType::BzImage, 1930 entry_point: section.address, 1931 }); 1932 } 1933 } 1934 TdvfSectionType::PayloadParam => { 1935 info!("Copying payload parameters to guest memory"); 1936 let cmdline = Self::generate_cmdline( 1937 self.config.lock().unwrap().payload.as_ref().unwrap(), 1938 )?; 1939 mem.write_slice( 1940 cmdline.as_cstring().unwrap().as_bytes_with_nul(), 1941 GuestAddress(section.address), 1942 ) 1943 .unwrap(); 1944 } 1945 _ => {} 1946 } 1947 } 1948 1949 // Generate HOB 1950 let mut hob = TdHob::start(hob_offset.unwrap()); 1951 1952 let mut sorted_sections = sections.to_vec(); 1953 sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem)); 1954 1955 sorted_sections.sort_by_key(|section| section.address); 1956 sorted_sections.reverse(); 1957 1958 for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) { 1959 hob.add_memory_resource(&mem, start, size, ram, guid_found) 1960 .map_err(Error::PopulateHob)?; 1961 } 1962 1963 // MMIO regions 1964 hob.add_mmio_resource( 1965 &mem, 1966 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1967 arch::layout::APIC_START.raw_value() 1968 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1969 ) 1970 .map_err(Error::PopulateHob)?; 1971 let start_of_device_area = self 1972 .memory_manager 1973 .lock() 1974 .unwrap() 1975 .start_of_device_area() 1976 .raw_value(); 1977 let end_of_device_area = self 1978 .memory_manager 1979 .lock() 1980 .unwrap() 1981 .end_of_device_area() 1982 .raw_value(); 1983 hob.add_mmio_resource( 1984 &mem, 1985 start_of_device_area, 1986 end_of_device_area - start_of_device_area, 1987 ) 1988 .map_err(Error::PopulateHob)?; 1989 1990 // Loop over the ACPI tables and copy them to the HOB. 1991 1992 for acpi_table in crate::acpi::create_acpi_tables_tdx( 1993 &self.device_manager, 1994 &self.cpu_manager, 1995 &self.memory_manager, 1996 &self.numa_nodes, 1997 ) { 1998 hob.add_acpi_table(&mem, acpi_table.as_slice()) 1999 .map_err(Error::PopulateHob)?; 2000 } 2001 2002 // If a payload info has been created, let's insert it into the HOB. 2003 if let Some(payload_info) = payload_info { 2004 hob.add_payload(&mem, payload_info) 2005 .map_err(Error::PopulateHob)?; 2006 } 2007 2008 hob.finish(&mem).map_err(Error::PopulateHob)?; 2009 2010 Ok(hob_offset) 2011 } 2012 2013 #[cfg(feature = "tdx")] 2014 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 2015 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2016 let mem = guest_memory.memory(); 2017 2018 for section in sections { 2019 self.vm 2020 .tdx_init_memory_region( 2021 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 2022 section.address, 2023 section.size, 2024 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 2025 section.attributes == 1, 2026 ) 2027 .map_err(Error::InitializeTdxMemoryRegion)?; 2028 } 2029 2030 Ok(()) 2031 } 2032 2033 // Creates ACPI tables 2034 // In case of TDX being used, this is a no-op since the tables will be 2035 // created and passed when populating the HOB. 2036 2037 fn create_acpi_tables(&self) -> Option<GuestAddress> { 2038 #[cfg(feature = "tdx")] 2039 if self.config.lock().unwrap().is_tdx_enabled() { 2040 return None; 2041 } 2042 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 2043 let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); 2044 let rsdp_addr = crate::acpi::create_acpi_tables( 2045 &mem, 2046 &self.device_manager, 2047 &self.cpu_manager, 2048 &self.memory_manager, 2049 &self.numa_nodes, 2050 tpm_enabled, 2051 ); 2052 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 2053 2054 Some(rsdp_addr) 2055 } 2056 2057 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 2058 trace_scoped!("entry_point"); 2059 2060 self.load_payload_handle 2061 .take() 2062 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 2063 .transpose() 2064 } 2065 2066 pub fn boot(&mut self) -> Result<()> { 2067 trace_scoped!("Vm::boot"); 2068 let current_state = self.get_state()?; 2069 if current_state == VmState::Paused { 2070 return self.resume().map_err(Error::Resume); 2071 } 2072 2073 let new_state = if self.stop_on_boot { 2074 VmState::BreakPoint 2075 } else { 2076 VmState::Running 2077 }; 2078 current_state.valid_transition(new_state)?; 2079 2080 // Do earlier to parallelise with loading kernel 2081 #[cfg(target_arch = "x86_64")] 2082 cfg_if::cfg_if! { 2083 if #[cfg(feature = "sev_snp")] { 2084 let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled(); 2085 let rsdp_addr = if sev_snp_enabled { 2086 // In case of SEV-SNP guest ACPI tables are provided via 2087 // IGVM. So skip the creation of ACPI tables and set the 2088 // rsdp addr to None. 2089 None 2090 } else { 2091 self.create_acpi_tables() 2092 }; 2093 } else { 2094 let rsdp_addr = self.create_acpi_tables(); 2095 } 2096 } 2097 2098 // Load kernel synchronously or if asynchronous then wait for load to 2099 // finish. 2100 let entry_point = self.entry_point()?; 2101 2102 #[cfg(feature = "tdx")] 2103 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2104 2105 // Configure the vcpus that have been created 2106 let vcpus = self.cpu_manager.lock().unwrap().vcpus(); 2107 for vcpu in vcpus { 2108 let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2109 let boot_setup = entry_point.map(|e| (e, guest_memory)); 2110 self.cpu_manager 2111 .lock() 2112 .unwrap() 2113 .configure_vcpu(vcpu, boot_setup) 2114 .map_err(Error::CpuManager)?; 2115 } 2116 2117 #[cfg(feature = "tdx")] 2118 let (sections, guid_found) = if tdx_enabled { 2119 self.extract_tdvf_sections()? 2120 } else { 2121 (Vec::new(), false) 2122 }; 2123 2124 // Configuring the TDX regions requires that the vCPUs are created. 2125 #[cfg(feature = "tdx")] 2126 let hob_address = if tdx_enabled { 2127 // TDX sections are written to memory. 2128 self.populate_tdx_sections(§ions, guid_found)? 2129 } else { 2130 None 2131 }; 2132 2133 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 2134 // available after they are configured 2135 #[cfg(target_arch = "aarch64")] 2136 let rsdp_addr = self.create_acpi_tables(); 2137 2138 // Configure shared state based on loaded kernel 2139 entry_point 2140 .map(|entry_point| { 2141 // Safe to unwrap rsdp_addr as we know it can't be None when 2142 // the entry_point is Some. 2143 self.configure_system(rsdp_addr.unwrap(), entry_point) 2144 }) 2145 .transpose()?; 2146 2147 #[cfg(target_arch = "x86_64")] 2148 // Note: For x86, always call this function before invoking start boot vcpus. 2149 // Otherwise guest would fail to boot because we haven't created the 2150 // userspace mappings to update the hypervisor about the memory mappings. 2151 // These mappings must be created before we start the vCPU threads for 2152 // the very first time. 2153 self.memory_manager 2154 .lock() 2155 .unwrap() 2156 .allocate_address_space() 2157 .map_err(Error::MemoryManager)?; 2158 2159 #[cfg(feature = "tdx")] 2160 if let Some(hob_address) = hob_address { 2161 // With the HOB address extracted the vCPUs can have 2162 // their TDX state configured. 2163 self.cpu_manager 2164 .lock() 2165 .unwrap() 2166 .initialize_tdx(hob_address) 2167 .map_err(Error::CpuManager)?; 2168 // Let the hypervisor know which memory ranges are shared with the 2169 // guest. This prevents the guest from ignoring/discarding memory 2170 // regions provided by the host. 2171 self.init_tdx_memory(§ions)?; 2172 // With TDX memory and CPU state configured TDX setup is complete 2173 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2174 } 2175 2176 self.cpu_manager 2177 .lock() 2178 .unwrap() 2179 .start_boot_vcpus(new_state == VmState::BreakPoint) 2180 .map_err(Error::CpuManager)?; 2181 2182 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2183 *state = new_state; 2184 Ok(()) 2185 } 2186 2187 pub fn restore(&mut self) -> Result<()> { 2188 event!("vm", "restoring"); 2189 2190 #[cfg(target_arch = "x86_64")] 2191 // Note: For x86, always call this function before invoking start boot vcpus. 2192 // Otherwise guest would fail to boot because we haven't created the 2193 // userspace mappings to update the hypervisor about the memory mappings. 2194 // These mappings must be created before we start the vCPU threads for 2195 // the very first time for the restored VM. 2196 self.memory_manager 2197 .lock() 2198 .unwrap() 2199 .allocate_address_space() 2200 .map_err(Error::MemoryManager)?; 2201 2202 // Now we can start all vCPUs from here. 2203 self.cpu_manager 2204 .lock() 2205 .unwrap() 2206 .start_restored_vcpus() 2207 .map_err(Error::CpuManager)?; 2208 2209 event!("vm", "restored"); 2210 Ok(()) 2211 } 2212 2213 /// Gets a thread-safe reference counted pointer to the VM configuration. 2214 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2215 Arc::clone(&self.config) 2216 } 2217 2218 /// Get the VM state. Returns an error if the state is poisoned. 2219 pub fn get_state(&self) -> Result<VmState> { 2220 self.state 2221 .try_read() 2222 .map_err(|_| Error::PoisonedState) 2223 .map(|state| *state) 2224 } 2225 2226 /// Gets the actual size of the balloon. 2227 pub fn balloon_size(&self) -> u64 { 2228 self.device_manager.lock().unwrap().balloon_size() 2229 } 2230 2231 pub fn send_memory_fds( 2232 &mut self, 2233 socket: &mut UnixStream, 2234 ) -> std::result::Result<(), MigratableError> { 2235 for (slot, fd) in self 2236 .memory_manager 2237 .lock() 2238 .unwrap() 2239 .memory_slot_fds() 2240 .drain() 2241 { 2242 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2243 .write_to(socket) 2244 .map_err(|e| { 2245 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2246 })?; 2247 socket 2248 .send_with_fd(&slot.to_le_bytes()[..], fd) 2249 .map_err(|e| { 2250 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2251 })?; 2252 2253 let res = Response::read_from(socket)?; 2254 if res.status() != Status::Ok { 2255 warn!("Error during memory fd migration"); 2256 Request::abandon().write_to(socket)?; 2257 Response::read_from(socket).ok(); 2258 return Err(MigratableError::MigrateSend(anyhow!( 2259 "Error during memory fd migration" 2260 ))); 2261 } 2262 } 2263 2264 Ok(()) 2265 } 2266 2267 pub fn send_memory_regions<F>( 2268 &mut self, 2269 ranges: &MemoryRangeTable, 2270 fd: &mut F, 2271 ) -> std::result::Result<(), MigratableError> 2272 where 2273 F: WriteVolatile, 2274 { 2275 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2276 let mem = guest_memory.memory(); 2277 2278 for range in ranges.regions() { 2279 let mut offset: u64 = 0; 2280 // Here we are manually handling the retry in case we can't the 2281 // whole region at once because we can't use the implementation 2282 // from vm-memory::GuestMemory of write_all_to() as it is not 2283 // following the correct behavior. For more info about this issue 2284 // see: https://github.com/rust-vmm/vm-memory/issues/174 2285 loop { 2286 let bytes_written = mem 2287 .write_volatile_to( 2288 GuestAddress(range.gpa + offset), 2289 fd, 2290 (range.length - offset) as usize, 2291 ) 2292 .map_err(|e| { 2293 MigratableError::MigrateSend(anyhow!( 2294 "Error transferring memory to socket: {}", 2295 e 2296 )) 2297 })?; 2298 offset += bytes_written as u64; 2299 2300 if offset == range.length { 2301 break; 2302 } 2303 } 2304 } 2305 2306 Ok(()) 2307 } 2308 2309 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2310 self.memory_manager 2311 .lock() 2312 .unwrap() 2313 .memory_range_table(false) 2314 } 2315 2316 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2317 self.device_manager.lock().unwrap().device_tree() 2318 } 2319 2320 pub fn activate_virtio_devices(&self) -> Result<()> { 2321 self.device_manager 2322 .lock() 2323 .unwrap() 2324 .activate_virtio_devices() 2325 .map_err(Error::ActivateVirtioDevices) 2326 } 2327 2328 #[cfg(target_arch = "x86_64")] 2329 pub fn power_button(&self) -> Result<()> { 2330 return self 2331 .device_manager 2332 .lock() 2333 .unwrap() 2334 .notify_power_button() 2335 .map_err(Error::PowerButton); 2336 } 2337 2338 #[cfg(target_arch = "aarch64")] 2339 pub fn power_button(&self) -> Result<()> { 2340 self.device_manager 2341 .lock() 2342 .unwrap() 2343 .notify_power_button() 2344 .map_err(Error::PowerButton) 2345 } 2346 2347 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2348 self.memory_manager.lock().unwrap().snapshot_data() 2349 } 2350 2351 #[cfg(feature = "guest_debug")] 2352 pub fn debug_request( 2353 &mut self, 2354 gdb_request: &GdbRequestPayload, 2355 cpu_id: usize, 2356 ) -> Result<GdbResponsePayload> { 2357 use GdbRequestPayload::*; 2358 match gdb_request { 2359 SetSingleStep(single_step) => { 2360 self.set_guest_debug(cpu_id, &[], *single_step) 2361 .map_err(Error::Debug)?; 2362 } 2363 SetHwBreakPoint(addrs) => { 2364 self.set_guest_debug(cpu_id, addrs, false) 2365 .map_err(Error::Debug)?; 2366 } 2367 Pause => { 2368 self.debug_pause().map_err(Error::Debug)?; 2369 } 2370 Resume => { 2371 self.debug_resume().map_err(Error::Debug)?; 2372 } 2373 ReadRegs => { 2374 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?; 2375 return Ok(GdbResponsePayload::RegValues(Box::new(regs))); 2376 } 2377 WriteRegs(regs) => { 2378 self.write_regs(cpu_id, regs).map_err(Error::Debug)?; 2379 } 2380 ReadMem(vaddr, len) => { 2381 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2382 let mem = self 2383 .read_mem(&guest_memory, cpu_id, *vaddr, *len) 2384 .map_err(Error::Debug)?; 2385 return Ok(GdbResponsePayload::MemoryRegion(mem)); 2386 } 2387 WriteMem(vaddr, data) => { 2388 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2389 self.write_mem(&guest_memory, cpu_id, vaddr, data) 2390 .map_err(Error::Debug)?; 2391 } 2392 ActiveVcpus => { 2393 let active_vcpus = self.active_vcpus(); 2394 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus)); 2395 } 2396 } 2397 Ok(GdbResponsePayload::CommandComplete) 2398 } 2399 2400 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2401 fn get_dump_state( 2402 &mut self, 2403 destination_url: &str, 2404 ) -> std::result::Result<DumpState, GuestDebuggableError> { 2405 let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32; 2406 let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize; 2407 let mut elf_phdr_num = 1; 2408 let elf_sh_info = 0; 2409 let coredump_file_path = url_to_file(destination_url)?; 2410 let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings(); 2411 2412 if mapping_num < UINT16_MAX - 2 { 2413 elf_phdr_num += mapping_num as u16; 2414 } else { 2415 panic!("mapping num beyond 65535 not supported"); 2416 } 2417 let coredump_file = OpenOptions::new() 2418 .read(true) 2419 .write(true) 2420 .create_new(true) 2421 .open(coredump_file_path) 2422 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2423 2424 let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size); 2425 let mem_data = self 2426 .memory_manager 2427 .lock() 2428 .unwrap() 2429 .coredump_memory_regions(mem_offset); 2430 2431 Ok(DumpState { 2432 elf_note_size, 2433 elf_phdr_num, 2434 elf_sh_info, 2435 mem_offset, 2436 mem_info: Some(mem_data), 2437 file: Some(coredump_file), 2438 }) 2439 } 2440 2441 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2442 fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 { 2443 size_of::<elf::Elf64_Ehdr>() as u64 2444 + note_size as u64 2445 + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64 2446 } 2447 2448 pub fn nmi(&self) -> Result<()> { 2449 return self 2450 .cpu_manager 2451 .lock() 2452 .unwrap() 2453 .nmi() 2454 .map_err(|_| Error::ErrorNmi); 2455 } 2456 } 2457 2458 impl Pausable for Vm { 2459 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2460 event!("vm", "pausing"); 2461 let mut state = self 2462 .state 2463 .try_write() 2464 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?; 2465 let new_state = VmState::Paused; 2466 2467 state 2468 .valid_transition(new_state) 2469 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?; 2470 2471 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2472 { 2473 let mut clock = self 2474 .vm 2475 .get_clock() 2476 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?; 2477 clock.reset_flags(); 2478 self.saved_clock = Some(clock); 2479 } 2480 2481 // Before pausing the vCPUs activate any pending virtio devices that might 2482 // need activation between starting the pause (or e.g. a migration it's part of) 2483 self.activate_virtio_devices().map_err(|e| { 2484 MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e)) 2485 })?; 2486 2487 self.cpu_manager.lock().unwrap().pause()?; 2488 self.device_manager.lock().unwrap().pause()?; 2489 2490 *state = new_state; 2491 2492 event!("vm", "paused"); 2493 Ok(()) 2494 } 2495 2496 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2497 event!("vm", "resuming"); 2498 let mut state = self 2499 .state 2500 .try_write() 2501 .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?; 2502 let new_state = VmState::Running; 2503 2504 state 2505 .valid_transition(new_state) 2506 .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?; 2507 2508 self.cpu_manager.lock().unwrap().resume()?; 2509 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2510 { 2511 if let Some(clock) = &self.saved_clock { 2512 self.vm.set_clock(clock).map_err(|e| { 2513 MigratableError::Resume(anyhow!("Could not set VM clock: {}", e)) 2514 })?; 2515 } 2516 } 2517 self.device_manager.lock().unwrap().resume()?; 2518 2519 // And we're back to the Running state. 2520 *state = new_state; 2521 event!("vm", "resumed"); 2522 Ok(()) 2523 } 2524 } 2525 2526 #[derive(Serialize, Deserialize)] 2527 pub struct VmSnapshot { 2528 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2529 pub clock: Option<hypervisor::ClockData>, 2530 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2531 pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>, 2532 } 2533 2534 pub const VM_SNAPSHOT_ID: &str = "vm"; 2535 impl Snapshottable for Vm { 2536 fn id(&self) -> String { 2537 VM_SNAPSHOT_ID.to_string() 2538 } 2539 2540 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2541 event!("vm", "snapshotting"); 2542 2543 #[cfg(feature = "tdx")] 2544 { 2545 if self.config.lock().unwrap().is_tdx_enabled() { 2546 return Err(MigratableError::Snapshot(anyhow!( 2547 "Snapshot not possible with TDX VM" 2548 ))); 2549 } 2550 } 2551 2552 let current_state = self.get_state().unwrap(); 2553 if current_state != VmState::Paused { 2554 return Err(MigratableError::Snapshot(anyhow!( 2555 "Trying to snapshot while VM is running" 2556 ))); 2557 } 2558 2559 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2560 let common_cpuid = { 2561 let amx = self.config.lock().unwrap().cpus.features.amx; 2562 let phys_bits = physical_bits( 2563 &self.hypervisor, 2564 self.config.lock().unwrap().cpus.max_phys_bits, 2565 ); 2566 arch::generate_common_cpuid( 2567 &self.hypervisor, 2568 &arch::CpuidConfig { 2569 sgx_epc_sections: None, 2570 phys_bits, 2571 kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv, 2572 #[cfg(feature = "tdx")] 2573 tdx: false, 2574 amx, 2575 }, 2576 ) 2577 .map_err(|e| { 2578 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e)) 2579 })? 2580 }; 2581 2582 let vm_snapshot_data = serde_json::to_vec(&VmSnapshot { 2583 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2584 clock: self.saved_clock, 2585 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2586 common_cpuid, 2587 }) 2588 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2589 2590 let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data)); 2591 2592 let (id, snapshot) = { 2593 let mut cpu_manager = self.cpu_manager.lock().unwrap(); 2594 (cpu_manager.id(), cpu_manager.snapshot()?) 2595 }; 2596 vm_snapshot.add_snapshot(id, snapshot); 2597 let (id, snapshot) = { 2598 let mut memory_manager = self.memory_manager.lock().unwrap(); 2599 (memory_manager.id(), memory_manager.snapshot()?) 2600 }; 2601 vm_snapshot.add_snapshot(id, snapshot); 2602 let (id, snapshot) = { 2603 let mut device_manager = self.device_manager.lock().unwrap(); 2604 (device_manager.id(), device_manager.snapshot()?) 2605 }; 2606 vm_snapshot.add_snapshot(id, snapshot); 2607 2608 event!("vm", "snapshotted"); 2609 Ok(vm_snapshot) 2610 } 2611 } 2612 2613 impl Transportable for Vm { 2614 fn send( 2615 &self, 2616 snapshot: &Snapshot, 2617 destination_url: &str, 2618 ) -> std::result::Result<(), MigratableError> { 2619 let mut snapshot_config_path = url_to_path(destination_url)?; 2620 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2621 2622 // Create the snapshot config file 2623 let mut snapshot_config_file = OpenOptions::new() 2624 .read(true) 2625 .write(true) 2626 .create_new(true) 2627 .open(snapshot_config_path) 2628 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2629 2630 // Serialize and write the snapshot config 2631 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2632 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2633 2634 snapshot_config_file 2635 .write(vm_config.as_bytes()) 2636 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2637 2638 let mut snapshot_state_path = url_to_path(destination_url)?; 2639 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2640 2641 // Create the snapshot state file 2642 let mut snapshot_state_file = OpenOptions::new() 2643 .read(true) 2644 .write(true) 2645 .create_new(true) 2646 .open(snapshot_state_path) 2647 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2648 2649 // Serialize and write the snapshot state 2650 let vm_state = 2651 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2652 2653 snapshot_state_file 2654 .write(&vm_state) 2655 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2656 2657 // Tell the memory manager to also send/write its own snapshot. 2658 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2659 self.memory_manager 2660 .lock() 2661 .unwrap() 2662 .send(&memory_manager_snapshot.clone(), destination_url)?; 2663 } else { 2664 return Err(MigratableError::Restore(anyhow!( 2665 "Missing memory manager snapshot" 2666 ))); 2667 } 2668 2669 Ok(()) 2670 } 2671 } 2672 2673 impl Migratable for Vm { 2674 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2675 self.memory_manager.lock().unwrap().start_dirty_log()?; 2676 self.device_manager.lock().unwrap().start_dirty_log() 2677 } 2678 2679 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2680 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2681 self.device_manager.lock().unwrap().stop_dirty_log() 2682 } 2683 2684 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2685 Ok(MemoryRangeTable::new_from_tables(vec![ 2686 self.memory_manager.lock().unwrap().dirty_log()?, 2687 self.device_manager.lock().unwrap().dirty_log()?, 2688 ])) 2689 } 2690 2691 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2692 self.memory_manager.lock().unwrap().start_migration()?; 2693 self.device_manager.lock().unwrap().start_migration() 2694 } 2695 2696 fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { 2697 self.memory_manager.lock().unwrap().complete_migration()?; 2698 self.device_manager.lock().unwrap().complete_migration() 2699 } 2700 } 2701 2702 #[cfg(feature = "guest_debug")] 2703 impl Debuggable for Vm { 2704 fn set_guest_debug( 2705 &self, 2706 cpu_id: usize, 2707 addrs: &[GuestAddress], 2708 singlestep: bool, 2709 ) -> std::result::Result<(), DebuggableError> { 2710 self.cpu_manager 2711 .lock() 2712 .unwrap() 2713 .set_guest_debug(cpu_id, addrs, singlestep) 2714 } 2715 2716 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2717 if *self.state.read().unwrap() == VmState::Running { 2718 self.pause().map_err(DebuggableError::Pause)?; 2719 } 2720 2721 let mut state = self 2722 .state 2723 .try_write() 2724 .map_err(|_| DebuggableError::PoisonedState)?; 2725 *state = VmState::BreakPoint; 2726 Ok(()) 2727 } 2728 2729 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2730 if *self.state.read().unwrap() == VmState::BreakPoint { 2731 self.resume().map_err(DebuggableError::Pause)?; 2732 } 2733 2734 Ok(()) 2735 } 2736 2737 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2738 self.cpu_manager.lock().unwrap().read_regs(cpu_id) 2739 } 2740 2741 fn write_regs( 2742 &self, 2743 cpu_id: usize, 2744 regs: &CoreRegs, 2745 ) -> std::result::Result<(), DebuggableError> { 2746 self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs) 2747 } 2748 2749 fn read_mem( 2750 &self, 2751 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2752 cpu_id: usize, 2753 vaddr: GuestAddress, 2754 len: usize, 2755 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2756 self.cpu_manager 2757 .lock() 2758 .unwrap() 2759 .read_mem(guest_memory, cpu_id, vaddr, len) 2760 } 2761 2762 fn write_mem( 2763 &self, 2764 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2765 cpu_id: usize, 2766 vaddr: &GuestAddress, 2767 data: &[u8], 2768 ) -> std::result::Result<(), DebuggableError> { 2769 self.cpu_manager 2770 .lock() 2771 .unwrap() 2772 .write_mem(guest_memory, cpu_id, vaddr, data) 2773 } 2774 2775 fn active_vcpus(&self) -> usize { 2776 let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus(); 2777 if active_vcpus > 0 { 2778 active_vcpus 2779 } else { 2780 // The VM is not booted yet. Report boot_vcpus() instead. 2781 self.cpu_manager.lock().unwrap().boot_vcpus() as usize 2782 } 2783 } 2784 } 2785 2786 #[cfg(feature = "guest_debug")] 2787 pub const UINT16_MAX: u32 = 65535; 2788 2789 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2790 impl Elf64Writable for Vm {} 2791 2792 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2793 impl GuestDebuggable for Vm { 2794 fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> { 2795 event!("vm", "coredumping"); 2796 2797 let mut resume = false; 2798 2799 #[cfg(feature = "tdx")] 2800 { 2801 if let Some(ref platform) = self.config.lock().unwrap().platform { 2802 if platform.tdx { 2803 return Err(GuestDebuggableError::Coredump(anyhow!( 2804 "Coredump not possible with TDX VM" 2805 ))); 2806 } 2807 } 2808 } 2809 2810 match self.get_state().unwrap() { 2811 VmState::Running => { 2812 self.pause().map_err(GuestDebuggableError::Pause)?; 2813 resume = true; 2814 } 2815 VmState::Paused => {} 2816 _ => { 2817 return Err(GuestDebuggableError::Coredump(anyhow!( 2818 "Trying to coredump while VM is not running or paused" 2819 ))); 2820 } 2821 } 2822 2823 let coredump_state = self.get_dump_state(destination_url)?; 2824 2825 self.write_header(&coredump_state)?; 2826 self.write_note(&coredump_state)?; 2827 self.write_loads(&coredump_state)?; 2828 2829 self.cpu_manager 2830 .lock() 2831 .unwrap() 2832 .cpu_write_elf64_note(&coredump_state)?; 2833 self.cpu_manager 2834 .lock() 2835 .unwrap() 2836 .cpu_write_vmm_note(&coredump_state)?; 2837 2838 self.memory_manager 2839 .lock() 2840 .unwrap() 2841 .coredump_iterate_save_mem(&coredump_state)?; 2842 2843 if resume { 2844 self.resume().map_err(GuestDebuggableError::Resume)?; 2845 } 2846 2847 Ok(()) 2848 } 2849 } 2850 2851 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2852 #[cfg(test)] 2853 mod tests { 2854 use super::*; 2855 2856 fn test_vm_state_transitions(state: VmState) { 2857 match state { 2858 VmState::Created => { 2859 // Check the transitions from Created 2860 assert!(state.valid_transition(VmState::Created).is_err()); 2861 assert!(state.valid_transition(VmState::Running).is_ok()); 2862 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2863 assert!(state.valid_transition(VmState::Paused).is_ok()); 2864 assert!(state.valid_transition(VmState::BreakPoint).is_ok()); 2865 } 2866 VmState::Running => { 2867 // Check the transitions from Running 2868 assert!(state.valid_transition(VmState::Created).is_err()); 2869 assert!(state.valid_transition(VmState::Running).is_err()); 2870 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2871 assert!(state.valid_transition(VmState::Paused).is_ok()); 2872 assert!(state.valid_transition(VmState::BreakPoint).is_ok()); 2873 } 2874 VmState::Shutdown => { 2875 // Check the transitions from Shutdown 2876 assert!(state.valid_transition(VmState::Created).is_err()); 2877 assert!(state.valid_transition(VmState::Running).is_ok()); 2878 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2879 assert!(state.valid_transition(VmState::Paused).is_err()); 2880 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2881 } 2882 VmState::Paused => { 2883 // Check the transitions from Paused 2884 assert!(state.valid_transition(VmState::Created).is_err()); 2885 assert!(state.valid_transition(VmState::Running).is_ok()); 2886 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2887 assert!(state.valid_transition(VmState::Paused).is_err()); 2888 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2889 } 2890 VmState::BreakPoint => { 2891 // Check the transitions from Breakpoint 2892 assert!(state.valid_transition(VmState::Created).is_ok()); 2893 assert!(state.valid_transition(VmState::Running).is_ok()); 2894 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2895 assert!(state.valid_transition(VmState::Paused).is_err()); 2896 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2897 } 2898 } 2899 } 2900 2901 #[test] 2902 fn test_vm_created_transitions() { 2903 test_vm_state_transitions(VmState::Created); 2904 } 2905 2906 #[test] 2907 fn test_vm_running_transitions() { 2908 test_vm_state_transitions(VmState::Running); 2909 } 2910 2911 #[test] 2912 fn test_vm_shutdown_transitions() { 2913 test_vm_state_transitions(VmState::Shutdown); 2914 } 2915 2916 #[test] 2917 fn test_vm_paused_transitions() { 2918 test_vm_state_transitions(VmState::Paused); 2919 } 2920 2921 #[cfg(feature = "tdx")] 2922 #[test] 2923 fn test_hob_memory_resources() { 2924 // Case 1: Two TDVF sections in the middle of the RAM 2925 let sections = vec![ 2926 TdvfSection { 2927 address: 0xc000, 2928 size: 0x1000, 2929 ..Default::default() 2930 }, 2931 TdvfSection { 2932 address: 0x1000, 2933 size: 0x4000, 2934 ..Default::default() 2935 }, 2936 ]; 2937 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)]; 2938 let expected = vec![ 2939 (0, 0x1000, true), 2940 (0x1000, 0x4000, false), 2941 (0x5000, 0x7000, true), 2942 (0xc000, 0x1000, false), 2943 (0xd000, 0x0fff_3000, true), 2944 ]; 2945 assert_eq!( 2946 expected, 2947 Vm::hob_memory_resources( 2948 sections, 2949 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 2950 ) 2951 ); 2952 2953 // Case 2: Two TDVF sections with no conflict with the RAM 2954 let sections = vec![ 2955 TdvfSection { 2956 address: 0x1000_1000, 2957 size: 0x1000, 2958 ..Default::default() 2959 }, 2960 TdvfSection { 2961 address: 0, 2962 size: 0x1000, 2963 ..Default::default() 2964 }, 2965 ]; 2966 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 2967 let expected = vec![ 2968 (0, 0x1000, false), 2969 (0x1000, 0x1000_0000, true), 2970 (0x1000_1000, 0x1000, false), 2971 ]; 2972 assert_eq!( 2973 expected, 2974 Vm::hob_memory_resources( 2975 sections, 2976 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 2977 ) 2978 ); 2979 2980 // Case 3: Two TDVF sections with partial conflicts with the RAM 2981 let sections = vec![ 2982 TdvfSection { 2983 address: 0x1000_0000, 2984 size: 0x2000, 2985 ..Default::default() 2986 }, 2987 TdvfSection { 2988 address: 0, 2989 size: 0x2000, 2990 ..Default::default() 2991 }, 2992 ]; 2993 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 2994 let expected = vec![ 2995 (0, 0x2000, false), 2996 (0x2000, 0x0fff_e000, true), 2997 (0x1000_0000, 0x2000, false), 2998 ]; 2999 assert_eq!( 3000 expected, 3001 Vm::hob_memory_resources( 3002 sections, 3003 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3004 ) 3005 ); 3006 3007 // Case 4: Two TDVF sections with no conflict before the RAM and two 3008 // more additional sections with no conflict after the RAM. 3009 let sections = vec![ 3010 TdvfSection { 3011 address: 0x2000_1000, 3012 size: 0x1000, 3013 ..Default::default() 3014 }, 3015 TdvfSection { 3016 address: 0x2000_0000, 3017 size: 0x1000, 3018 ..Default::default() 3019 }, 3020 TdvfSection { 3021 address: 0x1000, 3022 size: 0x1000, 3023 ..Default::default() 3024 }, 3025 TdvfSection { 3026 address: 0, 3027 size: 0x1000, 3028 ..Default::default() 3029 }, 3030 ]; 3031 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)]; 3032 let expected = vec![ 3033 (0, 0x1000, false), 3034 (0x1000, 0x1000, false), 3035 (0x4000, 0x1000_0000, true), 3036 (0x2000_0000, 0x1000, false), 3037 (0x2000_1000, 0x1000, false), 3038 ]; 3039 assert_eq!( 3040 expected, 3041 Vm::hob_memory_resources( 3042 sections, 3043 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3044 ) 3045 ); 3046 3047 // Case 5: One TDVF section overriding the entire RAM 3048 let sections = vec![TdvfSection { 3049 address: 0, 3050 size: 0x2000_0000, 3051 ..Default::default() 3052 }]; 3053 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3054 let expected = vec![(0, 0x2000_0000, false)]; 3055 assert_eq!( 3056 expected, 3057 Vm::hob_memory_resources( 3058 sections, 3059 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3060 ) 3061 ); 3062 3063 // Case 6: Two TDVF sections with no conflict with 2 RAM regions 3064 let sections = vec![ 3065 TdvfSection { 3066 address: 0x1000_2000, 3067 size: 0x2000, 3068 ..Default::default() 3069 }, 3070 TdvfSection { 3071 address: 0, 3072 size: 0x2000, 3073 ..Default::default() 3074 }, 3075 ]; 3076 let guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3077 (GuestAddress(0x2000), 0x1000_0000), 3078 (GuestAddress(0x1000_4000), 0x1000_0000), 3079 ]; 3080 let expected = vec![ 3081 (0, 0x2000, false), 3082 (0x2000, 0x1000_0000, true), 3083 (0x1000_2000, 0x2000, false), 3084 (0x1000_4000, 0x1000_0000, true), 3085 ]; 3086 assert_eq!( 3087 expected, 3088 Vm::hob_memory_resources( 3089 sections, 3090 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3091 ) 3092 ); 3093 3094 // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions 3095 let sections = vec![ 3096 TdvfSection { 3097 address: 0x1000_0000, 3098 size: 0x4000, 3099 ..Default::default() 3100 }, 3101 TdvfSection { 3102 address: 0, 3103 size: 0x4000, 3104 ..Default::default() 3105 }, 3106 ]; 3107 let guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3108 (GuestAddress(0x1000), 0x1000_0000), 3109 (GuestAddress(0x1000_3000), 0x1000_0000), 3110 ]; 3111 let expected = vec![ 3112 (0, 0x4000, false), 3113 (0x4000, 0x0fff_c000, true), 3114 (0x1000_0000, 0x4000, false), 3115 (0x1000_4000, 0x0fff_f000, true), 3116 ]; 3117 assert_eq!( 3118 expected, 3119 Vm::hob_memory_resources( 3120 sections, 3121 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3122 ) 3123 ); 3124 } 3125 } 3126 3127 #[cfg(target_arch = "aarch64")] 3128 #[cfg(test)] 3129 mod tests { 3130 use super::*; 3131 use arch::aarch64::fdt::create_fdt; 3132 use arch::aarch64::layout; 3133 use arch::{DeviceType, MmioDeviceInfo}; 3134 use devices::gic::Gic; 3135 3136 const LEN: u64 = 4096; 3137 3138 #[test] 3139 fn test_create_fdt_with_devices() { 3140 let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)]; 3141 let mem = GuestMemoryMmap::from_ranges(®ions).expect("Cannot initialize memory"); 3142 3143 let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [ 3144 ( 3145 (DeviceType::Serial, DeviceType::Serial.to_string()), 3146 MmioDeviceInfo { 3147 addr: 0x00, 3148 len: LEN, 3149 irq: 33, 3150 }, 3151 ), 3152 ( 3153 (DeviceType::Virtio(1), "virtio".to_string()), 3154 MmioDeviceInfo { 3155 addr: LEN, 3156 len: LEN, 3157 irq: 34, 3158 }, 3159 ), 3160 ( 3161 (DeviceType::Rtc, "rtc".to_string()), 3162 MmioDeviceInfo { 3163 addr: 2 * LEN, 3164 len: LEN, 3165 irq: 35, 3166 }, 3167 ), 3168 ] 3169 .iter() 3170 .cloned() 3171 .collect(); 3172 3173 let hv = hypervisor::new().unwrap(); 3174 let vm = hv.create_vm().unwrap(); 3175 let gic = vm 3176 .create_vgic(Gic::create_default_config(1)) 3177 .expect("Cannot create gic"); 3178 assert!(create_fdt( 3179 &mem, 3180 "console=tty0", 3181 vec![0], 3182 Some((0, 0, 0)), 3183 &dev_info, 3184 &gic, 3185 &None, 3186 &Vec::new(), 3187 &BTreeMap::new(), 3188 None, 3189 true, 3190 ) 3191 .is_ok()) 3192 } 3193 } 3194 3195 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 3196 #[test] 3197 pub fn test_vm() { 3198 use hypervisor::VmExit; 3199 use vm_memory::{Address, GuestMemory, GuestMemoryRegion}; 3200 // This example based on https://lwn.net/Articles/658511/ 3201 let code = [ 3202 0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */ 3203 0x00, 0xd8, /* add %bl, %al */ 3204 0x04, b'0', /* add $'0', %al */ 3205 0xee, /* out %al, (%dx) */ 3206 0xb0, b'\n', /* mov $'\n', %al */ 3207 0xee, /* out %al, (%dx) */ 3208 0xf4, /* hlt */ 3209 ]; 3210 3211 let mem_size = 0x1000; 3212 let load_addr = GuestAddress(0x1000); 3213 let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap(); 3214 3215 let hv = hypervisor::new().unwrap(); 3216 let vm = hv.create_vm().expect("new VM creation failed"); 3217 3218 for (index, region) in mem.iter().enumerate() { 3219 let mem_region = vm.make_user_memory_region( 3220 index as u32, 3221 region.start_addr().raw_value(), 3222 region.len(), 3223 region.as_ptr() as u64, 3224 false, 3225 false, 3226 ); 3227 3228 vm.create_user_memory_region(mem_region) 3229 .expect("Cannot configure guest memory"); 3230 } 3231 mem.write_slice(&code, load_addr) 3232 .expect("Writing code to memory failed"); 3233 3234 let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed"); 3235 3236 let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed"); 3237 vcpu_sregs.cs.base = 0; 3238 vcpu_sregs.cs.selector = 0; 3239 vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed"); 3240 3241 let mut vcpu_regs = vcpu.get_regs().expect("get regs failed"); 3242 vcpu_regs.rip = 0x1000; 3243 vcpu_regs.rax = 2; 3244 vcpu_regs.rbx = 3; 3245 vcpu_regs.rflags = 2; 3246 vcpu.set_regs(&vcpu_regs).expect("set regs failed"); 3247 3248 loop { 3249 match vcpu.run().expect("run failed") { 3250 VmExit::IoOut(addr, data) => { 3251 println!( 3252 "IO out -- addr: {:#x} data [{:?}]", 3253 addr, 3254 str::from_utf8(data).unwrap() 3255 ); 3256 } 3257 VmExit::Reset => { 3258 println!("HLT"); 3259 break; 3260 } 3261 r => panic!("unexpected exit reason: {r:?}"), 3262 } 3263 } 3264 } 3265