1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::{ 15 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 16 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 17 }; 18 use crate::config::{NumaConfig, PayloadConfig}; 19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 20 use crate::coredump::{ 21 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 22 }; 23 use crate::cpu; 24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair}; 25 use crate::device_tree::DeviceTree; 26 #[cfg(feature = "guest_debug")] 27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 28 #[cfg(feature = "igvm")] 29 use crate::igvm::igvm_loader; 30 use crate::memory_manager::{ 31 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 32 }; 33 #[cfg(target_arch = "x86_64")] 34 use crate::migration::get_vm_snapshot; 35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 36 use crate::migration::url_to_file; 37 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 38 use crate::GuestMemoryMmap; 39 use crate::{ 40 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 41 }; 42 use anyhow::anyhow; 43 use arch::get_host_cpu_phys_bits; 44 #[cfg(target_arch = "x86_64")] 45 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 46 #[cfg(feature = "tdx")] 47 use arch::x86_64::tdx::TdvfSection; 48 use arch::EntryPoint; 49 
#[cfg(target_arch = "aarch64")] 50 use arch::PciSpaceInfo; 51 use arch::{NumaNode, NumaNodes}; 52 #[cfg(target_arch = "aarch64")] 53 use devices::interrupt_controller; 54 use devices::AcpiNotificationFlags; 55 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 56 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 57 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 58 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 59 use hypervisor::{HypervisorVmError, VmOps}; 60 use libc::{termios, SIGWINCH}; 61 use linux_loader::cmdline::Cmdline; 62 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 63 use linux_loader::elf; 64 #[cfg(target_arch = "x86_64")] 65 use linux_loader::loader::bzimage::BzImage; 66 #[cfg(target_arch = "x86_64")] 67 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 68 #[cfg(target_arch = "aarch64")] 69 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 70 use linux_loader::loader::KernelLoader; 71 use seccompiler::SeccompAction; 72 use serde::{Deserialize, Serialize}; 73 use std::cmp; 74 use std::collections::BTreeMap; 75 use std::collections::HashMap; 76 use std::fs::{File, OpenOptions}; 77 use std::io::{self, Seek, SeekFrom, Write}; 78 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 79 use std::mem::size_of; 80 use std::num::Wrapping; 81 use std::ops::Deref; 82 use std::os::unix::net::UnixStream; 83 use std::sync::{Arc, Mutex, RwLock}; 84 use std::time::Instant; 85 use std::{result, str, thread}; 86 use thiserror::Error; 87 use tracer::trace_scoped; 88 use vm_device::Bus; 89 #[cfg(feature = "tdx")] 90 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile}; 91 use vm_memory::{ 92 Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile, 93 }; 94 use vm_migration::protocol::{Request, Response, Status}; 95 use vm_migration::{ 96 protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, 
97 Snapshottable, Transportable, 98 }; 99 use vmm_sys_util::eventfd::EventFd; 100 use vmm_sys_util::sock_ctrl_msg::ScmSocket; 101 102 /// Errors associated with VM management 103 #[derive(Debug, Error)] 104 pub enum Error { 105 #[error("Cannot open kernel file: {0}")] 106 KernelFile(#[source] io::Error), 107 108 #[error("Cannot open initramfs file: {0}")] 109 InitramfsFile(#[source] io::Error), 110 111 #[error("Cannot load the kernel into memory: {0}")] 112 KernelLoad(#[source] linux_loader::loader::Error), 113 114 #[cfg(target_arch = "aarch64")] 115 #[error("Cannot load the UEFI binary in memory: {0:?}")] 116 UefiLoad(arch::aarch64::uefi::Error), 117 118 #[error("Cannot load the initramfs into memory")] 119 InitramfsLoad, 120 121 #[error("Cannot load the kernel command line in memory: {0}")] 122 LoadCmdLine(#[source] linux_loader::loader::Error), 123 124 #[error("Cannot modify the kernel command line: {0}")] 125 CmdLineInsertStr(#[source] linux_loader::cmdline::Error), 126 127 #[error("Cannot create the kernel command line: {0}")] 128 CmdLineCreate(#[source] linux_loader::cmdline::Error), 129 130 #[error("Cannot configure system: {0}")] 131 ConfigureSystem(#[source] arch::Error), 132 133 #[cfg(target_arch = "aarch64")] 134 #[error("Cannot enable interrupt controller: {0:?}")] 135 EnableInterruptController(interrupt_controller::Error), 136 137 #[error("VM state is poisoned")] 138 PoisonedState, 139 140 #[error("Error from device manager: {0:?}")] 141 DeviceManager(DeviceManagerError), 142 143 #[error("No device with id {0:?} to remove")] 144 NoDeviceToRemove(String), 145 146 #[error("Cannot spawn a signal handler thread: {0}")] 147 SignalHandlerSpawn(#[source] io::Error), 148 149 #[error("Failed to join on threads: {0:?}")] 150 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 151 152 #[error("VM config is missing")] 153 VmMissingConfig, 154 155 #[error("VM is not created")] 156 VmNotCreated, 157 158 #[error("VM is already created")] 159 
VmAlreadyCreated, 160 161 #[error("VM is not running")] 162 VmNotRunning, 163 164 #[error("Cannot clone EventFd: {0}")] 165 EventFdClone(#[source] io::Error), 166 167 #[error("invalid VM state transition: {0:?} to {1:?}")] 168 InvalidStateTransition(VmState, VmState), 169 170 #[error("Error from CPU manager: {0}")] 171 CpuManager(#[source] cpu::Error), 172 173 #[error("Cannot pause devices: {0}")] 174 PauseDevices(#[source] MigratableError), 175 176 #[error("Cannot resume devices: {0}")] 177 ResumeDevices(#[source] MigratableError), 178 179 #[error("Cannot pause CPUs: {0}")] 180 PauseCpus(#[source] MigratableError), 181 182 #[error("Cannot resume cpus: {0}")] 183 ResumeCpus(#[source] MigratableError), 184 185 #[error("Cannot pause VM: {0}")] 186 Pause(#[source] MigratableError), 187 188 #[error("Cannot resume VM: {0}")] 189 Resume(#[source] MigratableError), 190 191 #[error("Memory manager error: {0:?}")] 192 MemoryManager(MemoryManagerError), 193 194 #[error("Eventfd write error: {0}")] 195 EventfdError(#[source] std::io::Error), 196 197 #[error("Cannot snapshot VM: {0}")] 198 Snapshot(#[source] MigratableError), 199 200 #[error("Cannot restore VM: {0}")] 201 Restore(#[source] MigratableError), 202 203 #[error("Cannot send VM snapshot: {0}")] 204 SnapshotSend(#[source] MigratableError), 205 206 #[error("Invalid restore source URL")] 207 InvalidRestoreSourceUrl, 208 209 #[error("Failed to validate config: {0}")] 210 ConfigValidation(#[source] ValidationError), 211 212 #[error("Too many virtio-vsock devices")] 213 TooManyVsockDevices, 214 215 #[error("Failed serializing into JSON: {0}")] 216 SerializeJson(#[source] serde_json::Error), 217 218 #[error("Invalid NUMA configuration")] 219 InvalidNumaConfig, 220 221 #[error("Cannot create seccomp filter: {0}")] 222 CreateSeccompFilter(#[source] seccompiler::Error), 223 224 #[error("Cannot apply seccomp filter: {0}")] 225 ApplySeccompFilter(#[source] seccompiler::Error), 226 227 #[error("Failed resizing a memory zone")] 
228 ResizeZone, 229 230 #[error("Cannot activate virtio devices: {0:?}")] 231 ActivateVirtioDevices(DeviceManagerError), 232 233 #[error("Error triggering power button: {0:?}")] 234 PowerButton(DeviceManagerError), 235 236 #[error("Kernel lacks PVH header")] 237 KernelMissingPvhHeader, 238 239 #[error("Failed to allocate firmware RAM: {0:?}")] 240 AllocateFirmwareMemory(MemoryManagerError), 241 242 #[error("Error manipulating firmware file: {0}")] 243 FirmwareFile(#[source] std::io::Error), 244 245 #[error("Firmware too big")] 246 FirmwareTooLarge, 247 248 #[error("Failed to copy firmware to memory: {0}")] 249 FirmwareLoad(#[source] vm_memory::GuestMemoryError), 250 251 #[cfg(feature = "sev_snp")] 252 #[error("Error enabling SEV-SNP VM: {0}")] 253 InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError), 254 255 #[cfg(feature = "tdx")] 256 #[error("Error performing I/O on TDX firmware file: {0}")] 257 LoadTdvf(#[source] std::io::Error), 258 259 #[cfg(feature = "tdx")] 260 #[error("Error performing I/O on the TDX payload file: {0}")] 261 LoadPayload(#[source] std::io::Error), 262 263 #[cfg(feature = "tdx")] 264 #[error("Error parsing TDVF: {0}")] 265 ParseTdvf(#[source] arch::x86_64::tdx::TdvfError), 266 267 #[cfg(feature = "tdx")] 268 #[error("Error populating TDX HOB: {0}")] 269 PopulateHob(#[source] arch::x86_64::tdx::TdvfError), 270 271 #[cfg(feature = "tdx")] 272 #[error("Error allocating TDVF memory: {0:?}")] 273 AllocatingTdvfMemory(crate::memory_manager::Error), 274 275 #[cfg(feature = "tdx")] 276 #[error("Error enabling TDX VM: {0}")] 277 InitializeTdxVm(#[source] hypervisor::HypervisorVmError), 278 279 #[cfg(feature = "tdx")] 280 #[error("Error enabling TDX memory region: {0}")] 281 InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError), 282 283 #[cfg(feature = "tdx")] 284 #[error("Error finalizing TDX VM: {0}")] 285 FinalizeTdx(#[source] hypervisor::HypervisorVmError), 286 287 #[cfg(feature = "tdx")] 288 #[error("TDX firmware missing")] 289 
TdxFirmwareMissing, 290 291 #[cfg(feature = "tdx")] 292 #[error("Invalid TDX payload type")] 293 InvalidPayloadType, 294 295 #[cfg(feature = "guest_debug")] 296 #[error("Error debugging VM: {0:?}")] 297 Debug(DebuggableError), 298 299 #[error("Error spawning kernel loading thread")] 300 KernelLoadThreadSpawn(std::io::Error), 301 302 #[error("Error joining kernel loading thread")] 303 KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 304 305 #[error("Payload configuration is not bootable")] 306 InvalidPayload, 307 308 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 309 #[error("Error coredumping VM: {0:?}")] 310 Coredump(GuestDebuggableError), 311 312 #[cfg(feature = "igvm")] 313 #[error("Cannot open igvm file: {0}")] 314 IgvmFile(#[source] io::Error), 315 316 #[cfg(feature = "igvm")] 317 #[error("Cannot load the igvm into memory: {0}")] 318 IgvmLoad(#[source] igvm_loader::Error), 319 320 #[error("Error injecting NMI")] 321 ErrorNmi, 322 323 #[error("Error resuming the VM: {0}")] 324 ResumeVm(#[source] hypervisor::HypervisorVmError), 325 } 326 pub type Result<T> = result::Result<T, Error>; 327 328 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] 329 pub enum VmState { 330 Created, 331 Running, 332 Shutdown, 333 Paused, 334 BreakPoint, 335 } 336 337 impl VmState { 338 fn valid_transition(self, new_state: VmState) -> Result<()> { 339 match self { 340 VmState::Created => match new_state { 341 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)), 342 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => { 343 Ok(()) 344 } 345 }, 346 347 VmState::Running => match new_state { 348 VmState::Created | VmState::Running => { 349 Err(Error::InvalidStateTransition(self, new_state)) 350 } 351 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()), 352 }, 353 354 VmState::Shutdown => match new_state { 355 VmState::Paused | VmState::Created | VmState::Shutdown | 
VmState::BreakPoint => { 356 Err(Error::InvalidStateTransition(self, new_state)) 357 } 358 VmState::Running => Ok(()), 359 }, 360 361 VmState::Paused => match new_state { 362 VmState::Created | VmState::Paused | VmState::BreakPoint => { 363 Err(Error::InvalidStateTransition(self, new_state)) 364 } 365 VmState::Running | VmState::Shutdown => Ok(()), 366 }, 367 VmState::BreakPoint => match new_state { 368 VmState::Created | VmState::Running => Ok(()), 369 _ => Err(Error::InvalidStateTransition(self, new_state)), 370 }, 371 } 372 } 373 } 374 375 struct VmOpsHandler { 376 memory: GuestMemoryAtomic<GuestMemoryMmap>, 377 #[cfg(target_arch = "x86_64")] 378 io_bus: Arc<Bus>, 379 mmio_bus: Arc<Bus>, 380 } 381 382 impl VmOps for VmOpsHandler { 383 fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> { 384 self.memory 385 .memory() 386 .write(buf, GuestAddress(gpa)) 387 .map_err(|e| HypervisorVmError::GuestMemWrite(e.into())) 388 } 389 390 fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> { 391 self.memory 392 .memory() 393 .read(buf, GuestAddress(gpa)) 394 .map_err(|e| HypervisorVmError::GuestMemRead(e.into())) 395 } 396 397 fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 398 if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) { 399 info!("Guest MMIO read to unregistered address 0x{:x}", gpa); 400 } 401 Ok(()) 402 } 403 404 fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 405 match self.mmio_bus.write(gpa, data) { 406 Err(vm_device::BusError::MissingAddressRange) => { 407 info!("Guest MMIO write to unregistered address 0x{:x}", gpa); 408 } 409 Ok(Some(barrier)) => { 410 info!("Waiting for barrier"); 411 barrier.wait(); 412 info!("Barrier released"); 413 } 414 _ => {} 415 }; 416 Ok(()) 417 } 418 419 #[cfg(target_arch = "x86_64")] 420 fn pio_read(&self, port: u64, data: &mut [u8]) -> 
result::Result<(), HypervisorVmError> { 421 if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) { 422 info!("Guest PIO read to unregistered address 0x{:x}", port); 423 } 424 Ok(()) 425 } 426 427 #[cfg(target_arch = "x86_64")] 428 fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 429 match self.io_bus.write(port, data) { 430 Err(vm_device::BusError::MissingAddressRange) => { 431 info!("Guest PIO write to unregistered address 0x{:x}", port); 432 } 433 Ok(Some(barrier)) => { 434 info!("Waiting for barrier"); 435 barrier.wait(); 436 info!("Barrier released"); 437 } 438 _ => {} 439 }; 440 Ok(()) 441 } 442 } 443 444 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 { 445 let host_phys_bits = get_host_cpu_phys_bits(hypervisor); 446 447 cmp::min(host_phys_bits, max_phys_bits) 448 } 449 450 pub struct Vm { 451 #[cfg(feature = "tdx")] 452 kernel: Option<File>, 453 initramfs: Option<File>, 454 threads: Vec<thread::JoinHandle<()>>, 455 device_manager: Arc<Mutex<DeviceManager>>, 456 config: Arc<Mutex<VmConfig>>, 457 state: RwLock<VmState>, 458 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 459 memory_manager: Arc<Mutex<MemoryManager>>, 460 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 461 // The hypervisor abstracted virtual machine. 
462 vm: Arc<dyn hypervisor::Vm>, 463 #[cfg(target_arch = "x86_64")] 464 saved_clock: Option<hypervisor::ClockData>, 465 numa_nodes: NumaNodes, 466 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 467 hypervisor: Arc<dyn hypervisor::Hypervisor>, 468 stop_on_boot: bool, 469 load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>, 470 } 471 472 impl Vm { 473 pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH]; 474 475 #[allow(clippy::too_many_arguments)] 476 pub fn new_from_memory_manager( 477 config: Arc<Mutex<VmConfig>>, 478 memory_manager: Arc<Mutex<MemoryManager>>, 479 vm: Arc<dyn hypervisor::Vm>, 480 exit_evt: EventFd, 481 reset_evt: EventFd, 482 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 483 seccomp_action: &SeccompAction, 484 hypervisor: Arc<dyn hypervisor::Hypervisor>, 485 activate_evt: EventFd, 486 timestamp: Instant, 487 serial_pty: Option<PtyPair>, 488 console_pty: Option<PtyPair>, 489 debug_console_pty: Option<PtyPair>, 490 console_resize_pipe: Option<File>, 491 original_termios: Arc<Mutex<Option<termios>>>, 492 snapshot: Option<Snapshot>, 493 ) -> Result<Self> { 494 trace_scoped!("Vm::new_from_memory_manager"); 495 496 let boot_id_list = config 497 .lock() 498 .unwrap() 499 .validate() 500 .map_err(Error::ConfigValidation)?; 501 502 #[cfg(not(feature = "igvm"))] 503 let load_payload_handle = if snapshot.is_none() { 504 Self::load_payload_async(&memory_manager, &config)? 505 } else { 506 None 507 }; 508 509 info!("Booting VM from config: {:?}", &config); 510 511 // Create NUMA nodes based on NumaConfig. 
512 let numa_nodes = 513 Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?; 514 515 #[cfg(feature = "tdx")] 516 let tdx_enabled = config.lock().unwrap().is_tdx_enabled(); 517 #[cfg(feature = "sev_snp")] 518 let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled(); 519 #[cfg(feature = "tdx")] 520 let force_iommu = tdx_enabled; 521 #[cfg(feature = "sev_snp")] 522 let force_iommu = sev_snp_enabled; 523 #[cfg(not(any(feature = "tdx", feature = "sev_snp")))] 524 let force_iommu = false; 525 526 #[cfg(feature = "guest_debug")] 527 let stop_on_boot = config.lock().unwrap().gdb; 528 #[cfg(not(feature = "guest_debug"))] 529 let stop_on_boot = false; 530 531 let memory = memory_manager.lock().unwrap().guest_memory(); 532 #[cfg(target_arch = "x86_64")] 533 let io_bus = Arc::new(Bus::new()); 534 let mmio_bus = Arc::new(Bus::new()); 535 536 let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler { 537 memory, 538 #[cfg(target_arch = "x86_64")] 539 io_bus: io_bus.clone(), 540 mmio_bus: mmio_bus.clone(), 541 }); 542 543 let cpus_config = { &config.lock().unwrap().cpus.clone() }; 544 let cpu_manager = cpu::CpuManager::new( 545 cpus_config, 546 vm.clone(), 547 exit_evt.try_clone().map_err(Error::EventFdClone)?, 548 reset_evt.try_clone().map_err(Error::EventFdClone)?, 549 #[cfg(feature = "guest_debug")] 550 vm_debug_evt, 551 &hypervisor, 552 seccomp_action.clone(), 553 vm_ops, 554 #[cfg(feature = "tdx")] 555 tdx_enabled, 556 &numa_nodes, 557 #[cfg(feature = "sev_snp")] 558 sev_snp_enabled, 559 ) 560 .map_err(Error::CpuManager)?; 561 562 #[cfg(target_arch = "x86_64")] 563 cpu_manager 564 .lock() 565 .unwrap() 566 .populate_cpuid( 567 &memory_manager, 568 &hypervisor, 569 #[cfg(feature = "tdx")] 570 tdx_enabled, 571 ) 572 .map_err(Error::CpuManager)?; 573 574 // Loading the igvm file is pushed down here because 575 // igvm parser needs cpu_manager to retrieve cpuid leaf. 
576 // For the regular case, we can start loading early, but for 577 // igvm case we have to wait until cpu_manager is created. 578 // Currently, Microsoft Hypervisor does not provide any 579 // Hypervisor specific common cpuid, we need to call get_cpuid_values 580 // per cpuid through cpu_manager. 581 #[cfg(feature = "igvm")] 582 let load_payload_handle = if snapshot.is_none() { 583 Self::load_payload_async( 584 &memory_manager, 585 &config, 586 &cpu_manager, 587 #[cfg(feature = "sev_snp")] 588 sev_snp_enabled, 589 )? 590 } else { 591 None 592 }; 593 // The initial TDX configuration must be done before the vCPUs are 594 // created 595 #[cfg(feature = "tdx")] 596 if tdx_enabled { 597 let cpuid = cpu_manager.lock().unwrap().common_cpuid(); 598 let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32; 599 vm.tdx_init(&cpuid, max_vcpus) 600 .map_err(Error::InitializeTdxVm)?; 601 } 602 603 cpu_manager 604 .lock() 605 .unwrap() 606 .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID)) 607 .map_err(Error::CpuManager)?; 608 609 // This initial SEV-SNP configuration must be done immediately after 610 // vCPUs are created. As part of this initialization we are 611 // transitioning the guest into secure state. 
612 #[cfg(feature = "sev_snp")] 613 if sev_snp_enabled { 614 vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?; 615 } 616 617 #[cfg(feature = "tdx")] 618 let dynamic = !tdx_enabled; 619 #[cfg(not(feature = "tdx"))] 620 let dynamic = true; 621 622 let device_manager = DeviceManager::new( 623 #[cfg(target_arch = "x86_64")] 624 io_bus, 625 mmio_bus, 626 hypervisor.hypervisor_type(), 627 vm.clone(), 628 config.clone(), 629 memory_manager.clone(), 630 cpu_manager.clone(), 631 exit_evt.try_clone().map_err(Error::EventFdClone)?, 632 reset_evt, 633 seccomp_action.clone(), 634 numa_nodes.clone(), 635 &activate_evt, 636 force_iommu, 637 boot_id_list, 638 timestamp, 639 snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID), 640 dynamic, 641 ) 642 .map_err(Error::DeviceManager)?; 643 644 device_manager 645 .lock() 646 .unwrap() 647 .create_devices( 648 serial_pty, 649 console_pty, 650 debug_console_pty, 651 console_resize_pipe, 652 original_termios, 653 ) 654 .map_err(Error::DeviceManager)?; 655 656 #[cfg(feature = "tdx")] 657 let kernel = config 658 .lock() 659 .unwrap() 660 .payload 661 .as_ref() 662 .map(|p| p.kernel.as_ref().map(File::open)) 663 .unwrap_or_default() 664 .transpose() 665 .map_err(Error::KernelFile)?; 666 667 let initramfs = config 668 .lock() 669 .unwrap() 670 .payload 671 .as_ref() 672 .map(|p| p.initramfs.as_ref().map(File::open)) 673 .unwrap_or_default() 674 .transpose() 675 .map_err(Error::InitramfsFile)?; 676 677 #[cfg(target_arch = "x86_64")] 678 let saved_clock = if let Some(snapshot) = snapshot.as_ref() { 679 let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?; 680 vm_snapshot.clock 681 } else { 682 None 683 }; 684 685 let vm_state = if snapshot.is_some() { 686 VmState::Paused 687 } else { 688 VmState::Created 689 }; 690 691 Ok(Vm { 692 #[cfg(feature = "tdx")] 693 kernel, 694 initramfs, 695 device_manager, 696 config, 697 threads: Vec::with_capacity(1), 698 state: RwLock::new(vm_state), 699 cpu_manager, 700 
memory_manager, 701 vm, 702 #[cfg(target_arch = "x86_64")] 703 saved_clock, 704 numa_nodes, 705 hypervisor, 706 stop_on_boot, 707 load_payload_handle, 708 }) 709 } 710 711 fn create_numa_nodes( 712 configs: Option<Vec<NumaConfig>>, 713 memory_manager: &Arc<Mutex<MemoryManager>>, 714 ) -> Result<NumaNodes> { 715 let mm = memory_manager.lock().unwrap(); 716 let mm_zones = mm.memory_zones(); 717 let mut numa_nodes = BTreeMap::new(); 718 719 if let Some(configs) = &configs { 720 for config in configs.iter() { 721 if numa_nodes.contains_key(&config.guest_numa_id) { 722 error!("Can't define twice the same NUMA node"); 723 return Err(Error::InvalidNumaConfig); 724 } 725 726 let mut node = NumaNode::default(); 727 728 if let Some(memory_zones) = &config.memory_zones { 729 for memory_zone in memory_zones.iter() { 730 if let Some(mm_zone) = mm_zones.get(memory_zone) { 731 node.memory_regions.extend(mm_zone.regions().clone()); 732 if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() { 733 node.hotplug_regions.push(virtiomem_zone.region().clone()); 734 } 735 node.memory_zones.push(memory_zone.clone()); 736 } else { 737 error!("Unknown memory zone '{}'", memory_zone); 738 return Err(Error::InvalidNumaConfig); 739 } 740 } 741 } 742 743 if let Some(cpus) = &config.cpus { 744 node.cpus.extend(cpus); 745 } 746 747 if let Some(pci_segments) = &config.pci_segments { 748 node.pci_segments.extend(pci_segments); 749 } 750 751 if let Some(distances) = &config.distances { 752 for distance in distances.iter() { 753 let dest = distance.destination; 754 let dist = distance.distance; 755 756 if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) { 757 error!("Unknown destination NUMA node {}", dest); 758 return Err(Error::InvalidNumaConfig); 759 } 760 761 if node.distances.contains_key(&dest) { 762 error!("Destination NUMA node {} has been already set", dest); 763 return Err(Error::InvalidNumaConfig); 764 } 765 766 node.distances.insert(dest, dist); 767 } 768 } 769 770 #[cfg(target_arch = 
"x86_64")] 771 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 772 if let Some(sgx_epc_region) = mm.sgx_epc_region() { 773 let mm_sections = sgx_epc_region.epc_sections(); 774 for sgx_epc_section in sgx_epc_sections.iter() { 775 if let Some(mm_section) = mm_sections.get(sgx_epc_section) { 776 node.sgx_epc_sections.push(mm_section.clone()); 777 } else { 778 error!("Unknown SGX EPC section '{}'", sgx_epc_section); 779 return Err(Error::InvalidNumaConfig); 780 } 781 } 782 } else { 783 error!("Missing SGX EPC region"); 784 return Err(Error::InvalidNumaConfig); 785 } 786 } 787 788 numa_nodes.insert(config.guest_numa_id, node); 789 } 790 } 791 792 Ok(numa_nodes) 793 } 794 795 #[allow(clippy::too_many_arguments)] 796 pub fn new( 797 vm_config: Arc<Mutex<VmConfig>>, 798 exit_evt: EventFd, 799 reset_evt: EventFd, 800 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 801 seccomp_action: &SeccompAction, 802 hypervisor: Arc<dyn hypervisor::Hypervisor>, 803 activate_evt: EventFd, 804 serial_pty: Option<PtyPair>, 805 console_pty: Option<PtyPair>, 806 debug_console_pty: Option<PtyPair>, 807 console_resize_pipe: Option<File>, 808 original_termios: Arc<Mutex<Option<termios>>>, 809 snapshot: Option<Snapshot>, 810 source_url: Option<&str>, 811 prefault: Option<bool>, 812 ) -> Result<Self> { 813 trace_scoped!("Vm::new"); 814 815 let timestamp = Instant::now(); 816 817 #[cfg(feature = "tdx")] 818 let tdx_enabled = if snapshot.is_some() { 819 false 820 } else { 821 vm_config.lock().unwrap().is_tdx_enabled() 822 }; 823 824 #[cfg(feature = "sev_snp")] 825 let sev_snp_enabled = if snapshot.is_some() { 826 false 827 } else { 828 vm_config.lock().unwrap().is_sev_snp_enabled() 829 }; 830 831 let vm = Self::create_hypervisor_vm( 832 &hypervisor, 833 #[cfg(feature = "tdx")] 834 tdx_enabled, 835 #[cfg(feature = "sev_snp")] 836 sev_snp_enabled, 837 )?; 838 839 let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); 840 841 let memory_manager = if 
let Some(snapshot) = 842 snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID) 843 { 844 MemoryManager::new_from_snapshot( 845 &snapshot, 846 vm.clone(), 847 &vm_config.lock().unwrap().memory.clone(), 848 source_url, 849 prefault.unwrap(), 850 phys_bits, 851 ) 852 .map_err(Error::MemoryManager)? 853 } else { 854 #[cfg(target_arch = "x86_64")] 855 let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone(); 856 857 MemoryManager::new( 858 vm.clone(), 859 &vm_config.lock().unwrap().memory.clone(), 860 None, 861 phys_bits, 862 #[cfg(feature = "tdx")] 863 tdx_enabled, 864 None, 865 None, 866 #[cfg(target_arch = "x86_64")] 867 sgx_epc_config, 868 ) 869 .map_err(Error::MemoryManager)? 870 }; 871 872 Vm::new_from_memory_manager( 873 vm_config, 874 memory_manager, 875 vm, 876 exit_evt, 877 reset_evt, 878 #[cfg(feature = "guest_debug")] 879 vm_debug_evt, 880 seccomp_action, 881 hypervisor, 882 activate_evt, 883 timestamp, 884 serial_pty, 885 console_pty, 886 debug_console_pty, 887 console_resize_pipe, 888 original_termios, 889 snapshot, 890 ) 891 } 892 893 pub fn create_hypervisor_vm( 894 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 895 #[cfg(feature = "tdx")] tdx_enabled: bool, 896 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 897 ) -> Result<Arc<dyn hypervisor::Vm>> { 898 hypervisor.check_required_extensions().unwrap(); 899 900 cfg_if::cfg_if! 
{ 901 if #[cfg(feature = "tdx")] { 902 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true 903 // Otherwise KVM_X86_LEGACY_VM: 0 904 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM 905 let vm = hypervisor 906 .create_vm_with_type(u64::from(tdx_enabled)) 907 .unwrap(); 908 } else if #[cfg(feature = "sev_snp")] { 909 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true 910 // Otherwise SEV_SNP_DISABLED: 0 911 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false 912 let vm = hypervisor 913 .create_vm_with_type(u64::from(sev_snp_enabled)) 914 .unwrap(); 915 } else { 916 let vm = hypervisor.create_vm().unwrap(); 917 } 918 } 919 920 #[cfg(target_arch = "x86_64")] 921 { 922 vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0) 923 .unwrap(); 924 vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap(); 925 vm.enable_split_irq().unwrap(); 926 } 927 928 Ok(vm) 929 } 930 931 fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> { 932 let initramfs = self.initramfs.as_mut().unwrap(); 933 let size: usize = initramfs 934 .seek(SeekFrom::End(0)) 935 .map_err(|_| Error::InitramfsLoad)? 
936 .try_into() 937 .unwrap(); 938 initramfs.rewind().map_err(|_| Error::InitramfsLoad)?; 939 940 let address = 941 arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?; 942 let address = GuestAddress(address); 943 944 guest_mem 945 .read_volatile_from(address, initramfs, size) 946 .map_err(|_| Error::InitramfsLoad)?; 947 948 info!("Initramfs loaded: address = 0x{:x}", address.0); 949 Ok(arch::InitramfsConfig { address, size }) 950 } 951 952 pub fn generate_cmdline( 953 payload: &PayloadConfig, 954 #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>, 955 ) -> Result<Cmdline> { 956 let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?; 957 if let Some(s) = payload.cmdline.as_ref() { 958 cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?; 959 } 960 961 #[cfg(target_arch = "aarch64")] 962 for entry in device_manager.lock().unwrap().cmdline_additions() { 963 cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?; 964 } 965 Ok(cmdline) 966 } 967 968 #[cfg(target_arch = "aarch64")] 969 fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> { 970 let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash(); 971 let mem = uefi_flash.memory(); 972 arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware) 973 .map_err(Error::UefiLoad)?; 974 Ok(()) 975 } 976 977 #[cfg(target_arch = "aarch64")] 978 fn load_kernel( 979 firmware: Option<File>, 980 kernel: Option<File>, 981 memory_manager: Arc<Mutex<MemoryManager>>, 982 ) -> Result<EntryPoint> { 983 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 984 let mem = guest_memory.memory(); 985 let entry_addr = match (firmware, kernel) { 986 (None, Some(mut kernel)) => { 987 match linux_loader::loader::pe::PE::load( 988 mem.deref(), 989 Some(arch::layout::KERNEL_START), 990 &mut kernel, 991 None, 992 ) { 993 Ok(entry_addr) => 
/// Loads an IGVM file through `igvm_loader` and returns the guest entry point.
///
/// With the `sev_snp` feature, the entry address is the VMSA guest physical
/// address when the CPU manager reports SEV-SNP as enabled; in every other
/// case it is the `rip` stored in the loaded VMSA.
///
/// # Errors
///
/// Returns `Error::IgvmLoad` if the loader fails.
#[cfg(feature = "igvm")]
fn load_igvm(
    igvm: File,
    memory_manager: Arc<Mutex<MemoryManager>>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    #[cfg(feature = "sev_snp")] host_data: &Option<String>,
) -> Result<EntryPoint> {
    let loaded = igvm_loader::load_igvm(
        &igvm,
        memory_manager,
        cpu_manager.clone(),
        "",
        #[cfg(feature = "sev_snp")]
        host_data,
    )
    .map_err(Error::IgvmLoad)?;

    // Pick the raw entry address first, then build the `EntryPoint` once.
    cfg_if::cfg_if! {
        if #[cfg(feature = "sev_snp")] {
            let entry_gpa = if cpu_manager.lock().unwrap().sev_snp_enabled() {
                loaded.vmsa_gpa
            } else {
                loaded.vmsa.rip
            };
        } else {
            let entry_gpa = loaded.vmsa.rip;
        }
    }

    Ok(EntryPoint {
        entry_addr: vm_memory::GuestAddress(entry_gpa),
        setup_header: None,
    })
}
1068 .or_else(|_| { 1069 BzImage::load( 1070 mem.deref(), 1071 None, 1072 &mut kernel, 1073 Some(arch::layout::HIGH_RAM_START), 1074 ) 1075 }) 1076 .map_err(Error::KernelLoad)?; 1077 1078 if let Some(cmdline) = cmdline { 1079 linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline) 1080 .map_err(Error::LoadCmdLine)?; 1081 } 1082 1083 if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap { 1084 // Use the PVH kernel entry point to boot the guest 1085 info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0); 1086 Ok(EntryPoint { 1087 entry_addr, 1088 setup_header: None, 1089 }) 1090 } else if entry_addr.setup_header.is_some() { 1091 // Use the bzImage 32bit entry point to boot the guest 1092 info!( 1093 "bzImage kernel loaded: entry_addr = 0x{:x}", 1094 entry_addr.kernel_load.0 1095 ); 1096 Ok(EntryPoint { 1097 entry_addr: entry_addr.kernel_load, 1098 setup_header: entry_addr.setup_header, 1099 }) 1100 } else { 1101 Err(Error::KernelMissingPvhHeader) 1102 } 1103 } 1104 1105 #[cfg(target_arch = "x86_64")] 1106 fn load_payload( 1107 payload: &PayloadConfig, 1108 memory_manager: Arc<Mutex<MemoryManager>>, 1109 #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>, 1110 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 1111 ) -> Result<EntryPoint> { 1112 trace_scoped!("load_payload"); 1113 #[cfg(feature = "igvm")] 1114 { 1115 if let Some(_igvm_file) = &payload.igvm { 1116 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?; 1117 #[cfg(feature = "sev_snp")] 1118 if sev_snp_enabled { 1119 return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data); 1120 } 1121 #[cfg(not(feature = "sev_snp"))] 1122 return Self::load_igvm(igvm, memory_manager, cpu_manager); 1123 } 1124 } 1125 match ( 1126 &payload.firmware, 1127 &payload.kernel, 1128 &payload.initramfs, 1129 &payload.cmdline, 1130 ) { 1131 (Some(firmware), None, None, None) => { 1132 let firmware = 
File::open(firmware).map_err(Error::FirmwareFile)?; 1133 Self::load_kernel(firmware, None, memory_manager) 1134 } 1135 (None, Some(kernel), _, _) => { 1136 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1137 let cmdline = Self::generate_cmdline(payload)?; 1138 Self::load_kernel(kernel, Some(cmdline), memory_manager) 1139 } 1140 _ => Err(Error::InvalidPayload), 1141 } 1142 } 1143 1144 #[cfg(target_arch = "aarch64")] 1145 fn load_payload( 1146 payload: &PayloadConfig, 1147 memory_manager: Arc<Mutex<MemoryManager>>, 1148 ) -> Result<EntryPoint> { 1149 match (&payload.firmware, &payload.kernel) { 1150 (Some(firmware), None) => { 1151 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; 1152 Self::load_kernel(Some(firmware), None, memory_manager) 1153 } 1154 (None, Some(kernel)) => { 1155 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1156 Self::load_kernel(None, Some(kernel), memory_manager) 1157 } 1158 _ => Err(Error::InvalidPayload), 1159 } 1160 } 1161 1162 fn load_payload_async( 1163 memory_manager: &Arc<Mutex<MemoryManager>>, 1164 config: &Arc<Mutex<VmConfig>>, 1165 #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>, 1166 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 1167 ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> { 1168 // Kernel with TDX is loaded in a different manner 1169 #[cfg(feature = "tdx")] 1170 if config.lock().unwrap().is_tdx_enabled() { 1171 return Ok(None); 1172 } 1173 1174 config 1175 .lock() 1176 .unwrap() 1177 .payload 1178 .as_ref() 1179 .map(|payload| { 1180 let memory_manager = memory_manager.clone(); 1181 let payload = payload.clone(); 1182 #[cfg(feature = "igvm")] 1183 let cpu_manager = cpu_manager.clone(); 1184 1185 std::thread::Builder::new() 1186 .name("payload_loader".into()) 1187 .spawn(move || { 1188 Self::load_payload( 1189 &payload, 1190 memory_manager, 1191 #[cfg(feature = "igvm")] 1192 cpu_manager, 1193 #[cfg(feature = "sev_snp")] 1194 sev_snp_enabled, 
1195 ) 1196 }) 1197 .map_err(Error::KernelLoadThreadSpawn) 1198 }) 1199 .transpose() 1200 } 1201 1202 #[cfg(target_arch = "x86_64")] 1203 fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> { 1204 trace_scoped!("configure_system"); 1205 info!("Configuring system"); 1206 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1207 1208 let initramfs_config = match self.initramfs { 1209 Some(_) => Some(self.load_initramfs(&mem)?), 1210 None => None, 1211 }; 1212 1213 let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); 1214 let rsdp_addr = Some(rsdp_addr); 1215 let sgx_epc_region = self 1216 .memory_manager 1217 .lock() 1218 .unwrap() 1219 .sgx_epc_region() 1220 .as_ref() 1221 .cloned(); 1222 1223 let serial_number = self 1224 .config 1225 .lock() 1226 .unwrap() 1227 .platform 1228 .as_ref() 1229 .and_then(|p| p.serial_number.clone()); 1230 1231 let uuid = self 1232 .config 1233 .lock() 1234 .unwrap() 1235 .platform 1236 .as_ref() 1237 .and_then(|p| p.uuid.clone()); 1238 1239 let oem_strings = self 1240 .config 1241 .lock() 1242 .unwrap() 1243 .platform 1244 .as_ref() 1245 .and_then(|p| p.oem_strings.clone()); 1246 1247 let oem_strings = oem_strings 1248 .as_deref() 1249 .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>()); 1250 1251 let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1252 1253 arch::configure_system( 1254 &mem, 1255 arch::layout::CMDLINE_START, 1256 arch::layout::CMDLINE_MAX_SIZE, 1257 &initramfs_config, 1258 boot_vcpus, 1259 entry_addr.setup_header, 1260 rsdp_addr, 1261 sgx_epc_region, 1262 serial_number.as_deref(), 1263 uuid.as_deref(), 1264 oem_strings.as_deref(), 1265 topology, 1266 ) 1267 .map_err(Error::ConfigureSystem)?; 1268 Ok(()) 1269 } 1270 1271 #[cfg(target_arch = "aarch64")] 1272 fn configure_system( 1273 &mut self, 1274 _rsdp_addr: GuestAddress, 1275 _entry_addr: EntryPoint, 1276 ) -> Result<()> { 1277 let cmdline = 
Self::generate_cmdline( 1278 self.config.lock().unwrap().payload.as_ref().unwrap(), 1279 &self.device_manager, 1280 )?; 1281 let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs(); 1282 let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1283 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1284 let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new(); 1285 let initramfs_config = match self.initramfs { 1286 Some(_) => Some(self.load_initramfs(&mem)?), 1287 None => None, 1288 }; 1289 1290 let device_info = &self 1291 .device_manager 1292 .lock() 1293 .unwrap() 1294 .get_device_info() 1295 .clone(); 1296 1297 for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() { 1298 let pci_space = PciSpaceInfo { 1299 pci_segment_id: pci_segment.id, 1300 mmio_config_address: pci_segment.mmio_config_address, 1301 pci_device_space_start: pci_segment.start_of_mem64_area, 1302 pci_device_space_size: pci_segment.end_of_mem64_area 1303 - pci_segment.start_of_mem64_area 1304 + 1, 1305 }; 1306 pci_space_info.push(pci_space); 1307 } 1308 1309 let virtio_iommu_bdf = self 1310 .device_manager 1311 .lock() 1312 .unwrap() 1313 .iommu_attached_devices() 1314 .as_ref() 1315 .map(|(v, _)| *v); 1316 1317 let vgic = self 1318 .device_manager 1319 .lock() 1320 .unwrap() 1321 .get_interrupt_controller() 1322 .unwrap() 1323 .lock() 1324 .unwrap() 1325 .get_vgic() 1326 .map_err(|_| { 1327 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1328 arch::aarch64::Error::SetupGic, 1329 )) 1330 })?; 1331 1332 // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number. 
1333 let pmu_supported = self 1334 .cpu_manager 1335 .lock() 1336 .unwrap() 1337 .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16) 1338 .map_err(|_| { 1339 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1340 arch::aarch64::Error::VcpuInitPmu, 1341 )) 1342 })?; 1343 1344 arch::configure_system( 1345 &mem, 1346 cmdline.as_cstring().unwrap().to_str().unwrap(), 1347 vcpu_mpidrs, 1348 vcpu_topology, 1349 device_info, 1350 &initramfs_config, 1351 &pci_space_info, 1352 virtio_iommu_bdf.map(|bdf| bdf.into()), 1353 &vgic, 1354 &self.numa_nodes, 1355 pmu_supported, 1356 ) 1357 .map_err(Error::ConfigureSystem)?; 1358 1359 Ok(()) 1360 } 1361 1362 pub fn serial_pty(&self) -> Option<PtyPair> { 1363 self.device_manager.lock().unwrap().serial_pty() 1364 } 1365 1366 pub fn console_pty(&self) -> Option<PtyPair> { 1367 self.device_manager.lock().unwrap().console_pty() 1368 } 1369 1370 pub fn debug_console_pty(&self) -> Option<PtyPair> { 1371 self.device_manager.lock().unwrap().debug_console_pty() 1372 } 1373 1374 pub fn console_resize_pipe(&self) -> Option<Arc<File>> { 1375 self.device_manager.lock().unwrap().console_resize_pipe() 1376 } 1377 1378 pub fn shutdown(&mut self) -> Result<()> { 1379 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 1380 let new_state = VmState::Shutdown; 1381 1382 state.valid_transition(new_state)?; 1383 1384 // Wake up the DeviceManager threads so they will get terminated cleanly 1385 self.device_manager 1386 .lock() 1387 .unwrap() 1388 .resume() 1389 .map_err(Error::Resume)?; 1390 1391 self.cpu_manager 1392 .lock() 1393 .unwrap() 1394 .shutdown() 1395 .map_err(Error::CpuManager)?; 1396 1397 // Wait for all the threads to finish 1398 for thread in self.threads.drain(..) { 1399 thread.join().map_err(Error::ThreadCleanup)? 
1400 } 1401 *state = new_state; 1402 1403 Ok(()) 1404 } 1405 1406 pub fn resize( 1407 &mut self, 1408 desired_vcpus: Option<u8>, 1409 desired_memory: Option<u64>, 1410 desired_balloon: Option<u64>, 1411 ) -> Result<()> { 1412 event!("vm", "resizing"); 1413 1414 if let Some(desired_vcpus) = desired_vcpus { 1415 if self 1416 .cpu_manager 1417 .lock() 1418 .unwrap() 1419 .resize(desired_vcpus) 1420 .map_err(Error::CpuManager)? 1421 { 1422 self.device_manager 1423 .lock() 1424 .unwrap() 1425 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1426 .map_err(Error::DeviceManager)?; 1427 } 1428 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1429 } 1430 1431 if let Some(desired_memory) = desired_memory { 1432 let new_region = self 1433 .memory_manager 1434 .lock() 1435 .unwrap() 1436 .resize(desired_memory) 1437 .map_err(Error::MemoryManager)?; 1438 1439 let memory_config = &mut self.config.lock().unwrap().memory; 1440 1441 if let Some(new_region) = &new_region { 1442 self.device_manager 1443 .lock() 1444 .unwrap() 1445 .update_memory(new_region) 1446 .map_err(Error::DeviceManager)?; 1447 1448 match memory_config.hotplug_method { 1449 HotplugMethod::Acpi => { 1450 self.device_manager 1451 .lock() 1452 .unwrap() 1453 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1454 .map_err(Error::DeviceManager)?; 1455 } 1456 HotplugMethod::VirtioMem => {} 1457 } 1458 } 1459 1460 // We update the VM config regardless of the actual guest resize 1461 // operation result (happened or not), so that if the VM reboots 1462 // it will be running with the last configure memory size. 
1463 match memory_config.hotplug_method { 1464 HotplugMethod::Acpi => memory_config.size = desired_memory, 1465 HotplugMethod::VirtioMem => { 1466 if desired_memory > memory_config.size { 1467 memory_config.hotplugged_size = Some(desired_memory - memory_config.size); 1468 } else { 1469 memory_config.hotplugged_size = None; 1470 } 1471 } 1472 } 1473 } 1474 1475 if let Some(desired_balloon) = desired_balloon { 1476 self.device_manager 1477 .lock() 1478 .unwrap() 1479 .resize_balloon(desired_balloon) 1480 .map_err(Error::DeviceManager)?; 1481 1482 // Update the configuration value for the balloon size to ensure 1483 // a reboot would use the right value. 1484 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon { 1485 balloon_config.size = desired_balloon; 1486 } 1487 } 1488 1489 event!("vm", "resized"); 1490 1491 Ok(()) 1492 } 1493 1494 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { 1495 let memory_config = &mut self.config.lock().unwrap().memory; 1496 1497 if let Some(zones) = &mut memory_config.zones { 1498 for zone in zones.iter_mut() { 1499 if zone.id == id { 1500 if desired_memory >= zone.size { 1501 let hotplugged_size = desired_memory - zone.size; 1502 self.memory_manager 1503 .lock() 1504 .unwrap() 1505 .resize_zone(&id, desired_memory - zone.size) 1506 .map_err(Error::MemoryManager)?; 1507 // We update the memory zone config regardless of the 1508 // actual 'resize-zone' operation result (happened or 1509 // not), so that if the VM reboots it will be running 1510 // with the last configured memory zone size. 
1511 zone.hotplugged_size = Some(hotplugged_size); 1512 1513 return Ok(()); 1514 } else { 1515 error!( 1516 "Invalid to ask less ({}) than boot RAM ({}) for \ 1517 this memory zone", 1518 desired_memory, zone.size, 1519 ); 1520 return Err(Error::ResizeZone); 1521 } 1522 } 1523 } 1524 } 1525 1526 error!("Could not find the memory zone {} for the resize", id); 1527 Err(Error::ResizeZone) 1528 } 1529 1530 pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1531 let pci_device_info = self 1532 .device_manager 1533 .lock() 1534 .unwrap() 1535 .add_device(&mut device_cfg) 1536 .map_err(Error::DeviceManager)?; 1537 1538 // Update VmConfig by adding the new device. This is important to 1539 // ensure the device would be created in case of a reboot. 1540 { 1541 let mut config = self.config.lock().unwrap(); 1542 add_to_config(&mut config.devices, device_cfg); 1543 } 1544 1545 self.device_manager 1546 .lock() 1547 .unwrap() 1548 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1549 .map_err(Error::DeviceManager)?; 1550 1551 Ok(pci_device_info) 1552 } 1553 1554 pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> { 1555 let pci_device_info = self 1556 .device_manager 1557 .lock() 1558 .unwrap() 1559 .add_user_device(&mut device_cfg) 1560 .map_err(Error::DeviceManager)?; 1561 1562 // Update VmConfig by adding the new device. This is important to 1563 // ensure the device would be created in case of a reboot. 
1564 { 1565 let mut config = self.config.lock().unwrap(); 1566 add_to_config(&mut config.user_devices, device_cfg); 1567 } 1568 1569 self.device_manager 1570 .lock() 1571 .unwrap() 1572 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1573 .map_err(Error::DeviceManager)?; 1574 1575 Ok(pci_device_info) 1576 } 1577 1578 pub fn remove_device(&mut self, id: String) -> Result<()> { 1579 self.device_manager 1580 .lock() 1581 .unwrap() 1582 .remove_device(id.clone()) 1583 .map_err(Error::DeviceManager)?; 1584 1585 // Update VmConfig by removing the device. This is important to 1586 // ensure the device would not be created in case of a reboot. 1587 self.config.lock().unwrap().remove_device(&id); 1588 1589 self.device_manager 1590 .lock() 1591 .unwrap() 1592 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1593 .map_err(Error::DeviceManager)?; 1594 Ok(()) 1595 } 1596 1597 pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> { 1598 let pci_device_info = self 1599 .device_manager 1600 .lock() 1601 .unwrap() 1602 .add_disk(&mut disk_cfg) 1603 .map_err(Error::DeviceManager)?; 1604 1605 // Update VmConfig by adding the new device. This is important to 1606 // ensure the device would be created in case of a reboot. 1607 { 1608 let mut config = self.config.lock().unwrap(); 1609 add_to_config(&mut config.disks, disk_cfg); 1610 } 1611 1612 self.device_manager 1613 .lock() 1614 .unwrap() 1615 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1616 .map_err(Error::DeviceManager)?; 1617 1618 Ok(pci_device_info) 1619 } 1620 1621 pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> { 1622 let pci_device_info = self 1623 .device_manager 1624 .lock() 1625 .unwrap() 1626 .add_fs(&mut fs_cfg) 1627 .map_err(Error::DeviceManager)?; 1628 1629 // Update VmConfig by adding the new device. This is important to 1630 // ensure the device would be created in case of a reboot. 
1631 { 1632 let mut config = self.config.lock().unwrap(); 1633 add_to_config(&mut config.fs, fs_cfg); 1634 } 1635 1636 self.device_manager 1637 .lock() 1638 .unwrap() 1639 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1640 .map_err(Error::DeviceManager)?; 1641 1642 Ok(pci_device_info) 1643 } 1644 1645 pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> { 1646 let pci_device_info = self 1647 .device_manager 1648 .lock() 1649 .unwrap() 1650 .add_pmem(&mut pmem_cfg) 1651 .map_err(Error::DeviceManager)?; 1652 1653 // Update VmConfig by adding the new device. This is important to 1654 // ensure the device would be created in case of a reboot. 1655 { 1656 let mut config = self.config.lock().unwrap(); 1657 add_to_config(&mut config.pmem, pmem_cfg); 1658 } 1659 1660 self.device_manager 1661 .lock() 1662 .unwrap() 1663 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1664 .map_err(Error::DeviceManager)?; 1665 1666 Ok(pci_device_info) 1667 } 1668 1669 pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> { 1670 let pci_device_info = self 1671 .device_manager 1672 .lock() 1673 .unwrap() 1674 .add_net(&mut net_cfg) 1675 .map_err(Error::DeviceManager)?; 1676 1677 // Update VmConfig by adding the new device. This is important to 1678 // ensure the device would be created in case of a reboot. 1679 { 1680 let mut config = self.config.lock().unwrap(); 1681 add_to_config(&mut config.net, net_cfg); 1682 } 1683 1684 self.device_manager 1685 .lock() 1686 .unwrap() 1687 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1688 .map_err(Error::DeviceManager)?; 1689 1690 Ok(pci_device_info) 1691 } 1692 1693 pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> { 1694 let pci_device_info = self 1695 .device_manager 1696 .lock() 1697 .unwrap() 1698 .add_vdpa(&mut vdpa_cfg) 1699 .map_err(Error::DeviceManager)?; 1700 1701 // Update VmConfig by adding the new device. 
/// Builds the list of memory resources to describe in the TD HOB.
///
/// Walks the guest memory regions and interleaves them with the given TDVF
/// sections, producing `(start, size, is_ram)` tuples. Sections are emitted
/// with `is_ram == false`; the RAM around them is emitted with
/// `is_ram == true`.
///
/// `sorted_sections` is consumed with `pop()`, i.e. from its end, so the
/// next section to process must always be the last element.
///
/// Sections left over once every region has been walked are appended as
/// non-RAM entries.
#[cfg(feature = "tdx")]
fn hob_memory_resources(
    mut sorted_sections: Vec<TdvfSection>,
    guest_memory: &GuestMemoryMmap,
) -> Vec<(u64, u64, bool)> {
    let mut list = Vec::new();

    let mut current_section = sorted_sections.pop();

    // RAM regions interleaved with TDVF sections
    let mut next_start_addr = 0;
    for region in guest_memory.iter() {
        let region_start = region.start_addr().0;
        let region_end = region.last_addr().0;
        if region_start > next_start_addr {
            next_start_addr = region_start;
        }

        loop {
            let (start, size, ram) = if let Some(section) = &current_section {
                if section.address <= next_start_addr {
                    // The section starts at or before the cursor: emit it.
                    (section.address, section.size, false)
                } else {
                    // RAM up to whichever comes first: the byte before the
                    // section, or the end of the region.
                    let last_addr = std::cmp::min(section.address - 1, region_end);
                    (next_start_addr, last_addr - next_start_addr + 1, true)
                }
            } else {
                // No section left: the rest of the region is RAM.
                (next_start_addr, region_end - next_start_addr + 1, true)
            };

            list.push((start, size, ram));

            if !ram {
                current_section = sorted_sections.pop();
            }

            next_start_addr = start + size;

            // A section emitted from below the region must not pull the
            // cursor back before the region start.
            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            if next_start_addr > region_end {
                break;
            }
        }
    }

    // Once all the interleaved sections have been processed, let's simply
    // pull the remaining ones.
    if let Some(section) = current_section {
        list.push((section.address, section.size, false));
    }
    while let Some(section) = sorted_sections.pop() {
        list.push((section.address, section.size, false));
    }

    list
}
1873 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1874 let mem = guest_memory.memory(); 1875 let mut payload_info = None; 1876 let mut hob_offset = None; 1877 for section in sections { 1878 info!("Populating TDVF Section: {:x?}", section); 1879 match section.r#type { 1880 TdvfSectionType::Bfv | TdvfSectionType::Cfv => { 1881 info!("Copying section to guest memory"); 1882 firmware_file 1883 .seek(SeekFrom::Start(section.data_offset as u64)) 1884 .map_err(Error::LoadTdvf)?; 1885 mem.read_volatile_from( 1886 GuestAddress(section.address), 1887 &mut firmware_file, 1888 section.data_size as usize, 1889 ) 1890 .unwrap(); 1891 } 1892 TdvfSectionType::TdHob => { 1893 hob_offset = Some(section.address); 1894 } 1895 TdvfSectionType::Payload => { 1896 info!("Copying payload to guest memory"); 1897 if let Some(payload_file) = self.kernel.as_mut() { 1898 let payload_size = payload_file 1899 .seek(SeekFrom::End(0)) 1900 .map_err(Error::LoadPayload)?; 1901 1902 payload_file 1903 .seek(SeekFrom::Start(0x1f1)) 1904 .map_err(Error::LoadPayload)?; 1905 1906 let mut payload_header = linux_loader::bootparam::setup_header::default(); 1907 payload_file 1908 .read_volatile(&mut payload_header.as_bytes()) 1909 .unwrap(); 1910 1911 if payload_header.header != 0x5372_6448 { 1912 return Err(Error::InvalidPayloadType); 1913 } 1914 1915 if (payload_header.version < 0x0200) 1916 || ((payload_header.loadflags & 0x1) == 0x0) 1917 { 1918 return Err(Error::InvalidPayloadType); 1919 } 1920 1921 payload_file.rewind().map_err(Error::LoadPayload)?; 1922 mem.read_volatile_from( 1923 GuestAddress(section.address), 1924 payload_file, 1925 payload_size as usize, 1926 ) 1927 .unwrap(); 1928 1929 // Create the payload info that will be inserted into 1930 // the HOB. 
1931 payload_info = Some(PayloadInfo { 1932 image_type: PayloadImageType::BzImage, 1933 entry_point: section.address, 1934 }); 1935 } 1936 } 1937 TdvfSectionType::PayloadParam => { 1938 info!("Copying payload parameters to guest memory"); 1939 let cmdline = Self::generate_cmdline( 1940 self.config.lock().unwrap().payload.as_ref().unwrap(), 1941 )?; 1942 mem.write_slice( 1943 cmdline.as_cstring().unwrap().as_bytes_with_nul(), 1944 GuestAddress(section.address), 1945 ) 1946 .unwrap(); 1947 } 1948 _ => {} 1949 } 1950 } 1951 1952 // Generate HOB 1953 let mut hob = TdHob::start(hob_offset.unwrap()); 1954 1955 let mut sorted_sections = sections.to_vec(); 1956 sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem)); 1957 1958 sorted_sections.sort_by_key(|section| section.address); 1959 sorted_sections.reverse(); 1960 1961 for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) { 1962 hob.add_memory_resource(&mem, start, size, ram, guid_found) 1963 .map_err(Error::PopulateHob)?; 1964 } 1965 1966 // MMIO regions 1967 hob.add_mmio_resource( 1968 &mem, 1969 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1970 arch::layout::APIC_START.raw_value() 1971 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1972 ) 1973 .map_err(Error::PopulateHob)?; 1974 let start_of_device_area = self 1975 .memory_manager 1976 .lock() 1977 .unwrap() 1978 .start_of_device_area() 1979 .raw_value(); 1980 let end_of_device_area = self 1981 .memory_manager 1982 .lock() 1983 .unwrap() 1984 .end_of_device_area() 1985 .raw_value(); 1986 hob.add_mmio_resource( 1987 &mem, 1988 start_of_device_area, 1989 end_of_device_area - start_of_device_area, 1990 ) 1991 .map_err(Error::PopulateHob)?; 1992 1993 // Loop over the ACPI tables and copy them to the HOB. 
/// Registers every TDVF section with the hypervisor as a TDX memory region.
///
/// For each section, the guest address is translated to a host address and
/// passed to `tdx_init_memory_region` with the section's guest address and
/// size. The last argument is true when the section's `attributes` equal 1.
///
/// Returns `Error::InitializeTdxMemoryRegion` on the first failure. Panics
/// if a section address has no host mapping.
#[cfg(feature = "tdx")]
fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
    let guest_memory = self.memory_manager.lock().unwrap().guest_memory();
    let mem = guest_memory.memory();

    for section in sections {
        let host_address =
            mem.get_host_address(GuestAddress(section.address)).unwrap() as u64;
        /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
        let extend_mr = section.attributes == 1;

        self.vm
            .tdx_init_memory_region(host_address, section.address, section.size, extend_mr)
            .map_err(Error::InitializeTdxMemoryRegion)?;
    }

    Ok(())
}
2039 2040 fn create_acpi_tables(&self) -> Option<GuestAddress> { 2041 #[cfg(feature = "tdx")] 2042 if self.config.lock().unwrap().is_tdx_enabled() { 2043 return None; 2044 } 2045 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 2046 let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); 2047 let rsdp_addr = crate::acpi::create_acpi_tables( 2048 &mem, 2049 &self.device_manager, 2050 &self.cpu_manager, 2051 &self.memory_manager, 2052 &self.numa_nodes, 2053 tpm_enabled, 2054 ); 2055 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 2056 2057 Some(rsdp_addr) 2058 } 2059 2060 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 2061 trace_scoped!("entry_point"); 2062 2063 self.load_payload_handle 2064 .take() 2065 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 2066 .transpose() 2067 } 2068 2069 pub fn boot(&mut self) -> Result<()> { 2070 trace_scoped!("Vm::boot"); 2071 let current_state = self.get_state()?; 2072 if current_state == VmState::Paused { 2073 return self.resume().map_err(Error::Resume); 2074 } 2075 2076 let new_state = if self.stop_on_boot { 2077 VmState::BreakPoint 2078 } else { 2079 VmState::Running 2080 }; 2081 current_state.valid_transition(new_state)?; 2082 2083 // Do earlier to parallelise with loading kernel 2084 #[cfg(target_arch = "x86_64")] 2085 cfg_if::cfg_if! { 2086 if #[cfg(feature = "sev_snp")] { 2087 let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled(); 2088 let rsdp_addr = if sev_snp_enabled { 2089 // In case of SEV-SNP guest ACPI tables are provided via 2090 // IGVM. So skip the creation of ACPI tables and set the 2091 // rsdp addr to None. 2092 None 2093 } else { 2094 self.create_acpi_tables() 2095 }; 2096 } else { 2097 let rsdp_addr = self.create_acpi_tables(); 2098 } 2099 } 2100 2101 // Load kernel synchronously or if asynchronous then wait for load to 2102 // finish. 
2103 let entry_point = self.entry_point()?; 2104 2105 #[cfg(feature = "tdx")] 2106 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2107 2108 // Configure the vcpus that have been created 2109 let vcpus = self.cpu_manager.lock().unwrap().vcpus(); 2110 for vcpu in vcpus { 2111 let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2112 let boot_setup = entry_point.map(|e| (e, guest_memory)); 2113 self.cpu_manager 2114 .lock() 2115 .unwrap() 2116 .configure_vcpu(vcpu, boot_setup) 2117 .map_err(Error::CpuManager)?; 2118 } 2119 2120 #[cfg(feature = "tdx")] 2121 let (sections, guid_found) = if tdx_enabled { 2122 self.extract_tdvf_sections()? 2123 } else { 2124 (Vec::new(), false) 2125 }; 2126 2127 // Configuring the TDX regions requires that the vCPUs are created. 2128 #[cfg(feature = "tdx")] 2129 let hob_address = if tdx_enabled { 2130 // TDX sections are written to memory. 2131 self.populate_tdx_sections(§ions, guid_found)? 2132 } else { 2133 None 2134 }; 2135 2136 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 2137 // available after they are configured 2138 #[cfg(target_arch = "aarch64")] 2139 let rsdp_addr = self.create_acpi_tables(); 2140 2141 // Configure shared state based on loaded kernel 2142 entry_point 2143 .map(|entry_point| { 2144 // Safe to unwrap rsdp_addr as we know it can't be None when 2145 // the entry_point is Some. 2146 self.configure_system(rsdp_addr.unwrap(), entry_point) 2147 }) 2148 .transpose()?; 2149 2150 #[cfg(target_arch = "x86_64")] 2151 // Note: For x86, always call this function before invoking start boot vcpus. 2152 // Otherwise guest would fail to boot because we haven't created the 2153 // userspace mappings to update the hypervisor about the memory mappings. 2154 // These mappings must be created before we start the vCPU threads for 2155 // the very first time. 
2156 self.memory_manager 2157 .lock() 2158 .unwrap() 2159 .allocate_address_space() 2160 .map_err(Error::MemoryManager)?; 2161 2162 #[cfg(feature = "tdx")] 2163 if let Some(hob_address) = hob_address { 2164 // With the HOB address extracted the vCPUs can have 2165 // their TDX state configured. 2166 self.cpu_manager 2167 .lock() 2168 .unwrap() 2169 .initialize_tdx(hob_address) 2170 .map_err(Error::CpuManager)?; 2171 // Let the hypervisor know which memory ranges are shared with the 2172 // guest. This prevents the guest from ignoring/discarding memory 2173 // regions provided by the host. 2174 self.init_tdx_memory(§ions)?; 2175 // With TDX memory and CPU state configured TDX setup is complete 2176 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2177 } 2178 2179 // Resume the vm for MSHV 2180 if current_state == VmState::Created { 2181 self.vm.resume().map_err(Error::ResumeVm)?; 2182 } 2183 2184 self.cpu_manager 2185 .lock() 2186 .unwrap() 2187 .start_boot_vcpus(new_state == VmState::BreakPoint) 2188 .map_err(Error::CpuManager)?; 2189 2190 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2191 *state = new_state; 2192 Ok(()) 2193 } 2194 2195 pub fn restore(&mut self) -> Result<()> { 2196 event!("vm", "restoring"); 2197 2198 #[cfg(target_arch = "x86_64")] 2199 // Note: For x86, always call this function before invoking start boot vcpus. 2200 // Otherwise guest would fail to boot because we haven't created the 2201 // userspace mappings to update the hypervisor about the memory mappings. 2202 // These mappings must be created before we start the vCPU threads for 2203 // the very first time for the restored VM. 2204 self.memory_manager 2205 .lock() 2206 .unwrap() 2207 .allocate_address_space() 2208 .map_err(Error::MemoryManager)?; 2209 2210 // Now we can start all vCPUs from here. 
2211 self.cpu_manager 2212 .lock() 2213 .unwrap() 2214 .start_restored_vcpus() 2215 .map_err(Error::CpuManager)?; 2216 2217 event!("vm", "restored"); 2218 Ok(()) 2219 } 2220 2221 /// Gets a thread-safe reference counted pointer to the VM configuration. 2222 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2223 Arc::clone(&self.config) 2224 } 2225 2226 /// Get the VM state. Returns an error if the state is poisoned. 2227 pub fn get_state(&self) -> Result<VmState> { 2228 self.state 2229 .try_read() 2230 .map_err(|_| Error::PoisonedState) 2231 .map(|state| *state) 2232 } 2233 2234 /// Gets the actual size of the balloon. 2235 pub fn balloon_size(&self) -> u64 { 2236 self.device_manager.lock().unwrap().balloon_size() 2237 } 2238 2239 pub fn send_memory_fds( 2240 &mut self, 2241 socket: &mut UnixStream, 2242 ) -> std::result::Result<(), MigratableError> { 2243 for (slot, fd) in self 2244 .memory_manager 2245 .lock() 2246 .unwrap() 2247 .memory_slot_fds() 2248 .drain() 2249 { 2250 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2251 .write_to(socket) 2252 .map_err(|e| { 2253 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2254 })?; 2255 socket 2256 .send_with_fd(&slot.to_le_bytes()[..], fd) 2257 .map_err(|e| { 2258 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2259 })?; 2260 2261 let res = Response::read_from(socket)?; 2262 if res.status() != Status::Ok { 2263 warn!("Error during memory fd migration"); 2264 Request::abandon().write_to(socket)?; 2265 Response::read_from(socket).ok(); 2266 return Err(MigratableError::MigrateSend(anyhow!( 2267 "Error during memory fd migration" 2268 ))); 2269 } 2270 } 2271 2272 Ok(()) 2273 } 2274 2275 pub fn send_memory_regions<F>( 2276 &mut self, 2277 ranges: &MemoryRangeTable, 2278 fd: &mut F, 2279 ) -> std::result::Result<(), MigratableError> 2280 where 2281 F: WriteVolatile, 2282 { 2283 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2284 
let mem = guest_memory.memory(); 2285 2286 for range in ranges.regions() { 2287 let mut offset: u64 = 0; 2288 // Here we are manually handling the retry in case we can't the 2289 // whole region at once because we can't use the implementation 2290 // from vm-memory::GuestMemory of write_all_to() as it is not 2291 // following the correct behavior. For more info about this issue 2292 // see: https://github.com/rust-vmm/vm-memory/issues/174 2293 loop { 2294 let bytes_written = mem 2295 .write_volatile_to( 2296 GuestAddress(range.gpa + offset), 2297 fd, 2298 (range.length - offset) as usize, 2299 ) 2300 .map_err(|e| { 2301 MigratableError::MigrateSend(anyhow!( 2302 "Error transferring memory to socket: {}", 2303 e 2304 )) 2305 })?; 2306 offset += bytes_written as u64; 2307 2308 if offset == range.length { 2309 break; 2310 } 2311 } 2312 } 2313 2314 Ok(()) 2315 } 2316 2317 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2318 self.memory_manager 2319 .lock() 2320 .unwrap() 2321 .memory_range_table(false) 2322 } 2323 2324 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2325 self.device_manager.lock().unwrap().device_tree() 2326 } 2327 2328 pub fn activate_virtio_devices(&self) -> Result<()> { 2329 self.device_manager 2330 .lock() 2331 .unwrap() 2332 .activate_virtio_devices() 2333 .map_err(Error::ActivateVirtioDevices) 2334 } 2335 2336 #[cfg(target_arch = "x86_64")] 2337 pub fn power_button(&self) -> Result<()> { 2338 return self 2339 .device_manager 2340 .lock() 2341 .unwrap() 2342 .notify_power_button() 2343 .map_err(Error::PowerButton); 2344 } 2345 2346 #[cfg(target_arch = "aarch64")] 2347 pub fn power_button(&self) -> Result<()> { 2348 self.device_manager 2349 .lock() 2350 .unwrap() 2351 .notify_power_button() 2352 .map_err(Error::PowerButton) 2353 } 2354 2355 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2356 self.memory_manager.lock().unwrap().snapshot_data() 2357 } 2358 2359 #[cfg(feature = 
"guest_debug")] 2360 pub fn debug_request( 2361 &mut self, 2362 gdb_request: &GdbRequestPayload, 2363 cpu_id: usize, 2364 ) -> Result<GdbResponsePayload> { 2365 use GdbRequestPayload::*; 2366 match gdb_request { 2367 SetSingleStep(single_step) => { 2368 self.set_guest_debug(cpu_id, &[], *single_step) 2369 .map_err(Error::Debug)?; 2370 } 2371 SetHwBreakPoint(addrs) => { 2372 self.set_guest_debug(cpu_id, addrs, false) 2373 .map_err(Error::Debug)?; 2374 } 2375 Pause => { 2376 self.debug_pause().map_err(Error::Debug)?; 2377 } 2378 Resume => { 2379 self.debug_resume().map_err(Error::Debug)?; 2380 } 2381 ReadRegs => { 2382 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?; 2383 return Ok(GdbResponsePayload::RegValues(Box::new(regs))); 2384 } 2385 WriteRegs(regs) => { 2386 self.write_regs(cpu_id, regs).map_err(Error::Debug)?; 2387 } 2388 ReadMem(vaddr, len) => { 2389 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2390 let mem = self 2391 .read_mem(&guest_memory, cpu_id, *vaddr, *len) 2392 .map_err(Error::Debug)?; 2393 return Ok(GdbResponsePayload::MemoryRegion(mem)); 2394 } 2395 WriteMem(vaddr, data) => { 2396 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2397 self.write_mem(&guest_memory, cpu_id, vaddr, data) 2398 .map_err(Error::Debug)?; 2399 } 2400 ActiveVcpus => { 2401 let active_vcpus = self.active_vcpus(); 2402 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus)); 2403 } 2404 } 2405 Ok(GdbResponsePayload::CommandComplete) 2406 } 2407 2408 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2409 fn get_dump_state( 2410 &mut self, 2411 destination_url: &str, 2412 ) -> std::result::Result<DumpState, GuestDebuggableError> { 2413 let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32; 2414 let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize; 2415 let mut elf_phdr_num = 1; 2416 let elf_sh_info = 0; 2417 let coredump_file_path = 
url_to_file(destination_url)?; 2418 let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings(); 2419 2420 if mapping_num < UINT16_MAX - 2 { 2421 elf_phdr_num += mapping_num as u16; 2422 } else { 2423 panic!("mapping num beyond 65535 not supported"); 2424 } 2425 let coredump_file = OpenOptions::new() 2426 .read(true) 2427 .write(true) 2428 .create_new(true) 2429 .open(coredump_file_path) 2430 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2431 2432 let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size); 2433 let mem_data = self 2434 .memory_manager 2435 .lock() 2436 .unwrap() 2437 .coredump_memory_regions(mem_offset); 2438 2439 Ok(DumpState { 2440 elf_note_size, 2441 elf_phdr_num, 2442 elf_sh_info, 2443 mem_offset, 2444 mem_info: Some(mem_data), 2445 file: Some(coredump_file), 2446 }) 2447 } 2448 2449 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2450 fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 { 2451 size_of::<elf::Elf64_Ehdr>() as u64 2452 + note_size as u64 2453 + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64 2454 } 2455 2456 pub fn nmi(&self) -> Result<()> { 2457 return self 2458 .cpu_manager 2459 .lock() 2460 .unwrap() 2461 .nmi() 2462 .map_err(|_| Error::ErrorNmi); 2463 } 2464 } 2465 2466 impl Pausable for Vm { 2467 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2468 event!("vm", "pausing"); 2469 let mut state = self 2470 .state 2471 .try_write() 2472 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?; 2473 let new_state = VmState::Paused; 2474 2475 state 2476 .valid_transition(new_state) 2477 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?; 2478 2479 #[cfg(target_arch = "x86_64")] 2480 { 2481 let mut clock = self 2482 .vm 2483 .get_clock() 2484 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?; 2485 clock.reset_flags(); 2486 self.saved_clock = 
Some(clock); 2487 } 2488 2489 // Before pausing the vCPUs activate any pending virtio devices that might 2490 // need activation between starting the pause (or e.g. a migration it's part of) 2491 self.activate_virtio_devices().map_err(|e| { 2492 MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e)) 2493 })?; 2494 2495 self.cpu_manager.lock().unwrap().pause()?; 2496 self.device_manager.lock().unwrap().pause()?; 2497 2498 self.vm 2499 .pause() 2500 .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?; 2501 2502 *state = new_state; 2503 2504 event!("vm", "paused"); 2505 Ok(()) 2506 } 2507 2508 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2509 event!("vm", "resuming"); 2510 let current_state = self.get_state().unwrap(); 2511 let mut state = self 2512 .state 2513 .try_write() 2514 .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?; 2515 let new_state = VmState::Running; 2516 2517 state 2518 .valid_transition(new_state) 2519 .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?; 2520 2521 self.cpu_manager.lock().unwrap().resume()?; 2522 #[cfg(target_arch = "x86_64")] 2523 { 2524 if let Some(clock) = &self.saved_clock { 2525 self.vm.set_clock(clock).map_err(|e| { 2526 MigratableError::Resume(anyhow!("Could not set VM clock: {}", e)) 2527 })?; 2528 } 2529 } 2530 2531 if current_state == VmState::Paused { 2532 self.vm 2533 .resume() 2534 .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?; 2535 } 2536 2537 self.device_manager.lock().unwrap().resume()?; 2538 2539 // And we're back to the Running state. 
2540 *state = new_state; 2541 event!("vm", "resumed"); 2542 Ok(()) 2543 } 2544 } 2545 2546 #[derive(Serialize, Deserialize)] 2547 pub struct VmSnapshot { 2548 #[cfg(target_arch = "x86_64")] 2549 pub clock: Option<hypervisor::ClockData>, 2550 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2551 pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>, 2552 } 2553 2554 pub const VM_SNAPSHOT_ID: &str = "vm"; 2555 impl Snapshottable for Vm { 2556 fn id(&self) -> String { 2557 VM_SNAPSHOT_ID.to_string() 2558 } 2559 2560 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2561 event!("vm", "snapshotting"); 2562 2563 #[cfg(feature = "tdx")] 2564 { 2565 if self.config.lock().unwrap().is_tdx_enabled() { 2566 return Err(MigratableError::Snapshot(anyhow!( 2567 "Snapshot not possible with TDX VM" 2568 ))); 2569 } 2570 } 2571 2572 let current_state = self.get_state().unwrap(); 2573 if current_state != VmState::Paused { 2574 return Err(MigratableError::Snapshot(anyhow!( 2575 "Trying to snapshot while VM is running" 2576 ))); 2577 } 2578 2579 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2580 let common_cpuid = { 2581 let amx = self.config.lock().unwrap().cpus.features.amx; 2582 let phys_bits = physical_bits( 2583 &self.hypervisor, 2584 self.config.lock().unwrap().cpus.max_phys_bits, 2585 ); 2586 arch::generate_common_cpuid( 2587 &self.hypervisor, 2588 &arch::CpuidConfig { 2589 sgx_epc_sections: None, 2590 phys_bits, 2591 kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv, 2592 #[cfg(feature = "tdx")] 2593 tdx: false, 2594 amx, 2595 }, 2596 ) 2597 .map_err(|e| { 2598 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e)) 2599 })? 
2600 }; 2601 2602 let vm_snapshot_state = VmSnapshot { 2603 #[cfg(target_arch = "x86_64")] 2604 clock: self.saved_clock, 2605 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2606 common_cpuid, 2607 }; 2608 2609 let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?; 2610 2611 let (id, snapshot) = { 2612 let mut cpu_manager = self.cpu_manager.lock().unwrap(); 2613 (cpu_manager.id(), cpu_manager.snapshot()?) 2614 }; 2615 vm_snapshot.add_snapshot(id, snapshot); 2616 let (id, snapshot) = { 2617 let mut memory_manager = self.memory_manager.lock().unwrap(); 2618 (memory_manager.id(), memory_manager.snapshot()?) 2619 }; 2620 vm_snapshot.add_snapshot(id, snapshot); 2621 let (id, snapshot) = { 2622 let mut device_manager = self.device_manager.lock().unwrap(); 2623 (device_manager.id(), device_manager.snapshot()?) 2624 }; 2625 vm_snapshot.add_snapshot(id, snapshot); 2626 2627 event!("vm", "snapshotted"); 2628 Ok(vm_snapshot) 2629 } 2630 } 2631 2632 impl Transportable for Vm { 2633 fn send( 2634 &self, 2635 snapshot: &Snapshot, 2636 destination_url: &str, 2637 ) -> std::result::Result<(), MigratableError> { 2638 let mut snapshot_config_path = url_to_path(destination_url)?; 2639 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2640 2641 // Create the snapshot config file 2642 let mut snapshot_config_file = OpenOptions::new() 2643 .read(true) 2644 .write(true) 2645 .create_new(true) 2646 .open(snapshot_config_path) 2647 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2648 2649 // Serialize and write the snapshot config 2650 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2651 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2652 2653 snapshot_config_file 2654 .write(vm_config.as_bytes()) 2655 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2656 2657 let mut snapshot_state_path = url_to_path(destination_url)?; 2658 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2659 2660 // Create the snapshot state file 2661 let 
mut snapshot_state_file = OpenOptions::new() 2662 .read(true) 2663 .write(true) 2664 .create_new(true) 2665 .open(snapshot_state_path) 2666 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2667 2668 // Serialize and write the snapshot state 2669 let vm_state = 2670 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2671 2672 snapshot_state_file 2673 .write(&vm_state) 2674 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2675 2676 // Tell the memory manager to also send/write its own snapshot. 2677 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2678 self.memory_manager 2679 .lock() 2680 .unwrap() 2681 .send(&memory_manager_snapshot.clone(), destination_url)?; 2682 } else { 2683 return Err(MigratableError::Restore(anyhow!( 2684 "Missing memory manager snapshot" 2685 ))); 2686 } 2687 2688 Ok(()) 2689 } 2690 } 2691 2692 impl Migratable for Vm { 2693 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2694 self.memory_manager.lock().unwrap().start_dirty_log()?; 2695 self.device_manager.lock().unwrap().start_dirty_log() 2696 } 2697 2698 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2699 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2700 self.device_manager.lock().unwrap().stop_dirty_log() 2701 } 2702 2703 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2704 Ok(MemoryRangeTable::new_from_tables(vec![ 2705 self.memory_manager.lock().unwrap().dirty_log()?, 2706 self.device_manager.lock().unwrap().dirty_log()?, 2707 ])) 2708 } 2709 2710 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2711 self.memory_manager.lock().unwrap().start_migration()?; 2712 self.device_manager.lock().unwrap().start_migration() 2713 } 2714 2715 fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { 2716 self.memory_manager.lock().unwrap().complete_migration()?; 2717 
#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Forwards the hardware breakpoint / single-step setup for `cpu_id` to
    /// the CPU manager.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM if it is running, then marks it as `BreakPoint`.
    ///
    /// # Errors
    ///
    /// Returns `DebuggableError::PoisonedState` when the state lock cannot be
    /// taken and `DebuggableError::Pause` when pausing fails.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        // The read guard is released at the end of this statement, before
        // `pause()` needs to take the write lock.
        let is_running = *self
            .state
            .read()
            .map_err(|_| DebuggableError::PoisonedState)?
            == VmState::Running;
        if is_running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes the VM if it is currently stopped at a breakpoint.
    ///
    /// # Errors
    ///
    /// Returns `DebuggableError::PoisonedState` when the state lock cannot be
    /// taken. A resume failure is reported as `DebuggableError::Pause`.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        let at_breakpoint = *self
            .state
            .read()
            .map_err(|_| DebuggableError::PoisonedState)?
            == VmState::BreakPoint;
        if at_breakpoint {
            // NOTE(review): a failed resume is wrapped in the `Pause` variant;
            // kept as is for compatibility - confirm whether a dedicated
            // resume variant should be used.
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    /// Reads the core registers of vCPU `cpu_id` through the CPU manager.
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    /// Writes the core registers of vCPU `cpu_id` through the CPU manager.
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    /// Reads `len` bytes at `vaddr` as seen by vCPU `cpu_id`.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    /// Writes `data` at `vaddr` as seen by vCPU `cpu_id`.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Returns the number of active vCPUs, falling back to the configured
    /// boot vCPU count when none is active.
    fn active_vcpus(&self) -> usize {
        match self.cpu_manager.lock().unwrap().active_vcpus() {
            // The VM is not booted yet. Report boot_vcpus() instead.
            0 => self.cpu_manager.lock().unwrap().boot_vcpus() as usize,
            active_vcpus => active_vcpus,
        }
    }
}

/// Largest value representable by a 16-bit unsigned integer, as a `u32`.
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = 65535;

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes an ELF coredump of the guest to `destination_url`.
    ///
    /// A running VM is paused for the duration of the dump and resumed
    /// afterwards, whether or not the dump succeeded; a VM that was already
    /// paused stays paused.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` for TDX VMs, when the VM
    /// state cannot be read, or when the VM is neither running nor paused.
    /// Pause, dump and resume failures are propagated; if both the dump and
    /// the resume fail, the dump error is the one reported.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        /// Writes the headers, the notes and the guest memory of the dump.
        fn write_dump(
            vm: &mut Vm,
            destination_url: &str,
        ) -> std::result::Result<(), GuestDebuggableError> {
            let coredump_state = vm.get_dump_state(destination_url)?;

            vm.write_header(&coredump_state)?;
            vm.write_note(&coredump_state)?;
            vm.write_loads(&coredump_state)?;

            vm.cpu_manager
                .lock()
                .unwrap()
                .cpu_write_elf64_note(&coredump_state)?;
            vm.cpu_manager
                .lock()
                .unwrap()
                .cpu_write_vmm_note(&coredump_state)?;

            vm.memory_manager
                .lock()
                .unwrap()
                .coredump_iterate_save_mem(&coredump_state)?;

            Ok(())
        }

        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        let current_state = self.get_state().map_err(|e| {
            GuestDebuggableError::Coredump(anyhow!("Could not get VM state: {:?}", e))
        })?;

        // Remember whether this call paused the VM so that it can undo it.
        let resume = match current_state {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                true
            }
            VmState::Paused => false,
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        };

        let dump_result = write_dump(self, destination_url);

        if resume {
            // Always attempt the resume: a failed dump must not leave a
            // previously running guest paused.
            let resume_result = self.resume().map_err(GuestDebuggableError::Resume);
            dump_result?;
            resume_result
        } else {
            dump_result
        }
    }
}
assert!(state.valid_transition(VmState::Running).is_ok()); 2913 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2914 assert!(state.valid_transition(VmState::Paused).is_err()); 2915 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2916 } 2917 } 2918 } 2919 2920 #[test] 2921 fn test_vm_created_transitions() { 2922 test_vm_state_transitions(VmState::Created); 2923 } 2924 2925 #[test] 2926 fn test_vm_running_transitions() { 2927 test_vm_state_transitions(VmState::Running); 2928 } 2929 2930 #[test] 2931 fn test_vm_shutdown_transitions() { 2932 test_vm_state_transitions(VmState::Shutdown); 2933 } 2934 2935 #[test] 2936 fn test_vm_paused_transitions() { 2937 test_vm_state_transitions(VmState::Paused); 2938 } 2939 2940 #[cfg(feature = "tdx")] 2941 #[test] 2942 fn test_hob_memory_resources() { 2943 // Case 1: Two TDVF sections in the middle of the RAM 2944 let sections = vec![ 2945 TdvfSection { 2946 address: 0xc000, 2947 size: 0x1000, 2948 ..Default::default() 2949 }, 2950 TdvfSection { 2951 address: 0x1000, 2952 size: 0x4000, 2953 ..Default::default() 2954 }, 2955 ]; 2956 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)]; 2957 let expected = vec![ 2958 (0, 0x1000, true), 2959 (0x1000, 0x4000, false), 2960 (0x5000, 0x7000, true), 2961 (0xc000, 0x1000, false), 2962 (0xd000, 0x0fff_3000, true), 2963 ]; 2964 assert_eq!( 2965 expected, 2966 Vm::hob_memory_resources( 2967 sections, 2968 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 2969 ) 2970 ); 2971 2972 // Case 2: Two TDVF sections with no conflict with the RAM 2973 let sections = vec![ 2974 TdvfSection { 2975 address: 0x1000_1000, 2976 size: 0x1000, 2977 ..Default::default() 2978 }, 2979 TdvfSection { 2980 address: 0, 2981 size: 0x1000, 2982 ..Default::default() 2983 }, 2984 ]; 2985 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 2986 let expected = vec![ 2987 (0, 0x1000, false), 2988 (0x1000, 0x1000_0000, true), 
2989 (0x1000_1000, 0x1000, false), 2990 ]; 2991 assert_eq!( 2992 expected, 2993 Vm::hob_memory_resources( 2994 sections, 2995 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 2996 ) 2997 ); 2998 2999 // Case 3: Two TDVF sections with partial conflicts with the RAM 3000 let sections = vec![ 3001 TdvfSection { 3002 address: 0x1000_0000, 3003 size: 0x2000, 3004 ..Default::default() 3005 }, 3006 TdvfSection { 3007 address: 0, 3008 size: 0x2000, 3009 ..Default::default() 3010 }, 3011 ]; 3012 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3013 let expected = vec![ 3014 (0, 0x2000, false), 3015 (0x2000, 0x0fff_e000, true), 3016 (0x1000_0000, 0x2000, false), 3017 ]; 3018 assert_eq!( 3019 expected, 3020 Vm::hob_memory_resources( 3021 sections, 3022 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3023 ) 3024 ); 3025 3026 // Case 4: Two TDVF sections with no conflict before the RAM and two 3027 // more additional sections with no conflict after the RAM. 
3028 let sections = vec![ 3029 TdvfSection { 3030 address: 0x2000_1000, 3031 size: 0x1000, 3032 ..Default::default() 3033 }, 3034 TdvfSection { 3035 address: 0x2000_0000, 3036 size: 0x1000, 3037 ..Default::default() 3038 }, 3039 TdvfSection { 3040 address: 0x1000, 3041 size: 0x1000, 3042 ..Default::default() 3043 }, 3044 TdvfSection { 3045 address: 0, 3046 size: 0x1000, 3047 ..Default::default() 3048 }, 3049 ]; 3050 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)]; 3051 let expected = vec![ 3052 (0, 0x1000, false), 3053 (0x1000, 0x1000, false), 3054 (0x4000, 0x1000_0000, true), 3055 (0x2000_0000, 0x1000, false), 3056 (0x2000_1000, 0x1000, false), 3057 ]; 3058 assert_eq!( 3059 expected, 3060 Vm::hob_memory_resources( 3061 sections, 3062 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3063 ) 3064 ); 3065 3066 // Case 5: One TDVF section overriding the entire RAM 3067 let sections = vec![TdvfSection { 3068 address: 0, 3069 size: 0x2000_0000, 3070 ..Default::default() 3071 }]; 3072 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3073 let expected = vec![(0, 0x2000_0000, false)]; 3074 assert_eq!( 3075 expected, 3076 Vm::hob_memory_resources( 3077 sections, 3078 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3079 ) 3080 ); 3081 3082 // Case 6: Two TDVF sections with no conflict with 2 RAM regions 3083 let sections = vec![ 3084 TdvfSection { 3085 address: 0x1000_2000, 3086 size: 0x2000, 3087 ..Default::default() 3088 }, 3089 TdvfSection { 3090 address: 0, 3091 size: 0x2000, 3092 ..Default::default() 3093 }, 3094 ]; 3095 let guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3096 (GuestAddress(0x2000), 0x1000_0000), 3097 (GuestAddress(0x1000_4000), 0x1000_0000), 3098 ]; 3099 let expected = vec![ 3100 (0, 0x2000, false), 3101 (0x2000, 0x1000_0000, true), 3102 (0x1000_2000, 0x2000, false), 3103 (0x1000_4000, 0x1000_0000, true), 3104 ]; 3105 assert_eq!( 3106 expected, 3107 
#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    /// Length of every MMIO window declared by the test devices.
    const LEN: u64 = 4096;

    /// Checks that an FDT can be created for a guest with a serial device, a
    /// virtio device and an RTC.
    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        // Three back-to-back MMIO windows with consecutive IRQs.
        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");
        assert!(create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .is_ok())
    }
}

/// Boots a tiny guest program on a fresh VM and runs it until the vCPU
/// reports `VmExit::Reset`.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor.
    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Zero the code segment base and selector.
    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    // Start executing at the load address with the operands in rax and rbx.
    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    loop {
        match vcpu.run().expect("run failed") {
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            VmExit::Ignore => {}
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}