1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::{ 15 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 16 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 17 }; 18 use crate::config::{NumaConfig, PayloadConfig}; 19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 20 use crate::coredump::{ 21 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 22 }; 23 use crate::cpu; 24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair}; 25 use crate::device_tree::DeviceTree; 26 #[cfg(feature = "guest_debug")] 27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 28 #[cfg(feature = "igvm")] 29 use crate::igvm::igvm_loader; 30 use crate::memory_manager::{ 31 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 32 }; 33 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 34 use crate::migration::get_vm_snapshot; 35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 36 use crate::migration::url_to_file; 37 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 38 use crate::GuestMemoryMmap; 39 use crate::{ 40 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 41 }; 42 use anyhow::anyhow; 43 use arch::get_host_cpu_phys_bits; 44 #[cfg(target_arch = "x86_64")] 45 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 46 #[cfg(feature = "tdx")] 47 use arch::x86_64::tdx::TdvfSection; 48 use 
arch::EntryPoint; 49 #[cfg(target_arch = "aarch64")] 50 use arch::PciSpaceInfo; 51 use arch::{NumaNode, NumaNodes}; 52 #[cfg(target_arch = "aarch64")] 53 use devices::interrupt_controller; 54 use devices::AcpiNotificationFlags; 55 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 56 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 57 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 58 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 59 use hypervisor::{HypervisorVmError, VmOps}; 60 use libc::{termios, SIGWINCH}; 61 use linux_loader::cmdline::Cmdline; 62 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 63 use linux_loader::elf; 64 #[cfg(target_arch = "x86_64")] 65 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 66 #[cfg(target_arch = "aarch64")] 67 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 68 use linux_loader::loader::KernelLoader; 69 use seccompiler::SeccompAction; 70 use serde::{Deserialize, Serialize}; 71 use std::cmp; 72 use std::collections::BTreeMap; 73 use std::collections::HashMap; 74 use std::convert::TryInto; 75 use std::fs::{File, OpenOptions}; 76 use std::io::{self, Seek, SeekFrom, Write}; 77 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 78 use std::mem::size_of; 79 use std::num::Wrapping; 80 use std::ops::Deref; 81 use std::os::unix::net::UnixStream; 82 use std::sync::{Arc, Mutex, RwLock}; 83 use std::time::Instant; 84 use std::{result, str, thread}; 85 use thiserror::Error; 86 use tracer::trace_scoped; 87 use vm_device::Bus; 88 #[cfg(feature = "tdx")] 89 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile}; 90 use vm_memory::{ 91 Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile, 92 }; 93 use vm_migration::protocol::{Request, Response, Status}; 94 use vm_migration::{ 95 protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, 96 SnapshotData, 
Snapshottable, Transportable, 97 }; 98 use vmm_sys_util::eventfd::EventFd; 99 use vmm_sys_util::sock_ctrl_msg::ScmSocket; 100 101 /// Errors associated with VM management 102 #[derive(Debug, Error)] 103 pub enum Error { 104 #[error("Cannot open kernel file: {0}")] 105 KernelFile(#[source] io::Error), 106 107 #[error("Cannot open initramfs file: {0}")] 108 InitramfsFile(#[source] io::Error), 109 110 #[error("Cannot load the kernel into memory: {0}")] 111 KernelLoad(#[source] linux_loader::loader::Error), 112 113 #[cfg(target_arch = "aarch64")] 114 #[error("Cannot load the UEFI binary in memory: {0:?}")] 115 UefiLoad(arch::aarch64::uefi::Error), 116 117 #[error("Cannot load the initramfs into memory")] 118 InitramfsLoad, 119 120 #[error("Cannot load the kernel command line in memory: {0}")] 121 LoadCmdLine(#[source] linux_loader::loader::Error), 122 123 #[error("Cannot modify the kernel command line: {0}")] 124 CmdLineInsertStr(#[source] linux_loader::cmdline::Error), 125 126 #[error("Cannot create the kernel command line: {0}")] 127 CmdLineCreate(#[source] linux_loader::cmdline::Error), 128 129 #[error("Cannot configure system: {0}")] 130 ConfigureSystem(#[source] arch::Error), 131 132 #[cfg(target_arch = "aarch64")] 133 #[error("Cannot enable interrupt controller: {0:?}")] 134 EnableInterruptController(interrupt_controller::Error), 135 136 #[error("VM state is poisoned")] 137 PoisonedState, 138 139 #[error("Error from device manager: {0:?}")] 140 DeviceManager(DeviceManagerError), 141 142 #[error("No device with id {0:?} to remove")] 143 NoDeviceToRemove(String), 144 145 #[error("Cannot spawn a signal handler thread: {0}")] 146 SignalHandlerSpawn(#[source] io::Error), 147 148 #[error("Failed to join on threads: {0:?}")] 149 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 150 151 #[error("VM config is missing")] 152 VmMissingConfig, 153 154 #[error("VM is not created")] 155 VmNotCreated, 156 157 #[error("VM is already created")] 158 
VmAlreadyCreated, 159 160 #[error("VM is not running")] 161 VmNotRunning, 162 163 #[error("Cannot clone EventFd: {0}")] 164 EventFdClone(#[source] io::Error), 165 166 #[error("invalid VM state transition: {0:?} to {1:?}")] 167 InvalidStateTransition(VmState, VmState), 168 169 #[error("Error from CPU manager: {0}")] 170 CpuManager(#[source] cpu::Error), 171 172 #[error("Cannot pause devices: {0}")] 173 PauseDevices(#[source] MigratableError), 174 175 #[error("Cannot resume devices: {0}")] 176 ResumeDevices(#[source] MigratableError), 177 178 #[error("Cannot pause CPUs: {0}")] 179 PauseCpus(#[source] MigratableError), 180 181 #[error("Cannot resume cpus: {0}")] 182 ResumeCpus(#[source] MigratableError), 183 184 #[error("Cannot pause VM: {0}")] 185 Pause(#[source] MigratableError), 186 187 #[error("Cannot resume VM: {0}")] 188 Resume(#[source] MigratableError), 189 190 #[error("Memory manager error: {0:?}")] 191 MemoryManager(MemoryManagerError), 192 193 #[error("Eventfd write error: {0}")] 194 EventfdError(#[source] std::io::Error), 195 196 #[error("Cannot snapshot VM: {0}")] 197 Snapshot(#[source] MigratableError), 198 199 #[error("Cannot restore VM: {0}")] 200 Restore(#[source] MigratableError), 201 202 #[error("Cannot send VM snapshot: {0}")] 203 SnapshotSend(#[source] MigratableError), 204 205 #[error("Invalid restore source URL")] 206 InvalidRestoreSourceUrl, 207 208 #[error("Failed to validate config: {0}")] 209 ConfigValidation(#[source] ValidationError), 210 211 #[error("Too many virtio-vsock devices")] 212 TooManyVsockDevices, 213 214 #[error("Failed serializing into JSON: {0}")] 215 SerializeJson(#[source] serde_json::Error), 216 217 #[error("Invalid NUMA configuration")] 218 InvalidNumaConfig, 219 220 #[error("Cannot create seccomp filter: {0}")] 221 CreateSeccompFilter(#[source] seccompiler::Error), 222 223 #[error("Cannot apply seccomp filter: {0}")] 224 ApplySeccompFilter(#[source] seccompiler::Error), 225 226 #[error("Failed resizing a memory zone")] 
227 ResizeZone, 228 229 #[error("Cannot activate virtio devices: {0:?}")] 230 ActivateVirtioDevices(DeviceManagerError), 231 232 #[error("Error triggering power button: {0:?}")] 233 PowerButton(DeviceManagerError), 234 235 #[error("Kernel lacks PVH header")] 236 KernelMissingPvhHeader, 237 238 #[error("Failed to allocate firmware RAM: {0:?}")] 239 AllocateFirmwareMemory(MemoryManagerError), 240 241 #[error("Error manipulating firmware file: {0}")] 242 FirmwareFile(#[source] std::io::Error), 243 244 #[error("Firmware too big")] 245 FirmwareTooLarge, 246 247 #[error("Failed to copy firmware to memory: {0}")] 248 FirmwareLoad(#[source] vm_memory::GuestMemoryError), 249 250 #[cfg(feature = "sev_snp")] 251 #[error("Error enabling SEV-SNP VM: {0}")] 252 InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError), 253 254 #[cfg(feature = "tdx")] 255 #[error("Error performing I/O on TDX firmware file: {0}")] 256 LoadTdvf(#[source] std::io::Error), 257 258 #[cfg(feature = "tdx")] 259 #[error("Error performing I/O on the TDX payload file: {0}")] 260 LoadPayload(#[source] std::io::Error), 261 262 #[cfg(feature = "tdx")] 263 #[error("Error parsing TDVF: {0}")] 264 ParseTdvf(#[source] arch::x86_64::tdx::TdvfError), 265 266 #[cfg(feature = "tdx")] 267 #[error("Error populating TDX HOB: {0}")] 268 PopulateHob(#[source] arch::x86_64::tdx::TdvfError), 269 270 #[cfg(feature = "tdx")] 271 #[error("Error allocating TDVF memory: {0:?}")] 272 AllocatingTdvfMemory(crate::memory_manager::Error), 273 274 #[cfg(feature = "tdx")] 275 #[error("Error enabling TDX VM: {0}")] 276 InitializeTdxVm(#[source] hypervisor::HypervisorVmError), 277 278 #[cfg(feature = "tdx")] 279 #[error("Error enabling TDX memory region: {0}")] 280 InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError), 281 282 #[cfg(feature = "tdx")] 283 #[error("Error finalizing TDX VM: {0}")] 284 FinalizeTdx(#[source] hypervisor::HypervisorVmError), 285 286 #[cfg(feature = "tdx")] 287 #[error("TDX firmware missing")] 288 
TdxFirmwareMissing, 289 290 #[cfg(feature = "tdx")] 291 #[error("Invalid TDX payload type")] 292 InvalidPayloadType, 293 294 #[cfg(feature = "guest_debug")] 295 #[error("Error debugging VM: {0:?}")] 296 Debug(DebuggableError), 297 298 #[error("Error spawning kernel loading thread")] 299 KernelLoadThreadSpawn(std::io::Error), 300 301 #[error("Error joining kernel loading thread")] 302 KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 303 304 #[error("Payload configuration is not bootable")] 305 InvalidPayload, 306 307 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 308 #[error("Error coredumping VM: {0:?}")] 309 Coredump(GuestDebuggableError), 310 311 #[cfg(feature = "igvm")] 312 #[error("Cannot open igvm file: {0}")] 313 IgvmFile(#[source] io::Error), 314 315 #[cfg(feature = "igvm")] 316 #[error("Cannot load the igvm into memory: {0}")] 317 IgvmLoad(#[source] igvm_loader::Error), 318 } 319 pub type Result<T> = result::Result<T, Error>; 320 321 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] 322 pub enum VmState { 323 Created, 324 Running, 325 Shutdown, 326 Paused, 327 BreakPoint, 328 } 329 330 impl VmState { 331 fn valid_transition(self, new_state: VmState) -> Result<()> { 332 match self { 333 VmState::Created => match new_state { 334 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)), 335 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => { 336 Ok(()) 337 } 338 }, 339 340 VmState::Running => match new_state { 341 VmState::Created | VmState::Running => { 342 Err(Error::InvalidStateTransition(self, new_state)) 343 } 344 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()), 345 }, 346 347 VmState::Shutdown => match new_state { 348 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => { 349 Err(Error::InvalidStateTransition(self, new_state)) 350 } 351 VmState::Running => Ok(()), 352 }, 353 354 VmState::Paused => match 
new_state { 355 VmState::Created | VmState::Paused | VmState::BreakPoint => { 356 Err(Error::InvalidStateTransition(self, new_state)) 357 } 358 VmState::Running | VmState::Shutdown => Ok(()), 359 }, 360 VmState::BreakPoint => match new_state { 361 VmState::Created | VmState::Running => Ok(()), 362 _ => Err(Error::InvalidStateTransition(self, new_state)), 363 }, 364 } 365 } 366 } 367 368 struct VmOpsHandler { 369 memory: GuestMemoryAtomic<GuestMemoryMmap>, 370 #[cfg(target_arch = "x86_64")] 371 io_bus: Arc<Bus>, 372 mmio_bus: Arc<Bus>, 373 } 374 375 impl VmOps for VmOpsHandler { 376 fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> { 377 self.memory 378 .memory() 379 .write(buf, GuestAddress(gpa)) 380 .map_err(|e| HypervisorVmError::GuestMemWrite(e.into())) 381 } 382 383 fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> { 384 self.memory 385 .memory() 386 .read(buf, GuestAddress(gpa)) 387 .map_err(|e| HypervisorVmError::GuestMemRead(e.into())) 388 } 389 390 fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 391 if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) { 392 info!("Guest MMIO read to unregistered address 0x{:x}", gpa); 393 } 394 Ok(()) 395 } 396 397 fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 398 match self.mmio_bus.write(gpa, data) { 399 Err(vm_device::BusError::MissingAddressRange) => { 400 info!("Guest MMIO write to unregistered address 0x{:x}", gpa); 401 } 402 Ok(Some(barrier)) => { 403 info!("Waiting for barrier"); 404 barrier.wait(); 405 info!("Barrier released"); 406 } 407 _ => {} 408 }; 409 Ok(()) 410 } 411 412 #[cfg(target_arch = "x86_64")] 413 fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 414 if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) { 415 info!("Guest PIO read to 
unregistered address 0x{:x}", port); 416 } 417 Ok(()) 418 } 419 420 #[cfg(target_arch = "x86_64")] 421 fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 422 match self.io_bus.write(port, data) { 423 Err(vm_device::BusError::MissingAddressRange) => { 424 info!("Guest PIO write to unregistered address 0x{:x}", port); 425 } 426 Ok(Some(barrier)) => { 427 info!("Waiting for barrier"); 428 barrier.wait(); 429 info!("Barrier released"); 430 } 431 _ => {} 432 }; 433 Ok(()) 434 } 435 } 436 437 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 { 438 let host_phys_bits = get_host_cpu_phys_bits(hypervisor); 439 440 cmp::min(host_phys_bits, max_phys_bits) 441 } 442 443 pub struct Vm { 444 #[cfg(feature = "tdx")] 445 kernel: Option<File>, 446 initramfs: Option<File>, 447 threads: Vec<thread::JoinHandle<()>>, 448 device_manager: Arc<Mutex<DeviceManager>>, 449 config: Arc<Mutex<VmConfig>>, 450 state: RwLock<VmState>, 451 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 452 memory_manager: Arc<Mutex<MemoryManager>>, 453 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 454 // The hypervisor abstracted virtual machine. 
/// Builds a `Vm` on top of an already constructed memory manager and
/// hypervisor VM.
///
/// The steps run in a fixed order: validate the config, create the NUMA
/// nodes, create the CPU manager and the boot vCPUs, create the device
/// manager and its devices, then open the kernel (TDX builds) and initramfs
/// files named in the payload config.
///
/// When `snapshot` is `None`, payload loading is started on a background
/// thread via `load_payload_async` and the VM starts in `VmState::Created`.
/// When `snapshot` is `Some`, no payload is loaded, the CPU and device
/// managers are given their sections of the snapshot, and the VM starts in
/// `VmState::Paused`.
///
/// # Errors
///
/// Returns the first failure from any step, wrapped in the matching
/// [`Error`] variant (`ConfigValidation`, `CpuManager`, `DeviceManager`,
/// `EventFdClone`, `KernelFile`, `InitramfsFile`, `Restore`, ...).
///
/// # Panics
///
/// Panics if one of the config, memory-manager, CPU-manager or
/// device-manager mutexes is poisoned.
#[allow(clippy::too_many_arguments)]
pub fn new_from_memory_manager(
    config: Arc<Mutex<VmConfig>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    vm: Arc<dyn hypervisor::Vm>,
    exit_evt: EventFd,
    reset_evt: EventFd,
    #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
    seccomp_action: &SeccompAction,
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    activate_evt: EventFd,
    timestamp: Instant,
    serial_pty: Option<PtyPair>,
    console_pty: Option<PtyPair>,
    console_resize_pipe: Option<File>,
    original_termios: Arc<Mutex<Option<termios>>>,
    snapshot: Option<Snapshot>,
) -> Result<Self> {
    trace_scoped!("Vm::new_from_memory_manager");

    // Validation also yields the boot id list that the device manager needs.
    let boot_id_list = config
        .lock()
        .unwrap()
        .validate()
        .map_err(Error::ConfigValidation)?;

    // Without IGVM support the payload can start loading right away, in
    // parallel with the rest of the VM construction. Nothing is loaded when
    // restoring from a snapshot.
    #[cfg(not(feature = "igvm"))]
    let load_payload_handle = if snapshot.is_none() {
        Self::load_payload_async(&memory_manager, &config)?
    } else {
        None
    };

    info!("Booting VM from config: {:?}", &config);

    // Create NUMA nodes based on NumaConfig.
    let numa_nodes =
        Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

    #[cfg(feature = "tdx")]
    let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
    #[cfg(feature = "sev_snp")]
    let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
    // The IOMMU is forced exactly when TDX is enabled.
    #[cfg(feature = "tdx")]
    let force_iommu = tdx_enabled;
    #[cfg(not(feature = "tdx"))]
    let force_iommu = false;

    #[cfg(feature = "guest_debug")]
    let stop_on_boot = config.lock().unwrap().gdb;
    #[cfg(not(feature = "guest_debug"))]
    let stop_on_boot = false;

    let memory = memory_manager.lock().unwrap().guest_memory();
    #[cfg(target_arch = "x86_64")]
    let io_bus = Arc::new(Bus::new());
    let mmio_bus = Arc::new(Bus::new());

    // The same bus instances are shared between the vCPU exit handler
    // (`VmOpsHandler`) and the device manager created further down.
    let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
        memory,
        #[cfg(target_arch = "x86_64")]
        io_bus: io_bus.clone(),
        mmio_bus: mmio_bus.clone(),
    });

    // Borrow of a clone of the CPU config, so the value used below does not
    // borrow from the config mutex guard.
    let cpus_config = { &config.lock().unwrap().cpus.clone() };
    let cpu_manager = cpu::CpuManager::new(
        cpus_config,
        vm.clone(),
        exit_evt.try_clone().map_err(Error::EventFdClone)?,
        reset_evt.try_clone().map_err(Error::EventFdClone)?,
        #[cfg(feature = "guest_debug")]
        vm_debug_evt,
        &hypervisor,
        seccomp_action.clone(),
        vm_ops,
        #[cfg(feature = "tdx")]
        tdx_enabled,
        &numa_nodes,
        #[cfg(feature = "sev_snp")]
        sev_snp_enabled,
    )
    .map_err(Error::CpuManager)?;

    #[cfg(target_arch = "x86_64")]
    cpu_manager
        .lock()
        .unwrap()
        .populate_cpuid(
            &memory_manager,
            &hypervisor,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        )
        .map_err(Error::CpuManager)?;

    // Loading the igvm file is pushed down here because
    // igvm parser needs cpu_manager to retrieve cpuid leaf.
    // For the regular case, we can start loading early, but for
    // igvm case we have to wait until cpu_manager is created.
    // Currently, Microsoft Hypervisor does not provide any
    // Hypervisor specific common cpuid, we need to call get_cpuid_values
    // per cpuid through cpu_manager.
    #[cfg(feature = "igvm")]
    let load_payload_handle = if snapshot.is_none() {
        Self::load_payload_async(&memory_manager, &config, &cpu_manager)?
    } else {
        None
    };
    // The initial TDX configuration must be done before the vCPUs are
    // created
    #[cfg(feature = "tdx")]
    if tdx_enabled {
        let cpuid = cpu_manager.lock().unwrap().common_cpuid();
        let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
        vm.tdx_init(&cpuid, max_vcpus)
            .map_err(Error::InitializeTdxVm)?;
    }

    // The boot vCPUs receive the CPU manager's section of the snapshot, if any.
    cpu_manager
        .lock()
        .unwrap()
        .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
        .map_err(Error::CpuManager)?;

    // This initial SEV-SNP configuration must be done immediately after
    // vCPUs are created. As part of this initialization we are
    // transitioning the guest into secure state.
    #[cfg(feature = "sev_snp")]
    if sev_snp_enabled {
        vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
    }

    // `dynamic` is turned off only when TDX is enabled.
    #[cfg(feature = "tdx")]
    let dynamic = !tdx_enabled;
    #[cfg(not(feature = "tdx"))]
    let dynamic = true;

    // `reset_evt` is moved in here; every earlier user received a clone.
    let device_manager = DeviceManager::new(
        #[cfg(target_arch = "x86_64")]
        io_bus,
        mmio_bus,
        hypervisor.hypervisor_type(),
        vm.clone(),
        config.clone(),
        memory_manager.clone(),
        cpu_manager.clone(),
        exit_evt.try_clone().map_err(Error::EventFdClone)?,
        reset_evt,
        seccomp_action.clone(),
        numa_nodes.clone(),
        &activate_evt,
        force_iommu,
        boot_id_list,
        timestamp,
        snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
        dynamic,
    )
    .map_err(Error::DeviceManager)?;

    device_manager
        .lock()
        .unwrap()
        .create_devices(
            serial_pty,
            console_pty,
            console_resize_pipe,
            original_termios,
        )
        .map_err(Error::DeviceManager)?;

    // `None` when there is no payload or no kernel path; an open failure is
    // reported as `Error::KernelFile`.
    #[cfg(feature = "tdx")]
    let kernel = config
        .lock()
        .unwrap()
        .payload
        .as_ref()
        .map(|p| p.kernel.as_ref().map(File::open))
        .unwrap_or_default()
        .transpose()
        .map_err(Error::KernelFile)?;

    // Same shape as above for the initramfs path.
    let initramfs = config
        .lock()
        .unwrap()
        .payload
        .as_ref()
        .map(|p| p.initramfs.as_ref().map(File::open))
        .unwrap_or_default()
        .transpose()
        .map_err(Error::InitramfsFile)?;

    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        vm_snapshot.clock
    } else {
        None
    };

    // A VM built from a snapshot starts out paused; a fresh one is created.
    let vm_state = if snapshot.is_some() {
        VmState::Paused
    } else {
        VmState::Created
    };

    Ok(Vm {
        #[cfg(feature = "tdx")]
        kernel,
        initramfs,
        device_manager,
        config,
        threads: Vec::with_capacity(1),
        state: RwLock::new(vm_state),
        cpu_manager,
        memory_manager,
        vm,
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        saved_clock,
        numa_nodes,
        hypervisor,
        stop_on_boot,
        load_payload_handle,
    })
}
memory_manager, 684 vm, 685 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 686 saved_clock, 687 numa_nodes, 688 hypervisor, 689 stop_on_boot, 690 load_payload_handle, 691 }) 692 } 693 694 fn create_numa_nodes( 695 configs: Option<Vec<NumaConfig>>, 696 memory_manager: &Arc<Mutex<MemoryManager>>, 697 ) -> Result<NumaNodes> { 698 let mm = memory_manager.lock().unwrap(); 699 let mm_zones = mm.memory_zones(); 700 let mut numa_nodes = BTreeMap::new(); 701 702 if let Some(configs) = &configs { 703 for config in configs.iter() { 704 if numa_nodes.contains_key(&config.guest_numa_id) { 705 error!("Can't define twice the same NUMA node"); 706 return Err(Error::InvalidNumaConfig); 707 } 708 709 let mut node = NumaNode::default(); 710 711 if let Some(memory_zones) = &config.memory_zones { 712 for memory_zone in memory_zones.iter() { 713 if let Some(mm_zone) = mm_zones.get(memory_zone) { 714 node.memory_regions.extend(mm_zone.regions().clone()); 715 if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() { 716 node.hotplug_regions.push(virtiomem_zone.region().clone()); 717 } 718 node.memory_zones.push(memory_zone.clone()); 719 } else { 720 error!("Unknown memory zone '{}'", memory_zone); 721 return Err(Error::InvalidNumaConfig); 722 } 723 } 724 } 725 726 if let Some(cpus) = &config.cpus { 727 node.cpus.extend(cpus); 728 } 729 730 if let Some(pci_segments) = &config.pci_segments { 731 node.pci_segments.extend(pci_segments); 732 } 733 734 if let Some(distances) = &config.distances { 735 for distance in distances.iter() { 736 let dest = distance.destination; 737 let dist = distance.distance; 738 739 if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) { 740 error!("Unknown destination NUMA node {}", dest); 741 return Err(Error::InvalidNumaConfig); 742 } 743 744 if node.distances.contains_key(&dest) { 745 error!("Destination NUMA node {} has been already set", dest); 746 return Err(Error::InvalidNumaConfig); 747 } 748 749 node.distances.insert(dest, dist); 750 } 751 } 752 
753 #[cfg(target_arch = "x86_64")] 754 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 755 if let Some(sgx_epc_region) = mm.sgx_epc_region() { 756 let mm_sections = sgx_epc_region.epc_sections(); 757 for sgx_epc_section in sgx_epc_sections.iter() { 758 if let Some(mm_section) = mm_sections.get(sgx_epc_section) { 759 node.sgx_epc_sections.push(mm_section.clone()); 760 } else { 761 error!("Unknown SGX EPC section '{}'", sgx_epc_section); 762 return Err(Error::InvalidNumaConfig); 763 } 764 } 765 } else { 766 error!("Missing SGX EPC region"); 767 return Err(Error::InvalidNumaConfig); 768 } 769 } 770 771 numa_nodes.insert(config.guest_numa_id, node); 772 } 773 } 774 775 Ok(numa_nodes) 776 } 777 778 #[allow(clippy::too_many_arguments)] 779 pub fn new( 780 vm_config: Arc<Mutex<VmConfig>>, 781 exit_evt: EventFd, 782 reset_evt: EventFd, 783 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 784 seccomp_action: &SeccompAction, 785 hypervisor: Arc<dyn hypervisor::Hypervisor>, 786 activate_evt: EventFd, 787 serial_pty: Option<PtyPair>, 788 console_pty: Option<PtyPair>, 789 console_resize_pipe: Option<File>, 790 original_termios: Arc<Mutex<Option<termios>>>, 791 snapshot: Option<Snapshot>, 792 source_url: Option<&str>, 793 prefault: Option<bool>, 794 ) -> Result<Self> { 795 trace_scoped!("Vm::new"); 796 797 let timestamp = Instant::now(); 798 799 #[cfg(feature = "tdx")] 800 let tdx_enabled = if snapshot.is_some() { 801 false 802 } else { 803 vm_config.lock().unwrap().is_tdx_enabled() 804 }; 805 806 #[cfg(feature = "sev_snp")] 807 let sev_snp_enabled = if snapshot.is_some() { 808 false 809 } else { 810 vm_config.lock().unwrap().is_sev_snp_enabled() 811 }; 812 813 let vm = Self::create_hypervisor_vm( 814 &hypervisor, 815 #[cfg(feature = "tdx")] 816 tdx_enabled, 817 #[cfg(feature = "sev_snp")] 818 sev_snp_enabled, 819 )?; 820 821 let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); 822 823 let memory_manager = if let 
/// Creates the hypervisor-level VM object.
///
/// With the `tdx` feature the VM type passed to the hypervisor is
/// `u64::from(tdx_enabled)`; otherwise, with the `sev_snp` feature, it is
/// `u64::from(sev_snp_enabled)`; with neither feature a default VM is
/// created. When both features are compiled in, the `tdx` branch is the one
/// used. On x86_64 the identity map address, the TSS address and split IRQ
/// are configured on the new VM before it is returned.
///
/// # Panics
///
/// Although the signature is fallible, every hypervisor call in here is
/// unwrapped, so any failure (missing required extensions, VM creation,
/// x86_64 setup calls) panics instead of returning an `Err`.
/// NOTE(review): consider propagating these as `Error` variants.
pub fn create_hypervisor_vm(
    hypervisor: &Arc<dyn hypervisor::Hypervisor>,
    #[cfg(feature = "tdx")] tdx_enabled: bool,
    #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
) -> Result<Arc<dyn hypervisor::Vm>> {
    hypervisor.check_required_extensions().unwrap();

    // Exactly one branch survives compilation; each one binds `vm` for the
    // code that follows the macro.
    cfg_if::cfg_if! {
        if #[cfg(feature = "tdx")] {
            // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true
            // Otherwise KVM_X86_LEGACY_VM: 0
            // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM
            let vm = hypervisor
                .create_vm_with_type(u64::from(tdx_enabled))
                .unwrap();
        } else if #[cfg(feature = "sev_snp")] {
            // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true
            // Otherwise SEV_SNP_DISABLED: 0
            // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false
            let vm = hypervisor
                .create_vm_with_type(u64::from(sev_snp_enabled))
                .unwrap();
        } else {
            let vm = hypervisor.create_vm().unwrap();
        }
    }

    // x86_64-only VM setup, using the fixed addresses from `arch::layout`.
    #[cfg(target_arch = "x86_64")]
    {
        vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
            .unwrap();
        vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
        vm.enable_split_irq().unwrap();
    }

    Ok(vm)
}
{ 882 if #[cfg(feature = "tdx")] { 883 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true 884 // Otherwise KVM_X86_LEGACY_VM: 0 885 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM 886 let vm = hypervisor 887 .create_vm_with_type(u64::from(tdx_enabled)) 888 .unwrap(); 889 } else if #[cfg(feature = "sev_snp")] { 890 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true 891 // Otherwise SEV_SNP_DISABLED: 0 892 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false 893 let vm = hypervisor 894 .create_vm_with_type(u64::from(sev_snp_enabled)) 895 .unwrap(); 896 } else { 897 let vm = hypervisor.create_vm().unwrap(); 898 } 899 } 900 901 #[cfg(target_arch = "x86_64")] 902 { 903 vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0) 904 .unwrap(); 905 vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap(); 906 vm.enable_split_irq().unwrap(); 907 } 908 909 Ok(vm) 910 } 911 912 fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> { 913 let initramfs = self.initramfs.as_mut().unwrap(); 914 let size: usize = initramfs 915 .seek(SeekFrom::End(0)) 916 .map_err(|_| Error::InitramfsLoad)? 
.try_into()
.unwrap();
    initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;

    let address =
        arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
    let address = GuestAddress(address);

    guest_mem
        .read_volatile_from(address, initramfs, size)
        .map_err(|_| Error::InitramfsLoad)?;

    info!("Initramfs loaded: address = 0x{:x}", address.0);
    Ok(arch::InitramfsConfig { address, size })
}

/// Builds the kernel command line for `payload`.
///
/// Starts from an empty `Cmdline` bounded by `arch::CMDLINE_MAX_SIZE` and
/// inserts the user supplied `payload.cmdline`, if any. On aarch64 the
/// entries returned by `DeviceManager::cmdline_additions()` are appended
/// afterwards.
///
/// # Errors
///
/// `Error::CmdLineCreate` if the command line cannot be created and
/// `Error::CmdLineInsertStr` if any string cannot be inserted.
pub fn generate_cmdline(
    payload: &PayloadConfig,
    #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
) -> Result<Cmdline> {
    let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
    if let Some(s) = payload.cmdline.as_ref() {
        cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
    }

    #[cfg(target_arch = "aarch64")]
    for entry in device_manager.lock().unwrap().cmdline_additions() {
        cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
    }
    Ok(cmdline)
}

/// Loads a UEFI firmware image into the UEFI flash memory at
/// `arch::layout::UEFI_START`.
///
/// # Errors
///
/// `Error::UefiLoad` if the image cannot be loaded.
#[cfg(target_arch = "aarch64")]
fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
    let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
    let mem = uefi_flash.memory();
    arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
        .map_err(Error::UefiLoad)?;
    Ok(())
}

/// Loads either a kernel or a firmware image (exactly one of the two must be
/// provided) and returns the guest entry point.
///
/// A kernel is first tried as a PE image; if the PE magic number is invalid
/// the same file is loaded as a UEFI binary instead.
///
/// # Errors
///
/// `Error::InvalidPayload` when both or neither of `firmware` and `kernel`
/// are given, `Error::KernelLoad` for any other PE loading failure, and
/// whatever `load_firmware` reports.
#[cfg(target_arch = "aarch64")]
fn load_kernel(
    firmware: Option<File>,
    kernel: Option<File>,
    memory_manager: Arc<Mutex<MemoryManager>>,
) -> Result<EntryPoint> {
    let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
    let mem = guest_memory.memory();
    let entry_addr = match (firmware, kernel) {
        (None, Some(mut kernel)) => {
            match linux_loader::loader::pe::PE::load(
                mem.deref(),
                Some(arch::layout::KERNEL_START),
                &mut kernel,
                None,
            ) {
                Ok(entry_addr) => entry_addr.kernel_load,
                // Try to load the binary as kernel PE file at first.
                // If failed, retry to load it as UEFI binary.
                // As the UEFI binary is formatless, it must be the last option to try.
                Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                    Self::load_firmware(&kernel, memory_manager)?;
                    arch::layout::UEFI_START
                }
                Err(e) => {
                    return Err(Error::KernelLoad(e));
                }
            }
        }
        (Some(firmware), None) => {
            Self::load_firmware(&firmware, memory_manager)?;
            arch::layout::UEFI_START
        }
        _ => return Err(Error::InvalidPayload),
    };

    Ok(EntryPoint { entry_addr })
}

/// Loads an IGVM file through `igvm_loader::load_igvm` and derives the entry
/// point from the loader result.
///
/// With the `sev_snp` feature, and when the CPU manager reports SEV-SNP as
/// enabled, the entry address is `res.vmsa_gpa`; otherwise it is
/// `res.vmsa.rip`.
///
/// # Errors
///
/// `Error::IgvmLoad` if the loader fails.
#[cfg(feature = "igvm")]
fn load_igvm(
    igvm: File,
    memory_manager: Arc<Mutex<MemoryManager>>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
) -> Result<EntryPoint> {
    let res = igvm_loader::load_igvm(&igvm, memory_manager, cpu_manager.clone(), "")
        .map_err(Error::IgvmLoad)?;

    cfg_if::cfg_if! {
        if #[cfg(feature = "sev_snp")] {
            let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
                EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa) }
            } else {
                EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip) }
            };
        } else {
            let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip) };
        }
    };
    Ok(entry_point)
}

/// Loads an ELF kernel into guest memory, optionally writes the command line
/// at `arch::layout::CMDLINE_START`, and returns the PVH entry point.
///
/// # Errors
///
/// `Error::KernelLoad` if the ELF loader fails, `Error::LoadCmdLine` if the
/// command line cannot be written, and `Error::KernelMissingPvhHeader` if the
/// loaded image does not advertise a PVH entry point.
#[cfg(target_arch = "x86_64")]
fn load_kernel(
    mut kernel: File,
    cmdline: Option<Cmdline>,
    memory_manager: Arc<Mutex<MemoryManager>>,
) -> Result<EntryPoint> {
    info!("Loading kernel");

    // Scope the lock: only the memory snapshot is needed afterwards.
    let mem = {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        guest_memory.memory()
    };
    let entry_addr = linux_loader::loader::elf::Elf::load(
        mem.deref(),
        None,
        &mut kernel,
        Some(arch::layout::HIGH_RAM_START),
    )
    .map_err(Error::KernelLoad)?;

    if let Some(cmdline) = cmdline {
        linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
            .map_err(Error::LoadCmdLine)?;
    }

    if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
        // Use the PVH kernel entry point to boot the guest
        info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
        Ok(EntryPoint { entry_addr })
    } else {
        Err(Error::KernelMissingPvhHeader)
    }
}

/// Loads the configured payload and returns its entry point.
///
/// With the `igvm` feature an IGVM file, when configured, takes precedence.
/// Otherwise exactly one of the following combinations is accepted:
/// a firmware alone (no kernel, initramfs or command line), or a kernel with
/// any initramfs/command line. Everything else is `Error::InvalidPayload`.
///
/// # Errors
///
/// `Error::IgvmFile`, `Error::FirmwareFile` or `Error::KernelFile` when the
/// respective file cannot be opened, plus any error from the loaders.
#[cfg(target_arch = "x86_64")]
fn load_payload(
    payload: &PayloadConfig,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
) -> Result<EntryPoint> {
    trace_scoped!("load_payload");
    #[cfg(feature = "igvm")]
    if let Some(igvm_path) = &payload.igvm {
        let igvm = File::open(igvm_path).map_err(Error::IgvmFile)?;
        return Self::load_igvm(igvm, memory_manager, cpu_manager);
    }
    match (
        &payload.firmware,
        &payload.kernel,
        &payload.initramfs,
        &payload.cmdline,
    ) {
        (Some(firmware), None, None, None) => {
            let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
            Self::load_kernel(firmware, None, memory_manager)
        }
        (None, Some(kernel), _, _) => {
            let kernel = File::open(kernel).map_err(Error::KernelFile)?;
            let cmdline = Self::generate_cmdline(payload)?;
            Self::load_kernel(kernel, Some(cmdline), memory_manager)
        }
        _ => Err(Error::InvalidPayload),
    }
}

/// Loads the configured payload (exactly one of firmware or kernel) and
/// returns its entry point.
///
/// # Errors
///
/// `Error::FirmwareFile` / `Error::KernelFile` when the file cannot be
/// opened, `Error::InvalidPayload` when both or neither are configured, plus
/// any error from `load_kernel`.
#[cfg(target_arch = "aarch64")]
fn load_payload(
    payload: &PayloadConfig,
    memory_manager: Arc<Mutex<MemoryManager>>,
) -> Result<EntryPoint> {
    match (&payload.firmware, &payload.kernel) {
        (Some(firmware), None) => {
            let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
            Self::load_kernel(Some(firmware), None, memory_manager)
        }
        (None, Some(kernel)) => {
            let kernel = File::open(kernel).map_err(Error::KernelFile)?;
            Self::load_kernel(None, Some(kernel), memory_manager)
        }
        _ => Err(Error::InvalidPayload),
    }
}

/// Spawns a `payload_loader` thread that runs `load_payload` on a clone of
/// the configured payload.
///
/// Returns `Ok(None)` when no payload is configured or, with the `tdx`
/// feature, when TDX is enabled (the kernel is then loaded differently).
///
/// # Errors
///
/// `Error::KernelLoadThreadSpawn` if the thread cannot be spawned.
fn load_payload_async(
    memory_manager: &Arc<Mutex<MemoryManager>>,
    config: &Arc<Mutex<VmConfig>>,
    #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
    // Kernel with TDX is loaded in a different manner
    #[cfg(feature = "tdx")]
    if config.lock().unwrap().is_tdx_enabled() {
        return Ok(None);
    }

    config
        .lock()
        .unwrap()
        .payload
        .as_ref()
        .map(|payload| {
            let memory_manager = memory_manager.clone();
            let payload = payload.clone();
            #[cfg(feature = "igvm")]
            let cpu_manager = cpu_manager.clone();

            std::thread::Builder::new()
                .name("payload_loader".into())
                .spawn(move || {
                    Self::load_payload(
                        &payload,
                        memory_manager,
                        #[cfg(feature = "igvm")]
                        cpu_manager,
                    )
                })
                .map_err(Error::KernelLoadThreadSpawn)
        })
        .transpose()
}

/// Gathers the boot-time parameters (initramfs, vCPU count and topology,
/// RSDP address, SGX EPC region and platform identification strings) and
/// hands them to `arch::configure_system`.
///
/// # Errors
///
/// Any error from `load_initramfs`, or `Error::ConfigureSystem` if the
/// architecture specific configuration fails.
#[cfg(target_arch = "x86_64")]
fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
    trace_scoped!("configure_system");
    info!("Configuring system");
    let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

    let initramfs_config = match self.initramfs {
        Some(_) => Some(self.load_initramfs(&mem)?),
        None => None,
    };

    let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
    let rsdp_addr = Some(rsdp_addr);
    let sgx_epc_region = self
        .memory_manager
        .lock()
        .unwrap()
        .sgx_epc_region()
        .as_ref()
        .cloned();

    // Read every platform field under a single lock acquisition so the three
    // values come from the same configuration snapshot.
    let (serial_number, uuid, oem_strings) = {
        let config = self.config.lock().unwrap();
        let platform = config.platform.as_ref();
        (
            platform.and_then(|p| p.serial_number.clone()),
            platform.and_then(|p| p.uuid.clone()),
            platform.and_then(|p| p.oem_strings.clone()),
        )
    };

    let oem_strings = oem_strings
        .as_deref()
        .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());

    let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();

    arch::configure_system(
        &mem,
        arch::layout::CMDLINE_START,
        &initramfs_config,
        boot_vcpus,
        rsdp_addr,
        sgx_epc_region,
        serial_number.as_deref(),
        uuid.as_deref(),
        oem_strings.as_deref(),
        topology,
    )
    .map_err(Error::ConfigureSystem)?;
    Ok(())
}

/// Gathers the boot-time parameters (command line, vCPU MPIDRs and topology,
/// device info, initramfs, PCI segment layout, virtio-iommu BDF, vGIC and
/// PMU support) and hands them to `arch::configure_system`.
///
/// Panics if no payload is configured.
///
/// # Errors
///
/// Errors from `generate_cmdline` and `load_initramfs`, and
/// `Error::ConfigureSystem` when the vGIC cannot be obtained, the PMU cannot
/// be initialised, or the architecture specific configuration fails.
#[cfg(target_arch = "aarch64")]
fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
    let cmdline = Self::generate_cmdline(
        self.config.lock().unwrap().payload.as_ref().unwrap(),
        &self.device_manager,
    )?;
    let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
    let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
    let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
    let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
    let initramfs_config = match self.initramfs {
        Some(_) => Some(self.load_initramfs(&mem)?),
        None => None,
    };

    let device_info = &self
        .device_manager
        .lock()
        .unwrap()
        .get_device_info()
        .clone();

    for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
        let pci_space = PciSpaceInfo {
            pci_segment_id: pci_segment.id,
            mmio_config_address: pci_segment.mmio_config_address,
            pci_device_space_start: pci_segment.start_of_mem64_area,
            // The end address is used as an inclusive bound, hence the `+ 1`.
            pci_device_space_size: pci_segment.end_of_mem64_area
                - pci_segment.start_of_mem64_area
                + 1,
        };
        pci_space_info.push(pci_space);
    }

    let virtio_iommu_bdf = self
        .device_manager
        .lock()
        .unwrap()
        .iommu_attached_devices()
        .as_ref()
        .map(|(v, _)| *v);

    let vgic = self
        .device_manager
        .lock()
        .unwrap()
        .get_interrupt_controller()
        .unwrap()
        .lock()
        .unwrap()
        .get_vgic()
        .map_err(|_| {
            Error::ConfigureSystem(arch::Error::PlatformSpecific(
                arch::aarch64::Error::SetupGic,
            ))
        })?;

    // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
    let pmu_supported = self
        .cpu_manager
        .lock()
        .unwrap()
        .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
        .map_err(|_| {
            Error::ConfigureSystem(arch::Error::PlatformSpecific(
                arch::aarch64::Error::VcpuInitPmu,
            ))
        })?;

    arch::configure_system(
        &mem,
        cmdline.as_cstring().unwrap().to_str().unwrap(),
        vcpu_mpidrs,
        vcpu_topology,
        device_info,
        &initramfs_config,
        &pci_space_info,
        virtio_iommu_bdf.map(|bdf| bdf.into()),
        &vgic,
        &self.numa_nodes,
        pmu_supported,
    )
    .map_err(Error::ConfigureSystem)?;

    Ok(())
}

/// Returns the serial PTY pair reported by the device manager, if any.
pub fn serial_pty(&self) -> Option<PtyPair> {
    self.device_manager.lock().unwrap().serial_pty()
}

/// Returns the console PTY pair reported by the device manager, if any.
pub fn console_pty(&self) -> Option<PtyPair> {
    self.device_manager.lock().unwrap().console_pty()
}

/// Returns the console resize pipe reported by the device manager, if any.
pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
    self.device_manager.lock().unwrap().console_resize_pipe()
}

/// Shuts the VM down: resumes the devices, shuts the vCPUs down, joins the
/// VM threads and moves the state to `VmState::Shutdown`.
///
/// # Errors
///
/// `Error::PoisonedState` if the state lock cannot be taken, the error from
/// `valid_transition` if the current state does not allow a shutdown,
/// `Error::Resume`, `Error::CpuManager` or `Error::ThreadCleanup` if the
/// respective step fails.
pub fn shutdown(&mut self) -> Result<()> {
    let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
    let new_state = VmState::Shutdown;

    state.valid_transition(new_state)?;

    // Wake up the DeviceManager threads so they will get terminated cleanly
    self.device_manager
        .lock()
        .unwrap()
        .resume()
        .map_err(Error::Resume)?;

    self.cpu_manager
        .lock()
        .unwrap()
        .shutdown()
        .map_err(Error::CpuManager)?;

    // Wait for all the threads to finish
    for thread in self.threads.drain(..) {
        thread.join().map_err(Error::ThreadCleanup)?
    }
    *state = new_state;

    event!("vm", "shutdown");

    Ok(())
}

/// Resizes the vCPU count, the memory size and/or the balloon size; each
/// `None` argument leaves the corresponding resource untouched.
///
/// The VM configuration is updated alongside so that a reboot uses the new
/// values.
///
/// # Errors
///
/// `Error::CpuManager`, `Error::MemoryManager` or `Error::DeviceManager`
/// depending on which component rejected the request.
pub fn resize(
    &mut self,
    desired_vcpus: Option<u8>,
    desired_memory: Option<u64>,
    desired_balloon: Option<u64>,
) -> Result<()> {
    event!("vm", "resizing");

    if let Some(desired_vcpus) = desired_vcpus {
        // Only notify the guest when the CPU manager reports a change.
        if self
            .cpu_manager
            .lock()
            .unwrap()
            .resize(desired_vcpus)
            .map_err(Error::CpuManager)?
        {
            self.device_manager
                .lock()
                .unwrap()
                .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                .map_err(Error::DeviceManager)?;
        }
        self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
    }

    if let Some(desired_memory) = desired_memory {
        let new_region = self
            .memory_manager
            .lock()
            .unwrap()
            .resize(desired_memory)
            .map_err(Error::MemoryManager)?;

        let memory_config = &mut self.config.lock().unwrap().memory;

        if let Some(new_region) = &new_region {
            self.device_manager
                .lock()
                .unwrap()
                .update_memory(new_region)
                .map_err(Error::DeviceManager)?;

            match memory_config.hotplug_method {
                HotplugMethod::Acpi => {
                    self.device_manager
                        .lock()
                        .unwrap()
                        .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                        .map_err(Error::DeviceManager)?;
                }
                HotplugMethod::VirtioMem => {}
            }
        }

        // We update the VM config regardless of the actual guest resize
        // operation result (happened or not), so that if the VM reboots
        // it will be running with the last configured memory size.
        match memory_config.hotplug_method {
            HotplugMethod::Acpi => memory_config.size = desired_memory,
            HotplugMethod::VirtioMem => {
                if desired_memory > memory_config.size {
                    memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                } else {
                    memory_config.hotplugged_size = None;
                }
            }
        }
    }

    if let Some(desired_balloon) = desired_balloon {
        self.device_manager
            .lock()
            .unwrap()
            .resize_balloon(desired_balloon)
            .map_err(Error::DeviceManager)?;

        // Update the configuration value for the balloon size to ensure
        // a reboot would use the right value.
        if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
            balloon_config.size = desired_balloon;
        }
    }

    event!("vm", "resized");

    Ok(())
}

/// Resizes the memory zone `id` to `desired_memory` bytes by hotplugging the
/// difference to the zone's boot size.
///
/// # Errors
///
/// `Error::ResizeZone` if the zone does not exist or if `desired_memory` is
/// smaller than the zone's boot size, `Error::MemoryManager` if the memory
/// manager rejects the request.
pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
    let memory_config = &mut self.config.lock().unwrap().memory;

    if let Some(zones) = &mut memory_config.zones {
        for zone in zones.iter_mut() {
            if zone.id == id {
                if desired_memory >= zone.size {
                    let hotplugged_size = desired_memory - zone.size;
                    self.memory_manager
                        .lock()
                        .unwrap()
                        .resize_zone(&id, hotplugged_size)
                        .map_err(Error::MemoryManager)?;
                    // We update the memory zone config regardless of the
                    // actual 'resize-zone' operation result (happened or
                    // not), so that if the VM reboots it will be running
                    // with the last configured memory zone size.
                    zone.hotplugged_size = Some(hotplugged_size);

                    return Ok(());
                } else {
                    error!(
                        "Invalid to ask less ({}) than boot RAM ({}) for \
                        this memory zone",
                        desired_memory, zone.size,
                    );
                    return Err(Error::ResizeZone);
                }
            }
        }
    }

    error!("Could not find the memory zone {} for the resize", id);
    Err(Error::ResizeZone)
}

/// Hotplugs a passthrough device, records it in the VM configuration and
/// notifies the guest that the PCI devices changed.
///
/// # Errors
///
/// `Error::DeviceManager` if the device cannot be added or the notification
/// fails.
pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_device(&mut device_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.devices, device_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a user device, records it in the VM configuration and notifies
/// the guest that the PCI devices changed.
///
/// # Errors
///
/// `Error::DeviceManager` if the device cannot be added or the notification
/// fails.
pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_user_device(&mut device_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.user_devices, device_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Removes the device `id`, drops it from the VM configuration and notifies
/// the guest that the PCI devices changed.
///
/// # Errors
///
/// `Error::DeviceManager` if the device cannot be removed or the
/// notification fails.
pub fn remove_device(&mut self, id: String) -> Result<()> {
    self.device_manager
        .lock()
        .unwrap()
        .remove_device(id.clone())
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by removing the device. This is important to
    // ensure the device would not be created in case of a reboot.
    self.config.lock().unwrap().remove_device(&id);

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;
    Ok(())
}

/// Hotplugs a disk, records it in the VM configuration and notifies the
/// guest that the PCI devices changed.
///
/// # Errors
///
/// `Error::DeviceManager` if the disk cannot be added or the notification
/// fails.
pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_disk(&mut disk_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.disks, disk_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a shared filesystem device, records it in the VM configuration
/// and notifies the guest that the PCI devices changed.
///
/// # Errors
///
/// `Error::DeviceManager` if the device cannot be added or the notification
/// fails.
pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_fs(&mut fs_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.fs, fs_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a persistent memory device, records it in the VM configuration
/// and notifies the guest that the PCI devices changed.
///
/// # Errors
///
/// `Error::DeviceManager` if the device cannot be added or the notification
/// fails.
pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_pmem(&mut pmem_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.pmem, pmem_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a network device, records it in the VM configuration and
/// notifies the guest that the PCI devices changed.
///
/// # Errors
///
/// `Error::DeviceManager` if the device cannot be added or the notification
/// fails.
pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_net(&mut net_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.net, net_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a vDPA device, records it in the VM configuration and notifies
/// the guest that the PCI devices changed.
///
/// # Errors
///
/// `Error::DeviceManager` if the device cannot be added or the notification
/// fails.
pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_vdpa(&mut vdpa_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        add_to_config(&mut config.vdpa, vdpa_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Hotplugs a vsock device, stores it as the VM's vsock configuration
/// (replacing any previous value) and notifies the guest that the PCI
/// devices changed.
///
/// # Errors
///
/// `Error::DeviceManager` if the device cannot be added or the notification
/// fails.
pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
    let pci_device_info = self
        .device_manager
        .lock()
        .unwrap()
        .add_vsock(&mut vsock_cfg)
        .map_err(Error::DeviceManager)?;

    // Update VmConfig by adding the new device. This is important to
    // ensure the device would be created in case of a reboot.
    {
        let mut config = self.config.lock().unwrap();
        config.vsock = Some(vsock_cfg);
    }

    self.device_manager
        .lock()
        .unwrap()
        .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
        .map_err(Error::DeviceManager)?;

    Ok(pci_device_info)
}

/// Returns the counters exposed by the device manager. Currently infallible;
/// the `Result` is part of the public signature.
pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
    Ok(self.device_manager.lock().unwrap().counters())
}

/// Opens the configured TDVF firmware and parses its section table.
///
/// Panics if no payload is configured.
///
/// # Errors
///
/// `Error::TdxFirmwareMissing` if the payload has no firmware path,
/// `Error::LoadTdvf` if the file cannot be opened and `Error::ParseTdvf` if
/// the sections cannot be parsed.
#[cfg(feature = "tdx")]
fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
    use arch::x86_64::tdx::*;

    let firmware_path = self
        .config
        .lock()
        .unwrap()
        .payload
        .as_ref()
        .unwrap()
        .firmware
        .clone()
        .ok_or(Error::TdxFirmwareMissing)?;
    // The TDVF file contains a table of section as well as code
    let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

    // For all the sections allocate some RAM backing them
    parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
}

/// Interleaves guest RAM regions with TDVF sections and returns the
/// resulting `(start, size, is_ram)` list.
///
/// Sections are consumed with `pop()`, so `sorted_sections` must be sorted
/// by *descending* address for them to be visited in ascending order (the
/// caller in this file sorts ascending and then reverses).
#[cfg(feature = "tdx")]
fn hob_memory_resources(
    mut sorted_sections: Vec<TdvfSection>,
    guest_memory: &GuestMemoryMmap,
) -> Vec<(u64, u64, bool)> {
    let mut list = Vec::new();

    let mut current_section = sorted_sections.pop();

    // RAM regions interleaved with TDVF sections
    let mut next_start_addr = 0;
    for region in guest_memory.iter() {
        let region_start = region.start_addr().0;
        let region_end = region.last_addr().0;
        if region_start > next_start_addr {
            next_start_addr = region_start;
        }

        loop {
            let (start, size, ram) = if let Some(section) = &current_section {
                if section.address <= next_start_addr {
                    (section.address, section.size, false)
                } else {
                    // RAM up to whichever comes first: the next section or
                    // the end of this region.
                    let last_addr = std::cmp::min(section.address - 1, region_end);
                    (next_start_addr, last_addr - next_start_addr + 1, true)
                }
            } else {
                (next_start_addr, region_end - next_start_addr + 1, true)
            };

            list.push((start, size, ram));

            if !ram {
                current_section = sorted_sections.pop();
            }

            next_start_addr = start + size;

            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            if next_start_addr > region_end {
                break;
            }
        }
    }

    // Once all the interleaved sections have been processed, let's simply
    // pull the remaining ones.
    if let Some(section) = current_section {
        list.push((section.address, section.size, false));
    }
    while let Some(section) = sorted_sections.pop() {
        list.push((section.address, section.size, false));
    }

    list
}

/// Allocates RAM for the TDVF sections that are not already backed by guest
/// RAM, copies the firmware/payload/command line into guest memory and
/// builds the TD HOB.
///
/// Returns the guest address of the `TdHob` section.
///
/// # Panics
///
/// Panics if no payload is configured, if a copy into guest memory fails, or
/// if `sections` contains no `TdHob` section.
///
/// # Errors
///
/// `Error::AllocatingTdvfMemory`, `Error::TdxFirmwareMissing`,
/// `Error::LoadTdvf`, `Error::LoadPayload`, `Error::InvalidPayloadType`,
/// `Error::PopulateHob`, or any error from `generate_cmdline`.
#[cfg(feature = "tdx")]
fn populate_tdx_sections(
    &mut self,
    sections: &[TdvfSection],
    guid_found: bool,
) -> Result<Option<u64>> {
    use arch::x86_64::tdx::*;
    // Get the memory end *before* we start adding TDVF ram regions
    let boot_guest_memory = self
        .memory_manager
        .lock()
        .as_ref()
        .unwrap()
        .boot_guest_memory();
    for section in sections {
        // No need to allocate if the section falls within guest RAM ranges
        if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
            info!(
                "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                section
            );
            continue;
        }

        info!("Allocating TDVF Section: {:x?}", section);
        self.memory_manager
            .lock()
            .unwrap()
            .add_ram_region(GuestAddress(section.address), section.size as usize)
            .map_err(Error::AllocatingTdvfMemory)?;
    }

    // The TDVF file contains a table of section as well as code
    let firmware_path = self
        .config
        .lock()
        .unwrap()
        .payload
        .as_ref()
        .unwrap()
        .firmware
        .clone()
        .ok_or(Error::TdxFirmwareMissing)?;
    let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

    // The guest memory at this point now has all the required regions so it
    // is safe to copy from the TDVF file into it.
    let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
    let mem = guest_memory.memory();
    let mut payload_info = None;
    let mut hob_offset = None;
    for section in sections {
        info!("Populating TDVF Section: {:x?}", section);
        match section.r#type {
            TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                info!("Copying section to guest memory");
                firmware_file
                    .seek(SeekFrom::Start(section.data_offset as u64))
                    .map_err(Error::LoadTdvf)?;
                // NOTE(review): a short or unreadable firmware file panics
                // here; consider a dedicated error variant.
                mem.read_volatile_from(
                    GuestAddress(section.address),
                    &mut firmware_file,
                    section.data_size as usize,
                )
                .unwrap();
            }
            TdvfSectionType::TdHob => {
                hob_offset = Some(section.address);
            }
            TdvfSectionType::Payload => {
                info!("Copying payload to guest memory");
                if let Some(payload_file) = self.kernel.as_mut() {
                    let payload_size = payload_file
                        .seek(SeekFrom::End(0))
                        .map_err(Error::LoadPayload)?;

                    // 0x1f1 is where the setup header starts in a Linux
                    // x86 boot image.
                    payload_file
                        .seek(SeekFrom::Start(0x1f1))
                        .map_err(Error::LoadPayload)?;

                    let mut payload_header = linux_loader::bootparam::setup_header::default();
                    payload_file
                        .read_volatile(&mut payload_header.as_bytes())
                        .unwrap();

                    // 0x5372_6448 is the little-endian encoding of "HdrS".
                    if payload_header.header != 0x5372_6448 {
                        return Err(Error::InvalidPayloadType);
                    }

                    // Require header version >= 0x0200 and bit 0 of
                    // `loadflags` set.
                    if (payload_header.version < 0x0200)
                        || ((payload_header.loadflags & 0x1) == 0x0)
                    {
                        return Err(Error::InvalidPayloadType);
                    }

                    payload_file.rewind().map_err(Error::LoadPayload)?;
                    mem.read_volatile_from(
                        GuestAddress(section.address),
                        payload_file,
                        payload_size as usize,
                    )
                    .unwrap();

                    // Create the payload info that will be inserted into
                    // the HOB.
                    payload_info = Some(PayloadInfo {
                        image_type: PayloadImageType::BzImage,
                        entry_point: section.address,
                    });
                }
            }
            TdvfSectionType::PayloadParam => {
                info!("Copying payload parameters to guest memory");
                let cmdline = Self::generate_cmdline(
                    self.config.lock().unwrap().payload.as_ref().unwrap(),
                )?;
                mem.write_slice(
                    cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                    GuestAddress(section.address),
                )
                .unwrap();
            }
            _ => {}
        }
    }

    // Generate HOB
    // NOTE(review): panics when the firmware has no TdHob section.
    let mut hob = TdHob::start(hob_offset.unwrap());

    // Only the TempMem sections are interleaved with RAM; they are sorted by
    // descending address because `hob_memory_resources` pops from the back.
    let mut sorted_sections = sections.to_vec();
    sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

    sorted_sections.sort_by_key(|section| section.address);
    sorted_sections.reverse();

    for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
        hob.add_memory_resource(&mem, start, size, ram, guid_found)
            .map_err(Error::PopulateHob)?;
    }

    // MMIO regions
    hob.add_mmio_resource(
        &mem,
        arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        arch::layout::APIC_START.raw_value()
            - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
    )
    .map_err(Error::PopulateHob)?;
    let start_of_device_area = self
        .memory_manager
        .lock()
        .unwrap()
        .start_of_device_area()
        .raw_value();
    let end_of_device_area = self
        .memory_manager
        .lock()
        .unwrap()
        .end_of_device_area()
        .raw_value();
    hob.add_mmio_resource(
        &mem,
        start_of_device_area,
        end_of_device_area - start_of_device_area,
    )
    .map_err(Error::PopulateHob)?;

    // Loop over the ACPI tables and copy them to the HOB.

    for acpi_table in crate::acpi::create_acpi_tables_tdx(
        &self.device_manager,
        &self.cpu_manager,
        &self.memory_manager,
        &self.numa_nodes,
    ) {
        hob.add_acpi_table(&mem, acpi_table.as_slice())
            .map_err(Error::PopulateHob)?;
    }

    // If a payload info has been created, let's insert it into the HOB.
    if let Some(payload_info) = payload_info {
        hob.add_payload(&mem, payload_info)
            .map_err(Error::PopulateHob)?;
    }

    hob.finish(&mem).map_err(Error::PopulateHob)?;

    Ok(hob_offset)
}

/// Registers every TDVF section with the hypervisor through
/// `tdx_init_memory_region`.
///
/// Panics if a section address has no host mapping.
///
/// # Errors
///
/// `Error::InitializeTdxMemoryRegion` if the hypervisor rejects a region.
#[cfg(feature = "tdx")]
fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
    let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
    let mem = guest_memory.memory();

    for section in sections {
        self.vm
            .tdx_init_memory_region(
                mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
                section.address,
                section.size,
                /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
                section.attributes == 1,
            )
            .map_err(Error::InitializeTdxMemoryRegion)?;
    }

    Ok(())
}

// Creates ACPI tables
// In case of TDX being used, this is a no-op since the tables will be
// created and passed when populating the HOB.
1969 1970 fn create_acpi_tables(&self) -> Option<GuestAddress> { 1971 #[cfg(feature = "tdx")] 1972 if self.config.lock().unwrap().is_tdx_enabled() { 1973 return None; 1974 } 1975 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 1976 let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); 1977 let rsdp_addr = crate::acpi::create_acpi_tables( 1978 &mem, 1979 &self.device_manager, 1980 &self.cpu_manager, 1981 &self.memory_manager, 1982 &self.numa_nodes, 1983 tpm_enabled, 1984 ); 1985 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 1986 1987 Some(rsdp_addr) 1988 } 1989 1990 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 1991 trace_scoped!("entry_point"); 1992 1993 self.load_payload_handle 1994 .take() 1995 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 1996 .transpose() 1997 } 1998 1999 pub fn boot(&mut self) -> Result<()> { 2000 trace_scoped!("Vm::boot"); 2001 info!("Booting VM"); 2002 event!("vm", "booting"); 2003 let current_state = self.get_state()?; 2004 if current_state == VmState::Paused { 2005 return self.resume().map_err(Error::Resume); 2006 } 2007 2008 let new_state = if self.stop_on_boot { 2009 VmState::BreakPoint 2010 } else { 2011 VmState::Running 2012 }; 2013 current_state.valid_transition(new_state)?; 2014 2015 // Do earlier to parallelise with loading kernel 2016 #[cfg(target_arch = "x86_64")] 2017 let rsdp_addr = self.create_acpi_tables(); 2018 2019 // Load kernel synchronously or if asynchronous then wait for load to 2020 // finish. 
2021 let entry_point = self.entry_point()?; 2022 2023 #[cfg(feature = "tdx")] 2024 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2025 2026 // Configure the vcpus that have been created 2027 let vcpus = self.cpu_manager.lock().unwrap().vcpus(); 2028 for vcpu in vcpus { 2029 let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2030 let boot_setup = entry_point.map(|e| (e, guest_memory)); 2031 self.cpu_manager 2032 .lock() 2033 .unwrap() 2034 .configure_vcpu(vcpu, boot_setup) 2035 .map_err(Error::CpuManager)?; 2036 } 2037 2038 #[cfg(feature = "tdx")] 2039 let (sections, guid_found) = if tdx_enabled { 2040 self.extract_tdvf_sections()? 2041 } else { 2042 (Vec::new(), false) 2043 }; 2044 2045 // Configuring the TDX regions requires that the vCPUs are created. 2046 #[cfg(feature = "tdx")] 2047 let hob_address = if tdx_enabled { 2048 // TDX sections are written to memory. 2049 self.populate_tdx_sections(§ions, guid_found)? 2050 } else { 2051 None 2052 }; 2053 2054 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 2055 // available after they are configured 2056 #[cfg(target_arch = "aarch64")] 2057 let rsdp_addr = self.create_acpi_tables(); 2058 2059 // Configure shared state based on loaded kernel 2060 entry_point 2061 .map(|_| { 2062 // Safe to unwrap rsdp_addr as we know it can't be None when 2063 // the entry_point is Some. 2064 self.configure_system(rsdp_addr.unwrap()) 2065 }) 2066 .transpose()?; 2067 2068 #[cfg(target_arch = "x86_64")] 2069 // Note: For x86, always call this function before invoking start boot vcpus. 2070 // Otherwise guest would fail to boot because we haven't created the 2071 // userspace mappings to update the hypervisor about the memory mappings. 2072 // These mappings must be created before we start the vCPU threads for 2073 // the very first time. 
2074 self.memory_manager 2075 .lock() 2076 .unwrap() 2077 .allocate_address_space() 2078 .map_err(Error::MemoryManager)?; 2079 2080 #[cfg(feature = "tdx")] 2081 if let Some(hob_address) = hob_address { 2082 // With the HOB address extracted the vCPUs can have 2083 // their TDX state configured. 2084 self.cpu_manager 2085 .lock() 2086 .unwrap() 2087 .initialize_tdx(hob_address) 2088 .map_err(Error::CpuManager)?; 2089 // Let the hypervisor know which memory ranges are shared with the 2090 // guest. This prevents the guest from ignoring/discarding memory 2091 // regions provided by the host. 2092 self.init_tdx_memory(§ions)?; 2093 // With TDX memory and CPU state configured TDX setup is complete 2094 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2095 } 2096 2097 self.cpu_manager 2098 .lock() 2099 .unwrap() 2100 .start_boot_vcpus(new_state == VmState::BreakPoint) 2101 .map_err(Error::CpuManager)?; 2102 2103 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2104 *state = new_state; 2105 event!("vm", "booted"); 2106 Ok(()) 2107 } 2108 2109 pub fn restore(&mut self) -> Result<()> { 2110 event!("vm", "restoring"); 2111 2112 #[cfg(target_arch = "x86_64")] 2113 // Note: For x86, always call this function before invoking start boot vcpus. 2114 // Otherwise guest would fail to boot because we haven't created the 2115 // userspace mappings to update the hypervisor about the memory mappings. 2116 // These mappings must be created before we start the vCPU threads for 2117 // the very first time for the restored VM. 2118 self.memory_manager 2119 .lock() 2120 .unwrap() 2121 .allocate_address_space() 2122 .map_err(Error::MemoryManager)?; 2123 2124 // Now we can start all vCPUs from here. 2125 self.cpu_manager 2126 .lock() 2127 .unwrap() 2128 .start_restored_vcpus() 2129 .map_err(Error::CpuManager)?; 2130 2131 event!("vm", "restored"); 2132 Ok(()) 2133 } 2134 2135 /// Gets a thread-safe reference counted pointer to the VM configuration. 
2136 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2137 Arc::clone(&self.config) 2138 } 2139 2140 /// Get the VM state. Returns an error if the state is poisoned. 2141 pub fn get_state(&self) -> Result<VmState> { 2142 self.state 2143 .try_read() 2144 .map_err(|_| Error::PoisonedState) 2145 .map(|state| *state) 2146 } 2147 2148 /// Gets the actual size of the balloon. 2149 pub fn balloon_size(&self) -> u64 { 2150 self.device_manager.lock().unwrap().balloon_size() 2151 } 2152 2153 pub fn send_memory_fds( 2154 &mut self, 2155 socket: &mut UnixStream, 2156 ) -> std::result::Result<(), MigratableError> { 2157 for (slot, fd) in self 2158 .memory_manager 2159 .lock() 2160 .unwrap() 2161 .memory_slot_fds() 2162 .drain() 2163 { 2164 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2165 .write_to(socket) 2166 .map_err(|e| { 2167 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2168 })?; 2169 socket 2170 .send_with_fd(&slot.to_le_bytes()[..], fd) 2171 .map_err(|e| { 2172 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2173 })?; 2174 2175 let res = Response::read_from(socket)?; 2176 if res.status() != Status::Ok { 2177 warn!("Error during memory fd migration"); 2178 Request::abandon().write_to(socket)?; 2179 Response::read_from(socket).ok(); 2180 return Err(MigratableError::MigrateSend(anyhow!( 2181 "Error during memory fd migration" 2182 ))); 2183 } 2184 } 2185 2186 Ok(()) 2187 } 2188 2189 pub fn send_memory_regions<F>( 2190 &mut self, 2191 ranges: &MemoryRangeTable, 2192 fd: &mut F, 2193 ) -> std::result::Result<(), MigratableError> 2194 where 2195 F: WriteVolatile, 2196 { 2197 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2198 let mem = guest_memory.memory(); 2199 2200 for range in ranges.regions() { 2201 let mut offset: u64 = 0; 2202 // Here we are manually handling the retry in case we can't the 2203 // whole region at once because we can't use the implementation 2204 // 
from vm-memory::GuestMemory of write_all_to() as it is not 2205 // following the correct behavior. For more info about this issue 2206 // see: https://github.com/rust-vmm/vm-memory/issues/174 2207 loop { 2208 let bytes_written = mem 2209 .write_volatile_to( 2210 GuestAddress(range.gpa + offset), 2211 fd, 2212 (range.length - offset) as usize, 2213 ) 2214 .map_err(|e| { 2215 MigratableError::MigrateSend(anyhow!( 2216 "Error transferring memory to socket: {}", 2217 e 2218 )) 2219 })?; 2220 offset += bytes_written as u64; 2221 2222 if offset == range.length { 2223 break; 2224 } 2225 } 2226 } 2227 2228 Ok(()) 2229 } 2230 2231 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2232 self.memory_manager 2233 .lock() 2234 .unwrap() 2235 .memory_range_table(false) 2236 } 2237 2238 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2239 self.device_manager.lock().unwrap().device_tree() 2240 } 2241 2242 pub fn activate_virtio_devices(&self) -> Result<()> { 2243 self.device_manager 2244 .lock() 2245 .unwrap() 2246 .activate_virtio_devices() 2247 .map_err(Error::ActivateVirtioDevices) 2248 } 2249 2250 #[cfg(target_arch = "x86_64")] 2251 pub fn power_button(&self) -> Result<()> { 2252 return self 2253 .device_manager 2254 .lock() 2255 .unwrap() 2256 .notify_power_button() 2257 .map_err(Error::PowerButton); 2258 } 2259 2260 #[cfg(target_arch = "aarch64")] 2261 pub fn power_button(&self) -> Result<()> { 2262 self.device_manager 2263 .lock() 2264 .unwrap() 2265 .notify_power_button() 2266 .map_err(Error::PowerButton) 2267 } 2268 2269 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2270 self.memory_manager.lock().unwrap().snapshot_data() 2271 } 2272 2273 #[cfg(feature = "guest_debug")] 2274 pub fn debug_request( 2275 &mut self, 2276 gdb_request: &GdbRequestPayload, 2277 cpu_id: usize, 2278 ) -> Result<GdbResponsePayload> { 2279 use GdbRequestPayload::*; 2280 match gdb_request { 2281 SetSingleStep(single_step) => { 2282 
self.set_guest_debug(cpu_id, &[], *single_step)
                    .map_err(Error::Debug)?;
            }
            SetHwBreakPoint(addrs) => {
                self.set_guest_debug(cpu_id, addrs, false)
                    .map_err(Error::Debug)?;
            }
            Pause => {
                self.debug_pause().map_err(Error::Debug)?;
            }
            Resume => {
                self.debug_resume().map_err(Error::Debug)?;
            }
            ReadRegs => {
                let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
            }
            WriteRegs(regs) => {
                self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
            }
            ReadMem(vaddr, len) => {
                let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
                let mem = self
                    .read_mem(&guest_memory, cpu_id, *vaddr, *len)
                    .map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::MemoryRegion(mem));
            }
            WriteMem(vaddr, data) => {
                let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
                self.write_mem(&guest_memory, cpu_id, vaddr, data)
                    .map_err(Error::Debug)?;
            }
            ActiveVcpus => {
                let active_vcpus = self.active_vcpus();
                return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
            }
        }
        Ok(GdbResponsePayload::CommandComplete)
    }

    /// Builds the `DumpState` describing the coredump written to
    /// `destination_url`.
    ///
    /// The ELF program header count is one plus the number of guest RAM
    /// mappings. The destination file is created with `create_new`, so an
    /// already existing file is an error.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` if there are too many guest
    /// RAM mappings for a 16-bit program header count or if the destination
    /// file cannot be created, and propagates `url_to_file` failures.
    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn get_dump_state(
        &mut self,
        destination_url: &str,
    ) -> std::result::Result<DumpState, GuestDebuggableError> {
        let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
        let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
        let mut elf_phdr_num = 1;
        let elf_sh_info = 0;
        let coredump_file_path = url_to_file(destination_url)?;
        let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();

        // Report an unsupported layout to the caller instead of panicking:
        // a panic here would take the whole VMM (and the guest) down.
        if mapping_num >= UINT16_MAX - 2 {
            return Err(GuestDebuggableError::Coredump(anyhow!(
                "mapping num beyond 65535 not supported"
            )));
        }
        // Lossless: the guard above keeps `mapping_num` below `u16::MAX`.
        elf_phdr_num += mapping_num as u16;

        let coredump_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(coredump_file_path)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
        let mem_data = self
            .memory_manager
            .lock()
            .unwrap()
            .coredump_memory_regions(mem_offset);

        Ok(DumpState {
            elf_note_size,
            elf_phdr_num,
            elf_sh_info,
            mem_offset,
            mem_info: Some(mem_data),
            file: Some(coredump_file),
        })
    }

    /// Returns the sum of the ELF header size, the note size and the size of
    /// `phdr_num` program headers, used as the memory offset in the dump.
    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
        size_of::<elf::Elf64_Ehdr>() as u64
            + note_size as u64
            + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
    }
}

impl Pausable for Vm {
    /// Pauses the VM: validates the transition to `VmState::Paused`, saves the
    /// VM clock (KVM on x86_64 only), activates pending virtio devices, then
    /// pauses the CPU manager followed by the device manager.
    ///
    /// The state is only updated once every step succeeded.
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "pausing");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            // Keep the clock around so that `resume()` can hand it back to
            // the hypervisor.
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
            clock.reset_flags();
            self.saved_clock = Some(clock);
        }

        // Before pausing the vCPUs activate any pending virtio devices that might
        // need activation between starting the pause (or e.g. a migration it's part of)
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    /// Resumes the VM: validates the transition to `VmState::Running`, resumes
    /// the CPU manager, restores the saved clock if there is one (KVM on
    /// x86_64 only) and finally resumes the device manager.
    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }
        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
*state = new_state;
        event!("vm", "resumed");
        Ok(())
    }
}

/// VM-level data stored in a snapshot, serialized with serde.
///
/// Both fields exist only when building for KVM on x86_64.
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    /// Clock saved by `Pausable::pause()`, if any.
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub clock: Option<hypervisor::ClockData>,
    /// CPUID entries generated by `arch::generate_common_cpuid()` at snapshot
    /// time.
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}

/// Identifier of the VM snapshot.
pub const VM_SNAPSHOT_ID: &str = "vm";
impl Snapshottable for Vm {
    fn id(&self) -> String {
        VM_SNAPSHOT_ID.to_string()
    }

    /// Snapshots the VM together with its CPU, memory and device managers.
    ///
    /// # Errors
    ///
    /// Returns `MigratableError::Snapshot` if the VM uses TDX, if the VM state
    /// cannot be read, if the VM is not in `VmState::Paused`, if the common
    /// CPUID cannot be generated or if the VM data cannot be serialized.
    /// Errors from the managers' own `snapshot()` are propagated.
    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        event!("vm", "snapshotting");

        #[cfg(feature = "tdx")]
        {
            if self.config.lock().unwrap().is_tdx_enabled() {
                return Err(MigratableError::Snapshot(anyhow!(
                    "Snapshot not possible with TDX VM"
                )));
            }
        }

        // `get_state()` uses a non-blocking lock and can fail; report that to
        // the caller rather than panicking.
        let current_state = self
            .get_state()
            .map_err(|e| MigratableError::Snapshot(anyhow!("Could not get VM state: {:?}", e)))?;
        if current_state != VmState::Paused {
            return Err(MigratableError::Snapshot(anyhow!(
                "Trying to snapshot while VM is running"
            )));
        }

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let common_cpuid = {
            // Read everything needed from the configuration under a single
            // lock acquisition.
            let (amx, max_phys_bits, kvm_hyperv) = {
                let config = self.config.lock().unwrap();
                (
                    config.cpus.features.amx,
                    config.cpus.max_phys_bits,
                    config.cpus.kvm_hyperv,
                )
            };
            let phys_bits = physical_bits(&self.hypervisor, max_phys_bits);
            arch::generate_common_cpuid(
                &self.hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections: None,
                    phys_bits,
                    kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx: false,
                    amx,
                },
            )
            .map_err(|e| {
                MigratableError::Snapshot(anyhow!("Error generating common cpuid: {:?}", e))
            })?
        };

        let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            clock: self.saved_clock,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            common_cpuid,
        })
        .map_err(|e| MigratableError::Snapshot(e.into()))?;

        let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data));

        // Each manager is locked only for the duration of its own snapshot.
        let (id, snapshot) = {
            let mut cpu_manager = self.cpu_manager.lock().unwrap();
            (cpu_manager.id(), cpu_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);
        let (id, snapshot) = {
            let mut memory_manager = self.memory_manager.lock().unwrap();
            (memory_manager.id(), memory_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);
        let (id, snapshot) = {
            let mut device_manager = self.device_manager.lock().unwrap();
            (device_manager.id(), device_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);

        event!("vm", "snapshotted");
        Ok(vm_snapshot)
    }
}

impl Transportable for Vm {
    /// Writes `snapshot` below the directory designated by `destination_url`.
    ///
    /// Two files are created (and must not exist yet): the JSON-serialized VM
    /// configuration and the JSON-serialized snapshot state. The memory
    /// manager is then asked to send its own part of the snapshot.
    ///
    /// # Errors
    ///
    /// Returns `MigratableError::MigrateSend` if a file cannot be created or
    /// fully written, if serialization fails, or if `snapshot` carries no
    /// memory manager snapshot. Errors from the memory manager's `send()` are
    /// propagated.
    fn send(
        &self,
        snapshot: &Snapshot,
        destination_url: &str,
    ) -> std::result::Result<(), MigratableError> {
        let mut snapshot_config_path = url_to_path(destination_url)?;
        snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);

        // Create the snapshot config file
        let mut snapshot_config_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(snapshot_config_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Serialize and write the snapshot config
        let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // `write_all` rather than `write`: a short write must not silently
        // leave a truncated file behind.
        snapshot_config_file
            .write_all(vm_config.as_bytes())
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let mut snapshot_state_path = url_to_path(destination_url)?;
        snapshot_state_path.push(SNAPSHOT_STATE_FILE);

        // Create the snapshot state file
        let mut snapshot_state_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(snapshot_state_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Serialize and write the snapshot state
        let vm_state =
            serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;

        snapshot_state_file
            .write_all(&vm_state)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Tell the memory manager to also send/write its own snapshot.
        let memory_manager_snapshot = snapshot
            .snapshots
            .get(MEMORY_MANAGER_SNAPSHOT_ID)
            .ok_or_else(|| {
                MigratableError::MigrateSend(anyhow!("Missing memory manager snapshot"))
            })?;
        self.memory_manager
            .lock()
            .unwrap()
            .send(memory_manager_snapshot, destination_url)?;

        Ok(())
    }
}

// Every operation below is forwarded to the memory manager first and then to
// the device manager; the first failure is returned.
impl Migratable for Vm {
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_dirty_log()?;
        self.device_manager.lock().unwrap().start_dirty_log()
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().stop_dirty_log()?;
        self.device_manager.lock().unwrap().stop_dirty_log()
    }

    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        // Combine the tables reported by both managers into a single one.
        Ok(MemoryRangeTable::new_from_tables(vec![
            self.memory_manager.lock().unwrap().dirty_log()?,
            self.device_manager.lock().unwrap().dirty_log()?,
        ]))
    }

    fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_migration()?;
        self.device_manager.lock().unwrap().start_migration()
    }

    fn complete_migration(&mut self) ->
std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().complete_migration()?;
        self.device_manager.lock().unwrap().complete_migration()
    }
}

// Most methods simply forward to the CPU manager; only pause/resume touch the
// VM state.
#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM if it is running, then marks it as `VmState::BreakPoint`.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes the VM if it is currently in `VmState::BreakPoint`.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            // NOTE(review): a resume failure is reported through the `Pause`
            // variant; switch to a resume-specific variant if
            // `DebuggableError` provides one.
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(),
DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Returns the number of active vCPUs, or the number of boot vCPUs when
    /// none is active yet.
    fn active_vcpus(&self) -> usize {
        let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            self.cpu_manager.lock().unwrap().boot_vcpus() as usize
        }
    }
}

/// Largest value representable in 16 bits, as a `u32`.
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = 65535;

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes a coredump of the guest to `destination_url`.
    ///
    /// A running VM is paused for the duration of the dump and resumed
    /// afterwards, whether or not the dump succeeded. A paused VM is dumped
    /// as is and stays paused.
    ///
    /// # Errors
    ///
    /// Returns `GuestDebuggableError::Coredump` for TDX VMs, when the VM state
    /// cannot be read or when the VM is neither running nor paused;
    /// `GuestDebuggableError::Pause` / `Resume` when the VM cannot be paused
    /// or resumed; and propagates any failure while writing the dump. If both
    /// the dump and the resume fail, the dump error is returned.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        let mut resume = false;

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        // `get_state()` uses a non-blocking lock and can fail; report that to
        // the caller rather than panicking.
        let current_state = self.get_state().map_err(|e| {
            GuestDebuggableError::Coredump(anyhow!("Could not get VM state: {:?}", e))
        })?;
        match current_state {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                resume = true;
            }
            VmState::Paused => {}
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        }

        // Run every fallible dump step inside a closure so that an early
        // return cannot skip the resume below and leave the guest paused.
        let dump_result = (|| -> std::result::Result<(), GuestDebuggableError> {
            let coredump_state = self.get_dump_state(destination_url)?;

            self.write_header(&coredump_state)?;
            self.write_note(&coredump_state)?;
            self.write_loads(&coredump_state)?;

            self.cpu_manager
                .lock()
                .unwrap()
                .cpu_write_elf64_note(&coredump_state)?;
            self.cpu_manager
                .lock()
                .unwrap()
                .cpu_write_vmm_note(&coredump_state)?;

            self.memory_manager
                .lock()
                .unwrap()
                .coredump_iterate_save_mem(&coredump_state)?;

            Ok(())
        })();

        if resume {
            let resume_result = self.resume().map_err(GuestDebuggableError::Resume);
            // `and` yields the dump error if there is one, otherwise the
            // outcome of the resume.
            return dump_result.and(resume_result);
        }

        dump_result
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    // Asserts, for the given source state, which target states
    // `valid_transition()` accepts and which it rejects.
    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::BreakPoint => {
// Check the transitions from Breakpoint
                assert!(state.valid_transition(VmState::Created).is_ok());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
        }
    }

    // One #[test] per source state, so that a failure names the state.
    // NOTE(review): no test in view calls the helper with
    // `VmState::BreakPoint`, so that arm appears to be unexercised.
    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    // Checks `Vm::hob_memory_resources()` against hand-computed layouts.
    // Each expected entry is a (start, size, flag) triple; in every case the
    // flag is `true` for the RAM pieces and `false` for the TDVF sections.
    // The sections are deliberately passed in descending address order while
    // the expected output is in ascending order.
    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    // Length of each MMIO window used by the devices below; the windows are
    // laid out back to back starting at address 0.
    const LEN: u64 = 4096;

    // Checks that an FDT can be created for a guest with a serial device, a
    // virtio device and an RTC.
    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
.collect();

        // A fresh hypervisor VM is needed only to obtain a GIC to describe.
        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");
        // The test only checks that FDT creation succeeds; the generated
        // blob itself is not inspected.
        assert!(create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .is_ok())
    }
}

// Smoke test for the hypervisor crate: loads a tiny hand-assembled program
// into guest memory, runs it on a single vCPU and expects it to write to I/O
// port 0x3f8 before the run loop terminates.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    // A single 4 KiB region, mapped at the address the code is loaded to.
    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every region of the guest memory with the hypervisor, using
    // the region index as slot number.
    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Zero the code segment base and selector so that `rip` below is the
    // address the vCPU starts executing at.
    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    // Start at the loaded code with rax = 2 and rbx = 3: the program adds
    // %bl to %al, converts the sum to an ASCII digit and writes it out.
    let mut vcpu_regs = vcpu.get_regs().expect("get regs
failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            // NOTE(review): the program ends with `hlt` and "HLT" is printed
            // here, so the halt presumably surfaces as `VmExit::Reset` in the
            // hypervisor crate; confirm.
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}