// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

use std::collections::{BTreeMap, HashMap};
use std::fs::{File, OpenOptions};
use std::io::{self, Seek, SeekFrom, Write};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::mem::size_of;
use std::num::Wrapping;
use std::ops::Deref;
use std::os::unix::net::UnixStream;
use std::sync::{Arc, Mutex, RwLock};
#[cfg(not(target_arch = "riscv64"))]
use std::time::Instant;
use std::{cmp, result, str, thread};

use anyhow::anyhow;
#[cfg(target_arch = "x86_64")]
use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START};
#[cfg(feature = "tdx")]
use arch::x86_64::tdx::TdvfSection;
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
use arch::PciSpaceInfo;
use arch::{get_host_cpu_phys_bits, EntryPoint, NumaNode, NumaNodes};
#[cfg(target_arch = "aarch64")]
use devices::interrupt_controller;
use devices::AcpiNotificationFlags;
#[cfg(all(target_arch = "aarch64", feature = "guest_debug"))]
use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs;
#[cfg(target_arch = "aarch64")]
use hypervisor::arch::aarch64::regs::AARCH64_PMU_IRQ;
use hypervisor::{HypervisorVmError, VmOps};
use libc::{termios, SIGWINCH};
use linux_loader::cmdline::Cmdline;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use linux_loader::elf;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::bzimage::BzImage;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
#[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
use linux_loader::loader::KernelLoader;
use seccompiler::SeccompAction;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracer::trace_scoped;
use vm_device::Bus;
#[cfg(feature = "tdx")]
use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile};
use vm_memory::{
    Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile,
};
use vm_migration::protocol::{MemoryRangeTable, Request, Response};
use vm_migration::{
    snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::sock_ctrl_msg::ScmSocket;

use crate::config::{add_to_config, ValidationError};
use crate::console_devices::{ConsoleDeviceError, ConsoleInfo};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType,
};
use crate::device_manager::{DeviceManager, DeviceManagerError};
use crate::device_tree::DeviceTree;
#[cfg(feature = "guest_debug")]
use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload};
#[cfg(feature = "igvm")]
use crate::igvm::igvm_loader;
use crate::landlock::LandlockError;
use crate::memory_manager::{
    Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData,
};
#[cfg(target_arch = "x86_64")]
use crate::migration::get_vm_snapshot;
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::migration::url_to_file;
use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE};
use crate::vm_config::{
    DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, NumaConfig, PayloadConfig,
    PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig,
};
use crate::{
    cpu, GuestMemoryMmap, PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID,
    MEMORY_MANAGER_SNAPSHOT_ID,
};

/// Errors associated with VM management
#[derive(Debug, Error)]
pub enum Error {
    #[error("Cannot open kernel file: {0}")]
    KernelFile(#[source] io::Error),

    #[error("Cannot open initramfs file: {0}")]
    InitramfsFile(#[source] io::Error),

    #[error("Cannot load the kernel into memory: {0}")]
    KernelLoad(#[source] linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot load the UEFI binary in memory: {0:?}")]
    UefiLoad(arch::aarch64::uefi::Error),

    #[error("Cannot load the initramfs into memory")]
    InitramfsLoad,

    #[error("Cannot load the kernel command line in memory: {0}")]
    LoadCmdLine(#[source] linux_loader::loader::Error),

    #[error("Failed to apply landlock config during vm_create: {0}")]
    ApplyLandlock(#[source] LandlockError),

    #[error("Cannot modify the kernel command line: {0}")]
    CmdLineInsertStr(#[source] linux_loader::cmdline::Error),

    #[error("Cannot create the kernel command line: {0}")]
    CmdLineCreate(#[source] linux_loader::cmdline::Error),

    #[error("Cannot configure system: {0}")]
    ConfigureSystem(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot enable interrupt controller: {0:?}")]
    EnableInterruptController(interrupt_controller::Error),

    #[error("VM state is poisoned")]
    PoisonedState,

    #[error("Error from device manager: {0:?}")]
    DeviceManager(#[source] DeviceManagerError),

    #[error("Error initializing VM: {0:?}")]
    InitializeVm(#[source] hypervisor::HypervisorVmError),

    #[error("No device with id {0:?} to remove")]
    NoDeviceToRemove(String),

    #[error("Cannot spawn a signal handler thread: {0}")]
    SignalHandlerSpawn(#[source] io::Error),

    #[error("Failed to join on threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("VM config is missing")]
    VmMissingConfig,

    #[error("VM is not created")]
    VmNotCreated,

    #[error("VM is already created")]
    VmAlreadyCreated,

    #[error("VM is not running")]
    VmNotRunning,

    #[error("Cannot clone EventFd: {0}")]
    EventFdClone(#[source] io::Error),

    #[error("Invalid VM state transition: {0:?} to {1:?}")]
    InvalidStateTransition(VmState, VmState),

    #[error("Error from CPU manager: {0}")]
    CpuManager(#[source] cpu::Error),

    #[error("Cannot pause devices: {0}")]
    PauseDevices(#[source] MigratableError),

    #[error("Cannot resume devices: {0}")]
    ResumeDevices(#[source] MigratableError),

    #[error("Cannot pause CPUs: {0}")]
    PauseCpus(#[source] MigratableError),

    #[error("Cannot resume CPUs: {0}")]
    ResumeCpus(#[source] MigratableError),

    #[error("Cannot pause VM: {0}")]
    Pause(#[source] MigratableError),

    #[error("Cannot resume VM: {0}")]
    Resume(#[source] MigratableError),

    #[error("Memory manager error: {0:?}")]
    MemoryManager(#[source] MemoryManagerError),

    #[error("Eventfd write error: {0}")]
    EventfdError(#[source] std::io::Error),

    #[error("Cannot snapshot VM: {0}")]
    Snapshot(#[source] MigratableError),

    #[error("Cannot restore VM: {0}")]
    Restore(#[source] MigratableError),

    #[error("Cannot send VM snapshot: {0}")]
    SnapshotSend(#[source] MigratableError),

    #[error("Invalid restore source URL")]
    InvalidRestoreSourceUrl,

    #[error("Failed to validate config: {0}")]
    ConfigValidation(#[source] ValidationError),

    #[error("Too many virtio-vsock devices")]
    TooManyVsockDevices,

    #[error("Failed serializing into JSON: {0}")]
    SerializeJson(#[source] serde_json::Error),

    #[error("Invalid NUMA configuration")]
    InvalidNumaConfig,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Failed resizing a memory zone")]
    ResizeZone,

    #[error("Cannot activate virtio devices: {0:?}")]
    ActivateVirtioDevices(#[source] DeviceManagerError),

    #[error("Error triggering power button: {0:?}")]
    PowerButton(#[source] DeviceManagerError),

    #[error("Kernel lacks PVH header")]
    KernelMissingPvhHeader,

    #[error("Failed to allocate firmware RAM: {0:?}")]
    AllocateFirmwareMemory(#[source] MemoryManagerError),

    #[error("Error manipulating firmware file: {0}")]
    FirmwareFile(#[source] std::io::Error),

    #[error("Firmware too big")]
    FirmwareTooLarge,

    #[error("Failed to copy firmware to memory: {0}")]
    FirmwareLoad(#[source] vm_memory::GuestMemoryError),

    #[cfg(feature = "sev_snp")]
    #[error("Error enabling SEV-SNP VM: {0}")]
    InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on TDX firmware file: {0}")]
    LoadTdvf(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on the TDX payload file: {0}")]
    LoadPayload(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error parsing TDVF: {0}")]
    ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error populating TDX HOB: {0}")]
    PopulateHob(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error allocating TDVF memory: {0:?}")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX VM: {0}")]
    InitializeTdxVm(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX memory region: {0}")]
    InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error finalizing TDX VM: {0}")]
    FinalizeTdx(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("TDX firmware missing")]
    TdxFirmwareMissing,

    #[cfg(feature = "tdx")]
    #[error("Invalid TDX payload type")]
    InvalidPayloadType,

    #[cfg(feature = "guest_debug")]
    #[error("Error debugging VM: {0:?}")]
    Debug(DebuggableError),

    #[error("Error spawning kernel loading thread")]
    KernelLoadThreadSpawn(#[source] std::io::Error),

    #[error("Error joining kernel loading thread")]
    KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Payload configuration is not bootable")]
    InvalidPayload,

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    #[error("Error coredumping VM: {0:?}")]
    Coredump(#[source] GuestDebuggableError),

    #[cfg(feature = "igvm")]
    #[error("Cannot open igvm file: {0}")]
    IgvmFile(#[source] io::Error),

    #[cfg(feature = "igvm")]
    #[error("Cannot load the igvm into memory: {0}")]
    IgvmLoad(#[source] igvm_loader::Error),

    #[error("Error injecting NMI")]
    ErrorNmi,

    #[error("Error resuming the VM: {0}")]
    ResumeVm(#[source] hypervisor::HypervisorVmError),

    #[error("Error creating console devices")]
    CreateConsoleDevices(#[source] ConsoleDeviceError),

    #[error("Error locking disk images: Another instance likely holds a lock")]
    LockingError(#[source] DeviceManagerError),
}

pub type Result<T> = result::Result<T, Error>;

#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
    BreakPoint,
}

impl VmState {
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
                VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
                    Ok(())
                }
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },

            VmState::BreakPoint => match new_state {
                VmState::Created | VmState::Running => Ok(()),
                _ => Err(Error::InvalidStateTransition(self, new_state)),
            },
        }
    }
}

struct VmOpsHandler {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
}

impl VmOps for VmOpsHandler {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }
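
    // A successful bus write may hand back a barrier (Ok(Some(barrier))).
    // Waiting on it below keeps the vCPU from resuming until the device side
    // has released it.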
    fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            info!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}

pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
    let host_phys_bits = get_host_cpu_phys_bits(hypervisor);

    cmp::min(host_phys_bits, max_phys_bits)
}

pub struct Vm {
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(target_arch = "x86_64")]
    saved_clock: Option<hypervisor::ClockData>,
    #[cfg(not(target_arch = "riscv64"))]
    numa_nodes: NumaNodes,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    #[cfg(not(target_arch = "riscv64"))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    stop_on_boot: bool,
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}

impl Vm {
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];

    #[allow(clippy::too_many_arguments)]
    pub fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        #[cfg(not(target_arch = "riscv64"))] timestamp: Instant,
        console_info: Option<ConsoleInfo>,
        console_resize_pipe: Option<Arc<File>>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new_from_memory_manager");

        let boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
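        // Both the CPU manager and the device manager constructed below take
        // this NUMA map, so it has to be assembled first.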
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled();
        #[cfg(feature = "tdx")]
        let force_iommu = tdx_enabled;
        #[cfg(feature = "sev_snp")]
        let force_iommu = sev_snp_enabled;
        #[cfg(not(any(feature = "tdx", feature = "sev_snp")))]
        let force_iommu = false;

        #[cfg(feature = "guest_debug")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "guest_debug"))]
        let stop_on_boot = false;

        let memory = memory_manager.lock().unwrap().guest_memory();
        let io_bus = Arc::new(Bus::new());
        let mmio_bus = Arc::new(Bus::new());

        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus: io_bus.clone(),
            mmio_bus: mmio_bus.clone(),
        });

        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            vm.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt.try_clone().map_err(Error::EventFdClone)?,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            &hypervisor,
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
        )
        .map_err(Error::CpuManager)?;

        #[cfg(target_arch = "x86_64")]
        cpu_manager
            .lock()
            .unwrap()
            .populate_cpuid(
                &memory_manager,
                &hypervisor,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CpuManager)?;

        // The initial TDX configuration must be done before the vCPUs are
        // created.
        #[cfg(feature = "tdx")]
        if tdx_enabled {
            let cpuid = cpu_manager.lock().unwrap().common_cpuid();
            let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
            vm.tdx_init(&cpuid, max_vcpus)
                .map_err(Error::InitializeTdxVm)?;
        }

        // A TDX guest is static, so dynamic behavior in the device manager is
        // only enabled when TDX is disabled.
        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        #[cfg(feature = "kvm")]
        let is_kvm = matches!(
            hypervisor.hypervisor_type(),
            hypervisor::HypervisorType::Kvm
        );
        #[cfg(feature = "mshv")]
        let is_mshv = matches!(
            hypervisor.hypervisor_type(),
            hypervisor::HypervisorType::Mshv
        );

        let device_manager = DeviceManager::new(
            io_bus,
            mmio_bus,
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            cpu_manager.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            boot_id_list,
            #[cfg(not(target_arch = "riscv64"))]
            timestamp,
            snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
            dynamic,
        )
        .map_err(Error::DeviceManager)?;

        // For MSHV, the interrupt controller must be created before the VM is
        // initialized, because the GICD base address has to be set beforehand.
        #[cfg(feature = "mshv")]
        {
            if is_mshv {
                let ic = device_manager
                    .lock()
                    .unwrap()
                    .create_interrupt_controller()
                    .map_err(Error::DeviceManager)?;

                vm.init().map_err(Error::InitializeVm)?;

                device_manager
                    .lock()
                    .unwrap()
                    .create_devices(
                        console_info.clone(),
                        console_resize_pipe.clone(),
                        original_termios.clone(),
                        ic,
                    )
                    .map_err(Error::DeviceManager)?;
            }
        }

        memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        #[cfg(target_arch = "aarch64")]
        memory_manager
            .lock()
            .unwrap()
            .add_uefi_flash()
            .map_err(Error::MemoryManager)?;

        // Loading the IGVM file is pushed down here because the IGVM parser
        // needs the cpu_manager to retrieve CPUID leaves. Microsoft Hypervisor
        // currently does not provide a hypervisor-specific common CPUID, so
        // get_cpuid_values has to be called per leaf through the cpu_manager.
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(
                &memory_manager,
                &config,
                #[cfg(feature = "igvm")]
                &cpu_manager,
                #[cfg(feature = "sev_snp")]
                sev_snp_enabled,
            )?
        } else {
            None
        };

        cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
            .map_err(Error::CpuManager)?;

        // For KVM, the interrupt controller has to be created after the boot
        // vCPUs, because GIC state is restored from the snapshot as part of
        // boot vCPU creation. For KVM guests the controller is therefore
        // created after the restore.
        #[cfg(feature = "kvm")]
        {
            if is_kvm {
                let ic = device_manager
                    .lock()
                    .unwrap()
                    .create_interrupt_controller()
                    .map_err(Error::DeviceManager)?;

                vm.init().map_err(Error::InitializeVm)?;

                device_manager
                    .lock()
                    .unwrap()
                    .create_devices(console_info, console_resize_pipe, original_termios, ic)
                    .map_err(Error::DeviceManager)?;
            }
        }

        // This initial SEV-SNP configuration must be done immediately after
        // the vCPUs are created; as part of it the guest is transitioned into
        // the secure state.
        #[cfg(feature = "sev_snp")]
        if sev_snp_enabled {
            vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?;
        }

        #[cfg(feature = "tdx")]
        let kernel = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.kernel.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.initramfs.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::InitramfsFile)?;

        #[cfg(target_arch = "x86_64")]
        let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
            let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
            vm_snapshot.clock
        } else {
            None
        };

        let vm_state = if snapshot.is_some() {
            VmState::Paused
        } else {
            VmState::Created
        };

        Ok(Vm {
            #[cfg(feature = "tdx")]
            kernel,
            initramfs,
            device_manager,
            config,
            threads: Vec::with_capacity(1),
            state: RwLock::new(vm_state),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(target_arch = "x86_64")]
            saved_clock,
            #[cfg(not(target_arch = "riscv64"))]
            numa_nodes,
            #[cfg(not(target_arch = "riscv64"))]
            hypervisor,
            stop_on_boot,
            load_payload_handle,
        })
    }

    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Cannot define the same NUMA node twice");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(pci_segments) = &config.pci_segments {
                    node.pci_segments.extend(pci_segments);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has already been set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

    #[allow(clippy::too_many_arguments)]
    pub fn new(
        vm_config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        console_info: Option<ConsoleInfo>,
        console_resize_pipe: Option<Arc<File>>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
        source_url: Option<&str>,
        prefault: Option<bool>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new");

        #[cfg(not(target_arch = "riscv64"))]
        let timestamp = Instant::now();

        #[cfg(feature = "tdx")]
        let tdx_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_tdx_enabled()
        };

        #[cfg(feature = "sev_snp")]
        let sev_snp_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_sev_snp_enabled()
        };

        let vm = Self::create_hypervisor_vm(
            &hypervisor,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            #[cfg(feature = "sev_snp")]
            sev_snp_enabled,
            #[cfg(feature = "sev_snp")]
            vm_config.lock().unwrap().memory.total_size(),
        )?;

        let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);

        let memory_manager = if let Some(snapshot) =
            snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
        {
            MemoryManager::new_from_snapshot(
                &snapshot,
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                source_url,
                prefault.unwrap(),
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            #[cfg(target_arch = "x86_64")]
            let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();

            MemoryManager::new(
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                None,
                phys_bits,
                #[cfg(feature = "tdx")]
                tdx_enabled,
                None,
                None,
                #[cfg(target_arch = "x86_64")]
                sgx_epc_config,
            )
            .map_err(Error::MemoryManager)?
        };

        Vm::new_from_memory_manager(
            vm_config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            #[cfg(not(target_arch = "riscv64"))]
            timestamp,
            console_info,
            console_resize_pipe,
            original_termios,
            snapshot,
        )
    }

    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
        #[cfg(feature = "sev_snp")] mem_size: u64,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        hypervisor.check_required_extensions().unwrap();
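
        // Which constructor is used depends on the compiled-in
        // confidential-computing support: TDX and SEV-SNP require a typed VM,
        // everything else goes through the plain create_vm() path.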
        cfg_if::cfg_if! {
            if #[cfg(feature = "tdx")] {
                // tdx_enabled maps to the VM type: KVM_X86_TDX_VM (1) when
                // true, KVM_X86_LEGACY_VM (0) otherwise.
                let vm = hypervisor
                    .create_vm_with_type(u64::from(tdx_enabled))
                    .unwrap();
            } else if #[cfg(feature = "sev_snp")] {
                // sev_snp_enabled maps to the VM type: SEV_SNP_ENABLED (1)
                // when true, SEV_SNP_DISABLED (0) otherwise.
                let vm = hypervisor
                    .create_vm_with_type_and_memory(u64::from(sev_snp_enabled), mem_size)
                    .unwrap();
            } else {
                let vm = hypervisor.create_vm().unwrap();
            }
        }

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }

    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let initramfs = self.initramfs.as_mut().unwrap();
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_volatile_from(address, initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    pub fn generate_cmdline(
        payload: &PayloadConfig,
        #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] device_manager: &Arc<
            Mutex<DeviceManager>,
        >,
    ) -> Result<Cmdline> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
        if let Some(s) = payload.cmdline.as_ref() {
            cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
        }

        #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
        for entry in device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        Ok(cmdline)
    }

    #[cfg(target_arch = "aarch64")]
    fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
        let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
        let mem = uefi_flash.memory();
        arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
            .map_err(Error::UefiLoad)?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn load_kernel(
        firmware: Option<File>,
        kernel: Option<File>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let entry_addr = match (firmware, kernel) {
            (None, Some(mut kernel)) => {
                match linux_loader::loader::pe::PE::load(
                    mem.deref(),
                    Some(arch::layout::KERNEL_START),
                    &mut kernel,
                    None,
                ) {
                    Ok(entry_addr) => entry_addr.kernel_load,
                    // Try to load the binary as a kernel PE file first. If
                    // that fails, retry it as a UEFI binary. Since the UEFI
                    // binary is formatless, it must be the last option tried.
                    Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                        Self::load_firmware(&kernel, memory_manager)?;
                        arch::layout::UEFI_START
                    }
                    Err(e) => {
                        return Err(Error::KernelLoad(e));
                    }
                }
            }
            (Some(firmware), None) => {
                Self::load_firmware(&firmware, memory_manager)?;
                arch::layout::UEFI_START
            }
            _ => return Err(Error::InvalidPayload),
        };

        Ok(EntryPoint { entry_addr })
    }

    #[cfg(target_arch = "riscv64")]
    fn load_kernel(
        firmware: Option<File>,
        kernel: Option<File>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        // Align the kernel load address up to a 2 MiB boundary.
        let alignment = 0x20_0000;
        let aligned_kernel_addr = arch::layout::KERNEL_START.0 + (alignment - 1) & !(alignment - 1);
        let entry_addr = match (firmware, kernel) {
            (None, Some(mut kernel)) => {
                match linux_loader::loader::pe::PE::load(
                    mem.deref(),
                    Some(GuestAddress(aligned_kernel_addr)),
                    &mut kernel,
                    None,
                ) {
                    Ok(entry_addr) => entry_addr.kernel_load,
                    // Try to load the binary as a kernel PE file first. If
                    // that fails, retry it as a UEFI binary. Since the UEFI
                    // binary is formatless, it must be the last option tried.
                    Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                        // TODO: UEFI for riscv64 is scheduled for a later stage.
                        unimplemented!()
                    }
                    Err(e) => {
                        return Err(Error::KernelLoad(e));
                    }
                }
            }
            (Some(_firmware), None) => {
                // TODO: UEFI for riscv64 is scheduled for a later stage.
                unimplemented!()
            }
            _ => return Err(Error::InvalidPayload),
        };

        Ok(EntryPoint { entry_addr })
    }

    #[cfg(feature = "igvm")]
    fn load_igvm(
        igvm: File,
        memory_manager: Arc<Mutex<MemoryManager>>,
        cpu_manager: Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] host_data: &Option<String>,
    ) -> Result<EntryPoint> {
        let res = igvm_loader::load_igvm(
            &igvm,
            memory_manager,
            cpu_manager.clone(),
            "",
            #[cfg(feature = "sev_snp")]
            host_data,
        )
        .map_err(Error::IgvmLoad)?;

        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() {
                    EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None }
                } else {
                    EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }
                };
            } else {
                let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None };
            }
        };
        Ok(entry_point)
    }

    #[cfg(target_arch = "x86_64")]
    fn load_kernel(
        mut kernel: File,
        cmdline: Option<Cmdline>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        info!("Loading kernel");

        let mem = {
            let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
            guest_memory.memory()
        };

        // Try ELF binary with PVH boot.
        let entry_addr = linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        )
        // Try loading the kernel as a bzImage.
        .or_else(|_| {
            BzImage::load(
                mem.deref(),
                None,
                &mut kernel,
                Some(arch::layout::HIGH_RAM_START),
            )
        })
        .map_err(Error::KernelLoad)?;

        if let Some(cmdline) = cmdline {
            linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
                .map_err(Error::LoadCmdLine)?;
        }

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest.
            info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint {
                entry_addr,
                setup_header: None,
            })
        } else if entry_addr.setup_header.is_some() {
            // Use the bzImage 32-bit entry point to boot the guest.
            info!(
                "bzImage kernel loaded: entry_addr = 0x{:x}",
                entry_addr.kernel_load.0
            );
            Ok(EntryPoint {
                entry_addr: entry_addr.kernel_load,
                setup_header: entry_addr.setup_header,
            })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

    #[cfg(target_arch = "x86_64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
        #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<EntryPoint> {
        trace_scoped!("load_payload");
        #[cfg(feature = "igvm")]
        {
            if let Some(_igvm_file) = &payload.igvm {
                let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?;
                #[cfg(feature = "sev_snp")]
                if sev_snp_enabled {
                    return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data);
                }
                #[cfg(not(feature = "sev_snp"))]
                return Self::load_igvm(igvm, memory_manager, cpu_manager);
            }
        }
        match (
            &payload.firmware,
            &payload.kernel,
            &payload.initramfs,
            &payload.cmdline,
        ) {
            (Some(firmware), None, None, None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(firmware, None, memory_manager)
            }
            (None, Some(kernel), _, _) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                let cmdline = Self::generate_cmdline(payload)?;
                Self::load_kernel(kernel, Some(cmdline), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        match (&payload.firmware, &payload.kernel) {
            (Some(firmware), None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(Some(firmware), None, memory_manager)
            }
            (None, Some(kernel)) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                Self::load_kernel(None, Some(kernel), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }
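
    // Kernel/firmware loading can be slow, so it runs on a dedicated
    // "payload_loader" thread and overlaps with the rest of the VM
    // construction; the JoinHandle is joined later, once the entry point is
    // actually needed.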
    fn load_payload_async(
        memory_manager: &Arc<Mutex<MemoryManager>>,
        config: &Arc<Mutex<VmConfig>>,
        #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>,
        #[cfg(feature = "sev_snp")] sev_snp_enabled: bool,
    ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
        // A kernel is loaded differently with TDX: it goes through the TDVF
        // payload path instead of this loader thread.
        #[cfg(feature = "tdx")]
        if config.lock().unwrap().is_tdx_enabled() {
            return Ok(None);
        }

        config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|payload| {
                let memory_manager = memory_manager.clone();
                let payload = payload.clone();
                #[cfg(feature = "igvm")]
                let cpu_manager = cpu_manager.clone();

                std::thread::Builder::new()
                    .name("payload_loader".into())
                    .spawn(move || {
                        Self::load_payload(
                            &payload,
                            memory_manager,
                            #[cfg(feature = "igvm")]
                            cpu_manager,
                            #[cfg(feature = "sev_snp")]
                            sev_snp_enabled,
                        )
                    })
                    .map_err(Error::KernelLoadThreadSpawn)
            })
            .transpose()
    }

    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> {
        trace_scoped!("configure_system");
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
        let rsdp_addr = Some(rsdp_addr);
        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        let serial_number = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.serial_number.clone());

        let uuid = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.uuid.clone());

        let oem_strings = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.oem_strings.clone());

        let oem_strings = oem_strings
            .as_deref()
            .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());

        let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            arch::layout::CMDLINE_MAX_SIZE,
            &initramfs_config,
            boot_vcpus,
            entry_addr.setup_header,
            rsdp_addr,
            sgx_epc_region,
            serial_number.as_deref(),
            uuid.as_deref(),
            oem_strings.as_deref(),
            topology,
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn configure_system(
        &mut self,
        _rsdp_addr: GuestAddress,
        _entry_addr: EntryPoint,
    ) -> Result<()> {
        let cmdline = Self::generate_cmdline(
            self.config.lock().unwrap().payload.as_ref().unwrap(),
            &self.device_manager,
        )?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
            let pci_space = PciSpaceInfo {
                pci_segment_id: pci_segment.id,
                mmio_config_address: pci_segment.mmio_config_address,
                pci_device_space_start: pci_segment.start_of_mem64_area,
                // The size of an inclusive range is end - start + 1.
                pci_device_space_size: pci_segment.end_of_mem64_area
                    - pci_segment.start_of_mem64_area
                    + 1,
            };
            pci_space_info.push(pci_space);
        }
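
        // BDF of the virtio-iommu device, if one was created; it is passed to
        // the platform configuration below so the guest can associate devices
        // with the IOMMU.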
        let virtio_iommu_bdf = self
            .device_manager
            .lock()
            .unwrap()
            .iommu_attached_devices()
            .as_ref()
            .map(|(v, _)| *v);

        let vgic = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .get_vgic()
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::SetupGic,
                ))
            })?;

        // The PMU interrupt is a PPI, so 16 has to be added to it to obtain
        // the real IRQ number.
        let pmu_supported = self
            .cpu_manager
            .lock()
            .unwrap()
            .init_pmu(AARCH64_PMU_IRQ + 16)
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::VcpuInitPmu,
                ))
            })?;

        arch::configure_system(
            &mem,
            cmdline.as_cstring().unwrap().to_str().unwrap(),
            vcpu_mpidrs,
            vcpu_topology,
            device_info,
            &initramfs_config,
            &pci_space_info,
            virtio_iommu_bdf.map(|bdf| bdf.into()),
            &vgic,
            &self.numa_nodes,
            pmu_supported,
        )
        .map_err(Error::ConfigureSystem)?;

        Ok(())
    }

    #[cfg(target_arch = "riscv64")]
    fn configure_system(&mut self) -> Result<()> {
        let cmdline = Self::generate_cmdline(
            self.config.lock().unwrap().payload.as_ref().unwrap(),
            &self.device_manager,
        )?;
        let num_vcpu = self.cpu_manager.lock().unwrap().vcpus().len();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
            let pci_space = PciSpaceInfo {
                pci_segment_id: pci_segment.id,
                mmio_config_address: pci_segment.mmio_config_address,
                pci_device_space_start: pci_segment.start_of_mem64_area,
                pci_device_space_size: pci_segment.end_of_mem64_area
                    - pci_segment.start_of_mem64_area
                    + 1,
            };
            pci_space_info.push(pci_space);
        }

        // TODO: IOMMU for riscv64 is not yet supported in the kernel.

        let vaia = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .get_vaia()
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::riscv64::Error::SetupAia,
                ))
            })?;

        // TODO: PMU support for riscv64 is scheduled for a later stage.

        arch::configure_system(
            &mem,
            cmdline.as_cstring().unwrap().to_str().unwrap(),
            num_vcpu as u32,
            device_info,
            &initramfs_config,
            &pci_space_info,
            &vaia,
        )
        .map_err(Error::ConfigureSystem)?;

        Ok(())
    }

    pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
        self.device_manager.lock().unwrap().console_resize_pipe()
    }

    pub fn shutdown(&mut self) -> Result<()> {
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        let new_state = VmState::Shutdown;

        state.valid_transition(new_state)?;

        // Wake up the DeviceManager threads so they get terminated cleanly.
        self.device_manager
            .lock()
            .unwrap()
            .resume()
            .map_err(Error::Resume)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .shutdown()
            .map_err(Error::CpuManager)?;

        // Wait for all the threads to finish.
        for thread in self.threads.drain(..) {
            thread.join().map_err(Error::ThreadCleanup)?
        }
        *state = new_state;

        Ok(())
    }

    pub fn resize(
        &mut self,
        desired_vcpus: Option<u8>,
        desired_memory: Option<u64>,
        desired_balloon: Option<u64>,
    ) -> Result<()> {
        event!("vm", "resizing");

        if let Some(desired_vcpus) = desired_vcpus {
            if self
                .cpu_manager
                .lock()
                .unwrap()
                .resize(desired_vcpus)
                .map_err(Error::CpuManager)?
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
        }

        if let Some(desired_memory) = desired_memory {
            let new_region = self
                .memory_manager
                .lock()
                .unwrap()
                .resize(desired_memory)
                .map_err(Error::MemoryManager)?;

            let memory_config = &mut self.config.lock().unwrap().memory;

            if let Some(new_region) = &new_region {
                self.device_manager
                    .lock()
                    .unwrap()
                    .update_memory(new_region)
                    .map_err(Error::DeviceManager)?;

                match memory_config.hotplug_method {
                    HotplugMethod::Acpi => {
                        self.device_manager
                            .lock()
                            .unwrap()
                            .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                            .map_err(Error::DeviceManager)?;
                    }
                    HotplugMethod::VirtioMem => {}
                }
            }

            // We update the VM config regardless of whether the guest resize
            // operation actually happened, so that if the VM reboots it will
            // be running with the last configured memory size.
            match memory_config.hotplug_method {
                HotplugMethod::Acpi => memory_config.size = desired_memory,
                HotplugMethod::VirtioMem => {
                    if desired_memory > memory_config.size {
                        memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                    } else {
                        memory_config.hotplugged_size = None;
                    }
                }
            }
        }

        if let Some(desired_balloon) = desired_balloon {
            self.device_manager
                .lock()
                .unwrap()
                .resize_balloon(desired_balloon)
                .map_err(Error::DeviceManager)?;

            // Update the configuration value for the balloon size to ensure
            // a reboot would use the right value.
            if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
                balloon_config.size = desired_balloon;
            }
        }

        event!("vm", "resized");

        Ok(())
    }

    pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
        let memory_config = &mut self.config.lock().unwrap().memory;

        if let Some(zones) = &mut memory_config.zones {
            for zone in zones.iter_mut() {
                if zone.id == id {
                    if desired_memory >= zone.size {
                        let hotplugged_size = desired_memory - zone.size;
                        self.memory_manager
                            .lock()
                            .unwrap()
                            .resize_zone(&id, desired_memory - zone.size)
                            .map_err(Error::MemoryManager)?;
                        // We update the memory zone config regardless of
                        // whether the 'resize-zone' operation actually
                        // happened, so that if the VM reboots it will be
                        // running with the last configured memory zone size.
                        zone.hotplugged_size = Some(hotplugged_size);

                        return Ok(());
                    } else {
                        error!(
                            "Cannot request less memory ({}) than boot RAM ({}) for \
                             this memory zone",
                            desired_memory, zone.size,
                        );
                        return Err(Error::ResizeZone);
                    }
                }
            }
        }

        error!("Could not find the memory zone {} for the resize", id);
        Err(Error::ResizeZone)
    }

    pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_device(&mut device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_user_device(&mut device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.user_devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn remove_device(&mut self, id: String) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .remove_device(id.clone())
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by removing the device. This is important to
        // ensure the device is not re-created in case of a reboot.
        self.config.lock().unwrap().remove_device(&id);

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;
        Ok(())
    }

    pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_disk(&mut disk_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.disks, disk_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_fs(&mut fs_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.fs, fs_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_pmem(&mut pmem_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.pmem, pmem_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_net(&mut net_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.net, net_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vdpa(&mut vdpa_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.vdpa, vdpa_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vsock(&mut vsock_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device is re-created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            config.vsock = Some(vsock_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
        Ok(self.device_manager.lock().unwrap().counters())
    }

    #[cfg(feature = "tdx")]
    fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
        use arch::x86_64::tdx::*;

        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        // The TDVF file contains a table of sections as well as code.
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // For all the sections, allocate some RAM backing them.
        parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
    }

    #[cfg(feature = "tdx")]
    fn hob_memory_resources(
        mut sorted_sections: Vec<TdvfSection>,
        guest_memory: &GuestMemoryMmap,
    ) -> Vec<(u64, u64, bool)> {
        let mut list = Vec::new();

        let mut current_section = sorted_sections.pop();

        // RAM regions interleaved with TDVF sections
        let mut next_start_addr = 0;
        for region in guest_memory.iter() {
            let region_start = region.start_addr().0;
            let region_end = region.last_addr().0;
            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            loop {
                let (start, size, ram) = if let Some(section) = &current_section {
                    if section.address <= next_start_addr {
                        (section.address, section.size, false)
                    } else {
                        let last_addr = std::cmp::min(section.address - 1, region_end);
                        (next_start_addr, last_addr - next_start_addr + 1, true)
                    }
                } else {
                    (next_start_addr, region_end - next_start_addr + 1, true)
                };

                list.push((start, size, ram));

                if !ram {
                    current_section = sorted_sections.pop();
                }

                next_start_addr = start + size;

                if region_start > next_start_addr {
                    next_start_addr = region_start;
                }

                if next_start_addr > region_end {
                    break;
                }
            }
        }

        // Once all the interleaved sections have been processed, simply pull
        // in the remaining ones.
        if let Some(section) = current_section {
            list.push((section.address, section.size, false));
        }
        while let Some(section) = sorted_sections.pop() {
            list.push((section.address, section.size, false));
        }

        list
    }

    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(
        &mut self,
        sections: &[TdvfSection],
        guid_found: bool,
    ) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of sections as well as code
        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut payload_info = None;
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    mem.read_volatile_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    if let Some(payload_file) = self.kernel.as_mut() {
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        // The setup_header of the Linux boot protocol starts
                        // at offset 0x1f1 within the image.
                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_file
                            .read_volatile(&mut payload_header.as_bytes())
                            .unwrap();

                        // 0x5372_6448 is the "HdrS" magic of the Linux boot protocol.
                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        // The payload must use boot protocol >= 2.00 and have
                        // the LOADED_HIGH flag set, i.e. be a bzImage.
                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        payload_file.rewind().map_err(Error::LoadPayload)?;
                        mem.read_volatile_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
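                        // (The entry point recorded below is the guest address
                        // the payload was just copied to.)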
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    info!("Copying payload parameters to guest memory");
                    let cmdline = Self::generate_cmdline(
                        self.config.lock().unwrap().payload.as_ref().unwrap(),
                    )?;
                    mem.write_slice(
                        cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                        GuestAddress(section.address),
                    )
                    .unwrap();
                }
                _ => {}
            }
        }

        // Generate HOB
        let mut hob = TdHob::start(hob_offset.unwrap());

        // Only TempMem sections are reported to the guest as memory resources.
        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();

        for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
            hob.add_memory_resource(&mem, start, size, ram, guid_found)
                .map_err(Error::PopulateHob)?;
        }

        // MMIO regions
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.
        for acpi_table in crate::acpi::create_acpi_tables_tdx(
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        ) {
            hob.add_acpi_table(&mem, acpi_table.as_slice())
                .map_err(Error::PopulateHob)?;
        }

        // If a payload info has been created, let's insert it into the HOB.
        if let Some(payload_info) = payload_info {
            hob.add_payload(&mem, payload_info)
                .map_err(Error::PopulateHob)?;
        }

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }

    #[cfg(feature = "tdx")]
    fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();

        for section in sections {
            self.vm
                .tdx_init_memory_region(
                    mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
                    section.address,
                    section.size,
                    /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
                    section.attributes == 1,
                )
                .map_err(Error::InitializeTdxMemoryRegion)?;
        }

        Ok(())
    }

    // Creates the ACPI tables.
    // When TDX is enabled this is a no-op, since the tables are built and
    // passed to the guest while populating the HOB.
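    // Returns the guest-physical address of the RSDP, which boot() later
    // passes to configure_system().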
    #[cfg(not(target_arch = "riscv64"))]
    fn create_acpi_tables(&self) -> Option<GuestAddress> {
        #[cfg(feature = "tdx")]
        if self.config.lock().unwrap().is_tdx_enabled() {
            return None;
        }
        let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
        let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
        let rsdp_addr = crate::acpi::create_acpi_tables(
            &mem,
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
            tpm_enabled,
        );
        info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);

        Some(rsdp_addr)
    }

    fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
        trace_scoped!("entry_point");

        self.load_payload_handle
            .take()
            .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
            .transpose()
    }

    pub fn boot(&mut self) -> Result<()> {
        trace_scoped!("Vm::boot");
        let current_state = self.get_state()?;
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        // We acquire all advisory disk image locks here and not on device
        // creation to enable live-migration without locking issues.
        self.device_manager
            .lock()
            .unwrap()
            .try_lock_disks()
            .map_err(Error::LockingError)?;

        let new_state = if self.stop_on_boot {
            VmState::BreakPoint
        } else {
            VmState::Running
        };
        current_state.valid_transition(new_state)?;

        // Do this early to parallelise with loading the kernel
        #[cfg(target_arch = "x86_64")]
        cfg_if::cfg_if! {
            if #[cfg(feature = "sev_snp")] {
                let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled();
                let rsdp_addr = if sev_snp_enabled {
                    // For a SEV-SNP guest the ACPI tables are provided via
                    // IGVM, so skip creating them and set the RSDP address
                    // to None.
                    None
                } else {
                    self.create_acpi_tables()
                };
            } else {
                let rsdp_addr = self.create_acpi_tables();
            }
        }

        // Load the kernel synchronously, or if it was loaded asynchronously,
        // wait for the load to finish.
        let entry_point = self.entry_point()?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        #[cfg(target_arch = "aarch64")]
        let vgic = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .get_vgic()
            .unwrap();

        #[cfg(target_arch = "aarch64")]
        let redist_addr = vgic.lock().unwrap().device_properties();

        // Configure the vcpus that have been created
        let vcpus = self.cpu_manager.lock().unwrap().vcpus();
        for vcpu in vcpus {
            let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
            let boot_setup = entry_point.map(|e| (e, guest_memory));
            self.cpu_manager
                .lock()
                .unwrap()
                .configure_vcpu(vcpu.clone(), boot_setup)
                .map_err(Error::CpuManager)?;

            #[cfg(target_arch = "aarch64")]
            vcpu.lock()
                .unwrap()
                .set_gic_redistributor_addr(redist_addr[2], redist_addr[3])
                .map_err(Error::CpuManager)?;
        }

        #[cfg(feature = "tdx")]
        let (sections, guid_found) = if tdx_enabled {
            self.extract_tdvf_sections()?
        } else {
            (Vec::new(), false)
        };

        // Configuring the TDX regions requires that the vCPUs are created.
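        // populate_tdx_sections() copies the TDVF sections into guest memory
        // and builds the HOB; the returned address, if any, is handed to the
        // vCPUs further below via initialize_tdx().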
        #[cfg(feature = "tdx")]
        let hob_address = if tdx_enabled {
            // TDX sections are written to memory.
            self.populate_tdx_sections(&sections, guid_found)?
        } else {
            None
        };

        // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
        // available after they are configured
        #[cfg(target_arch = "aarch64")]
        let rsdp_addr = self.create_acpi_tables();

        #[cfg(not(target_arch = "riscv64"))]
        // Configure shared state based on loaded kernel
        entry_point
            .map(|entry_point| {
                // Safe to unwrap rsdp_addr as we know it can't be None when
                // the entry_point is Some.
                self.configure_system(rsdp_addr.unwrap(), entry_point)
            })
            .transpose()?;

        #[cfg(target_arch = "riscv64")]
        self.configure_system().unwrap();

        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            // Let the hypervisor know which memory ranges are shared with the
            // guest. This prevents the guest from ignoring/discarding memory
            // regions provided by the host.
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured, TDX setup is complete
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        // Resume the VM for MSHV
        if current_state == VmState::Created {
            self.vm.resume().map_err(Error::ResumeVm)?;
        }

        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus(new_state == VmState::BreakPoint)
            .map_err(Error::CpuManager)?;

        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        Ok(())
    }

    pub fn restore(&mut self) -> Result<()> {
        event!("vm", "restoring");

        // We acquire all advisory disk image locks again.
        self.device_manager
            .lock()
            .unwrap()
            .try_lock_disks()
            .map_err(Error::LockingError)?;

        // Now we can start all vCPUs from here.
        self.cpu_manager
            .lock()
            .unwrap()
            .start_restored_vcpus()
            .map_err(Error::CpuManager)?;

        event!("vm", "restored");
        Ok(())
    }

    /// Gets a thread-safe reference counted pointer to the VM configuration.
    pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
        Arc::clone(&self.config)
    }

    /// Get the VM state. Returns an error if the state is poisoned.
    pub fn get_state(&self) -> Result<VmState> {
        self.state
            .try_read()
            .map_err(|_| Error::PoisonedState)
            .map(|state| *state)
    }

    /// Gets the actual size of the balloon.
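    ///
    /// The value is reported in bytes by the virtio-balloon device.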
    pub fn balloon_size(&self) -> u64 {
        self.device_manager.lock().unwrap().balloon_size()
    }

    pub fn send_memory_fds(
        &mut self,
        socket: &mut UnixStream,
    ) -> std::result::Result<(), MigratableError> {
        for (slot, fd) in self
            .memory_manager
            .lock()
            .unwrap()
            .memory_slot_fds()
            .drain()
        {
            Request::memory_fd(std::mem::size_of_val(&slot) as u64)
                .write_to(socket)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
                })?;
            socket
                .send_with_fd(&slot.to_le_bytes()[..], fd)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
                })?;

            Response::read_from(socket)?.ok_or_abandon(
                socket,
                MigratableError::MigrateSend(anyhow!("Error during memory fd migration")),
            )?;
        }

        Ok(())
    }

    pub fn send_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: WriteVolatile,
    {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            let mut offset: u64 = 0;
            // Here we handle the retry manually, in case we can't write the
            // whole region at once. We can't rely on the write_all_to()
            // implementation from vm_memory::GuestMemory as it doesn't follow
            // the correct behavior. For more info about this issue see:
            // https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = mem
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        fd,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| {
                        MigratableError::MigrateSend(anyhow!(
                            "Error transferring memory to socket: {}",
                            e
                        ))
                    })?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }

    pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        self.memory_manager
            .lock()
            .unwrap()
            .memory_range_table(false)
    }

    pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
        self.device_manager.lock().unwrap().device_tree()
    }

    /// Release all advisory locks held for the disk images.
    ///
    /// This should only be called when the VM is stopped and the VMM is
    /// supposed to shut down. A new VMM, either after a live migration or a
    /// state save/resume cycle, should then acquire all locks before the VM
    /// starts to run.
    pub fn release_disk_locks(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .release_disk_locks()
            .map_err(Error::LockingError)?;
        Ok(())
    }

    pub fn activate_virtio_devices(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .activate_virtio_devices()
            .map_err(Error::ActivateVirtioDevices)
    }

    #[cfg(target_arch = "x86_64")]
    pub fn power_button(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .notify_power_button()
            .map_err(Error::PowerButton)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn power_button(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .notify_power_button()
            .map_err(Error::PowerButton)
    }

    #[cfg(target_arch = "riscv64")]
    pub fn power_button(&self) -> Result<()> {
        unimplemented!()
    }

    pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
        self.memory_manager.lock().unwrap().snapshot_data()
    }

    #[cfg(feature = "guest_debug")]
    pub fn debug_request(
        &mut self,
        gdb_request: &GdbRequestPayload,
        cpu_id: usize,
    ) -> Result<GdbResponsePayload> {
        use GdbRequestPayload::*;
        match gdb_request {
            SetSingleStep(single_step) => {
                self.set_guest_debug(cpu_id, &[], *single_step)
                    .map_err(Error::Debug)?;
            }
            SetHwBreakPoint(addrs) => {
                self.set_guest_debug(cpu_id, addrs, false)
                    .map_err(Error::Debug)?;
            }
            Pause => {
                self.debug_pause().map_err(Error::Debug)?;
            }
            Resume => {
                self.debug_resume().map_err(Error::Debug)?;
            }
            ReadRegs => {
                let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
            }
            WriteRegs(regs) => {
                self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
            }
            ReadMem(vaddr, len) => {
                let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
                let mem = self
                    .read_mem(&guest_memory, cpu_id, *vaddr, *len)
                    .map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::MemoryRegion(mem));
            }
            WriteMem(vaddr, data) => {
                let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
                self.write_mem(&guest_memory, cpu_id, vaddr, data)
                    .map_err(Error::Debug)?;
            }
            ActiveVcpus => {
                let active_vcpus = self.active_vcpus();
                return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
            }
        }
        Ok(GdbResponsePayload::CommandComplete)
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn get_dump_state(
        &mut self,
        destination_url: &str,
    ) -> std::result::Result<DumpState, GuestDebuggableError> {
        let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
        let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
        let mut elf_phdr_num = 1;
        let elf_sh_info = 0;
        let coredump_file_path = url_to_file(destination_url)?;
        let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();

        if mapping_num < UINT16_MAX - 2 {
            elf_phdr_num += mapping_num as u16;
        } else {
            panic!("mapping num beyond 65535 not supported");
        }
        let coredump_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(coredump_file_path)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
        let mem_data = self
            .memory_manager
            .lock()
            .unwrap()
            .coredump_memory_regions(mem_offset);

        Ok(DumpState {
            elf_note_size,
            elf_phdr_num,
            elf_sh_info,
            mem_offset,
            mem_info: Some(mem_data),
            file: Some(coredump_file),
        })
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
        size_of::<elf::Elf64_Ehdr>() as u64
            + note_size as u64
            + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
    }

    pub fn nmi(&self) -> Result<()> {
        self.cpu_manager
            .lock()
            .unwrap()
            .nmi()
            .map_err(|_| Error::ErrorNmi)
    }
}

impl Pausable for Vm {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "pausing");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        #[cfg(target_arch = "x86_64")]
        {
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
            clock.reset_flags();
            self.saved_clock = Some(clock);
        }

        // Before pausing the vCPUs, activate any virtio devices that are
        // still pending activation, e.g. devices that became ready between
        // the start of the pause (or of the migration it is part of) and
        // this point.
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        self.vm
            .pause()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?;

        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        let current_state = self.get_state().unwrap();
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;
        #[cfg(target_arch = "x86_64")]
        {
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }

        if current_state == VmState::Paused {
            self.vm
                .resume()
                .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?;
        }

        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
        *state = new_state;
        event!("vm", "resumed");
        Ok(())
    }
}

#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    #[cfg(target_arch = "x86_64")]
    pub clock: Option<hypervisor::ClockData>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}

pub const VM_SNAPSHOT_ID: &str = "vm";

impl Snapshottable for Vm {
    fn id(&self) -> String {
        VM_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        event!("vm", "snapshotting");

        #[cfg(feature = "tdx")]
        {
            if self.config.lock().unwrap().is_tdx_enabled() {
                return Err(MigratableError::Snapshot(anyhow!(
                    "Snapshot not possible with TDX VM"
                )));
            }
        }

        let current_state = self.get_state().unwrap();
        if current_state != VmState::Paused {
            return Err(MigratableError::Snapshot(anyhow!(
                "Trying to snapshot while VM is running"
            )));
        }

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let common_cpuid = {
            let amx = self.config.lock().unwrap().cpus.features.amx;
            let phys_bits = physical_bits(
                &self.hypervisor,
                self.config.lock().unwrap().cpus.max_phys_bits,
            );
            arch::generate_common_cpuid(
                &self.hypervisor,
                &arch::CpuidConfig {
                    sgx_epc_sections: None,
                    phys_bits,
                    kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv,
                    #[cfg(feature = "tdx")]
                    tdx: false,
                    amx,
                },
            )
            .map_err(|e| {
                MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
            })?
        };

        let vm_snapshot_state = VmSnapshot {
            #[cfg(target_arch = "x86_64")]
            clock: self.saved_clock,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            common_cpuid,
        };

        let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?;

        let (id, snapshot) = {
            let mut cpu_manager = self.cpu_manager.lock().unwrap();
            (cpu_manager.id(), cpu_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);
        let (id, snapshot) = {
            let mut memory_manager = self.memory_manager.lock().unwrap();
            (memory_manager.id(), memory_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);
        let (id, snapshot) = {
            let mut device_manager = self.device_manager.lock().unwrap();
            (device_manager.id(), device_manager.snapshot()?)
        };
        vm_snapshot.add_snapshot(id, snapshot);

        event!("vm", "snapshotted");
        Ok(vm_snapshot)
    }
}

impl Transportable for Vm {
    fn send(
        &self,
        snapshot: &Snapshot,
        destination_url: &str,
    ) -> std::result::Result<(), MigratableError> {
        let mut snapshot_config_path = url_to_path(destination_url)?;
        snapshot_config_path.push(SNAPSHOT_CONFIG_FILE);

        // Create the snapshot config file
        let mut snapshot_config_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(snapshot_config_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Serialize and write the snapshot config
        let vm_config = serde_json::to_string(self.config.lock().unwrap().deref())
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        snapshot_config_file
            .write_all(vm_config.as_bytes())
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let mut snapshot_state_path = url_to_path(destination_url)?;
        snapshot_state_path.push(SNAPSHOT_STATE_FILE);

        // Create the snapshot state file
        let mut snapshot_state_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(snapshot_state_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Serialize and write the snapshot state
        let vm_state =
            serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;

        snapshot_state_file
            .write_all(&vm_state)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Tell the memory manager to also send/write its own snapshot.
        if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
            self.memory_manager
                .lock()
                .unwrap()
                .send(&memory_manager_snapshot.clone(), destination_url)?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            )));
        }

        Ok(())
    }
}

impl Migratable for Vm {
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_dirty_log()?;
        self.device_manager.lock().unwrap().start_dirty_log()
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().stop_dirty_log()?;
        self.device_manager.lock().unwrap().stop_dirty_log()
    }

    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        Ok(MemoryRangeTable::new_from_tables(vec![
            self.memory_manager.lock().unwrap().dirty_log()?,
            self.device_manager.lock().unwrap().dirty_log()?,
        ]))
    }

    fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_migration()?;
        self.device_manager.lock().unwrap().start_migration()
    }

    fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().complete_migration()?;
        self.device_manager.lock().unwrap().complete_migration()
    }
}

#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    fn active_vcpus(&self) -> usize {
        let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            self.cpu_manager.lock().unwrap().boot_vcpus() as usize
        }
    }
}

#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = 65535;

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        let mut resume = false;

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        match self.get_state().unwrap() {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                resume = true;
            }
            VmState::Paused => {}
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        }

        let coredump_state = self.get_dump_state(destination_url)?;

        self.write_header(&coredump_state)?;
        self.write_note(&coredump_state)?;
        self.write_loads(&coredump_state)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_elf64_note(&coredump_state)?;
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_vmm_note(&coredump_state)?;

        self.memory_manager
            .lock()
            .unwrap()
            .coredump_iterate_save_mem(&coredump_state)?;

        if resume {
            self.resume().map_err(GuestDebuggableError::Resume)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                state.valid_transition(VmState::Created).unwrap_err();
                state.valid_transition(VmState::Running).unwrap();
                state.valid_transition(VmState::Shutdown).unwrap();
                state.valid_transition(VmState::Paused).unwrap();
                state.valid_transition(VmState::BreakPoint).unwrap();
            }
            VmState::Running => {
                // Check the transitions from Running
                state.valid_transition(VmState::Created).unwrap_err();
                state.valid_transition(VmState::Running).unwrap_err();
                state.valid_transition(VmState::Shutdown).unwrap();
                state.valid_transition(VmState::Paused).unwrap();
                state.valid_transition(VmState::BreakPoint).unwrap();
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                state.valid_transition(VmState::Created).unwrap_err();
                state.valid_transition(VmState::Running).unwrap();
                state.valid_transition(VmState::Shutdown).unwrap_err();
                state.valid_transition(VmState::Paused).unwrap_err();
                state.valid_transition(VmState::BreakPoint).unwrap_err();
            }
            VmState::Paused => {
                // Check the transitions from Paused
                state.valid_transition(VmState::Created).unwrap_err();
                state.valid_transition(VmState::Running).unwrap();
                state.valid_transition(VmState::Shutdown).unwrap();
                state.valid_transition(VmState::Paused).unwrap_err();
                state.valid_transition(VmState::BreakPoint).unwrap_err();
            }
            VmState::BreakPoint => {
                // Check the transitions from Breakpoint
                state.valid_transition(VmState::Created).unwrap();
                state.valid_transition(VmState::Running).unwrap();
                state.valid_transition(VmState::Shutdown).unwrap_err();
                state.valid_transition(VmState::Paused).unwrap_err();
                state.valid_transition(VmState::BreakPoint).unwrap_err();
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    use super::*;

    const LEN: u64 = 4096;

    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");
        create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .unwrap();
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example is based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.set_rip(0x1000);
    vcpu_regs.set_rax(2);
    vcpu_regs.set_rbx(3);
    vcpu_regs.set_rflags(2);
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    loop {
        match vcpu.run().expect("run failed") {
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            VmExit::Ignore => {}
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}
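
// A minimal sketch (not part of the upstream code): check that `VmSnapshot`
// round-trips through serde_json, which `Snapshot::new_from_state` relies on
// when the VM state is serialized during `Transportable::send`. The empty
// field values below are illustrative assumptions only.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
fn test_vm_snapshot_serde_round_trip() {
    let snapshot = VmSnapshot {
        clock: None,
        common_cpuid: Vec::new(),
    };

    // Serialize, then deserialize again, as is done for the snapshot state.
    let serialized = serde_json::to_string(&snapshot).expect("serialization failed");
    let restored: VmSnapshot =
        serde_json::from_str(&serialized).expect("deserialization failed");

    assert!(restored.clock.is_none());
    assert!(restored.common_cpuid.is_empty());
}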