1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use std::collections::{BTreeMap, HashMap}; 15 use std::fs::{File, OpenOptions}; 16 use std::io::{self, Seek, SeekFrom, Write}; 17 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 18 use std::mem::size_of; 19 use std::num::Wrapping; 20 use std::ops::Deref; 21 use std::os::unix::net::UnixStream; 22 use std::sync::{Arc, Mutex, RwLock}; 23 #[cfg(not(target_arch = "riscv64"))] 24 use std::time::Instant; 25 use std::{cmp, result, str, thread}; 26 27 use anyhow::anyhow; 28 #[cfg(target_arch = "x86_64")] 29 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 30 #[cfg(feature = "tdx")] 31 use arch::x86_64::tdx::TdvfSection; 32 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] 33 use arch::PciSpaceInfo; 34 use arch::{get_host_cpu_phys_bits, EntryPoint, NumaNode, NumaNodes}; 35 #[cfg(target_arch = "aarch64")] 36 use devices::interrupt_controller; 37 use devices::AcpiNotificationFlags; 38 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 39 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 40 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 41 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 42 use hypervisor::{HypervisorVmError, VmOps}; 43 use libc::{termios, SIGWINCH}; 44 use linux_loader::cmdline::Cmdline; 45 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 46 use linux_loader::elf; 47 #[cfg(target_arch = "x86_64")] 48 use linux_loader::loader::bzimage::BzImage; 49 #[cfg(target_arch = "x86_64")] 50 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 51 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] 52 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 53 use linux_loader::loader::KernelLoader; 54 use seccompiler::SeccompAction; 55 use serde::{Deserialize, Serialize}; 56 use thiserror::Error; 57 use tracer::trace_scoped; 58 use vm_device::Bus; 59 #[cfg(feature = "tdx")] 60 use vm_memory::{Address, ByteValued, GuestMemoryRegion, ReadVolatile}; 61 use vm_memory::{ 62 Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, WriteVolatile, 63 }; 64 use vm_migration::protocol::{MemoryRangeTable, Request, Response}; 65 use vm_migration::{ 66 snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, 67 }; 68 use vmm_sys_util::eventfd::EventFd; 69 use vmm_sys_util::sock_ctrl_msg::ScmSocket; 70 71 use crate::config::{add_to_config, ValidationError}; 72 use crate::console_devices::{ConsoleDeviceError, ConsoleInfo}; 73 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 74 use crate::coredump::{ 75 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 76 }; 77 use crate::device_manager::{DeviceManager, DeviceManagerError}; 78 use crate::device_tree::DeviceTree; 79 #[cfg(feature = "guest_debug")] 80 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 81 #[cfg(feature = "igvm")] 82 use crate::igvm::igvm_loader; 83 use crate::landlock::LandlockError; 84 use crate::memory_manager::{ 85 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 86 }; 87 #[cfg(target_arch = "x86_64")] 88 use crate::migration::get_vm_snapshot; 89 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 90 use crate::migration::url_to_file; 91 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 92 use crate::vm_config::{ 93 DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, NumaConfig, PayloadConfig, 94 PmemConfig, UserDeviceConfig, VdpaConfig, VmConfig, VsockConfig, 95 }; 96 use crate::{ 97 cpu, GuestMemoryMmap, PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, 98 MEMORY_MANAGER_SNAPSHOT_ID, 99 }; 100 101 /// Errors associated with VM management 102 #[derive(Debug, Error)] 103 pub enum Error { 104 #[error("Cannot open kernel file: {0}")] 105 KernelFile(#[source] io::Error), 106 107 #[error("Cannot open initramfs file: {0}")] 108 InitramfsFile(#[source] io::Error), 109 110 #[error("Cannot load the kernel into memory: {0}")] 111 KernelLoad(#[source] linux_loader::loader::Error), 112 113 #[cfg(target_arch = "aarch64")] 114 #[error("Cannot load the UEFI binary in memory: {0:?}")] 115 UefiLoad(arch::aarch64::uefi::Error), 116 117 #[error("Cannot load the initramfs into memory")] 118 InitramfsLoad, 119 120 #[error("Cannot load the kernel command line in memory: {0}")] 121 LoadCmdLine(#[source] linux_loader::loader::Error), 122 123 #[error("Failed to apply landlock config during vm_create: {0}")] 124 ApplyLandlock(#[source] LandlockError), 125 126 #[error("Cannot modify the kernel command line: {0}")] 127 CmdLineInsertStr(#[source] linux_loader::cmdline::Error), 128 129 #[error("Cannot create the kernel command line: {0}")] 130 CmdLineCreate(#[source] linux_loader::cmdline::Error), 131 132 #[error("Cannot configure system: {0}")] 133 ConfigureSystem(#[source] arch::Error), 134 135 #[cfg(target_arch = "aarch64")] 136 #[error("Cannot enable interrupt controller: {0:?}")] 137 EnableInterruptController(interrupt_controller::Error), 138 139 #[error("VM state is poisoned")] 140 PoisonedState, 141 142 #[error("Error from device manager: {0:?}")] 143 DeviceManager(DeviceManagerError), 144 145 #[error("No device with id {0:?} to remove")] 146 NoDeviceToRemove(String), 147 148 #[error("Cannot spawn a signal handler thread: {0}")] 149 SignalHandlerSpawn(#[source] io::Error), 150 151 #[error("Failed to join on threads: {0:?}")] 152 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 153 154 #[error("VM config is missing")] 155 VmMissingConfig, 156 157 #[error("VM is not created")] 158 VmNotCreated, 159 160 #[error("VM is already created")] 161 VmAlreadyCreated, 162 163 #[error("VM is not running")] 164 VmNotRunning, 165 166 #[error("Cannot clone EventFd: {0}")] 167 EventFdClone(#[source] io::Error), 168 169 #[error("invalid VM state transition: {0:?} to {1:?}")] 170 InvalidStateTransition(VmState, VmState), 171 172 #[error("Error from CPU manager: {0}")] 173 CpuManager(#[source] cpu::Error), 174 175 #[error("Cannot pause devices: {0}")] 176 PauseDevices(#[source] MigratableError), 177 178 #[error("Cannot resume devices: {0}")] 179 ResumeDevices(#[source] MigratableError), 180 181 #[error("Cannot pause CPUs: {0}")] 182 PauseCpus(#[source] MigratableError), 183 184 #[error("Cannot resume cpus: {0}")] 185 ResumeCpus(#[source] MigratableError), 186 187 #[error("Cannot pause VM: {0}")] 188 Pause(#[source] MigratableError), 189 190 #[error("Cannot resume VM: {0}")] 191 Resume(#[source] MigratableError), 192 193 #[error("Memory manager error: {0:?}")] 194 MemoryManager(MemoryManagerError), 195 196 #[error("Eventfd write error: {0}")] 197 EventfdError(#[source] std::io::Error), 198 199 #[error("Cannot snapshot VM: {0}")] 200 Snapshot(#[source] MigratableError), 201 202 #[error("Cannot restore VM: {0}")] 203 Restore(#[source] MigratableError), 204 205 #[error("Cannot send VM snapshot: {0}")] 206 SnapshotSend(#[source] MigratableError), 207 208 #[error("Invalid restore source URL")] 209 InvalidRestoreSourceUrl, 210 211 #[error("Failed to validate config: {0}")] 212 ConfigValidation(#[source] ValidationError), 213 214 #[error("Too many virtio-vsock devices")] 215 TooManyVsockDevices, 216 217 #[error("Failed serializing into JSON: {0}")] 218 SerializeJson(#[source] serde_json::Error), 219 220 #[error("Invalid NUMA configuration")] 221 InvalidNumaConfig, 222 223 #[error("Cannot create seccomp filter: {0}")] 224 CreateSeccompFilter(#[source] seccompiler::Error), 225 226 #[error("Cannot apply seccomp filter: {0}")] 227 ApplySeccompFilter(#[source] seccompiler::Error), 228 229 #[error("Failed resizing a memory zone")] 230 ResizeZone, 231 232 #[error("Cannot activate virtio devices: {0:?}")] 233 ActivateVirtioDevices(DeviceManagerError), 234 235 #[error("Error triggering power button: {0:?}")] 236 PowerButton(DeviceManagerError), 237 238 #[error("Kernel lacks PVH header")] 239 KernelMissingPvhHeader, 240 241 #[error("Failed to allocate firmware RAM: {0:?}")] 242 AllocateFirmwareMemory(MemoryManagerError), 243 244 #[error("Error manipulating firmware file: {0}")] 245 FirmwareFile(#[source] std::io::Error), 246 247 #[error("Firmware too big")] 248 FirmwareTooLarge, 249 250 #[error("Failed to copy firmware to memory: {0}")] 251 FirmwareLoad(#[source] vm_memory::GuestMemoryError), 252 253 #[cfg(feature = "sev_snp")] 254 #[error("Error enabling SEV-SNP VM: {0}")] 255 InitializeSevSnpVm(#[source] hypervisor::HypervisorVmError), 256 257 #[cfg(feature = "tdx")] 258 #[error("Error performing I/O on TDX firmware file: {0}")] 259 LoadTdvf(#[source] std::io::Error), 260 261 #[cfg(feature = "tdx")] 262 #[error("Error performing I/O on the TDX payload file: {0}")] 263 LoadPayload(#[source] std::io::Error), 264 265 #[cfg(feature = "tdx")] 266 #[error("Error parsing TDVF: {0}")] 267 ParseTdvf(#[source] arch::x86_64::tdx::TdvfError), 268 269 #[cfg(feature = "tdx")] 270 #[error("Error populating TDX HOB: {0}")] 271 PopulateHob(#[source] arch::x86_64::tdx::TdvfError), 272 273 #[cfg(feature = "tdx")] 274 #[error("Error allocating TDVF memory: {0:?}")] 275 AllocatingTdvfMemory(crate::memory_manager::Error), 276 277 #[cfg(feature = "tdx")] 278 #[error("Error enabling TDX VM: {0}")] 279 InitializeTdxVm(#[source] hypervisor::HypervisorVmError), 280 281 #[cfg(feature = "tdx")] 282 #[error("Error enabling TDX memory region: {0}")] 283 InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError), 284 285 #[cfg(feature = "tdx")] 286 #[error("Error finalizing TDX VM: {0}")] 287 FinalizeTdx(#[source] hypervisor::HypervisorVmError), 288 289 #[cfg(feature = "tdx")] 290 #[error("TDX firmware missing")] 291 TdxFirmwareMissing, 292 293 #[cfg(feature = "tdx")] 294 #[error("Invalid TDX payload type")] 295 InvalidPayloadType, 296 297 #[cfg(feature = "guest_debug")] 298 #[error("Error debugging VM: {0:?}")] 299 Debug(DebuggableError), 300 301 #[error("Error spawning kernel loading thread")] 302 KernelLoadThreadSpawn(std::io::Error), 303 304 #[error("Error joining kernel loading thread")] 305 KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 306 307 #[error("Payload configuration is not bootable")] 308 InvalidPayload, 309 310 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 311 #[error("Error coredumping VM: {0:?}")] 312 Coredump(GuestDebuggableError), 313 314 #[cfg(feature = "igvm")] 315 #[error("Cannot open igvm file: {0}")] 316 IgvmFile(#[source] io::Error), 317 318 #[cfg(feature = "igvm")] 319 #[error("Cannot load the igvm into memory: {0}")] 320 IgvmLoad(#[source] igvm_loader::Error), 321 322 #[error("Error injecting NMI")] 323 ErrorNmi, 324 325 #[error("Error resuming the VM: {0}")] 326 ResumeVm(#[source] hypervisor::HypervisorVmError), 327 328 #[error("Error creating console devices")] 329 CreateConsoleDevices(ConsoleDeviceError), 330 } 331 pub type Result<T> = result::Result<T, Error>; 332 333 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] 334 pub enum VmState { 335 Created, 336 Running, 337 Shutdown, 338 Paused, 339 BreakPoint, 340 } 341 342 impl VmState { 343 fn valid_transition(self, new_state: VmState) -> Result<()> { 344 match self { 345 VmState::Created => match new_state { 346 VmState::Created => Err(Error::InvalidStateTransition(self, new_state)), 347 VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => { 348 Ok(()) 349 } 350 }, 351 352 VmState::Running => match new_state { 353 VmState::Created | VmState::Running => { 354 Err(Error::InvalidStateTransition(self, new_state)) 355 } 356 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()), 357 }, 358 359 VmState::Shutdown => match new_state { 360 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => { 361 Err(Error::InvalidStateTransition(self, new_state)) 362 } 363 VmState::Running => Ok(()), 364 }, 365 366 VmState::Paused => match new_state { 367 VmState::Created | VmState::Paused | VmState::BreakPoint => { 368 Err(Error::InvalidStateTransition(self, new_state)) 369 } 370 VmState::Running | VmState::Shutdown => Ok(()), 371 }, 372 VmState::BreakPoint => match new_state { 373 VmState::Created | VmState::Running => Ok(()), 374 _ => Err(Error::InvalidStateTransition(self, new_state)), 375 }, 376 } 377 } 378 } 379 380 struct VmOpsHandler { 381 memory: GuestMemoryAtomic<GuestMemoryMmap>, 382 #[cfg(target_arch = "x86_64")] 383 io_bus: Arc<Bus>, 384 mmio_bus: Arc<Bus>, 385 } 386 387 impl VmOps for VmOpsHandler { 388 fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> { 389 self.memory 390 .memory() 391 .write(buf, GuestAddress(gpa)) 392 .map_err(|e| HypervisorVmError::GuestMemWrite(e.into())) 393 } 394 395 fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> { 396 self.memory 397 .memory() 398 .read(buf, GuestAddress(gpa)) 399 .map_err(|e| HypervisorVmError::GuestMemRead(e.into())) 400 } 401 402 fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 403 if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) { 404 info!("Guest MMIO read to unregistered address 0x{:x}", gpa); 405 } 406 Ok(()) 407 } 408 409 fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 410 match self.mmio_bus.write(gpa, data) { 411 Err(vm_device::BusError::MissingAddressRange) => { 412 info!("Guest MMIO write to unregistered address 0x{:x}", gpa); 413 } 414 Ok(Some(barrier)) => { 415 info!("Waiting for barrier"); 416 barrier.wait(); 417 info!("Barrier released"); 418 } 419 _ => {} 420 }; 421 Ok(()) 422 } 423 424 #[cfg(target_arch = "x86_64")] 425 fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 426 if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) { 427 info!("Guest PIO read to unregistered address 0x{:x}", port); 428 } 429 Ok(()) 430 } 431 432 #[cfg(target_arch = "x86_64")] 433 fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 434 match self.io_bus.write(port, data) { 435 Err(vm_device::BusError::MissingAddressRange) => { 436 info!("Guest PIO write to unregistered address 0x{:x}", port); 437 } 438 Ok(Some(barrier)) => { 439 info!("Waiting for barrier"); 440 barrier.wait(); 441 info!("Barrier released"); 442 } 443 _ => {} 444 }; 445 Ok(()) 446 } 447 } 448 449 pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 { 450 let host_phys_bits = get_host_cpu_phys_bits(hypervisor); 451 452 cmp::min(host_phys_bits, max_phys_bits) 453 } 454 455 pub struct Vm { 456 #[cfg(feature = "tdx")] 457 kernel: Option<File>, 458 initramfs: Option<File>, 459 threads: Vec<thread::JoinHandle<()>>, 460 device_manager: Arc<Mutex<DeviceManager>>, 461 config: Arc<Mutex<VmConfig>>, 462 state: RwLock<VmState>, 463 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 464 memory_manager: Arc<Mutex<MemoryManager>>, 465 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 466 // The hypervisor abstracted virtual machine. 467 vm: Arc<dyn hypervisor::Vm>, 468 #[cfg(target_arch = "x86_64")] 469 saved_clock: Option<hypervisor::ClockData>, 470 #[cfg(not(target_arch = "riscv64"))] 471 numa_nodes: NumaNodes, 472 #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))] 473 #[cfg(not(target_arch = "riscv64"))] 474 hypervisor: Arc<dyn hypervisor::Hypervisor>, 475 stop_on_boot: bool, 476 load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>, 477 } 478 479 impl Vm { 480 pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH]; 481 482 #[allow(clippy::too_many_arguments)] 483 pub fn new_from_memory_manager( 484 config: Arc<Mutex<VmConfig>>, 485 memory_manager: Arc<Mutex<MemoryManager>>, 486 vm: Arc<dyn hypervisor::Vm>, 487 exit_evt: EventFd, 488 reset_evt: EventFd, 489 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 490 seccomp_action: &SeccompAction, 491 hypervisor: Arc<dyn hypervisor::Hypervisor>, 492 activate_evt: EventFd, 493 #[cfg(not(target_arch = "riscv64"))] timestamp: Instant, 494 console_info: Option<ConsoleInfo>, 495 console_resize_pipe: Option<Arc<File>>, 496 original_termios: Arc<Mutex<Option<termios>>>, 497 snapshot: Option<Snapshot>, 498 ) -> Result<Self> { 499 trace_scoped!("Vm::new_from_memory_manager"); 500 501 let boot_id_list = config 502 .lock() 503 .unwrap() 504 .validate() 505 .map_err(Error::ConfigValidation)?; 506 507 #[cfg(not(feature = "igvm"))] 508 let load_payload_handle = if snapshot.is_none() { 509 Self::load_payload_async(&memory_manager, &config)? 510 } else { 511 None 512 }; 513 514 info!("Booting VM from config: {:?}", &config); 515 516 // Create NUMA nodes based on NumaConfig. 517 let numa_nodes = 518 Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?; 519 520 #[cfg(feature = "tdx")] 521 let tdx_enabled = config.lock().unwrap().is_tdx_enabled(); 522 #[cfg(feature = "sev_snp")] 523 let sev_snp_enabled = config.lock().unwrap().is_sev_snp_enabled(); 524 #[cfg(feature = "tdx")] 525 let force_iommu = tdx_enabled; 526 #[cfg(feature = "sev_snp")] 527 let force_iommu = sev_snp_enabled; 528 #[cfg(not(any(feature = "tdx", feature = "sev_snp")))] 529 let force_iommu = false; 530 531 #[cfg(feature = "guest_debug")] 532 let stop_on_boot = config.lock().unwrap().gdb; 533 #[cfg(not(feature = "guest_debug"))] 534 let stop_on_boot = false; 535 536 let memory = memory_manager.lock().unwrap().guest_memory(); 537 let io_bus = Arc::new(Bus::new()); 538 let mmio_bus = Arc::new(Bus::new()); 539 540 let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler { 541 memory, 542 #[cfg(target_arch = "x86_64")] 543 io_bus: io_bus.clone(), 544 mmio_bus: mmio_bus.clone(), 545 }); 546 547 let cpus_config = { &config.lock().unwrap().cpus.clone() }; 548 let cpu_manager = cpu::CpuManager::new( 549 cpus_config, 550 vm.clone(), 551 exit_evt.try_clone().map_err(Error::EventFdClone)?, 552 reset_evt.try_clone().map_err(Error::EventFdClone)?, 553 #[cfg(feature = "guest_debug")] 554 vm_debug_evt, 555 &hypervisor, 556 seccomp_action.clone(), 557 vm_ops, 558 #[cfg(feature = "tdx")] 559 tdx_enabled, 560 &numa_nodes, 561 #[cfg(feature = "sev_snp")] 562 sev_snp_enabled, 563 ) 564 .map_err(Error::CpuManager)?; 565 566 #[cfg(target_arch = "x86_64")] 567 cpu_manager 568 .lock() 569 .unwrap() 570 .populate_cpuid( 571 &memory_manager, 572 &hypervisor, 573 #[cfg(feature = "tdx")] 574 tdx_enabled, 575 ) 576 .map_err(Error::CpuManager)?; 577 578 // Loading the igvm file is pushed down here because 579 // igvm parser needs cpu_manager to retrieve cpuid leaf. 580 // For the regular case, we can start loading early, but for 581 // igvm case we have to wait until cpu_manager is created. 582 // Currently, Microsoft Hypervisor does not provide any 583 // Hypervisor specific common cpuid, we need to call get_cpuid_values 584 // per cpuid through cpu_manager. 585 #[cfg(feature = "igvm")] 586 let load_payload_handle = if snapshot.is_none() { 587 Self::load_payload_async( 588 &memory_manager, 589 &config, 590 &cpu_manager, 591 #[cfg(feature = "sev_snp")] 592 sev_snp_enabled, 593 )? 594 } else { 595 None 596 }; 597 // The initial TDX configuration must be done before the vCPUs are 598 // created 599 #[cfg(feature = "tdx")] 600 if tdx_enabled { 601 let cpuid = cpu_manager.lock().unwrap().common_cpuid(); 602 let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32; 603 vm.tdx_init(&cpuid, max_vcpus) 604 .map_err(Error::InitializeTdxVm)?; 605 } 606 607 cpu_manager 608 .lock() 609 .unwrap() 610 .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID)) 611 .map_err(Error::CpuManager)?; 612 613 // This initial SEV-SNP configuration must be done immediately after 614 // vCPUs are created. As part of this initialization we are 615 // transitioning the guest into secure state. 616 #[cfg(feature = "sev_snp")] 617 if sev_snp_enabled { 618 vm.sev_snp_init().map_err(Error::InitializeSevSnpVm)?; 619 } 620 621 #[cfg(feature = "tdx")] 622 let dynamic = !tdx_enabled; 623 #[cfg(not(feature = "tdx"))] 624 let dynamic = true; 625 626 let device_manager = DeviceManager::new( 627 io_bus, 628 mmio_bus, 629 vm.clone(), 630 config.clone(), 631 memory_manager.clone(), 632 cpu_manager.clone(), 633 exit_evt.try_clone().map_err(Error::EventFdClone)?, 634 reset_evt, 635 seccomp_action.clone(), 636 numa_nodes.clone(), 637 &activate_evt, 638 force_iommu, 639 boot_id_list, 640 #[cfg(not(target_arch = "riscv64"))] 641 timestamp, 642 snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID), 643 dynamic, 644 ) 645 .map_err(Error::DeviceManager)?; 646 647 device_manager 648 .lock() 649 .unwrap() 650 .create_devices(console_info, console_resize_pipe, original_termios) 651 .map_err(Error::DeviceManager)?; 652 653 #[cfg(feature = "tdx")] 654 let kernel = config 655 .lock() 656 .unwrap() 657 .payload 658 .as_ref() 659 .map(|p| p.kernel.as_ref().map(File::open)) 660 .unwrap_or_default() 661 .transpose() 662 .map_err(Error::KernelFile)?; 663 664 let initramfs = config 665 .lock() 666 .unwrap() 667 .payload 668 .as_ref() 669 .map(|p| p.initramfs.as_ref().map(File::open)) 670 .unwrap_or_default() 671 .transpose() 672 .map_err(Error::InitramfsFile)?; 673 674 #[cfg(target_arch = "x86_64")] 675 let saved_clock = if let Some(snapshot) = snapshot.as_ref() { 676 let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?; 677 vm_snapshot.clock 678 } else { 679 None 680 }; 681 682 let vm_state = if snapshot.is_some() { 683 VmState::Paused 684 } else { 685 VmState::Created 686 }; 687 688 Ok(Vm { 689 #[cfg(feature = "tdx")] 690 kernel, 691 initramfs, 692 device_manager, 693 config, 694 threads: Vec::with_capacity(1), 695 state: RwLock::new(vm_state), 696 cpu_manager, 697 memory_manager, 698 vm, 699 #[cfg(target_arch = "x86_64")] 700 saved_clock, 701 #[cfg(not(target_arch = "riscv64"))] 702 numa_nodes, 703 #[cfg(not(target_arch = "riscv64"))] 704 hypervisor, 705 stop_on_boot, 706 load_payload_handle, 707 }) 708 } 709 710 fn create_numa_nodes( 711 configs: Option<Vec<NumaConfig>>, 712 memory_manager: &Arc<Mutex<MemoryManager>>, 713 ) -> Result<NumaNodes> { 714 let mm = memory_manager.lock().unwrap(); 715 let mm_zones = mm.memory_zones(); 716 let mut numa_nodes = BTreeMap::new(); 717 718 if let Some(configs) = &configs { 719 for config in configs.iter() { 720 if numa_nodes.contains_key(&config.guest_numa_id) { 721 error!("Can't define twice the same NUMA node"); 722 return Err(Error::InvalidNumaConfig); 723 } 724 725 let mut node = NumaNode::default(); 726 727 if let Some(memory_zones) = &config.memory_zones { 728 for memory_zone in memory_zones.iter() { 729 if let Some(mm_zone) = mm_zones.get(memory_zone) { 730 node.memory_regions.extend(mm_zone.regions().clone()); 731 if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() { 732 node.hotplug_regions.push(virtiomem_zone.region().clone()); 733 } 734 node.memory_zones.push(memory_zone.clone()); 735 } else { 736 error!("Unknown memory zone '{}'", memory_zone); 737 return Err(Error::InvalidNumaConfig); 738 } 739 } 740 } 741 742 if let Some(cpus) = &config.cpus { 743 node.cpus.extend(cpus); 744 } 745 746 if let Some(pci_segments) = &config.pci_segments { 747 node.pci_segments.extend(pci_segments); 748 } 749 750 if let Some(distances) = &config.distances { 751 for distance in distances.iter() { 752 let dest = distance.destination; 753 let dist = distance.distance; 754 755 if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) { 756 error!("Unknown destination NUMA node {}", dest); 757 return Err(Error::InvalidNumaConfig); 758 } 759 760 if node.distances.contains_key(&dest) { 761 error!("Destination NUMA node {} has been already set", dest); 762 return Err(Error::InvalidNumaConfig); 763 } 764 765 node.distances.insert(dest, dist); 766 } 767 } 768 769 #[cfg(target_arch = "x86_64")] 770 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 771 if let Some(sgx_epc_region) = mm.sgx_epc_region() { 772 let mm_sections = sgx_epc_region.epc_sections(); 773 for sgx_epc_section in sgx_epc_sections.iter() { 774 if let Some(mm_section) = mm_sections.get(sgx_epc_section) { 775 node.sgx_epc_sections.push(mm_section.clone()); 776 } else { 777 error!("Unknown SGX EPC section '{}'", sgx_epc_section); 778 return Err(Error::InvalidNumaConfig); 779 } 780 } 781 } else { 782 error!("Missing SGX EPC region"); 783 return Err(Error::InvalidNumaConfig); 784 } 785 } 786 787 numa_nodes.insert(config.guest_numa_id, node); 788 } 789 } 790 791 Ok(numa_nodes) 792 } 793 794 #[allow(clippy::too_many_arguments)] 795 pub fn new( 796 vm_config: Arc<Mutex<VmConfig>>, 797 exit_evt: EventFd, 798 reset_evt: EventFd, 799 #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd, 800 seccomp_action: &SeccompAction, 801 hypervisor: Arc<dyn hypervisor::Hypervisor>, 802 activate_evt: EventFd, 803 console_info: Option<ConsoleInfo>, 804 console_resize_pipe: Option<Arc<File>>, 805 original_termios: Arc<Mutex<Option<termios>>>, 806 snapshot: Option<Snapshot>, 807 source_url: Option<&str>, 808 prefault: Option<bool>, 809 ) -> Result<Self> { 810 trace_scoped!("Vm::new"); 811 812 #[cfg(not(target_arch = "riscv64"))] 813 let timestamp = Instant::now(); 814 815 #[cfg(feature = "tdx")] 816 let tdx_enabled = if snapshot.is_some() { 817 false 818 } else { 819 vm_config.lock().unwrap().is_tdx_enabled() 820 }; 821 822 #[cfg(feature = "sev_snp")] 823 let sev_snp_enabled = if snapshot.is_some() { 824 false 825 } else { 826 vm_config.lock().unwrap().is_sev_snp_enabled() 827 }; 828 829 let vm = Self::create_hypervisor_vm( 830 &hypervisor, 831 #[cfg(feature = "tdx")] 832 tdx_enabled, 833 #[cfg(feature = "sev_snp")] 834 sev_snp_enabled, 835 #[cfg(feature = "sev_snp")] 836 vm_config.lock().unwrap().memory.total_size(), 837 )?; 838 839 let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits); 840 841 let memory_manager = if let Some(snapshot) = 842 snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID) 843 { 844 MemoryManager::new_from_snapshot( 845 &snapshot, 846 vm.clone(), 847 &vm_config.lock().unwrap().memory.clone(), 848 source_url, 849 prefault.unwrap(), 850 phys_bits, 851 ) 852 .map_err(Error::MemoryManager)? 853 } else { 854 #[cfg(target_arch = "x86_64")] 855 let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone(); 856 857 MemoryManager::new( 858 vm.clone(), 859 &vm_config.lock().unwrap().memory.clone(), 860 None, 861 phys_bits, 862 #[cfg(feature = "tdx")] 863 tdx_enabled, 864 None, 865 None, 866 #[cfg(target_arch = "x86_64")] 867 sgx_epc_config, 868 ) 869 .map_err(Error::MemoryManager)? 870 }; 871 872 Vm::new_from_memory_manager( 873 vm_config, 874 memory_manager, 875 vm, 876 exit_evt, 877 reset_evt, 878 #[cfg(feature = "guest_debug")] 879 vm_debug_evt, 880 seccomp_action, 881 hypervisor, 882 activate_evt, 883 #[cfg(not(target_arch = "riscv64"))] 884 timestamp, 885 console_info, 886 console_resize_pipe, 887 original_termios, 888 snapshot, 889 ) 890 } 891 892 pub fn create_hypervisor_vm( 893 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 894 #[cfg(feature = "tdx")] tdx_enabled: bool, 895 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 896 #[cfg(feature = "sev_snp")] mem_size: u64, 897 ) -> Result<Arc<dyn hypervisor::Vm>> { 898 hypervisor.check_required_extensions().unwrap(); 899 900 cfg_if::cfg_if! { 901 if #[cfg(feature = "tdx")] { 902 // Passing KVM_X86_TDX_VM: 1 if tdx_enabled is true 903 // Otherwise KVM_X86_LEGACY_VM: 0 904 // value of tdx_enabled is mapped to KVM_X86_TDX_VM or KVM_X86_LEGACY_VM 905 let vm = hypervisor 906 .create_vm_with_type(u64::from(tdx_enabled)) 907 .unwrap(); 908 } else if #[cfg(feature = "sev_snp")] { 909 // Passing SEV_SNP_ENABLED: 1 if sev_snp_enabled is true 910 // Otherwise SEV_SNP_DISABLED: 0 911 // value of sev_snp_enabled is mapped to SEV_SNP_ENABLED for true or SEV_SNP_DISABLED for false 912 let vm = hypervisor 913 .create_vm_with_type_and_memory(u64::from(sev_snp_enabled), mem_size) 914 .unwrap(); 915 } else { 916 let vm = hypervisor.create_vm().unwrap(); 917 } 918 } 919 920 #[cfg(target_arch = "x86_64")] 921 { 922 vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0) 923 .unwrap(); 924 vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap(); 925 vm.enable_split_irq().unwrap(); 926 } 927 928 Ok(vm) 929 } 930 931 fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> { 932 let initramfs = self.initramfs.as_mut().unwrap(); 933 let size: usize = initramfs 934 .seek(SeekFrom::End(0)) 935 .map_err(|_| Error::InitramfsLoad)? 936 .try_into() 937 .unwrap(); 938 initramfs.rewind().map_err(|_| Error::InitramfsLoad)?; 939 940 let address = 941 arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?; 942 let address = GuestAddress(address); 943 944 guest_mem 945 .read_volatile_from(address, initramfs, size) 946 .map_err(|_| Error::InitramfsLoad)?; 947 948 info!("Initramfs loaded: address = 0x{:x}", address.0); 949 Ok(arch::InitramfsConfig { address, size }) 950 } 951 952 pub fn generate_cmdline( 953 payload: &PayloadConfig, 954 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] device_manager: &Arc< 955 Mutex<DeviceManager>, 956 >, 957 ) -> Result<Cmdline> { 958 let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?; 959 if let Some(s) = payload.cmdline.as_ref() { 960 cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?; 961 } 962 963 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] 964 for entry in device_manager.lock().unwrap().cmdline_additions() { 965 cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?; 966 } 967 Ok(cmdline) 968 } 969 970 #[cfg(target_arch = "aarch64")] 971 fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> { 972 let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash(); 973 let mem = uefi_flash.memory(); 974 arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware) 975 .map_err(Error::UefiLoad)?; 976 Ok(()) 977 } 978 979 #[cfg(target_arch = "aarch64")] 980 fn load_kernel( 981 firmware: Option<File>, 982 kernel: Option<File>, 983 memory_manager: Arc<Mutex<MemoryManager>>, 984 ) -> Result<EntryPoint> { 985 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 986 let mem = guest_memory.memory(); 987 let entry_addr = match (firmware, kernel) { 988 (None, Some(mut kernel)) => { 989 match linux_loader::loader::pe::PE::load( 990 mem.deref(), 991 Some(arch::layout::KERNEL_START), 992 &mut kernel, 993 None, 994 ) { 995 Ok(entry_addr) => entry_addr.kernel_load, 996 // Try to load the binary as kernel PE file at first. 997 // If failed, retry to load it as UEFI binary. 998 // As the UEFI binary is formatless, it must be the last option to try. 999 Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => { 1000 Self::load_firmware(&kernel, memory_manager)?; 1001 arch::layout::UEFI_START 1002 } 1003 Err(e) => { 1004 return Err(Error::KernelLoad(e)); 1005 } 1006 } 1007 } 1008 (Some(firmware), None) => { 1009 Self::load_firmware(&firmware, memory_manager)?; 1010 arch::layout::UEFI_START 1011 } 1012 _ => return Err(Error::InvalidPayload), 1013 }; 1014 1015 Ok(EntryPoint { entry_addr }) 1016 } 1017 1018 #[cfg(target_arch = "riscv64")] 1019 fn load_kernel( 1020 firmware: Option<File>, 1021 kernel: Option<File>, 1022 memory_manager: Arc<Mutex<MemoryManager>>, 1023 ) -> Result<EntryPoint> { 1024 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 1025 let mem = guest_memory.memory(); 1026 let alignment = 0x20_0000; 1027 let aligned_kernel_addr = arch::layout::KERNEL_START.0 + (alignment - 1) & !(alignment - 1); 1028 let entry_addr = match (firmware, kernel) { 1029 (None, Some(mut kernel)) => { 1030 match linux_loader::loader::pe::PE::load( 1031 mem.deref(), 1032 Some(GuestAddress(aligned_kernel_addr)), 1033 &mut kernel, 1034 None, 1035 ) { 1036 Ok(entry_addr) => entry_addr.kernel_load, 1037 // Try to load the binary as kernel PE file at first. 1038 // If failed, retry to load it as UEFI binary. 1039 // As the UEFI binary is formatless, it must be the last option to try. 1040 Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => { 1041 // TODO: UEFI for riscv64 is scheduled to next stage. 1042 unimplemented!() 1043 } 1044 Err(e) => { 1045 return Err(Error::KernelLoad(e)); 1046 } 1047 } 1048 } 1049 (Some(_firmware), None) => { 1050 // TODO: UEFI for riscv64 is scheduled to next stage. 1051 unimplemented!() 1052 } 1053 _ => return Err(Error::InvalidPayload), 1054 }; 1055 1056 Ok(EntryPoint { entry_addr }) 1057 } 1058 1059 #[cfg(feature = "igvm")] 1060 fn load_igvm( 1061 igvm: File, 1062 memory_manager: Arc<Mutex<MemoryManager>>, 1063 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 1064 #[cfg(feature = "sev_snp")] host_data: &Option<String>, 1065 ) -> Result<EntryPoint> { 1066 let res = igvm_loader::load_igvm( 1067 &igvm, 1068 memory_manager, 1069 cpu_manager.clone(), 1070 "", 1071 #[cfg(feature = "sev_snp")] 1072 host_data, 1073 ) 1074 .map_err(Error::IgvmLoad)?; 1075 1076 cfg_if::cfg_if! { 1077 if #[cfg(feature = "sev_snp")] { 1078 let entry_point = if cpu_manager.lock().unwrap().sev_snp_enabled() { 1079 EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa_gpa), setup_header: None } 1080 } else { 1081 EntryPoint {entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None } 1082 }; 1083 } else { 1084 let entry_point = EntryPoint { entry_addr: vm_memory::GuestAddress(res.vmsa.rip), setup_header: None }; 1085 } 1086 }; 1087 Ok(entry_point) 1088 } 1089 1090 #[cfg(target_arch = "x86_64")] 1091 fn load_kernel( 1092 mut kernel: File, 1093 cmdline: Option<Cmdline>, 1094 memory_manager: Arc<Mutex<MemoryManager>>, 1095 ) -> Result<EntryPoint> { 1096 info!("Loading kernel"); 1097 1098 let mem = { 1099 let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory(); 1100 guest_memory.memory() 1101 }; 1102 1103 // Try ELF binary with PVH boot. 1104 let entry_addr = linux_loader::loader::elf::Elf::load( 1105 mem.deref(), 1106 None, 1107 &mut kernel, 1108 Some(arch::layout::HIGH_RAM_START), 1109 ) 1110 // Try loading kernel as bzImage. 1111 .or_else(|_| { 1112 BzImage::load( 1113 mem.deref(), 1114 None, 1115 &mut kernel, 1116 Some(arch::layout::HIGH_RAM_START), 1117 ) 1118 }) 1119 .map_err(Error::KernelLoad)?; 1120 1121 if let Some(cmdline) = cmdline { 1122 linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline) 1123 .map_err(Error::LoadCmdLine)?; 1124 } 1125 1126 if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap { 1127 // Use the PVH kernel entry point to boot the guest 1128 info!("PVH kernel loaded: entry_addr = 0x{:x}", entry_addr.0); 1129 Ok(EntryPoint { 1130 entry_addr, 1131 setup_header: None, 1132 }) 1133 } else if entry_addr.setup_header.is_some() { 1134 // Use the bzImage 32bit entry point to boot the guest 1135 info!( 1136 "bzImage kernel loaded: entry_addr = 0x{:x}", 1137 entry_addr.kernel_load.0 1138 ); 1139 Ok(EntryPoint { 1140 entry_addr: entry_addr.kernel_load, 1141 setup_header: entry_addr.setup_header, 1142 }) 1143 } else { 1144 Err(Error::KernelMissingPvhHeader) 1145 } 1146 } 1147 1148 #[cfg(target_arch = "x86_64")] 1149 fn load_payload( 1150 payload: &PayloadConfig, 1151 memory_manager: Arc<Mutex<MemoryManager>>, 1152 #[cfg(feature = "igvm")] cpu_manager: Arc<Mutex<cpu::CpuManager>>, 1153 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 1154 ) -> Result<EntryPoint> { 1155 trace_scoped!("load_payload"); 1156 #[cfg(feature = "igvm")] 1157 { 1158 if let Some(_igvm_file) = &payload.igvm { 1159 let igvm = File::open(_igvm_file).map_err(Error::IgvmFile)?; 1160 #[cfg(feature = "sev_snp")] 1161 if sev_snp_enabled { 1162 return Self::load_igvm(igvm, memory_manager, cpu_manager, &payload.host_data); 1163 } 1164 #[cfg(not(feature = "sev_snp"))] 1165 return Self::load_igvm(igvm, memory_manager, cpu_manager); 1166 } 1167 } 1168 match ( 1169 &payload.firmware, 1170 &payload.kernel, 1171 &payload.initramfs, 1172 &payload.cmdline, 1173 ) { 1174 (Some(firmware), None, None, None) => { 1175 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; 1176 Self::load_kernel(firmware, None, memory_manager) 1177 } 1178 (None, Some(kernel), _, _) => { 1179 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1180 let cmdline = Self::generate_cmdline(payload)?; 1181 Self::load_kernel(kernel, Some(cmdline), memory_manager) 1182 } 1183 _ => Err(Error::InvalidPayload), 1184 } 1185 } 1186 1187 #[cfg(any(target_arch = "aarch64", target_arch = "riscv64"))] 1188 fn load_payload( 1189 payload: &PayloadConfig, 1190 memory_manager: Arc<Mutex<MemoryManager>>, 1191 ) -> Result<EntryPoint> { 1192 match (&payload.firmware, &payload.kernel) { 1193 (Some(firmware), None) => { 1194 let firmware = File::open(firmware).map_err(Error::FirmwareFile)?; 1195 Self::load_kernel(Some(firmware), None, memory_manager) 1196 } 1197 (None, Some(kernel)) => { 1198 let kernel = File::open(kernel).map_err(Error::KernelFile)?; 1199 Self::load_kernel(None, Some(kernel), memory_manager) 1200 } 1201 _ => Err(Error::InvalidPayload), 1202 } 1203 } 1204 1205 fn load_payload_async( 1206 memory_manager: &Arc<Mutex<MemoryManager>>, 1207 config: &Arc<Mutex<VmConfig>>, 1208 #[cfg(feature = "igvm")] cpu_manager: &Arc<Mutex<cpu::CpuManager>>, 1209 #[cfg(feature = "sev_snp")] sev_snp_enabled: bool, 1210 ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> { 1211 // Kernel with TDX is loaded in a different manner 1212 #[cfg(feature = "tdx")] 1213 if config.lock().unwrap().is_tdx_enabled() { 1214 return Ok(None); 1215 } 1216 1217 config 1218 .lock() 1219 .unwrap() 1220 .payload 1221 .as_ref() 1222 .map(|payload| { 1223 let memory_manager = memory_manager.clone(); 1224 let payload = payload.clone(); 1225 #[cfg(feature = "igvm")] 1226 let cpu_manager = cpu_manager.clone(); 1227 1228 std::thread::Builder::new() 1229 .name("payload_loader".into()) 1230 .spawn(move || { 1231 Self::load_payload( 1232 &payload, 1233 memory_manager, 1234 #[cfg(feature = "igvm")] 1235 cpu_manager, 1236 #[cfg(feature = "sev_snp")] 1237 sev_snp_enabled, 1238 ) 1239 }) 1240 .map_err(Error::KernelLoadThreadSpawn) 1241 }) 1242 .transpose() 1243 } 1244 1245 #[cfg(target_arch = "x86_64")] 1246 fn configure_system(&mut self, rsdp_addr: GuestAddress, entry_addr: EntryPoint) -> Result<()> { 1247 trace_scoped!("configure_system"); 1248 info!("Configuring system"); 1249 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1250 1251 let initramfs_config = match self.initramfs { 1252 Some(_) => Some(self.load_initramfs(&mem)?), 1253 None => None, 1254 }; 1255 1256 let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); 1257 let rsdp_addr = Some(rsdp_addr); 1258 let sgx_epc_region = self 1259 .memory_manager 1260 .lock() 1261 .unwrap() 1262 .sgx_epc_region() 1263 .as_ref() 1264 .cloned(); 1265 1266 let serial_number = self 1267 .config 1268 .lock() 1269 .unwrap() 1270 .platform 1271 .as_ref() 1272 .and_then(|p| p.serial_number.clone()); 1273 1274 let uuid = self 1275 .config 1276 .lock() 1277 .unwrap() 1278 .platform 1279 .as_ref() 1280 .and_then(|p| p.uuid.clone()); 1281 1282 let oem_strings = self 1283 .config 1284 .lock() 1285 .unwrap() 1286 .platform 1287 .as_ref() 1288 .and_then(|p| p.oem_strings.clone()); 1289 1290 let oem_strings = oem_strings 1291 .as_deref() 1292 .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>()); 1293 1294 let topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1295 1296 arch::configure_system( 1297 &mem, 1298 arch::layout::CMDLINE_START, 1299 arch::layout::CMDLINE_MAX_SIZE, 1300 &initramfs_config, 1301 boot_vcpus, 1302 entry_addr.setup_header, 1303 rsdp_addr, 1304 sgx_epc_region, 1305 serial_number.as_deref(), 1306 uuid.as_deref(), 1307 oem_strings.as_deref(), 1308 topology, 1309 ) 1310 .map_err(Error::ConfigureSystem)?; 1311 Ok(()) 1312 } 1313 1314 #[cfg(target_arch = "aarch64")] 1315 fn configure_system( 1316 &mut self, 1317 _rsdp_addr: GuestAddress, 1318 _entry_addr: EntryPoint, 1319 ) -> Result<()> { 1320 let cmdline = Self::generate_cmdline( 1321 self.config.lock().unwrap().payload.as_ref().unwrap(), 1322 &self.device_manager, 1323 )?; 1324 let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs(); 1325 let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1326 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1327 let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new(); 1328 let initramfs_config = match self.initramfs { 1329 Some(_) => Some(self.load_initramfs(&mem)?), 1330 None => None, 1331 }; 1332 1333 let device_info = &self 1334 .device_manager 1335 .lock() 1336 .unwrap() 1337 .get_device_info() 1338 .clone(); 1339 1340 for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() { 1341 let pci_space = PciSpaceInfo { 1342 pci_segment_id: pci_segment.id, 1343 mmio_config_address: pci_segment.mmio_config_address, 1344 pci_device_space_start: pci_segment.start_of_mem64_area, 1345 pci_device_space_size: pci_segment.end_of_mem64_area 1346 - pci_segment.start_of_mem64_area 1347 + 1, 1348 }; 1349 pci_space_info.push(pci_space); 1350 } 1351 1352 let virtio_iommu_bdf = self 1353 .device_manager 1354 .lock() 1355 .unwrap() 1356 .iommu_attached_devices() 1357 .as_ref() 1358 .map(|(v, _)| *v); 1359 1360 let vgic = self 1361 .device_manager 1362 .lock() 1363 .unwrap() 1364 .get_interrupt_controller() 1365 .unwrap() 1366 .lock() 1367 .unwrap() 1368 .get_vgic() 1369 .map_err(|_| { 1370 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1371 arch::aarch64::Error::SetupGic, 1372 )) 1373 })?; 1374 1375 // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number. 1376 let pmu_supported = self 1377 .cpu_manager 1378 .lock() 1379 .unwrap() 1380 .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16) 1381 .map_err(|_| { 1382 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1383 arch::aarch64::Error::VcpuInitPmu, 1384 )) 1385 })?; 1386 1387 arch::configure_system( 1388 &mem, 1389 cmdline.as_cstring().unwrap().to_str().unwrap(), 1390 vcpu_mpidrs, 1391 vcpu_topology, 1392 device_info, 1393 &initramfs_config, 1394 &pci_space_info, 1395 virtio_iommu_bdf.map(|bdf| bdf.into()), 1396 &vgic, 1397 &self.numa_nodes, 1398 pmu_supported, 1399 ) 1400 .map_err(Error::ConfigureSystem)?; 1401 1402 Ok(()) 1403 } 1404 1405 #[cfg(target_arch = "riscv64")] 1406 fn configure_system(&mut self) -> Result<()> { 1407 let cmdline = Self::generate_cmdline( 1408 self.config.lock().unwrap().payload.as_ref().unwrap(), 1409 &self.device_manager, 1410 )?; 1411 let num_vcpu = self.cpu_manager.lock().unwrap().vcpus().len(); 1412 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1413 let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new(); 1414 let initramfs_config = match self.initramfs { 1415 Some(_) => Some(self.load_initramfs(&mem)?), 1416 None => None, 1417 }; 1418 1419 let device_info = &self 1420 .device_manager 1421 .lock() 1422 .unwrap() 1423 .get_device_info() 1424 .clone(); 1425 1426 for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() { 1427 let pci_space = PciSpaceInfo { 1428 pci_segment_id: pci_segment.id, 1429 mmio_config_address: pci_segment.mmio_config_address, 1430 pci_device_space_start: pci_segment.start_of_mem64_area, 1431 pci_device_space_size: pci_segment.end_of_mem64_area 1432 - pci_segment.start_of_mem64_area 1433 + 1, 1434 }; 1435 pci_space_info.push(pci_space); 1436 } 1437 1438 // TODO: IOMMU for riscv64 is not yet support in kernel. 1439 1440 let vaia = self 1441 .device_manager 1442 .lock() 1443 .unwrap() 1444 .get_interrupt_controller() 1445 .unwrap() 1446 .lock() 1447 .unwrap() 1448 .get_vaia() 1449 .map_err(|_| { 1450 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1451 arch::riscv64::Error::SetupAia, 1452 )) 1453 })?; 1454 1455 // TODO: PMU support for riscv64 is scheduled to next stage. 1456 1457 arch::configure_system( 1458 &mem, 1459 cmdline.as_cstring().unwrap().to_str().unwrap(), 1460 num_vcpu as u32, 1461 device_info, 1462 &initramfs_config, 1463 &pci_space_info, 1464 &vaia, 1465 ) 1466 .map_err(Error::ConfigureSystem)?; 1467 1468 Ok(()) 1469 } 1470 1471 pub fn console_resize_pipe(&self) -> Option<Arc<File>> { 1472 self.device_manager.lock().unwrap().console_resize_pipe() 1473 } 1474 1475 pub fn shutdown(&mut self) -> Result<()> { 1476 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 1477 let new_state = VmState::Shutdown; 1478 1479 state.valid_transition(new_state)?; 1480 1481 // Wake up the DeviceManager threads so they will get terminated cleanly 1482 self.device_manager 1483 .lock() 1484 .unwrap() 1485 .resume() 1486 .map_err(Error::Resume)?; 1487 1488 self.cpu_manager 1489 .lock() 1490 .unwrap() 1491 .shutdown() 1492 .map_err(Error::CpuManager)?; 1493 1494 // Wait for all the threads to finish 1495 for thread in self.threads.drain(..) { 1496 thread.join().map_err(Error::ThreadCleanup)? 1497 } 1498 *state = new_state; 1499 1500 Ok(()) 1501 } 1502 1503 pub fn resize( 1504 &mut self, 1505 desired_vcpus: Option<u8>, 1506 desired_memory: Option<u64>, 1507 desired_balloon: Option<u64>, 1508 ) -> Result<()> { 1509 event!("vm", "resizing"); 1510 1511 if let Some(desired_vcpus) = desired_vcpus { 1512 if self 1513 .cpu_manager 1514 .lock() 1515 .unwrap() 1516 .resize(desired_vcpus) 1517 .map_err(Error::CpuManager)? 1518 { 1519 self.device_manager 1520 .lock() 1521 .unwrap() 1522 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1523 .map_err(Error::DeviceManager)?; 1524 } 1525 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1526 } 1527 1528 if let Some(desired_memory) = desired_memory { 1529 let new_region = self 1530 .memory_manager 1531 .lock() 1532 .unwrap() 1533 .resize(desired_memory) 1534 .map_err(Error::MemoryManager)?; 1535 1536 let memory_config = &mut self.config.lock().unwrap().memory; 1537 1538 if let Some(new_region) = &new_region { 1539 self.device_manager 1540 .lock() 1541 .unwrap() 1542 .update_memory(new_region) 1543 .map_err(Error::DeviceManager)?; 1544 1545 match memory_config.hotplug_method { 1546 HotplugMethod::Acpi => { 1547 self.device_manager 1548 .lock() 1549 .unwrap() 1550 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1551 .map_err(Error::DeviceManager)?; 1552 } 1553 HotplugMethod::VirtioMem => {} 1554 } 1555 } 1556 1557 // We update the VM config regardless of the actual guest resize 1558 // operation result (happened or not), so that if the VM reboots 1559 // it will be running with the last configure memory size. 1560 match memory_config.hotplug_method { 1561 HotplugMethod::Acpi => memory_config.size = desired_memory, 1562 HotplugMethod::VirtioMem => { 1563 if desired_memory > memory_config.size { 1564 memory_config.hotplugged_size = Some(desired_memory - memory_config.size); 1565 } else { 1566 memory_config.hotplugged_size = None; 1567 } 1568 } 1569 } 1570 } 1571 1572 if let Some(desired_balloon) = desired_balloon { 1573 self.device_manager 1574 .lock() 1575 .unwrap() 1576 .resize_balloon(desired_balloon) 1577 .map_err(Error::DeviceManager)?; 1578 1579 // Update the configuration value for the balloon size to ensure 1580 // a reboot would use the right value. 1581 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon { 1582 balloon_config.size = desired_balloon; 1583 } 1584 } 1585 1586 event!("vm", "resized"); 1587 1588 Ok(()) 1589 } 1590 1591 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { 1592 let memory_config = &mut self.config.lock().unwrap().memory; 1593 1594 if let Some(zones) = &mut memory_config.zones { 1595 for zone in zones.iter_mut() { 1596 if zone.id == id { 1597 if desired_memory >= zone.size { 1598 let hotplugged_size = desired_memory - zone.size; 1599 self.memory_manager 1600 .lock() 1601 .unwrap() 1602 .resize_zone(&id, desired_memory - zone.size) 1603 .map_err(Error::MemoryManager)?; 1604 // We update the memory zone config regardless of the 1605 // actual 'resize-zone' operation result (happened or 1606 // not), so that if the VM reboots it will be running 1607 // with the last configured memory zone size. 1608 zone.hotplugged_size = Some(hotplugged_size); 1609 1610 return Ok(()); 1611 } else { 1612 error!( 1613 "Invalid to ask less ({}) than boot RAM ({}) for \ 1614 this memory zone", 1615 desired_memory, zone.size, 1616 ); 1617 return Err(Error::ResizeZone); 1618 } 1619 } 1620 } 1621 } 1622 1623 error!("Could not find the memory zone {} for the resize", id); 1624 Err(Error::ResizeZone) 1625 } 1626 1627 pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1628 let pci_device_info = self 1629 .device_manager 1630 .lock() 1631 .unwrap() 1632 .add_device(&mut device_cfg) 1633 .map_err(Error::DeviceManager)?; 1634 1635 // Update VmConfig by adding the new device. This is important to 1636 // ensure the device would be created in case of a reboot. 1637 { 1638 let mut config = self.config.lock().unwrap(); 1639 add_to_config(&mut config.devices, device_cfg); 1640 } 1641 1642 self.device_manager 1643 .lock() 1644 .unwrap() 1645 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1646 .map_err(Error::DeviceManager)?; 1647 1648 Ok(pci_device_info) 1649 } 1650 1651 pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> { 1652 let pci_device_info = self 1653 .device_manager 1654 .lock() 1655 .unwrap() 1656 .add_user_device(&mut device_cfg) 1657 .map_err(Error::DeviceManager)?; 1658 1659 // Update VmConfig by adding the new device. This is important to 1660 // ensure the device would be created in case of a reboot. 1661 { 1662 let mut config = self.config.lock().unwrap(); 1663 add_to_config(&mut config.user_devices, device_cfg); 1664 } 1665 1666 self.device_manager 1667 .lock() 1668 .unwrap() 1669 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1670 .map_err(Error::DeviceManager)?; 1671 1672 Ok(pci_device_info) 1673 } 1674 1675 pub fn remove_device(&mut self, id: String) -> Result<()> { 1676 self.device_manager 1677 .lock() 1678 .unwrap() 1679 .remove_device(id.clone()) 1680 .map_err(Error::DeviceManager)?; 1681 1682 // Update VmConfig by removing the device. This is important to 1683 // ensure the device would not be created in case of a reboot. 1684 self.config.lock().unwrap().remove_device(&id); 1685 1686 self.device_manager 1687 .lock() 1688 .unwrap() 1689 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1690 .map_err(Error::DeviceManager)?; 1691 Ok(()) 1692 } 1693 1694 pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> { 1695 let pci_device_info = self 1696 .device_manager 1697 .lock() 1698 .unwrap() 1699 .add_disk(&mut disk_cfg) 1700 .map_err(Error::DeviceManager)?; 1701 1702 // Update VmConfig by adding the new device. This is important to 1703 // ensure the device would be created in case of a reboot. 1704 { 1705 let mut config = self.config.lock().unwrap(); 1706 add_to_config(&mut config.disks, disk_cfg); 1707 } 1708 1709 self.device_manager 1710 .lock() 1711 .unwrap() 1712 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1713 .map_err(Error::DeviceManager)?; 1714 1715 Ok(pci_device_info) 1716 } 1717 1718 pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> { 1719 let pci_device_info = self 1720 .device_manager 1721 .lock() 1722 .unwrap() 1723 .add_fs(&mut fs_cfg) 1724 .map_err(Error::DeviceManager)?; 1725 1726 // Update VmConfig by adding the new device. This is important to 1727 // ensure the device would be created in case of a reboot. 1728 { 1729 let mut config = self.config.lock().unwrap(); 1730 add_to_config(&mut config.fs, fs_cfg); 1731 } 1732 1733 self.device_manager 1734 .lock() 1735 .unwrap() 1736 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1737 .map_err(Error::DeviceManager)?; 1738 1739 Ok(pci_device_info) 1740 } 1741 1742 pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> { 1743 let pci_device_info = self 1744 .device_manager 1745 .lock() 1746 .unwrap() 1747 .add_pmem(&mut pmem_cfg) 1748 .map_err(Error::DeviceManager)?; 1749 1750 // Update VmConfig by adding the new device. This is important to 1751 // ensure the device would be created in case of a reboot. 1752 { 1753 let mut config = self.config.lock().unwrap(); 1754 add_to_config(&mut config.pmem, pmem_cfg); 1755 } 1756 1757 self.device_manager 1758 .lock() 1759 .unwrap() 1760 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1761 .map_err(Error::DeviceManager)?; 1762 1763 Ok(pci_device_info) 1764 } 1765 1766 pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> { 1767 let pci_device_info = self 1768 .device_manager 1769 .lock() 1770 .unwrap() 1771 .add_net(&mut net_cfg) 1772 .map_err(Error::DeviceManager)?; 1773 1774 // Update VmConfig by adding the new device. This is important to 1775 // ensure the device would be created in case of a reboot. 1776 { 1777 let mut config = self.config.lock().unwrap(); 1778 add_to_config(&mut config.net, net_cfg); 1779 } 1780 1781 self.device_manager 1782 .lock() 1783 .unwrap() 1784 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1785 .map_err(Error::DeviceManager)?; 1786 1787 Ok(pci_device_info) 1788 } 1789 1790 pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> { 1791 let pci_device_info = self 1792 .device_manager 1793 .lock() 1794 .unwrap() 1795 .add_vdpa(&mut vdpa_cfg) 1796 .map_err(Error::DeviceManager)?; 1797 1798 // Update VmConfig by adding the new device. This is important to 1799 // ensure the device would be created in case of a reboot. 1800 { 1801 let mut config = self.config.lock().unwrap(); 1802 add_to_config(&mut config.vdpa, vdpa_cfg); 1803 } 1804 1805 self.device_manager 1806 .lock() 1807 .unwrap() 1808 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1809 .map_err(Error::DeviceManager)?; 1810 1811 Ok(pci_device_info) 1812 } 1813 1814 pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> { 1815 let pci_device_info = self 1816 .device_manager 1817 .lock() 1818 .unwrap() 1819 .add_vsock(&mut vsock_cfg) 1820 .map_err(Error::DeviceManager)?; 1821 1822 // Update VmConfig by adding the new device. This is important to 1823 // ensure the device would be created in case of a reboot. 1824 { 1825 let mut config = self.config.lock().unwrap(); 1826 config.vsock = Some(vsock_cfg); 1827 } 1828 1829 self.device_manager 1830 .lock() 1831 .unwrap() 1832 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1833 .map_err(Error::DeviceManager)?; 1834 1835 Ok(pci_device_info) 1836 } 1837 1838 pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> { 1839 Ok(self.device_manager.lock().unwrap().counters()) 1840 } 1841 1842 #[cfg(feature = "tdx")] 1843 fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> { 1844 use arch::x86_64::tdx::*; 1845 1846 let firmware_path = self 1847 .config 1848 .lock() 1849 .unwrap() 1850 .payload 1851 .as_ref() 1852 .unwrap() 1853 .firmware 1854 .clone() 1855 .ok_or(Error::TdxFirmwareMissing)?; 1856 // The TDVF file contains a table of section as well as code 1857 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1858 1859 // For all the sections allocate some RAM backing them 1860 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf) 1861 } 1862 1863 #[cfg(feature = "tdx")] 1864 fn hob_memory_resources( 1865 mut sorted_sections: Vec<TdvfSection>, 1866 guest_memory: &GuestMemoryMmap, 1867 ) -> Vec<(u64, u64, bool)> { 1868 let mut list = Vec::new(); 1869 1870 let mut current_section = sorted_sections.pop(); 1871 1872 // RAM regions interleaved with TDVF sections 1873 let mut next_start_addr = 0; 1874 for region in guest_memory.iter() { 1875 let region_start = region.start_addr().0; 1876 let region_end = region.last_addr().0; 1877 if region_start > next_start_addr { 1878 next_start_addr = region_start; 1879 } 1880 1881 loop { 1882 let (start, size, ram) = if let Some(section) = ¤t_section { 1883 if section.address <= next_start_addr { 1884 (section.address, section.size, false) 1885 } else { 1886 let last_addr = std::cmp::min(section.address - 1, region_end); 1887 (next_start_addr, last_addr - next_start_addr + 1, true) 1888 } 1889 } else { 1890 (next_start_addr, region_end - next_start_addr + 1, true) 1891 }; 1892 1893 list.push((start, size, ram)); 1894 1895 if !ram { 1896 current_section = sorted_sections.pop(); 1897 } 1898 1899 next_start_addr = start + size; 1900 1901 if region_start > next_start_addr { 1902 next_start_addr = region_start; 1903 } 1904 1905 if next_start_addr > region_end { 1906 break; 1907 } 1908 } 1909 } 1910 1911 // Once all the interleaved sections have been processed, let's simply 1912 // pull the remaining ones. 1913 if let Some(section) = current_section { 1914 list.push((section.address, section.size, false)); 1915 } 1916 while let Some(section) = sorted_sections.pop() { 1917 list.push((section.address, section.size, false)); 1918 } 1919 1920 list 1921 } 1922 1923 #[cfg(feature = "tdx")] 1924 fn populate_tdx_sections( 1925 &mut self, 1926 sections: &[TdvfSection], 1927 guid_found: bool, 1928 ) -> Result<Option<u64>> { 1929 use arch::x86_64::tdx::*; 1930 // Get the memory end *before* we start adding TDVF ram regions 1931 let boot_guest_memory = self 1932 .memory_manager 1933 .lock() 1934 .as_ref() 1935 .unwrap() 1936 .boot_guest_memory(); 1937 for section in sections { 1938 // No need to allocate if the section falls within guest RAM ranges 1939 if boot_guest_memory.address_in_range(GuestAddress(section.address)) { 1940 info!( 1941 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM", 1942 section 1943 ); 1944 continue; 1945 } 1946 1947 info!("Allocating TDVF Section: {:x?}", section); 1948 self.memory_manager 1949 .lock() 1950 .unwrap() 1951 .add_ram_region(GuestAddress(section.address), section.size as usize) 1952 .map_err(Error::AllocatingTdvfMemory)?; 1953 } 1954 1955 // The TDVF file contains a table of section as well as code 1956 let firmware_path = self 1957 .config 1958 .lock() 1959 .unwrap() 1960 .payload 1961 .as_ref() 1962 .unwrap() 1963 .firmware 1964 .clone() 1965 .ok_or(Error::TdxFirmwareMissing)?; 1966 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1967 1968 // The guest memory at this point now has all the required regions so it 1969 // is safe to copy from the TDVF file into it. 1970 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1971 let mem = guest_memory.memory(); 1972 let mut payload_info = None; 1973 let mut hob_offset = None; 1974 for section in sections { 1975 info!("Populating TDVF Section: {:x?}", section); 1976 match section.r#type { 1977 TdvfSectionType::Bfv | TdvfSectionType::Cfv => { 1978 info!("Copying section to guest memory"); 1979 firmware_file 1980 .seek(SeekFrom::Start(section.data_offset as u64)) 1981 .map_err(Error::LoadTdvf)?; 1982 mem.read_volatile_from( 1983 GuestAddress(section.address), 1984 &mut firmware_file, 1985 section.data_size as usize, 1986 ) 1987 .unwrap(); 1988 } 1989 TdvfSectionType::TdHob => { 1990 hob_offset = Some(section.address); 1991 } 1992 TdvfSectionType::Payload => { 1993 info!("Copying payload to guest memory"); 1994 if let Some(payload_file) = self.kernel.as_mut() { 1995 let payload_size = payload_file 1996 .seek(SeekFrom::End(0)) 1997 .map_err(Error::LoadPayload)?; 1998 1999 payload_file 2000 .seek(SeekFrom::Start(0x1f1)) 2001 .map_err(Error::LoadPayload)?; 2002 2003 let mut payload_header = linux_loader::bootparam::setup_header::default(); 2004 payload_file 2005 .read_volatile(&mut payload_header.as_bytes()) 2006 .unwrap(); 2007 2008 if payload_header.header != 0x5372_6448 { 2009 return Err(Error::InvalidPayloadType); 2010 } 2011 2012 if (payload_header.version < 0x0200) 2013 || ((payload_header.loadflags & 0x1) == 0x0) 2014 { 2015 return Err(Error::InvalidPayloadType); 2016 } 2017 2018 payload_file.rewind().map_err(Error::LoadPayload)?; 2019 mem.read_volatile_from( 2020 GuestAddress(section.address), 2021 payload_file, 2022 payload_size as usize, 2023 ) 2024 .unwrap(); 2025 2026 // Create the payload info that will be inserted into 2027 // the HOB. 2028 payload_info = Some(PayloadInfo { 2029 image_type: PayloadImageType::BzImage, 2030 entry_point: section.address, 2031 }); 2032 } 2033 } 2034 TdvfSectionType::PayloadParam => { 2035 info!("Copying payload parameters to guest memory"); 2036 let cmdline = Self::generate_cmdline( 2037 self.config.lock().unwrap().payload.as_ref().unwrap(), 2038 )?; 2039 mem.write_slice( 2040 cmdline.as_cstring().unwrap().as_bytes_with_nul(), 2041 GuestAddress(section.address), 2042 ) 2043 .unwrap(); 2044 } 2045 _ => {} 2046 } 2047 } 2048 2049 // Generate HOB 2050 let mut hob = TdHob::start(hob_offset.unwrap()); 2051 2052 let mut sorted_sections = sections.to_vec(); 2053 sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem)); 2054 2055 sorted_sections.sort_by_key(|section| section.address); 2056 sorted_sections.reverse(); 2057 2058 for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) { 2059 hob.add_memory_resource(&mem, start, size, ram, guid_found) 2060 .map_err(Error::PopulateHob)?; 2061 } 2062 2063 // MMIO regions 2064 hob.add_mmio_resource( 2065 &mem, 2066 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 2067 arch::layout::APIC_START.raw_value() 2068 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 2069 ) 2070 .map_err(Error::PopulateHob)?; 2071 let start_of_device_area = self 2072 .memory_manager 2073 .lock() 2074 .unwrap() 2075 .start_of_device_area() 2076 .raw_value(); 2077 let end_of_device_area = self 2078 .memory_manager 2079 .lock() 2080 .unwrap() 2081 .end_of_device_area() 2082 .raw_value(); 2083 hob.add_mmio_resource( 2084 &mem, 2085 start_of_device_area, 2086 end_of_device_area - start_of_device_area, 2087 ) 2088 .map_err(Error::PopulateHob)?; 2089 2090 // Loop over the ACPI tables and copy them to the HOB. 2091 2092 for acpi_table in crate::acpi::create_acpi_tables_tdx( 2093 &self.device_manager, 2094 &self.cpu_manager, 2095 &self.memory_manager, 2096 &self.numa_nodes, 2097 ) { 2098 hob.add_acpi_table(&mem, acpi_table.as_slice()) 2099 .map_err(Error::PopulateHob)?; 2100 } 2101 2102 // If a payload info has been created, let's insert it into the HOB. 2103 if let Some(payload_info) = payload_info { 2104 hob.add_payload(&mem, payload_info) 2105 .map_err(Error::PopulateHob)?; 2106 } 2107 2108 hob.finish(&mem).map_err(Error::PopulateHob)?; 2109 2110 Ok(hob_offset) 2111 } 2112 2113 #[cfg(feature = "tdx")] 2114 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 2115 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2116 let mem = guest_memory.memory(); 2117 2118 for section in sections { 2119 self.vm 2120 .tdx_init_memory_region( 2121 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 2122 section.address, 2123 section.size, 2124 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 2125 section.attributes == 1, 2126 ) 2127 .map_err(Error::InitializeTdxMemoryRegion)?; 2128 } 2129 2130 Ok(()) 2131 } 2132 2133 // Creates ACPI tables 2134 // In case of TDX being used, this is a no-op since the tables will be 2135 // created and passed when populating the HOB. 2136 2137 #[cfg(not(target_arch = "riscv64"))] 2138 fn create_acpi_tables(&self) -> Option<GuestAddress> { 2139 #[cfg(feature = "tdx")] 2140 if self.config.lock().unwrap().is_tdx_enabled() { 2141 return None; 2142 } 2143 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 2144 let tpm_enabled = self.config.lock().unwrap().tpm.is_some(); 2145 let rsdp_addr = crate::acpi::create_acpi_tables( 2146 &mem, 2147 &self.device_manager, 2148 &self.cpu_manager, 2149 &self.memory_manager, 2150 &self.numa_nodes, 2151 tpm_enabled, 2152 ); 2153 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 2154 2155 Some(rsdp_addr) 2156 } 2157 2158 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 2159 trace_scoped!("entry_point"); 2160 2161 self.load_payload_handle 2162 .take() 2163 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 2164 .transpose() 2165 } 2166 2167 pub fn boot(&mut self) -> Result<()> { 2168 trace_scoped!("Vm::boot"); 2169 let current_state = self.get_state()?; 2170 if current_state == VmState::Paused { 2171 return self.resume().map_err(Error::Resume); 2172 } 2173 2174 let new_state = if self.stop_on_boot { 2175 VmState::BreakPoint 2176 } else { 2177 VmState::Running 2178 }; 2179 current_state.valid_transition(new_state)?; 2180 2181 // Do earlier to parallelise with loading kernel 2182 #[cfg(target_arch = "x86_64")] 2183 cfg_if::cfg_if! { 2184 if #[cfg(feature = "sev_snp")] { 2185 let sev_snp_enabled = self.config.lock().unwrap().is_sev_snp_enabled(); 2186 let rsdp_addr = if sev_snp_enabled { 2187 // In case of SEV-SNP guest ACPI tables are provided via 2188 // IGVM. So skip the creation of ACPI tables and set the 2189 // rsdp addr to None. 2190 None 2191 } else { 2192 self.create_acpi_tables() 2193 }; 2194 } else { 2195 let rsdp_addr = self.create_acpi_tables(); 2196 } 2197 } 2198 2199 // Load kernel synchronously or if asynchronous then wait for load to 2200 // finish. 2201 let entry_point = self.entry_point()?; 2202 2203 #[cfg(feature = "tdx")] 2204 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2205 2206 // Configure the vcpus that have been created 2207 let vcpus = self.cpu_manager.lock().unwrap().vcpus(); 2208 for vcpu in vcpus { 2209 let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2210 let boot_setup = entry_point.map(|e| (e, guest_memory)); 2211 self.cpu_manager 2212 .lock() 2213 .unwrap() 2214 .configure_vcpu(vcpu, boot_setup) 2215 .map_err(Error::CpuManager)?; 2216 } 2217 2218 #[cfg(feature = "tdx")] 2219 let (sections, guid_found) = if tdx_enabled { 2220 self.extract_tdvf_sections()? 2221 } else { 2222 (Vec::new(), false) 2223 }; 2224 2225 // Configuring the TDX regions requires that the vCPUs are created. 2226 #[cfg(feature = "tdx")] 2227 let hob_address = if tdx_enabled { 2228 // TDX sections are written to memory. 2229 self.populate_tdx_sections(§ions, guid_found)? 2230 } else { 2231 None 2232 }; 2233 2234 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 2235 // available after they are configured 2236 #[cfg(target_arch = "aarch64")] 2237 let rsdp_addr = self.create_acpi_tables(); 2238 2239 #[cfg(not(target_arch = "riscv64"))] 2240 // Configure shared state based on loaded kernel 2241 entry_point 2242 .map(|entry_point| { 2243 // Safe to unwrap rsdp_addr as we know it can't be None when 2244 // the entry_point is Some. 2245 self.configure_system(rsdp_addr.unwrap(), entry_point) 2246 }) 2247 .transpose()?; 2248 2249 #[cfg(target_arch = "riscv64")] 2250 self.configure_system().unwrap(); 2251 2252 #[cfg(target_arch = "x86_64")] 2253 // Note: For x86, always call this function before invoking start boot vcpus. 2254 // Otherwise guest would fail to boot because we haven't created the 2255 // userspace mappings to update the hypervisor about the memory mappings. 2256 // These mappings must be created before we start the vCPU threads for 2257 // the very first time. 2258 self.memory_manager 2259 .lock() 2260 .unwrap() 2261 .allocate_address_space() 2262 .map_err(Error::MemoryManager)?; 2263 2264 #[cfg(feature = "tdx")] 2265 if let Some(hob_address) = hob_address { 2266 // With the HOB address extracted the vCPUs can have 2267 // their TDX state configured. 2268 self.cpu_manager 2269 .lock() 2270 .unwrap() 2271 .initialize_tdx(hob_address) 2272 .map_err(Error::CpuManager)?; 2273 // Let the hypervisor know which memory ranges are shared with the 2274 // guest. This prevents the guest from ignoring/discarding memory 2275 // regions provided by the host. 2276 self.init_tdx_memory(§ions)?; 2277 // With TDX memory and CPU state configured TDX setup is complete 2278 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2279 } 2280 2281 // Resume the vm for MSHV 2282 if current_state == VmState::Created { 2283 self.vm.resume().map_err(Error::ResumeVm)?; 2284 } 2285 2286 self.cpu_manager 2287 .lock() 2288 .unwrap() 2289 .start_boot_vcpus(new_state == VmState::BreakPoint) 2290 .map_err(Error::CpuManager)?; 2291 2292 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2293 *state = new_state; 2294 Ok(()) 2295 } 2296 2297 pub fn restore(&mut self) -> Result<()> { 2298 event!("vm", "restoring"); 2299 2300 #[cfg(target_arch = "x86_64")] 2301 // Note: For x86, always call this function before invoking start boot vcpus. 2302 // Otherwise guest would fail to boot because we haven't created the 2303 // userspace mappings to update the hypervisor about the memory mappings. 2304 // These mappings must be created before we start the vCPU threads for 2305 // the very first time for the restored VM. 2306 self.memory_manager 2307 .lock() 2308 .unwrap() 2309 .allocate_address_space() 2310 .map_err(Error::MemoryManager)?; 2311 2312 // Now we can start all vCPUs from here. 2313 self.cpu_manager 2314 .lock() 2315 .unwrap() 2316 .start_restored_vcpus() 2317 .map_err(Error::CpuManager)?; 2318 2319 event!("vm", "restored"); 2320 Ok(()) 2321 } 2322 2323 /// Gets a thread-safe reference counted pointer to the VM configuration. 2324 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2325 Arc::clone(&self.config) 2326 } 2327 2328 /// Get the VM state. Returns an error if the state is poisoned. 2329 pub fn get_state(&self) -> Result<VmState> { 2330 self.state 2331 .try_read() 2332 .map_err(|_| Error::PoisonedState) 2333 .map(|state| *state) 2334 } 2335 2336 /// Gets the actual size of the balloon. 2337 pub fn balloon_size(&self) -> u64 { 2338 self.device_manager.lock().unwrap().balloon_size() 2339 } 2340 2341 pub fn send_memory_fds( 2342 &mut self, 2343 socket: &mut UnixStream, 2344 ) -> std::result::Result<(), MigratableError> { 2345 for (slot, fd) in self 2346 .memory_manager 2347 .lock() 2348 .unwrap() 2349 .memory_slot_fds() 2350 .drain() 2351 { 2352 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2353 .write_to(socket) 2354 .map_err(|e| { 2355 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2356 })?; 2357 socket 2358 .send_with_fd(&slot.to_le_bytes()[..], fd) 2359 .map_err(|e| { 2360 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2361 })?; 2362 2363 Response::read_from(socket)?.ok_or_abandon( 2364 socket, 2365 MigratableError::MigrateSend(anyhow!("Error during memory fd migration")), 2366 )?; 2367 } 2368 2369 Ok(()) 2370 } 2371 2372 pub fn send_memory_regions<F>( 2373 &mut self, 2374 ranges: &MemoryRangeTable, 2375 fd: &mut F, 2376 ) -> std::result::Result<(), MigratableError> 2377 where 2378 F: WriteVolatile, 2379 { 2380 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2381 let mem = guest_memory.memory(); 2382 2383 for range in ranges.regions() { 2384 let mut offset: u64 = 0; 2385 // Here we are manually handling the retry in case we can't the 2386 // whole region at once because we can't use the implementation 2387 // from vm-memory::GuestMemory of write_all_to() as it is not 2388 // following the correct behavior. For more info about this issue 2389 // see: https://github.com/rust-vmm/vm-memory/issues/174 2390 loop { 2391 let bytes_written = mem 2392 .write_volatile_to( 2393 GuestAddress(range.gpa + offset), 2394 fd, 2395 (range.length - offset) as usize, 2396 ) 2397 .map_err(|e| { 2398 MigratableError::MigrateSend(anyhow!( 2399 "Error transferring memory to socket: {}", 2400 e 2401 )) 2402 })?; 2403 offset += bytes_written as u64; 2404 2405 if offset == range.length { 2406 break; 2407 } 2408 } 2409 } 2410 2411 Ok(()) 2412 } 2413 2414 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2415 self.memory_manager 2416 .lock() 2417 .unwrap() 2418 .memory_range_table(false) 2419 } 2420 2421 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2422 self.device_manager.lock().unwrap().device_tree() 2423 } 2424 2425 pub fn activate_virtio_devices(&self) -> Result<()> { 2426 self.device_manager 2427 .lock() 2428 .unwrap() 2429 .activate_virtio_devices() 2430 .map_err(Error::ActivateVirtioDevices) 2431 } 2432 2433 #[cfg(target_arch = "x86_64")] 2434 pub fn power_button(&self) -> Result<()> { 2435 return self 2436 .device_manager 2437 .lock() 2438 .unwrap() 2439 .notify_power_button() 2440 .map_err(Error::PowerButton); 2441 } 2442 2443 #[cfg(target_arch = "aarch64")] 2444 pub fn power_button(&self) -> Result<()> { 2445 self.device_manager 2446 .lock() 2447 .unwrap() 2448 .notify_power_button() 2449 .map_err(Error::PowerButton) 2450 } 2451 2452 #[cfg(target_arch = "riscv64")] 2453 pub fn power_button(&self) -> Result<()> { 2454 unimplemented!() 2455 } 2456 2457 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2458 self.memory_manager.lock().unwrap().snapshot_data() 2459 } 2460 2461 #[cfg(feature = "guest_debug")] 2462 pub fn debug_request( 2463 &mut self, 2464 gdb_request: &GdbRequestPayload, 2465 cpu_id: usize, 2466 ) -> Result<GdbResponsePayload> { 2467 use GdbRequestPayload::*; 2468 match gdb_request { 2469 SetSingleStep(single_step) => { 2470 self.set_guest_debug(cpu_id, &[], *single_step) 2471 .map_err(Error::Debug)?; 2472 } 2473 SetHwBreakPoint(addrs) => { 2474 self.set_guest_debug(cpu_id, addrs, false) 2475 .map_err(Error::Debug)?; 2476 } 2477 Pause => { 2478 self.debug_pause().map_err(Error::Debug)?; 2479 } 2480 Resume => { 2481 self.debug_resume().map_err(Error::Debug)?; 2482 } 2483 ReadRegs => { 2484 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?; 2485 return Ok(GdbResponsePayload::RegValues(Box::new(regs))); 2486 } 2487 WriteRegs(regs) => { 2488 self.write_regs(cpu_id, regs).map_err(Error::Debug)?; 2489 } 2490 ReadMem(vaddr, len) => { 2491 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2492 let mem = self 2493 .read_mem(&guest_memory, cpu_id, *vaddr, *len) 2494 .map_err(Error::Debug)?; 2495 return Ok(GdbResponsePayload::MemoryRegion(mem)); 2496 } 2497 WriteMem(vaddr, data) => { 2498 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2499 self.write_mem(&guest_memory, cpu_id, vaddr, data) 2500 .map_err(Error::Debug)?; 2501 } 2502 ActiveVcpus => { 2503 let active_vcpus = self.active_vcpus(); 2504 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus)); 2505 } 2506 } 2507 Ok(GdbResponsePayload::CommandComplete) 2508 } 2509 2510 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2511 fn get_dump_state( 2512 &mut self, 2513 destination_url: &str, 2514 ) -> std::result::Result<DumpState, GuestDebuggableError> { 2515 let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32; 2516 let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize; 2517 let mut elf_phdr_num = 1; 2518 let elf_sh_info = 0; 2519 let coredump_file_path = url_to_file(destination_url)?; 2520 let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings(); 2521 2522 if mapping_num < UINT16_MAX - 2 { 2523 elf_phdr_num += mapping_num as u16; 2524 } else { 2525 panic!("mapping num beyond 65535 not supported"); 2526 } 2527 let coredump_file = OpenOptions::new() 2528 .read(true) 2529 .write(true) 2530 .create_new(true) 2531 .open(coredump_file_path) 2532 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2533 2534 let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size); 2535 let mem_data = self 2536 .memory_manager 2537 .lock() 2538 .unwrap() 2539 .coredump_memory_regions(mem_offset); 2540 2541 Ok(DumpState { 2542 elf_note_size, 2543 elf_phdr_num, 2544 elf_sh_info, 2545 mem_offset, 2546 mem_info: Some(mem_data), 2547 file: Some(coredump_file), 2548 }) 2549 } 2550 2551 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2552 fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 { 2553 size_of::<elf::Elf64_Ehdr>() as u64 2554 + note_size as u64 2555 + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64 2556 } 2557 2558 pub fn nmi(&self) -> Result<()> { 2559 return self 2560 .cpu_manager 2561 .lock() 2562 .unwrap() 2563 .nmi() 2564 .map_err(|_| Error::ErrorNmi); 2565 } 2566 } 2567 2568 impl Pausable for Vm { 2569 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2570 event!("vm", "pausing"); 2571 let mut state = self 2572 .state 2573 .try_write() 2574 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?; 2575 let new_state = VmState::Paused; 2576 2577 state 2578 .valid_transition(new_state) 2579 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?; 2580 2581 #[cfg(target_arch = "x86_64")] 2582 { 2583 let mut clock = self 2584 .vm 2585 .get_clock() 2586 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?; 2587 clock.reset_flags(); 2588 self.saved_clock = Some(clock); 2589 } 2590 2591 // Before pausing the vCPUs activate any pending virtio devices that might 2592 // need activation between starting the pause (or e.g. a migration it's part of) 2593 self.activate_virtio_devices().map_err(|e| { 2594 MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e)) 2595 })?; 2596 2597 self.cpu_manager.lock().unwrap().pause()?; 2598 self.device_manager.lock().unwrap().pause()?; 2599 2600 self.vm 2601 .pause() 2602 .map_err(|e| MigratableError::Pause(anyhow!("Could not pause the VM: {}", e)))?; 2603 2604 *state = new_state; 2605 2606 event!("vm", "paused"); 2607 Ok(()) 2608 } 2609 2610 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2611 event!("vm", "resuming"); 2612 let current_state = self.get_state().unwrap(); 2613 let mut state = self 2614 .state 2615 .try_write() 2616 .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?; 2617 let new_state = VmState::Running; 2618 2619 state 2620 .valid_transition(new_state) 2621 .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?; 2622 2623 self.cpu_manager.lock().unwrap().resume()?; 2624 #[cfg(target_arch = "x86_64")] 2625 { 2626 if let Some(clock) = &self.saved_clock { 2627 self.vm.set_clock(clock).map_err(|e| { 2628 MigratableError::Resume(anyhow!("Could not set VM clock: {}", e)) 2629 })?; 2630 } 2631 } 2632 2633 if current_state == VmState::Paused { 2634 self.vm 2635 .resume() 2636 .map_err(|e| MigratableError::Resume(anyhow!("Could not resume the VM: {}", e)))?; 2637 } 2638 2639 self.device_manager.lock().unwrap().resume()?; 2640 2641 // And we're back to the Running state. 2642 *state = new_state; 2643 event!("vm", "resumed"); 2644 Ok(()) 2645 } 2646 } 2647 2648 #[derive(Serialize, Deserialize)] 2649 pub struct VmSnapshot { 2650 #[cfg(target_arch = "x86_64")] 2651 pub clock: Option<hypervisor::ClockData>, 2652 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2653 pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>, 2654 } 2655 2656 pub const VM_SNAPSHOT_ID: &str = "vm"; 2657 impl Snapshottable for Vm { 2658 fn id(&self) -> String { 2659 VM_SNAPSHOT_ID.to_string() 2660 } 2661 2662 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2663 event!("vm", "snapshotting"); 2664 2665 #[cfg(feature = "tdx")] 2666 { 2667 if self.config.lock().unwrap().is_tdx_enabled() { 2668 return Err(MigratableError::Snapshot(anyhow!( 2669 "Snapshot not possible with TDX VM" 2670 ))); 2671 } 2672 } 2673 2674 let current_state = self.get_state().unwrap(); 2675 if current_state != VmState::Paused { 2676 return Err(MigratableError::Snapshot(anyhow!( 2677 "Trying to snapshot while VM is running" 2678 ))); 2679 } 2680 2681 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2682 let common_cpuid = { 2683 let amx = self.config.lock().unwrap().cpus.features.amx; 2684 let phys_bits = physical_bits( 2685 &self.hypervisor, 2686 self.config.lock().unwrap().cpus.max_phys_bits, 2687 ); 2688 arch::generate_common_cpuid( 2689 &self.hypervisor, 2690 &arch::CpuidConfig { 2691 sgx_epc_sections: None, 2692 phys_bits, 2693 kvm_hyperv: self.config.lock().unwrap().cpus.kvm_hyperv, 2694 #[cfg(feature = "tdx")] 2695 tdx: false, 2696 amx, 2697 }, 2698 ) 2699 .map_err(|e| { 2700 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e)) 2701 })? 2702 }; 2703 2704 let vm_snapshot_state = VmSnapshot { 2705 #[cfg(target_arch = "x86_64")] 2706 clock: self.saved_clock, 2707 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2708 common_cpuid, 2709 }; 2710 2711 let mut vm_snapshot = Snapshot::new_from_state(&vm_snapshot_state)?; 2712 2713 let (id, snapshot) = { 2714 let mut cpu_manager = self.cpu_manager.lock().unwrap(); 2715 (cpu_manager.id(), cpu_manager.snapshot()?) 2716 }; 2717 vm_snapshot.add_snapshot(id, snapshot); 2718 let (id, snapshot) = { 2719 let mut memory_manager = self.memory_manager.lock().unwrap(); 2720 (memory_manager.id(), memory_manager.snapshot()?) 2721 }; 2722 vm_snapshot.add_snapshot(id, snapshot); 2723 let (id, snapshot) = { 2724 let mut device_manager = self.device_manager.lock().unwrap(); 2725 (device_manager.id(), device_manager.snapshot()?) 2726 }; 2727 vm_snapshot.add_snapshot(id, snapshot); 2728 2729 event!("vm", "snapshotted"); 2730 Ok(vm_snapshot) 2731 } 2732 } 2733 2734 impl Transportable for Vm { 2735 fn send( 2736 &self, 2737 snapshot: &Snapshot, 2738 destination_url: &str, 2739 ) -> std::result::Result<(), MigratableError> { 2740 let mut snapshot_config_path = url_to_path(destination_url)?; 2741 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2742 2743 // Create the snapshot config file 2744 let mut snapshot_config_file = OpenOptions::new() 2745 .read(true) 2746 .write(true) 2747 .create_new(true) 2748 .open(snapshot_config_path) 2749 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2750 2751 // Serialize and write the snapshot config 2752 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2753 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2754 2755 snapshot_config_file 2756 .write(vm_config.as_bytes()) 2757 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2758 2759 let mut snapshot_state_path = url_to_path(destination_url)?; 2760 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2761 2762 // Create the snapshot state file 2763 let mut snapshot_state_file = OpenOptions::new() 2764 .read(true) 2765 .write(true) 2766 .create_new(true) 2767 .open(snapshot_state_path) 2768 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2769 2770 // Serialize and write the snapshot state 2771 let vm_state = 2772 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2773 2774 snapshot_state_file 2775 .write(&vm_state) 2776 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2777 2778 // Tell the memory manager to also send/write its own snapshot. 2779 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2780 self.memory_manager 2781 .lock() 2782 .unwrap() 2783 .send(&memory_manager_snapshot.clone(), destination_url)?; 2784 } else { 2785 return Err(MigratableError::Restore(anyhow!( 2786 "Missing memory manager snapshot" 2787 ))); 2788 } 2789 2790 Ok(()) 2791 } 2792 } 2793 2794 impl Migratable for Vm { 2795 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2796 self.memory_manager.lock().unwrap().start_dirty_log()?; 2797 self.device_manager.lock().unwrap().start_dirty_log() 2798 } 2799 2800 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2801 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2802 self.device_manager.lock().unwrap().stop_dirty_log() 2803 } 2804 2805 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2806 Ok(MemoryRangeTable::new_from_tables(vec![ 2807 self.memory_manager.lock().unwrap().dirty_log()?, 2808 self.device_manager.lock().unwrap().dirty_log()?, 2809 ])) 2810 } 2811 2812 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2813 self.memory_manager.lock().unwrap().start_migration()?; 2814 self.device_manager.lock().unwrap().start_migration() 2815 } 2816 2817 fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { 2818 self.memory_manager.lock().unwrap().complete_migration()?; 2819 self.device_manager.lock().unwrap().complete_migration() 2820 } 2821 } 2822 2823 #[cfg(feature = "guest_debug")] 2824 impl Debuggable for Vm { 2825 fn set_guest_debug( 2826 &self, 2827 cpu_id: usize, 2828 addrs: &[GuestAddress], 2829 singlestep: bool, 2830 ) -> std::result::Result<(), DebuggableError> { 2831 self.cpu_manager 2832 .lock() 2833 .unwrap() 2834 .set_guest_debug(cpu_id, addrs, singlestep) 2835 } 2836 2837 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2838 if *self.state.read().unwrap() == VmState::Running { 2839 self.pause().map_err(DebuggableError::Pause)?; 2840 } 2841 2842 let mut state = self 2843 .state 2844 .try_write() 2845 .map_err(|_| DebuggableError::PoisonedState)?; 2846 *state = VmState::BreakPoint; 2847 Ok(()) 2848 } 2849 2850 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2851 if *self.state.read().unwrap() == VmState::BreakPoint { 2852 self.resume().map_err(DebuggableError::Pause)?; 2853 } 2854 2855 Ok(()) 2856 } 2857 2858 fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> { 2859 self.cpu_manager.lock().unwrap().read_regs(cpu_id) 2860 } 2861 2862 fn write_regs( 2863 &self, 2864 cpu_id: usize, 2865 regs: &CoreRegs, 2866 ) -> std::result::Result<(), DebuggableError> { 2867 self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs) 2868 } 2869 2870 fn read_mem( 2871 &self, 2872 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2873 cpu_id: usize, 2874 vaddr: GuestAddress, 2875 len: usize, 2876 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2877 self.cpu_manager 2878 .lock() 2879 .unwrap() 2880 .read_mem(guest_memory, cpu_id, vaddr, len) 2881 } 2882 2883 fn write_mem( 2884 &self, 2885 guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>, 2886 cpu_id: usize, 2887 vaddr: &GuestAddress, 2888 data: &[u8], 2889 ) -> std::result::Result<(), DebuggableError> { 2890 self.cpu_manager 2891 .lock() 2892 .unwrap() 2893 .write_mem(guest_memory, cpu_id, vaddr, data) 2894 } 2895 2896 fn active_vcpus(&self) -> usize { 2897 let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus(); 2898 if active_vcpus > 0 { 2899 active_vcpus 2900 } else { 2901 // The VM is not booted yet. Report boot_vcpus() instead. 2902 self.cpu_manager.lock().unwrap().boot_vcpus() as usize 2903 } 2904 } 2905 } 2906 2907 #[cfg(feature = "guest_debug")] 2908 pub const UINT16_MAX: u32 = 65535; 2909 2910 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2911 impl Elf64Writable for Vm {} 2912 2913 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2914 impl GuestDebuggable for Vm { 2915 fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> { 2916 event!("vm", "coredumping"); 2917 2918 let mut resume = false; 2919 2920 #[cfg(feature = "tdx")] 2921 { 2922 if let Some(ref platform) = self.config.lock().unwrap().platform { 2923 if platform.tdx { 2924 return Err(GuestDebuggableError::Coredump(anyhow!( 2925 "Coredump not possible with TDX VM" 2926 ))); 2927 } 2928 } 2929 } 2930 2931 match self.get_state().unwrap() { 2932 VmState::Running => { 2933 self.pause().map_err(GuestDebuggableError::Pause)?; 2934 resume = true; 2935 } 2936 VmState::Paused => {} 2937 _ => { 2938 return Err(GuestDebuggableError::Coredump(anyhow!( 2939 "Trying to coredump while VM is not running or paused" 2940 ))); 2941 } 2942 } 2943 2944 let coredump_state = self.get_dump_state(destination_url)?; 2945 2946 self.write_header(&coredump_state)?; 2947 self.write_note(&coredump_state)?; 2948 self.write_loads(&coredump_state)?; 2949 2950 self.cpu_manager 2951 .lock() 2952 .unwrap() 2953 .cpu_write_elf64_note(&coredump_state)?; 2954 self.cpu_manager 2955 .lock() 2956 .unwrap() 2957 .cpu_write_vmm_note(&coredump_state)?; 2958 2959 self.memory_manager 2960 .lock() 2961 .unwrap() 2962 .coredump_iterate_save_mem(&coredump_state)?; 2963 2964 if resume { 2965 self.resume().map_err(GuestDebuggableError::Resume)?; 2966 } 2967 2968 Ok(()) 2969 } 2970 } 2971 2972 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2973 #[cfg(test)] 2974 mod tests { 2975 use super::*; 2976 2977 fn test_vm_state_transitions(state: VmState) { 2978 match state { 2979 VmState::Created => { 2980 // Check the transitions from Created 2981 state.valid_transition(VmState::Created).unwrap_err(); 2982 state.valid_transition(VmState::Running).unwrap(); 2983 state.valid_transition(VmState::Shutdown).unwrap(); 2984 state.valid_transition(VmState::Paused).unwrap(); 2985 state.valid_transition(VmState::BreakPoint).unwrap(); 2986 } 2987 VmState::Running => { 2988 // Check the transitions from Running 2989 state.valid_transition(VmState::Created).unwrap_err(); 2990 state.valid_transition(VmState::Running).unwrap_err(); 2991 state.valid_transition(VmState::Shutdown).unwrap(); 2992 state.valid_transition(VmState::Paused).unwrap(); 2993 state.valid_transition(VmState::BreakPoint).unwrap(); 2994 } 2995 VmState::Shutdown => { 2996 // Check the transitions from Shutdown 2997 state.valid_transition(VmState::Created).unwrap_err(); 2998 state.valid_transition(VmState::Running).unwrap(); 2999 state.valid_transition(VmState::Shutdown).unwrap_err(); 3000 state.valid_transition(VmState::Paused).unwrap_err(); 3001 state.valid_transition(VmState::BreakPoint).unwrap_err(); 3002 } 3003 VmState::Paused => { 3004 // Check the transitions from Paused 3005 state.valid_transition(VmState::Created).unwrap_err(); 3006 state.valid_transition(VmState::Running).unwrap(); 3007 state.valid_transition(VmState::Shutdown).unwrap(); 3008 state.valid_transition(VmState::Paused).unwrap_err(); 3009 state.valid_transition(VmState::BreakPoint).unwrap_err(); 3010 } 3011 VmState::BreakPoint => { 3012 // Check the transitions from Breakpoint 3013 state.valid_transition(VmState::Created).unwrap(); 3014 state.valid_transition(VmState::Running).unwrap(); 3015 state.valid_transition(VmState::Shutdown).unwrap_err(); 3016 state.valid_transition(VmState::Paused).unwrap_err(); 3017 state.valid_transition(VmState::BreakPoint).unwrap_err(); 3018 } 3019 } 3020 } 3021 3022 #[test] 3023 fn test_vm_created_transitions() { 3024 test_vm_state_transitions(VmState::Created); 3025 } 3026 3027 #[test] 3028 fn test_vm_running_transitions() { 3029 test_vm_state_transitions(VmState::Running); 3030 } 3031 3032 #[test] 3033 fn test_vm_shutdown_transitions() { 3034 test_vm_state_transitions(VmState::Shutdown); 3035 } 3036 3037 #[test] 3038 fn test_vm_paused_transitions() { 3039 test_vm_state_transitions(VmState::Paused); 3040 } 3041 3042 #[cfg(feature = "tdx")] 3043 #[test] 3044 fn test_hob_memory_resources() { 3045 // Case 1: Two TDVF sections in the middle of the RAM 3046 let sections = vec![ 3047 TdvfSection { 3048 address: 0xc000, 3049 size: 0x1000, 3050 ..Default::default() 3051 }, 3052 TdvfSection { 3053 address: 0x1000, 3054 size: 0x4000, 3055 ..Default::default() 3056 }, 3057 ]; 3058 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)]; 3059 let expected = vec![ 3060 (0, 0x1000, true), 3061 (0x1000, 0x4000, false), 3062 (0x5000, 0x7000, true), 3063 (0xc000, 0x1000, false), 3064 (0xd000, 0x0fff_3000, true), 3065 ]; 3066 assert_eq!( 3067 expected, 3068 Vm::hob_memory_resources( 3069 sections, 3070 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3071 ) 3072 ); 3073 3074 // Case 2: Two TDVF sections with no conflict with the RAM 3075 let sections = vec![ 3076 TdvfSection { 3077 address: 0x1000_1000, 3078 size: 0x1000, 3079 ..Default::default() 3080 }, 3081 TdvfSection { 3082 address: 0, 3083 size: 0x1000, 3084 ..Default::default() 3085 }, 3086 ]; 3087 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3088 let expected = vec![ 3089 (0, 0x1000, false), 3090 (0x1000, 0x1000_0000, true), 3091 (0x1000_1000, 0x1000, false), 3092 ]; 3093 assert_eq!( 3094 expected, 3095 Vm::hob_memory_resources( 3096 sections, 3097 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3098 ) 3099 ); 3100 3101 // Case 3: Two TDVF sections with partial conflicts with the RAM 3102 let sections = vec![ 3103 TdvfSection { 3104 address: 0x1000_0000, 3105 size: 0x2000, 3106 ..Default::default() 3107 }, 3108 TdvfSection { 3109 address: 0, 3110 size: 0x2000, 3111 ..Default::default() 3112 }, 3113 ]; 3114 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3115 let expected = vec![ 3116 (0, 0x2000, false), 3117 (0x2000, 0x0fff_e000, true), 3118 (0x1000_0000, 0x2000, false), 3119 ]; 3120 assert_eq!( 3121 expected, 3122 Vm::hob_memory_resources( 3123 sections, 3124 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3125 ) 3126 ); 3127 3128 // Case 4: Two TDVF sections with no conflict before the RAM and two 3129 // more additional sections with no conflict after the RAM. 3130 let sections = vec![ 3131 TdvfSection { 3132 address: 0x2000_1000, 3133 size: 0x1000, 3134 ..Default::default() 3135 }, 3136 TdvfSection { 3137 address: 0x2000_0000, 3138 size: 0x1000, 3139 ..Default::default() 3140 }, 3141 TdvfSection { 3142 address: 0x1000, 3143 size: 0x1000, 3144 ..Default::default() 3145 }, 3146 TdvfSection { 3147 address: 0, 3148 size: 0x1000, 3149 ..Default::default() 3150 }, 3151 ]; 3152 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)]; 3153 let expected = vec![ 3154 (0, 0x1000, false), 3155 (0x1000, 0x1000, false), 3156 (0x4000, 0x1000_0000, true), 3157 (0x2000_0000, 0x1000, false), 3158 (0x2000_1000, 0x1000, false), 3159 ]; 3160 assert_eq!( 3161 expected, 3162 Vm::hob_memory_resources( 3163 sections, 3164 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3165 ) 3166 ); 3167 3168 // Case 5: One TDVF section overriding the entire RAM 3169 let sections = vec![TdvfSection { 3170 address: 0, 3171 size: 0x2000_0000, 3172 ..Default::default() 3173 }]; 3174 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3175 let expected = vec![(0, 0x2000_0000, false)]; 3176 assert_eq!( 3177 expected, 3178 Vm::hob_memory_resources( 3179 sections, 3180 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3181 ) 3182 ); 3183 3184 // Case 6: Two TDVF sections with no conflict with 2 RAM regions 3185 let sections = vec![ 3186 TdvfSection { 3187 address: 0x1000_2000, 3188 size: 0x2000, 3189 ..Default::default() 3190 }, 3191 TdvfSection { 3192 address: 0, 3193 size: 0x2000, 3194 ..Default::default() 3195 }, 3196 ]; 3197 let guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3198 (GuestAddress(0x2000), 0x1000_0000), 3199 (GuestAddress(0x1000_4000), 0x1000_0000), 3200 ]; 3201 let expected = vec![ 3202 (0, 0x2000, false), 3203 (0x2000, 0x1000_0000, true), 3204 (0x1000_2000, 0x2000, false), 3205 (0x1000_4000, 0x1000_0000, true), 3206 ]; 3207 assert_eq!( 3208 expected, 3209 Vm::hob_memory_resources( 3210 sections, 3211 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3212 ) 3213 ); 3214 3215 // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions 3216 let sections = vec![ 3217 TdvfSection { 3218 address: 0x1000_0000, 3219 size: 0x4000, 3220 ..Default::default() 3221 }, 3222 TdvfSection { 3223 address: 0, 3224 size: 0x4000, 3225 ..Default::default() 3226 }, 3227 ]; 3228 let guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3229 (GuestAddress(0x1000), 0x1000_0000), 3230 (GuestAddress(0x1000_3000), 0x1000_0000), 3231 ]; 3232 let expected = vec![ 3233 (0, 0x4000, false), 3234 (0x4000, 0x0fff_c000, true), 3235 (0x1000_0000, 0x4000, false), 3236 (0x1000_4000, 0x0fff_f000, true), 3237 ]; 3238 assert_eq!( 3239 expected, 3240 Vm::hob_memory_resources( 3241 sections, 3242 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3243 ) 3244 ); 3245 } 3246 } 3247 3248 #[cfg(target_arch = "aarch64")] 3249 #[cfg(test)] 3250 mod tests { 3251 use arch::aarch64::fdt::create_fdt; 3252 use arch::aarch64::layout; 3253 use arch::{DeviceType, MmioDeviceInfo}; 3254 use devices::gic::Gic; 3255 3256 use super::*; 3257 3258 const LEN: u64 = 4096; 3259 3260 #[test] 3261 fn test_create_fdt_with_devices() { 3262 let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)]; 3263 let mem = GuestMemoryMmap::from_ranges(®ions).expect("Cannot initialize memory"); 3264 3265 let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [ 3266 ( 3267 (DeviceType::Serial, DeviceType::Serial.to_string()), 3268 MmioDeviceInfo { 3269 addr: 0x00, 3270 len: LEN, 3271 irq: 33, 3272 }, 3273 ), 3274 ( 3275 (DeviceType::Virtio(1), "virtio".to_string()), 3276 MmioDeviceInfo { 3277 addr: LEN, 3278 len: LEN, 3279 irq: 34, 3280 }, 3281 ), 3282 ( 3283 (DeviceType::Rtc, "rtc".to_string()), 3284 MmioDeviceInfo { 3285 addr: 2 * LEN, 3286 len: LEN, 3287 irq: 35, 3288 }, 3289 ), 3290 ] 3291 .iter() 3292 .cloned() 3293 .collect(); 3294 3295 let hv = hypervisor::new().unwrap(); 3296 let vm = hv.create_vm().unwrap(); 3297 let gic = vm 3298 .create_vgic(Gic::create_default_config(1)) 3299 .expect("Cannot create gic"); 3300 create_fdt( 3301 &mem, 3302 "console=tty0", 3303 vec![0], 3304 Some((0, 0, 0)), 3305 &dev_info, 3306 &gic, 3307 &None, 3308 &Vec::new(), 3309 &BTreeMap::new(), 3310 None, 3311 true, 3312 ) 3313 .unwrap(); 3314 } 3315 } 3316 3317 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 3318 #[test] 3319 pub fn test_vm() { 3320 use hypervisor::VmExit; 3321 use vm_memory::{Address, GuestMemory, GuestMemoryRegion}; 3322 // This example based on https://lwn.net/Articles/658511/ 3323 let code = [ 3324 0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */ 3325 0x00, 0xd8, /* add %bl, %al */ 3326 0x04, b'0', /* add $'0', %al */ 3327 0xee, /* out %al, (%dx) */ 3328 0xb0, b'\n', /* mov $'\n', %al */ 3329 0xee, /* out %al, (%dx) */ 3330 0xf4, /* hlt */ 3331 ]; 3332 3333 let mem_size = 0x1000; 3334 let load_addr = GuestAddress(0x1000); 3335 let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap(); 3336 3337 let hv = hypervisor::new().unwrap(); 3338 let vm = hv.create_vm().expect("new VM creation failed"); 3339 3340 for (index, region) in mem.iter().enumerate() { 3341 let mem_region = vm.make_user_memory_region( 3342 index as u32, 3343 region.start_addr().raw_value(), 3344 region.len(), 3345 region.as_ptr() as u64, 3346 false, 3347 false, 3348 ); 3349 3350 vm.create_user_memory_region(mem_region) 3351 .expect("Cannot configure guest memory"); 3352 } 3353 mem.write_slice(&code, load_addr) 3354 .expect("Writing code to memory failed"); 3355 3356 let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed"); 3357 3358 let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed"); 3359 vcpu_sregs.cs.base = 0; 3360 vcpu_sregs.cs.selector = 0; 3361 vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed"); 3362 3363 let mut vcpu_regs = vcpu.get_regs().expect("get regs failed"); 3364 vcpu_regs.set_rip(0x1000); 3365 vcpu_regs.set_rax(2); 3366 vcpu_regs.set_rbx(3); 3367 vcpu_regs.set_rflags(2); 3368 vcpu.set_regs(&vcpu_regs).expect("set regs failed"); 3369 3370 loop { 3371 match vcpu.run().expect("run failed") { 3372 VmExit::Reset => { 3373 println!("HLT"); 3374 break; 3375 } 3376 VmExit::Ignore => {} 3377 r => panic!("unexpected exit reason: {r:?}"), 3378 } 3379 } 3380 } 3381