1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::{ 15 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 16 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 17 }; 18 use crate::config::{NumaConfig, PayloadConfig}; 19 #[cfg(feature = "guest_debug")] 20 use crate::coredump::{ 21 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 22 }; 23 use crate::cpu; 24 use crate::device_manager::{Console, DeviceManager, DeviceManagerError, PtyPair}; 25 use crate::device_tree::DeviceTree; 26 #[cfg(feature = "guest_debug")] 27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 28 use crate::memory_manager::{ 29 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 30 }; 31 #[cfg(feature = "guest_debug")] 32 use crate::migration::url_to_file; 33 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 34 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 35 use crate::GuestMemoryMmap; 36 use crate::{ 37 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 38 }; 39 use anyhow::anyhow; 40 use arch::get_host_cpu_phys_bits; 41 #[cfg(target_arch = "x86_64")] 42 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 43 #[cfg(feature = "tdx")] 44 use arch::x86_64::tdx::TdvfSection; 45 use arch::EntryPoint; 46 #[cfg(target_arch = "aarch64")] 47 use arch::PciSpaceInfo; 48 use arch::{NumaNode, NumaNodes}; 49 #[cfg(target_arch = "aarch64")] 50 
use devices::gic::{Gic, GIC_V3_ITS_SNAPSHOT_ID}; 51 #[cfg(target_arch = "aarch64")] 52 use devices::interrupt_controller::{self, InterruptController}; 53 use devices::AcpiNotificationFlags; 54 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 55 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 56 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 57 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 58 use hypervisor::{HypervisorVmError, VmOps}; 59 use linux_loader::cmdline::Cmdline; 60 #[cfg(feature = "guest_debug")] 61 use linux_loader::elf; 62 #[cfg(target_arch = "x86_64")] 63 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 64 #[cfg(target_arch = "aarch64")] 65 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 66 use linux_loader::loader::KernelLoader; 67 use seccompiler::{apply_filter, SeccompAction}; 68 use serde::{Deserialize, Serialize}; 69 use signal_hook::{consts::SIGWINCH, iterator::backend::Handle, iterator::Signals}; 70 use std::cmp; 71 use std::collections::BTreeMap; 72 use std::collections::HashMap; 73 use std::convert::TryInto; 74 use std::fs::{File, OpenOptions}; 75 use std::io::{self, Read, Write}; 76 use std::io::{Seek, SeekFrom}; 77 #[cfg(feature = "tdx")] 78 use std::mem; 79 #[cfg(feature = "guest_debug")] 80 use std::mem::size_of; 81 use std::num::Wrapping; 82 use std::ops::Deref; 83 use std::os::unix::net::UnixStream; 84 use std::panic::AssertUnwindSafe; 85 use std::sync::{Arc, Mutex, RwLock}; 86 use std::time::Instant; 87 use std::{result, str, thread}; 88 use thiserror::Error; 89 use tracer::trace_scoped; 90 use vm_device::Bus; 91 #[cfg(target_arch = "x86_64")] 92 use vm_device::BusDevice; 93 #[cfg(target_arch = "x86_64")] 94 use vm_memory::Address; 95 #[cfg(feature = "tdx")] 96 use vm_memory::{ByteValued, GuestMemory, GuestMemoryRegion}; 97 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic}; 98 use vm_migration::protocol::{Request, Response, 
Status}; 99 use vm_migration::{ 100 protocol::MemoryRangeTable, Migratable, MigratableError, Pausable, Snapshot, 101 SnapshotDataSection, Snapshottable, Transportable, 102 }; 103 use vmm_sys_util::eventfd::EventFd; 104 use vmm_sys_util::signal::unblock_signal; 105 use vmm_sys_util::sock_ctrl_msg::ScmSocket; 106 use vmm_sys_util::terminal::Terminal; 107 108 /// Errors associated with VM management 109 #[derive(Debug, Error)] 110 pub enum Error { 111 #[error("Cannot open kernel file: {0}")] 112 KernelFile(#[source] io::Error), 113 114 #[error("Cannot open initramfs file: {0}")] 115 InitramfsFile(#[source] io::Error), 116 117 #[error("Cannot load the kernel into memory: {0}")] 118 KernelLoad(#[source] linux_loader::loader::Error), 119 120 #[cfg(target_arch = "aarch64")] 121 #[error("Cannot load the UEFI binary in memory: {0:?}")] 122 UefiLoad(arch::aarch64::uefi::Error), 123 124 #[error("Cannot load the initramfs into memory")] 125 InitramfsLoad, 126 127 #[error("Cannot load the kernel command line in memory: {0}")] 128 LoadCmdLine(#[source] linux_loader::loader::Error), 129 130 #[error("Cannot modify the kernel command line: {0}")] 131 CmdLineInsertStr(#[source] linux_loader::cmdline::Error), 132 133 #[error("Cannot configure system: {0}")] 134 ConfigureSystem(#[source] arch::Error), 135 136 #[cfg(target_arch = "aarch64")] 137 #[error("Cannot enable interrupt controller: {0:?}")] 138 EnableInterruptController(interrupt_controller::Error), 139 140 #[error("VM state is poisoned")] 141 PoisonedState, 142 143 #[error("Error from device manager: {0:?}")] 144 DeviceManager(DeviceManagerError), 145 146 #[error("Cannot setup terminal in raw mode: {0}")] 147 SetTerminalRaw(#[source] vmm_sys_util::errno::Error), 148 149 #[error("Cannot setup terminal in canonical mode.: {0}")] 150 SetTerminalCanon(#[source] vmm_sys_util::errno::Error), 151 152 #[error("Cannot spawn a signal handler thread: {0}")] 153 SignalHandlerSpawn(#[source] io::Error), 154 155 #[error("Failed to join on 
threads: {0:?}")] 156 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 157 158 #[error("VM config is missing")] 159 VmMissingConfig, 160 161 #[error("VM is not created")] 162 VmNotCreated, 163 164 #[error("VM is already created")] 165 VmAlreadyCreated, 166 167 #[error("VM is not running")] 168 VmNotRunning, 169 170 #[error("Cannot clone EventFd: {0}")] 171 EventFdClone(#[source] io::Error), 172 173 #[error("invalid VM state transition: {0:?} to {1:?}")] 174 InvalidStateTransition(VmState, VmState), 175 176 #[error("Error from CPU manager: {0}")] 177 CpuManager(#[source] cpu::Error), 178 179 #[error("Cannot pause devices: {0}")] 180 PauseDevices(#[source] MigratableError), 181 182 #[error("Cannot resume devices: {0}")] 183 ResumeDevices(#[source] MigratableError), 184 185 #[error("Cannot pause CPUs: {0}")] 186 PauseCpus(#[source] MigratableError), 187 188 #[error("Cannot resume cpus: {0}")] 189 ResumeCpus(#[source] MigratableError), 190 191 #[error("Cannot pause VM: {0}")] 192 Pause(#[source] MigratableError), 193 194 #[error("Cannot resume VM: {0}")] 195 Resume(#[source] MigratableError), 196 197 #[error("Memory manager error: {0:?}")] 198 MemoryManager(MemoryManagerError), 199 200 #[error("Eventfd write error: {0}")] 201 EventfdError(#[source] std::io::Error), 202 203 #[error("Cannot snapshot VM: {0}")] 204 Snapshot(#[source] MigratableError), 205 206 #[error("Cannot restore VM: {0}")] 207 Restore(#[source] MigratableError), 208 209 #[error("Cannot send VM snapshot: {0}")] 210 SnapshotSend(#[source] MigratableError), 211 212 #[error("Invalid restore source URL")] 213 InvalidRestoreSourceUrl, 214 215 #[error("Failed to validate config: {0}")] 216 ConfigValidation(#[source] ValidationError), 217 218 #[error("Too many virtio-vsock devices")] 219 TooManyVsockDevices, 220 221 #[error("Failed serializing into JSON: {0}")] 222 SerializeJson(#[source] serde_json::Error), 223 224 #[error("Invalid NUMA configuration")] 225 InvalidNumaConfig, 226 227 
#[error("Cannot create seccomp filter: {0}")] 228 CreateSeccompFilter(#[source] seccompiler::Error), 229 230 #[error("Cannot apply seccomp filter: {0}")] 231 ApplySeccompFilter(#[source] seccompiler::Error), 232 233 #[error("Failed resizing a memory zone")] 234 ResizeZone, 235 236 #[error("Cannot activate virtio devices: {0:?}")] 237 ActivateVirtioDevices(DeviceManagerError), 238 239 #[error("Error triggering power button: {0:?}")] 240 PowerButton(DeviceManagerError), 241 242 #[error("Kernel lacks PVH header")] 243 KernelMissingPvhHeader, 244 245 #[error("Failed to allocate firmware RAM: {0:?}")] 246 AllocateFirmwareMemory(MemoryManagerError), 247 248 #[error("Error manipulating firmware file: {0}")] 249 FirmwareFile(#[source] std::io::Error), 250 251 #[error("Firmware too big")] 252 FirmwareTooLarge, 253 254 #[error("Failed to copy firmware to memory: {0}")] 255 FirmwareLoad(#[source] vm_memory::GuestMemoryError), 256 257 #[cfg(feature = "tdx")] 258 #[error("Error performing I/O on TDX firmware file: {0}")] 259 LoadTdvf(#[source] std::io::Error), 260 261 #[cfg(feature = "tdx")] 262 #[error("Error performing I/O on the TDX payload file: {0}")] 263 LoadPayload(#[source] std::io::Error), 264 265 #[cfg(feature = "tdx")] 266 #[error("Error parsing TDVF: {0}")] 267 ParseTdvf(#[source] arch::x86_64::tdx::TdvfError), 268 269 #[cfg(feature = "tdx")] 270 #[error("Error populating TDX HOB: {0}")] 271 PopulateHob(#[source] arch::x86_64::tdx::TdvfError), 272 273 #[cfg(feature = "tdx")] 274 #[error("Error allocating TDVF memory: {0:?}")] 275 AllocatingTdvfMemory(crate::memory_manager::Error), 276 277 #[cfg(feature = "tdx")] 278 #[error("Error enabling TDX VM: {0}")] 279 InitializeTdxVm(#[source] hypervisor::HypervisorVmError), 280 281 #[cfg(feature = "tdx")] 282 #[error("Error enabling TDX memory region: {0}")] 283 InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError), 284 285 #[cfg(feature = "tdx")] 286 #[error("Error finalizing TDX VM: {0}")] 287 
FinalizeTdx(#[source] hypervisor::HypervisorVmError), 288 289 #[cfg(feature = "tdx")] 290 #[error("TDX firmware missing")] 291 TdxFirmwareMissing, 292 293 #[cfg(feature = "tdx")] 294 #[error("Invalid TDX payload type")] 295 InvalidPayloadType, 296 297 #[cfg(feature = "guest_debug")] 298 #[error("Error debugging VM: {0:?}")] 299 Debug(DebuggableError), 300 301 #[error("Error spawning kernel loading thread")] 302 KernelLoadThreadSpawn(std::io::Error), 303 304 #[error("Error joining kernel loading thread")] 305 KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 306 307 #[error("Payload configuration is not bootable")] 308 InvalidPayload, 309 310 #[cfg(feature = "guest_debug")] 311 #[error("Error coredumping VM: {0:?}")] 312 Coredump(GuestDebuggableError), 313 } 314 pub type Result<T> = result::Result<T, Error>; 315 316 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] 317 pub enum VmState { 318 Created, 319 Running, 320 Shutdown, 321 Paused, 322 BreakPoint, 323 } 324 325 impl VmState { 326 fn valid_transition(self, new_state: VmState) -> Result<()> { 327 match self { 328 VmState::Created => match new_state { 329 VmState::Created | VmState::Shutdown => { 330 Err(Error::InvalidStateTransition(self, new_state)) 331 } 332 VmState::Running | VmState::Paused | VmState::BreakPoint => Ok(()), 333 }, 334 335 VmState::Running => match new_state { 336 VmState::Created | VmState::Running => { 337 Err(Error::InvalidStateTransition(self, new_state)) 338 } 339 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()), 340 }, 341 342 VmState::Shutdown => match new_state { 343 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => { 344 Err(Error::InvalidStateTransition(self, new_state)) 345 } 346 VmState::Running => Ok(()), 347 }, 348 349 VmState::Paused => match new_state { 350 VmState::Created | VmState::Paused | VmState::BreakPoint => { 351 Err(Error::InvalidStateTransition(self, new_state)) 352 } 353 
VmState::Running | VmState::Shutdown => Ok(()), 354 }, 355 VmState::BreakPoint => match new_state { 356 VmState::Created | VmState::Running => Ok(()), 357 _ => Err(Error::InvalidStateTransition(self, new_state)), 358 }, 359 } 360 } 361 } 362 363 struct VmOpsHandler { 364 memory: GuestMemoryAtomic<GuestMemoryMmap>, 365 #[cfg(target_arch = "x86_64")] 366 io_bus: Arc<Bus>, 367 mmio_bus: Arc<Bus>, 368 #[cfg(target_arch = "x86_64")] 369 pci_config_io: Arc<Mutex<dyn BusDevice>>, 370 } 371 372 impl VmOps for VmOpsHandler { 373 fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> { 374 self.memory 375 .memory() 376 .write(buf, GuestAddress(gpa)) 377 .map_err(|e| HypervisorVmError::GuestMemWrite(e.into())) 378 } 379 380 fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> { 381 self.memory 382 .memory() 383 .read(buf, GuestAddress(gpa)) 384 .map_err(|e| HypervisorVmError::GuestMemRead(e.into())) 385 } 386 387 fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 388 if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) { 389 warn!("Guest MMIO read to unregistered address 0x{:x}", gpa); 390 } 391 Ok(()) 392 } 393 394 fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 395 match self.mmio_bus.write(gpa, data) { 396 Err(vm_device::BusError::MissingAddressRange) => { 397 warn!("Guest MMIO write to unregistered address 0x{:x}", gpa); 398 } 399 Ok(Some(barrier)) => { 400 info!("Waiting for barrier"); 401 barrier.wait(); 402 info!("Barrier released"); 403 } 404 _ => {} 405 }; 406 Ok(()) 407 } 408 409 #[cfg(target_arch = "x86_64")] 410 fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> { 411 use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE}; 412 413 if (PCI_CONFIG_IO_PORT..(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE)).contains(&port) { 414 
self.pci_config_io.lock().unwrap().read( 415 PCI_CONFIG_IO_PORT, 416 port - PCI_CONFIG_IO_PORT, 417 data, 418 ); 419 return Ok(()); 420 } 421 422 if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) { 423 warn!("Guest PIO read to unregistered address 0x{:x}", port); 424 } 425 Ok(()) 426 } 427 428 #[cfg(target_arch = "x86_64")] 429 fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> { 430 use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE}; 431 432 if (PCI_CONFIG_IO_PORT..(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE)).contains(&port) { 433 self.pci_config_io.lock().unwrap().write( 434 PCI_CONFIG_IO_PORT, 435 port - PCI_CONFIG_IO_PORT, 436 data, 437 ); 438 return Ok(()); 439 } 440 441 match self.io_bus.write(port, data) { 442 Err(vm_device::BusError::MissingAddressRange) => { 443 warn!("Guest PIO write to unregistered address 0x{:x}", port); 444 } 445 Ok(Some(barrier)) => { 446 info!("Waiting for barrier"); 447 barrier.wait(); 448 info!("Barrier released"); 449 } 450 _ => {} 451 }; 452 Ok(()) 453 } 454 } 455 456 pub fn physical_bits(max_phys_bits: u8) -> u8 { 457 let host_phys_bits = get_host_cpu_phys_bits(); 458 459 cmp::min(host_phys_bits, max_phys_bits) 460 } 461 462 pub struct Vm { 463 #[cfg(feature = "tdx")] 464 kernel: Option<File>, 465 initramfs: Option<File>, 466 threads: Vec<thread::JoinHandle<()>>, 467 device_manager: Arc<Mutex<DeviceManager>>, 468 config: Arc<Mutex<VmConfig>>, 469 on_tty: bool, 470 signals: Option<Handle>, 471 state: RwLock<VmState>, 472 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 473 memory_manager: Arc<Mutex<MemoryManager>>, 474 #[cfg_attr(not(feature = "kvm"), allow(dead_code))] 475 // The hypervisor abstracted virtual machine. 
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,
    numa_nodes: NumaNodes,
    seccomp_action: SeccompAction,
    exit_evt: EventFd,
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    // When true (gdb enabled), vCPUs are left stopped at boot.
    stop_on_boot: bool,
    // Handle of the background thread loading the kernel/firmware payload.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}

impl Vm {
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];

    /// Common construction path shared by `new`, `new_from_snapshot` and
    /// `new_from_migration`: given an already built `MemoryManager` and
    /// hypervisor VM, validates the config, kicks off asynchronous payload
    /// loading (unless restoring), and builds the NUMA topology, device
    /// manager and CPU manager.
    #[allow(clippy::too_many_arguments)]
    fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        restoring: bool,
        timestamp: Instant,
    ) -> Result<Self> {
        trace_scoped!("Vm::new_from_memory_manager");

        let boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        // When restoring, the payload is already in guest memory; skip loading.
        let load_payload_handle = if !restoring {
            Self::load_payload_async(&memory_manager, &config)?
        } else {
            None
        };

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        // TDX requires all devices to sit behind the IOMMU.
        #[cfg(feature = "tdx")]
        let force_iommu = tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        #[cfg(feature = "guest_debug")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "guest_debug"))]
        let stop_on_boot = false;

        let device_manager = DeviceManager::new(
            hypervisor.hypervisor_type(),
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            &exit_evt,
            &reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            restoring,
            boot_id_list,
            timestamp,
        )
        .map_err(Error::DeviceManager)?;

        // Collect the pieces the vCPU exit handler needs to service guest
        // memory, MMIO and (x86) PIO accesses.
        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::clone(device_manager.lock().unwrap().io_bus());
        let mmio_bus = Arc::clone(device_manager.lock().unwrap().mmio_bus());

        #[cfg(target_arch = "x86_64")]
        let pci_config_io =
            device_manager.lock().unwrap().pci_config_io() as Arc<Mutex<dyn BusDevice>>;
        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            #[cfg(target_arch = "x86_64")]
            pci_config_io,
        });

        let exit_evt_clone = exit_evt.try_clone().map_err(Error::EventFdClone)?;
        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            &device_manager,
            &memory_manager,
            vm.clone(),
            exit_evt_clone,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            hypervisor.clone(),
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
        )
        .map_err(Error::CpuManager)?;

        // SAFETY: isatty has no memory-safety preconditions; stdin is a valid fd.
        let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO as i32) } != 0;

        // With TDX the kernel file is kept open because it is loaded later,
        // during boot, rather than by the async payload loader.
        #[cfg(feature = "tdx")]
        let kernel = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.kernel.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.initramfs.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::InitramfsFile)?;

        Ok(Vm {
            #[cfg(feature = "tdx")]
            kernel,
            initramfs,
            device_manager,
            config,
            on_tty,
            threads: Vec::with_capacity(1),
            signals: None,
            state: RwLock::new(VmState::Created),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock: None,
            numa_nodes,
            seccomp_action: seccomp_action.clone(),
            exit_evt,
            hypervisor,
            stop_on_boot,
            load_payload_handle,
        })
    }

    /// Translates the user-provided `NumaConfig` list into the internal
    /// `NumaNodes` map, resolving memory-zone and (x86) SGX EPC section names
    /// against the memory manager and validating CPU/distance entries.
    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                // Each guest NUMA node id may appear only once.
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Can't define twice the same NUMA node");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        // Distances may only reference nodes defined in this config.
                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has been already set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

    /// Creates a brand new VM from scratch: creates the hypervisor VM (TDX or
    /// legacy), configures arch-specific state, builds the memory manager and
    /// then delegates to `new_from_memory_manager` before creating devices.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new");

        let timestamp = Instant::now();

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        hypervisor.check_required_extensions().unwrap();
        #[cfg(feature = "tdx")]
        let vm = hypervisor
            .create_vm_with_type(if tdx_enabled {
                2 // KVM_X86_TDX_VM
            } else {
                0 // KVM_X86_LEGACY_VM
            })
            .unwrap();
        #[cfg(not(feature = "tdx"))]
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        let phys_bits = physical_bits(config.lock().unwrap().cpus.max_phys_bits);

        #[cfg(target_arch = "x86_64")]
        let sgx_epc_config = config.lock().unwrap().sgx_epc.clone();

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            None,
            phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            None,
            None,
            #[cfg(target_arch = "x86_64")]
            sgx_epc_config,
        )
        .map_err(Error::MemoryManager)?;

        let new_vm = Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            false,
            timestamp,
        )?;

        // The device manager must create the devices from here as it is part
        // of the regular code path creating everything from scratch.
        new_vm
            .device_manager
            .lock()
            .unwrap()
            .create_devices(serial_pty, console_pty, console_resize_pipe)
            .map_err(Error::DeviceManager)?;
        Ok(new_vm)
    }

    /// Restores a VM from a local snapshot: rebuilds the memory manager from
    /// the snapshot's memory-manager section, then delegates to
    /// `new_from_memory_manager` with `restoring = true`.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        vm_config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        source_url: Option<&str>,
        prefault: bool,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        let timestamp = Instant::now();

        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        let memory_manager = if let Some(memory_manager_snapshot) =
            snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
        {
            let phys_bits = physical_bits(vm_config.lock().unwrap().cpus.max_phys_bits);
            MemoryManager::new_from_snapshot(
                memory_manager_snapshot,
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                source_url,
                prefault,
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            return Err(Error::Restore(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            ))));
        };

        Vm::new_from_memory_manager(
            vm_config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            true,
            timestamp,
        )
    }

    /// Builds a VM on the destination side of a live migration, reusing the
    /// memory-manager state and (optionally) pre-existing memory files sent
    /// by the source.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_migration(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        memory_manager_data: &MemoryManagerSnapshotData,
        existing_memory_files: Option<HashMap<u32, File>>,
    ) -> Result<Self> {
        let timestamp = Instant::now();

        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        let phys_bits = physical_bits(config.lock().unwrap().cpus.max_phys_bits);

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            None,
            phys_bits,
            #[cfg(feature = "tdx")]
            false,
            Some(memory_manager_data),
            existing_memory_files,
            #[cfg(target_arch = "x86_64")]
            None,
        )
        .map_err(Error::MemoryManager)?;

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            true,
            timestamp,
        )
    }

    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs =
self.initramfs.as_ref().unwrap();
        // Determine the file size by seeking to the end, then rewind.
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs
            .seek(SeekFrom::Start(0))
            .map_err(|_| Error::InitramfsLoad)?;

        // Arch code picks the load address based on the memory layout and size.
        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    /// Builds the kernel command line from the payload config, appending
    /// (aarch64 only) the additions requested by the device manager.
    fn generate_cmdline(
        payload: &PayloadConfig,
        #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
    ) -> Result<Cmdline> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE);
        if let Some(s) = payload.cmdline.as_ref() {
            cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
        }

        #[cfg(target_arch = "aarch64")]
        for entry in device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        Ok(cmdline)
    }

    /// Copies a UEFI firmware image into the dedicated UEFI flash region.
    #[cfg(target_arch = "aarch64")]
    fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
        let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
        let mem = uefi_flash.memory();
        arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
            .map_err(Error::UefiLoad)?;
        Ok(())
    }

    /// Loads either a PE kernel image or a UEFI firmware binary and returns
    /// the guest entry point.
    #[cfg(target_arch = "aarch64")]
    fn load_kernel(
        firmware: Option<File>,
        kernel: Option<File>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let entry_addr = match (firmware, kernel) {
            (None, Some(mut kernel)) => {
                match linux_loader::loader::pe::PE::load(
                    mem.deref(),
                    Some(arch::layout::KERNEL_START),
                    &mut kernel,
                    None,
                ) {
                    Ok(entry_addr) => entry_addr.kernel_load,
                    // Try to load the binary as kernel PE file at first.
                    // If failed, retry to load it as UEFI binary.
                    // As the UEFI binary is formatless, it must be the last option to try.
                    Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                        Self::load_firmware(&kernel, memory_manager)?;
                        arch::layout::UEFI_START
                    }
                    Err(e) => {
                        return Err(Error::KernelLoad(e));
                    }
                }
            }
            (Some(firmware), None) => {
                Self::load_firmware(&firmware, memory_manager)?;
                arch::layout::UEFI_START
            }
            _ => return Err(Error::InvalidPayload),
        };

        Ok(EntryPoint { entry_addr })
    }

    /// Fallback path for a non-ELF image: treats the file as raw firmware,
    /// maps it just below the 4 GiB boundary and lets the guest reset vector
    /// find it (no explicit entry point).
    #[cfg(target_arch = "x86_64")]
    fn load_legacy_firmware(
        mut firmware: File,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        warn!("Loading of legacy (non-PVH) firmware is deprecated and will be removed in a future version.");

        // Not an ELF header - assume raw binary data / firmware
        let size = firmware
            .seek(SeekFrom::End(0))
            .map_err(Error::FirmwareFile)?;

        // The OVMF firmware is as big as you might expect and it's 4MiB so limit to that
        if size > 4 << 20 {
            return Err(Error::FirmwareTooLarge);
        }

        // Loaded at the end of the 4GiB
        let load_address = GuestAddress(4 << 30)
            .checked_sub(size)
            .ok_or(Error::FirmwareTooLarge)?;

        info!(
            "Loading RAW firmware at 0x{:x} (size: {})",
            load_address.raw_value(),
            size
        );

        memory_manager
            .lock()
            .unwrap()
            .add_ram_region(load_address, size as usize)
            .map_err(Error::AllocateFirmwareMemory)?;

        firmware
            .seek(SeekFrom::Start(0))
            .map_err(Error::FirmwareFile)?;
        memory_manager
            .lock()
            .unwrap()
            .guest_memory()
            .memory()
            .read_exact_from(load_address, &mut firmware, size as usize)
            .map_err(Error::FirmwareLoad)?;

        Ok(EntryPoint { entry_addr: None })
    }

    /// Loads an ELF kernel (PVH entry point required); falls back to
    /// `load_legacy_firmware` when the file is not an ELF image.
    #[cfg(target_arch = "x86_64")]
    fn load_kernel(
        mut kernel: File,
        cmdline: Option<Cmdline>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        use linux_loader::loader::{elf::Error::InvalidElfMagicNumber, Error::Elf};
        info!("Loading kernel");

        let mem = {
            let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
            guest_memory.memory()
        };
        let entry_addr = match linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        ) {
            Ok(entry_addr) => entry_addr,
            Err(e) => match e {
                Elf(InvalidElfMagicNumber) => {
                    return Self::load_legacy_firmware(kernel, &memory_manager)
                }
                _ => {
                    return Err(Error::KernelLoad(e));
                }
            },
        };

        if let Some(cmdline) = cmdline {
            linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
                .map_err(Error::LoadCmdLine)?;
        }

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest
            info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint {
                entry_addr: Some(entry_addr),
            })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

    /// Dispatches the payload config to the right loader: firmware alone
    /// (with no kernel/initramfs/cmdline) or a kernel (with optional extras).
    #[cfg(target_arch = "x86_64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        trace_scoped!("load_payload");
        match (
            &payload.firmware,
            &payload.kernel,
            &payload.initramfs,
            &payload.cmdline,
        ) {
            (Some(firmware), None, None, None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(firmware, None, memory_manager)
            }
            (None, Some(kernel), _, _) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                let cmdline = Self::generate_cmdline(payload)?;
                Self::load_kernel(kernel, Some(cmdline), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    /// aarch64 variant: only firmware-or-kernel is dispatched; the command
    /// line is handled later via the device tree.
    #[cfg(target_arch = "aarch64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        match (&payload.firmware, &payload.kernel) {
            (Some(firmware), None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(Some(firmware), None, memory_manager)
            }
            (None, Some(kernel)) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                Self::load_kernel(None, Some(kernel), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    /// Spawns a background thread that loads the payload into guest memory,
    /// returning its join handle. Returns `Ok(None)` when there is no payload
    /// or when TDX is enabled (TDX loads the kernel through a different path).
    fn load_payload_async(
        memory_manager: &Arc<Mutex<MemoryManager>>,
        config: &Arc<Mutex<VmConfig>>,
    ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
        // Kernel with TDX is loaded in a different manner
        #[cfg(feature = "tdx")]
        if config.lock().unwrap().is_tdx_enabled() {
            return Ok(None);
        }

        config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|payload| {
                let memory_manager = memory_manager.clone();
                let payload = payload.clone();

                std::thread::Builder::new()
                    .name("payload_loader".into())
                    .spawn(move || Self::load_payload(&payload, memory_manager))
                    .map_err(Error::KernelLoadThreadSpawn)
            })
            .transpose()
    }

    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
        trace_scoped!("configure_system");
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
1179 }; 1180 1181 let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); 1182 let rsdp_addr = Some(rsdp_addr); 1183 let sgx_epc_region = self 1184 .memory_manager 1185 .lock() 1186 .unwrap() 1187 .sgx_epc_region() 1188 .as_ref() 1189 .cloned(); 1190 1191 let serial_number = self 1192 .config 1193 .lock() 1194 .unwrap() 1195 .platform 1196 .as_ref() 1197 .and_then(|p| p.serial_number.clone()); 1198 1199 let uuid = self 1200 .config 1201 .lock() 1202 .unwrap() 1203 .platform 1204 .as_ref() 1205 .and_then(|p| p.uuid.clone()); 1206 1207 let oem_strings = self 1208 .config 1209 .lock() 1210 .unwrap() 1211 .platform 1212 .as_ref() 1213 .and_then(|p| p.oem_strings.clone()); 1214 1215 let oem_strings = oem_strings 1216 .as_deref() 1217 .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>()); 1218 1219 arch::configure_system( 1220 &mem, 1221 arch::layout::CMDLINE_START, 1222 &initramfs_config, 1223 boot_vcpus, 1224 rsdp_addr, 1225 sgx_epc_region, 1226 serial_number.as_deref(), 1227 uuid.as_deref(), 1228 oem_strings.as_deref(), 1229 ) 1230 .map_err(Error::ConfigureSystem)?; 1231 Ok(()) 1232 } 1233 1234 #[cfg(target_arch = "aarch64")] 1235 fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> { 1236 let cmdline = Self::generate_cmdline( 1237 self.config.lock().unwrap().payload.as_ref().unwrap(), 1238 &self.device_manager, 1239 )?; 1240 let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs(); 1241 let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1242 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1243 let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new(); 1244 let initramfs_config = match self.initramfs { 1245 Some(_) => Some(self.load_initramfs(&mem)?), 1246 None => None, 1247 }; 1248 1249 let device_info = &self 1250 .device_manager 1251 .lock() 1252 .unwrap() 1253 .get_device_info() 1254 .clone(); 1255 1256 for pci_segment in 
self.device_manager.lock().unwrap().pci_segments().iter() { 1257 let pci_space = PciSpaceInfo { 1258 pci_segment_id: pci_segment.id, 1259 mmio_config_address: pci_segment.mmio_config_address, 1260 pci_device_space_start: pci_segment.start_of_device_area, 1261 pci_device_space_size: pci_segment.end_of_device_area 1262 - pci_segment.start_of_device_area 1263 + 1, 1264 }; 1265 pci_space_info.push(pci_space); 1266 } 1267 1268 let virtio_iommu_bdf = self 1269 .device_manager 1270 .lock() 1271 .unwrap() 1272 .iommu_attached_devices() 1273 .as_ref() 1274 .map(|(v, _)| *v); 1275 1276 let vcpu_count = self.cpu_manager.lock().unwrap().boot_vcpus() as u64; 1277 let vgic = self 1278 .device_manager 1279 .lock() 1280 .unwrap() 1281 .get_interrupt_controller() 1282 .unwrap() 1283 .lock() 1284 .unwrap() 1285 .create_vgic( 1286 &self.memory_manager.lock().as_ref().unwrap().vm, 1287 Gic::create_default_config(vcpu_count), 1288 ) 1289 .map_err(|_| { 1290 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1291 arch::aarch64::Error::SetupGic, 1292 )) 1293 })?; 1294 1295 // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number. 
1296 let pmu_supported = self 1297 .cpu_manager 1298 .lock() 1299 .unwrap() 1300 .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16) 1301 .map_err(|_| { 1302 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1303 arch::aarch64::Error::VcpuInitPmu, 1304 )) 1305 })?; 1306 1307 arch::configure_system( 1308 &mem, 1309 cmdline.as_cstring().unwrap().to_str().unwrap(), 1310 vcpu_mpidrs, 1311 vcpu_topology, 1312 device_info, 1313 &initramfs_config, 1314 &pci_space_info, 1315 virtio_iommu_bdf.map(|bdf| bdf.into()), 1316 &vgic, 1317 &self.numa_nodes, 1318 pmu_supported, 1319 ) 1320 .map_err(Error::ConfigureSystem)?; 1321 1322 // Activate gic device 1323 self.device_manager 1324 .lock() 1325 .unwrap() 1326 .get_interrupt_controller() 1327 .unwrap() 1328 .lock() 1329 .unwrap() 1330 .enable() 1331 .map_err(Error::EnableInterruptController)?; 1332 1333 Ok(()) 1334 } 1335 1336 pub fn serial_pty(&self) -> Option<PtyPair> { 1337 self.device_manager.lock().unwrap().serial_pty() 1338 } 1339 1340 pub fn console_pty(&self) -> Option<PtyPair> { 1341 self.device_manager.lock().unwrap().console_pty() 1342 } 1343 1344 pub fn console_resize_pipe(&self) -> Option<Arc<File>> { 1345 self.device_manager.lock().unwrap().console_resize_pipe() 1346 } 1347 1348 pub fn shutdown(&mut self) -> Result<()> { 1349 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 1350 let new_state = VmState::Shutdown; 1351 1352 state.valid_transition(new_state)?; 1353 1354 if self.on_tty { 1355 // Don't forget to set the terminal in canonical mode 1356 // before to exit. 
1357 io::stdin() 1358 .lock() 1359 .set_canon_mode() 1360 .map_err(Error::SetTerminalCanon)?; 1361 } 1362 1363 // Trigger the termination of the signal_handler thread 1364 if let Some(signals) = self.signals.take() { 1365 signals.close(); 1366 } 1367 1368 // Wake up the DeviceManager threads so they will get terminated cleanly 1369 self.device_manager 1370 .lock() 1371 .unwrap() 1372 .resume() 1373 .map_err(Error::Resume)?; 1374 1375 self.cpu_manager 1376 .lock() 1377 .unwrap() 1378 .shutdown() 1379 .map_err(Error::CpuManager)?; 1380 1381 // Wait for all the threads to finish 1382 for thread in self.threads.drain(..) { 1383 thread.join().map_err(Error::ThreadCleanup)? 1384 } 1385 *state = new_state; 1386 1387 event!("vm", "shutdown"); 1388 1389 Ok(()) 1390 } 1391 1392 pub fn resize( 1393 &mut self, 1394 desired_vcpus: Option<u8>, 1395 desired_memory: Option<u64>, 1396 desired_balloon: Option<u64>, 1397 ) -> Result<()> { 1398 event!("vm", "resizing"); 1399 1400 if let Some(desired_vcpus) = desired_vcpus { 1401 if self 1402 .cpu_manager 1403 .lock() 1404 .unwrap() 1405 .resize(desired_vcpus) 1406 .map_err(Error::CpuManager)? 
1407 { 1408 self.device_manager 1409 .lock() 1410 .unwrap() 1411 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1412 .map_err(Error::DeviceManager)?; 1413 } 1414 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1415 } 1416 1417 if let Some(desired_memory) = desired_memory { 1418 let new_region = self 1419 .memory_manager 1420 .lock() 1421 .unwrap() 1422 .resize(desired_memory) 1423 .map_err(Error::MemoryManager)?; 1424 1425 let mut memory_config = &mut self.config.lock().unwrap().memory; 1426 1427 if let Some(new_region) = &new_region { 1428 self.device_manager 1429 .lock() 1430 .unwrap() 1431 .update_memory(new_region) 1432 .map_err(Error::DeviceManager)?; 1433 1434 match memory_config.hotplug_method { 1435 HotplugMethod::Acpi => { 1436 self.device_manager 1437 .lock() 1438 .unwrap() 1439 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1440 .map_err(Error::DeviceManager)?; 1441 } 1442 HotplugMethod::VirtioMem => {} 1443 } 1444 } 1445 1446 // We update the VM config regardless of the actual guest resize 1447 // operation result (happened or not), so that if the VM reboots 1448 // it will be running with the last configure memory size. 1449 match memory_config.hotplug_method { 1450 HotplugMethod::Acpi => memory_config.size = desired_memory, 1451 HotplugMethod::VirtioMem => { 1452 if desired_memory > memory_config.size { 1453 memory_config.hotplugged_size = Some(desired_memory - memory_config.size); 1454 } else { 1455 memory_config.hotplugged_size = None; 1456 } 1457 } 1458 } 1459 } 1460 1461 if let Some(desired_balloon) = desired_balloon { 1462 self.device_manager 1463 .lock() 1464 .unwrap() 1465 .resize_balloon(desired_balloon) 1466 .map_err(Error::DeviceManager)?; 1467 1468 // Update the configuration value for the balloon size to ensure 1469 // a reboot would use the right value. 
1470 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon { 1471 balloon_config.size = desired_balloon; 1472 } 1473 } 1474 1475 event!("vm", "resized"); 1476 1477 Ok(()) 1478 } 1479 1480 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { 1481 let memory_config = &mut self.config.lock().unwrap().memory; 1482 1483 if let Some(zones) = &mut memory_config.zones { 1484 for zone in zones.iter_mut() { 1485 if zone.id == id { 1486 if desired_memory >= zone.size { 1487 let hotplugged_size = desired_memory - zone.size; 1488 self.memory_manager 1489 .lock() 1490 .unwrap() 1491 .resize_zone(&id, desired_memory - zone.size) 1492 .map_err(Error::MemoryManager)?; 1493 // We update the memory zone config regardless of the 1494 // actual 'resize-zone' operation result (happened or 1495 // not), so that if the VM reboots it will be running 1496 // with the last configured memory zone size. 1497 zone.hotplugged_size = Some(hotplugged_size); 1498 1499 return Ok(()); 1500 } else { 1501 error!( 1502 "Invalid to ask less ({}) than boot RAM ({}) for \ 1503 this memory zone", 1504 desired_memory, zone.size, 1505 ); 1506 return Err(Error::ResizeZone); 1507 } 1508 } 1509 } 1510 } 1511 1512 error!("Could not find the memory zone {} for the resize", id); 1513 Err(Error::ResizeZone) 1514 } 1515 1516 pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1517 let pci_device_info = self 1518 .device_manager 1519 .lock() 1520 .unwrap() 1521 .add_device(&mut device_cfg) 1522 .map_err(Error::DeviceManager)?; 1523 1524 // Update VmConfig by adding the new device. This is important to 1525 // ensure the device would be created in case of a reboot. 
1526 { 1527 let mut config = self.config.lock().unwrap(); 1528 add_to_config(&mut config.devices, device_cfg); 1529 } 1530 1531 self.device_manager 1532 .lock() 1533 .unwrap() 1534 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1535 .map_err(Error::DeviceManager)?; 1536 1537 Ok(pci_device_info) 1538 } 1539 1540 pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> { 1541 let pci_device_info = self 1542 .device_manager 1543 .lock() 1544 .unwrap() 1545 .add_user_device(&mut device_cfg) 1546 .map_err(Error::DeviceManager)?; 1547 1548 // Update VmConfig by adding the new device. This is important to 1549 // ensure the device would be created in case of a reboot. 1550 { 1551 let mut config = self.config.lock().unwrap(); 1552 add_to_config(&mut config.user_devices, device_cfg); 1553 } 1554 1555 self.device_manager 1556 .lock() 1557 .unwrap() 1558 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1559 .map_err(Error::DeviceManager)?; 1560 1561 Ok(pci_device_info) 1562 } 1563 1564 pub fn remove_device(&mut self, id: String) -> Result<()> { 1565 self.device_manager 1566 .lock() 1567 .unwrap() 1568 .remove_device(id.clone()) 1569 .map_err(Error::DeviceManager)?; 1570 1571 // Update VmConfig by removing the device. This is important to 1572 // ensure the device would not be created in case of a reboot. 
1573 let mut config = self.config.lock().unwrap(); 1574 1575 // Remove if VFIO device 1576 if let Some(devices) = config.devices.as_mut() { 1577 devices.retain(|dev| dev.id.as_ref() != Some(&id)); 1578 } 1579 1580 // Remove if VFIO user device 1581 if let Some(user_devices) = config.user_devices.as_mut() { 1582 user_devices.retain(|dev| dev.id.as_ref() != Some(&id)); 1583 } 1584 1585 // Remove if disk device 1586 if let Some(disks) = config.disks.as_mut() { 1587 disks.retain(|dev| dev.id.as_ref() != Some(&id)); 1588 } 1589 1590 // Remove if fs device 1591 if let Some(fs) = config.fs.as_mut() { 1592 fs.retain(|dev| dev.id.as_ref() != Some(&id)); 1593 } 1594 1595 // Remove if net device 1596 if let Some(net) = config.net.as_mut() { 1597 net.retain(|dev| dev.id.as_ref() != Some(&id)); 1598 } 1599 1600 // Remove if pmem device 1601 if let Some(pmem) = config.pmem.as_mut() { 1602 pmem.retain(|dev| dev.id.as_ref() != Some(&id)); 1603 } 1604 1605 // Remove if vDPA device 1606 if let Some(vdpa) = config.vdpa.as_mut() { 1607 vdpa.retain(|dev| dev.id.as_ref() != Some(&id)); 1608 } 1609 1610 // Remove if vsock device 1611 if let Some(vsock) = config.vsock.as_ref() { 1612 if vsock.id.as_ref() == Some(&id) { 1613 config.vsock = None; 1614 } 1615 } 1616 1617 self.device_manager 1618 .lock() 1619 .unwrap() 1620 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1621 .map_err(Error::DeviceManager)?; 1622 Ok(()) 1623 } 1624 1625 pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> { 1626 let pci_device_info = self 1627 .device_manager 1628 .lock() 1629 .unwrap() 1630 .add_disk(&mut disk_cfg) 1631 .map_err(Error::DeviceManager)?; 1632 1633 // Update VmConfig by adding the new device. This is important to 1634 // ensure the device would be created in case of a reboot. 
1635 { 1636 let mut config = self.config.lock().unwrap(); 1637 add_to_config(&mut config.disks, disk_cfg); 1638 } 1639 1640 self.device_manager 1641 .lock() 1642 .unwrap() 1643 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1644 .map_err(Error::DeviceManager)?; 1645 1646 Ok(pci_device_info) 1647 } 1648 1649 pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> { 1650 let pci_device_info = self 1651 .device_manager 1652 .lock() 1653 .unwrap() 1654 .add_fs(&mut fs_cfg) 1655 .map_err(Error::DeviceManager)?; 1656 1657 // Update VmConfig by adding the new device. This is important to 1658 // ensure the device would be created in case of a reboot. 1659 { 1660 let mut config = self.config.lock().unwrap(); 1661 add_to_config(&mut config.fs, fs_cfg); 1662 } 1663 1664 self.device_manager 1665 .lock() 1666 .unwrap() 1667 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1668 .map_err(Error::DeviceManager)?; 1669 1670 Ok(pci_device_info) 1671 } 1672 1673 pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> { 1674 let pci_device_info = self 1675 .device_manager 1676 .lock() 1677 .unwrap() 1678 .add_pmem(&mut pmem_cfg) 1679 .map_err(Error::DeviceManager)?; 1680 1681 // Update VmConfig by adding the new device. This is important to 1682 // ensure the device would be created in case of a reboot. 1683 { 1684 let mut config = self.config.lock().unwrap(); 1685 add_to_config(&mut config.pmem, pmem_cfg); 1686 } 1687 1688 self.device_manager 1689 .lock() 1690 .unwrap() 1691 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1692 .map_err(Error::DeviceManager)?; 1693 1694 Ok(pci_device_info) 1695 } 1696 1697 pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> { 1698 let pci_device_info = self 1699 .device_manager 1700 .lock() 1701 .unwrap() 1702 .add_net(&mut net_cfg) 1703 .map_err(Error::DeviceManager)?; 1704 1705 // Update VmConfig by adding the new device. 
This is important to 1706 // ensure the device would be created in case of a reboot. 1707 { 1708 let mut config = self.config.lock().unwrap(); 1709 add_to_config(&mut config.net, net_cfg); 1710 } 1711 1712 self.device_manager 1713 .lock() 1714 .unwrap() 1715 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1716 .map_err(Error::DeviceManager)?; 1717 1718 Ok(pci_device_info) 1719 } 1720 1721 pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> { 1722 let pci_device_info = self 1723 .device_manager 1724 .lock() 1725 .unwrap() 1726 .add_vdpa(&mut vdpa_cfg) 1727 .map_err(Error::DeviceManager)?; 1728 1729 // Update VmConfig by adding the new device. This is important to 1730 // ensure the device would be created in case of a reboot. 1731 { 1732 let mut config = self.config.lock().unwrap(); 1733 add_to_config(&mut config.vdpa, vdpa_cfg); 1734 } 1735 1736 self.device_manager 1737 .lock() 1738 .unwrap() 1739 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1740 .map_err(Error::DeviceManager)?; 1741 1742 Ok(pci_device_info) 1743 } 1744 1745 pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> { 1746 let pci_device_info = self 1747 .device_manager 1748 .lock() 1749 .unwrap() 1750 .add_vsock(&mut vsock_cfg) 1751 .map_err(Error::DeviceManager)?; 1752 1753 // Update VmConfig by adding the new device. This is important to 1754 // ensure the device would be created in case of a reboot. 
1755 { 1756 let mut config = self.config.lock().unwrap(); 1757 config.vsock = Some(vsock_cfg); 1758 } 1759 1760 self.device_manager 1761 .lock() 1762 .unwrap() 1763 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1764 .map_err(Error::DeviceManager)?; 1765 1766 Ok(pci_device_info) 1767 } 1768 1769 pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> { 1770 Ok(self.device_manager.lock().unwrap().counters()) 1771 } 1772 1773 fn signal_handler(mut signals: Signals, console_input_clone: Arc<Console>) { 1774 for sig in &Vm::HANDLED_SIGNALS { 1775 unblock_signal(*sig).unwrap(); 1776 } 1777 1778 for signal in signals.forever() { 1779 if signal == SIGWINCH { 1780 console_input_clone.update_console_size(); 1781 } 1782 } 1783 } 1784 1785 #[cfg(feature = "tdx")] 1786 fn init_tdx(&mut self) -> Result<()> { 1787 let cpuid = self.cpu_manager.lock().unwrap().common_cpuid(); 1788 let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32; 1789 self.vm 1790 .tdx_init(&cpuid, max_vcpus) 1791 .map_err(Error::InitializeTdxVm)?; 1792 Ok(()) 1793 } 1794 1795 #[cfg(feature = "tdx")] 1796 fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> { 1797 use arch::x86_64::tdx::*; 1798 1799 let firmware_path = self 1800 .config 1801 .lock() 1802 .unwrap() 1803 .payload 1804 .as_ref() 1805 .unwrap() 1806 .firmware 1807 .clone() 1808 .ok_or(Error::TdxFirmwareMissing)?; 1809 // The TDVF file contains a table of section as well as code 1810 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1811 1812 // For all the sections allocate some RAM backing them 1813 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf) 1814 } 1815 1816 #[cfg(feature = "tdx")] 1817 fn hob_memory_resources( 1818 mut sorted_sections: Vec<TdvfSection>, 1819 guest_memory: &GuestMemoryMmap, 1820 ) -> Vec<(u64, u64, bool)> { 1821 let mut list = Vec::new(); 1822 1823 let mut current_section = sorted_sections.pop(); 1824 1825 // 
RAM regions interleaved with TDVF sections 1826 let mut next_start_addr = 0; 1827 for region in guest_memory.iter() { 1828 let region_start = region.start_addr().0; 1829 let region_end = region.last_addr().0; 1830 if region_start > next_start_addr { 1831 next_start_addr = region_start; 1832 } 1833 1834 loop { 1835 let (start, size, ram) = if let Some(section) = ¤t_section { 1836 if section.address <= next_start_addr { 1837 (section.address, section.size, false) 1838 } else { 1839 let last_addr = std::cmp::min(section.address - 1, region_end); 1840 (next_start_addr, last_addr - next_start_addr + 1, true) 1841 } 1842 } else { 1843 (next_start_addr, region_end - next_start_addr + 1, true) 1844 }; 1845 1846 list.push((start, size, ram)); 1847 1848 if !ram { 1849 current_section = sorted_sections.pop(); 1850 } 1851 1852 next_start_addr = start + size; 1853 1854 if region_start > next_start_addr { 1855 next_start_addr = region_start; 1856 } 1857 1858 if next_start_addr > region_end { 1859 break; 1860 } 1861 } 1862 } 1863 1864 // Once all the interleaved sections have been processed, let's simply 1865 // pull the remaining ones. 
1866 if let Some(section) = current_section { 1867 list.push((section.address, section.size, false)); 1868 } 1869 while let Some(section) = sorted_sections.pop() { 1870 list.push((section.address, section.size, false)); 1871 } 1872 1873 list 1874 } 1875 1876 #[cfg(feature = "tdx")] 1877 fn populate_tdx_sections(&mut self, sections: &[TdvfSection]) -> Result<Option<u64>> { 1878 use arch::x86_64::tdx::*; 1879 // Get the memory end *before* we start adding TDVF ram regions 1880 let boot_guest_memory = self 1881 .memory_manager 1882 .lock() 1883 .as_ref() 1884 .unwrap() 1885 .boot_guest_memory(); 1886 for section in sections { 1887 // No need to allocate if the section falls within guest RAM ranges 1888 if boot_guest_memory.address_in_range(GuestAddress(section.address)) { 1889 info!( 1890 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM", 1891 section 1892 ); 1893 continue; 1894 } 1895 1896 info!("Allocating TDVF Section: {:x?}", section); 1897 self.memory_manager 1898 .lock() 1899 .unwrap() 1900 .add_ram_region(GuestAddress(section.address), section.size as usize) 1901 .map_err(Error::AllocatingTdvfMemory)?; 1902 } 1903 1904 // The TDVF file contains a table of section as well as code 1905 let firmware_path = self 1906 .config 1907 .lock() 1908 .unwrap() 1909 .payload 1910 .as_ref() 1911 .unwrap() 1912 .firmware 1913 .clone() 1914 .ok_or(Error::TdxFirmwareMissing)?; 1915 let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?; 1916 1917 // The guest memory at this point now has all the required regions so it 1918 // is safe to copy from the TDVF file into it. 
1919 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1920 let mem = guest_memory.memory(); 1921 let mut payload_info = None; 1922 let mut hob_offset = None; 1923 for section in sections { 1924 info!("Populating TDVF Section: {:x?}", section); 1925 match section.r#type { 1926 TdvfSectionType::Bfv | TdvfSectionType::Cfv => { 1927 info!("Copying section to guest memory"); 1928 firmware_file 1929 .seek(SeekFrom::Start(section.data_offset as u64)) 1930 .map_err(Error::LoadTdvf)?; 1931 mem.read_from( 1932 GuestAddress(section.address), 1933 &mut firmware_file, 1934 section.data_size as usize, 1935 ) 1936 .unwrap(); 1937 } 1938 TdvfSectionType::TdHob => { 1939 hob_offset = Some(section.address); 1940 } 1941 TdvfSectionType::Payload => { 1942 info!("Copying payload to guest memory"); 1943 if let Some(payload_file) = self.kernel.as_mut() { 1944 let payload_size = payload_file 1945 .seek(SeekFrom::End(0)) 1946 .map_err(Error::LoadPayload)?; 1947 1948 payload_file 1949 .seek(SeekFrom::Start(0x1f1)) 1950 .map_err(Error::LoadPayload)?; 1951 1952 let mut payload_header = linux_loader::bootparam::setup_header::default(); 1953 payload_header 1954 .as_bytes() 1955 .read_from( 1956 0, 1957 payload_file, 1958 mem::size_of::<linux_loader::bootparam::setup_header>(), 1959 ) 1960 .unwrap(); 1961 1962 if payload_header.header != 0x5372_6448 { 1963 return Err(Error::InvalidPayloadType); 1964 } 1965 1966 if (payload_header.version < 0x0200) 1967 || ((payload_header.loadflags & 0x1) == 0x0) 1968 { 1969 return Err(Error::InvalidPayloadType); 1970 } 1971 1972 payload_file 1973 .seek(SeekFrom::Start(0)) 1974 .map_err(Error::LoadPayload)?; 1975 mem.read_from( 1976 GuestAddress(section.address), 1977 payload_file, 1978 payload_size as usize, 1979 ) 1980 .unwrap(); 1981 1982 // Create the payload info that will be inserted into 1983 // the HOB. 
1984 payload_info = Some(PayloadInfo { 1985 image_type: PayloadImageType::BzImage, 1986 entry_point: section.address, 1987 }); 1988 } 1989 } 1990 TdvfSectionType::PayloadParam => { 1991 info!("Copying payload parameters to guest memory"); 1992 let cmdline = Self::generate_cmdline( 1993 self.config.lock().unwrap().payload.as_ref().unwrap(), 1994 )?; 1995 mem.write_slice( 1996 cmdline.as_cstring().unwrap().as_bytes_with_nul(), 1997 GuestAddress(section.address), 1998 ) 1999 .unwrap(); 2000 } 2001 _ => {} 2002 } 2003 } 2004 2005 // Generate HOB 2006 let mut hob = TdHob::start(hob_offset.unwrap()); 2007 2008 let mut sorted_sections = sections.to_vec(); 2009 sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem)); 2010 2011 sorted_sections.sort_by_key(|section| section.address); 2012 sorted_sections.reverse(); 2013 2014 for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) { 2015 hob.add_memory_resource(&mem, start, size, ram) 2016 .map_err(Error::PopulateHob)?; 2017 } 2018 2019 // MMIO regions 2020 hob.add_mmio_resource( 2021 &mem, 2022 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 2023 arch::layout::APIC_START.raw_value() 2024 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 2025 ) 2026 .map_err(Error::PopulateHob)?; 2027 let start_of_device_area = self 2028 .memory_manager 2029 .lock() 2030 .unwrap() 2031 .start_of_device_area() 2032 .raw_value(); 2033 let end_of_device_area = self 2034 .memory_manager 2035 .lock() 2036 .unwrap() 2037 .end_of_device_area() 2038 .raw_value(); 2039 hob.add_mmio_resource( 2040 &mem, 2041 start_of_device_area, 2042 end_of_device_area - start_of_device_area, 2043 ) 2044 .map_err(Error::PopulateHob)?; 2045 2046 // Loop over the ACPI tables and copy them to the HOB. 
2047 2048 for acpi_table in crate::acpi::create_acpi_tables_tdx( 2049 &self.device_manager, 2050 &self.cpu_manager, 2051 &self.memory_manager, 2052 &self.numa_nodes, 2053 ) { 2054 hob.add_acpi_table(&mem, acpi_table.as_slice()) 2055 .map_err(Error::PopulateHob)?; 2056 } 2057 2058 // If a payload info has been created, let's insert it into the HOB. 2059 if let Some(payload_info) = payload_info { 2060 hob.add_payload(&mem, payload_info) 2061 .map_err(Error::PopulateHob)?; 2062 } 2063 2064 hob.finish(&mem).map_err(Error::PopulateHob)?; 2065 2066 Ok(hob_offset) 2067 } 2068 2069 #[cfg(feature = "tdx")] 2070 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 2071 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2072 let mem = guest_memory.memory(); 2073 2074 for section in sections { 2075 self.vm 2076 .tdx_init_memory_region( 2077 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 2078 section.address, 2079 section.size, 2080 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 2081 section.attributes == 1, 2082 ) 2083 .map_err(Error::InitializeTdxMemoryRegion)?; 2084 } 2085 2086 Ok(()) 2087 } 2088 2089 fn setup_signal_handler(&mut self) -> Result<()> { 2090 let console = self.device_manager.lock().unwrap().console().clone(); 2091 let signals = Signals::new(Vm::HANDLED_SIGNALS); 2092 match signals { 2093 Ok(signals) => { 2094 self.signals = Some(signals.handle()); 2095 let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?; 2096 let signal_handler_seccomp_filter = get_seccomp_filter( 2097 &self.seccomp_action, 2098 Thread::SignalHandler, 2099 self.hypervisor.hypervisor_type(), 2100 ) 2101 .map_err(Error::CreateSeccompFilter)?; 2102 self.threads.push( 2103 thread::Builder::new() 2104 .name("vm_signal_handler".to_string()) 2105 .spawn(move || { 2106 if !signal_handler_seccomp_filter.is_empty() { 2107 if let Err(e) = apply_filter(&signal_handler_seccomp_filter) 2108 .map_err(Error::ApplySeccompFilter) 
2109 { 2110 error!("Error applying seccomp filter: {:?}", e); 2111 exit_evt.write(1).ok(); 2112 return; 2113 } 2114 } 2115 std::panic::catch_unwind(AssertUnwindSafe(|| { 2116 Vm::signal_handler(signals, console); 2117 })) 2118 .map_err(|_| { 2119 error!("signal_handler thead panicked"); 2120 exit_evt.write(1).ok() 2121 }) 2122 .ok(); 2123 }) 2124 .map_err(Error::SignalHandlerSpawn)?, 2125 ); 2126 } 2127 Err(e) => error!("Signal not found {}", e), 2128 } 2129 Ok(()) 2130 } 2131 2132 fn setup_tty(&self) -> Result<()> { 2133 if self.on_tty { 2134 io::stdin() 2135 .lock() 2136 .set_raw_mode() 2137 .map_err(Error::SetTerminalRaw)?; 2138 } 2139 2140 Ok(()) 2141 } 2142 2143 // Creates ACPI tables 2144 // In case of TDX being used, this is a no-op since the tables will be 2145 // created and passed when populating the HOB. 2146 2147 fn create_acpi_tables(&self) -> Option<GuestAddress> { 2148 #[cfg(feature = "tdx")] 2149 if self.config.lock().unwrap().is_tdx_enabled() { 2150 return None; 2151 } 2152 2153 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 2154 2155 let rsdp_addr = crate::acpi::create_acpi_tables( 2156 &mem, 2157 &self.device_manager, 2158 &self.cpu_manager, 2159 &self.memory_manager, 2160 &self.numa_nodes, 2161 ); 2162 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 2163 2164 Some(rsdp_addr) 2165 } 2166 2167 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 2168 trace_scoped!("entry_point"); 2169 2170 self.load_payload_handle 2171 .take() 2172 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 
2173 .transpose() 2174 } 2175 2176 pub fn boot(&mut self) -> Result<()> { 2177 trace_scoped!("Vm::boot"); 2178 info!("Booting VM"); 2179 event!("vm", "booting"); 2180 let current_state = self.get_state()?; 2181 if current_state == VmState::Paused { 2182 return self.resume().map_err(Error::Resume); 2183 } 2184 2185 let new_state = if self.stop_on_boot { 2186 VmState::BreakPoint 2187 } else { 2188 VmState::Running 2189 }; 2190 current_state.valid_transition(new_state)?; 2191 2192 // Do earlier to parallelise with loading kernel 2193 #[cfg(target_arch = "x86_64")] 2194 let rsdp_addr = self.create_acpi_tables(); 2195 2196 self.setup_signal_handler()?; 2197 self.setup_tty()?; 2198 2199 // Load kernel synchronously or if asynchronous then wait for load to 2200 // finish. 2201 let entry_point = self.entry_point()?; 2202 2203 #[cfg(feature = "tdx")] 2204 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2205 2206 // The initial TDX configuration must be done before the vCPUs are 2207 // created 2208 #[cfg(feature = "tdx")] 2209 if tdx_enabled { 2210 self.init_tdx()?; 2211 } 2212 2213 // Create and configure vcpus 2214 self.cpu_manager 2215 .lock() 2216 .unwrap() 2217 .create_boot_vcpus(entry_point) 2218 .map_err(Error::CpuManager)?; 2219 2220 #[cfg(feature = "tdx")] 2221 let sections = if tdx_enabled { 2222 self.extract_tdvf_sections()? 2223 } else { 2224 Vec::new() 2225 }; 2226 2227 // Configuring the TDX regions requires that the vCPUs are created. 2228 #[cfg(feature = "tdx")] 2229 let hob_address = if tdx_enabled { 2230 // TDX sections are written to memory. 2231 self.populate_tdx_sections(§ions)? 
2232 } else { 2233 None 2234 }; 2235 2236 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 2237 // available after they are configured 2238 #[cfg(target_arch = "aarch64")] 2239 let rsdp_addr = self.create_acpi_tables(); 2240 2241 // Configure shared state based on loaded kernel 2242 entry_point 2243 .map(|_| { 2244 // Safe to unwrap rsdp_addr as we know it can't be None when 2245 // the entry_point is Some. 2246 self.configure_system(rsdp_addr.unwrap()) 2247 }) 2248 .transpose()?; 2249 2250 #[cfg(feature = "tdx")] 2251 if let Some(hob_address) = hob_address { 2252 // With the HOB address extracted the vCPUs can have 2253 // their TDX state configured. 2254 self.cpu_manager 2255 .lock() 2256 .unwrap() 2257 .initialize_tdx(hob_address) 2258 .map_err(Error::CpuManager)?; 2259 // Let the hypervisor know which memory ranges are shared with the 2260 // guest. This prevents the guest from ignoring/discarding memory 2261 // regions provided by the host. 2262 self.init_tdx_memory(§ions)?; 2263 // With TDX memory and CPU state configured TDX setup is complete 2264 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2265 } 2266 2267 self.cpu_manager 2268 .lock() 2269 .unwrap() 2270 .start_boot_vcpus(new_state == VmState::BreakPoint) 2271 .map_err(Error::CpuManager)?; 2272 2273 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2274 *state = new_state; 2275 event!("vm", "booted"); 2276 Ok(()) 2277 } 2278 2279 /// Gets a thread-safe reference counted pointer to the VM configuration. 2280 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2281 Arc::clone(&self.config) 2282 } 2283 2284 /// Get the VM state. Returns an error if the state is poisoned. 
    pub fn get_state(&self) -> Result<VmState> {
        self.state
            .try_read()
            .map_err(|_| Error::PoisonedState)
            .map(|state| *state)
    }

    /// Load saved clock from snapshot
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub fn load_clock_from_snapshot(
        &mut self,
        snapshot: &Snapshot,
    ) -> Result<Option<hypervisor::ClockData>> {
        use crate::migration::get_vm_snapshot;
        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        // Keep the clock around so resume() can re-apply it later.
        self.saved_clock = vm_snapshot.clock;
        Ok(self.saved_clock)
    }

    #[cfg(target_arch = "aarch64")]
    /// Add the vGIC section to the VM snapshot.
    fn add_vgic_snapshot_section(
        &self,
        vm_snapshot: &mut Snapshot,
    ) -> std::result::Result<(), MigratableError> {
        let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
        // Prime the GICR_TYPER registers from the saved vCPU states before
        // snapshotting the interrupt controller.
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gicr_typers(&saved_vcpu_states);

        vm_snapshot.add_snapshot(
            self.device_manager
                .lock()
                .unwrap()
                .get_interrupt_controller()
                .unwrap()
                .lock()
                .unwrap()
                .snapshot()?,
        );

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing.
    fn restore_vgic_and_enable_interrupt(
        &self,
        vm_snapshot: &Snapshot,
    ) -> std::result::Result<(), MigratableError> {
        let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
        // The number of vCPUs is the same as the number of saved vCPU states.
        let vcpu_numbers = saved_vcpu_states.len();

        // Creating a GIC device here, as the GIC will not be created when
        // restoring the device manager. Note that currently only the bare GICv3
        // without ITS is supported.
        let vcpu_count = vcpu_numbers.try_into().unwrap();
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .create_vgic(&self.vm, Gic::create_default_config(vcpu_count))
            .map_err(|e| MigratableError::Restore(anyhow!("Could not create GIC: {:#?}", e)))?;

        // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
        self.cpu_manager
            .lock()
            .unwrap()
            .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
            .map_err(|e| MigratableError::Restore(anyhow!("Error init PMU: {:?}", e)))?;

        // Here we prepare the GICR_TYPER registers from the restored vCPU states.
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gicr_typers(&saved_vcpu_states);

        // Restore GIC states.
        if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) {
            self.device_manager
                .lock()
                .unwrap()
                .get_interrupt_controller()
                .unwrap()
                .lock()
                .unwrap()
                .restore(*gicv3_its_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing GicV3Its snapshot"
            )));
        }

        // Activate gic device
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .enable()
            .map_err(|e| {
                MigratableError::Restore(anyhow!(
                    "Could not enable interrupt controller routing: {:#?}",
                    e
                ))
            })?;

        Ok(())
    }

    /// Gets the actual size of the balloon.
2411 pub fn balloon_size(&self) -> u64 { 2412 self.device_manager.lock().unwrap().balloon_size() 2413 } 2414 2415 pub fn receive_memory_regions<F>( 2416 &mut self, 2417 ranges: &MemoryRangeTable, 2418 fd: &mut F, 2419 ) -> std::result::Result<(), MigratableError> 2420 where 2421 F: Read, 2422 { 2423 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2424 let mem = guest_memory.memory(); 2425 2426 for range in ranges.regions() { 2427 let mut offset: u64 = 0; 2428 // Here we are manually handling the retry in case we can't the 2429 // whole region at once because we can't use the implementation 2430 // from vm-memory::GuestMemory of read_exact_from() as it is not 2431 // following the correct behavior. For more info about this issue 2432 // see: https://github.com/rust-vmm/vm-memory/issues/174 2433 loop { 2434 let bytes_read = mem 2435 .read_from( 2436 GuestAddress(range.gpa + offset), 2437 fd, 2438 (range.length - offset) as usize, 2439 ) 2440 .map_err(|e| { 2441 MigratableError::MigrateReceive(anyhow!( 2442 "Error receiving memory from socket: {}", 2443 e 2444 )) 2445 })?; 2446 offset += bytes_read as u64; 2447 2448 if offset == range.length { 2449 break; 2450 } 2451 } 2452 } 2453 2454 Ok(()) 2455 } 2456 2457 pub fn send_memory_fds( 2458 &mut self, 2459 socket: &mut UnixStream, 2460 ) -> std::result::Result<(), MigratableError> { 2461 for (slot, fd) in self 2462 .memory_manager 2463 .lock() 2464 .unwrap() 2465 .memory_slot_fds() 2466 .drain() 2467 { 2468 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2469 .write_to(socket) 2470 .map_err(|e| { 2471 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2472 })?; 2473 socket 2474 .send_with_fd(&slot.to_le_bytes()[..], fd) 2475 .map_err(|e| { 2476 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2477 })?; 2478 2479 let res = Response::read_from(socket)?; 2480 if res.status() != Status::Ok { 2481 warn!("Error during memory fd 
migration"); 2482 Request::abandon().write_to(socket)?; 2483 Response::read_from(socket).ok(); 2484 return Err(MigratableError::MigrateSend(anyhow!( 2485 "Error during memory fd migration" 2486 ))); 2487 } 2488 } 2489 2490 Ok(()) 2491 } 2492 2493 pub fn send_memory_regions<F>( 2494 &mut self, 2495 ranges: &MemoryRangeTable, 2496 fd: &mut F, 2497 ) -> std::result::Result<(), MigratableError> 2498 where 2499 F: Write, 2500 { 2501 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2502 let mem = guest_memory.memory(); 2503 2504 for range in ranges.regions() { 2505 let mut offset: u64 = 0; 2506 // Here we are manually handling the retry in case we can't the 2507 // whole region at once because we can't use the implementation 2508 // from vm-memory::GuestMemory of write_all_to() as it is not 2509 // following the correct behavior. For more info about this issue 2510 // see: https://github.com/rust-vmm/vm-memory/issues/174 2511 loop { 2512 let bytes_written = mem 2513 .write_to( 2514 GuestAddress(range.gpa + offset), 2515 fd, 2516 (range.length - offset) as usize, 2517 ) 2518 .map_err(|e| { 2519 MigratableError::MigrateSend(anyhow!( 2520 "Error transferring memory to socket: {}", 2521 e 2522 )) 2523 })?; 2524 offset += bytes_written as u64; 2525 2526 if offset == range.length { 2527 break; 2528 } 2529 } 2530 } 2531 2532 Ok(()) 2533 } 2534 2535 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2536 self.memory_manager 2537 .lock() 2538 .unwrap() 2539 .memory_range_table(false) 2540 } 2541 2542 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2543 self.device_manager.lock().unwrap().device_tree() 2544 } 2545 2546 pub fn activate_virtio_devices(&self) -> Result<()> { 2547 self.device_manager 2548 .lock() 2549 .unwrap() 2550 .activate_virtio_devices() 2551 .map_err(Error::ActivateVirtioDevices) 2552 } 2553 2554 #[cfg(target_arch = "x86_64")] 2555 pub fn power_button(&self) -> Result<()> { 2556 
return self 2557 .device_manager 2558 .lock() 2559 .unwrap() 2560 .notify_power_button() 2561 .map_err(Error::PowerButton); 2562 } 2563 2564 #[cfg(target_arch = "aarch64")] 2565 pub fn power_button(&self) -> Result<()> { 2566 self.device_manager 2567 .lock() 2568 .unwrap() 2569 .notify_power_button() 2570 .map_err(Error::PowerButton) 2571 } 2572 2573 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2574 self.memory_manager.lock().unwrap().snapshot_data() 2575 } 2576 2577 #[cfg(feature = "guest_debug")] 2578 pub fn debug_request( 2579 &mut self, 2580 gdb_request: &GdbRequestPayload, 2581 cpu_id: usize, 2582 ) -> Result<GdbResponsePayload> { 2583 use GdbRequestPayload::*; 2584 match gdb_request { 2585 SetSingleStep(single_step) => { 2586 self.set_guest_debug(cpu_id, &[], *single_step) 2587 .map_err(Error::Debug)?; 2588 } 2589 SetHwBreakPoint(addrs) => { 2590 self.set_guest_debug(cpu_id, addrs, false) 2591 .map_err(Error::Debug)?; 2592 } 2593 Pause => { 2594 self.debug_pause().map_err(Error::Debug)?; 2595 } 2596 Resume => { 2597 self.debug_resume().map_err(Error::Debug)?; 2598 } 2599 ReadRegs => { 2600 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?; 2601 return Ok(GdbResponsePayload::RegValues(Box::new(regs))); 2602 } 2603 WriteRegs(regs) => { 2604 self.write_regs(cpu_id, regs).map_err(Error::Debug)?; 2605 } 2606 ReadMem(vaddr, len) => { 2607 let mem = self.read_mem(cpu_id, *vaddr, *len).map_err(Error::Debug)?; 2608 return Ok(GdbResponsePayload::MemoryRegion(mem)); 2609 } 2610 WriteMem(vaddr, data) => { 2611 self.write_mem(cpu_id, vaddr, data).map_err(Error::Debug)?; 2612 } 2613 ActiveVcpus => { 2614 let active_vcpus = self.active_vcpus(); 2615 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus)); 2616 } 2617 } 2618 Ok(GdbResponsePayload::CommandComplete) 2619 } 2620 2621 #[cfg(feature = "guest_debug")] 2622 fn get_dump_state( 2623 &mut self, 2624 destination_url: &str, 2625 ) -> std::result::Result<DumpState, GuestDebuggableError> { 2626 
        let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
        let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
        let mut elf_phdr_num = 1;
        let elf_sh_info = 0;
        let coredump_file_path = url_to_file(destination_url)?;
        let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();

        // One program header per guest RAM mapping on top of the initial one;
        // the total must fit in the ELF 16-bit e_phnum field.
        if mapping_num < UINT16_MAX - 2 {
            elf_phdr_num += mapping_num as u16;
        } else {
            panic!("mapping num beyond 65535 not supported");
        }
        // create_new(true): refuse to overwrite an existing coredump file.
        let coredump_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(coredump_file_path)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
        let mem_data = self
            .memory_manager
            .lock()
            .unwrap()
            .coredump_memory_regions(mem_offset);

        Ok(DumpState {
            elf_note_size,
            elf_phdr_num,
            elf_sh_info,
            mem_offset,
            mem_info: Some(mem_data),
            file: Some(coredump_file),
        })
    }

    // Offset of the memory contents inside the coredump file: ELF header,
    // then the note data, then one program header per segment.
    #[cfg(feature = "guest_debug")]
    fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
        size_of::<elf::Elf64_Ehdr>() as u64
            + note_size as u64
            + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
    }
}

impl Pausable for Vm {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "pausing");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        // Save the guest clock so resume() can restore it (KVM/x86 only).
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
            clock.reset_flags();
            self.saved_clock = Some(clock);
        }

        // Before pausing the vCPUs activate any pending virtio devices that might
        // need activation between starting the pause (or e.g. a migration it's part of)
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;
        // Re-apply the clock captured at pause() time (KVM/x86 only).
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }
        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
2732 *state = new_state; 2733 event!("vm", "resumed"); 2734 Ok(()) 2735 } 2736 } 2737 2738 #[derive(Serialize, Deserialize)] 2739 pub struct VmSnapshot { 2740 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2741 pub clock: Option<hypervisor::ClockData>, 2742 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2743 pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>, 2744 } 2745 2746 pub const VM_SNAPSHOT_ID: &str = "vm"; 2747 impl Snapshottable for Vm { 2748 fn id(&self) -> String { 2749 VM_SNAPSHOT_ID.to_string() 2750 } 2751 2752 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2753 event!("vm", "snapshotting"); 2754 2755 #[cfg(feature = "tdx")] 2756 let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled(); 2757 2758 #[cfg(feature = "tdx")] 2759 { 2760 if tdx_enabled { 2761 return Err(MigratableError::Snapshot(anyhow!( 2762 "Snapshot not possible with TDX VM" 2763 ))); 2764 } 2765 } 2766 2767 let current_state = self.get_state().unwrap(); 2768 if current_state != VmState::Paused { 2769 return Err(MigratableError::Snapshot(anyhow!( 2770 "Trying to snapshot while VM is running" 2771 ))); 2772 } 2773 2774 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2775 let common_cpuid = { 2776 let phys_bits = physical_bits(self.config.lock().unwrap().cpus.max_phys_bits); 2777 arch::generate_common_cpuid( 2778 self.hypervisor.clone(), 2779 None, 2780 None, 2781 phys_bits, 2782 self.config.lock().unwrap().cpus.kvm_hyperv, 2783 #[cfg(feature = "tdx")] 2784 tdx_enabled, 2785 ) 2786 .map_err(|e| { 2787 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e)) 2788 })? 
2789 }; 2790 2791 let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID); 2792 let vm_snapshot_data = serde_json::to_vec(&VmSnapshot { 2793 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2794 clock: self.saved_clock, 2795 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2796 common_cpuid, 2797 }) 2798 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2799 2800 vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?); 2801 vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?); 2802 2803 #[cfg(target_arch = "aarch64")] 2804 self.add_vgic_snapshot_section(&mut vm_snapshot) 2805 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2806 2807 vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?); 2808 vm_snapshot.add_data_section(SnapshotDataSection { 2809 id: format!("{}-section", VM_SNAPSHOT_ID), 2810 snapshot: vm_snapshot_data, 2811 }); 2812 2813 event!("vm", "snapshotted"); 2814 Ok(vm_snapshot) 2815 } 2816 2817 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 2818 event!("vm", "restoring"); 2819 2820 let current_state = self 2821 .get_state() 2822 .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?; 2823 let new_state = VmState::Paused; 2824 current_state.valid_transition(new_state).map_err(|e| { 2825 MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e)) 2826 })?; 2827 2828 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2829 self.load_clock_from_snapshot(&snapshot) 2830 .map_err(|e| MigratableError::Restore(anyhow!("Error restoring clock: {:?}", e)))?; 2831 2832 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2833 self.memory_manager 2834 .lock() 2835 .unwrap() 2836 .restore(*memory_manager_snapshot.clone())?; 2837 } else { 2838 return Err(MigratableError::Restore(anyhow!( 2839 "Missing memory manager snapshot" 2840 ))); 2841 } 2842 2843 if let 
Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) { 2844 self.device_manager 2845 .lock() 2846 .unwrap() 2847 .restore(*device_manager_snapshot.clone())?; 2848 } else { 2849 return Err(MigratableError::Restore(anyhow!( 2850 "Missing device manager snapshot" 2851 ))); 2852 } 2853 2854 if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) { 2855 self.cpu_manager 2856 .lock() 2857 .unwrap() 2858 .restore(*cpu_manager_snapshot.clone())?; 2859 } else { 2860 return Err(MigratableError::Restore(anyhow!( 2861 "Missing CPU manager snapshot" 2862 ))); 2863 } 2864 2865 #[cfg(target_arch = "aarch64")] 2866 self.restore_vgic_and_enable_interrupt(&snapshot)?; 2867 2868 if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) { 2869 self.device_manager 2870 .lock() 2871 .unwrap() 2872 .restore_devices(*device_manager_snapshot.clone())?; 2873 } else { 2874 return Err(MigratableError::Restore(anyhow!( 2875 "Missing device manager snapshot" 2876 ))); 2877 } 2878 2879 // Now we can start all vCPUs from here. 
2880 self.cpu_manager 2881 .lock() 2882 .unwrap() 2883 .start_restored_vcpus() 2884 .map_err(|e| { 2885 MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e)) 2886 })?; 2887 2888 self.setup_signal_handler().map_err(|e| { 2889 MigratableError::Restore(anyhow!("Could not setup signal handler: {:#?}", e)) 2890 })?; 2891 self.setup_tty() 2892 .map_err(|e| MigratableError::Restore(anyhow!("Could not setup tty: {:#?}", e)))?; 2893 2894 let mut state = self 2895 .state 2896 .try_write() 2897 .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?; 2898 *state = new_state; 2899 2900 event!("vm", "restored"); 2901 Ok(()) 2902 } 2903 } 2904 2905 impl Transportable for Vm { 2906 fn send( 2907 &self, 2908 snapshot: &Snapshot, 2909 destination_url: &str, 2910 ) -> std::result::Result<(), MigratableError> { 2911 let mut snapshot_config_path = url_to_path(destination_url)?; 2912 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2913 2914 // Create the snapshot config file 2915 let mut snapshot_config_file = OpenOptions::new() 2916 .read(true) 2917 .write(true) 2918 .create_new(true) 2919 .open(snapshot_config_path) 2920 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2921 2922 // Serialize and write the snapshot config 2923 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2924 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2925 2926 snapshot_config_file 2927 .write(vm_config.as_bytes()) 2928 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2929 2930 let mut snapshot_state_path = url_to_path(destination_url)?; 2931 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2932 2933 // Create the snapshot state file 2934 let mut snapshot_state_file = OpenOptions::new() 2935 .read(true) 2936 .write(true) 2937 .create_new(true) 2938 .open(snapshot_state_path) 2939 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2940 2941 // Serialize and write the snapshot state 2942 let vm_state = 2943 
serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2944 2945 snapshot_state_file 2946 .write(&vm_state) 2947 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2948 2949 // Tell the memory manager to also send/write its own snapshot. 2950 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2951 self.memory_manager 2952 .lock() 2953 .unwrap() 2954 .send(&memory_manager_snapshot.clone(), destination_url)?; 2955 } else { 2956 return Err(MigratableError::Restore(anyhow!( 2957 "Missing memory manager snapshot" 2958 ))); 2959 } 2960 2961 Ok(()) 2962 } 2963 } 2964 2965 impl Migratable for Vm { 2966 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2967 self.memory_manager.lock().unwrap().start_dirty_log()?; 2968 self.device_manager.lock().unwrap().start_dirty_log() 2969 } 2970 2971 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2972 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2973 self.device_manager.lock().unwrap().stop_dirty_log() 2974 } 2975 2976 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2977 Ok(MemoryRangeTable::new_from_tables(vec![ 2978 self.memory_manager.lock().unwrap().dirty_log()?, 2979 self.device_manager.lock().unwrap().dirty_log()?, 2980 ])) 2981 } 2982 2983 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2984 self.memory_manager.lock().unwrap().start_migration()?; 2985 self.device_manager.lock().unwrap().start_migration() 2986 } 2987 2988 fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { 2989 self.memory_manager.lock().unwrap().complete_migration()?; 2990 self.device_manager.lock().unwrap().complete_migration() 2991 } 2992 } 2993 2994 #[cfg(feature = "guest_debug")] 2995 impl Debuggable for Vm { 2996 fn set_guest_debug( 2997 &self, 2998 cpu_id: usize, 2999 addrs: &[GuestAddress], 3000 singlestep: bool, 3001 ) -> 
 std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    // Pause the VM (if Running) and mark it stopped at a breakpoint.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    // Resume the VM only when it is stopped at a breakpoint.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    fn read_mem(
        &self,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(cpu_id, vaddr, len)
    }

    fn write_mem(
        &self,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(cpu_id, vaddr, data)
    }

    fn active_vcpus(&self) -> usize {
        let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            self.cpu_manager.lock().unwrap().boot_vcpus() as usize
        }
    }
}

// Upper bound for the coredump program header count (ELF e_phnum is 16-bit).
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = 65535;

#[cfg(feature = "guest_debug")]
impl Elf64Writable for Vm {}

#[cfg(feature = "guest_debug")]
impl GuestDebuggable for Vm {
    /// Write an ELF coredump of the paused guest to `destination_url`:
    /// header, notes (CPU + VMM), load segments, then the guest memory.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        #[cfg(feature = "tdx")]
        {
            if self.config.lock().unwrap().tdx.is_some() {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Coredump not possible with TDX VM"
                )));
            }
        }

        let current_state = self.get_state().unwrap();
        if current_state != VmState::Paused {
            return Err(GuestDebuggableError::Coredump(anyhow!(
                "Trying to coredump while VM is running"
            )));
        }

        let coredump_state = self.get_dump_state(destination_url)?;

        self.write_header(&coredump_state)?;
        self.write_note(&coredump_state)?;
        self.write_loads(&coredump_state)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_elf64_note(&coredump_state)?;
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_vmm_note(&coredump_state)?;

        self.memory_manager
            .lock()
            .unwrap()
            .coredump_iterate_save_mem(&coredump_state)
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    // Exercise every transition out of `state` and check which are legal.
    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::BreakPoint => {
                // Check the transitions from Breakpoint
                assert!(state.valid_transition(VmState::Created).is_ok());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    // Expected tuples are (start, length, is_ram) HOB memory resources.
    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};

    const LEN: u64 = 4096;

    // Build an FDT with a serial, a virtio and an RTC MMIO device and check
    // that creation succeeds. NOTE: requires a working hypervisor (KVM) on
    // the test host since it creates a real VM and vGIC.
    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");
        assert!(create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .is_ok())
    }
}

// End-to-end smoke test: run a tiny real-mode program in a KVM guest and
// check its serial-port output exits through IoOut until HLT.
// NOTE: requires a working /dev/kvm on the test host.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len() as u64,
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {:?}", r),
        }
    }
}