1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::{ 15 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 16 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 17 }; 18 use crate::config::{NumaConfig, PayloadConfig}; 19 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 20 use crate::coredump::{ 21 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 22 }; 23 use crate::cpu; 24 use crate::device_manager::{DeviceManager, DeviceManagerError, PtyPair}; 25 use crate::device_tree::DeviceTree; 26 #[cfg(feature = "guest_debug")] 27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 28 use crate::memory_manager::{ 29 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 30 }; 31 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 32 use crate::migration::get_vm_snapshot; 33 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 34 use crate::migration::url_to_file; 35 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 36 use crate::GuestMemoryMmap; 37 use crate::{ 38 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 39 }; 40 use anyhow::anyhow; 41 use arch::get_host_cpu_phys_bits; 42 #[cfg(target_arch = "x86_64")] 43 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 44 #[cfg(feature = "tdx")] 45 use arch::x86_64::tdx::TdvfSection; 46 use arch::EntryPoint; 47 #[cfg(target_arch = "aarch64")] 48 use 
arch::PciSpaceInfo; 49 use arch::{NumaNode, NumaNodes}; 50 #[cfg(target_arch = "aarch64")] 51 use devices::interrupt_controller; 52 use devices::AcpiNotificationFlags; 53 #[cfg(all(target_arch = "aarch64", feature = "guest_debug"))] 54 use gdbstub_arch::aarch64::reg::AArch64CoreRegs as CoreRegs; 55 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 56 use gdbstub_arch::x86::reg::X86_64CoreRegs as CoreRegs; 57 use hypervisor::{HypervisorVmError, VmOps}; 58 use libc::{termios, SIGWINCH}; 59 use linux_loader::cmdline::Cmdline; 60 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 61 use linux_loader::elf; 62 #[cfg(target_arch = "x86_64")] 63 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 64 #[cfg(target_arch = "aarch64")] 65 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 66 use linux_loader::loader::KernelLoader; 67 use seccompiler::SeccompAction; 68 use serde::{Deserialize, Serialize}; 69 use std::cmp; 70 use std::collections::BTreeMap; 71 use std::collections::HashMap; 72 use std::convert::TryInto; 73 use std::fs::{File, OpenOptions}; 74 use std::io::{self, Seek, SeekFrom, Write}; 75 #[cfg(feature = "tdx")] 76 use std::mem; 77 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 78 use std::mem::size_of; 79 use std::num::Wrapping; 80 use std::ops::Deref; 81 use std::os::unix::net::UnixStream; 82 use std::sync::{Arc, Mutex, RwLock}; 83 use std::time::Instant; 84 use std::{result, str, thread}; 85 use thiserror::Error; 86 use tracer::trace_scoped; 87 use vm_device::Bus; 88 #[cfg(feature = "tdx")] 89 use vm_memory::{Address, ByteValued, GuestMemory, GuestMemoryRegion}; 90 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic}; 91 use vm_migration::protocol::{Request, Response, Status}; 92 use vm_migration::{ 93 protocol::MemoryRangeTable, snapshot_from_id, Migratable, MigratableError, Pausable, Snapshot, 94 SnapshotData, Snapshottable, Transportable, 95 }; 96 use 
vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::sock_ctrl_msg::ScmSocket;

/// Errors associated with VM management
#[derive(Debug, Error)]
pub enum Error {
    #[error("Cannot open kernel file: {0}")]
    KernelFile(#[source] io::Error),

    #[error("Cannot open initramfs file: {0}")]
    InitramfsFile(#[source] io::Error),

    #[error("Cannot load the kernel into memory: {0}")]
    KernelLoad(#[source] linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot load the UEFI binary in memory: {0:?}")]
    UefiLoad(arch::aarch64::uefi::Error),

    #[error("Cannot load the initramfs into memory")]
    InitramfsLoad,

    #[error("Cannot load the kernel command line in memory: {0}")]
    LoadCmdLine(#[source] linux_loader::loader::Error),

    #[error("Cannot modify the kernel command line: {0}")]
    CmdLineInsertStr(#[source] linux_loader::cmdline::Error),

    #[error("Cannot create the kernel command line: {0}")]
    CmdLineCreate(#[source] linux_loader::cmdline::Error),

    #[error("Cannot configure system: {0}")]
    ConfigureSystem(#[source] arch::Error),

    #[cfg(target_arch = "aarch64")]
    #[error("Cannot enable interrupt controller: {0:?}")]
    EnableInterruptController(interrupt_controller::Error),

    #[error("VM state is poisoned")]
    PoisonedState,

    #[error("Error from device manager: {0:?}")]
    DeviceManager(DeviceManagerError),

    #[error("No device with id {0:?} to remove")]
    NoDeviceToRemove(String),

    #[error("Cannot spawn a signal handler thread: {0}")]
    SignalHandlerSpawn(#[source] io::Error),

    #[error("Failed to join on threads: {0:?}")]
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("VM config is missing")]
    VmMissingConfig,

    #[error("VM is not created")]
    VmNotCreated,

    #[error("VM is already created")]
    VmAlreadyCreated,

    #[error("VM is not running")]
    VmNotRunning,

    #[error("Cannot clone EventFd: {0}")]
    EventFdClone(#[source] io::Error),

    #[error("invalid VM state transition: {0:?} to {1:?}")]
    InvalidStateTransition(VmState, VmState),

    #[error("Error from CPU manager: {0}")]
    CpuManager(#[source] cpu::Error),

    #[error("Cannot pause devices: {0}")]
    PauseDevices(#[source] MigratableError),

    #[error("Cannot resume devices: {0}")]
    ResumeDevices(#[source] MigratableError),

    #[error("Cannot pause CPUs: {0}")]
    PauseCpus(#[source] MigratableError),

    #[error("Cannot resume cpus: {0}")]
    ResumeCpus(#[source] MigratableError),

    #[error("Cannot pause VM: {0}")]
    Pause(#[source] MigratableError),

    #[error("Cannot resume VM: {0}")]
    Resume(#[source] MigratableError),

    #[error("Memory manager error: {0:?}")]
    MemoryManager(MemoryManagerError),

    #[error("Eventfd write error: {0}")]
    EventfdError(#[source] std::io::Error),

    #[error("Cannot snapshot VM: {0}")]
    Snapshot(#[source] MigratableError),

    #[error("Cannot restore VM: {0}")]
    Restore(#[source] MigratableError),

    #[error("Cannot send VM snapshot: {0}")]
    SnapshotSend(#[source] MigratableError),

    #[error("Invalid restore source URL")]
    InvalidRestoreSourceUrl,

    #[error("Failed to validate config: {0}")]
    ConfigValidation(#[source] ValidationError),

    #[error("Too many virtio-vsock devices")]
    TooManyVsockDevices,

    #[error("Failed serializing into JSON: {0}")]
    SerializeJson(#[source] serde_json::Error),

    #[error("Invalid NUMA configuration")]
    InvalidNumaConfig,

    #[error("Cannot create seccomp filter: {0}")]
    CreateSeccompFilter(#[source] seccompiler::Error),

    #[error("Cannot apply seccomp filter: {0}")]
    ApplySeccompFilter(#[source] seccompiler::Error),

    #[error("Failed resizing a memory zone")]
    ResizeZone,

    #[error("Cannot activate virtio devices: {0:?}")]
    ActivateVirtioDevices(DeviceManagerError),

    #[error("Error triggering power button: {0:?}")]
    PowerButton(DeviceManagerError),

    #[error("Kernel lacks PVH header")]
    KernelMissingPvhHeader,

    #[error("Failed to allocate firmware RAM: {0:?}")]
    AllocateFirmwareMemory(MemoryManagerError),

    #[error("Error manipulating firmware file: {0}")]
    FirmwareFile(#[source] std::io::Error),

    #[error("Firmware too big")]
    FirmwareTooLarge,

    #[error("Failed to copy firmware to memory: {0}")]
    FirmwareLoad(#[source] vm_memory::GuestMemoryError),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on TDX firmware file: {0}")]
    LoadTdvf(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error performing I/O on the TDX payload file: {0}")]
    LoadPayload(#[source] std::io::Error),

    #[cfg(feature = "tdx")]
    #[error("Error parsing TDVF: {0}")]
    ParseTdvf(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error populating TDX HOB: {0}")]
    PopulateHob(#[source] arch::x86_64::tdx::TdvfError),

    #[cfg(feature = "tdx")]
    #[error("Error allocating TDVF memory: {0:?}")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX VM: {0}")]
    InitializeTdxVm(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error enabling TDX memory region: {0}")]
    InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("Error finalizing TDX VM: {0}")]
    FinalizeTdx(#[source] hypervisor::HypervisorVmError),

    #[cfg(feature = "tdx")]
    #[error("TDX firmware missing")]
    TdxFirmwareMissing,

    #[cfg(feature = "tdx")]
    #[error("Invalid TDX payload type")]
    InvalidPayloadType,

    #[cfg(feature = "guest_debug")]
    #[error("Error debugging VM: {0:?}")]
    Debug(DebuggableError),

    #[error("Error spawning kernel loading thread")]
    KernelLoadThreadSpawn(std::io::Error),

    #[error("Error joining kernel loading thread")]
    KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    #[error("Payload configuration is not bootable")]
    InvalidPayload,

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    #[error("Error coredumping VM: {0:?}")]
    Coredump(GuestDebuggableError),
}
/// Convenience alias for results whose error type is this module's [`Error`].
pub type Result<T> = result::Result<T, Error>;

/// High-level lifecycle state of a VM.
///
/// Transitions between states are validated by [`VmState::valid_transition`];
/// invalid transitions are reported as [`Error::InvalidStateTransition`].
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
    BreakPoint,
}

impl VmState {
    /// Returns `Ok(())` when moving from `self` to `new_state` is a legal
    /// lifecycle transition, and `Error::InvalidStateTransition` otherwise.
    ///
    /// Notable rules encoded below: a `Shutdown` VM can only go back to
    /// `Running` (reboot), and a `BreakPoint` VM (guest-debug stop) can only
    /// go to `Created` or `Running`.
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created => Err(Error::InvalidStateTransition(self, new_state)),
                VmState::Running | VmState::Paused | VmState::BreakPoint | VmState::Shutdown => {
                    Ok(())
                }
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
            VmState::BreakPoint => match new_state {
                VmState::Created | VmState::Running => Ok(()),
                _ => Err(Error::InvalidStateTransition(self, new_state)),
            },
        }
    }
}

// Bridges hypervisor vCPU exits (MMIO/PIO/guest-memory accesses) to the VM's
// device buses and guest memory. Handed to the CPU manager as `Arc<dyn VmOps>`.
struct VmOpsHandler {
    memory:
GuestMemoryAtomic<GuestMemoryMmap>,
    // Port-I/O only exists on x86_64; other architectures use MMIO exclusively.
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
}

impl VmOps for VmOpsHandler {
    /// Writes `buf` into guest physical memory at `gpa`.
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    /// Reads from guest physical memory at `gpa` into `buf`.
    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    /// Dispatches a guest MMIO read to the MMIO bus. A read to an address
    /// with no registered device is logged and otherwise ignored.
    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            info!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    /// Dispatches a guest MMIO write to the MMIO bus. If the device returns a
    /// barrier, this thread blocks on it before resuming the vCPU.
    fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    /// Dispatches a guest port-I/O read to the PIO bus (x86_64 only).
    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            info!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    /// Dispatches a guest port-I/O write to the PIO bus (x86_64 only),
    /// honoring device-returned barriers like `mmio_write`.
    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                info!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}

/// Returns the effective number of guest physical address bits: the smaller
/// of what the host CPU supports and the configured `max_phys_bits` cap.
pub fn physical_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>, max_phys_bits: u8) -> u8 {
    let host_phys_bits = get_host_cpu_phys_bits(hypervisor);

    cmp::min(host_phys_bits, max_phys_bits)
}

/// A fully assembled virtual machine: configuration plus the CPU, memory and
/// device managers built on top of the hypervisor-abstracted VM handle.
pub struct Vm {
    #[cfg(feature = "tdx")]
    kernel: Option<File>,
    initramfs: Option<File>,
    // Auxiliary threads owned by the VM (joined on shutdown).
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    // Current lifecycle state; transitions validated via VmState::valid_transition.
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    // Guest clock captured at snapshot time (KVM/x86_64 only).
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,
    numa_nodes: NumaNodes,
    #[cfg_attr(any(not(feature = "kvm"), target_arch = "aarch64"), allow(dead_code))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    // When true (guest_debug builds with gdb enabled), don't auto-start vCPUs at boot.
    stop_on_boot: bool,
    // Handle of the background thread loading the boot payload, if any.
    load_payload_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}

impl Vm {
    // Signals forwarded to the VM's signal-handler thread (terminal resize).
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];

    /// Builds a `Vm` on top of an already-constructed `MemoryManager`:
    /// validates the config, kicks off asynchronous payload loading (fresh
    /// boots only), then creates the CPU manager, the device manager and all
    /// devices. When `snapshot` is `Some`, managers are restored from their
    /// respective snapshot sections and the VM starts in the `Paused` state.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        timestamp: Instant,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new_from_memory_manager");

        let
boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        // Start loading the kernel/firmware in the background; skipped on
        // restore since a restored guest does not boot a payload again.
        let load_payload_handle = if snapshot.is_none() {
            Self::load_payload_async(&memory_manager, &config)?
        } else {
            None
        };

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().is_tdx_enabled();
        // TDX requires the IOMMU to be forced on for all devices.
        #[cfg(feature = "tdx")]
        let force_iommu = tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        #[cfg(feature = "guest_debug")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "guest_debug"))]
        let stop_on_boot = false;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::new(Bus::new());
        let mmio_bus = Arc::new(Bus::new());

        // vCPU exit handler shared with the CPU manager.
        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus: io_bus.clone(),
            mmio_bus: mmio_bus.clone(),
        });

        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            vm.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt.try_clone().map_err(Error::EventFdClone)?,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            &hypervisor,
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
        )
        .map_err(Error::CpuManager)?;

        #[cfg(target_arch = "x86_64")]
        cpu_manager
            .lock()
            .unwrap()
            .populate_cpuid(
                &memory_manager,
                &hypervisor,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(Error::CpuManager)?;

        // The initial TDX configuration must be done before the vCPUs are
        // created
        #[cfg(feature = "tdx")]
        if tdx_enabled {
            let cpuid = cpu_manager.lock().unwrap().common_cpuid();
            let max_vcpus = cpu_manager.lock().unwrap().max_vcpus() as u32;
            vm.tdx_init(&cpuid, max_vcpus)
                .map_err(Error::InitializeTdxVm)?;
        }

        cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(snapshot_from_id(snapshot.as_ref(), CPU_MANAGER_SNAPSHOT_ID))
            .map_err(Error::CpuManager)?;

        // TDX guests do not support device hotplug ("dynamic" devices).
        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;
        #[cfg(not(feature = "tdx"))]
        let dynamic = true;

        let device_manager = DeviceManager::new(
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            hypervisor.hypervisor_type(),
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            cpu_manager.clone(),
            exit_evt.try_clone().map_err(Error::EventFdClone)?,
            reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            boot_id_list,
            timestamp,
            snapshot_from_id(snapshot.as_ref(), DEVICE_MANAGER_SNAPSHOT_ID),
            dynamic,
        )
        .map_err(Error::DeviceManager)?;

        device_manager
            .lock()
            .unwrap()
            .create_devices(
                serial_pty,
                console_pty,
                console_resize_pipe,
                original_termios,
            )
            .map_err(Error::DeviceManager)?;

        // With TDX the kernel file is kept open: it is loaded later through
        // the TDVF path rather than by the async payload loader.
        #[cfg(feature = "tdx")]
        let kernel = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.kernel.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|p| p.initramfs.as_ref().map(File::open))
            .unwrap_or_default()
            .transpose()
            .map_err(Error::InitramfsFile)?;

        // On restore, recover the guest clock saved in the snapshot.
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let saved_clock = if let Some(snapshot) = snapshot.as_ref() {
            let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
            vm_snapshot.clock
        } else {
            None
        };

        // A restored VM starts Paused (resumed explicitly by the caller);
        // a fresh VM starts Created.
        let vm_state = if snapshot.is_some() {
            VmState::Paused
        } else {
            VmState::Created
        };

        Ok(Vm {
            #[cfg(feature = "tdx")]
            kernel,
            initramfs,
            device_manager,
            config,
            threads: Vec::with_capacity(1),
            state: RwLock::new(vm_state),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock,
            numa_nodes,
            hypervisor,
            stop_on_boot,
            load_payload_handle,
        })
    }

    /// Builds the guest NUMA topology from the optional `NumaConfig` list,
    /// resolving memory-zone and SGX EPC section names against the
    /// `MemoryManager` and validating CPU/distance settings.
    ///
    /// Returns `Error::InvalidNumaConfig` on duplicate node ids, unknown
    /// zone/section names, or distances referencing undefined nodes.
    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Can't define twice the same NUMA node");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        // Distances may only reference nodes defined in this config.
                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if
node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has been already set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

    /// Creates a complete VM from scratch (or from a snapshot): builds the
    /// hypervisor VM handle, then the `MemoryManager` — restored from the
    /// snapshot's memory section when present — and finally delegates the
    /// rest of the construction to [`Vm::new_from_memory_manager`].
    ///
    /// `source_url`/`prefault` are only meaningful on the restore path.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        vm_config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "guest_debug")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
        original_termios: Arc<Mutex<Option<termios>>>,
        snapshot: Option<Snapshot>,
        source_url: Option<&str>,
        prefault: Option<bool>,
    ) -> Result<Self> {
        trace_scoped!("Vm::new");

        let timestamp = Instant::now();

        // TDX is never used on the restore path.
        #[cfg(feature = "tdx")]
        let tdx_enabled = if snapshot.is_some() {
            false
        } else {
            vm_config.lock().unwrap().is_tdx_enabled()
        };

        let vm = Self::create_hypervisor_vm(
            &hypervisor,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        )?;

        let phys_bits = physical_bits(&hypervisor, vm_config.lock().unwrap().cpus.max_phys_bits);

        let memory_manager = if let Some(snapshot) =
            snapshot_from_id(snapshot.as_ref(), MEMORY_MANAGER_SNAPSHOT_ID)
        {
            // NOTE(review): prefault.unwrap() assumes callers always supply
            // Some(prefault) on the restore path — confirm with callers.
            MemoryManager::new_from_snapshot(
                &snapshot,
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                source_url,
                prefault.unwrap(),
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            #[cfg(target_arch = "x86_64")]
            let sgx_epc_config = vm_config.lock().unwrap().sgx_epc.clone();

            MemoryManager::new(
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                None,
                phys_bits,
                #[cfg(feature = "tdx")]
                tdx_enabled,
                None,
                None,
                #[cfg(target_arch = "x86_64")]
                sgx_epc_config,
            )
            .map_err(Error::MemoryManager)?
        };

        Vm::new_from_memory_manager(
            vm_config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "guest_debug")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            timestamp,
            serial_pty,
            console_pty,
            console_resize_pipe,
            original_termios,
            snapshot,
        )
    }

    /// Creates the raw hypervisor VM handle and applies the x86_64-specific
    /// KVM setup (identity map, TSS address, split IRQ chip).
    pub fn create_hypervisor_vm(
        hypervisor: &Arc<dyn hypervisor::Hypervisor>,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
    ) -> Result<Arc<dyn hypervisor::Vm>> {
        hypervisor.check_required_extensions().unwrap();

        // 0 for KVM_X86_LEGACY_VM
        // 1 for KVM_X86_TDX_VM
        #[cfg(feature = "tdx")]
        let vm = hypervisor
            .create_vm_with_type(u64::from(tdx_enabled))
            .unwrap();
        #[cfg(not(feature = "tdx"))]
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        Ok(vm)
    }

    /// Copies the initramfs file into guest memory at the arch-chosen load
    /// address and returns its placement. Panics if no initramfs was
    /// configured (callers check `self.initramfs` first).
    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs = self.initramfs.as_ref().unwrap();
        // Determine the file size by seeking to its end.
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_|
Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs.rewind().map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    /// Builds the kernel command line from the payload config; on aarch64,
    /// device-manager-provided additions (e.g. device cmdline snippets) are
    /// appended as well.
    pub fn generate_cmdline(
        payload: &PayloadConfig,
        #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
    ) -> Result<Cmdline> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE).map_err(Error::CmdLineCreate)?;
        if let Some(s) = payload.cmdline.as_ref() {
            cmdline.insert_str(s).map_err(Error::CmdLineInsertStr)?;
        }

        #[cfg(target_arch = "aarch64")]
        for entry in device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        Ok(cmdline)
    }

    /// Loads a UEFI firmware image into the dedicated UEFI flash region
    /// (aarch64 only).
    #[cfg(target_arch = "aarch64")]
    fn load_firmware(mut firmware: &File, memory_manager: Arc<Mutex<MemoryManager>>) -> Result<()> {
        let uefi_flash = memory_manager.lock().as_ref().unwrap().uefi_flash();
        let mem = uefi_flash.memory();
        arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut firmware)
            .map_err(Error::UefiLoad)?;
        Ok(())
    }

    /// Loads the aarch64 boot payload and returns its entry point.
    ///
    /// Exactly one of `firmware`/`kernel` must be `Some`; anything else is
    /// `Error::InvalidPayload`. A "kernel" that is not a PE image falls back
    /// to being treated as raw UEFI firmware.
    #[cfg(target_arch = "aarch64")]
    fn load_kernel(
        firmware: Option<File>,
        kernel: Option<File>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let entry_addr = match (firmware, kernel) {
            (None, Some(mut kernel)) => {
                match linux_loader::loader::pe::PE::load(
                    mem.deref(),
                    Some(arch::layout::KERNEL_START),
                    &mut kernel,
                    None,
                ) {
                    Ok(entry_addr) => entry_addr.kernel_load,
                    // Try to load the binary as kernel PE file at first.
                    // If failed, retry to load it as UEFI binary.
                    // As the UEFI binary is formatless, it must be the last option to try.
                    Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                        Self::load_firmware(&kernel, memory_manager)?;
                        arch::layout::UEFI_START
                    }
                    Err(e) => {
                        return Err(Error::KernelLoad(e));
                    }
                }
            }
            (Some(firmware), None) => {
                Self::load_firmware(&firmware, memory_manager)?;
                arch::layout::UEFI_START
            }
            _ => return Err(Error::InvalidPayload),
        };

        Ok(EntryPoint { entry_addr })
    }

    /// Loads an x86_64 ELF kernel (and optional command line) into guest
    /// memory, returning the PVH entry point. Kernels without a PVH header
    /// are rejected with `Error::KernelMissingPvhHeader`.
    #[cfg(target_arch = "x86_64")]
    fn load_kernel(
        mut kernel: File,
        cmdline: Option<Cmdline>,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        info!("Loading kernel");

        let mem = {
            let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
            guest_memory.memory()
        };
        let entry_addr = linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        )
        .map_err(Error::KernelLoad)?;

        if let Some(cmdline) = cmdline {
            linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline)
                .map_err(Error::LoadCmdLine)?;
        }

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest
            info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint {
                entry_addr: Some(entry_addr),
            })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

    /// Dispatches x86_64 payload loading: either bare firmware (no kernel,
    /// initramfs or cmdline allowed alongside it) or a kernel with generated
    /// command line. Any other combination is `Error::InvalidPayload`.
    #[cfg(target_arch = "x86_64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        trace_scoped!("load_payload");
        match (
            &payload.firmware,
            &payload.kernel,
            &payload.initramfs,
            &payload.cmdline,
        ) {
            (Some(firmware), None, None, None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(firmware, None, memory_manager)
            }
            (None, Some(kernel), _, _) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                let cmdline = Self::generate_cmdline(payload)?;
                Self::load_kernel(kernel, Some(cmdline), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    /// Dispatches aarch64 payload loading: exactly one of firmware/kernel.
    #[cfg(target_arch = "aarch64")]
    fn load_payload(
        payload: &PayloadConfig,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        match (&payload.firmware, &payload.kernel) {
            (Some(firmware), None) => {
                let firmware = File::open(firmware).map_err(Error::FirmwareFile)?;
                Self::load_kernel(Some(firmware), None, memory_manager)
            }
            (None, Some(kernel)) => {
                let kernel = File::open(kernel).map_err(Error::KernelFile)?;
                Self::load_kernel(None, Some(kernel), memory_manager)
            }
            _ => Err(Error::InvalidPayload),
        }
    }

    /// Spawns a background thread that loads the configured payload, so boot
    /// setup can overlap with file I/O. Returns `Ok(None)` when there is no
    /// payload or when TDX is enabled (TDX loads its payload separately).
    fn load_payload_async(
        memory_manager: &Arc<Mutex<MemoryManager>>,
        config: &Arc<Mutex<VmConfig>>,
    ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> {
        // Kernel with TDX is loaded in a different manner
        #[cfg(feature = "tdx")]
        if config.lock().unwrap().is_tdx_enabled() {
            return Ok(None);
        }

        config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .map(|payload| {
                let memory_manager = memory_manager.clone();
                let payload = payload.clone();

                std::thread::Builder::new()
                    .name("payload_loader".into())
                    .spawn(move || Self::load_payload(&payload, memory_manager))
                    .map_err(Error::KernelLoadThreadSpawn)
            })
            .transpose()
    }

    /// Writes the x86_64 boot-time configuration (zero page, cmdline
    /// placement, SMBIOS identifiers, ACPI RSDP pointer, SGX EPC, initramfs)
    /// into guest memory.
    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> {
        trace_scoped!("configure_system");
        info!("Configuring system");
        let mem =
self.memory_manager.lock().unwrap().boot_guest_memory();

        // Load the initramfs now (if configured) so its placement can be
        // advertised to the guest below.
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
        let rsdp_addr = Some(rsdp_addr);
        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        // SMBIOS identity fields from the optional platform config.
        let serial_number = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.serial_number.clone());

        let uuid = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.uuid.clone());

        let oem_strings = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.oem_strings.clone());

        // Re-borrow the OEM strings as &str slices for the arch layer.
        let oem_strings = oem_strings
            .as_deref()
            .map(|strings| strings.iter().map(|s| s.as_ref()).collect::<Vec<&str>>());

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            &initramfs_config,
            boot_vcpus,
            rsdp_addr,
            sgx_epc_region,
            serial_number.as_deref(),
            uuid.as_deref(),
            oem_strings.as_deref(),
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }

    /// Writes the aarch64 boot-time configuration: builds the kernel command
    /// line, gathers vCPU MPIDRs/topology, device and PCI-segment layout,
    /// the virtio-iommu BDF and the vGIC, then hands everything to the arch
    /// layer (which generates the FDT). The RSDP address is unused here.
    #[cfg(target_arch = "aarch64")]
    fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
        let cmdline = Self::generate_cmdline(
            self.config.lock().unwrap().payload.as_ref().unwrap(),
            &self.device_manager,
        )?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        // Describe every PCI segment's ECAM window and device area.
        for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
            let pci_space = PciSpaceInfo {
                pci_segment_id: pci_segment.id,
                mmio_config_address: pci_segment.mmio_config_address,
                pci_device_space_start: pci_segment.start_of_device_area,
                pci_device_space_size: pci_segment.end_of_device_area
                    - pci_segment.start_of_device_area
                    + 1,
            };
            pci_space_info.push(pci_space);
        }

        let virtio_iommu_bdf = self
            .device_manager
            .lock()
            .unwrap()
            .iommu_attached_devices()
            .as_ref()
            .map(|(v, _)| *v);

        let vgic = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .get_vgic()
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::SetupGic,
                ))
            })?;

        // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
1160 let pmu_supported = self 1161 .cpu_manager 1162 .lock() 1163 .unwrap() 1164 .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16) 1165 .map_err(|_| { 1166 Error::ConfigureSystem(arch::Error::PlatformSpecific( 1167 arch::aarch64::Error::VcpuInitPmu, 1168 )) 1169 })?; 1170 1171 arch::configure_system( 1172 &mem, 1173 cmdline.as_cstring().unwrap().to_str().unwrap(), 1174 vcpu_mpidrs, 1175 vcpu_topology, 1176 device_info, 1177 &initramfs_config, 1178 &pci_space_info, 1179 virtio_iommu_bdf.map(|bdf| bdf.into()), 1180 &vgic, 1181 &self.numa_nodes, 1182 pmu_supported, 1183 ) 1184 .map_err(Error::ConfigureSystem)?; 1185 1186 Ok(()) 1187 } 1188 1189 pub fn serial_pty(&self) -> Option<PtyPair> { 1190 self.device_manager.lock().unwrap().serial_pty() 1191 } 1192 1193 pub fn console_pty(&self) -> Option<PtyPair> { 1194 self.device_manager.lock().unwrap().console_pty() 1195 } 1196 1197 pub fn console_resize_pipe(&self) -> Option<Arc<File>> { 1198 self.device_manager.lock().unwrap().console_resize_pipe() 1199 } 1200 1201 pub fn shutdown(&mut self) -> Result<()> { 1202 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 1203 let new_state = VmState::Shutdown; 1204 1205 state.valid_transition(new_state)?; 1206 1207 // Wake up the DeviceManager threads so they will get terminated cleanly 1208 self.device_manager 1209 .lock() 1210 .unwrap() 1211 .resume() 1212 .map_err(Error::Resume)?; 1213 1214 self.cpu_manager 1215 .lock() 1216 .unwrap() 1217 .shutdown() 1218 .map_err(Error::CpuManager)?; 1219 1220 // Wait for all the threads to finish 1221 for thread in self.threads.drain(..) { 1222 thread.join().map_err(Error::ThreadCleanup)? 
        }
        *state = new_state;

        event!("vm", "shutdown");

        Ok(())
    }

    /// Resizes the VM at runtime. Each of the three dimensions is optional and
    /// handled independently: vCPU count, guest memory size, and balloon size.
    /// Successful resizes are reflected back into the stored VmConfig so a
    /// reboot keeps the new values.
    pub fn resize(
        &mut self,
        desired_vcpus: Option<u8>,
        desired_memory: Option<u64>,
        desired_balloon: Option<u64>,
    ) -> Result<()> {
        event!("vm", "resizing");

        if let Some(desired_vcpus) = desired_vcpus {
            // CpuManager::resize returns whether the topology actually
            // changed; only then is an ACPI hotplug notification needed.
            if self
                .cpu_manager
                .lock()
                .unwrap()
                .resize(desired_vcpus)
                .map_err(Error::CpuManager)?
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
        }

        if let Some(desired_memory) = desired_memory {
            // A Some(new_region) means the memory manager had to plug a new
            // RAM region to satisfy the request.
            let new_region = self
                .memory_manager
                .lock()
                .unwrap()
                .resize(desired_memory)
                .map_err(Error::MemoryManager)?;

            let memory_config = &mut self.config.lock().unwrap().memory;

            if let Some(new_region) = &new_region {
                self.device_manager
                    .lock()
                    .unwrap()
                    .update_memory(new_region)
                    .map_err(Error::DeviceManager)?;

                match memory_config.hotplug_method {
                    HotplugMethod::Acpi => {
                        self.device_manager
                            .lock()
                            .unwrap()
                            .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                            .map_err(Error::DeviceManager)?;
                    }
                    // virtio-mem needs no ACPI notification.
                    HotplugMethod::VirtioMem => {}
                }
            }

            // We update the VM config regardless of the actual guest resize
            // operation result (happened or not), so that if the VM reboots
            // it will be running with the last configured memory size.
            match memory_config.hotplug_method {
                HotplugMethod::Acpi => memory_config.size = desired_memory,
                HotplugMethod::VirtioMem => {
                    // For virtio-mem the base size stays fixed and only the
                    // hotplugged delta is recorded (None when shrinking back
                    // to or below the boot size).
                    if desired_memory > memory_config.size {
                        memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                    } else {
                        memory_config.hotplugged_size = None;
                    }
                }
            }
        }

        if let Some(desired_balloon) = desired_balloon {
            self.device_manager
                .lock()
                .unwrap()
                .resize_balloon(desired_balloon)
                .map_err(Error::DeviceManager)?;

            // Update the configuration value for the balloon size to ensure
            // a reboot would use the right value.
            if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
                balloon_config.size = desired_balloon;
            }
        }

        event!("vm", "resized");

        Ok(())
    }

    /// Resizes a single memory zone identified by `id`. Only growing (or
    /// keeping) the zone relative to its boot size is allowed; shrinking
    /// below the boot size is rejected.
    pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
        let memory_config = &mut self.config.lock().unwrap().memory;

        if let Some(zones) = &mut memory_config.zones {
            for zone in zones.iter_mut() {
                if zone.id == id {
                    if desired_memory >= zone.size {
                        let hotplugged_size = desired_memory - zone.size;
                        self.memory_manager
                            .lock()
                            .unwrap()
                            .resize_zone(&id, desired_memory - zone.size)
                            .map_err(Error::MemoryManager)?;
                        // We update the memory zone config regardless of the
                        // actual 'resize-zone' operation result (happened or
                        // not), so that if the VM reboots it will be running
                        // with the last configured memory zone size.
                        zone.hotplugged_size = Some(hotplugged_size);

                        return Ok(());
                    } else {
                        error!(
                            "Invalid to ask less ({}) than boot RAM ({}) for \
                            this memory zone",
                            desired_memory, zone.size,
                        );
                        return Err(Error::ResizeZone);
                    }
                }
            }
        }

        error!("Could not find the memory zone {} for the resize", id);
        Err(Error::ResizeZone)
    }

    /// Hotplugs a VFIO device, records it in the VmConfig (so it survives a
    /// reboot) and notifies the guest via ACPI.
    pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_device(&mut device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplugs a vfio-user device; same persist-and-notify pattern as
    /// `add_device`.
    pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_user_device(&mut device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.user_devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hot-removes the device identified by `id`, removes it from the stored
    /// VmConfig and notifies the guest via ACPI.
    pub fn remove_device(&mut self, id: String) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .remove_device(id.clone())
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by removing the device. This is important to
        // ensure the device would not be created in case of a reboot.
        self.config.lock().unwrap().remove_device(&id);

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;
        Ok(())
    }

    /// Hotplugs a virtio-block disk; persist-and-notify pattern.
    pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_disk(&mut disk_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.disks, disk_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplugs a virtio-fs filesystem; persist-and-notify pattern.
    pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_fs(&mut fs_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.fs, fs_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplugs a virtio-pmem device; persist-and-notify pattern.
    pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_pmem(&mut pmem_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.pmem, pmem_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplugs a virtio-net device; persist-and-notify pattern.
    pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_net(&mut net_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.net, net_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplugs a vDPA device; persist-and-notify pattern.
    pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vdpa(&mut vdpa_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.vdpa, vdpa_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplugs the (single) vsock device. Unlike the list-backed devices
    /// above, the config holds at most one vsock entry, so it is assigned
    /// rather than appended.
    pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vsock(&mut vsock_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            config.vsock = Some(vsock_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Returns per-device counters collected by the device manager.
    pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
        Ok(self.device_manager.lock().unwrap().counters())
    }

    /// Opens the configured TDVF firmware file and parses its section table.
    /// Returns the sections plus a flag (presumably whether the TDVF GUID
    /// table was found — determined inside parse_tdvf_sections; confirm there).
    #[cfg(feature = "tdx")]
    fn extract_tdvf_sections(&mut self) -> Result<(Vec<TdvfSection>, bool)> {
        use arch::x86_64::tdx::*;

        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        // The TDVF file contains a table of section as well as code
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // For all the sections allocate some RAM backing them
        parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
    }

    /// Builds the (start, size, is_ram) resource list for the TD HOB by
    /// interleaving guest RAM regions with TDVF sections. `sorted_sections`
    /// must be sorted by address in descending order (it is consumed via
    /// `pop`, i.e. lowest address first).
    #[cfg(feature = "tdx")]
    fn hob_memory_resources(
        mut sorted_sections: Vec<TdvfSection>,
        guest_memory:
 &GuestMemoryMmap,
    ) -> Vec<(u64, u64, bool)> {
        let mut list = Vec::new();

        // Lowest-address pending TDVF section (sections arrive sorted in
        // descending address order, so pop() yields ascending addresses).
        let mut current_section = sorted_sections.pop();

        // RAM regions interleaved with TDVF sections
        let mut next_start_addr = 0;
        for region in guest_memory.iter() {
            let region_start = region.start_addr().0;
            let region_end = region.last_addr().0;
            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            loop {
                // Emit either the pending TDVF section (non-RAM) or the RAM
                // span up to the next section / end of region, whichever
                // comes first.
                let (start, size, ram) = if let Some(section) = &current_section {
                    if section.address <= next_start_addr {
                        (section.address, section.size, false)
                    } else {
                        let last_addr = std::cmp::min(section.address - 1, region_end);
                        (next_start_addr, last_addr - next_start_addr + 1, true)
                    }
                } else {
                    (next_start_addr, region_end - next_start_addr + 1, true)
                };

                list.push((start, size, ram));

                // A section was consumed; advance to the next one.
                if !ram {
                    current_section = sorted_sections.pop();
                }

                next_start_addr = start + size;

                if region_start > next_start_addr {
                    next_start_addr = region_start;
                }

                if next_start_addr > region_end {
                    break;
                }
            }
        }

        // Once all the interleaved sections have been processed, let's simply
        // pull the remaining ones.
        if let Some(section) = current_section {
            list.push((section.address, section.size, false));
        }
        while let Some(section) = sorted_sections.pop() {
            list.push((section.address, section.size, false));
        }

        list
    }

    /// Copies the TDVF sections (firmware volumes, payload, command line)
    /// into guest memory and builds the TD HOB describing memory, MMIO and
    /// ACPI resources. Returns the guest address of the HOB, if a TdHob
    /// section was present.
    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(
        &mut self,
        sections: &[TdvfSection],
        guid_found: bool,
    ) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of section as well as code
        let firmware_path = self
            .config
            .lock()
            .unwrap()
            .payload
            .as_ref()
            .unwrap()
            .firmware
            .clone()
            .ok_or(Error::TdxFirmwareMissing)?;
        let mut firmware_file = File::open(firmware_path).map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut payload_info = None;
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    mem.read_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    if let Some(payload_file) = self.kernel.as_mut() {
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        // 0x1f1 is the offset of the bzImage setup_header
                        // within the kernel image.
                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_header
                            .as_bytes()
                            .read_from(
                                0,
                                payload_file,
                                mem::size_of::<linux_loader::bootparam::setup_header>(),
                            )
                            .unwrap();

                        // 0x5372_6448 is the "HdrS" boot-protocol magic; only
                        // bzImage payloads are accepted here.
                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Require boot protocol >= 2.00 and a relocatable
                        // (LOADED_HIGH) image.
                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        payload_file.rewind().map_err(Error::LoadPayload)?;
                        mem.read_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    info!("Copying payload parameters to guest memory");
                    let cmdline = Self::generate_cmdline(
                        self.config.lock().unwrap().payload.as_ref().unwrap(),
                    )?;
                    mem.write_slice(
                        cmdline.as_cstring().unwrap().as_bytes_with_nul(),
                        GuestAddress(section.address),
                    )
                    .unwrap();
                }
                _ => {}
            }
        }

        // Generate HOB
        let mut hob = TdHob::start(hob_offset.unwrap());

        // Only TempMem sections are reported in the HOB memory map; sort them
        // descending so hob_memory_resources can pop() in ascending order.
        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();

        for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
            hob.add_memory_resource(&mem, start, size, ram, guid_found)
                .map_err(Error::PopulateHob)?;
        }

        // MMIO regions
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.

        for acpi_table in crate::acpi::create_acpi_tables_tdx(
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        ) {
            hob.add_acpi_table(&mem, acpi_table.as_slice())
                .map_err(Error::PopulateHob)?;
        }

        // If a payload info has been created, let's insert it into the HOB.
        if let Some(payload_info) = payload_info {
            hob.add_payload(&mem, payload_info)
                .map_err(Error::PopulateHob)?;
        }

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }

    /// Registers every TDVF section's backing memory with the hypervisor so
    /// the TD knows which host ranges back which guest addresses.
    #[cfg(feature = "tdx")]
    fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();

        for section in sections {
            self.vm
                .tdx_init_memory_region(
                    mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
                    section.address,
                    section.size,
                    /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
                    section.attributes == 1,
                )
                .map_err(Error::InitializeTdxMemoryRegion)?;
        }

        Ok(())
    }

    // Creates ACPI tables
    // In case of TDX being used, this is a no-op since the tables will be
    // created and passed when populating the HOB.

    fn create_acpi_tables(&self) -> Option<GuestAddress> {
        // TDX guests receive their ACPI tables through the HOB instead.
        #[cfg(feature = "tdx")]
        if self.config.lock().unwrap().is_tdx_enabled() {
            return None;
        }
        let mem = self.memory_manager.lock().unwrap().guest_memory().memory();
        let tpm_enabled = self.config.lock().unwrap().tpm.is_some();
        let rsdp_addr = crate::acpi::create_acpi_tables(
            &mem,
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
            tpm_enabled,
        );
        info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);

        Some(rsdp_addr)
    }

    /// Waits for the asynchronous payload loader (if one was spawned) and
    /// returns the entry point it produced. Consumes the join handle, so a
    /// second call returns Ok(None).
    fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
        trace_scoped!("entry_point");

        self.load_payload_handle
            .take()
            .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?)
            .transpose()
    }

    /// Boots the VM: validates the state transition, finishes payload
    /// loading, configures vCPUs (and TDX state when enabled), writes the
    /// system configuration into guest memory and starts the boot vCPUs.
    /// A paused VM is simply resumed instead.
    pub fn boot(&mut self) -> Result<()> {
        trace_scoped!("Vm::boot");
        info!("Booting VM");
        event!("vm", "booting");
        let current_state = self.get_state()?;
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        // When started with the stop-on-boot option the VM comes up halted
        // at a breakpoint (used by the guest debugger).
        let new_state = if self.stop_on_boot {
            VmState::BreakPoint
        } else {
            VmState::Running
        };
        current_state.valid_transition(new_state)?;

        // Do earlier to parallelise with loading kernel
        #[cfg(target_arch = "x86_64")]
        let rsdp_addr = self.create_acpi_tables();

        // Load kernel synchronously or if asynchronous then wait for load to
        // finish.
        let entry_point = self.entry_point()?;

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        // Configure the vcpus that have been created
        let vcpus = self.cpu_manager.lock().unwrap().vcpus();
        for vcpu in vcpus {
            let guest_memory = &self.memory_manager.lock().as_ref().unwrap().guest_memory();
            let boot_setup = entry_point.map(|e| (e, guest_memory));
            self.cpu_manager
                .lock()
                .unwrap()
                .configure_vcpu(vcpu, boot_setup)
                .map_err(Error::CpuManager)?;
        }

        #[cfg(feature = "tdx")]
        let (sections, guid_found) = if tdx_enabled {
            self.extract_tdvf_sections()?
        } else {
            (Vec::new(), false)
        };

        // Configuring the TDX regions requires that the vCPUs are created.
        #[cfg(feature = "tdx")]
        let hob_address = if tdx_enabled {
            // TDX sections are written to memory.
            self.populate_tdx_sections(&sections, guid_found)?
        } else {
            None
        };

        // On aarch64 the ACPI tables depend on the vCPU mpidr which is only
        // available after they are configured
        #[cfg(target_arch = "aarch64")]
        let rsdp_addr = self.create_acpi_tables();

        // Configure shared state based on loaded kernel
        entry_point
            .map(|_| {
                // Safe to unwrap rsdp_addr as we know it can't be None when
                // the entry_point is Some.
                self.configure_system(rsdp_addr.unwrap())
            })
            .transpose()?;

        #[cfg(target_arch = "x86_64")]
        // Note: For x86, always call this function before invoking start boot vcpus.
        // Otherwise guest would fail to boot because we haven't created the
        // userspace mappings to update the hypervisor about the memory mappings.
        // These mappings must be created before we start the vCPU threads for
        // the very first time.
        self.memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            // Let the hypervisor know which memory ranges are shared with the
            // guest. This prevents the guest from ignoring/discarding memory
            // regions provided by the host.
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured TDX setup is complete
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus(new_state == VmState::BreakPoint)
            .map_err(Error::CpuManager)?;

        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        event!("vm", "booted");
        Ok(())
    }

    /// Completes a snapshot-restore: recreates the userspace memory mappings
    /// (x86_64) and restarts the restored vCPUs.
    pub fn restore(&mut self) -> Result<()> {
        event!("vm", "restoring");

        #[cfg(target_arch = "x86_64")]
        // Note: For x86, always call this function before invoking start boot vcpus.
        // Otherwise guest would fail to boot because we haven't created the
        // userspace mappings to update the hypervisor about the memory mappings.
        // These mappings must be created before we start the vCPU threads for
        // the very first time for the restored VM.
        self.memory_manager
            .lock()
            .unwrap()
            .allocate_address_space()
            .map_err(Error::MemoryManager)?;

        // Now we can start all vCPUs from here.
        self.cpu_manager
            .lock()
            .unwrap()
            .start_restored_vcpus()
            .map_err(Error::CpuManager)?;

        event!("vm", "restored");
        Ok(())
    }

    /// Gets a thread-safe reference counted pointer to the VM configuration.
    pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
        Arc::clone(&self.config)
    }

    /// Get the VM state. Returns an error if the state is poisoned.
    pub fn get_state(&self) -> Result<VmState> {
        self.state
            .try_read()
            .map_err(|_| Error::PoisonedState)
            .map(|state| *state)
    }

    /// Gets the actual size of the balloon.
    pub fn balloon_size(&self) -> u64 {
        self.device_manager.lock().unwrap().balloon_size()
    }

    /// Sends every guest memory slot's backing file descriptor to the
    /// migration destination over `socket`, one (slot id, fd) pair per
    /// request, aborting the exchange if the peer reports an error.
    pub fn send_memory_fds(
        &mut self,
        socket: &mut UnixStream,
    ) -> std::result::Result<(), MigratableError> {
        for (slot, fd) in self
            .memory_manager
            .lock()
            .unwrap()
            .memory_slot_fds()
            .drain()
        {
            Request::memory_fd(std::mem::size_of_val(&slot) as u64)
                .write_to(socket)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e))
                })?;
            // The slot id travels as ancillary payload alongside the fd.
            socket
                .send_with_fd(&slot.to_le_bytes()[..], fd)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e))
                })?;

            let res = Response::read_from(socket)?;
            if res.status() != Status::Ok {
                warn!("Error during memory fd migration");
                Request::abandon().write_to(socket)?;
                Response::read_from(socket).ok();
                return Err(MigratableError::MigrateSend(anyhow!(
                    "Error during memory fd migration"
                )));
            }
        }

        Ok(())
    }

    /// Streams the guest memory ranges listed in `ranges` to `fd` (the
    /// migration channel), retrying partial writes until each range is
    /// fully transferred.
    pub fn send_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: Write,
    {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't write
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of write_all_to() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = mem
                    .write_to(
                        GuestAddress(range.gpa + offset),
                        fd,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| {
                        MigratableError::MigrateSend(anyhow!(
                            "Error transferring memory to socket: {}",
                            e
                        ))
                    })?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }

    /// Returns the table of guest memory ranges used for migration.
    pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        self.memory_manager
            .lock()
            .unwrap()
            .memory_range_table(false)
    }

    /// Returns the device tree maintained by the device manager.
    pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
        self.device_manager.lock().unwrap().device_tree()
    }

    /// Activates any virtio devices with pending activation requests.
    pub fn activate_virtio_devices(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .activate_virtio_devices()
            .map_err(Error::ActivateVirtioDevices)
    }

    /// Signals the guest's power button (x86_64 variant).
    #[cfg(target_arch = "x86_64")]
    pub fn power_button(&self) -> Result<()> {
        return self
            .device_manager
            .lock()
            .unwrap()
            .notify_power_button()
            .map_err(Error::PowerButton);
    }

    /// Signals the guest's power button (aarch64 variant).
    #[cfg(target_arch = "aarch64")]
    pub fn power_button(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .notify_power_button()
            .map_err(Error::PowerButton)
    }

    /// Returns the memory manager's snapshot metadata.
    pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData {
        self.memory_manager.lock().unwrap().snapshot_data()
    }

    /// Dispatches a single GDB request against the given vCPU and returns the
    /// payload to send back to the debugger. Requests that produce data
    /// (registers, memory, vCPU count) return early with their payload;
    /// everything else falls through to CommandComplete.
    #[cfg(feature = "guest_debug")]
    pub fn debug_request(
        &mut self,
        gdb_request: &GdbRequestPayload,
        cpu_id: usize,
    ) -> Result<GdbResponsePayload> {
        use GdbRequestPayload::*;
        match gdb_request {
            SetSingleStep(single_step) => {
                // Empty breakpoint list: only toggle single-stepping.
                self.set_guest_debug(cpu_id, &[], *single_step)
                    .map_err(Error::Debug)?;
            }
            SetHwBreakPoint(addrs) => {
                self.set_guest_debug(cpu_id, addrs, false)
                    .map_err(Error::Debug)?;
            }
            Pause => {
                self.debug_pause().map_err(Error::Debug)?;
            }
            Resume => {
                self.debug_resume().map_err(Error::Debug)?;
            }
            ReadRegs => {
                let regs = self.read_regs(cpu_id).map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::RegValues(Box::new(regs)));
            }
            WriteRegs(regs) => {
                self.write_regs(cpu_id, regs).map_err(Error::Debug)?;
            }
            ReadMem(vaddr, len) => {
                let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
                let mem = self
                    .read_mem(&guest_memory, cpu_id, *vaddr, *len)
                    .map_err(Error::Debug)?;
                return Ok(GdbResponsePayload::MemoryRegion(mem));
            }
            WriteMem(vaddr, data) => {
                let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
                self.write_mem(&guest_memory, cpu_id, vaddr, data)
                    .map_err(Error::Debug)?;
            }
            ActiveVcpus => {
                let active_vcpus = self.active_vcpus();
                return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus));
            }
        }
        Ok(GdbResponsePayload::CommandComplete)
    }

    /// Prepares the ELF coredump layout (program header count, note sizes,
    /// memory offsets) and opens the destination file for a guest coredump.
    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    fn get_dump_state(
        &mut self,
        destination_url: &str,
    ) -> std::result::Result<DumpState, GuestDebuggableError> {
        let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
        let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
        // One program header for the note segment plus one per RAM mapping.
        let mut elf_phdr_num = 1;
        let elf_sh_info = 0;
        let coredump_file_path = url_to_file(destination_url)?;
        let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();

        // e_phnum is a u16; more mappings than that cannot be represented.
        if mapping_num < UINT16_MAX - 2 {
            elf_phdr_num += mapping_num as u16;
        } else {
            panic!("mapping num beyond 65535 not supported");
2238 } 2239 let coredump_file = OpenOptions::new() 2240 .read(true) 2241 .write(true) 2242 .create_new(true) 2243 .open(coredump_file_path) 2244 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2245 2246 let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size); 2247 let mem_data = self 2248 .memory_manager 2249 .lock() 2250 .unwrap() 2251 .coredump_memory_regions(mem_offset); 2252 2253 Ok(DumpState { 2254 elf_note_size, 2255 elf_phdr_num, 2256 elf_sh_info, 2257 mem_offset, 2258 mem_info: Some(mem_data), 2259 file: Some(coredump_file), 2260 }) 2261 } 2262 2263 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2264 fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 { 2265 size_of::<elf::Elf64_Ehdr>() as u64 2266 + note_size as u64 2267 + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64 2268 } 2269 } 2270 2271 impl Pausable for Vm { 2272 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2273 event!("vm", "pausing"); 2274 let mut state = self 2275 .state 2276 .try_write() 2277 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?; 2278 let new_state = VmState::Paused; 2279 2280 state 2281 .valid_transition(new_state) 2282 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?; 2283 2284 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2285 { 2286 let mut clock = self 2287 .vm 2288 .get_clock() 2289 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?; 2290 clock.reset_flags(); 2291 self.saved_clock = Some(clock); 2292 } 2293 2294 // Before pausing the vCPUs activate any pending virtio devices that might 2295 // need activation between starting the pause (or e.g. 
        // a migration it's part of)
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        // Pause order: vCPUs first so no vCPU touches a paused device.
        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    /// Resumes a paused VM: validates the transition, resumes vCPUs, restores
    /// the saved KVM clock (x86_64/KVM only), then resumes devices.
    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            // Restore the clock saved in pause() so guest time stays coherent.
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }
        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
        *state = new_state;
        event!("vm", "resumed");
        Ok(())
    }
}

/// VM-level snapshot payload: the saved guest clock and the common CPUID
/// entries needed to validate restore/migration compatibility (both
/// KVM/x86_64 only).
#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub clock: Option<hypervisor::ClockData>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>,
}

pub const VM_SNAPSHOT_ID: &str = "vm";
impl Snapshottable for Vm {
    fn id(&self) -> String {
        VM_SNAPSHOT_ID.to_string()
    }

    /// Takes a full VM snapshot. Requires the VM to already be paused and is
    /// refused for TDX guests. Aggregates the CPU, memory, and device manager
    /// snapshots under the VM-level snapshot.
    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        event!("vm", "snapshotting");

        #[cfg(feature = "tdx")]
        let tdx_enabled = self.config.lock().unwrap().is_tdx_enabled();

        #[cfg(feature = "tdx")]
        {
            if tdx_enabled {
                return Err(MigratableError::Snapshot(anyhow!(
                    "Snapshot not possible with TDX VM"
                )));
            }
        }

        let current_state = self.get_state().unwrap();
        if current_state != VmState::Paused {
            return Err(MigratableError::Snapshot(anyhow!(
                "Trying to snapshot while VM is running"
            )));
        }

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let common_cpuid = {
            let phys_bits = physical_bits(
                &self.hypervisor,
                self.config.lock().unwrap().cpus.max_phys_bits,
            );
            arch::generate_common_cpuid(
                &self.hypervisor,
                None,
                None,
                phys_bits,
                self.config.lock().unwrap().cpus.kvm_hyperv,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(|e| {
                // NOTE(review): a snapshot-path failure is wrapped in
                // MigrateReceive; Snapshot would seem more apt — confirm
                // callers before changing the variant.
                MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
            })?
2393 }; 2394 2395 let vm_snapshot_data = serde_json::to_vec(&VmSnapshot { 2396 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2397 clock: self.saved_clock, 2398 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2399 common_cpuid, 2400 }) 2401 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2402 2403 let mut vm_snapshot = Snapshot::from_data(SnapshotData(vm_snapshot_data)); 2404 2405 let (id, snapshot) = { 2406 let mut cpu_manager = self.cpu_manager.lock().unwrap(); 2407 (cpu_manager.id(), cpu_manager.snapshot()?) 2408 }; 2409 vm_snapshot.add_snapshot(id, snapshot); 2410 let (id, snapshot) = { 2411 let mut memory_manager = self.memory_manager.lock().unwrap(); 2412 (memory_manager.id(), memory_manager.snapshot()?) 2413 }; 2414 vm_snapshot.add_snapshot(id, snapshot); 2415 let (id, snapshot) = { 2416 let mut device_manager = self.device_manager.lock().unwrap(); 2417 (device_manager.id(), device_manager.snapshot()?) 2418 }; 2419 vm_snapshot.add_snapshot(id, snapshot); 2420 2421 event!("vm", "snapshotted"); 2422 Ok(vm_snapshot) 2423 } 2424 } 2425 2426 impl Transportable for Vm { 2427 fn send( 2428 &self, 2429 snapshot: &Snapshot, 2430 destination_url: &str, 2431 ) -> std::result::Result<(), MigratableError> { 2432 let mut snapshot_config_path = url_to_path(destination_url)?; 2433 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2434 2435 // Create the snapshot config file 2436 let mut snapshot_config_file = OpenOptions::new() 2437 .read(true) 2438 .write(true) 2439 .create_new(true) 2440 .open(snapshot_config_path) 2441 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2442 2443 // Serialize and write the snapshot config 2444 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2445 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2446 2447 snapshot_config_file 2448 .write(vm_config.as_bytes()) 2449 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2450 2451 let mut snapshot_state_path = url_to_path(destination_url)?; 
2452 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2453 2454 // Create the snapshot state file 2455 let mut snapshot_state_file = OpenOptions::new() 2456 .read(true) 2457 .write(true) 2458 .create_new(true) 2459 .open(snapshot_state_path) 2460 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2461 2462 // Serialize and write the snapshot state 2463 let vm_state = 2464 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2465 2466 snapshot_state_file 2467 .write(&vm_state) 2468 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2469 2470 // Tell the memory manager to also send/write its own snapshot. 2471 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2472 self.memory_manager 2473 .lock() 2474 .unwrap() 2475 .send(&memory_manager_snapshot.clone(), destination_url)?; 2476 } else { 2477 return Err(MigratableError::Restore(anyhow!( 2478 "Missing memory manager snapshot" 2479 ))); 2480 } 2481 2482 Ok(()) 2483 } 2484 } 2485 2486 impl Migratable for Vm { 2487 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2488 self.memory_manager.lock().unwrap().start_dirty_log()?; 2489 self.device_manager.lock().unwrap().start_dirty_log() 2490 } 2491 2492 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2493 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2494 self.device_manager.lock().unwrap().stop_dirty_log() 2495 } 2496 2497 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2498 Ok(MemoryRangeTable::new_from_tables(vec![ 2499 self.memory_manager.lock().unwrap().dirty_log()?, 2500 self.device_manager.lock().unwrap().dirty_log()?, 2501 ])) 2502 } 2503 2504 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2505 self.memory_manager.lock().unwrap().start_migration()?; 2506 self.device_manager.lock().unwrap().start_migration() 2507 } 2508 2509 fn complete_migration(&mut self) -> 
std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().complete_migration()?;
        self.device_manager.lock().unwrap().complete_migration()
    }
}

#[cfg(feature = "guest_debug")]
impl Debuggable for Vm {
    /// Configures hardware breakpoints / single-stepping on one vCPU,
    /// delegating to the CPU manager.
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    /// Pauses the VM on behalf of the debugger and marks the state as
    /// `BreakPoint` (distinct from a user-requested `Paused`).
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    /// Resumes a VM previously stopped at a debugger breakpoint; a no-op in
    /// any other state.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            // NOTE(review): a resume failure is wrapped in the Pause variant —
            // looks copy-pasted; confirm whether a dedicated variant exists.
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    /// Reads the architecture core registers of one vCPU.
    fn read_regs(&self, cpu_id: usize) -> std::result::Result<CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    /// Writes the architecture core registers of one vCPU.
    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    /// Reads `len` bytes of guest memory at virtual address `vaddr`, using the
    /// given vCPU for address translation.
    fn read_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(guest_memory, cpu_id, vaddr, len)
    }

    /// Writes `data` to guest memory at virtual address `vaddr`, using the
    /// given vCPU for address translation.
    fn write_mem(
        &self,
        guest_memory: &GuestMemoryAtomic<GuestMemoryMmap>,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(),
DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(guest_memory, cpu_id, vaddr, data)
    }

    /// Number of vCPUs the debugger should expose: the currently active ones,
    /// or the configured boot count before the VM has booted.
    fn active_vcpus(&self) -> usize {
        let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
            self.cpu_manager.lock().unwrap().boot_vcpus() as usize
        }
    }
}

// Upper bound for ELF e_phnum (a u16 field) when sizing coredump headers.
#[cfg(feature = "guest_debug")]
pub const UINT16_MAX: u32 = 65535;

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl Elf64Writable for Vm {}

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
impl GuestDebuggable for Vm {
    /// Writes an ELF coredump of the guest to `destination_url`.
    ///
    /// The VM must be Running (it is paused for the duration and resumed
    /// afterwards) or already Paused; TDX guests are refused since their
    /// memory cannot be dumped.
    fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> {
        event!("vm", "coredumping");

        let mut resume = false;

        #[cfg(feature = "tdx")]
        {
            if let Some(ref platform) = self.config.lock().unwrap().platform {
                if platform.tdx {
                    return Err(GuestDebuggableError::Coredump(anyhow!(
                        "Coredump not possible with TDX VM"
                    )));
                }
            }
        }

        match self.get_state().unwrap() {
            VmState::Running => {
                self.pause().map_err(GuestDebuggableError::Pause)?;
                resume = true;
            }
            VmState::Paused => {}
            _ => {
                return Err(GuestDebuggableError::Coredump(anyhow!(
                    "Trying to coredump while VM is not running or paused"
                )));
            }
        }

        let coredump_state = self.get_dump_state(destination_url)?;

        // ELF layout: header, notes, program headers, then memory contents.
        self.write_header(&coredump_state)?;
        self.write_note(&coredump_state)?;
        self.write_loads(&coredump_state)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_elf64_note(&coredump_state)?;
        self.cpu_manager
            .lock()
            .unwrap()
            .cpu_write_vmm_note(&coredump_state)?;

        self.memory_manager
            .lock()
            .unwrap()
            .coredump_iterate_save_mem(&coredump_state)?;

        // Only resume if we were the ones who paused the VM above.
        if resume {
            self.resume().map_err(GuestDebuggableError::Resume)?;
        }

        Ok(())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    // Exercises every transition out of `state` against the state machine
    // implemented by VmState::valid_transition().
    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
                assert!(state.valid_transition(VmState::BreakPoint).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
            VmState::BreakPoint => {
                // Check the transitions from Breakpoint
                assert!(state.valid_transition(VmState::Created).is_ok());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
                assert!(state.valid_transition(VmState::BreakPoint).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }

    // Checks how TDVF sections are merged with guest RAM into HOB memory
    // resources; each expected tuple is (start, size, is_ram).
    #[cfg(feature = "tdx")]
    #[test]
    fn test_hob_memory_resources() {
        // Case 1: Two TDVF sections in the middle of the RAM
        let sections = vec![
            TdvfSection {
                address: 0xc000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, true),
            (0x1000, 0x4000, false),
            (0x5000, 0x7000, true),
            (0xc000, 0x1000, false),
            (0xd000, 0x0fff_3000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 2: Two TDVF sections with no conflict with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000_0000, true),
            (0x1000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 3: Two TDVF sections with partial conflicts with the RAM
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x0fff_e000, true),
            (0x1000_0000, 0x2000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 4: Two TDVF sections with no conflict before the RAM and two
        // more additional sections with no conflict after the RAM.
        let sections = vec![
            TdvfSection {
                address: 0x2000_1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x2000_0000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0x1000,
                size: 0x1000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x1000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)];
        let expected = vec![
            (0, 0x1000, false),
            (0x1000, 0x1000, false),
            (0x4000, 0x1000_0000, true),
            (0x2000_0000, 0x1000, false),
            (0x2000_1000, 0x1000, false),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 5: One TDVF section overriding the entire RAM
        let sections = vec![TdvfSection {
            address: 0,
            size: 0x2000_0000,
            ..Default::default()
        }];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)];
        let expected = vec![(0, 0x2000_0000, false)];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 6: Two TDVF sections with no conflict with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_2000,
                size: 0x2000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x2000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x2000), 0x1000_0000),
            (GuestAddress(0x1000_4000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x2000, false),
            (0x2000, 0x1000_0000, true),
            (0x1000_2000, 0x2000, false),
            (0x1000_4000, 0x1000_0000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );

        // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions
        let sections = vec![
            TdvfSection {
                address: 0x1000_0000,
                size: 0x4000,
                ..Default::default()
            },
            TdvfSection {
                address: 0,
                size: 0x4000,
                ..Default::default()
            },
        ];
        let guest_ranges: Vec<(GuestAddress, usize)> = vec![
            (GuestAddress(0x1000), 0x1000_0000),
            (GuestAddress(0x1000_3000), 0x1000_0000),
        ];
        let expected = vec![
            (0, 0x4000, false),
            (0x4000, 0x0fff_c000, true),
            (0x1000_0000, 0x4000, false),
            (0x1000_4000, 0x0fff_f000, true),
        ];
        assert_eq!(
            expected,
            Vm::hob_memory_resources(
                sections,
                &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap()
            )
        );
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use devices::gic::Gic;

    const LEN: u64 = 4096;

    // Builds an FDT with a serial, a virtio and an RTC MMIO device attached
    // and checks that generation succeeds.
    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    len: LEN,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo {
                    addr: LEN,
                    len: LEN,
                    irq: 34,
                },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    len: LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = vm
            .create_vgic(Gic::create_default_config(1))
            .expect("Cannot create gic");
        assert!(create_fdt(
            &mem,
            "console=tty0",
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &gic,
            &None,
            &Vec::new(),
            &BTreeMap::new(),
            None,
            true,
        )
        .is_ok())
    }
}

// Smoke test that boots a minimal real-mode guest and checks it can perform
// port I/O and halt. Requires /dev/kvm, so it only runs on KVM hosts.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{Address, GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len(),
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    // Entry point at the loaded code; rax/rbx feed the add; rflags bit 1 is
    // architecturally reserved and must be set.
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {r:?}"),
        }
    }
}