1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::NumaConfig; 15 use crate::config::{ 16 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 17 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 18 }; 19 #[cfg(feature = "guest_debug")] 20 use crate::coredump::{ 21 CpuElf64Writable, DumpState, Elf64Writable, GuestDebuggable, GuestDebuggableError, NoteDescType, 22 }; 23 use crate::cpu; 24 use crate::device_manager::{Console, DeviceManager, DeviceManagerError, PtyPair}; 25 use crate::device_tree::DeviceTree; 26 #[cfg(feature = "gdb")] 27 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 28 use crate::memory_manager::{ 29 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 30 }; 31 #[cfg(feature = "guest_debug")] 32 use crate::migration::url_to_file; 33 use crate::migration::{url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 34 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 35 use crate::GuestMemoryMmap; 36 use crate::{ 37 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 38 }; 39 use anyhow::anyhow; 40 use arch::get_host_cpu_phys_bits; 41 #[cfg(target_arch = "x86_64")] 42 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 43 #[cfg(feature = "tdx")] 44 use arch::x86_64::tdx::TdvfSection; 45 use arch::EntryPoint; 46 #[cfg(target_arch = "aarch64")] 47 use arch::PciSpaceInfo; 48 use arch::{NumaNode, NumaNodes}; 49 #[cfg(target_arch = "aarch64")] 50 use 
devices::gic::GIC_V3_ITS_SNAPSHOT_ID; 51 #[cfg(target_arch = "aarch64")] 52 use devices::interrupt_controller::{self, InterruptController}; 53 use devices::AcpiNotificationFlags; 54 #[cfg(all(target_arch = "x86_64", feature = "gdb"))] 55 use gdbstub_arch::x86::reg::X86_64CoreRegs; 56 use hypervisor::{HypervisorVmError, VmOps}; 57 use linux_loader::cmdline::Cmdline; 58 #[cfg(feature = "guest_debug")] 59 use linux_loader::elf; 60 #[cfg(target_arch = "x86_64")] 61 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 62 #[cfg(target_arch = "aarch64")] 63 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 64 use linux_loader::loader::KernelLoader; 65 use seccompiler::{apply_filter, SeccompAction}; 66 use serde::{Deserialize, Serialize}; 67 use signal_hook::{consts::SIGWINCH, iterator::backend::Handle, iterator::Signals}; 68 use std::cmp; 69 use std::collections::BTreeMap; 70 use std::collections::HashMap; 71 use std::convert::TryInto; 72 use std::fs::{File, OpenOptions}; 73 use std::io::{self, Read, Write}; 74 use std::io::{Seek, SeekFrom}; 75 #[cfg(feature = "tdx")] 76 use std::mem; 77 #[cfg(feature = "guest_debug")] 78 use std::mem::size_of; 79 use std::num::Wrapping; 80 use std::ops::Deref; 81 use std::os::unix::net::UnixStream; 82 use std::panic::AssertUnwindSafe; 83 use std::sync::{Arc, Mutex, RwLock}; 84 use std::time::Instant; 85 use std::{result, str, thread}; 86 use thiserror::Error; 87 use vm_device::Bus; 88 #[cfg(target_arch = "x86_64")] 89 use vm_device::BusDevice; 90 #[cfg(target_arch = "x86_64")] 91 use vm_memory::Address; 92 #[cfg(feature = "tdx")] 93 use vm_memory::{ByteValued, GuestMemory, GuestMemoryRegion}; 94 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic}; 95 use vm_migration::protocol::{Request, Response, Status}; 96 use vm_migration::{ 97 protocol::MemoryRangeTable, Migratable, MigratableError, Pausable, Snapshot, 98 SnapshotDataSection, Snapshottable, Transportable, 99 }; 100 use 
vmm_sys_util::eventfd::EventFd; 101 use vmm_sys_util::signal::unblock_signal; 102 use vmm_sys_util::sock_ctrl_msg::ScmSocket; 103 use vmm_sys_util::terminal::Terminal; 104 105 /// Errors associated with VM management 106 #[derive(Debug, Error)] 107 pub enum Error { 108 #[error("Cannot open kernel file: {0}")] 109 KernelFile(#[source] io::Error), 110 111 #[error("Cannot open initramfs file: {0}")] 112 InitramfsFile(#[source] io::Error), 113 114 #[error("Cannot load the kernel into memory: {0}")] 115 KernelLoad(#[source] linux_loader::loader::Error), 116 117 #[cfg(target_arch = "aarch64")] 118 #[error("Cannot load the UEFI binary in memory: {0:?}")] 119 UefiLoad(arch::aarch64::uefi::Error), 120 121 #[error("Cannot load the initramfs into memory")] 122 InitramfsLoad, 123 124 #[error("Cannot load the kernel command line in memory: {0}")] 125 LoadCmdLine(#[source] linux_loader::loader::Error), 126 127 #[error("Cannot modify the kernel command line: {0}")] 128 CmdLineInsertStr(#[source] linux_loader::cmdline::Error), 129 130 #[error("Cannot configure system: {0}")] 131 ConfigureSystem(#[source] arch::Error), 132 133 #[cfg(target_arch = "aarch64")] 134 #[error("Cannot enable interrupt controller: {0:?}")] 135 EnableInterruptController(interrupt_controller::Error), 136 137 #[error("VM state is poisoned")] 138 PoisonedState, 139 140 #[error("Error from device manager: {0:?}")] 141 DeviceManager(DeviceManagerError), 142 143 #[error("Cannot setup terminal in raw mode: {0}")] 144 SetTerminalRaw(#[source] vmm_sys_util::errno::Error), 145 146 #[error("Cannot setup terminal in canonical mode.: {0}")] 147 SetTerminalCanon(#[source] vmm_sys_util::errno::Error), 148 149 #[error("Cannot spawn a signal handler thread: {0}")] 150 SignalHandlerSpawn(#[source] io::Error), 151 152 #[error("Failed to join on threads: {0:?}")] 153 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 154 155 #[error("VM config is missing")] 156 VmMissingConfig, 157 158 #[error("VM is not 
created")] 159 VmNotCreated, 160 161 #[error("VM is already created")] 162 VmAlreadyCreated, 163 164 #[error("VM is not running")] 165 VmNotRunning, 166 167 #[error("Cannot clone EventFd: {0}")] 168 EventFdClone(#[source] io::Error), 169 170 #[error("invalid VM state transition: {0:?} to {1:?}")] 171 InvalidStateTransition(VmState, VmState), 172 173 #[error("Error from CPU manager: {0}")] 174 CpuManager(#[source] cpu::Error), 175 176 #[error("Cannot pause devices: {0}")] 177 PauseDevices(#[source] MigratableError), 178 179 #[error("Cannot resume devices: {0}")] 180 ResumeDevices(#[source] MigratableError), 181 182 #[error("Cannot pause CPUs: {0}")] 183 PauseCpus(#[source] MigratableError), 184 185 #[error("Cannot resume cpus: {0}")] 186 ResumeCpus(#[source] MigratableError), 187 188 #[error("Cannot pause VM: {0}")] 189 Pause(#[source] MigratableError), 190 191 #[error("Cannot resume VM: {0}")] 192 Resume(#[source] MigratableError), 193 194 #[error("Memory manager error: {0:?}")] 195 MemoryManager(MemoryManagerError), 196 197 #[error("Eventfd write error: {0}")] 198 EventfdError(#[source] std::io::Error), 199 200 #[error("Cannot snapshot VM: {0}")] 201 Snapshot(#[source] MigratableError), 202 203 #[error("Cannot restore VM: {0}")] 204 Restore(#[source] MigratableError), 205 206 #[error("Cannot send VM snapshot: {0}")] 207 SnapshotSend(#[source] MigratableError), 208 209 #[error("Invalid restore source URL")] 210 InvalidRestoreSourceUrl, 211 212 #[error("Failed to validate config: {0}")] 213 ConfigValidation(#[source] ValidationError), 214 215 #[error("Too many virtio-vsock devices")] 216 TooManyVsockDevices, 217 218 #[error("Failed serializing into JSON: {0}")] 219 SerializeJson(#[source] serde_json::Error), 220 221 #[error("Invalid NUMA configuration")] 222 InvalidNumaConfig, 223 224 #[error("Cannot create seccomp filter: {0}")] 225 CreateSeccompFilter(#[source] seccompiler::Error), 226 227 #[error("Cannot apply seccomp filter: {0}")] 228 
ApplySeccompFilter(#[source] seccompiler::Error), 229 230 #[error("Failed resizing a memory zone")] 231 ResizeZone, 232 233 #[error("Cannot activate virtio devices: {0:?}")] 234 ActivateVirtioDevices(DeviceManagerError), 235 236 #[error("Error triggering power button: {0:?}")] 237 PowerButton(DeviceManagerError), 238 239 #[error("Kernel lacks PVH header")] 240 KernelMissingPvhHeader, 241 242 #[error("Failed to allocate firmware RAM: {0:?}")] 243 AllocateFirmwareMemory(MemoryManagerError), 244 245 #[error("Error manipulating firmware file: {0}")] 246 FirmwareFile(#[source] std::io::Error), 247 248 #[error("Firmware too big")] 249 FirmwareTooLarge, 250 251 #[error("Failed to copy firmware to memory: {0}")] 252 FirmwareLoad(#[source] vm_memory::GuestMemoryError), 253 254 #[cfg(feature = "tdx")] 255 #[error("Error performing I/O on TDX firmware file: {0}")] 256 LoadTdvf(#[source] std::io::Error), 257 258 #[cfg(feature = "tdx")] 259 #[error("Error performing I/O on the TDX payload file: {0}")] 260 LoadPayload(#[source] std::io::Error), 261 262 #[cfg(feature = "tdx")] 263 #[error("Error parsing TDVF: {0}")] 264 ParseTdvf(#[source] arch::x86_64::tdx::TdvfError), 265 266 #[cfg(feature = "tdx")] 267 #[error("Error populating TDX HOB: {0}")] 268 PopulateHob(#[source] arch::x86_64::tdx::TdvfError), 269 270 #[cfg(feature = "tdx")] 271 #[error("Error allocating TDVF memory: {0:?}")] 272 AllocatingTdvfMemory(crate::memory_manager::Error), 273 274 #[cfg(feature = "tdx")] 275 #[error("Error enabling TDX VM: {0}")] 276 InitializeTdxVm(#[source] hypervisor::HypervisorVmError), 277 278 #[cfg(feature = "tdx")] 279 #[error("Error enabling TDX memory region: {0}")] 280 InitializeTdxMemoryRegion(#[source] hypervisor::HypervisorVmError), 281 282 #[cfg(feature = "tdx")] 283 #[error("Error finalizing TDX VM: {0}")] 284 FinalizeTdx(#[source] hypervisor::HypervisorVmError), 285 286 #[cfg(feature = "tdx")] 287 #[error("Invalid TDX payload type")] 288 InvalidPayloadType, 289 290 #[cfg(feature = 
"gdb")] 291 #[error("Error debugging VM: {0:?}")] 292 Debug(DebuggableError), 293 294 #[cfg(target_arch = "x86_64")] 295 #[error("Error spawning kernel loading thread")] 296 KernelLoadThreadSpawn(std::io::Error), 297 298 #[cfg(target_arch = "x86_64")] 299 #[error("Error joining kernel loading thread")] 300 KernelLoadThreadJoin(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 301 302 #[cfg(feature = "guest_debug")] 303 #[error("Error coredumping VM: {0:?}")] 304 Coredump(GuestDebuggableError), 305 } 306 pub type Result<T> = result::Result<T, Error>; 307 308 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)] 309 pub enum VmState { 310 Created, 311 Running, 312 Shutdown, 313 Paused, 314 BreakPoint, 315 } 316 317 impl VmState { 318 fn valid_transition(self, new_state: VmState) -> Result<()> { 319 match self { 320 VmState::Created => match new_state { 321 VmState::Created | VmState::Shutdown => { 322 Err(Error::InvalidStateTransition(self, new_state)) 323 } 324 VmState::Running | VmState::Paused | VmState::BreakPoint => Ok(()), 325 }, 326 327 VmState::Running => match new_state { 328 VmState::Created | VmState::Running => { 329 Err(Error::InvalidStateTransition(self, new_state)) 330 } 331 VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()), 332 }, 333 334 VmState::Shutdown => match new_state { 335 VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => { 336 Err(Error::InvalidStateTransition(self, new_state)) 337 } 338 VmState::Running => Ok(()), 339 }, 340 341 VmState::Paused => match new_state { 342 VmState::Created | VmState::Paused | VmState::BreakPoint => { 343 Err(Error::InvalidStateTransition(self, new_state)) 344 } 345 VmState::Running | VmState::Shutdown => Ok(()), 346 }, 347 VmState::BreakPoint => match new_state { 348 VmState::Created | VmState::Running => Ok(()), 349 _ => Err(Error::InvalidStateTransition(self, new_state)), 350 }, 351 } 352 } 353 } 354 355 struct VmOpsHandler { 356 memory: 
GuestMemoryAtomic<GuestMemoryMmap>,
    // x86 has a separate port-I/O bus in addition to the MMIO bus.
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
    // Device backing the legacy PCI configuration I/O ports (0xcf8/0xcfc
    // range — see the PCI_CONFIG_IO_PORT window check in pio_read/pio_write).
    #[cfg(target_arch = "x86_64")]
    pci_config_io: Arc<Mutex<dyn BusDevice>>,
}

impl VmOps for VmOpsHandler {
    /// Write `buf` into guest physical memory at `gpa`.
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    /// Read from guest physical memory at `gpa` into `buf`.
    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> result::Result<usize, HypervisorVmError> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    /// Dispatch an MMIO read to the MMIO bus. Reads from unregistered
    /// addresses are logged and otherwise ignored (not fatal to the vCPU).
    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            warn!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    /// Dispatch an MMIO write to the MMIO bus. A device may return a
    /// barrier, in which case the vCPU blocks here until it is released.
    fn mmio_write(&self, gpa: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    /// Dispatch a port-I/O read. Accesses inside the PCI config I/O window
    /// bypass the bus and go straight to the PCI config device, with the
    /// port rebased to an offset within the window.
    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> result::Result<(), HypervisorVmError> {
        use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE};

        if (PCI_CONFIG_IO_PORT..(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE)).contains(&port) {
            self.pci_config_io.lock().unwrap().read(
                PCI_CONFIG_IO_PORT,
                port - PCI_CONFIG_IO_PORT,
                data,
            );
            return Ok(());
        }

        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            warn!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    /// Dispatch a port-I/O write; same PCI-config window special case as
    /// `pio_read`, and same barrier handling as `mmio_write`.
    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> result::Result<(), HypervisorVmError> {
        use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE};

        if (PCI_CONFIG_IO_PORT..(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE)).contains(&port) {
            self.pci_config_io.lock().unwrap().write(
                PCI_CONFIG_IO_PORT,
                port - PCI_CONFIG_IO_PORT,
                data,
            );
            return Ok(());
        }

        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}

/// Clamp the requested guest physical address width to what the host CPU
/// actually supports.
pub fn physical_bits(max_phys_bits: u8) -> u8 {
    let host_phys_bits = get_host_cpu_phys_bits();

    cmp::min(host_phys_bits, max_phys_bits)
}

/// A fully assembled virtual machine: device, CPU and memory managers plus
/// the hypervisor handle and lifecycle state.
pub struct Vm {
    // Kept open for architectures/features that load the kernel lazily.
    #[cfg(any(target_arch = "aarch64", feature = "tdx"))]
    kernel: Option<File>,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    // Whether stdin is a TTY (checked once at construction).
    on_tty: bool,
    signals: Option<Handle>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(not(feature = "kvm"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
vm: Arc<dyn hypervisor::Vm>,
    // Guest clock captured on pause so it can be restored (KVM/x86 only).
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,
    numa_nodes: NumaNodes,
    seccomp_action: SeccompAction,
    exit_evt: EventFd,
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    // When true, the VM is left stopped right after boot (gdb support).
    stop_on_boot: bool,
    // Handle of the background kernel-loading thread spawned at creation.
    #[cfg(target_arch = "x86_64")]
    load_kernel_handle: Option<thread::JoinHandle<Result<EntryPoint>>>,
}

impl Vm {
    pub const HANDLED_SIGNALS: [i32; 1] = [SIGWINCH];

    /// Core constructor shared by `new`, `new_from_snapshot` and
    /// `new_from_migration`: wires the device manager, CPU manager and
    /// `VmOps` handler around an already-built `MemoryManager`.
    ///
    /// `restoring` skips async kernel loading and is forwarded to the
    /// device manager so restored devices are not re-created from scratch.
    #[allow(clippy::too_many_arguments)]
    fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        restoring: bool,
        timestamp: Instant,
    ) -> Result<Self> {
        let kernel = config
            .lock()
            .unwrap()
            .kernel
            .as_ref()
            .map(|k| File::open(&k.path))
            .transpose()
            .map_err(Error::KernelFile)?;

        // Kernel loading happens on a worker thread so it overlaps with the
        // rest of VM construction; joined later via load_kernel_handle.
        #[cfg(target_arch = "x86_64")]
        let load_kernel_handle = if !restoring {
            Self::load_kernel_async(&kernel, &memory_manager, &config)?
        } else {
            None
        };

        let boot_id_list = config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        // TDX requires the IOMMU to be forced on for all devices.
        #[cfg(feature = "tdx")]
        let force_iommu = config.lock().unwrap().tdx.is_some();
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        #[cfg(feature = "gdb")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "gdb"))]
        let stop_on_boot = false;

        let device_manager = DeviceManager::new(
            hypervisor.hypervisor_type(),
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            &exit_evt,
            &reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            restoring,
            boot_id_list,
            timestamp,
        )
        .map_err(Error::DeviceManager)?;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::clone(device_manager.lock().unwrap().io_bus());
        let mmio_bus = Arc::clone(device_manager.lock().unwrap().mmio_bus());

        #[cfg(target_arch = "x86_64")]
        let pci_config_io =
            device_manager.lock().unwrap().pci_config_io() as Arc<Mutex<dyn BusDevice>>;
        let vm_ops: Arc<dyn VmOps> = Arc::new(VmOpsHandler {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            #[cfg(target_arch = "x86_64")]
            pci_config_io,
        });

        let exit_evt_clone = exit_evt.try_clone().map_err(Error::EventFdClone)?;
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            &device_manager,
            &memory_manager,
            vm.clone(),
            exit_evt_clone,
            reset_evt,
            #[cfg(feature = "gdb")]
            vm_debug_evt,
            hypervisor.clone(),
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
        )
        .map_err(Error::CpuManager)?;

        // SAFETY: isatty has no memory-safety preconditions; STDIN_FILENO
        // is a valid fd constant.
        let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO as i32) } != 0;

        let initramfs = config
            .lock()
            .unwrap()
            .initramfs
            .as_ref()
            .map(|i| File::open(&i.path))
            .transpose()
            .map_err(Error::InitramfsFile)?;

        Ok(Vm {
            #[cfg(any(target_arch = "aarch64", feature = "tdx"))]
            kernel,
            initramfs,
            device_manager,
            config,
            on_tty,
            threads: Vec::with_capacity(1),
            signals: None,
            state: RwLock::new(VmState::Created),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock: None,
            numa_nodes,
            seccomp_action: seccomp_action.clone(),
            exit_evt,
            hypervisor,
            stop_on_boot,
            #[cfg(target_arch = "x86_64")]
            load_kernel_handle,
        })
    }

    /// Build the guest NUMA topology from `NumaConfig` entries, resolving
    /// memory-zone / SGX-section names against the memory manager.
    ///
    /// Fails with `InvalidNumaConfig` on duplicate node ids, unknown zone
    /// or section names, or distance entries referencing unknown nodes.
    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Can't define twice the same NUMA node");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(distances) = &config.distances {
                    for distance in
distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        // A distance may only target a node that is itself
                        // declared in the configuration.
                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has been already set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                // SGX EPC sections are resolved by name against the memory
                // manager's EPC region (x86 only).
                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

    /// Create a brand-new VM from scratch: create the hypervisor VM object,
    /// the memory manager, then delegate to `new_from_memory_manager` and
    /// finally ask the device manager to create all devices.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
    ) -> Result<Self> {
        let timestamp = Instant::now();

        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        hypervisor.check_required_extensions().unwrap();
        // TDX guests need a dedicated KVM VM type.
        #[cfg(feature = "tdx")]
        let vm = hypervisor
            .create_vm_with_type(if tdx_enabled {
                2 // KVM_X86_TDX_VM
            } else {
                0 // KVM_X86_LEGACY_VM
            })
            .unwrap();
        #[cfg(not(feature = "tdx"))]
        let vm = hypervisor.create_vm().unwrap();

        // x86 KVM setup: identity map, TSS and split IRQ chip.
        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        let phys_bits = physical_bits(config.lock().unwrap().cpus.max_phys_bits);

        #[cfg(target_arch = "x86_64")]
        let sgx_epc_config = config.lock().unwrap().sgx_epc.clone();

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            None,
            phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            None,
            None,
            #[cfg(target_arch = "x86_64")]
            sgx_epc_config,
        )
        .map_err(Error::MemoryManager)?;

        let new_vm = Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "gdb")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            false,
            timestamp,
        )?;

        // The device manager must create the devices from here as it is part
        // of the regular code path creating everything from scratch.
        new_vm
            .device_manager
            .lock()
            .unwrap()
            .create_devices(serial_pty, console_pty, console_resize_pipe)
            .map_err(Error::DeviceManager)?;
        Ok(new_vm)
    }

    /// Rebuild a VM from an on-disk snapshot. The memory manager is
    /// restored from its snapshot section; everything else goes through
    /// `new_from_memory_manager` with `restoring = true`.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        vm_config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
        source_url: Option<&str>,
        prefault: bool,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        let timestamp = Instant::now();

        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        let memory_manager = if let Some(memory_manager_snapshot) =
            snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
        {
            let phys_bits = physical_bits(vm_config.lock().unwrap().cpus.max_phys_bits);
            MemoryManager::new_from_snapshot(
                memory_manager_snapshot,
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                source_url,
                prefault,
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
} else {
            return Err(Error::Restore(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            ))));
        };

        Vm::new_from_memory_manager(
            vm_config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "gdb")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            true,
            timestamp,
        )
    }

    /// Build the receiving side of a live migration: fresh hypervisor VM,
    /// memory manager recreated from the sender's snapshot data (optionally
    /// over existing memory files), then `new_from_memory_manager` with
    /// `restoring = true`.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_migration(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        memory_manager_data: &MemoryManagerSnapshotData,
        existing_memory_files: Option<HashMap<u32, File>>,
    ) -> Result<Self> {
        let timestamp = Instant::now();

        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        let phys_bits = physical_bits(config.lock().unwrap().cpus.max_phys_bits);

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            None,
            phys_bits,
            // TDX VMs are not migratable, hence hard-coded to false here.
            #[cfg(feature = "tdx")]
            false,
            Some(memory_manager_data),
            existing_memory_files,
            #[cfg(target_arch = "x86_64")]
            None,
        )
        .map_err(Error::MemoryManager)?;

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "gdb")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            true,
            timestamp,
        )
    }

    /// Copy the initramfs file into guest memory at the arch-chosen load
    /// address and return its placement.
    ///
    /// Panics if called when `self.initramfs` is `None` (callers gate on
    /// its presence first).
    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs = self.initramfs.as_ref().unwrap();
        // Determine the file size by seeking to the end, then rewind.
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs
            .seek(SeekFrom::Start(0))
            .map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    /// Assemble the kernel command line from the config; on aarch64 the
    /// device manager may append device-specific arguments.
    fn generate_cmdline(
        config: &Arc<Mutex<VmConfig>>,
        #[cfg(target_arch = "aarch64")] device_manager: &Arc<Mutex<DeviceManager>>,
    ) -> Result<Cmdline> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE);
        cmdline
            .insert_str(&config.lock().unwrap().cmdline.args)
            .map_err(Error::CmdLineInsertStr)?;

        #[cfg(target_arch = "aarch64")]
        for entry in device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        Ok(cmdline)
    }

    /// aarch64: load the kernel as a PE image; if the magic number does not
    /// match, fall back to treating the file as a raw UEFI binary.
    #[cfg(target_arch = "aarch64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        let entry_addr = match linux_loader::loader::pe::PE::load(
            mem.deref(),
            Some(arch::layout::KERNEL_START),
            &mut kernel,
            None,
        ) {
            Ok(entry_addr) => entry_addr,
            // Try to load the binary as kernel PE file at first.
            // If failed, retry to load it as UEFI binary.
            // As the UEFI binary is formatless, it must be the last option to try.
            Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                let uefi_flash = self.device_manager.lock().as_ref().unwrap().uefi_flash();
                let mem = uefi_flash.memory();
                arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut kernel)
                    .map_err(Error::UefiLoad)?;

                // The entry point offset in UEFI image is always 0.
                return Ok(EntryPoint {
                    entry_addr: arch::layout::UEFI_START,
                });
            }
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        let entry_point_addr: GuestAddress = entry_addr.kernel_load;

        Ok(EntryPoint {
            entry_addr: entry_point_addr,
        })
    }

    /// x86_64: load the kernel as an ELF image; if the ELF magic does not
    /// match, treat the file as raw firmware (e.g. OVMF) mapped just below
    /// the 4 GiB boundary. Runs on the async kernel-loading thread.
    #[cfg(target_arch = "x86_64")]
    fn load_kernel(
        mut kernel: File,
        cmdline: Cmdline,
        memory_manager: Arc<Mutex<MemoryManager>>,
    ) -> Result<EntryPoint> {
        use linux_loader::loader::{elf::Error::InvalidElfMagicNumber, Error::Elf};
        info!("Loading kernel");

        let mem = {
            let guest_memory = memory_manager.lock().as_ref().unwrap().guest_memory();
            guest_memory.memory()
        };
        let entry_addr = match linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        ) {
            Ok(entry_addr) => entry_addr,
            Err(e) => match e {
                Elf(InvalidElfMagicNumber) => {
                    // Not an ELF header - assume raw binary data / firmware
                    let size = kernel.seek(SeekFrom::End(0)).map_err(Error::FirmwareFile)?;

                    // The OVMF firmware is as big as you might expect and it's 4MiB so limit to that
                    if size > 4 << 20 {
                        return Err(Error::FirmwareTooLarge);
                    }

                    // Loaded at the end of the 4GiB
                    let load_address = GuestAddress(4 << 30)
                        .checked_sub(size)
                        .ok_or(Error::FirmwareTooLarge)?;

                    info!(
                        "Loading RAW firmware at 0x{:x} (size: {})",
                        load_address.raw_value(),
                        size
                    );

                    memory_manager
                        .lock()
                        .unwrap()
                        .add_ram_region(load_address, size
as usize) 1029 .map_err(Error::AllocateFirmwareMemory)?; 1030 1031 kernel 1032 .seek(SeekFrom::Start(0)) 1033 .map_err(Error::FirmwareFile)?; 1034 memory_manager 1035 .lock() 1036 .unwrap() 1037 .guest_memory() 1038 .memory() 1039 .read_exact_from(load_address, &mut kernel, size as usize) 1040 .map_err(Error::FirmwareLoad)?; 1041 1042 return Ok(EntryPoint { entry_addr: None }); 1043 } 1044 _ => { 1045 return Err(Error::KernelLoad(e)); 1046 } 1047 }, 1048 }; 1049 1050 linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline) 1051 .map_err(Error::LoadCmdLine)?; 1052 1053 if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap { 1054 // Use the PVH kernel entry point to boot the guest 1055 info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0); 1056 Ok(EntryPoint { 1057 entry_addr: Some(entry_addr), 1058 }) 1059 } else { 1060 Err(Error::KernelMissingPvhHeader) 1061 } 1062 } 1063 1064 #[cfg(target_arch = "x86_64")] 1065 fn load_kernel_async( 1066 kernel: &Option<File>, 1067 memory_manager: &Arc<Mutex<MemoryManager>>, 1068 config: &Arc<Mutex<VmConfig>>, 1069 ) -> Result<Option<thread::JoinHandle<Result<EntryPoint>>>> { 1070 // Kernel with TDX is loaded in a different manner 1071 #[cfg(feature = "tdx")] 1072 if config.lock().unwrap().tdx.is_some() { 1073 return Ok(None); 1074 } 1075 1076 kernel 1077 .as_ref() 1078 .map(|kernel| { 1079 let kernel = kernel.try_clone().unwrap(); 1080 let config = config.clone(); 1081 let memory_manager = memory_manager.clone(); 1082 1083 std::thread::Builder::new() 1084 .name("kernel_loader".into()) 1085 .spawn(move || { 1086 let cmdline = Self::generate_cmdline(&config)?; 1087 Self::load_kernel(kernel, cmdline, memory_manager) 1088 }) 1089 .map_err(Error::KernelLoadThreadSpawn) 1090 }) 1091 .transpose() 1092 } 1093 1094 #[cfg(target_arch = "x86_64")] 1095 fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> { 1096 info!("Configuring system"); 1097 let mem = 
            self.memory_manager.lock().unwrap().boot_guest_memory();

        // Load the initramfs into guest memory only if one was configured.
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();
        let rsdp_addr = Some(rsdp_addr);
        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        let serial_number = self
            .config
            .lock()
            .unwrap()
            .platform
            .as_ref()
            .and_then(|p| p.serial_number.clone());

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            &initramfs_config,
            boot_vcpus,
            rsdp_addr,
            sgx_epc_region,
            serial_number.as_deref(),
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }

    /// Configure guest system state on aarch64: build the FDT inputs
    /// (cmdline, MPIDRs, topology, PCI spaces, initramfs), create and
    /// enable the vGIC, and initialize the PMU.
    ///
    /// `_rsdp_addr` is unused on this architecture.
    #[cfg(target_arch = "aarch64")]
    fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> {
        let cmdline = Self::generate_cmdline(&self.config, &self.device_manager)?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        // Collect one PciSpaceInfo per PCI segment for the FDT.
        for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
            let pci_space = PciSpaceInfo {
                pci_segment_id: pci_segment.id,
                mmio_config_address: pci_segment.mmio_config_address,
                pci_device_space_start: pci_segment.start_of_device_area,
                pci_device_space_size: pci_segment.end_of_device_area
                    - pci_segment.start_of_device_area
                    + 1,
            };
            pci_space_info.push(pci_space);
        }

        let virtio_iommu_bdf = self
            .device_manager
            .lock()
            .unwrap()
            .iommu_attached_devices()
            .as_ref()
            .map(|(v, _)| *v);

        let vgic = self
            .device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .create_vgic(
                &self.memory_manager.lock().as_ref().unwrap().vm,
                self.cpu_manager.lock().unwrap().boot_vcpus() as u64,
            )
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::SetupGic,
                ))
            })?;

        // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
        let pmu_supported = self
            .cpu_manager
            .lock()
            .unwrap()
            .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::PlatformSpecific(
                    arch::aarch64::Error::VcpuInitPmu,
                ))
            })?;

        arch::configure_system(
            &mem,
            cmdline.as_str(),
            vcpu_mpidrs,
            vcpu_topology,
            device_info,
            &initramfs_config,
            &pci_space_info,
            virtio_iommu_bdf.map(|bdf| bdf.into()),
            &vgic,
            &self.numa_nodes,
            pmu_supported,
        )
        .map_err(Error::ConfigureSystem)?;

        // Activate gic device
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .enable()
            .map_err(Error::EnableInterruptController)?;

        Ok(())
    }

    /// Returns the serial device PTY pair, if one was created.
    pub fn serial_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().serial_pty()
    }

    /// Returns the console device PTY pair, if one was created.
    pub fn console_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().console_pty()
    }

    /// Returns the pipe used to forward console resize events, if any.
    pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
        self.device_manager.lock().unwrap().console_resize_pipe()
    }

    /// Shut the VM down: restore the terminal, stop the signal-handler
    /// thread, wake and stop device/vCPU threads, and join all of them.
    pub fn shutdown(&mut self) -> Result<()> {
        let mut state =
self.state.try_write().map_err(|_| Error::PoisonedState)?; 1247 let new_state = VmState::Shutdown; 1248 1249 state.valid_transition(new_state)?; 1250 1251 if self.on_tty { 1252 // Don't forget to set the terminal in canonical mode 1253 // before to exit. 1254 io::stdin() 1255 .lock() 1256 .set_canon_mode() 1257 .map_err(Error::SetTerminalCanon)?; 1258 } 1259 1260 // Trigger the termination of the signal_handler thread 1261 if let Some(signals) = self.signals.take() { 1262 signals.close(); 1263 } 1264 1265 // Wake up the DeviceManager threads so they will get terminated cleanly 1266 self.device_manager 1267 .lock() 1268 .unwrap() 1269 .resume() 1270 .map_err(Error::Resume)?; 1271 1272 self.cpu_manager 1273 .lock() 1274 .unwrap() 1275 .shutdown() 1276 .map_err(Error::CpuManager)?; 1277 1278 // Wait for all the threads to finish 1279 for thread in self.threads.drain(..) { 1280 thread.join().map_err(Error::ThreadCleanup)? 1281 } 1282 *state = new_state; 1283 1284 event!("vm", "shutdown"); 1285 1286 Ok(()) 1287 } 1288 1289 pub fn resize( 1290 &mut self, 1291 desired_vcpus: Option<u8>, 1292 desired_memory: Option<u64>, 1293 desired_balloon: Option<u64>, 1294 ) -> Result<()> { 1295 event!("vm", "resizing"); 1296 1297 if let Some(desired_vcpus) = desired_vcpus { 1298 if self 1299 .cpu_manager 1300 .lock() 1301 .unwrap() 1302 .resize(desired_vcpus) 1303 .map_err(Error::CpuManager)? 
1304 { 1305 self.device_manager 1306 .lock() 1307 .unwrap() 1308 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1309 .map_err(Error::DeviceManager)?; 1310 } 1311 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1312 } 1313 1314 if let Some(desired_memory) = desired_memory { 1315 let new_region = self 1316 .memory_manager 1317 .lock() 1318 .unwrap() 1319 .resize(desired_memory) 1320 .map_err(Error::MemoryManager)?; 1321 1322 let mut memory_config = &mut self.config.lock().unwrap().memory; 1323 1324 if let Some(new_region) = &new_region { 1325 self.device_manager 1326 .lock() 1327 .unwrap() 1328 .update_memory(new_region) 1329 .map_err(Error::DeviceManager)?; 1330 1331 match memory_config.hotplug_method { 1332 HotplugMethod::Acpi => { 1333 self.device_manager 1334 .lock() 1335 .unwrap() 1336 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1337 .map_err(Error::DeviceManager)?; 1338 } 1339 HotplugMethod::VirtioMem => {} 1340 } 1341 } 1342 1343 // We update the VM config regardless of the actual guest resize 1344 // operation result (happened or not), so that if the VM reboots 1345 // it will be running with the last configure memory size. 1346 match memory_config.hotplug_method { 1347 HotplugMethod::Acpi => memory_config.size = desired_memory, 1348 HotplugMethod::VirtioMem => { 1349 if desired_memory > memory_config.size { 1350 memory_config.hotplugged_size = Some(desired_memory - memory_config.size); 1351 } else { 1352 memory_config.hotplugged_size = None; 1353 } 1354 } 1355 } 1356 } 1357 1358 if let Some(desired_balloon) = desired_balloon { 1359 self.device_manager 1360 .lock() 1361 .unwrap() 1362 .resize_balloon(desired_balloon) 1363 .map_err(Error::DeviceManager)?; 1364 1365 // Update the configuration value for the balloon size to ensure 1366 // a reboot would use the right value. 
1367 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon { 1368 balloon_config.size = desired_balloon; 1369 } 1370 } 1371 1372 event!("vm", "resized"); 1373 1374 Ok(()) 1375 } 1376 1377 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { 1378 let memory_config = &mut self.config.lock().unwrap().memory; 1379 1380 if let Some(zones) = &mut memory_config.zones { 1381 for zone in zones.iter_mut() { 1382 if zone.id == id { 1383 if desired_memory >= zone.size { 1384 let hotplugged_size = desired_memory - zone.size; 1385 self.memory_manager 1386 .lock() 1387 .unwrap() 1388 .resize_zone(&id, desired_memory - zone.size) 1389 .map_err(Error::MemoryManager)?; 1390 // We update the memory zone config regardless of the 1391 // actual 'resize-zone' operation result (happened or 1392 // not), so that if the VM reboots it will be running 1393 // with the last configured memory zone size. 1394 zone.hotplugged_size = Some(hotplugged_size); 1395 1396 return Ok(()); 1397 } else { 1398 error!( 1399 "Invalid to ask less ({}) than boot RAM ({}) for \ 1400 this memory zone", 1401 desired_memory, zone.size, 1402 ); 1403 return Err(Error::ResizeZone); 1404 } 1405 } 1406 } 1407 } 1408 1409 error!("Could not find the memory zone {} for the resize", id); 1410 Err(Error::ResizeZone) 1411 } 1412 1413 pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1414 let pci_device_info = self 1415 .device_manager 1416 .lock() 1417 .unwrap() 1418 .add_device(&mut device_cfg) 1419 .map_err(Error::DeviceManager)?; 1420 1421 // Update VmConfig by adding the new device. This is important to 1422 // ensure the device would be created in case of a reboot. 
        // Scoped block keeps the config lock held only for the update.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplug a user device behind the PCI bus and persist it in the
    /// VM config.
    pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_user_device(&mut device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.user_devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hot-unplug the device with the given `id` and scrub it from every
    /// device list in the VM config so a reboot will not re-create it.
    pub fn remove_device(&mut self, id: String) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .remove_device(id.clone())
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by removing the device. This is important to
        // ensure the device would not be created in case of a reboot.
        let mut config = self.config.lock().unwrap();

        // Remove if VFIO device
        if let Some(devices) = config.devices.as_mut() {
            devices.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if VFIO user device
        if let Some(user_devices) = config.user_devices.as_mut() {
            user_devices.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if disk device
        if let Some(disks) = config.disks.as_mut() {
            disks.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if fs device
        if let Some(fs) = config.fs.as_mut() {
            fs.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if net device
        if let Some(net) = config.net.as_mut() {
            net.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if pmem device
        if let Some(pmem) = config.pmem.as_mut() {
            pmem.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if vDPA device
        if let Some(vdpa) = config.vdpa.as_mut() {
            vdpa.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if vsock device
        if let Some(vsock) = config.vsock.as_ref() {
            if vsock.id.as_ref() == Some(&id) {
                config.vsock = None;
            }
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;
        Ok(())
    }

    /// Hotplug a disk device and persist it in the VM config.
    pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_disk(&mut disk_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.disks, disk_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplug a filesystem device and persist it in the VM config.
    pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_fs(&mut fs_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.fs, fs_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplug a pmem device and persist it in the VM config.
    pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_pmem(&mut pmem_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.pmem, pmem_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplug a network device and persist it in the VM config.
    pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_net(&mut net_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.net, net_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplug a vDPA device and persist it in the VM config.
    pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vdpa(&mut vdpa_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.vdpa, vdpa_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplug the vsock device and persist it in the VM config.
    pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vsock(&mut vsock_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            // The config holds a single vsock entry, so this replaces any
            // previous one rather than appending to a list.
            config.vsock = Some(vsock_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Collect per-device counters from the device manager, keyed by device
    /// id then counter name.
    pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
        Ok(self.device_manager.lock().unwrap().counters())
    }

    // Body of the signal-handler thread: unblocks the handled signals, then
    // forwards SIGWINCH (terminal resize) events to the console.
    fn signal_handler(mut signals: Signals, console_input_clone: Arc<Console>) {
        for sig in &Vm::HANDLED_SIGNALS {
            unblock_signal(*sig).unwrap();
        }

        for signal in signals.forever() {
            if signal == SIGWINCH {
                console_input_clone.update_console_size();
            }
        }
    }

    /// Run TDX platform initialization with the common CPUID and the
    /// maximum vCPU count.
    #[cfg(feature = "tdx")]
    fn init_tdx(&mut self) -> Result<()> {
        let cpuid = self.cpu_manager.lock().unwrap().common_cpuid();
        let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32;
        self.vm
            .tdx_init(&cpuid, max_vcpus)
            .map_err(Error::InitializeTdxVm)?;
        Ok(())
    }

    /// Parse the section table out of the configured TDVF firmware file.
    #[cfg(feature = "tdx")]
    fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> {
        use arch::x86_64::tdx::*;
        // The TDVF file contains a table of section as well as code
        let mut firmware_file =
            File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
                .map_err(Error::LoadTdvf)?;

        // For all the sections allocate some RAM backing them
        parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
    }

    /// Build the (start, size, is_ram) resource list for the TD HOB by
    /// interleaving guest RAM regions with the TDVF sections.
    ///
    /// `sorted_sections` must be sorted by address in descending order so
    /// that `pop()` yields the lowest-addressed section first.
    #[cfg(feature = "tdx")]
    fn hob_memory_resources(
        mut sorted_sections: Vec<TdvfSection>,
        guest_memory: &GuestMemoryMmap,
    ) -> Vec<(u64, u64, bool)> {
        let mut list = Vec::new();

        let mut current_section = sorted_sections.pop();

        // RAM regions interleaved with TDVF sections
        let mut next_start_addr = 0;
        for region in guest_memory.iter() {
            let
                region_start = region.start_addr().0;
            let region_end = region.last_addr().0;
            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            loop {
                // Decide whether the next chunk to emit is a TDVF section
                // (ram == false) or a RAM span up to the next section
                // (ram == true).
                let (start, size, ram) = if let Some(section) = &current_section {
                    if section.address <= next_start_addr {
                        (section.address, section.size, false)
                    } else {
                        let last_addr = std::cmp::min(section.address - 1, region_end);
                        (next_start_addr, last_addr - next_start_addr + 1, true)
                    }
                } else {
                    (next_start_addr, region_end - next_start_addr + 1, true)
                };

                list.push((start, size, ram));

                // A section was consumed; move on to the next lowest one.
                if !ram {
                    current_section = sorted_sections.pop();
                }

                next_start_addr = start + size;

                if region_start > next_start_addr {
                    next_start_addr = region_start;
                }

                if next_start_addr > region_end {
                    break;
                }
            }
        }

        // Once all the interleaved sections have been processed, let's simply
        // pull the remaining ones.
        if let Some(section) = current_section {
            list.push((section.address, section.size, false));
        }
        while let Some(section) = sorted_sections.pop() {
            list.push((section.address, section.size, false));
        }

        list
    }

    /// Allocate backing RAM for the TDVF sections, copy firmware/payload
    /// data into guest memory and build the TD HOB (memory, MMIO, ACPI and
    /// payload entries).
    ///
    /// Returns the guest address of the HOB, if a TdHob section was found.
    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(&mut self, sections: &[TdvfSection]) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of section as well as code
        let mut firmware_file =
            File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
                .map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut payload_info = None;
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    mem.read_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    if let Some(payload_file) = self.kernel.as_mut() {
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        // 0x1f1 is the offset of the Linux boot protocol
                        // setup_header within a bzImage.
                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_header
                            .as_bytes()
                            .read_from(
                                0,
                                payload_file,
                                mem::size_of::<linux_loader::bootparam::setup_header>(),
                            )
                            .unwrap();

                        // 0x5372_6448 is the "HdrS" boot protocol magic.
                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Require boot protocol >= 2.00 and a relocatable
                        // (LOADED_HIGH) payload.
                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        payload_file
                            .seek(SeekFrom::Start(0))
                            .map_err(Error::LoadPayload)?;
                        mem.read_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    info!("Copying payload parameters to guest memory");
                    let cmdline = Self::generate_cmdline(&self.config)?;
                    mem.write_slice(cmdline.as_str().as_bytes(), GuestAddress(section.address))
                        .unwrap();
                }
                _ => {}
            }
        }

        // Generate HOB
        let mut hob = TdHob::start(hob_offset.unwrap());

        // Only TempMem sections are interleaved with RAM in the HOB;
        // sort descending so hob_memory_resources() can pop() lowest-first.
        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();

        for (start, size, ram) in Vm::hob_memory_resources(sorted_sections, &boot_guest_memory) {
            hob.add_memory_resource(&mem, start, size, ram)
                .map_err(Error::PopulateHob)?;
        }

        // MMIO regions
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.
1922 1923 for acpi_table in crate::acpi::create_acpi_tables_tdx( 1924 &self.device_manager, 1925 &self.cpu_manager, 1926 &self.memory_manager, 1927 &self.numa_nodes, 1928 ) { 1929 hob.add_acpi_table(&mem, acpi_table.as_slice()) 1930 .map_err(Error::PopulateHob)?; 1931 } 1932 1933 // If a payload info has been created, let's insert it into the HOB. 1934 if let Some(payload_info) = payload_info { 1935 hob.add_payload(&mem, payload_info) 1936 .map_err(Error::PopulateHob)?; 1937 } 1938 1939 hob.finish(&mem).map_err(Error::PopulateHob)?; 1940 1941 Ok(hob_offset) 1942 } 1943 1944 #[cfg(feature = "tdx")] 1945 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 1946 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1947 let mem = guest_memory.memory(); 1948 1949 for section in sections { 1950 self.vm 1951 .tdx_init_memory_region( 1952 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 1953 section.address, 1954 section.size, 1955 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 1956 section.attributes == 1, 1957 ) 1958 .map_err(Error::InitializeTdxMemoryRegion)?; 1959 } 1960 1961 Ok(()) 1962 } 1963 1964 fn setup_signal_handler(&mut self) -> Result<()> { 1965 let console = self.device_manager.lock().unwrap().console().clone(); 1966 let signals = Signals::new(&Vm::HANDLED_SIGNALS); 1967 match signals { 1968 Ok(signals) => { 1969 self.signals = Some(signals.handle()); 1970 let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?; 1971 let signal_handler_seccomp_filter = get_seccomp_filter( 1972 &self.seccomp_action, 1973 Thread::SignalHandler, 1974 self.hypervisor.hypervisor_type(), 1975 ) 1976 .map_err(Error::CreateSeccompFilter)?; 1977 self.threads.push( 1978 thread::Builder::new() 1979 .name("vm_signal_handler".to_string()) 1980 .spawn(move || { 1981 if !signal_handler_seccomp_filter.is_empty() { 1982 if let Err(e) = apply_filter(&signal_handler_seccomp_filter) 1983 .map_err(Error::ApplySeccompFilter) 
1984 { 1985 error!("Error applying seccomp filter: {:?}", e); 1986 exit_evt.write(1).ok(); 1987 return; 1988 } 1989 } 1990 std::panic::catch_unwind(AssertUnwindSafe(|| { 1991 Vm::signal_handler(signals, console); 1992 })) 1993 .map_err(|_| { 1994 error!("signal_handler thead panicked"); 1995 exit_evt.write(1).ok() 1996 }) 1997 .ok(); 1998 }) 1999 .map_err(Error::SignalHandlerSpawn)?, 2000 ); 2001 } 2002 Err(e) => error!("Signal not found {}", e), 2003 } 2004 Ok(()) 2005 } 2006 2007 fn setup_tty(&self) -> Result<()> { 2008 if self.on_tty { 2009 io::stdin() 2010 .lock() 2011 .set_raw_mode() 2012 .map_err(Error::SetTerminalRaw)?; 2013 } 2014 2015 Ok(()) 2016 } 2017 2018 // Creates ACPI tables 2019 // In case of TDX being used, this is a no-op since the tables will be 2020 // created and passed when populating the HOB. 2021 2022 fn create_acpi_tables(&self) -> Option<GuestAddress> { 2023 #[cfg(feature = "tdx")] 2024 if self.config.lock().unwrap().tdx.is_some() { 2025 return None; 2026 } 2027 2028 let mem = self.memory_manager.lock().unwrap().guest_memory().memory(); 2029 2030 let rsdp_addr = crate::acpi::create_acpi_tables( 2031 &mem, 2032 &self.device_manager, 2033 &self.cpu_manager, 2034 &self.memory_manager, 2035 &self.numa_nodes, 2036 ); 2037 info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0); 2038 2039 Some(rsdp_addr) 2040 } 2041 2042 #[cfg(target_arch = "x86_64")] 2043 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 2044 self.load_kernel_handle 2045 .take() 2046 .map(|handle| handle.join().map_err(Error::KernelLoadThreadJoin)?) 2047 .transpose() 2048 } 2049 2050 #[cfg(target_arch = "aarch64")] 2051 fn entry_point(&mut self) -> Result<Option<EntryPoint>> { 2052 Ok(if self.kernel.as_ref().is_some() { 2053 Some(self.load_kernel()?) 
2054 } else { 2055 None 2056 }) 2057 } 2058 2059 pub fn boot(&mut self) -> Result<()> { 2060 info!("Booting VM"); 2061 event!("vm", "booting"); 2062 let current_state = self.get_state()?; 2063 if current_state == VmState::Paused { 2064 return self.resume().map_err(Error::Resume); 2065 } 2066 2067 let new_state = if self.stop_on_boot { 2068 VmState::BreakPoint 2069 } else { 2070 VmState::Running 2071 }; 2072 current_state.valid_transition(new_state)?; 2073 2074 // Do earlier to parallelise with loading kernel 2075 #[cfg(target_arch = "x86_64")] 2076 let rsdp_addr = self.create_acpi_tables(); 2077 2078 self.setup_signal_handler()?; 2079 self.setup_tty()?; 2080 2081 // Load kernel synchronously or if asynchronous then wait for load to 2082 // finish. 2083 let entry_point = self.entry_point()?; 2084 2085 // The initial TDX configuration must be done before the vCPUs are 2086 // created 2087 #[cfg(feature = "tdx")] 2088 if self.config.lock().unwrap().tdx.is_some() { 2089 self.init_tdx()?; 2090 } 2091 2092 // Create and configure vcpus 2093 self.cpu_manager 2094 .lock() 2095 .unwrap() 2096 .create_boot_vcpus(entry_point) 2097 .map_err(Error::CpuManager)?; 2098 2099 #[cfg(feature = "tdx")] 2100 let sections = if self.config.lock().unwrap().tdx.is_some() { 2101 self.extract_tdvf_sections()? 2102 } else { 2103 Vec::new() 2104 }; 2105 2106 // Configuring the TDX regions requires that the vCPUs are created. 2107 #[cfg(feature = "tdx")] 2108 let hob_address = if self.config.lock().unwrap().tdx.is_some() { 2109 // TDX sections are written to memory. 2110 self.populate_tdx_sections(§ions)? 
2111 } else { 2112 None 2113 }; 2114 2115 // On aarch64 the ACPI tables depend on the vCPU mpidr which is only 2116 // available after they are configured 2117 #[cfg(target_arch = "aarch64")] 2118 let rsdp_addr = self.create_acpi_tables(); 2119 2120 // Configure shared state based on loaded kernel 2121 entry_point 2122 .map(|_| { 2123 // Safe to unwrap rsdp_addr as we know it can't be None when 2124 // the entry_point is Some. 2125 self.configure_system(rsdp_addr.unwrap()) 2126 }) 2127 .transpose()?; 2128 2129 #[cfg(feature = "tdx")] 2130 if let Some(hob_address) = hob_address { 2131 // With the HOB address extracted the vCPUs can have 2132 // their TDX state configured. 2133 self.cpu_manager 2134 .lock() 2135 .unwrap() 2136 .initialize_tdx(hob_address) 2137 .map_err(Error::CpuManager)?; 2138 // Let the hypervisor know which memory ranges are shared with the 2139 // guest. This prevents the guest from ignoring/discarding memory 2140 // regions provided by the host. 2141 self.init_tdx_memory(§ions)?; 2142 // With TDX memory and CPU state configured TDX setup is complete 2143 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 2144 } 2145 2146 self.cpu_manager 2147 .lock() 2148 .unwrap() 2149 .start_boot_vcpus(new_state == VmState::BreakPoint) 2150 .map_err(Error::CpuManager)?; 2151 2152 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 2153 *state = new_state; 2154 event!("vm", "booted"); 2155 Ok(()) 2156 } 2157 2158 /// Gets a thread-safe reference counted pointer to the VM configuration. 2159 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 2160 Arc::clone(&self.config) 2161 } 2162 2163 /// Get the VM state. Returns an error if the state is poisoned. 
    pub fn get_state(&self) -> Result<VmState> {
        // try_read() instead of read(): a poisoned lock becomes an error the
        // caller can handle instead of a panic here.
        self.state
            .try_read()
            .map_err(|_| Error::PoisonedState)
            .map(|state| *state)
    }

    /// Load saved clock from snapshot
    ///
    /// The clock is kept in `self.saved_clock` so that `resume()` can
    /// re-program the KVM clock; the same value is also returned to the caller.
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub fn load_clock_from_snapshot(
        &mut self,
        snapshot: &Snapshot,
    ) -> Result<Option<hypervisor::ClockData>> {
        use crate::migration::get_vm_snapshot;
        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        self.saved_clock = vm_snapshot.clock;
        Ok(self.saved_clock)
    }

    #[cfg(target_arch = "aarch64")]
    /// Add the vGIC section to the VM snapshot.
    fn add_vgic_snapshot_section(
        &self,
        vm_snapshot: &mut Snapshot,
    ) -> std::result::Result<(), MigratableError> {
        let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
        // The GICR_TYPER registers are derived from the saved vCPU states and
        // must be programmed into the interrupt controller before it is
        // snapshotted below.
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gicr_typers(&saved_vcpu_states);

        vm_snapshot.add_snapshot(
            self.device_manager
                .lock()
                .unwrap()
                .get_interrupt_controller()
                .unwrap()
                .lock()
                .unwrap()
                .snapshot()?,
        );

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing.
    ///
    /// Order matters here: create the vGIC, init the PMU, program the
    /// GICR_TYPER registers, restore the GIC state from the snapshot, and only
    /// then enable routing.
    fn restore_vgic_and_enable_interrupt(
        &self,
        vm_snapshot: &Snapshot,
    ) -> std::result::Result<(), MigratableError> {
        let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
        // The number of vCPUs is the same as the number of saved vCPU states.
        let vcpu_numbers = saved_vcpu_states.len();

        // Creating a GIC device here, as the GIC will not be created when
        // restoring the device manager. Note that currently only the bare GICv3
        // without ITS is supported.
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .create_vgic(&self.vm, vcpu_numbers.try_into().unwrap())
            .map_err(|e| MigratableError::Restore(anyhow!("Could not create GIC: {:#?}", e)))?;

        // PMU interrupt sticks to PPI, so need to be added by 16 to get real irq number.
        self.cpu_manager
            .lock()
            .unwrap()
            .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
            .map_err(|e| MigratableError::Restore(anyhow!("Error init PMU: {:?}", e)))?;

        // Here we prepare the GICR_TYPER registers from the restored vCPU states.
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gicr_typers(&saved_vcpu_states);

        // Restore GIC states. A missing vGIC section makes the whole restore
        // fail, since the guest cannot run without its interrupt state.
        if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) {
            self.device_manager
                .lock()
                .unwrap()
                .get_interrupt_controller()
                .unwrap()
                .lock()
                .unwrap()
                .restore(*gicv3_its_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing GicV3Its snapshot"
            )));
        }

        // Activate gic device
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .enable()
            .map_err(|e| {
                MigratableError::Restore(anyhow!(
                    "Could not enable interrupt controller routing: {:#?}",
                    e
                ))
            })?;

        Ok(())
    }

    /// Gets the actual size of the balloon.
2289 pub fn balloon_size(&self) -> u64 { 2290 self.device_manager.lock().unwrap().balloon_size() 2291 } 2292 2293 pub fn receive_memory_regions<F>( 2294 &mut self, 2295 ranges: &MemoryRangeTable, 2296 fd: &mut F, 2297 ) -> std::result::Result<(), MigratableError> 2298 where 2299 F: Read, 2300 { 2301 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2302 let mem = guest_memory.memory(); 2303 2304 for range in ranges.regions() { 2305 let mut offset: u64 = 0; 2306 // Here we are manually handling the retry in case we can't the 2307 // whole region at once because we can't use the implementation 2308 // from vm-memory::GuestMemory of read_exact_from() as it is not 2309 // following the correct behavior. For more info about this issue 2310 // see: https://github.com/rust-vmm/vm-memory/issues/174 2311 loop { 2312 let bytes_read = mem 2313 .read_from( 2314 GuestAddress(range.gpa + offset), 2315 fd, 2316 (range.length - offset) as usize, 2317 ) 2318 .map_err(|e| { 2319 MigratableError::MigrateReceive(anyhow!( 2320 "Error receiving memory from socket: {}", 2321 e 2322 )) 2323 })?; 2324 offset += bytes_read as u64; 2325 2326 if offset == range.length { 2327 break; 2328 } 2329 } 2330 } 2331 2332 Ok(()) 2333 } 2334 2335 pub fn send_memory_fds( 2336 &mut self, 2337 socket: &mut UnixStream, 2338 ) -> std::result::Result<(), MigratableError> { 2339 for (slot, fd) in self 2340 .memory_manager 2341 .lock() 2342 .unwrap() 2343 .memory_slot_fds() 2344 .drain() 2345 { 2346 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2347 .write_to(socket) 2348 .map_err(|e| { 2349 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2350 })?; 2351 socket 2352 .send_with_fd(&slot.to_le_bytes()[..], fd) 2353 .map_err(|e| { 2354 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2355 })?; 2356 2357 let res = Response::read_from(socket)?; 2358 if res.status() != Status::Ok { 2359 warn!("Error during memory fd 
migration"); 2360 Request::abandon().write_to(socket)?; 2361 Response::read_from(socket).ok(); 2362 return Err(MigratableError::MigrateSend(anyhow!( 2363 "Error during memory fd migration" 2364 ))); 2365 } 2366 } 2367 2368 Ok(()) 2369 } 2370 2371 pub fn send_memory_regions<F>( 2372 &mut self, 2373 ranges: &MemoryRangeTable, 2374 fd: &mut F, 2375 ) -> std::result::Result<(), MigratableError> 2376 where 2377 F: Write, 2378 { 2379 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2380 let mem = guest_memory.memory(); 2381 2382 for range in ranges.regions() { 2383 let mut offset: u64 = 0; 2384 // Here we are manually handling the retry in case we can't the 2385 // whole region at once because we can't use the implementation 2386 // from vm-memory::GuestMemory of write_all_to() as it is not 2387 // following the correct behavior. For more info about this issue 2388 // see: https://github.com/rust-vmm/vm-memory/issues/174 2389 loop { 2390 let bytes_written = mem 2391 .write_to( 2392 GuestAddress(range.gpa + offset), 2393 fd, 2394 (range.length - offset) as usize, 2395 ) 2396 .map_err(|e| { 2397 MigratableError::MigrateSend(anyhow!( 2398 "Error transferring memory to socket: {}", 2399 e 2400 )) 2401 })?; 2402 offset += bytes_written as u64; 2403 2404 if offset == range.length { 2405 break; 2406 } 2407 } 2408 } 2409 2410 Ok(()) 2411 } 2412 2413 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2414 self.memory_manager 2415 .lock() 2416 .unwrap() 2417 .memory_range_table(false) 2418 } 2419 2420 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2421 self.device_manager.lock().unwrap().device_tree() 2422 } 2423 2424 pub fn activate_virtio_devices(&self) -> Result<()> { 2425 self.device_manager 2426 .lock() 2427 .unwrap() 2428 .activate_virtio_devices() 2429 .map_err(Error::ActivateVirtioDevices) 2430 } 2431 2432 #[cfg(target_arch = "x86_64")] 2433 pub fn power_button(&self) -> Result<()> { 2434 
return self 2435 .device_manager 2436 .lock() 2437 .unwrap() 2438 .notify_power_button() 2439 .map_err(Error::PowerButton); 2440 } 2441 2442 #[cfg(target_arch = "aarch64")] 2443 pub fn power_button(&self) -> Result<()> { 2444 self.device_manager 2445 .lock() 2446 .unwrap() 2447 .notify_power_button() 2448 .map_err(Error::PowerButton) 2449 } 2450 2451 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2452 self.memory_manager.lock().unwrap().snapshot_data() 2453 } 2454 2455 #[cfg(all(target_arch = "x86_64", feature = "gdb"))] 2456 pub fn debug_request( 2457 &mut self, 2458 gdb_request: &GdbRequestPayload, 2459 cpu_id: usize, 2460 ) -> Result<GdbResponsePayload> { 2461 use GdbRequestPayload::*; 2462 match gdb_request { 2463 SetSingleStep(single_step) => { 2464 self.set_guest_debug(cpu_id, &[], *single_step) 2465 .map_err(Error::Debug)?; 2466 } 2467 SetHwBreakPoint(addrs) => { 2468 self.set_guest_debug(cpu_id, addrs, false) 2469 .map_err(Error::Debug)?; 2470 } 2471 Pause => { 2472 self.debug_pause().map_err(Error::Debug)?; 2473 } 2474 Resume => { 2475 self.debug_resume().map_err(Error::Debug)?; 2476 } 2477 ReadRegs => { 2478 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?; 2479 return Ok(GdbResponsePayload::RegValues(Box::new(regs))); 2480 } 2481 WriteRegs(regs) => { 2482 self.write_regs(cpu_id, regs).map_err(Error::Debug)?; 2483 } 2484 ReadMem(vaddr, len) => { 2485 let mem = self.read_mem(cpu_id, *vaddr, *len).map_err(Error::Debug)?; 2486 return Ok(GdbResponsePayload::MemoryRegion(mem)); 2487 } 2488 WriteMem(vaddr, data) => { 2489 self.write_mem(cpu_id, vaddr, data).map_err(Error::Debug)?; 2490 } 2491 ActiveVcpus => { 2492 let active_vcpus = self.active_vcpus(); 2493 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus)); 2494 } 2495 } 2496 Ok(GdbResponsePayload::CommandComplete) 2497 } 2498 2499 #[cfg(feature = "guest_debug")] 2500 fn get_dump_state( 2501 &mut self, 2502 destination_url: &str, 2503 ) -> std::result::Result<DumpState, 
GuestDebuggableError> {
        // Number of program headers: one for the ELF note plus one per guest
        // RAM mapping, capped below at u16 range.
        let nr_cpus = self.config.lock().unwrap().cpus.boot_vcpus as u32;
        let elf_note_size = self.get_note_size(NoteDescType::ElfAndVmm, nr_cpus) as isize;
        let mut elf_phdr_num = 1;
        let elf_sh_info = 0;
        let coredump_file_path = url_to_file(destination_url)?;
        let mapping_num = self.memory_manager.lock().unwrap().num_guest_ram_mappings();

        if mapping_num < UINT16_MAX - 2 {
            elf_phdr_num += mapping_num as u16;
        } else {
            // e_phnum is a u16 in the ELF header, so more mappings cannot be
            // represented.
            panic!("mapping num beyond 65535 not supported");
        }
        // create_new() so an existing file is never overwritten.
        let coredump_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(coredump_file_path)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        let mem_offset = self.coredump_get_mem_offset(elf_phdr_num, elf_note_size);
        let mem_data = self
            .memory_manager
            .lock()
            .unwrap()
            .coredump_memory_regions(mem_offset);

        Ok(DumpState {
            elf_note_size,
            elf_phdr_num,
            elf_sh_info,
            mem_offset,
            mem_info: Some(mem_data),
            file: Some(coredump_file),
        })
    }

    // File offset where guest memory starts in the coredump: ELF header,
    // then the note, then all program headers.
    #[cfg(feature = "guest_debug")]
    fn coredump_get_mem_offset(&self, phdr_num: u16, note_size: isize) -> u64 {
        size_of::<elf::Elf64_Ehdr>() as u64
            + note_size as u64
            + size_of::<elf::Elf64_Phdr>() as u64 * phdr_num as u64
    }
}

impl Pausable for Vm {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "pausing");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        // Save the KVM clock now so a later resume() (possibly after a
        // snapshot/restore cycle) can re-program it.
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
            clock.reset_flags();
            self.saved_clock = Some(clock);
        }

        // Before pausing the vCPUs activate any pending virtio devices that might
        // need activation between starting the pause (or e.g. a migration it's part of)
        self.activate_virtio_devices().map_err(|e| {
            MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e))
        })?;

        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        // vCPUs first, then the clock saved by pause(), then devices.
        self.cpu_manager.lock().unwrap().resume()?;
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }
        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
2610 *state = new_state; 2611 event!("vm", "resumed"); 2612 Ok(()) 2613 } 2614 } 2615 2616 #[derive(Serialize, Deserialize)] 2617 pub struct VmSnapshot { 2618 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2619 pub clock: Option<hypervisor::ClockData>, 2620 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2621 pub common_cpuid: Vec<hypervisor::arch::x86::CpuIdEntry>, 2622 } 2623 2624 pub const VM_SNAPSHOT_ID: &str = "vm"; 2625 impl Snapshottable for Vm { 2626 fn id(&self) -> String { 2627 VM_SNAPSHOT_ID.to_string() 2628 } 2629 2630 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2631 event!("vm", "snapshotting"); 2632 2633 #[cfg(feature = "tdx")] 2634 { 2635 if self.config.lock().unwrap().tdx.is_some() { 2636 return Err(MigratableError::Snapshot(anyhow!( 2637 "Snapshot not possible with TDX VM" 2638 ))); 2639 } 2640 } 2641 2642 let current_state = self.get_state().unwrap(); 2643 if current_state != VmState::Paused { 2644 return Err(MigratableError::Snapshot(anyhow!( 2645 "Trying to snapshot while VM is running" 2646 ))); 2647 } 2648 2649 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2650 let common_cpuid = { 2651 #[cfg(feature = "tdx")] 2652 let tdx_enabled = self.config.lock().unwrap().tdx.is_some(); 2653 let phys_bits = physical_bits(self.config.lock().unwrap().cpus.max_phys_bits); 2654 arch::generate_common_cpuid( 2655 self.hypervisor.clone(), 2656 None, 2657 None, 2658 phys_bits, 2659 self.config.lock().unwrap().cpus.kvm_hyperv, 2660 #[cfg(feature = "tdx")] 2661 tdx_enabled, 2662 ) 2663 .map_err(|e| { 2664 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e)) 2665 })? 
2666 }; 2667 2668 let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID); 2669 let vm_snapshot_data = serde_json::to_vec(&VmSnapshot { 2670 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2671 clock: self.saved_clock, 2672 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2673 common_cpuid, 2674 }) 2675 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2676 2677 vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?); 2678 vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?); 2679 2680 #[cfg(target_arch = "aarch64")] 2681 self.add_vgic_snapshot_section(&mut vm_snapshot) 2682 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2683 2684 vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?); 2685 vm_snapshot.add_data_section(SnapshotDataSection { 2686 id: format!("{}-section", VM_SNAPSHOT_ID), 2687 snapshot: vm_snapshot_data, 2688 }); 2689 2690 event!("vm", "snapshotted"); 2691 Ok(vm_snapshot) 2692 } 2693 2694 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 2695 event!("vm", "restoring"); 2696 2697 let current_state = self 2698 .get_state() 2699 .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?; 2700 let new_state = VmState::Paused; 2701 current_state.valid_transition(new_state).map_err(|e| { 2702 MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e)) 2703 })?; 2704 2705 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2706 self.load_clock_from_snapshot(&snapshot) 2707 .map_err(|e| MigratableError::Restore(anyhow!("Error restoring clock: {:?}", e)))?; 2708 2709 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2710 self.memory_manager 2711 .lock() 2712 .unwrap() 2713 .restore(*memory_manager_snapshot.clone())?; 2714 } else { 2715 return Err(MigratableError::Restore(anyhow!( 2716 "Missing memory manager snapshot" 2717 ))); 2718 } 2719 2720 if let 
Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) { 2721 self.device_manager 2722 .lock() 2723 .unwrap() 2724 .restore(*device_manager_snapshot.clone())?; 2725 } else { 2726 return Err(MigratableError::Restore(anyhow!( 2727 "Missing device manager snapshot" 2728 ))); 2729 } 2730 2731 if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) { 2732 self.cpu_manager 2733 .lock() 2734 .unwrap() 2735 .restore(*cpu_manager_snapshot.clone())?; 2736 } else { 2737 return Err(MigratableError::Restore(anyhow!( 2738 "Missing CPU manager snapshot" 2739 ))); 2740 } 2741 2742 #[cfg(target_arch = "aarch64")] 2743 self.restore_vgic_and_enable_interrupt(&snapshot)?; 2744 2745 if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) { 2746 self.device_manager 2747 .lock() 2748 .unwrap() 2749 .restore_devices(*device_manager_snapshot.clone())?; 2750 } else { 2751 return Err(MigratableError::Restore(anyhow!( 2752 "Missing device manager snapshot" 2753 ))); 2754 } 2755 2756 // Now we can start all vCPUs from here. 
2757 self.cpu_manager 2758 .lock() 2759 .unwrap() 2760 .start_restored_vcpus() 2761 .map_err(|e| { 2762 MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e)) 2763 })?; 2764 2765 self.setup_signal_handler().map_err(|e| { 2766 MigratableError::Restore(anyhow!("Could not setup signal handler: {:#?}", e)) 2767 })?; 2768 self.setup_tty() 2769 .map_err(|e| MigratableError::Restore(anyhow!("Could not setup tty: {:#?}", e)))?; 2770 2771 let mut state = self 2772 .state 2773 .try_write() 2774 .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?; 2775 *state = new_state; 2776 2777 event!("vm", "restored"); 2778 Ok(()) 2779 } 2780 } 2781 2782 impl Transportable for Vm { 2783 fn send( 2784 &self, 2785 snapshot: &Snapshot, 2786 destination_url: &str, 2787 ) -> std::result::Result<(), MigratableError> { 2788 let mut snapshot_config_path = url_to_path(destination_url)?; 2789 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2790 2791 // Create the snapshot config file 2792 let mut snapshot_config_file = OpenOptions::new() 2793 .read(true) 2794 .write(true) 2795 .create_new(true) 2796 .open(snapshot_config_path) 2797 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2798 2799 // Serialize and write the snapshot config 2800 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2801 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2802 2803 snapshot_config_file 2804 .write(vm_config.as_bytes()) 2805 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2806 2807 let mut snapshot_state_path = url_to_path(destination_url)?; 2808 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2809 2810 // Create the snapshot state file 2811 let mut snapshot_state_file = OpenOptions::new() 2812 .read(true) 2813 .write(true) 2814 .create_new(true) 2815 .open(snapshot_state_path) 2816 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2817 2818 // Serialize and write the snapshot state 2819 let vm_state = 2820 
            serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // NOTE(review): write() may perform a short write; write_all() would
        // guarantee the full state is persisted — confirm.
        snapshot_state_file
            .write(&vm_state)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Tell the memory manager to also send/write its own snapshot.
        if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
            self.memory_manager
                .lock()
                .unwrap()
                .send(&*memory_manager_snapshot.clone(), destination_url)?;
        } else {
            // NOTE(review): this is a send path but reports
            // MigratableError::Restore — looks inconsistent; confirm intent.
            return Err(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            )));
        }

        Ok(())
    }
}

impl Migratable for Vm {
    // Dirty-page logging and migration start/stop are simply fanned out to
    // the memory manager first, then the device manager.
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_dirty_log()?;
        self.device_manager.lock().unwrap().start_dirty_log()
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().stop_dirty_log()?;
        self.device_manager.lock().unwrap().stop_dirty_log()
    }

    // Merge the memory manager's and device manager's dirty ranges into a
    // single table.
    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        Ok(MemoryRangeTable::new_from_tables(vec![
            self.memory_manager.lock().unwrap().dirty_log()?,
            self.device_manager.lock().unwrap().dirty_log()?,
        ]))
    }

    fn start_migration(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_migration()?;
        self.device_manager.lock().unwrap().start_migration()
    }

    fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().complete_migration()?;
        self.device_manager.lock().unwrap().complete_migration()
    }
}

#[cfg(feature = "gdb")]
impl Debuggable for Vm {
    fn set_guest_debug(
        &self,
        cpu_id: usize,
        addrs: &[GuestAddress],
        singlestep: bool,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .set_guest_debug(cpu_id, addrs, singlestep)
    }

    // Pause the VM (if running) and mark it as stopped at a breakpoint.
    fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::Running {
            self.pause().map_err(DebuggableError::Pause)?;
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|_| DebuggableError::PoisonedState)?;
        *state = VmState::BreakPoint;
        Ok(())
    }

    // Resume the VM only when it is currently stopped at a breakpoint.
    fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> {
        if *self.state.read().unwrap() == VmState::BreakPoint {
            // NOTE(review): a resume failure is reported via
            // DebuggableError::Pause — confirm whether a Resume variant exists
            // and should be used instead.
            self.resume().map_err(DebuggableError::Pause)?;
        }

        Ok(())
    }

    fn read_regs(&self, cpu_id: usize) -> std::result::Result<X86_64CoreRegs, DebuggableError> {
        self.cpu_manager.lock().unwrap().read_regs(cpu_id)
    }

    fn write_regs(
        &self,
        cpu_id: usize,
        regs: &X86_64CoreRegs,
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs)
    }

    fn read_mem(
        &self,
        cpu_id: usize,
        vaddr: GuestAddress,
        len: usize,
    ) -> std::result::Result<Vec<u8>, DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .read_mem(cpu_id, vaddr, len)
    }

    fn write_mem(
        &self,
        cpu_id: usize,
        vaddr: &GuestAddress,
        data: &[u8],
    ) -> std::result::Result<(), DebuggableError> {
        self.cpu_manager
            .lock()
            .unwrap()
            .write_mem(cpu_id, vaddr, data)
    }

    fn active_vcpus(&self) -> usize {
        let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus();
        if active_vcpus > 0 {
            active_vcpus
        } else {
            // The VM is not booted yet. Report boot_vcpus() instead.
2948 self.cpu_manager.lock().unwrap().boot_vcpus() as usize 2949 } 2950 } 2951 } 2952 2953 #[cfg(feature = "guest_debug")] 2954 pub const UINT16_MAX: u32 = 65535; 2955 2956 #[cfg(feature = "guest_debug")] 2957 impl Elf64Writable for Vm {} 2958 2959 #[cfg(feature = "guest_debug")] 2960 impl GuestDebuggable for Vm { 2961 fn coredump(&mut self, destination_url: &str) -> std::result::Result<(), GuestDebuggableError> { 2962 event!("vm", "coredumping"); 2963 2964 #[cfg(feature = "tdx")] 2965 { 2966 if self.config.lock().unwrap().tdx.is_some() { 2967 return Err(GuestDebuggableError::Coredump(anyhow!( 2968 "Coredump not possible with TDX VM" 2969 ))); 2970 } 2971 } 2972 2973 let current_state = self.get_state().unwrap(); 2974 if current_state != VmState::Paused { 2975 return Err(GuestDebuggableError::Coredump(anyhow!( 2976 "Trying to coredump while VM is running" 2977 ))); 2978 } 2979 2980 let coredump_state = self.get_dump_state(destination_url)?; 2981 2982 self.write_header(&coredump_state)?; 2983 self.write_note(&coredump_state)?; 2984 self.write_loads(&coredump_state)?; 2985 2986 self.cpu_manager 2987 .lock() 2988 .unwrap() 2989 .cpu_write_elf64_note(&coredump_state)?; 2990 self.cpu_manager 2991 .lock() 2992 .unwrap() 2993 .cpu_write_vmm_note(&coredump_state)?; 2994 2995 self.memory_manager 2996 .lock() 2997 .unwrap() 2998 .coredump_iterate_save_mem(&coredump_state) 2999 } 3000 } 3001 3002 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 3003 #[cfg(test)] 3004 mod tests { 3005 use super::*; 3006 3007 fn test_vm_state_transitions(state: VmState) { 3008 match state { 3009 VmState::Created => { 3010 // Check the transitions from Created 3011 assert!(state.valid_transition(VmState::Created).is_err()); 3012 assert!(state.valid_transition(VmState::Running).is_ok()); 3013 assert!(state.valid_transition(VmState::Shutdown).is_err()); 3014 assert!(state.valid_transition(VmState::Paused).is_ok()); 3015 assert!(state.valid_transition(VmState::BreakPoint).is_ok()); 3016 } 3017 
VmState::Running => { 3018 // Check the transitions from Running 3019 assert!(state.valid_transition(VmState::Created).is_err()); 3020 assert!(state.valid_transition(VmState::Running).is_err()); 3021 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 3022 assert!(state.valid_transition(VmState::Paused).is_ok()); 3023 assert!(state.valid_transition(VmState::BreakPoint).is_ok()); 3024 } 3025 VmState::Shutdown => { 3026 // Check the transitions from Shutdown 3027 assert!(state.valid_transition(VmState::Created).is_err()); 3028 assert!(state.valid_transition(VmState::Running).is_ok()); 3029 assert!(state.valid_transition(VmState::Shutdown).is_err()); 3030 assert!(state.valid_transition(VmState::Paused).is_err()); 3031 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 3032 } 3033 VmState::Paused => { 3034 // Check the transitions from Paused 3035 assert!(state.valid_transition(VmState::Created).is_err()); 3036 assert!(state.valid_transition(VmState::Running).is_ok()); 3037 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 3038 assert!(state.valid_transition(VmState::Paused).is_err()); 3039 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 3040 } 3041 VmState::BreakPoint => { 3042 // Check the transitions from Breakpoint 3043 assert!(state.valid_transition(VmState::Created).is_ok()); 3044 assert!(state.valid_transition(VmState::Running).is_ok()); 3045 assert!(state.valid_transition(VmState::Shutdown).is_err()); 3046 assert!(state.valid_transition(VmState::Paused).is_err()); 3047 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 3048 } 3049 } 3050 } 3051 3052 #[test] 3053 fn test_vm_created_transitions() { 3054 test_vm_state_transitions(VmState::Created); 3055 } 3056 3057 #[test] 3058 fn test_vm_running_transitions() { 3059 test_vm_state_transitions(VmState::Running); 3060 } 3061 3062 #[test] 3063 fn test_vm_shutdown_transitions() { 3064 test_vm_state_transitions(VmState::Shutdown); 3065 } 3066 3067 #[test] 3068 fn 
test_vm_paused_transitions() { 3069 test_vm_state_transitions(VmState::Paused); 3070 } 3071 3072 #[cfg(feature = "tdx")] 3073 #[test] 3074 fn test_hob_memory_resources() { 3075 // Case 1: Two TDVF sections in the middle of the RAM 3076 let sections = vec![ 3077 TdvfSection { 3078 address: 0xc000, 3079 size: 0x1000, 3080 ..Default::default() 3081 }, 3082 TdvfSection { 3083 address: 0x1000, 3084 size: 0x4000, 3085 ..Default::default() 3086 }, 3087 ]; 3088 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0), 0x1000_0000)]; 3089 let expected = vec![ 3090 (0, 0x1000, true), 3091 (0x1000, 0x4000, false), 3092 (0x5000, 0x7000, true), 3093 (0xc000, 0x1000, false), 3094 (0xd000, 0x0fff_3000, true), 3095 ]; 3096 assert_eq!( 3097 expected, 3098 Vm::hob_memory_resources( 3099 sections, 3100 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3101 ) 3102 ); 3103 3104 // Case 2: Two TDVF sections with no conflict with the RAM 3105 let sections = vec![ 3106 TdvfSection { 3107 address: 0x1000_1000, 3108 size: 0x1000, 3109 ..Default::default() 3110 }, 3111 TdvfSection { 3112 address: 0, 3113 size: 0x1000, 3114 ..Default::default() 3115 }, 3116 ]; 3117 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3118 let expected = vec![ 3119 (0, 0x1000, false), 3120 (0x1000, 0x1000_0000, true), 3121 (0x1000_1000, 0x1000, false), 3122 ]; 3123 assert_eq!( 3124 expected, 3125 Vm::hob_memory_resources( 3126 sections, 3127 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3128 ) 3129 ); 3130 3131 // Case 3: Two TDVF sections with partial conflicts with the RAM 3132 let sections = vec![ 3133 TdvfSection { 3134 address: 0x1000_0000, 3135 size: 0x2000, 3136 ..Default::default() 3137 }, 3138 TdvfSection { 3139 address: 0, 3140 size: 0x2000, 3141 ..Default::default() 3142 }, 3143 ]; 3144 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3145 let expected = vec![ 3146 (0, 0x2000, false), 3147 (0x2000, 
0x0fff_e000, true), 3148 (0x1000_0000, 0x2000, false), 3149 ]; 3150 assert_eq!( 3151 expected, 3152 Vm::hob_memory_resources( 3153 sections, 3154 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3155 ) 3156 ); 3157 3158 // Case 4: Two TDVF sections with no conflict before the RAM and two 3159 // more additional sections with no conflict after the RAM. 3160 let sections = vec![ 3161 TdvfSection { 3162 address: 0x2000_1000, 3163 size: 0x1000, 3164 ..Default::default() 3165 }, 3166 TdvfSection { 3167 address: 0x2000_0000, 3168 size: 0x1000, 3169 ..Default::default() 3170 }, 3171 TdvfSection { 3172 address: 0x1000, 3173 size: 0x1000, 3174 ..Default::default() 3175 }, 3176 TdvfSection { 3177 address: 0, 3178 size: 0x1000, 3179 ..Default::default() 3180 }, 3181 ]; 3182 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x4000), 0x1000_0000)]; 3183 let expected = vec![ 3184 (0, 0x1000, false), 3185 (0x1000, 0x1000, false), 3186 (0x4000, 0x1000_0000, true), 3187 (0x2000_0000, 0x1000, false), 3188 (0x2000_1000, 0x1000, false), 3189 ]; 3190 assert_eq!( 3191 expected, 3192 Vm::hob_memory_resources( 3193 sections, 3194 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3195 ) 3196 ); 3197 3198 // Case 5: One TDVF section overriding the entire RAM 3199 let sections = vec![TdvfSection { 3200 address: 0, 3201 size: 0x2000_0000, 3202 ..Default::default() 3203 }]; 3204 let guest_ranges: Vec<(GuestAddress, usize)> = vec![(GuestAddress(0x1000), 0x1000_0000)]; 3205 let expected = vec![(0, 0x2000_0000, false)]; 3206 assert_eq!( 3207 expected, 3208 Vm::hob_memory_resources( 3209 sections, 3210 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3211 ) 3212 ); 3213 3214 // Case 6: Two TDVF sections with no conflict with 2 RAM regions 3215 let sections = vec![ 3216 TdvfSection { 3217 address: 0x1000_2000, 3218 size: 0x2000, 3219 ..Default::default() 3220 }, 3221 TdvfSection { 3222 address: 0, 3223 size: 0x2000, 3224 ..Default::default() 3225 }, 3226 ]; 3227 let 
guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3228 (GuestAddress(0x2000), 0x1000_0000), 3229 (GuestAddress(0x1000_4000), 0x1000_0000), 3230 ]; 3231 let expected = vec![ 3232 (0, 0x2000, false), 3233 (0x2000, 0x1000_0000, true), 3234 (0x1000_2000, 0x2000, false), 3235 (0x1000_4000, 0x1000_0000, true), 3236 ]; 3237 assert_eq!( 3238 expected, 3239 Vm::hob_memory_resources( 3240 sections, 3241 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3242 ) 3243 ); 3244 3245 // Case 7: Two TDVF sections with partial conflicts with 2 RAM regions 3246 let sections = vec![ 3247 TdvfSection { 3248 address: 0x1000_0000, 3249 size: 0x4000, 3250 ..Default::default() 3251 }, 3252 TdvfSection { 3253 address: 0, 3254 size: 0x4000, 3255 ..Default::default() 3256 }, 3257 ]; 3258 let guest_ranges: Vec<(GuestAddress, usize)> = vec![ 3259 (GuestAddress(0x1000), 0x1000_0000), 3260 (GuestAddress(0x1000_3000), 0x1000_0000), 3261 ]; 3262 let expected = vec![ 3263 (0, 0x4000, false), 3264 (0x4000, 0x0fff_c000, true), 3265 (0x1000_0000, 0x4000, false), 3266 (0x1000_4000, 0x0fff_f000, true), 3267 ]; 3268 assert_eq!( 3269 expected, 3270 Vm::hob_memory_resources( 3271 sections, 3272 &GuestMemoryMmap::from_ranges(&guest_ranges).unwrap() 3273 ) 3274 ); 3275 } 3276 } 3277 3278 #[cfg(target_arch = "aarch64")] 3279 #[cfg(test)] 3280 mod tests { 3281 use super::*; 3282 use crate::GuestMemoryMmap; 3283 use arch::aarch64::fdt::create_fdt; 3284 use arch::aarch64::layout; 3285 use arch::{DeviceType, MmioDeviceInfo}; 3286 3287 const LEN: u64 = 4096; 3288 3289 #[test] 3290 fn test_create_fdt_with_devices() { 3291 let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)]; 3292 let mem = GuestMemoryMmap::from_ranges(®ions).expect("Cannot initialize memory"); 3293 3294 let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [ 3295 ( 3296 (DeviceType::Serial, DeviceType::Serial.to_string()), 3297 MmioDeviceInfo { 3298 addr: 0x00, 3299 len: LEN, 3300 irq: 33, 3301 }, 
3302 ), 3303 ( 3304 (DeviceType::Virtio(1), "virtio".to_string()), 3305 MmioDeviceInfo { 3306 addr: LEN, 3307 len: LEN, 3308 irq: 34, 3309 }, 3310 ), 3311 ( 3312 (DeviceType::Rtc, "rtc".to_string()), 3313 MmioDeviceInfo { 3314 addr: 2 * LEN, 3315 len: LEN, 3316 irq: 35, 3317 }, 3318 ), 3319 ] 3320 .iter() 3321 .cloned() 3322 .collect(); 3323 3324 let hv = hypervisor::new().unwrap(); 3325 let vm = hv.create_vm().unwrap(); 3326 let gic = vm 3327 .create_vgic( 3328 1, 3329 0x0900_0000 - 0x01_0000, 3330 0x01_0000, 3331 0x02_0000, 3332 0x02_0000, 3333 256, 3334 ) 3335 .expect("Cannot create gic"); 3336 assert!(create_fdt( 3337 &mem, 3338 "console=tty0", 3339 vec![0], 3340 Some((0, 0, 0)), 3341 &dev_info, 3342 &gic, 3343 &None, 3344 &Vec::new(), 3345 &BTreeMap::new(), 3346 None, 3347 true, 3348 ) 3349 .is_ok()) 3350 } 3351 } 3352 3353 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 3354 #[test] 3355 pub fn test_vm() { 3356 use hypervisor::VmExit; 3357 use vm_memory::{Address, GuestMemory, GuestMemoryRegion}; 3358 // This example based on https://lwn.net/Articles/658511/ 3359 let code = [ 3360 0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */ 3361 0x00, 0xd8, /* add %bl, %al */ 3362 0x04, b'0', /* add $'0', %al */ 3363 0xee, /* out %al, (%dx) */ 3364 0xb0, b'\n', /* mov $'\n', %al */ 3365 0xee, /* out %al, (%dx) */ 3366 0xf4, /* hlt */ 3367 ]; 3368 3369 let mem_size = 0x1000; 3370 let load_addr = GuestAddress(0x1000); 3371 let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap(); 3372 3373 let hv = hypervisor::new().unwrap(); 3374 let vm = hv.create_vm().expect("new VM creation failed"); 3375 3376 for (index, region) in mem.iter().enumerate() { 3377 let mem_region = vm.make_user_memory_region( 3378 index as u32, 3379 region.start_addr().raw_value(), 3380 region.len() as u64, 3381 region.as_ptr() as u64, 3382 false, 3383 false, 3384 ); 3385 3386 vm.create_user_memory_region(mem_region) 3387 .expect("Cannot configure guest memory"); 3388 } 3389 
mem.write_slice(&code, load_addr) 3390 .expect("Writing code to memory failed"); 3391 3392 let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed"); 3393 3394 let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed"); 3395 vcpu_sregs.cs.base = 0; 3396 vcpu_sregs.cs.selector = 0; 3397 vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed"); 3398 3399 let mut vcpu_regs = vcpu.get_regs().expect("get regs failed"); 3400 vcpu_regs.rip = 0x1000; 3401 vcpu_regs.rax = 2; 3402 vcpu_regs.rbx = 3; 3403 vcpu_regs.rflags = 2; 3404 vcpu.set_regs(&vcpu_regs).expect("set regs failed"); 3405 3406 loop { 3407 match vcpu.run().expect("run failed") { 3408 VmExit::IoOut(addr, data) => { 3409 println!( 3410 "IO out -- addr: {:#x} data [{:?}]", 3411 addr, 3412 str::from_utf8(data).unwrap() 3413 ); 3414 } 3415 VmExit::Reset => { 3416 println!("HLT"); 3417 break; 3418 } 3419 r => panic!("unexpected exit reason: {:?}", r), 3420 } 3421 } 3422 } 3423