1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 use crate::config::NumaConfig; 15 use crate::config::{ 16 add_to_config, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 17 UserDeviceConfig, ValidationError, VdpaConfig, VmConfig, VsockConfig, 18 }; 19 use crate::cpu; 20 use crate::device_manager::{self, Console, DeviceManager, DeviceManagerError, PtyPair}; 21 use crate::device_tree::DeviceTree; 22 #[cfg(feature = "gdb")] 23 use crate::gdb::{Debuggable, DebuggableError, GdbRequestPayload, GdbResponsePayload}; 24 use crate::memory_manager::{ 25 Error as MemoryManagerError, MemoryManager, MemoryManagerSnapshotData, 26 }; 27 use crate::migration::{get_vm_snapshot, url_to_path, SNAPSHOT_CONFIG_FILE, SNAPSHOT_STATE_FILE}; 28 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 29 use crate::GuestMemoryMmap; 30 use crate::{ 31 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 32 }; 33 use anyhow::anyhow; 34 use arch::get_host_cpu_phys_bits; 35 #[cfg(target_arch = "x86_64")] 36 use arch::layout::{KVM_IDENTITY_MAP_START, KVM_TSS_START}; 37 #[cfg(feature = "tdx")] 38 use arch::x86_64::tdx::TdvfSection; 39 use arch::EntryPoint; 40 #[cfg(target_arch = "aarch64")] 41 use arch::PciSpaceInfo; 42 use arch::{NumaNode, NumaNodes}; 43 use devices::AcpiNotificationFlags; 44 #[cfg(all(target_arch = "x86_64", feature = "gdb"))] 45 use gdbstub_arch::x86::reg::X86_64CoreRegs; 46 use hypervisor::vm::{HypervisorVmError, VmmOps}; 47 use linux_loader::cmdline::Cmdline; 48 #[cfg(target_arch = "x86_64")] 49 use 
linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 50 #[cfg(target_arch = "aarch64")] 51 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 52 use linux_loader::loader::KernelLoader; 53 use seccompiler::{apply_filter, SeccompAction}; 54 use signal_hook::{ 55 consts::{SIGINT, SIGTERM, SIGWINCH}, 56 iterator::backend::Handle, 57 iterator::Signals, 58 }; 59 use std::cmp; 60 use std::collections::BTreeMap; 61 use std::collections::HashMap; 62 use std::convert::TryInto; 63 #[cfg(target_arch = "x86_64")] 64 use std::fmt; 65 use std::fs::{File, OpenOptions}; 66 use std::io::{self, Read, Write}; 67 use std::io::{Seek, SeekFrom}; 68 #[cfg(feature = "tdx")] 69 use std::mem; 70 use std::num::Wrapping; 71 use std::ops::Deref; 72 use std::os::unix::net::UnixStream; 73 use std::panic::AssertUnwindSafe; 74 use std::sync::{Arc, Mutex, RwLock}; 75 use std::{result, str, thread}; 76 use vm_device::Bus; 77 #[cfg(target_arch = "x86_64")] 78 use vm_device::BusDevice; 79 #[cfg(target_arch = "x86_64")] 80 use vm_memory::Address; 81 #[cfg(feature = "tdx")] 82 use vm_memory::{ByteValued, GuestMemory, GuestMemoryRegion}; 83 use vm_memory::{Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic}; 84 use vm_migration::protocol::{Request, Response, Status}; 85 use vm_migration::{ 86 protocol::MemoryRangeTable, Migratable, MigratableError, Pausable, Snapshot, 87 SnapshotDataSection, Snapshottable, Transportable, 88 }; 89 use vmm_sys_util::eventfd::EventFd; 90 use vmm_sys_util::signal::unblock_signal; 91 use vmm_sys_util::sock_ctrl_msg::ScmSocket; 92 use vmm_sys_util::terminal::Terminal; 93 94 #[cfg(target_arch = "aarch64")] 95 use arch::aarch64::gic::gicv3_its::kvm::{KvmGicV3Its, GIC_V3_ITS_SNAPSHOT_ID}; 96 #[cfg(target_arch = "aarch64")] 97 use arch::aarch64::gic::kvm::create_gic; 98 #[cfg(target_arch = "aarch64")] 99 use devices::interrupt_controller::{self, InterruptController}; 100 101 /// Errors associated with VM management 102 #[derive(Debug)] 103 pub enum Error 
{
    /// Cannot open the kernel image
    KernelFile(io::Error),

    /// Cannot open the initramfs image
    InitramfsFile(io::Error),

    /// Cannot load the kernel in memory
    KernelLoad(linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    /// Cannot load the UEFI binary in memory
    UefiLoad(arch::aarch64::uefi::Error),

    /// Cannot load the initramfs in memory
    InitramfsLoad,

    /// Cannot load the command line in memory
    LoadCmdLine(linux_loader::loader::Error),

    /// Cannot modify the command line
    CmdLineInsertStr(linux_loader::cmdline::Error),

    /// Cannot configure system
    ConfigureSystem(arch::Error),

    /// Cannot enable interrupt controller
    #[cfg(target_arch = "aarch64")]
    EnableInterruptController(interrupt_controller::Error),

    /// Cannot acquire the VM state lock (it is poisoned or busy)
    PoisonedState,

    /// Cannot create a device manager.
    DeviceManager(DeviceManagerError),

    /// Write to the console failed.
    Console(vmm_sys_util::errno::Error),

    /// Write to the pty console failed.
    PtyConsole(io::Error),

    /// Cannot setup terminal in raw mode.
    SetTerminalRaw(vmm_sys_util::errno::Error),

    /// Cannot setup terminal in canonical mode.
    SetTerminalCanon(vmm_sys_util::errno::Error),

    /// Memory overflow
    MemOverflow,

    /// Cannot spawn a signal handler thread
    SignalHandlerSpawn(io::Error),

    /// Failed to join on vCPU threads
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    /// VM config is missing.
    VmMissingConfig,

    /// VM is not created
    VmNotCreated,

    /// VM is already created
    VmAlreadyCreated,

    /// VM is not running
    VmNotRunning,

    /// Cannot clone EventFd.
    EventFdClone(io::Error),

    /// Invalid VM state transition
    InvalidStateTransition(VmState, VmState),

    /// Error from CPU handling
    CpuManager(cpu::Error),

    /// Cannot pause devices
    PauseDevices(MigratableError),

    /// Cannot resume devices
    ResumeDevices(MigratableError),

    /// Cannot pause CPUs
    PauseCpus(MigratableError),

    /// Cannot resume CPUs
    ResumeCpus(MigratableError),

    /// Cannot pause VM
    Pause(MigratableError),

    /// Cannot resume VM
    Resume(MigratableError),

    /// Memory manager error
    MemoryManager(MemoryManagerError),

    /// Eventfd write error
    EventfdError(std::io::Error),

    /// Cannot snapshot VM
    Snapshot(MigratableError),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot send VM snapshot
    SnapshotSend(MigratableError),

    /// Cannot convert source URL from Path into &str
    RestoreSourceUrlPathToStr,

    /// Failed to validate config
    ConfigValidation(ValidationError),

    /// No more than one virtio-vsock device can be attached
    TooManyVsockDevices,

    /// Failed serializing into JSON
    SerializeJson(serde_json::Error),

    /// Invalid configuration for NUMA.
    InvalidNumaConfig,

    /// Cannot create seccomp filter
    CreateSeccompFilter(seccompiler::Error),

    /// Cannot apply seccomp filter
    ApplySeccompFilter(seccompiler::Error),

    /// Failed resizing a memory zone.
    ResizeZone,

    /// Cannot activate virtio devices
    ActivateVirtioDevices(device_manager::DeviceManagerError),

    /// Error triggering power button
    PowerButton(device_manager::DeviceManagerError),

    /// Kernel lacks PVH header
    KernelMissingPvhHeader,

    /// Failed to allocate firmware RAM
    AllocateFirmwareMemory(MemoryManagerError),

    /// Error manipulating firmware file
    FirmwareFile(std::io::Error),

    /// Firmware too big
    FirmwareTooLarge,

    /// Failed to copy the firmware into guest memory
    FirmwareLoad(vm_memory::GuestMemoryError),

    /// Error performing I/O on TDX firmware file
    #[cfg(feature = "tdx")]
    LoadTdvf(std::io::Error),

    /// Error performing I/O on the payload file
    #[cfg(feature = "tdx")]
    LoadPayload(std::io::Error),

    /// Error parsing TDVF
    #[cfg(feature = "tdx")]
    ParseTdvf(arch::x86_64::tdx::TdvfError),

    /// Error populating HOB
    #[cfg(feature = "tdx")]
    PopulateHob(arch::x86_64::tdx::TdvfError),

    /// Error allocating TDVF memory
    #[cfg(feature = "tdx")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    /// Error enabling TDX VM
    #[cfg(feature = "tdx")]
    InitializeTdxVm(hypervisor::HypervisorVmError),

    /// Error enabling TDX memory region
    #[cfg(feature = "tdx")]
    InitializeTdxMemoryRegion(hypervisor::HypervisorVmError),

    /// Error finalizing TDX setup
    #[cfg(feature = "tdx")]
    FinalizeTdx(hypervisor::HypervisorVmError),

    /// Invalid payload type
    #[cfg(feature = "tdx")]
    InvalidPayloadType,

    /// Error debugging VM
    #[cfg(feature = "gdb")]
    Debug(DebuggableError),
}
pub type Result<T> = result::Result<T, Error>;

/// Life-cycle states of a VM. Transitions between them are validated by
/// [`VmState::valid_transition`].
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
    BreakPoint,
}

impl VmState {
    /// Checks whether moving from `self` to `new_state` is a legal
    /// life-cycle transition, returning `Error::InvalidStateTransition`
    /// otherwise. Note that a `Shutdown` VM may only go back to `Running`.
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Paused | VmState::BreakPoint => Ok(()),
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown | VmState::BreakPoint => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused | VmState::BreakPoint => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
            VmState::BreakPoint => match new_state {
                VmState::Created | VmState::Running => Ok(()),
                _ => Err(Error::InvalidStateTransition(self, new_state)),
            },
        }
    }
}

// Debug I/O port
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT: u16 = 0x80;
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT_PREFIX: &str = "Debug I/O port";

#[cfg(target_arch = "x86_64")]
/// Debug I/O port, see:
/// https://www.intel.com/content/www/us/en/support/articles/000005500/boards-and-kits.html
///
/// Since we're not a physical platform, we can freely assign code ranges for
/// debugging specific parts of our virtual platform.
pub enum DebugIoPortRange {
    Firmware,
    Bootloader,
    Kernel,
    Userspace,
    Custom,
}
#[cfg(target_arch = "x86_64")]
impl DebugIoPortRange {
    /// Maps a code written to the debug I/O port onto the platform
    /// component it was assigned to (see range assignment above).
    fn from_u8(value: u8) -> DebugIoPortRange {
        match value {
            0x00..=0x1f => DebugIoPortRange::Firmware,
            0x20..=0x3f => DebugIoPortRange::Bootloader,
            0x40..=0x5f => DebugIoPortRange::Kernel,
            0x60..=0x7f => DebugIoPortRange::Userspace,
            _ => DebugIoPortRange::Custom,
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl fmt::Display for DebugIoPortRange {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            DebugIoPortRange::Firmware => write!(f, "{}: Firmware", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Bootloader => write!(f, "{}: Bootloader", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Kernel => write!(f, "{}: Kernel", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Userspace => write!(f, "{}: Userspace", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Custom => write!(f, "{}: Custom", DEBUG_IOPORT_PREFIX),
        }
    }
}

/// State handed to the hypervisor so vCPU exits (MMIO/PIO/guest memory
/// accesses) can be dispatched back onto the VMM's buses and memory.
struct VmOps {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
    // Reference instant used to timestamp debug I/O port writes.
    #[cfg(target_arch = "x86_64")]
    timestamp: std::time::Instant,
    #[cfg(target_arch = "x86_64")]
    pci_config_io: Arc<Mutex<dyn BusDevice>>,
}

impl VmOps {
    #[cfg(target_arch = "x86_64")]
    // Log debug io port codes with the time elapsed since `self.timestamp`.
    fn log_debug_ioport(&self, code: u8) {
        let elapsed = self.timestamp.elapsed();

        info!(
            "[{} code 0x{:x}] {}.{:>06} seconds",
            DebugIoPortRange::from_u8(code),
            code,
            elapsed.as_secs(),
            // BUGFIX: was `elapsed.as_micros()`, which is the *total*
            // microsecond count (it includes the whole seconds already
            // printed above). `subsec_micros()` is the fractional part
            // the "{}.{:>06}" format expects.
            elapsed.subsec_micros()
        );
    }
}

impl VmmOps for VmOps {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> hypervisor::vm::Result<usize> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> hypervisor::vm::Result<usize> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
        // Reads to unregistered addresses are only warned about, not fatal.
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            warn!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    fn mmio_write(&self, gpa: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            // The device asked us to block until its barrier is released.
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
        use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE};

        // PCI config space accesses (0xcf8/0xcfc window) bypass the I/O bus
        // and go straight to the PCI config device.
        if (PCI_CONFIG_IO_PORT..(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE)).contains(&port) {
            self.pci_config_io.lock().unwrap().read(
                PCI_CONFIG_IO_PORT,
                port - PCI_CONFIG_IO_PORT,
                data,
            );
            return Ok(());
        }

        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            warn!("Guest PIO read to unregistered address 0x{:x}", port);
        }

        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
        use pci::{PCI_CONFIG_IO_PORT, PCI_CONFIG_IO_PORT_SIZE};

        // Single-byte writes to port 0x80 are debug codes; log and consume.
        if port == DEBUG_IOPORT as u64 && data.len() == 1 {
            self.log_debug_ioport(data[0]);
            return Ok(());
        }

        // PCI config space accesses bypass the I/O bus, same as pio_read.
        if (PCI_CONFIG_IO_PORT..(PCI_CONFIG_IO_PORT + PCI_CONFIG_IO_PORT_SIZE)).contains(&port) {
            self.pci_config_io.lock().unwrap().write(
                PCI_CONFIG_IO_PORT,
                port - PCI_CONFIG_IO_PORT,
                data,
            );
            return Ok(());
        }

        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}

/// Clamps the requested guest physical address width to what the host
/// CPU actually supports.
pub fn physical_bits(max_phys_bits: u8) -> u8 {
    let host_phys_bits = get_host_cpu_phys_bits();

    cmp::min(host_phys_bits, max_phys_bits)
}

pub const HANDLED_SIGNALS: [i32; 3] = [SIGWINCH, SIGTERM, SIGINT];

pub struct Vm {
    kernel: Option<File>,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    // Whether stdin is a TTY (drives raw/canonical terminal handling).
    on_tty: bool,
    signals: Option<Handle>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(not(feature = "kvm"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,

    numa_nodes: NumaNodes,
    seccomp_action: SeccompAction,
    exit_evt: EventFd,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
    // When true (gdb builds), the VM is created but vCPUs are not started.
    stop_on_boot: bool,
}

impl Vm {
    /// Common construction path shared by `new`, `new_from_snapshot` and
    /// `new_from_migration`: validates the config, builds NUMA nodes, the
    /// device manager and the CPU manager, and assembles the `Vm` in the
    /// `Created` state. `restoring` is forwarded to the device manager so
    /// it can distinguish a fresh boot from a snapshot/migration restore.
    #[allow(clippy::too_many_arguments)]
    fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        restoring: bool,
    ) -> Result<Self> {
        config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        // TDX guests force an IOMMU in front of all devices.
        #[cfg(feature = "tdx")]
        let force_iommu = config.lock().unwrap().tdx.is_some();
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        #[cfg(feature = "gdb")]
        let stop_on_boot = config.lock().unwrap().gdb;
        #[cfg(not(feature = "gdb"))]
        let stop_on_boot = false;

        let device_manager = DeviceManager::new(
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            &exit_evt,
            &reset_evt,
            seccomp_action.clone(),
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            restoring,
        )
        .map_err(Error::DeviceManager)?;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::clone(device_manager.lock().unwrap().io_bus());
        let mmio_bus = Arc::clone(device_manager.lock().unwrap().mmio_bus());
        // Create the VmOps structure, which implements the VmmOps trait.
        // And send it to the hypervisor.

        #[cfg(target_arch = "x86_64")]
        let pci_config_io =
            device_manager.lock().unwrap().pci_config_io() as Arc<Mutex<dyn BusDevice>>;
        let vm_ops: Arc<dyn VmmOps> = Arc::new(VmOps {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            #[cfg(target_arch = "x86_64")]
            timestamp: std::time::Instant::now(),
            #[cfg(target_arch = "x86_64")]
            pci_config_io,
        });

        let exit_evt_clone = exit_evt.try_clone().map_err(Error::EventFdClone)?;
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        let cpus_config = { &config.lock().unwrap().cpus.clone() };
        let cpu_manager = cpu::CpuManager::new(
            cpus_config,
            &device_manager,
            &memory_manager,
            vm.clone(),
            exit_evt_clone,
            reset_evt,
            #[cfg(feature = "gdb")]
            vm_debug_evt,
            hypervisor.clone(),
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            &numa_nodes,
        )
        .map_err(Error::CpuManager)?;

        // Remember whether stdin is a terminal so console teardown can
        // restore canonical mode later.
        let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO as i32) } != 0;
        let kernel = config
            .lock()
            .unwrap()
            .kernel
            .as_ref()
            .map(|k| File::open(&k.path))
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .initramfs
            .as_ref()
            .map(|i| File::open(&i.path))
            .transpose()
            .map_err(Error::InitramfsFile)?;

        Ok(Vm {
            kernel,
            initramfs,
            device_manager,
            config,
            on_tty,
            threads: Vec::with_capacity(1),
            signals: None,
            state: RwLock::new(VmState::Created),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock: None,

            numa_nodes,
            seccomp_action: seccomp_action.clone(),
            exit_evt,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            hypervisor,
            stop_on_boot,
        })
    }

    /// Builds the guest NUMA topology from the user-provided `NumaConfig`
    /// entries, resolving memory-zone / SGX-EPC-section names against the
    /// memory manager. Fails with `InvalidNumaConfig` on duplicates or
    /// unknown references.
    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager:
&Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                // Each guest NUMA node id may only be defined once.
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Can't define twice the same NUMA node");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                // Resolve the referenced memory zone names against the
                // memory manager's zones.
                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                // NUMA distances: destinations must reference declared
                // nodes and may not be set twice for the same destination.
                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has been already set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

    /// Creates a brand new VM: creates the hypervisor VM object, the
    /// memory manager, then delegates to `new_from_memory_manager` and
    /// finally asks the device manager to create all devices (this is the
    /// from-scratch boot path, as opposed to snapshot/migration restore).
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
    ) -> Result<Self> {
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        hypervisor.check_required_extensions().unwrap();
        #[cfg(feature = "tdx")]
        let vm = hypervisor
            .create_vm_with_type(if tdx_enabled {
                2 // KVM_X86_TDX_VM
            } else {
                0 // KVM_X86_LEGACY_VM
            })
            .unwrap();
        #[cfg(not(feature = "tdx"))]
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        let phys_bits = physical_bits(config.lock().unwrap().cpus.max_phys_bits);

        #[cfg(target_arch = "x86_64")]
        let sgx_epc_config = config.lock().unwrap().sgx_epc.clone();

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            None,
            phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            None,
            None,
            #[cfg(target_arch = "x86_64")]
            sgx_epc_config,
        )
        .map_err(Error::MemoryManager)?;

        let new_vm = Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature =
"gdb")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            false,
        )?;

        // The device manager must create the devices from here as it is part
        // of the regular code path creating everything from scratch.
        new_vm
            .device_manager
            .lock()
            .unwrap()
            .create_devices(serial_pty, console_pty, console_resize_pipe)
            .map_err(Error::DeviceManager)?;
        Ok(new_vm)
    }

    /// Restores a VM from a snapshot: recreates the hypervisor VM, applies
    /// the saved VM state, rebuilds the memory manager from the snapshot
    /// data (optionally prefaulting pages from `source_url`), then goes
    /// through `new_from_memory_manager` with `restoring = true`.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        vm_config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
        source_url: Option<&str>,
        prefault: bool,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        if let Some(state) = vm_snapshot.state {
            vm.set_state(state)
                .map_err(|e| Error::Restore(MigratableError::Restore(e.into())))?;
        }

        // A snapshot without a memory manager section cannot be restored.
        let memory_manager = if let Some(memory_manager_snapshot) =
            snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
        {
            let phys_bits = physical_bits(vm_config.lock().unwrap().cpus.max_phys_bits);
            MemoryManager::new_from_snapshot(
                memory_manager_snapshot,
                vm.clone(),
                &vm_config.lock().unwrap().memory.clone(),
                source_url,
                prefault,
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            return Err(Error::Restore(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            ))));
        };

        Vm::new_from_memory_manager(
            vm_config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "gdb")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            true,
        )
    }

    /// Creates the destination VM for a live migration: like `new`, but the
    /// memory manager is seeded from the sender's snapshot metadata (and
    /// possibly pre-existing memory files), and the device manager is told
    /// it is restoring.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_migration(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        #[cfg(feature = "gdb")] vm_debug_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        memory_manager_data: &MemoryManagerSnapshotData,
        existing_memory_files: Option<HashMap<u32, File>>,
    ) -> Result<Self> {
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        {
            vm.set_identity_map_address(KVM_IDENTITY_MAP_START.0)
                .unwrap();
            vm.set_tss_address(KVM_TSS_START.0 as usize).unwrap();
            vm.enable_split_irq().unwrap();
        }

        let phys_bits = physical_bits(config.lock().unwrap().cpus.max_phys_bits);

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            None,
            phys_bits,
            #[cfg(feature = "tdx")]
            false,
            Some(memory_manager_data),
            existing_memory_files,
            #[cfg(target_arch = "x86_64")]
            None,
        )
        .map_err(Error::MemoryManager)?;

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            #[cfg(feature = "gdb")]
            vm_debug_evt,
            seccomp_action,
            hypervisor,
            activate_evt,
            true,
        )
    }

    /// Copies the initramfs file into guest memory at the address chosen by
    /// `arch::initramfs_load_addr` and returns where/how much was loaded.
    /// Panics if called without an initramfs configured (callers check
    /// `self.initramfs` first).
    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs = self.initramfs.as_ref().unwrap();
        // Seek to the end to learn the file size...
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_|
Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        // ...then rewind so the copy below starts from the beginning.
        initramfs
            .seek(SeekFrom::Start(0))
            .map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    /// Assembles the guest kernel command line from the configured args
    /// plus any additions requested by the device manager.
    fn get_cmdline(&mut self) -> Result<Cmdline> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE);
        cmdline
            .insert_str(self.config.lock().unwrap().cmdline.args.clone())
            .map_err(Error::CmdLineInsertStr)?;
        for entry in self.device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        Ok(cmdline)
    }

    /// aarch64: loads the kernel as a PE image at KERNEL_START, falling
    /// back to treating it as a raw UEFI binary at UEFI_START if the PE
    /// magic number does not match.
    #[cfg(target_arch = "aarch64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        let entry_addr = match linux_loader::loader::pe::PE::load(
            mem.deref(),
            Some(arch::layout::KERNEL_START),
            &mut kernel,
            None,
        ) {
            Ok(entry_addr) => entry_addr,
            // Try to load the binary as kernel PE file at first.
            // If failed, retry to load it as UEFI binary.
            // As the UEFI binary is formatless, it must be the last option to try.
            Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                arch::aarch64::uefi::load_uefi(mem.deref(), arch::layout::UEFI_START, &mut kernel)
                    .map_err(Error::UefiLoad)?;
                // The entry point offset in UEFI image is always 0.
                return Ok(EntryPoint {
                    entry_addr: arch::layout::UEFI_START,
                });
            }
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        let entry_point_addr: GuestAddress = entry_addr.kernel_load;

        Ok(EntryPoint {
            entry_addr: entry_point_addr,
        })
    }

    /// x86_64: loads the kernel as an ELF at HIGH_RAM_START; if the file is
    /// not an ELF it is assumed to be raw firmware (e.g. OVMF) and mapped
    /// just below the 4GiB boundary. An ELF kernel must expose a PVH entry
    /// point or loading fails with `KernelMissingPvhHeader`.
    #[cfg(target_arch = "x86_64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        use linux_loader::loader::{elf::Error::InvalidElfMagicNumber, Error::Elf};
        info!("Loading kernel");
        let cmdline = self.get_cmdline()?;
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        let entry_addr = match linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        ) {
            Ok(entry_addr) => entry_addr,
            Err(e) => match e {
                Elf(InvalidElfMagicNumber) => {
                    // Not an ELF header - assume raw binary data / firmware
                    let size = kernel.seek(SeekFrom::End(0)).map_err(Error::FirmwareFile)?;

                    // The OVMF firmware is as big as you might expect and it's 4MiB so limit to that
                    if size > 4 << 20 {
                        return Err(Error::FirmwareTooLarge);
                    }

                    // Loaded at the end of the 4GiB
                    let load_address = GuestAddress(4 << 30)
                        .checked_sub(size)
                        .ok_or(Error::FirmwareTooLarge)?;

                    info!(
                        "Loading RAW firmware at 0x{:x} (size: {})",
                        load_address.raw_value(),
                        size
                    );

                    self.memory_manager
                        .lock()
                        .unwrap()
                        .add_ram_region(load_address, size as usize)
                        .map_err(Error::AllocateFirmwareMemory)?;

                    // Rewind and copy the whole file into the freshly added
                    // RAM region.
                    kernel
                        .seek(SeekFrom::Start(0))
                        .map_err(Error::FirmwareFile)?;
                    guest_memory
                        .memory()
                        .read_exact_from(load_address, &mut kernel, size as usize)
                        .map_err(Error::FirmwareLoad)?;

                    // Firmware boots from the reset vector; no entry point.
                    return Ok(EntryPoint { entry_addr: None });
                }
_ => { 1076 return Err(Error::KernelLoad(e)); 1077 } 1078 }, 1079 }; 1080 1081 linux_loader::loader::load_cmdline(mem.deref(), arch::layout::CMDLINE_START, &cmdline) 1082 .map_err(Error::LoadCmdLine)?; 1083 1084 if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap { 1085 // Use the PVH kernel entry point to boot the guest 1086 info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0); 1087 Ok(EntryPoint { 1088 entry_addr: Some(entry_addr), 1089 }) 1090 } else { 1091 Err(Error::KernelMissingPvhHeader) 1092 } 1093 } 1094 1095 #[cfg(target_arch = "x86_64")] 1096 fn configure_system(&mut self, rsdp_addr: GuestAddress) -> Result<()> { 1097 info!("Configuring system"); 1098 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1099 1100 let initramfs_config = match self.initramfs { 1101 Some(_) => Some(self.load_initramfs(&mem)?), 1102 None => None, 1103 }; 1104 1105 let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); 1106 let rsdp_addr = Some(rsdp_addr); 1107 let sgx_epc_region = self 1108 .memory_manager 1109 .lock() 1110 .unwrap() 1111 .sgx_epc_region() 1112 .as_ref() 1113 .cloned(); 1114 1115 arch::configure_system( 1116 &mem, 1117 arch::layout::CMDLINE_START, 1118 &initramfs_config, 1119 boot_vcpus, 1120 rsdp_addr, 1121 sgx_epc_region, 1122 ) 1123 .map_err(Error::ConfigureSystem)?; 1124 Ok(()) 1125 } 1126 1127 #[cfg(target_arch = "aarch64")] 1128 fn configure_system(&mut self, _rsdp_addr: GuestAddress) -> Result<()> { 1129 let cmdline = self.get_cmdline()?; 1130 let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs(); 1131 let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology(); 1132 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1133 let mut pci_space_info: Vec<PciSpaceInfo> = Vec::new(); 1134 let initramfs_config = match self.initramfs { 1135 Some(_) => Some(self.load_initramfs(&mem)?), 1136 None => None, 1137 }; 1138 1139 let device_info = &self 1140 .device_manager 1141 .lock() 1142 
            .unwrap()
            .get_device_info()
            .clone();

        // Describe every PCI segment (ECAM base and device MMIO window) for
        // the FDT.
        for pci_segment in self.device_manager.lock().unwrap().pci_segments().iter() {
            let pci_space = PciSpaceInfo {
                pci_segment_id: pci_segment.id,
                mmio_config_address: pci_segment.mmio_config_address,
                pci_device_space_start: pci_segment.start_of_device_area,
                pci_device_space_size: pci_segment.end_of_device_area
                    - pci_segment.start_of_device_area
                    + 1,
            };
            pci_space_info.push(pci_space);
        }

        let virtio_iommu_bdf = self
            .device_manager
            .lock()
            .unwrap()
            .iommu_attached_devices()
            .as_ref()
            .map(|(v, _)| *v);

        let gic_device = create_gic(
            &self.memory_manager.lock().as_ref().unwrap().vm,
            self.cpu_manager.lock().unwrap().boot_vcpus() as u64,
        )
        .map_err(|e| {
            Error::ConfigureSystem(arch::Error::AArch64Setup(arch::aarch64::Error::SetupGic(e)))
        })?;

        // PMU interrupt sticks to PPI, so it needs to be offset by 16 to get
        // the real irq number.
        let pmu_supported = self
            .cpu_manager
            .lock()
            .unwrap()
            .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
            .map_err(|_| {
                Error::ConfigureSystem(arch::Error::AArch64Setup(arch::aarch64::Error::VcpuInitPmu))
            })?;

        arch::configure_system(
            &mem,
            cmdline.as_str(),
            vcpu_mpidrs,
            vcpu_topology,
            device_info,
            &initramfs_config,
            &pci_space_info,
            virtio_iommu_bdf.map(|bdf| bdf.into()),
            &*gic_device,
            &self.numa_nodes,
            pmu_supported,
        )
        .map_err(Error::ConfigureSystem)?;

        // Update the GIC entity in device manager
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gic_device(Arc::new(Mutex::new(gic_device)));

        // Activate gic device
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .enable()
            .map_err(Error::EnableInterruptController)?;

        Ok(())
    }

    /// Returns the serial device PTY pair, if a serial PTY was created.
    pub fn serial_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().serial_pty()
    }

    /// Returns the console device PTY pair, if a console PTY was created.
    pub fn console_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().console_pty()
    }

    /// Returns the pipe used to signal console resize events, if any.
    pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
        self.device_manager.lock().unwrap().console_resize_pipe()
    }

    /// Shut the VM down: restore the terminal, stop the signal-handler
    /// thread, resume devices so their threads can exit, stop the vCPUs and
    /// join all worker threads before moving to the `Shutdown` state.
    pub fn shutdown(&mut self) -> Result<()> {
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        let new_state = VmState::Shutdown;

        state.valid_transition(new_state)?;

        if self.on_tty {
            // Restore the terminal to canonical mode before exiting.
            io::stdin()
                .lock()
                .set_canon_mode()
                .map_err(Error::SetTerminalCanon)?;
        }

        // Trigger the termination of the signal_handler thread
        if let Some(signals) = self.signals.take() {
            signals.close();
        }

        // Wake up the DeviceManager threads so they will get terminated cleanly
        self.device_manager
            .lock()
            .unwrap()
            .resume()
            .map_err(Error::Resume)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .shutdown()
            .map_err(Error::CpuManager)?;

        // Wait for all the threads to finish
        for thread in self.threads.drain(..) {
            thread.join().map_err(Error::ThreadCleanup)?
        }
        *state = new_state;

        event!("vm", "shutdown");

        Ok(())
    }

    /// Hot-resize the VM: any combination of vCPU count, RAM size and balloon
    /// size may be requested (each `None` means "leave unchanged"). The
    /// persisted `VmConfig` is updated so a reboot keeps the new values.
    pub fn resize(
        &mut self,
        desired_vcpus: Option<u8>,
        desired_memory: Option<u64>,
        desired_balloon: Option<u64>,
    ) -> Result<()> {
        event!("vm", "resizing");

        if let Some(desired_vcpus) = desired_vcpus {
            // `resize` reports whether the vCPU topology actually changed;
            // only then is an ACPI notification needed.
            if self
                .cpu_manager
                .lock()
                .unwrap()
                .resize(desired_vcpus)
                .map_err(Error::CpuManager)?
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
        }

        if let Some(desired_memory) = desired_memory {
            let new_region = self
                .memory_manager
                .lock()
                .unwrap()
                .resize(desired_memory)
                .map_err(Error::MemoryManager)?;

            let mut memory_config = &mut self.config.lock().unwrap().memory;

            // A new region is only returned when memory was actually added.
            if let Some(new_region) = &new_region {
                self.device_manager
                    .lock()
                    .unwrap()
                    .update_memory(new_region)
                    .map_err(Error::DeviceManager)?;

                match memory_config.hotplug_method {
                    HotplugMethod::Acpi => {
                        self.device_manager
                            .lock()
                            .unwrap()
                            .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                            .map_err(Error::DeviceManager)?;
                    }
                    // virtio-mem resizing needs no ACPI notification.
                    HotplugMethod::VirtioMem => {}
                }
            }

            // We update the VM config regardless of the actual guest resize
            // operation result (happened or not), so that if the VM reboots
            // it will be running with the last configured memory size.
            match memory_config.hotplug_method {
                HotplugMethod::Acpi => memory_config.size = desired_memory,
                HotplugMethod::VirtioMem => {
                    if desired_memory > memory_config.size {
                        memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                    } else {
                        memory_config.hotplugged_size = None;
                    }
                }
            }
        }

        if let Some(desired_balloon) = desired_balloon {
            self.device_manager
                .lock()
                .unwrap()
                .resize_balloon(desired_balloon)
                .map_err(Error::DeviceManager)?;

            // Update the configuration value for the balloon size to ensure
            // a reboot would use the right value.
            if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
                balloon_config.size = desired_balloon;
            }
        }

        event!("vm", "resized");

        Ok(())
    }

    /// Resize the memory zone identified by `id` to `desired_memory`. Only
    /// growing (or keeping) the zone relative to its boot size is allowed;
    /// the delta is recorded as `hotplugged_size` so it survives a reboot.
    pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
        let memory_config = &mut self.config.lock().unwrap().memory;

        if let Some(zones) = &mut memory_config.zones {
            for zone in zones.iter_mut() {
                if zone.id == id {
                    if desired_memory >= zone.size {
                        let hotplugged_size = desired_memory - zone.size;
                        self.memory_manager
                            .lock()
                            .unwrap()
                            .resize_zone(&id, desired_memory - zone.size)
                            .map_err(Error::MemoryManager)?;
                        // We update the memory zone config regardless of the
                        // actual 'resize-zone' operation result (happened or
                        // not), so that if the VM reboots it will be running
                        // with the last configured memory zone size.
                        zone.hotplugged_size = Some(hotplugged_size);

                        return Ok(());
                    } else {
                        // Shrinking below the boot size is not supported.
                        error!(
                            "Invalid to ask less ({}) than boot RAM ({}) for \
                            this memory zone",
                            desired_memory, zone.size,
                        );
                        return Err(Error::ResizeZone);
                    }
                }
            }
        }

        error!("Could not find the memory zone {} for the resize", id);
        Err(Error::ResizeZone)
    }

    /// Hotplug a VFIO device, persist it in the config and notify the guest.
    pub fn add_device(&mut self, mut device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_device(&mut device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplug a VFIO-user device, persist it in the config and notify the
    /// guest.
    pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_user_device(&mut device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.user_devices, device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hot-unplug the device identified by `id`, remove it from every device
    /// list in the persisted config (the id is unique across device types)
    /// and notify the guest.
    pub fn remove_device(&mut self, id: String) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .remove_device(id.clone())
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by removing the device. This is important to
        // ensure the device would not be created in case of a reboot.
        let mut config = self.config.lock().unwrap();

        // Remove if VFIO device
        if let Some(devices) = config.devices.as_mut() {
            devices.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if VFIO user device
        if let Some(user_devices) = config.user_devices.as_mut() {
            user_devices.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if disk device
        if let Some(disks) = config.disks.as_mut() {
            disks.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if net device
        if let Some(net) = config.net.as_mut() {
            net.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if pmem device
        if let Some(pmem) = config.pmem.as_mut() {
            pmem.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if vDPA device
        if let Some(vdpa) = config.vdpa.as_mut() {
            vdpa.retain(|dev| dev.id.as_ref() != Some(&id));
        }

        // Remove if vsock device
        if let Some(vsock) = config.vsock.as_ref() {
            if vsock.id.as_ref() == Some(&id) {
                config.vsock = None;
            }
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;
        Ok(())
    }

    /// Hotplug a disk, persist it in the config and notify the guest.
    pub fn add_disk(&mut self, mut disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_disk(&mut disk_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.disks, disk_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplug a virtio-fs filesystem, persist it in the config and notify
    /// the guest.
    pub fn add_fs(&mut self, mut fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_fs(&mut fs_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.fs, fs_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplug a pmem device, persist it in the config and notify the guest.
    pub fn add_pmem(&mut self, mut pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_pmem(&mut pmem_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.pmem, pmem_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplug a network device, persist it in the config and notify the
    /// guest.
    pub fn add_net(&mut self, mut net_cfg: NetConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_net(&mut net_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.net, net_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplug a vDPA device, persist it in the config and notify the guest.
    pub fn add_vdpa(&mut self, mut vdpa_cfg: VdpaConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vdpa(&mut vdpa_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            add_to_config(&mut config.vdpa, vdpa_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Hotplug the (single) vsock device, persist it in the config and notify
    /// the guest.
    pub fn add_vsock(&mut self, mut vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vsock(&mut vsock_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            // Only one vsock device is supported, so replace rather than append.
            config.vsock = Some(vsock_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    /// Collect per-device counters from the device manager, keyed by device
    /// id then counter name.
    pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
        Ok(self.device_manager.lock().unwrap().counters())
    }

    /// Signal-handler thread body: reacts to SIGWINCH (console resize) and
    /// SIGTERM/SIGINT (restore terminal and request VMM exit via `exit_evt`).
    fn os_signal_handler(
        mut signals: Signals,
        console_input_clone: Arc<Console>,
        on_tty: bool,
        exit_evt: &EventFd,
    ) {
        // Signals are blocked process-wide by default; unblock the ones this
        // thread is responsible for.
        for sig in &HANDLED_SIGNALS {
            unblock_signal(*sig).unwrap();
        }

        for signal in signals.forever() {
            match signal {
                SIGWINCH => {
                    console_input_clone.update_console_size();
                }
                SIGTERM | SIGINT => {
                    if on_tty {
                        io::stdin()
                            .lock()
                            .set_canon_mode()
                            .expect("failed to restore terminal mode");
                    }
                    // If the exit event cannot be signalled, exit hard.
                    if exit_evt.write(1).is_err() {
                        std::process::exit(1);
                    }
                }
                _ => (),
            }
        }
    }

    #[cfg(feature = "tdx")]
    /// Perform the initial TDX VM configuration (CPUID and max vCPU count).
    /// Must run before the vCPUs are created.
    fn init_tdx(&mut self) -> Result<()> {
        let cpuid = self.cpu_manager.lock().unwrap().common_cpuid();
        let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32;
        self.vm
            .tdx_init(&cpuid, max_vcpus)
            .map_err(Error::InitializeTdxVm)?;
        Ok(())
    }

    #[cfg(feature = "tdx")]
    /// Parse the TDVF firmware file and return its section table.
    fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> {
        use arch::x86_64::tdx::*;
        // The TDVF file contains a table of section as well as code
        let mut firmware_file =
            File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
                .map_err(Error::LoadTdvf)?;

        // For all the sections allocate some RAM backing them
        parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
    }

    #[cfg(feature = "tdx")]
    /// Copy the TDVF sections (firmware, payload, cmdline) into guest memory
    /// and build the TD HOB describing memory, MMIO and ACPI resources.
    /// Returns the guest address of the HOB, if the TDVF declared one.
    fn populate_tdx_sections(&mut self, sections: &[TdvfSection]) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of section as well as code
        let mut firmware_file =
            File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
                .map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut payload_info = None;
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    mem.read_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    // Remember where the HOB must be written; filled in below.
                    hob_offset = Some(section.address);
                }
                TdvfSectionType::Payload => {
                    info!("Copying payload to guest memory");
                    if let Some(payload_file) = self.kernel.as_mut() {
                        let payload_size = payload_file
                            .seek(SeekFrom::End(0))
                            .map_err(Error::LoadPayload)?;

                        // 0x1f1 is the offset of the bzImage setup_header
                        // within the kernel image.
                        payload_file
                            .seek(SeekFrom::Start(0x1f1))
                            .map_err(Error::LoadPayload)?;

                        let mut payload_header = linux_loader::bootparam::setup_header::default();
                        payload_header
                            .as_bytes()
                            .read_from(
                                0,
                                payload_file,
                                mem::size_of::<linux_loader::bootparam::setup_header>(),
                            )
                            .unwrap();

                        // "HdrS" magic: the payload must be a bzImage.
                        if payload_header.header != 0x5372_6448 {
                            return Err(Error::InvalidPayloadType);
                        }

                        // Require boot protocol >= 2.00 and a relocatable
                        // (LOADED_HIGH) kernel.
                        if (payload_header.version < 0x0200)
                            || ((payload_header.loadflags & 0x1) == 0x0)
                        {
                            return Err(Error::InvalidPayloadType);
                        }

                        payload_file
                            .seek(SeekFrom::Start(0))
                            .map_err(Error::LoadPayload)?;
                        mem.read_from(
                            GuestAddress(section.address),
                            payload_file,
                            payload_size as usize,
                        )
                        .unwrap();

                        // Create the payload info that will be inserted into
                        // the HOB.
                        payload_info = Some(PayloadInfo {
                            image_type: PayloadImageType::BzImage,
                            entry_point: section.address,
                        });
                    }
                }
                TdvfSectionType::PayloadParam => {
                    info!("Copying payload parameters to guest memory");
                    let cmdline = self.get_cmdline()?;
                    mem.write_slice(cmdline.as_str().as_bytes(), GuestAddress(section.address))
                        .unwrap();
                }
                _ => {}
            }
        }

        // Generate HOB
        let mut hob = TdHob::start(hob_offset.unwrap());

        // Keep only the TempMem sections; they must be reported as reserved
        // (non-RAM) resources interleaved with the RAM ranges.
        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| matches!(section.r#type, TdvfSectionType::TempMem));

        // Sort descending then pop from the back, so sections are consumed in
        // increasing address order.
        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();
        let mut current_section = sorted_sections.pop();

        // RAM regions interleaved with TDVF sections
        let mut next_start_addr = 0;
        for region in boot_guest_memory.iter() {
            let region_start = region.start_addr().0;
            let region_end = region.last_addr().0;
            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            loop {
                // Emit either the pending TDVF section (non-RAM) or the RAM
                // gap up to it, whichever comes first.
                let (start, size, ram) = if let Some(section) = &current_section {
                    if section.address <= next_start_addr {
                        (section.address, section.size, false)
                    } else {
                        let last_addr = std::cmp::min(section.address - 1, region_end);
                        (next_start_addr, last_addr - next_start_addr + 1, true)
                    }
                } else {
                    (next_start_addr, region_end - next_start_addr + 1, true)
                };

                hob.add_memory_resource(&mem, start, size, ram)
                    .map_err(Error::PopulateHob)?;

                if !ram {
                    current_section = sorted_sections.pop();
                }

                next_start_addr = start + size;

                if region_start > next_start_addr {
                    next_start_addr = region_start;
                }

                if next_start_addr > region_end {
                    break;
                }
            }
        }

        // MMIO regions
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        // Loop over the ACPI tables and copy them to the HOB.

        for acpi_table in crate::acpi::create_acpi_tables_tdx(
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        ) {
            hob.add_acpi_table(&mem, acpi_table.as_slice())
                .map_err(Error::PopulateHob)?;
        }

        // If a payload info has been created, let's insert it into the HOB.
        if let Some(payload_info) = payload_info {
            hob.add_payload(&mem, payload_info)
                .map_err(Error::PopulateHob)?;
        }

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }

    #[cfg(feature = "tdx")]
    /// Declare every TDVF section's memory range to the hypervisor so the
    /// guest will not ignore/discard host-provided regions.
    fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();

        for section in sections {
            self.vm
                .tdx_init_memory_region(
                    mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
                    section.address,
                    section.size,
                    /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
                    section.attributes == 1,
                )
                .map_err(Error::InitializeTdxMemoryRegion)?;
        }

        Ok(())
    }

    /// Spawn the signal-handler thread (with its own seccomp filter) that
    /// runs `Vm::os_signal_handler`, and keep its handle so `shutdown` can
    /// terminate it.
    fn setup_signal_handler(&mut self) -> Result<()> {
        let console = self.device_manager.lock().unwrap().console().clone();
        let signals = Signals::new(&HANDLED_SIGNALS);
        match signals {
            Ok(signals) => {
                self.signals = Some(signals.handle());
                let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?;
                let on_tty = self.on_tty;
                let signal_handler_seccomp_filter =
                    get_seccomp_filter(&self.seccomp_action, Thread::SignalHandler)
                        .map_err(Error::CreateSeccompFilter)?;
                self.threads.push(
                    thread::Builder::new()
                        .name("signal_handler".to_string())
                        .spawn(move || {
                            // Apply the seccomp filter inside the new thread
                            // before doing any real work.
                            if !signal_handler_seccomp_filter.is_empty() {
                                if let Err(e) = apply_filter(&signal_handler_seccomp_filter)
                                    .map_err(Error::ApplySeccompFilter)
                                {
                                    error!("Error applying seccomp filter: {:?}", e);
                                    exit_evt.write(1).ok();
                                    return;
                                }
                            }
                            // Catch panics so a handler crash still signals
                            // the VMM to exit instead of silently dying.
                            std::panic::catch_unwind(AssertUnwindSafe(|| {
                                Vm::os_signal_handler(signals, console, on_tty, &exit_evt);
                            }))
                            .map_err(|_| {
                                error!("signal_handler thead panicked");
                                exit_evt.write(1).ok()
                            })
                            .ok();
                        })
                        .map_err(Error::SignalHandlerSpawn)?,
                );
            }
            Err(e) => error!("Signal not found {}", e),
        }
        Ok(())
    }

    /// Put the controlling terminal into raw mode when running on a tty so
    /// guest console input is passed through unmodified.
    fn setup_tty(&self) -> Result<()> {
        if self.on_tty {
            io::stdin()
                .lock()
                .set_raw_mode()
                .map_err(Error::SetTerminalRaw)?;
        }

        Ok(())
    }

    // Creates ACPI tables
    // In case of TDX being used, this is a no-op since the tables will be
    // created and passed when populating the HOB.

    fn create_acpi_tables(&self) -> Option<GuestAddress> {
        #[cfg(feature = "tdx")]
        if self.config.lock().unwrap().tdx.is_some() {
            return None;
        }

        let mem = self.memory_manager.lock().unwrap().guest_memory().memory();

        let rsdp_addr = crate::acpi::create_acpi_tables(
            &mem,
            &self.device_manager,
            &self.cpu_manager,
            &self.memory_manager,
            &self.numa_nodes,
        );
        info!("Created ACPI tables: rsdp_addr = 0x{:x}", rsdp_addr.0);

        Some(rsdp_addr)
    }

    /// Load the kernel if one is configured and return its entry point.
    /// Returns `None` when no kernel is configured or when TDX is enabled
    /// (TDX loads the payload through the TDVF sections instead).
    fn entry_point(&mut self) -> Result<Option<EntryPoint>> {
        Ok(if self.kernel.as_ref().is_some() {
            #[cfg(feature = "tdx")]
            if self.config.lock().unwrap().tdx.is_some() {
                return Ok(None);
            }
            Some(self.load_kernel()?)
        } else {
            None
        })
    }

    /// Boot the VM: load the kernel, create and configure the vCPUs, build
    /// ACPI tables, perform the TDX setup when enabled, then start the vCPUs
    /// (unless a debugger requested to stop on boot) and install the signal
    /// handler and tty mode. A paused VM is resumed instead.
    pub fn boot(&mut self) -> Result<()> {
        info!("Booting VM");
        event!("vm", "booting");
        let current_state = self.get_state()?;
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        // With gdb support the VM may be held at a breakpoint on boot.
        let new_state = if self.stop_on_boot {
            VmState::BreakPoint
        } else {
            VmState::Running
        };
        current_state.valid_transition(new_state)?;

        // Load kernel if configured
        let entry_point = self.entry_point()?;

        // The initial TDX configuration must be done before the vCPUs are
        // created
        #[cfg(feature = "tdx")]
        if self.config.lock().unwrap().tdx.is_some() {
            self.init_tdx()?;
        }

        // Create and configure vcpus
        self.cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(entry_point)
            .map_err(Error::CpuManager)?;

        #[cfg(feature = "tdx")]
        let sections = if self.config.lock().unwrap().tdx.is_some() {
            self.extract_tdvf_sections()?
        } else {
            Vec::new()
        };

        let rsdp_addr = self.create_acpi_tables();

        // Configuring the TDX regions requires that the vCPUs are created.
        #[cfg(feature = "tdx")]
        let hob_address = if self.config.lock().unwrap().tdx.is_some() {
            // TDX sections are written to memory.
            self.populate_tdx_sections(&sections)?
        } else {
            None
        };

        // Configure shared state based on loaded kernel
        entry_point
            .map(|_| {
                // Safe to unwrap rsdp_addr as we know it can't be None when
                // the entry_point is Some.
                self.configure_system(rsdp_addr.unwrap())
            })
            .transpose()?;

        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            // Let the hypervisor know which memory ranges are shared with the
            // guest. This prevents the guest from ignoring/discarding memory
            // regions provided by the host.
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured TDX setup is complete
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        if new_state == VmState::Running {
            self.cpu_manager
                .lock()
                .unwrap()
                .start_boot_vcpus()
                .map_err(Error::CpuManager)?;
        }

        self.setup_signal_handler()?;
        self.setup_tty()?;

        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        event!("vm", "booted");
        Ok(())
    }

    /// Gets a thread-safe reference counted pointer to the VM configuration.
    pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
        Arc::clone(&self.config)
    }

    /// Get the VM state. Returns an error if the state is poisoned.
    pub fn get_state(&self) -> Result<VmState> {
        self.state
            .try_read()
            .map_err(|_| Error::PoisonedState)
            .map(|state| *state)
    }

    /// Load saved clock from snapshot
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub fn load_clock_from_snapshot(
        &mut self,
        snapshot: &Snapshot,
    ) -> Result<Option<hypervisor::ClockData>> {
        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        self.saved_clock = vm_snapshot.clock;
        Ok(self.saved_clock)
    }

    #[cfg(target_arch = "aarch64")]
    /// Add the vGIC section to the VM snapshot.
    fn add_vgic_snapshot_section(
        &self,
        vm_snapshot: &mut Snapshot,
    ) -> std::result::Result<(), MigratableError> {
        let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
        let gic_device = Arc::clone(
            self.device_manager
                .lock()
                .unwrap()
                .get_interrupt_controller()
                .unwrap()
                .lock()
                .unwrap()
                .get_gic_device()
                .unwrap(),
        );

        // The GICR_TYPER registers depend on the saved vCPU states and must
        // be set before the GIC state is snapshotted.
        gic_device
            .lock()
            .unwrap()
            .set_gicr_typers(&saved_vcpu_states);

        vm_snapshot.add_snapshot(
            // Only the KVM GICv3-ITS implementation is snapshottable.
            if let Some(gicv3_its) = gic_device
                .lock()
                .unwrap()
                .as_any_concrete_mut()
                .downcast_mut::<KvmGicV3Its>()
            {
                gicv3_its.snapshot()?
            } else {
                return Err(MigratableError::Snapshot(anyhow!(
                    "GicDevice downcast to KvmGicV3Its failed when snapshotting VM!"
                )));
            },
        );

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing.
    fn restore_vgic_and_enable_interrupt(
        &self,
        vm_snapshot: &Snapshot,
    ) -> std::result::Result<(), MigratableError> {
        let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
        // The number of vCPUs is the same as the number of saved vCPU states.
        let vcpu_numbers = saved_vcpu_states.len();

        // Creating a GIC device here, as the GIC will not be created when
        // restoring the device manager. Note that currently only the bare GICv3
        // without ITS is supported.
        let mut gic_device = create_gic(&self.vm, vcpu_numbers.try_into().unwrap())
            .map_err(|e| MigratableError::Restore(anyhow!("Could not create GIC: {:#?}", e)))?;

        // PMU interrupt sticks to PPI, so it needs to be offset by 16 to get
        // the real irq number.
        self.cpu_manager
            .lock()
            .unwrap()
            .init_pmu(arch::aarch64::fdt::AARCH64_PMU_IRQ + 16)
            .map_err(|e| MigratableError::Restore(anyhow!("Error init PMU: {:?}", e)))?;

        // Here we prepare the GICR_TYPER registers from the restored vCPU states.
        gic_device.set_gicr_typers(&saved_vcpu_states);

        let gic_device = Arc::new(Mutex::new(gic_device));
        // Update the GIC entity in device manager
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gic_device(Arc::clone(&gic_device));

        // Restore GIC states.
        if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) {
            if let Some(gicv3_its) = gic_device
                .lock()
                .unwrap()
                .as_any_concrete_mut()
                .downcast_mut::<KvmGicV3Its>()
            {
                gicv3_its.restore(*gicv3_its_snapshot.clone())?;
            } else {
                return Err(MigratableError::Restore(anyhow!(
                    "GicDevice downcast to KvmGicV3Its failed when restoring VM!"
                )));
            };
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing GicV3Its snapshot"
            )));
        }

        // Activate gic device
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .enable()
            .map_err(|e| {
                MigratableError::Restore(anyhow!(
                    "Could not enable interrupt controller routing: {:#?}",
                    e
                ))
            })?;

        Ok(())
    }

    /// Gets the actual size of the balloon.
2268 pub fn balloon_size(&self) -> u64 { 2269 self.device_manager.lock().unwrap().balloon_size() 2270 } 2271 2272 pub fn receive_memory_regions<F>( 2273 &mut self, 2274 ranges: &MemoryRangeTable, 2275 fd: &mut F, 2276 ) -> std::result::Result<(), MigratableError> 2277 where 2278 F: Read, 2279 { 2280 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2281 let mem = guest_memory.memory(); 2282 2283 for range in ranges.regions() { 2284 let mut offset: u64 = 0; 2285 // Here we are manually handling the retry in case we can't the 2286 // whole region at once because we can't use the implementation 2287 // from vm-memory::GuestMemory of read_exact_from() as it is not 2288 // following the correct behavior. For more info about this issue 2289 // see: https://github.com/rust-vmm/vm-memory/issues/174 2290 loop { 2291 let bytes_read = mem 2292 .read_from( 2293 GuestAddress(range.gpa + offset), 2294 fd, 2295 (range.length - offset) as usize, 2296 ) 2297 .map_err(|e| { 2298 MigratableError::MigrateReceive(anyhow!( 2299 "Error receiving memory from socket: {}", 2300 e 2301 )) 2302 })?; 2303 offset += bytes_read as u64; 2304 2305 if offset == range.length { 2306 break; 2307 } 2308 } 2309 } 2310 2311 Ok(()) 2312 } 2313 2314 pub fn send_memory_fds( 2315 &mut self, 2316 socket: &mut UnixStream, 2317 ) -> std::result::Result<(), MigratableError> { 2318 for (slot, fd) in self 2319 .memory_manager 2320 .lock() 2321 .unwrap() 2322 .memory_slot_fds() 2323 .drain() 2324 { 2325 Request::memory_fd(std::mem::size_of_val(&slot) as u64) 2326 .write_to(socket) 2327 .map_err(|e| { 2328 MigratableError::MigrateSend(anyhow!("Error sending memory fd request: {}", e)) 2329 })?; 2330 socket 2331 .send_with_fd(&slot.to_le_bytes()[..], fd) 2332 .map_err(|e| { 2333 MigratableError::MigrateSend(anyhow!("Error sending memory fd: {}", e)) 2334 })?; 2335 2336 let res = Response::read_from(socket)?; 2337 if res.status() != Status::Ok { 2338 warn!("Error during memory fd 
migration"); 2339 Request::abandon().write_to(socket)?; 2340 Response::read_from(socket).ok(); 2341 return Err(MigratableError::MigrateSend(anyhow!( 2342 "Error during memory fd migration" 2343 ))); 2344 } 2345 } 2346 2347 Ok(()) 2348 } 2349 2350 pub fn send_memory_regions<F>( 2351 &mut self, 2352 ranges: &MemoryRangeTable, 2353 fd: &mut F, 2354 ) -> std::result::Result<(), MigratableError> 2355 where 2356 F: Write, 2357 { 2358 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2359 let mem = guest_memory.memory(); 2360 2361 for range in ranges.regions() { 2362 let mut offset: u64 = 0; 2363 // Here we are manually handling the retry in case we can't the 2364 // whole region at once because we can't use the implementation 2365 // from vm-memory::GuestMemory of write_all_to() as it is not 2366 // following the correct behavior. For more info about this issue 2367 // see: https://github.com/rust-vmm/vm-memory/issues/174 2368 loop { 2369 let bytes_written = mem 2370 .write_to( 2371 GuestAddress(range.gpa + offset), 2372 fd, 2373 (range.length - offset) as usize, 2374 ) 2375 .map_err(|e| { 2376 MigratableError::MigrateSend(anyhow!( 2377 "Error transferring memory to socket: {}", 2378 e 2379 )) 2380 })?; 2381 offset += bytes_written as u64; 2382 2383 if offset == range.length { 2384 break; 2385 } 2386 } 2387 } 2388 2389 Ok(()) 2390 } 2391 2392 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2393 self.memory_manager 2394 .lock() 2395 .unwrap() 2396 .memory_range_table(false) 2397 } 2398 2399 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2400 self.device_manager.lock().unwrap().device_tree() 2401 } 2402 2403 pub fn activate_virtio_devices(&self) -> Result<()> { 2404 self.device_manager 2405 .lock() 2406 .unwrap() 2407 .activate_virtio_devices() 2408 .map_err(Error::ActivateVirtioDevices) 2409 } 2410 2411 #[cfg(target_arch = "x86_64")] 2412 pub fn power_button(&self) -> Result<()> { 2413 
return self 2414 .device_manager 2415 .lock() 2416 .unwrap() 2417 .notify_power_button() 2418 .map_err(Error::PowerButton); 2419 } 2420 2421 #[cfg(target_arch = "aarch64")] 2422 pub fn power_button(&self) -> Result<()> { 2423 self.device_manager 2424 .lock() 2425 .unwrap() 2426 .notify_power_button() 2427 .map_err(Error::PowerButton) 2428 } 2429 2430 pub fn memory_manager_data(&self) -> MemoryManagerSnapshotData { 2431 self.memory_manager.lock().unwrap().snapshot_data() 2432 } 2433 2434 #[cfg(all(target_arch = "x86_64", feature = "gdb"))] 2435 pub fn debug_request( 2436 &mut self, 2437 gdb_request: &GdbRequestPayload, 2438 cpu_id: usize, 2439 ) -> Result<GdbResponsePayload> { 2440 use GdbRequestPayload::*; 2441 match gdb_request { 2442 SetSingleStep(single_step) => { 2443 self.set_guest_debug(cpu_id, &[], *single_step) 2444 .map_err(Error::Debug)?; 2445 } 2446 SetHwBreakPoint(addrs) => { 2447 self.set_guest_debug(cpu_id, addrs, false) 2448 .map_err(Error::Debug)?; 2449 } 2450 Pause => { 2451 self.debug_pause().map_err(Error::Debug)?; 2452 } 2453 Resume => { 2454 self.debug_resume().map_err(Error::Debug)?; 2455 } 2456 ReadRegs => { 2457 let regs = self.read_regs(cpu_id).map_err(Error::Debug)?; 2458 return Ok(GdbResponsePayload::RegValues(Box::new(regs))); 2459 } 2460 WriteRegs(regs) => { 2461 self.write_regs(cpu_id, regs).map_err(Error::Debug)?; 2462 } 2463 ReadMem(vaddr, len) => { 2464 let mem = self.read_mem(cpu_id, *vaddr, *len).map_err(Error::Debug)?; 2465 return Ok(GdbResponsePayload::MemoryRegion(mem)); 2466 } 2467 WriteMem(vaddr, data) => { 2468 self.write_mem(cpu_id, vaddr, data).map_err(Error::Debug)?; 2469 } 2470 ActiveVcpus => { 2471 let active_vcpus = self.active_vcpus(); 2472 return Ok(GdbResponsePayload::ActiveVcpus(active_vcpus)); 2473 } 2474 } 2475 Ok(GdbResponsePayload::CommandComplete) 2476 } 2477 } 2478 2479 impl Pausable for Vm { 2480 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2481 event!("vm", "pausing"); 2482 let mut 
state = self 2483 .state 2484 .try_write() 2485 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?; 2486 let new_state = VmState::Paused; 2487 2488 state 2489 .valid_transition(new_state) 2490 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?; 2491 2492 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2493 { 2494 let mut clock = self 2495 .vm 2496 .get_clock() 2497 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?; 2498 // Reset clock flags. 2499 clock.flags = 0; 2500 self.saved_clock = Some(clock); 2501 } 2502 2503 // Before pausing the vCPUs activate any pending virtio devices that might 2504 // need activation between starting the pause (or e.g. a migration it's part of) 2505 self.activate_virtio_devices().map_err(|e| { 2506 MigratableError::Pause(anyhow!("Error activating pending virtio devices: {:?}", e)) 2507 })?; 2508 2509 self.cpu_manager.lock().unwrap().pause()?; 2510 self.device_manager.lock().unwrap().pause()?; 2511 2512 *state = new_state; 2513 2514 event!("vm", "paused"); 2515 Ok(()) 2516 } 2517 2518 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2519 event!("vm", "resuming"); 2520 let mut state = self 2521 .state 2522 .try_write() 2523 .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?; 2524 let new_state = VmState::Running; 2525 2526 state 2527 .valid_transition(new_state) 2528 .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?; 2529 2530 self.cpu_manager.lock().unwrap().resume()?; 2531 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2532 { 2533 if let Some(clock) = &self.saved_clock { 2534 self.vm.set_clock(clock).map_err(|e| { 2535 MigratableError::Resume(anyhow!("Could not set VM clock: {}", e)) 2536 })?; 2537 } 2538 } 2539 self.device_manager.lock().unwrap().resume()?; 2540 2541 // And we're back to the Running state. 
2542 *state = new_state; 2543 event!("vm", "resumed"); 2544 Ok(()) 2545 } 2546 } 2547 2548 #[derive(Serialize, Deserialize)] 2549 pub struct VmSnapshot { 2550 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2551 pub clock: Option<hypervisor::ClockData>, 2552 pub state: Option<hypervisor::VmState>, 2553 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2554 pub common_cpuid: hypervisor::CpuId, 2555 } 2556 2557 pub const VM_SNAPSHOT_ID: &str = "vm"; 2558 impl Snapshottable for Vm { 2559 fn id(&self) -> String { 2560 VM_SNAPSHOT_ID.to_string() 2561 } 2562 2563 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2564 event!("vm", "snapshotting"); 2565 2566 #[cfg(feature = "tdx")] 2567 { 2568 if self.config.lock().unwrap().tdx.is_some() { 2569 return Err(MigratableError::Snapshot(anyhow!( 2570 "Snapshot not possible with TDX VM" 2571 ))); 2572 } 2573 } 2574 2575 let current_state = self.get_state().unwrap(); 2576 if current_state != VmState::Paused { 2577 return Err(MigratableError::Snapshot(anyhow!( 2578 "Trying to snapshot while VM is running" 2579 ))); 2580 } 2581 2582 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2583 let common_cpuid = { 2584 #[cfg(feature = "tdx")] 2585 let tdx_enabled = self.config.lock().unwrap().tdx.is_some(); 2586 let phys_bits = physical_bits(self.config.lock().unwrap().cpus.max_phys_bits); 2587 arch::generate_common_cpuid( 2588 self.hypervisor.clone(), 2589 None, 2590 None, 2591 phys_bits, 2592 self.config.lock().unwrap().cpus.kvm_hyperv, 2593 #[cfg(feature = "tdx")] 2594 tdx_enabled, 2595 ) 2596 .map_err(|e| { 2597 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e)) 2598 })? 
2599 }; 2600 2601 let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID); 2602 let vm_state = self 2603 .vm 2604 .state() 2605 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2606 let vm_snapshot_data = serde_json::to_vec(&VmSnapshot { 2607 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2608 clock: self.saved_clock, 2609 state: Some(vm_state), 2610 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2611 common_cpuid, 2612 }) 2613 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2614 2615 vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?); 2616 vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?); 2617 2618 #[cfg(target_arch = "aarch64")] 2619 self.add_vgic_snapshot_section(&mut vm_snapshot) 2620 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2621 2622 vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?); 2623 vm_snapshot.add_data_section(SnapshotDataSection { 2624 id: format!("{}-section", VM_SNAPSHOT_ID), 2625 snapshot: vm_snapshot_data, 2626 }); 2627 2628 event!("vm", "snapshotted"); 2629 Ok(vm_snapshot) 2630 } 2631 2632 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 2633 event!("vm", "restoring"); 2634 2635 let current_state = self 2636 .get_state() 2637 .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?; 2638 let new_state = VmState::Paused; 2639 current_state.valid_transition(new_state).map_err(|e| { 2640 MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e)) 2641 })?; 2642 2643 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2644 self.load_clock_from_snapshot(&snapshot) 2645 .map_err(|e| MigratableError::Restore(anyhow!("Error restoring clock: {:?}", e)))?; 2646 2647 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2648 self.memory_manager 2649 .lock() 2650 .unwrap() 2651 .restore(*memory_manager_snapshot.clone())?; 2652 } else { 2653 return 
Err(MigratableError::Restore(anyhow!( 2654 "Missing memory manager snapshot" 2655 ))); 2656 } 2657 2658 if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) { 2659 self.cpu_manager 2660 .lock() 2661 .unwrap() 2662 .restore(*cpu_manager_snapshot.clone())?; 2663 } else { 2664 return Err(MigratableError::Restore(anyhow!( 2665 "Missing CPU manager snapshot" 2666 ))); 2667 } 2668 2669 if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) { 2670 self.device_manager 2671 .lock() 2672 .unwrap() 2673 .restore(*device_manager_snapshot.clone())?; 2674 } else { 2675 return Err(MigratableError::Restore(anyhow!( 2676 "Missing device manager snapshot" 2677 ))); 2678 } 2679 2680 #[cfg(target_arch = "aarch64")] 2681 self.restore_vgic_and_enable_interrupt(&snapshot)?; 2682 2683 if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) { 2684 self.device_manager 2685 .lock() 2686 .unwrap() 2687 .restore_devices(*device_manager_snapshot.clone())?; 2688 } else { 2689 return Err(MigratableError::Restore(anyhow!( 2690 "Missing device manager snapshot" 2691 ))); 2692 } 2693 2694 // Now we can start all vCPUs from here. 
2695 self.cpu_manager 2696 .lock() 2697 .unwrap() 2698 .start_restored_vcpus() 2699 .map_err(|e| { 2700 MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e)) 2701 })?; 2702 2703 self.setup_signal_handler().map_err(|e| { 2704 MigratableError::Restore(anyhow!("Could not setup signal handler: {:#?}", e)) 2705 })?; 2706 self.setup_tty() 2707 .map_err(|e| MigratableError::Restore(anyhow!("Could not setup tty: {:#?}", e)))?; 2708 2709 let mut state = self 2710 .state 2711 .try_write() 2712 .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?; 2713 *state = new_state; 2714 2715 event!("vm", "restored"); 2716 Ok(()) 2717 } 2718 } 2719 2720 impl Transportable for Vm { 2721 fn send( 2722 &self, 2723 snapshot: &Snapshot, 2724 destination_url: &str, 2725 ) -> std::result::Result<(), MigratableError> { 2726 let mut snapshot_config_path = url_to_path(destination_url)?; 2727 snapshot_config_path.push(SNAPSHOT_CONFIG_FILE); 2728 2729 // Create the snapshot config file 2730 let mut snapshot_config_file = OpenOptions::new() 2731 .read(true) 2732 .write(true) 2733 .create_new(true) 2734 .open(snapshot_config_path) 2735 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2736 2737 // Serialize and write the snapshot config 2738 let vm_config = serde_json::to_string(self.config.lock().unwrap().deref()) 2739 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2740 2741 snapshot_config_file 2742 .write(vm_config.as_bytes()) 2743 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2744 2745 let mut snapshot_state_path = url_to_path(destination_url)?; 2746 snapshot_state_path.push(SNAPSHOT_STATE_FILE); 2747 2748 // Create the snapshot state file 2749 let mut snapshot_state_file = OpenOptions::new() 2750 .read(true) 2751 .write(true) 2752 .create_new(true) 2753 .open(snapshot_state_path) 2754 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2755 2756 // Serialize and write the snapshot state 2757 let vm_state = 2758 
serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2759 2760 snapshot_state_file 2761 .write(&vm_state) 2762 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2763 2764 // Tell the memory manager to also send/write its own snapshot. 2765 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2766 self.memory_manager 2767 .lock() 2768 .unwrap() 2769 .send(&*memory_manager_snapshot.clone(), destination_url)?; 2770 } else { 2771 return Err(MigratableError::Restore(anyhow!( 2772 "Missing memory manager snapshot" 2773 ))); 2774 } 2775 2776 Ok(()) 2777 } 2778 } 2779 2780 impl Migratable for Vm { 2781 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2782 self.memory_manager.lock().unwrap().start_dirty_log()?; 2783 self.device_manager.lock().unwrap().start_dirty_log() 2784 } 2785 2786 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2787 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2788 self.device_manager.lock().unwrap().stop_dirty_log() 2789 } 2790 2791 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2792 Ok(MemoryRangeTable::new_from_tables(vec![ 2793 self.memory_manager.lock().unwrap().dirty_log()?, 2794 self.device_manager.lock().unwrap().dirty_log()?, 2795 ])) 2796 } 2797 2798 fn start_migration(&mut self) -> std::result::Result<(), MigratableError> { 2799 self.memory_manager.lock().unwrap().start_migration()?; 2800 self.device_manager.lock().unwrap().start_migration() 2801 } 2802 2803 fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { 2804 self.memory_manager.lock().unwrap().complete_migration()?; 2805 self.device_manager.lock().unwrap().complete_migration() 2806 } 2807 } 2808 2809 #[cfg(feature = "gdb")] 2810 impl Debuggable for Vm { 2811 fn set_guest_debug( 2812 &self, 2813 cpu_id: usize, 2814 addrs: &[GuestAddress], 2815 singlestep: bool, 2816 ) -> 
std::result::Result<(), DebuggableError> { 2817 self.cpu_manager 2818 .lock() 2819 .unwrap() 2820 .set_guest_debug(cpu_id, addrs, singlestep) 2821 } 2822 2823 fn debug_pause(&mut self) -> std::result::Result<(), DebuggableError> { 2824 if !self.cpu_manager.lock().unwrap().vcpus_paused() { 2825 self.pause().map_err(DebuggableError::Pause)?; 2826 } 2827 let mut state = self 2828 .state 2829 .try_write() 2830 .map_err(|_| DebuggableError::PoisonedState)?; 2831 *state = VmState::BreakPoint; 2832 Ok(()) 2833 } 2834 2835 fn debug_resume(&mut self) -> std::result::Result<(), DebuggableError> { 2836 if !self.cpu_manager.lock().unwrap().vcpus_paused() { 2837 self.cpu_manager 2838 .lock() 2839 .unwrap() 2840 .start_boot_vcpus() 2841 .map_err(|e| { 2842 DebuggableError::Resume(MigratableError::Resume(anyhow!( 2843 "Could not start boot vCPUs: {:?}", 2844 e 2845 ))) 2846 })?; 2847 } else { 2848 self.resume().map_err(DebuggableError::Resume)?; 2849 } 2850 let mut state = self 2851 .state 2852 .try_write() 2853 .map_err(|_| DebuggableError::PoisonedState)?; 2854 *state = VmState::Running; 2855 Ok(()) 2856 } 2857 2858 fn read_regs(&self, cpu_id: usize) -> std::result::Result<X86_64CoreRegs, DebuggableError> { 2859 self.cpu_manager.lock().unwrap().read_regs(cpu_id) 2860 } 2861 2862 fn write_regs( 2863 &self, 2864 cpu_id: usize, 2865 regs: &X86_64CoreRegs, 2866 ) -> std::result::Result<(), DebuggableError> { 2867 self.cpu_manager.lock().unwrap().write_regs(cpu_id, regs) 2868 } 2869 2870 fn read_mem( 2871 &self, 2872 cpu_id: usize, 2873 vaddr: GuestAddress, 2874 len: usize, 2875 ) -> std::result::Result<Vec<u8>, DebuggableError> { 2876 self.cpu_manager 2877 .lock() 2878 .unwrap() 2879 .read_mem(cpu_id, vaddr, len) 2880 } 2881 2882 fn write_mem( 2883 &self, 2884 cpu_id: usize, 2885 vaddr: &GuestAddress, 2886 data: &[u8], 2887 ) -> std::result::Result<(), DebuggableError> { 2888 self.cpu_manager 2889 .lock() 2890 .unwrap() 2891 .write_mem(cpu_id, vaddr, data) 2892 } 2893 2894 fn 
active_vcpus(&self) -> usize { 2895 let active_vcpus = self.cpu_manager.lock().unwrap().active_vcpus(); 2896 if active_vcpus > 0 { 2897 active_vcpus 2898 } else { 2899 // The VM is not booted yet. Report boot_vcpus() instead. 2900 self.cpu_manager.lock().unwrap().boot_vcpus() as usize 2901 } 2902 } 2903 } 2904 2905 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2906 #[cfg(test)] 2907 mod tests { 2908 use super::*; 2909 2910 fn test_vm_state_transitions(state: VmState) { 2911 match state { 2912 VmState::Created => { 2913 // Check the transitions from Created 2914 assert!(state.valid_transition(VmState::Created).is_err()); 2915 assert!(state.valid_transition(VmState::Running).is_ok()); 2916 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2917 assert!(state.valid_transition(VmState::Paused).is_ok()); 2918 assert!(state.valid_transition(VmState::BreakPoint).is_ok()); 2919 } 2920 VmState::Running => { 2921 // Check the transitions from Running 2922 assert!(state.valid_transition(VmState::Created).is_err()); 2923 assert!(state.valid_transition(VmState::Running).is_err()); 2924 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2925 assert!(state.valid_transition(VmState::Paused).is_ok()); 2926 assert!(state.valid_transition(VmState::BreakPoint).is_ok()); 2927 } 2928 VmState::Shutdown => { 2929 // Check the transitions from Shutdown 2930 assert!(state.valid_transition(VmState::Created).is_err()); 2931 assert!(state.valid_transition(VmState::Running).is_ok()); 2932 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2933 assert!(state.valid_transition(VmState::Paused).is_err()); 2934 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2935 } 2936 VmState::Paused => { 2937 // Check the transitions from Paused 2938 assert!(state.valid_transition(VmState::Created).is_err()); 2939 assert!(state.valid_transition(VmState::Running).is_ok()); 2940 assert!(state.valid_transition(VmState::Shutdown).is_ok()); 2941 
assert!(state.valid_transition(VmState::Paused).is_err()); 2942 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2943 } 2944 VmState::BreakPoint => { 2945 // Check the transitions from Breakpoint 2946 assert!(state.valid_transition(VmState::Created).is_ok()); 2947 assert!(state.valid_transition(VmState::Running).is_ok()); 2948 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2949 assert!(state.valid_transition(VmState::Paused).is_err()); 2950 assert!(state.valid_transition(VmState::BreakPoint).is_err()); 2951 } 2952 } 2953 } 2954 2955 #[test] 2956 fn test_vm_created_transitions() { 2957 test_vm_state_transitions(VmState::Created); 2958 } 2959 2960 #[test] 2961 fn test_vm_running_transitions() { 2962 test_vm_state_transitions(VmState::Running); 2963 } 2964 2965 #[test] 2966 fn test_vm_shutdown_transitions() { 2967 test_vm_state_transitions(VmState::Shutdown); 2968 } 2969 2970 #[test] 2971 fn test_vm_paused_transitions() { 2972 test_vm_state_transitions(VmState::Paused); 2973 } 2974 } 2975 2976 #[cfg(target_arch = "aarch64")] 2977 #[cfg(test)] 2978 mod tests { 2979 use super::*; 2980 use crate::GuestMemoryMmap; 2981 use arch::aarch64::fdt::create_fdt; 2982 use arch::aarch64::gic::kvm::create_gic; 2983 use arch::aarch64::layout; 2984 use arch::{DeviceType, MmioDeviceInfo}; 2985 2986 const LEN: u64 = 4096; 2987 2988 #[test] 2989 fn test_create_fdt_with_devices() { 2990 let regions = vec![(layout::RAM_START, (layout::FDT_MAX_SIZE + 0x1000) as usize)]; 2991 let mem = GuestMemoryMmap::from_ranges(®ions).expect("Cannot initialize memory"); 2992 2993 let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [ 2994 ( 2995 (DeviceType::Serial, DeviceType::Serial.to_string()), 2996 MmioDeviceInfo { 2997 addr: 0x00, 2998 len: LEN, 2999 irq: 33, 3000 }, 3001 ), 3002 ( 3003 (DeviceType::Virtio(1), "virtio".to_string()), 3004 MmioDeviceInfo { 3005 addr: LEN, 3006 len: LEN, 3007 irq: 34, 3008 }, 3009 ), 3010 ( 3011 (DeviceType::Rtc, 
"rtc".to_string()), 3012 MmioDeviceInfo { 3013 addr: 2 * LEN, 3014 len: LEN, 3015 irq: 35, 3016 }, 3017 ), 3018 ] 3019 .iter() 3020 .cloned() 3021 .collect(); 3022 3023 let hv = hypervisor::new().unwrap(); 3024 let vm = hv.create_vm().unwrap(); 3025 let gic = create_gic(&vm, 1).unwrap(); 3026 assert!(create_fdt( 3027 &mem, 3028 "console=tty0", 3029 vec![0], 3030 Some((0, 0, 0)), 3031 &dev_info, 3032 &*gic, 3033 &None, 3034 &Vec::new(), 3035 &BTreeMap::new(), 3036 None, 3037 true, 3038 ) 3039 .is_ok()) 3040 } 3041 } 3042 3043 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 3044 #[test] 3045 pub fn test_vm() { 3046 use hypervisor::VmExit; 3047 use vm_memory::{Address, GuestMemory, GuestMemoryRegion}; 3048 // This example based on https://lwn.net/Articles/658511/ 3049 let code = [ 3050 0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */ 3051 0x00, 0xd8, /* add %bl, %al */ 3052 0x04, b'0', /* add $'0', %al */ 3053 0xee, /* out %al, (%dx) */ 3054 0xb0, b'\n', /* mov $'\n', %al */ 3055 0xee, /* out %al, (%dx) */ 3056 0xf4, /* hlt */ 3057 ]; 3058 3059 let mem_size = 0x1000; 3060 let load_addr = GuestAddress(0x1000); 3061 let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap(); 3062 3063 let hv = hypervisor::new().unwrap(); 3064 let vm = hv.create_vm().expect("new VM creation failed"); 3065 3066 for (index, region) in mem.iter().enumerate() { 3067 let mem_region = vm.make_user_memory_region( 3068 index as u32, 3069 region.start_addr().raw_value(), 3070 region.len() as u64, 3071 region.as_ptr() as u64, 3072 false, 3073 false, 3074 ); 3075 3076 vm.create_user_memory_region(mem_region) 3077 .expect("Cannot configure guest memory"); 3078 } 3079 mem.write_slice(&code, load_addr) 3080 .expect("Writing code to memory failed"); 3081 3082 let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed"); 3083 3084 let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed"); 3085 vcpu_sregs.cs.base = 0; 3086 vcpu_sregs.cs.selector = 0; 3087 
vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed"); 3088 3089 let mut vcpu_regs = vcpu.get_regs().expect("get regs failed"); 3090 vcpu_regs.rip = 0x1000; 3091 vcpu_regs.rax = 2; 3092 vcpu_regs.rbx = 3; 3093 vcpu_regs.rflags = 2; 3094 vcpu.set_regs(&vcpu_regs).expect("set regs failed"); 3095 3096 loop { 3097 match vcpu.run().expect("run failed") { 3098 VmExit::IoOut(addr, data) => { 3099 println!( 3100 "IO out -- addr: {:#x} data [{:?}]", 3101 addr, 3102 str::from_utf8(data).unwrap() 3103 ); 3104 } 3105 VmExit::Reset => { 3106 println!("HLT"); 3107 break; 3108 } 3109 r => panic!("unexpected exit reason: {:?}", r), 3110 } 3111 } 3112 } 3113