1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 #[cfg(any(target_arch = "aarch64", feature = "acpi"))] 15 use crate::config::NumaConfig; 16 use crate::config::{ 17 ConsoleOutputMode, DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, 18 UserDeviceConfig, ValidationError, VmConfig, VsockConfig, 19 }; 20 use crate::device_manager::{self, Console, DeviceManager, DeviceManagerError, PtyPair}; 21 use crate::device_tree::DeviceTree; 22 use crate::memory_manager::{Error as MemoryManagerError, MemoryManager}; 23 use crate::migration::{get_vm_snapshot, url_to_path, VM_SNAPSHOT_FILE}; 24 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 25 use crate::GuestMemoryMmap; 26 use crate::{cpu, EpollDispatch}; 27 use crate::{ 28 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 29 }; 30 use anyhow::anyhow; 31 use arch::get_host_cpu_phys_bits; 32 #[cfg(feature = "tdx")] 33 use arch::x86_64::tdx::TdvfSection; 34 use arch::EntryPoint; 35 #[cfg(any(target_arch = "aarch64", feature = "acpi"))] 36 use arch::{NumaNode, NumaNodes}; 37 use devices::AcpiNotificationFlags; 38 use hypervisor::vm::{HypervisorVmError, VmmOps}; 39 use linux_loader::cmdline::Cmdline; 40 #[cfg(target_arch = "x86_64")] 41 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 42 #[cfg(target_arch = "aarch64")] 43 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 44 use linux_loader::loader::KernelLoader; 45 use seccompiler::{apply_filter, SeccompAction}; 46 use signal_hook::{ 47 consts::{SIGINT, SIGTERM, SIGWINCH}, 
iterator::backend::Handle,
    iterator::Signals,
};
use std::cmp;
#[cfg(any(target_arch = "aarch64", feature = "acpi"))]
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::convert::TryInto;
use std::ffi::CString;
#[cfg(target_arch = "x86_64")]
use std::fmt;
use std::fs::{File, OpenOptions};
use std::io::{self, Read, Write};
use std::io::{Seek, SeekFrom};
use std::num::Wrapping;
use std::ops::Deref;
use std::panic::AssertUnwindSafe;
use std::sync::{Arc, Mutex, RwLock};
use std::{result, str, thread};
use vm_device::Bus;
use vm_memory::{
    Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
    GuestMemoryRegion,
};
use vm_migration::{
    protocol::{MemoryRange, MemoryRangeTable},
    Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::signal::unblock_signal;
use vmm_sys_util::terminal::Terminal;

#[cfg(target_arch = "aarch64")]
use arch::aarch64::gic::gicv3_its::kvm::{KvmGicV3Its, GIC_V3_ITS_SNAPSHOT_ID};
#[cfg(target_arch = "aarch64")]
use arch::aarch64::gic::kvm::create_gic;
#[cfg(target_arch = "aarch64")]
use devices::interrupt_controller::{self, InterruptController};

/// Errors associated with VM management
#[derive(Debug)]
pub enum Error {
    /// Cannot open the kernel image
    KernelFile(io::Error),

    /// Cannot open the initramfs image
    InitramfsFile(io::Error),

    /// Cannot load the kernel in memory
    KernelLoad(linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    /// Cannot load the UEFI binary in memory
    UefiLoad(arch::aarch64::uefi::Error),

    /// Cannot load the initramfs in memory
    InitramfsLoad,

    /// Cannot load the command line in memory
    LoadCmdLine(linux_loader::loader::Error),

    /// Cannot modify the command line
    CmdLineInsertStr(linux_loader::cmdline::Error),

    /// Cannot convert command line into CString
    CmdLineCString(std::ffi::NulError),

    /// Cannot configure system
    ConfigureSystem(arch::Error),

    /// Cannot enable interrupt controller
    #[cfg(target_arch = "aarch64")]
    EnableInterruptController(interrupt_controller::Error),

    /// The internal VM state lock was poisoned
    PoisonedState,

    /// Cannot create a device manager.
    DeviceManager(DeviceManagerError),

    /// Write to the console failed.
    Console(vmm_sys_util::errno::Error),

    /// Write to the pty console failed.
    PtyConsole(io::Error),

    /// Cannot setup terminal in raw mode.
    SetTerminalRaw(vmm_sys_util::errno::Error),

    /// Cannot setup terminal in canonical mode.
    SetTerminalCanon(vmm_sys_util::errno::Error),

    /// Memory is overflow
    MemOverflow,

    /// Cannot spawn a signal handler thread
    SignalHandlerSpawn(io::Error),

    /// Failed to join on vCPU threads
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    /// VM config is missing.
    VmMissingConfig,

    /// VM is not created
    VmNotCreated,

    /// VM is already created
    VmAlreadyCreated,

    /// VM is not running
    VmNotRunning,

    /// Cannot clone EventFd.
    EventFdClone(io::Error),

    /// Invalid VM state transition
    InvalidStateTransition(VmState, VmState),

    /// Error from CPU handling
    CpuManager(cpu::Error),

    /// Cannot pause devices
    PauseDevices(MigratableError),

    /// Cannot resume devices
    ResumeDevices(MigratableError),

    /// Cannot pause CPUs
    PauseCpus(MigratableError),

    /// Cannot resume cpus
    ResumeCpus(MigratableError),

    /// Cannot pause VM
    Pause(MigratableError),

    /// Cannot resume VM
    Resume(MigratableError),

    /// Memory manager error
    MemoryManager(MemoryManagerError),

    /// Eventfd write error
    EventfdError(std::io::Error),

    /// Cannot snapshot VM
    Snapshot(MigratableError),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot send VM snapshot
    SnapshotSend(MigratableError),

    /// Cannot convert source URL from Path into &str
    RestoreSourceUrlPathToStr,

    /// Failed to validate config
    ConfigValidation(ValidationError),

    /// No more than one virtio-vsock device
    TooManyVsockDevices,

    /// Failed serializing into JSON
    SerializeJson(serde_json::Error),

    /// Invalid configuration for NUMA.
    InvalidNumaConfig,

    /// Cannot create seccomp filter
    CreateSeccompFilter(seccompiler::Error),

    /// Cannot apply seccomp filter
    ApplySeccompFilter(seccompiler::Error),

    /// Failed resizing a memory zone.
    ResizeZone,

    /// Cannot activate virtio devices
    ActivateVirtioDevices(device_manager::DeviceManagerError),

    /// Power button not supported
    PowerButtonNotSupported,

    /// Error triggering power button
    PowerButton(device_manager::DeviceManagerError),

    /// Kernel lacks PVH header
    KernelMissingPvhHeader,

    /// Error doing I/O on TDX firmware file
    #[cfg(feature = "tdx")]
    LoadTdvf(std::io::Error),

    /// Error parsing TDVF
    #[cfg(feature = "tdx")]
    ParseTdvf(arch::x86_64::tdx::TdvfError),

    /// Error populating HOB
    #[cfg(feature = "tdx")]
    PopulateHob(arch::x86_64::tdx::TdvfError),

    /// Error allocating TDVF memory
    #[cfg(feature = "tdx")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    /// Error enabling TDX VM
    #[cfg(feature = "tdx")]
    InitializeTdxVm(hypervisor::HypervisorVmError),

    /// Error enabling TDX memory region
    #[cfg(feature = "tdx")]
    InitializeTdxMemoryRegion(hypervisor::HypervisorVmError),

    /// Error finalizing TDX setup
    #[cfg(feature = "tdx")]
    FinalizeTdx(hypervisor::HypervisorVmError),
}
/// Convenience alias for results of VM management operations.
pub type Result<T> = result::Result<T, Error>;

/// Lifecycle states a `Vm` can be in.
#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
}

impl VmState {
    /// Returns `Ok(())` when moving from `self` to `new_state` is a legal
    /// lifecycle transition, `Error::InvalidStateTransition` otherwise.
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            // A freshly created VM can only start running or be paused.
            VmState::Created => match new_state {
                VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Paused => Ok(()),
            },

            // A running VM can be paused or shut down.
            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown => Ok(()),
            },

            // A shut down VM can only be booted again.
            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown
=> {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            // A paused VM can resume running or be shut down.
            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
        }
    }
}

// Debug I/O port
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT: u16 = 0x80;
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT_PREFIX: &str = "Debug I/O port";

#[cfg(target_arch = "x86_64")]
/// Debug I/O port, see:
/// https://www.intel.com/content/www/us/en/support/articles/000005500/boards-and-kits.html
///
/// Since we're not a physical platform, we can freely assign code ranges for
/// debugging specific parts of our virtual platform.
pub enum DebugIoPortRange {
    Firmware,
    Bootloader,
    Kernel,
    Userspace,
    Custom,
}
#[cfg(target_arch = "x86_64")]
impl DebugIoPortRange {
    /// Maps a debug I/O port code to the platform component it belongs to.
    fn from_u8(value: u8) -> DebugIoPortRange {
        match value {
            0x00..=0x1f => DebugIoPortRange::Firmware,
            0x20..=0x3f => DebugIoPortRange::Bootloader,
            0x40..=0x5f => DebugIoPortRange::Kernel,
            0x60..=0x7f => DebugIoPortRange::Userspace,
            _ => DebugIoPortRange::Custom,
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl fmt::Display for DebugIoPortRange {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            DebugIoPortRange::Firmware => write!(f, "{}: Firmware", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Bootloader => write!(f, "{}: Bootloader", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Kernel => write!(f, "{}: Kernel", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Userspace => write!(f, "{}: Userspace", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Custom => write!(f, "{}: Custom", DEBUG_IOPORT_PREFIX),
        }
    }
}

/// Bridges guest memory and the I/O buses to the hypervisor's `VmmOps`
/// callbacks (used when servicing VM exits).
struct VmOps {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
    // Used to timestamp debug I/O port writes relative to VM creation.
    #[cfg(target_arch = "x86_64")]
    timestamp: std::time::Instant,
}

impl VmOps {
    #[cfg(target_arch = "x86_64")]
    // Log debug io port codes.
    fn log_debug_ioport(&self, code: u8) {
        let elapsed = self.timestamp.elapsed();

        debug!(
            "[{} code 0x{:x}] {}.{:>06} seconds",
            DebugIoPortRange::from_u8(code),
            code,
            elapsed.as_secs(),
            elapsed.as_micros()
        );
    }
}

impl VmmOps for VmOps {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> hypervisor::vm::Result<usize> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> hypervisor::vm::Result<usize> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
        // Accesses to unregistered addresses are logged but not fatal.
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            warn!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    fn mmio_write(&self, gpa: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            // Some bus writes return a barrier that must be waited on before
            // returning to the guest.
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            warn!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
        // Single-byte writes to the debug I/O port are only logged, never
        // forwarded to the bus.
        if port == DEBUG_IOPORT as u64 && data.len() == 1 {
            self.log_debug_ioport(data[0]);
            return Ok(());
        }

        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}

/// Returns the number of guest physical address bits to use: the host's
/// physical bits, optionally capped by `max_phys_bits` (and capped at 47
/// when TDX is enabled).
pub fn physical_bits(max_phys_bits: Option<u8>, #[cfg(feature = "tdx")] tdx_enabled: bool) -> u8 {
    #[cfg(not(feature = "tdx"))]
    let host_phys_bits = get_host_cpu_phys_bits();
    #[cfg(feature = "tdx")]
    let mut host_phys_bits = get_host_cpu_phys_bits();

    #[cfg(feature = "tdx")]
    if tdx_enabled {
        // When running TDX guest, the Guest Physical Address space is limited
        // by a shared bit that is located on bit 47 for 4 level paging, and on
        // bit 51 for 5 level paging (when GPAW bit is 1). In order to keep
        // things simple, and since a 47 bits address space is 128TiB large, we
        // ensure to limit the physical addressable space to 47 bits when
        // running TDX.
        host_phys_bits = std::cmp::min(host_phys_bits, 47)
    }

    cmp::min(host_phys_bits, max_phys_bits.unwrap_or(host_phys_bits))
}

/// Signals handled by the VM's signal handler thread.
pub const HANDLED_SIGNALS: [i32; 3] = [SIGWINCH, SIGTERM, SIGINT];

/// A single virtual machine: ties together the device, CPU and memory
/// managers, the boot images and the hypervisor VM handle.
pub struct Vm {
    kernel: Option<File>,
    initramfs: Option<File>,
    // Join handles for helper threads owned by this VM.
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    // Whether stdin is a TTY; when true, terminal mode is restored on shutdown.
    on_tty: bool,
    // Handle used to stop the signal handler thread on shutdown.
    signals: Option<Handle>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(not(feature = "kvm"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
vm: Arc<dyn hypervisor::Vm>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,
    #[cfg(any(target_arch = "aarch64", feature = "acpi"))]
    numa_nodes: NumaNodes,
    seccomp_action: SeccompAction,
    exit_evt: EventFd,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
}

impl Vm {
    /// Common construction path shared by `new`, `new_from_snapshot` and
    /// `new_from_migration`: validates the config, then builds the NUMA
    /// topology, the device manager and the CPU manager around an existing
    /// `MemoryManager`.
    #[allow(clippy::too_many_arguments)]
    fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))] _saved_clock: Option<
            hypervisor::ClockData,
        >,
        activate_evt: EventFd,
        restoring: bool,
    ) -> Result<Self> {
        config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        #[cfg(any(target_arch = "aarch64", feature = "acpi"))]
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        // Force IOMMU usage when a TDX config is present.
        #[cfg(feature = "tdx")]
        let force_iommu = config.lock().unwrap().tdx.is_some();
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        let device_manager = DeviceManager::new(
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            &exit_evt,
            &reset_evt,
            seccomp_action.clone(),
            #[cfg(any(target_arch = "aarch64", feature = "acpi"))]
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
            restoring,
        )
        .map_err(Error::DeviceManager)?;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::clone(device_manager.lock().unwrap().io_bus());
        let mmio_bus = Arc::clone(device_manager.lock().unwrap().mmio_bus());
        // Create the VmOps structure, which implements the VmmOps trait.
        // And send it to the hypervisor.
        let vm_ops: Arc<dyn VmmOps> = Arc::new(VmOps {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            #[cfg(target_arch = "x86_64")]
            timestamp: std::time::Instant::now(),
        });

        let exit_evt_clone = exit_evt.try_clone().map_err(Error::EventFdClone)?;
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        let cpu_manager = cpu::CpuManager::new(
            &config.lock().unwrap().cpus.clone(),
            &device_manager,
            &memory_manager,
            vm.clone(),
            exit_evt_clone,
            reset_evt,
            hypervisor.clone(),
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            #[cfg(any(target_arch = "aarch64", feature = "acpi"))]
            &numa_nodes,
        )
        .map_err(Error::CpuManager)?;

        let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO as i32) } != 0;
        let kernel = config
            .lock()
            .unwrap()
            .kernel
            .as_ref()
            .map(|k| File::open(&k.path))
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .initramfs
            .as_ref()
            .map(|i| File::open(&i.path))
            .transpose()
            .map_err(Error::InitramfsFile)?;

        Ok(Vm {
            kernel,
            initramfs,
            device_manager,
            config,
            on_tty,
            threads: Vec::with_capacity(1),
            signals: None,
            state: RwLock::new(VmState::Created),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock: _saved_clock,
            #[cfg(any(target_arch = "aarch64", feature = "acpi"))]
            numa_nodes,
            seccomp_action: seccomp_action.clone(),
            exit_evt,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            hypervisor,
        })
    }

    /// Builds the guest NUMA node map from the optional `NumaConfig` list,
    /// cross-checking node ids, memory zones, distances and (on x86_64)
    /// SGX EPC sections against the memory manager's state.
    #[cfg(any(target_arch = "aarch64", feature = "acpi"))]
    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                // Each guest NUMA id may only be defined once.
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Can't define twice the same NUMA node");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        // Each referenced zone must exist in the memory manager.
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        // A distance may only target a node defined in the
                        // configuration, and only once per node.
                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has been already set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

    /// Creates a brand new VM: allocates the hypervisor VM and the memory
    /// manager from the configuration, then builds all devices.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
        console_resize_pipe: Option<File>,
    ) -> Result<Self> {
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        hypervisor.check_required_extensions().unwrap();
        #[cfg(feature = "tdx")]
        let vm = hypervisor
            .create_vm_with_type(if tdx_enabled {
                2 // KVM_X86_TDX_VM
            } else {
                0 // KVM_X86_LEGACY_VM
            })
            .unwrap();
        #[cfg(not(feature = "tdx"))]
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let phys_bits = physical_bits(
            config.lock().unwrap().cpus.max_phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        );
        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            false,
            phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        )
        .map_err(Error::MemoryManager)?;

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_config) = config.lock().unwrap().sgx_epc.clone() {
                memory_manager
                    .lock()
                    .unwrap()
                    .setup_sgx(sgx_epc_config, &vm)
                    .map_err(Error::MemoryManager)?;
            }
        }

        let new_vm = Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            None,
            activate_evt,
            false,
        )?;

        // The device manager must create the devices from here as it is part
// of the regular code path creating everything from scratch.
        new_vm
            .device_manager
            .lock()
            .unwrap()
            .create_devices(serial_pty, console_pty, console_resize_pipe)
            .map_err(Error::DeviceManager)?;
        Ok(new_vm)
    }

    /// Restores a VM from a snapshot: recreates the hypervisor VM, restores
    /// its saved state and rebuilds the memory manager from the snapshot's
    /// memory manager section.
    #[allow(clippy::too_many_arguments)]
    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        exit_evt: EventFd,
        reset_evt: EventFd,
        source_url: Option<&str>,
        prefault: bool,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();
        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        let config = vm_snapshot.config;
        if let Some(state) = vm_snapshot.state {
            vm.set_state(state)
                .map_err(|e| Error::Restore(MigratableError::Restore(e.into())))?;
        }

        let memory_manager = if let Some(memory_manager_snapshot) =
            snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
        {
            let phys_bits = physical_bits(
                config.lock().unwrap().cpus.max_phys_bits,
                #[cfg(feature = "tdx")]
                config.lock().unwrap().tdx.is_some(),
            );
            MemoryManager::new_from_snapshot(
                memory_manager_snapshot,
                vm.clone(),
                &config.lock().unwrap().memory.clone(),
                source_url,
                prefault,
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            return Err(Error::Restore(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            ))));
        };

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            vm_snapshot.clock,
            activate_evt,
            true,
        )
    }

    /// Creates a VM intended to be the destination of a live migration, from
    /// the provided configuration (built with `restoring = true`).
    pub fn new_from_migration(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();
        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let phys_bits = physical_bits(
            config.lock().unwrap().cpus.max_phys_bits,
            #[cfg(feature = "tdx")]
            config.lock().unwrap().tdx.is_some(),
        );

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            false,
            phys_bits,
            #[cfg(feature = "tdx")]
            false,
        )
        .map_err(Error::MemoryManager)?;

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            None,
            activate_evt,
            true,
        )
    }

    /// Loads the initramfs image into guest memory and returns the address
    /// it was placed at together with its size.
    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs = self.initramfs.as_ref().unwrap();
        // Determine the file size by seeking to the end.
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
.try_into()
            .unwrap();
        // Rewind before streaming the file into guest memory.
        initramfs
            .seek(SeekFrom::Start(0))
            .map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    /// Builds the kernel command line from the configured arguments plus the
    /// additions requested by the device manager.
    fn get_cmdline(&mut self) -> Result<CString> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE);
        cmdline
            .insert_str(self.config.lock().unwrap().cmdline.args.clone())
            .map_err(Error::CmdLineInsertStr)?;
        for entry in self.device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        CString::new(cmdline).map_err(Error::CmdLineCString)
    }

    /// Loads the guest kernel into memory (PE image first, falling back to a
    /// raw UEFI binary) and returns its entry point.
    #[cfg(target_arch = "aarch64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        let entry_addr = match linux_loader::loader::pe::PE::load(
            mem.deref(),
            Some(GuestAddress(arch::get_kernel_start())),
            &mut kernel,
            None,
        ) {
            Ok(entry_addr) => entry_addr,
            // Try to load the binary as kernel PE file at first.
            // If failed, retry to load it as UEFI binary.
            // As the UEFI binary is formatless, it must be the last option to try.
            Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                arch::aarch64::uefi::load_uefi(
                    mem.deref(),
                    GuestAddress(arch::get_uefi_start()),
                    &mut kernel,
                )
                .map_err(Error::UefiLoad)?;
                // The entry point offset in UEFI image is always 0.
                return Ok(EntryPoint {
                    entry_addr: GuestAddress(arch::get_uefi_start()),
                });
            }
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        let entry_point_addr: GuestAddress = entry_addr.kernel_load;

        Ok(EntryPoint {
            entry_addr: entry_point_addr,
        })
    }

    /// Loads the guest kernel as an ELF image, writes the command line into
    /// guest memory and returns the PVH entry point; kernels without a PVH
    /// header are rejected.
    #[cfg(target_arch = "x86_64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        info!("Loading kernel");
        let cmdline_cstring = self.get_cmdline()?;
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        let entry_addr = match linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        ) {
            Ok(entry_addr) => entry_addr,
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        linux_loader::loader::load_cmdline(
            mem.deref(),
            arch::layout::CMDLINE_START,
            &cmdline_cstring,
        )
        .map_err(Error::LoadCmdLine)?;

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest
            info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint { entry_addr })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

    /// Configures the guest platform: loads the optional initramfs, creates
    /// the ACPI tables (when enabled) and calls `arch::configure_system`.
    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self) -> Result<()> {
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();

        #[allow(unused_mut, unused_assignments)]
        let mut rsdp_addr: Option<GuestAddress> = None;

        #[cfg(feature = "acpi")]
        {
            rsdp_addr = Some(crate::acpi::create_acpi_tables(
                &mem,
                &self.device_manager,
                &self.cpu_manager,
                &self.memory_manager,
                &self.numa_nodes,
            ));
            info!(
                "Created ACPI tables: rsdp_addr = 0x{:x}",
                rsdp_addr.unwrap().0
            );
        }

        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            &initramfs_config,
            boot_vcpus,
            rsdp_addr,
            sgx_epc_region,
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }

    /// Configures the guest platform: loads the optional initramfs, creates
    /// the GIC and hands the topology to `arch::configure_system`, then
    /// registers and enables the GIC with the interrupt controller.
    #[cfg(target_arch = "aarch64")]
    fn configure_system(&mut self) -> Result<()> {
        let cmdline_cstring = self.get_cmdline()?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        let pci_space_start: GuestAddress = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .start_of_device_area();

        let pci_space_end: GuestAddress = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .end_of_device_area();

        let pci_space_size = pci_space_end
            .checked_offset_from(pci_space_start)
            .ok_or(Error::MemOverflow)?
            + 1;

        let pci_space = (pci_space_start.0, pci_space_size);

        #[cfg(feature = "acpi")]
        {
            let _ = crate::acpi::create_acpi_tables(
                &mem,
                &self.device_manager,
                &self.cpu_manager,
                &self.memory_manager,
                &self.numa_nodes,
            );
        }

        let gic_device = create_gic(
            &self.memory_manager.lock().as_ref().unwrap().vm,
            self.cpu_manager.lock().unwrap().boot_vcpus() as u64,
        )
        .map_err(|e| {
            Error::ConfigureSystem(arch::Error::AArch64Setup(arch::aarch64::Error::SetupGic(e)))
        })?;

        arch::configure_system(
            &mem,
            &cmdline_cstring,
            vcpu_mpidrs,
            vcpu_topology,
            device_info,
            &initramfs_config,
            &pci_space,
            &*gic_device,
            &self.numa_nodes,
        )
        .map_err(Error::ConfigureSystem)?;

        // Update the GIC entity in device manager
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gic_device(Arc::new(Mutex::new(gic_device)));

        // Activate gic device
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .enable()
            .map_err(Error::EnableInterruptController)?;

        Ok(())
    }

    /// Returns the serial device's pty pair, if one was created.
    pub fn serial_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().serial_pty()
    }

    /// Returns the console device's pty pair, if one was created.
    pub fn console_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().console_pty()
    }

    /// Returns the pipe used to deliver console resize events, if any.
    pub fn console_resize_pipe(&self) -> Option<Arc<File>> {
        self.device_manager.lock().unwrap().console_resize_pipe()
    }

    /// Shuts the VM down: validates the state transition, restores the
    /// terminal, stops the signal handler thread, shuts down the vCPUs and
    /// joins all helper threads.
    pub fn shutdown(&mut self) -> Result<()> {
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        let new_state = VmState::Shutdown;

        state.valid_transition(new_state)?;

        if self.on_tty {
            // Don't forget to set the terminal in canonical mode
            //
before to exit. 1157 io::stdin() 1158 .lock() 1159 .set_canon_mode() 1160 .map_err(Error::SetTerminalCanon)?; 1161 } 1162 1163 // Trigger the termination of the signal_handler thread 1164 if let Some(signals) = self.signals.take() { 1165 signals.close(); 1166 } 1167 1168 // Wake up the DeviceManager threads so they will get terminated cleanly 1169 self.device_manager 1170 .lock() 1171 .unwrap() 1172 .resume() 1173 .map_err(Error::Resume)?; 1174 1175 self.cpu_manager 1176 .lock() 1177 .unwrap() 1178 .shutdown() 1179 .map_err(Error::CpuManager)?; 1180 1181 // Wait for all the threads to finish 1182 for thread in self.threads.drain(..) { 1183 thread.join().map_err(Error::ThreadCleanup)? 1184 } 1185 *state = new_state; 1186 1187 event!("vm", "shutdown"); 1188 1189 Ok(()) 1190 } 1191 1192 pub fn resize( 1193 &mut self, 1194 desired_vcpus: Option<u8>, 1195 desired_memory: Option<u64>, 1196 desired_balloon: Option<u64>, 1197 ) -> Result<()> { 1198 event!("vm", "resizing"); 1199 1200 if let Some(desired_vcpus) = desired_vcpus { 1201 if self 1202 .cpu_manager 1203 .lock() 1204 .unwrap() 1205 .resize(desired_vcpus) 1206 .map_err(Error::CpuManager)? 
1207 { 1208 self.device_manager 1209 .lock() 1210 .unwrap() 1211 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1212 .map_err(Error::DeviceManager)?; 1213 } 1214 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1215 } 1216 1217 if let Some(desired_memory) = desired_memory { 1218 let new_region = self 1219 .memory_manager 1220 .lock() 1221 .unwrap() 1222 .resize(desired_memory) 1223 .map_err(Error::MemoryManager)?; 1224 1225 let mut memory_config = &mut self.config.lock().unwrap().memory; 1226 1227 if let Some(new_region) = &new_region { 1228 self.device_manager 1229 .lock() 1230 .unwrap() 1231 .update_memory(new_region) 1232 .map_err(Error::DeviceManager)?; 1233 1234 match memory_config.hotplug_method { 1235 HotplugMethod::Acpi => { 1236 self.device_manager 1237 .lock() 1238 .unwrap() 1239 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1240 .map_err(Error::DeviceManager)?; 1241 } 1242 HotplugMethod::VirtioMem => {} 1243 } 1244 } 1245 1246 // We update the VM config regardless of the actual guest resize 1247 // operation result (happened or not), so that if the VM reboots 1248 // it will be running with the last configure memory size. 1249 match memory_config.hotplug_method { 1250 HotplugMethod::Acpi => memory_config.size = desired_memory, 1251 HotplugMethod::VirtioMem => { 1252 if desired_memory > memory_config.size { 1253 memory_config.hotplugged_size = Some(desired_memory - memory_config.size); 1254 } else { 1255 memory_config.hotplugged_size = None; 1256 } 1257 } 1258 } 1259 } 1260 1261 if let Some(desired_balloon) = desired_balloon { 1262 self.device_manager 1263 .lock() 1264 .unwrap() 1265 .resize_balloon(desired_balloon) 1266 .map_err(Error::DeviceManager)?; 1267 1268 // Update the configuration value for the balloon size to ensure 1269 // a reboot would use the right value. 
1270 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon { 1271 balloon_config.size = desired_balloon; 1272 } 1273 } 1274 1275 event!("vm", "resized"); 1276 1277 Ok(()) 1278 } 1279 1280 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { 1281 let memory_config = &mut self.config.lock().unwrap().memory; 1282 1283 if let Some(zones) = &mut memory_config.zones { 1284 for zone in zones.iter_mut() { 1285 if zone.id == id { 1286 if desired_memory >= zone.size { 1287 let hotplugged_size = desired_memory - zone.size; 1288 self.memory_manager 1289 .lock() 1290 .unwrap() 1291 .resize_zone(&id, desired_memory - zone.size) 1292 .map_err(Error::MemoryManager)?; 1293 // We update the memory zone config regardless of the 1294 // actual 'resize-zone' operation result (happened or 1295 // not), so that if the VM reboots it will be running 1296 // with the last configured memory zone size. 1297 zone.hotplugged_size = Some(hotplugged_size); 1298 1299 return Ok(()); 1300 } else { 1301 error!( 1302 "Invalid to ask less ({}) than boot RAM ({}) for \ 1303 this memory zone", 1304 desired_memory, zone.size, 1305 ); 1306 return Err(Error::ResizeZone); 1307 } 1308 } 1309 } 1310 } 1311 1312 error!("Could not find the memory zone {} for the resize", id); 1313 Err(Error::ResizeZone) 1314 } 1315 1316 fn add_to_config<T>(devices: &mut Option<Vec<T>>, device: T) { 1317 if let Some(devices) = devices { 1318 devices.push(device); 1319 } else { 1320 *devices = Some(vec![device]); 1321 } 1322 } 1323 1324 pub fn add_device(&mut self, mut _device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1325 { 1326 // Validate on a clone of the config 1327 let mut config = self.config.lock().unwrap().clone(); 1328 Self::add_to_config(&mut config.devices, _device_cfg.clone()); 1329 config.validate().map_err(Error::ConfigValidation)?; 1330 } 1331 1332 let pci_device_info = self 1333 .device_manager 1334 .lock() 1335 .unwrap() 1336 .add_device(&mut _device_cfg) 1337 
.map_err(Error::DeviceManager)?; 1338 1339 // Update VmConfig by adding the new device. This is important to 1340 // ensure the device would be created in case of a reboot. 1341 { 1342 let mut config = self.config.lock().unwrap(); 1343 Self::add_to_config(&mut config.devices, _device_cfg); 1344 } 1345 1346 self.device_manager 1347 .lock() 1348 .unwrap() 1349 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1350 .map_err(Error::DeviceManager)?; 1351 1352 Ok(pci_device_info) 1353 } 1354 1355 pub fn add_user_device(&mut self, mut device_cfg: UserDeviceConfig) -> Result<PciDeviceInfo> { 1356 { 1357 // Validate on a clone of the config 1358 let mut config = self.config.lock().unwrap().clone(); 1359 Self::add_to_config(&mut config.user_devices, device_cfg.clone()); 1360 config.validate().map_err(Error::ConfigValidation)?; 1361 } 1362 1363 let pci_device_info = self 1364 .device_manager 1365 .lock() 1366 .unwrap() 1367 .add_user_device(&mut device_cfg) 1368 .map_err(Error::DeviceManager)?; 1369 1370 // Update VmConfig by adding the new device. This is important to 1371 // ensure the device would be created in case of a reboot. 1372 { 1373 let mut config = self.config.lock().unwrap(); 1374 Self::add_to_config(&mut config.user_devices, device_cfg); 1375 } 1376 1377 self.device_manager 1378 .lock() 1379 .unwrap() 1380 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1381 .map_err(Error::DeviceManager)?; 1382 1383 Ok(pci_device_info) 1384 } 1385 1386 pub fn remove_device(&mut self, _id: String) -> Result<()> { 1387 self.device_manager 1388 .lock() 1389 .unwrap() 1390 .remove_device(_id.clone()) 1391 .map_err(Error::DeviceManager)?; 1392 1393 // Update VmConfig by removing the device. This is important to 1394 // ensure the device would not be created in case of a reboot. 
1395 let mut config = self.config.lock().unwrap(); 1396 1397 // Remove if VFIO device 1398 if let Some(devices) = config.devices.as_mut() { 1399 devices.retain(|dev| dev.id.as_ref() != Some(&_id)); 1400 } 1401 1402 // Remove if disk device 1403 if let Some(disks) = config.disks.as_mut() { 1404 disks.retain(|dev| dev.id.as_ref() != Some(&_id)); 1405 } 1406 1407 // Remove if net device 1408 if let Some(net) = config.net.as_mut() { 1409 net.retain(|dev| dev.id.as_ref() != Some(&_id)); 1410 } 1411 1412 // Remove if pmem device 1413 if let Some(pmem) = config.pmem.as_mut() { 1414 pmem.retain(|dev| dev.id.as_ref() != Some(&_id)); 1415 } 1416 1417 // Remove if vsock device 1418 if let Some(vsock) = config.vsock.as_ref() { 1419 if vsock.id.as_ref() == Some(&_id) { 1420 config.vsock = None; 1421 } 1422 } 1423 1424 self.device_manager 1425 .lock() 1426 .unwrap() 1427 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1428 .map_err(Error::DeviceManager)?; 1429 Ok(()) 1430 } 1431 1432 pub fn add_disk(&mut self, mut _disk_cfg: DiskConfig) -> Result<PciDeviceInfo> { 1433 { 1434 // Validate on a clone of the config 1435 let mut config = self.config.lock().unwrap().clone(); 1436 Self::add_to_config(&mut config.disks, _disk_cfg.clone()); 1437 config.validate().map_err(Error::ConfigValidation)?; 1438 } 1439 1440 let pci_device_info = self 1441 .device_manager 1442 .lock() 1443 .unwrap() 1444 .add_disk(&mut _disk_cfg) 1445 .map_err(Error::DeviceManager)?; 1446 1447 // Update VmConfig by adding the new device. This is important to 1448 // ensure the device would be created in case of a reboot. 
1449 { 1450 let mut config = self.config.lock().unwrap(); 1451 Self::add_to_config(&mut config.disks, _disk_cfg); 1452 } 1453 1454 self.device_manager 1455 .lock() 1456 .unwrap() 1457 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1458 .map_err(Error::DeviceManager)?; 1459 1460 Ok(pci_device_info) 1461 } 1462 1463 pub fn add_fs(&mut self, mut _fs_cfg: FsConfig) -> Result<PciDeviceInfo> { 1464 { 1465 // Validate on a clone of the config 1466 let mut config = self.config.lock().unwrap().clone(); 1467 Self::add_to_config(&mut config.fs, _fs_cfg.clone()); 1468 config.validate().map_err(Error::ConfigValidation)?; 1469 } 1470 1471 let pci_device_info = self 1472 .device_manager 1473 .lock() 1474 .unwrap() 1475 .add_fs(&mut _fs_cfg) 1476 .map_err(Error::DeviceManager)?; 1477 1478 // Update VmConfig by adding the new device. This is important to 1479 // ensure the device would be created in case of a reboot. 1480 { 1481 let mut config = self.config.lock().unwrap(); 1482 Self::add_to_config(&mut config.fs, _fs_cfg); 1483 } 1484 1485 self.device_manager 1486 .lock() 1487 .unwrap() 1488 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1489 .map_err(Error::DeviceManager)?; 1490 1491 Ok(pci_device_info) 1492 } 1493 1494 pub fn add_pmem(&mut self, mut _pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> { 1495 { 1496 // Validate on a clone of the config 1497 let mut config = self.config.lock().unwrap().clone(); 1498 Self::add_to_config(&mut config.pmem, _pmem_cfg.clone()); 1499 config.validate().map_err(Error::ConfigValidation)?; 1500 } 1501 1502 let pci_device_info = self 1503 .device_manager 1504 .lock() 1505 .unwrap() 1506 .add_pmem(&mut _pmem_cfg) 1507 .map_err(Error::DeviceManager)?; 1508 1509 // Update VmConfig by adding the new device. This is important to 1510 // ensure the device would be created in case of a reboot. 
1511 { 1512 let mut config = self.config.lock().unwrap(); 1513 Self::add_to_config(&mut config.pmem, _pmem_cfg); 1514 } 1515 1516 self.device_manager 1517 .lock() 1518 .unwrap() 1519 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1520 .map_err(Error::DeviceManager)?; 1521 1522 Ok(pci_device_info) 1523 } 1524 1525 pub fn add_net(&mut self, mut _net_cfg: NetConfig) -> Result<PciDeviceInfo> { 1526 { 1527 // Validate on a clone of the config 1528 let mut config = self.config.lock().unwrap().clone(); 1529 Self::add_to_config(&mut config.net, _net_cfg.clone()); 1530 config.validate().map_err(Error::ConfigValidation)?; 1531 } 1532 1533 let pci_device_info = self 1534 .device_manager 1535 .lock() 1536 .unwrap() 1537 .add_net(&mut _net_cfg) 1538 .map_err(Error::DeviceManager)?; 1539 1540 // Update VmConfig by adding the new device. This is important to 1541 // ensure the device would be created in case of a reboot. 1542 { 1543 let mut config = self.config.lock().unwrap(); 1544 Self::add_to_config(&mut config.net, _net_cfg); 1545 } 1546 1547 self.device_manager 1548 .lock() 1549 .unwrap() 1550 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1551 .map_err(Error::DeviceManager)?; 1552 1553 Ok(pci_device_info) 1554 } 1555 1556 pub fn add_vsock(&mut self, mut _vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> { 1557 if self.config.lock().unwrap().vsock.is_some() { 1558 return Err(Error::TooManyVsockDevices); 1559 } 1560 1561 { 1562 // Validate on a clone of the config 1563 let mut config = self.config.lock().unwrap().clone(); 1564 config.vsock = Some(_vsock_cfg.clone()); 1565 config.validate().map_err(Error::ConfigValidation)?; 1566 } 1567 1568 let pci_device_info = self 1569 .device_manager 1570 .lock() 1571 .unwrap() 1572 .add_vsock(&mut _vsock_cfg) 1573 .map_err(Error::DeviceManager)?; 1574 1575 // Update VmConfig by adding the new device. This is important to 1576 // ensure the device would be created in case of a reboot. 
1577 { 1578 let mut config = self.config.lock().unwrap(); 1579 config.vsock = Some(_vsock_cfg); 1580 } 1581 1582 self.device_manager 1583 .lock() 1584 .unwrap() 1585 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1586 .map_err(Error::DeviceManager)?; 1587 1588 Ok(pci_device_info) 1589 } 1590 1591 pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> { 1592 Ok(self.device_manager.lock().unwrap().counters()) 1593 } 1594 1595 fn os_signal_handler( 1596 mut signals: Signals, 1597 console_input_clone: Arc<Console>, 1598 on_tty: bool, 1599 exit_evt: &EventFd, 1600 ) { 1601 for sig in HANDLED_SIGNALS { 1602 unblock_signal(sig).unwrap(); 1603 } 1604 1605 for signal in signals.forever() { 1606 match signal { 1607 SIGWINCH => { 1608 console_input_clone.update_console_size(); 1609 } 1610 SIGTERM | SIGINT => { 1611 if on_tty { 1612 io::stdin() 1613 .lock() 1614 .set_canon_mode() 1615 .expect("failed to restore terminal mode"); 1616 } 1617 if exit_evt.write(1).is_err() { 1618 std::process::exit(1); 1619 } 1620 } 1621 _ => (), 1622 } 1623 } 1624 } 1625 1626 #[cfg(feature = "tdx")] 1627 fn init_tdx(&mut self) -> Result<()> { 1628 let cpuid = self.cpu_manager.lock().unwrap().common_cpuid(); 1629 let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32; 1630 self.vm 1631 .tdx_init(&cpuid, max_vcpus) 1632 .map_err(Error::InitializeTdxVm)?; 1633 Ok(()) 1634 } 1635 1636 #[cfg(feature = "tdx")] 1637 fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> { 1638 use arch::x86_64::tdx::*; 1639 // The TDVF file contains a table of section as well as code 1640 let mut firmware_file = 1641 File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware) 1642 .map_err(Error::LoadTdvf)?; 1643 1644 // For all the sections allocate some RAM backing them 1645 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf) 1646 } 1647 1648 #[cfg(feature = "tdx")] 1649 fn populate_tdx_sections(&mut self, sections: 
&[TdvfSection]) -> Result<Option<u64>> { 1650 use arch::x86_64::tdx::*; 1651 // Get the memory end *before* we start adding TDVF ram regions 1652 let boot_guest_memory = self 1653 .memory_manager 1654 .lock() 1655 .as_ref() 1656 .unwrap() 1657 .boot_guest_memory(); 1658 for section in sections { 1659 // No need to allocate if the section falls within guest RAM ranges 1660 if boot_guest_memory.address_in_range(GuestAddress(section.address)) { 1661 info!( 1662 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM", 1663 section 1664 ); 1665 continue; 1666 } 1667 1668 info!("Allocating TDVF Section: {:x?}", section); 1669 self.memory_manager 1670 .lock() 1671 .unwrap() 1672 .add_ram_region(GuestAddress(section.address), section.size as usize) 1673 .map_err(Error::AllocatingTdvfMemory)?; 1674 } 1675 1676 // The TDVF file contains a table of section as well as code 1677 let mut firmware_file = 1678 File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware) 1679 .map_err(Error::LoadTdvf)?; 1680 1681 // The guest memory at this point now has all the required regions so it 1682 // is safe to copy from the TDVF file into it. 
1683 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1684 let mem = guest_memory.memory(); 1685 let mut hob_offset = None; 1686 for section in sections { 1687 info!("Populating TDVF Section: {:x?}", section); 1688 match section.r#type { 1689 TdvfSectionType::Bfv | TdvfSectionType::Cfv => { 1690 info!("Copying section to guest memory"); 1691 firmware_file 1692 .seek(SeekFrom::Start(section.data_offset as u64)) 1693 .map_err(Error::LoadTdvf)?; 1694 mem.read_from( 1695 GuestAddress(section.address), 1696 &mut firmware_file, 1697 section.data_size as usize, 1698 ) 1699 .unwrap(); 1700 } 1701 TdvfSectionType::TdHob => { 1702 hob_offset = Some(section.address); 1703 } 1704 _ => {} 1705 } 1706 } 1707 1708 // Generate HOB 1709 let mut hob = TdHob::start(hob_offset.unwrap()); 1710 1711 let mut sorted_sections = sections.to_vec(); 1712 sorted_sections.retain(|section| { 1713 !matches!(section.r#type, TdvfSectionType::Bfv | TdvfSectionType::Cfv) 1714 }); 1715 sorted_sections.sort_by_key(|section| section.address); 1716 sorted_sections.reverse(); 1717 let mut current_section = sorted_sections.pop(); 1718 1719 // RAM regions interleaved with TDVF sections 1720 let mut next_start_addr = 0; 1721 for region in boot_guest_memory.iter() { 1722 let region_start = region.start_addr().0; 1723 let region_end = region.last_addr().0; 1724 if region_start > next_start_addr { 1725 next_start_addr = region_start; 1726 } 1727 1728 loop { 1729 let (start, size, ram) = if let Some(section) = ¤t_section { 1730 if section.address <= next_start_addr { 1731 (section.address, section.size, false) 1732 } else { 1733 let last_addr = std::cmp::min(section.address - 1, region_end); 1734 (next_start_addr, last_addr - next_start_addr + 1, true) 1735 } 1736 } else { 1737 (next_start_addr, region_end - next_start_addr + 1, true) 1738 }; 1739 1740 hob.add_memory_resource(&mem, start, size, ram) 1741 .map_err(Error::PopulateHob)?; 1742 1743 if !ram { 1744 current_section = 
sorted_sections.pop(); 1745 } 1746 1747 next_start_addr = start + size; 1748 1749 if next_start_addr > region_end { 1750 break; 1751 } 1752 } 1753 } 1754 1755 // MMIO regions 1756 hob.add_mmio_resource( 1757 &mem, 1758 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1759 arch::layout::APIC_START.raw_value() 1760 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1761 ) 1762 .map_err(Error::PopulateHob)?; 1763 let start_of_device_area = self 1764 .memory_manager 1765 .lock() 1766 .unwrap() 1767 .start_of_device_area() 1768 .raw_value(); 1769 let end_of_device_area = self 1770 .memory_manager 1771 .lock() 1772 .unwrap() 1773 .end_of_device_area() 1774 .raw_value(); 1775 hob.add_mmio_resource( 1776 &mem, 1777 start_of_device_area, 1778 end_of_device_area - start_of_device_area, 1779 ) 1780 .map_err(Error::PopulateHob)?; 1781 1782 hob.finish(&mem).map_err(Error::PopulateHob)?; 1783 1784 Ok(hob_offset) 1785 } 1786 1787 #[cfg(feature = "tdx")] 1788 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 1789 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1790 let mem = guest_memory.memory(); 1791 1792 for section in sections { 1793 self.vm 1794 .tdx_init_memory_region( 1795 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 1796 section.address, 1797 section.size, 1798 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 1799 section.attributes == 1, 1800 ) 1801 .map_err(Error::InitializeTdxMemoryRegion)?; 1802 } 1803 Ok(()) 1804 } 1805 1806 fn setup_signal_handler(&mut self) -> Result<()> { 1807 let console = self.device_manager.lock().unwrap().console().clone(); 1808 let signals = Signals::new(&HANDLED_SIGNALS); 1809 match signals { 1810 Ok(signals) => { 1811 self.signals = Some(signals.handle()); 1812 let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?; 1813 let on_tty = self.on_tty; 1814 let signal_handler_seccomp_filter = 1815 get_seccomp_filter(&self.seccomp_action, Thread::SignalHandler) 
1816 .map_err(Error::CreateSeccompFilter)?; 1817 self.threads.push( 1818 thread::Builder::new() 1819 .name("signal_handler".to_string()) 1820 .spawn(move || { 1821 if !signal_handler_seccomp_filter.is_empty() { 1822 if let Err(e) = apply_filter(&signal_handler_seccomp_filter) 1823 .map_err(Error::ApplySeccompFilter) 1824 { 1825 error!("Error applying seccomp filter: {:?}", e); 1826 exit_evt.write(1).ok(); 1827 return; 1828 } 1829 } 1830 std::panic::catch_unwind(AssertUnwindSafe(|| { 1831 Vm::os_signal_handler(signals, console, on_tty, &exit_evt); 1832 })) 1833 .map_err(|_| { 1834 error!("signal_handler thead panicked"); 1835 exit_evt.write(1).ok() 1836 }) 1837 .ok(); 1838 }) 1839 .map_err(Error::SignalHandlerSpawn)?, 1840 ); 1841 } 1842 Err(e) => error!("Signal not found {}", e), 1843 } 1844 Ok(()) 1845 } 1846 1847 fn setup_tty(&self) -> Result<()> { 1848 if self.on_tty { 1849 io::stdin() 1850 .lock() 1851 .set_raw_mode() 1852 .map_err(Error::SetTerminalRaw)?; 1853 } 1854 1855 Ok(()) 1856 } 1857 1858 pub fn boot(&mut self) -> Result<()> { 1859 info!("Booting VM"); 1860 event!("vm", "booting"); 1861 let current_state = self.get_state()?; 1862 if current_state == VmState::Paused { 1863 return self.resume().map_err(Error::Resume); 1864 } 1865 1866 let new_state = VmState::Running; 1867 current_state.valid_transition(new_state)?; 1868 1869 // Load kernel if configured 1870 let entry_point = if self.kernel.as_ref().is_some() { 1871 Some(self.load_kernel()?) 
1872 } else { 1873 None 1874 }; 1875 1876 // The initial TDX configuration must be done before the vCPUs are 1877 // created 1878 #[cfg(feature = "tdx")] 1879 if self.config.lock().unwrap().tdx.is_some() { 1880 self.init_tdx()?; 1881 } 1882 1883 // Create and configure vcpus 1884 self.cpu_manager 1885 .lock() 1886 .unwrap() 1887 .create_boot_vcpus(entry_point) 1888 .map_err(Error::CpuManager)?; 1889 1890 #[cfg(feature = "tdx")] 1891 let sections = self.extract_tdvf_sections()?; 1892 1893 // Configuring the TDX regions requires that the vCPUs are created 1894 #[cfg(feature = "tdx")] 1895 let hob_address = if self.config.lock().unwrap().tdx.is_some() { 1896 self.populate_tdx_sections(§ions)? 1897 } else { 1898 None 1899 }; 1900 1901 // Configure shared state based on loaded kernel 1902 entry_point.map(|_| self.configure_system()).transpose()?; 1903 1904 #[cfg(feature = "tdx")] 1905 if let Some(hob_address) = hob_address { 1906 // With the HOB address extracted the vCPUs can have 1907 // their TDX state configured. 1908 self.cpu_manager 1909 .lock() 1910 .unwrap() 1911 .initialize_tdx(hob_address) 1912 .map_err(Error::CpuManager)?; 1913 self.init_tdx_memory(§ions)?; 1914 // With TDX memory and CPU state configured TDX setup is complete 1915 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 1916 } 1917 1918 self.cpu_manager 1919 .lock() 1920 .unwrap() 1921 .start_boot_vcpus() 1922 .map_err(Error::CpuManager)?; 1923 1924 self.setup_signal_handler()?; 1925 self.setup_tty()?; 1926 1927 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 1928 *state = new_state; 1929 event!("vm", "booted"); 1930 Ok(()) 1931 } 1932 1933 pub fn handle_pty(&self, event: EpollDispatch) -> Result<()> { 1934 // Could be a little dangerous, picks up a lock on device_manager 1935 // and goes into a blocking read. If the epoll loops starts to be 1936 // services by multiple threads likely need to revist this. 
1937 let dm = self.device_manager.lock().unwrap(); 1938 1939 if matches!(event, EpollDispatch::SerialPty) { 1940 if let Some(mut pty) = dm.serial_pty() { 1941 let mut out = [0u8; 64]; 1942 let count = pty.main.read(&mut out).map_err(Error::PtyConsole)?; 1943 let console = dm.console(); 1944 console 1945 .queue_input_bytes_serial(&out[..count]) 1946 .map_err(Error::Console)?; 1947 }; 1948 } 1949 1950 Ok(()) 1951 } 1952 1953 pub fn handle_stdin(&self) -> Result<()> { 1954 let mut out = [0u8; 64]; 1955 let count = io::stdin() 1956 .lock() 1957 .read_raw(&mut out) 1958 .map_err(Error::Console)?; 1959 1960 // Replace "\n" with "\r" to deal with Windows SAC (#1170) 1961 if count == 1 && out[0] == 0x0a { 1962 out[0] = 0x0d; 1963 } 1964 1965 if matches!( 1966 self.config.lock().unwrap().serial.mode, 1967 ConsoleOutputMode::Tty 1968 ) { 1969 self.device_manager 1970 .lock() 1971 .unwrap() 1972 .console() 1973 .queue_input_bytes_serial(&out[..count]) 1974 .map_err(Error::Console)?; 1975 } 1976 1977 Ok(()) 1978 } 1979 1980 /// Gets a thread-safe reference counted pointer to the VM configuration. 1981 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 1982 Arc::clone(&self.config) 1983 } 1984 1985 /// Get the VM state. Returns an error if the state is poisoned. 1986 pub fn get_state(&self) -> Result<VmState> { 1987 self.state 1988 .try_read() 1989 .map_err(|_| Error::PoisonedState) 1990 .map(|state| *state) 1991 } 1992 1993 /// Load saved clock from snapshot 1994 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 1995 pub fn load_clock_from_snapshot( 1996 &mut self, 1997 snapshot: &Snapshot, 1998 ) -> Result<Option<hypervisor::ClockData>> { 1999 let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?; 2000 self.saved_clock = vm_snapshot.clock; 2001 Ok(self.saved_clock) 2002 } 2003 2004 #[cfg(target_arch = "aarch64")] 2005 /// Add the vGIC section to the VM snapshot. 
2006 fn add_vgic_snapshot_section( 2007 &self, 2008 vm_snapshot: &mut Snapshot, 2009 ) -> std::result::Result<(), MigratableError> { 2010 let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states(); 2011 let gic_device = Arc::clone( 2012 self.device_manager 2013 .lock() 2014 .unwrap() 2015 .get_interrupt_controller() 2016 .unwrap() 2017 .lock() 2018 .unwrap() 2019 .get_gic_device() 2020 .unwrap(), 2021 ); 2022 2023 gic_device 2024 .lock() 2025 .unwrap() 2026 .set_gicr_typers(&saved_vcpu_states); 2027 2028 vm_snapshot.add_snapshot( 2029 if let Some(gicv3_its) = gic_device 2030 .lock() 2031 .unwrap() 2032 .as_any_concrete_mut() 2033 .downcast_mut::<KvmGicV3Its>() 2034 { 2035 gicv3_its.snapshot()? 2036 } else { 2037 return Err(MigratableError::Snapshot(anyhow!( 2038 "GicDevice downcast to KvmGicV3Its failed when snapshotting VM!" 2039 ))); 2040 }, 2041 ); 2042 2043 Ok(()) 2044 } 2045 2046 #[cfg(target_arch = "aarch64")] 2047 /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing. 2048 fn restore_vgic_and_enable_interrupt( 2049 &self, 2050 vm_snapshot: &Snapshot, 2051 ) -> std::result::Result<(), MigratableError> { 2052 let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states(); 2053 // The number of vCPUs is the same as the number of saved vCPU states. 2054 let vcpu_numbers = saved_vcpu_states.len(); 2055 2056 // Creating a GIC device here, as the GIC will not be created when 2057 // restoring the device manager. Note that currently only the bare GICv3 2058 // without ITS is supported. 2059 let mut gic_device = create_gic(&self.vm, vcpu_numbers.try_into().unwrap()) 2060 .map_err(|e| MigratableError::Restore(anyhow!("Could not create GIC: {:#?}", e)))?; 2061 2062 // Here we prepare the GICR_TYPER registers from the restored vCPU states. 
2063 gic_device.set_gicr_typers(&saved_vcpu_states); 2064 2065 let gic_device = Arc::new(Mutex::new(gic_device)); 2066 // Update the GIC entity in device manager 2067 self.device_manager 2068 .lock() 2069 .unwrap() 2070 .get_interrupt_controller() 2071 .unwrap() 2072 .lock() 2073 .unwrap() 2074 .set_gic_device(Arc::clone(&gic_device)); 2075 2076 // Restore GIC states. 2077 if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) { 2078 if let Some(gicv3_its) = gic_device 2079 .lock() 2080 .unwrap() 2081 .as_any_concrete_mut() 2082 .downcast_mut::<KvmGicV3Its>() 2083 { 2084 gicv3_its.restore(*gicv3_its_snapshot.clone())?; 2085 } else { 2086 return Err(MigratableError::Restore(anyhow!( 2087 "GicDevice downcast to KvmGicV3Its failed when restoring VM!" 2088 ))); 2089 }; 2090 } else { 2091 return Err(MigratableError::Restore(anyhow!( 2092 "Missing GicV3Its snapshot" 2093 ))); 2094 } 2095 2096 // Activate gic device 2097 self.device_manager 2098 .lock() 2099 .unwrap() 2100 .get_interrupt_controller() 2101 .unwrap() 2102 .lock() 2103 .unwrap() 2104 .enable() 2105 .map_err(|e| { 2106 MigratableError::Restore(anyhow!( 2107 "Could not enable interrupt controller routing: {:#?}", 2108 e 2109 )) 2110 })?; 2111 2112 Ok(()) 2113 } 2114 2115 /// Gets the actual size of the balloon. 
2116 pub fn balloon_size(&self) -> u64 { 2117 self.device_manager.lock().unwrap().balloon_size() 2118 } 2119 2120 pub fn receive_memory_regions<F>( 2121 &mut self, 2122 ranges: &MemoryRangeTable, 2123 fd: &mut F, 2124 ) -> std::result::Result<(), MigratableError> 2125 where 2126 F: Read, 2127 { 2128 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2129 let mem = guest_memory.memory(); 2130 2131 for range in ranges.regions() { 2132 mem.read_exact_from(GuestAddress(range.gpa), fd, range.length as usize) 2133 .map_err(|e| { 2134 MigratableError::MigrateReceive(anyhow!( 2135 "Error transferring memory to socket: {}", 2136 e 2137 )) 2138 })?; 2139 } 2140 Ok(()) 2141 } 2142 2143 pub fn send_memory_regions<F>( 2144 &mut self, 2145 ranges: &MemoryRangeTable, 2146 fd: &mut F, 2147 ) -> std::result::Result<(), MigratableError> 2148 where 2149 F: Write, 2150 { 2151 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2152 let mem = guest_memory.memory(); 2153 2154 for range in ranges.regions() { 2155 mem.write_all_to(GuestAddress(range.gpa), fd, range.length as usize) 2156 .map_err(|e| { 2157 MigratableError::MigrateSend(anyhow!( 2158 "Error transferring memory to socket: {}", 2159 e 2160 )) 2161 })?; 2162 } 2163 2164 Ok(()) 2165 } 2166 2167 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2168 let mut table = MemoryRangeTable::default(); 2169 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2170 2171 for region in guest_memory.memory().iter() { 2172 table.push(MemoryRange { 2173 gpa: region.start_addr().raw_value(), 2174 length: region.len() as u64, 2175 }); 2176 } 2177 2178 Ok(table) 2179 } 2180 2181 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2182 self.device_manager.lock().unwrap().device_tree() 2183 } 2184 2185 pub fn activate_virtio_devices(&self) -> Result<()> { 2186 self.device_manager 2187 .lock() 2188 .unwrap() 2189 
.activate_virtio_devices() 2190 .map_err(Error::ActivateVirtioDevices) 2191 } 2192 2193 #[cfg(target_arch = "x86_64")] 2194 pub fn power_button(&self) -> Result<()> { 2195 #[cfg(feature = "acpi")] 2196 return self 2197 .device_manager 2198 .lock() 2199 .unwrap() 2200 .notify_power_button() 2201 .map_err(Error::PowerButton); 2202 #[cfg(not(feature = "acpi"))] 2203 Err(Error::PowerButtonNotSupported) 2204 } 2205 2206 #[cfg(target_arch = "aarch64")] 2207 pub fn power_button(&self) -> Result<()> { 2208 self.device_manager 2209 .lock() 2210 .unwrap() 2211 .notify_power_button() 2212 .map_err(Error::PowerButton) 2213 } 2214 } 2215 2216 impl Pausable for Vm { 2217 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2218 event!("vm", "pausing"); 2219 let mut state = self 2220 .state 2221 .try_write() 2222 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?; 2223 let new_state = VmState::Paused; 2224 2225 state 2226 .valid_transition(new_state) 2227 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?; 2228 2229 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2230 { 2231 let mut clock = self 2232 .vm 2233 .get_clock() 2234 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?; 2235 // Reset clock flags. 
2236 clock.flags = 0; 2237 self.saved_clock = Some(clock); 2238 } 2239 self.cpu_manager.lock().unwrap().pause()?; 2240 self.device_manager.lock().unwrap().pause()?; 2241 2242 *state = new_state; 2243 2244 event!("vm", "paused"); 2245 Ok(()) 2246 } 2247 2248 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2249 event!("vm", "resuming"); 2250 let mut state = self 2251 .state 2252 .try_write() 2253 .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?; 2254 let new_state = VmState::Running; 2255 2256 state 2257 .valid_transition(new_state) 2258 .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?; 2259 2260 self.cpu_manager.lock().unwrap().resume()?; 2261 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2262 { 2263 if let Some(clock) = &self.saved_clock { 2264 self.vm.set_clock(clock).map_err(|e| { 2265 MigratableError::Resume(anyhow!("Could not set VM clock: {}", e)) 2266 })?; 2267 } 2268 } 2269 self.device_manager.lock().unwrap().resume()?; 2270 2271 // And we're back to the Running state. 
2272 *state = new_state; 2273 event!("vm", "resumed"); 2274 Ok(()) 2275 } 2276 } 2277 2278 #[derive(Serialize, Deserialize)] 2279 pub struct VmSnapshot { 2280 pub config: Arc<Mutex<VmConfig>>, 2281 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2282 pub clock: Option<hypervisor::ClockData>, 2283 pub state: Option<hypervisor::VmState>, 2284 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2285 pub common_cpuid: hypervisor::CpuId, 2286 } 2287 2288 pub const VM_SNAPSHOT_ID: &str = "vm"; 2289 impl Snapshottable for Vm { 2290 fn id(&self) -> String { 2291 VM_SNAPSHOT_ID.to_string() 2292 } 2293 2294 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2295 event!("vm", "snapshotting"); 2296 2297 #[cfg(feature = "tdx")] 2298 { 2299 if self.config.lock().unwrap().tdx.is_some() { 2300 return Err(MigratableError::Snapshot(anyhow!( 2301 "Snapshot not possible with TDX VM" 2302 ))); 2303 } 2304 } 2305 2306 let current_state = self.get_state().unwrap(); 2307 if current_state != VmState::Paused { 2308 return Err(MigratableError::Snapshot(anyhow!( 2309 "Trying to snapshot while VM is running" 2310 ))); 2311 } 2312 2313 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2314 let common_cpuid = { 2315 #[cfg(feature = "tdx")] 2316 let tdx_enabled = self.config.lock().unwrap().tdx.is_some(); 2317 let phys_bits = physical_bits( 2318 self.config.lock().unwrap().cpus.max_phys_bits, 2319 #[cfg(feature = "tdx")] 2320 tdx_enabled, 2321 ); 2322 arch::generate_common_cpuid( 2323 self.hypervisor.clone(), 2324 None, 2325 None, 2326 phys_bits, 2327 self.config.lock().unwrap().cpus.kvm_hyperv, 2328 #[cfg(feature = "tdx")] 2329 tdx_enabled, 2330 ) 2331 .map_err(|e| { 2332 MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e)) 2333 })? 
        };

        let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID);
        // Capture the hypervisor-level VM state (distinct from VmState).
        let vm_state = self
            .vm
            .state()
            .map_err(|e| MigratableError::Snapshot(e.into()))?;
        let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
            config: self.get_config(),
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            clock: self.saved_clock,
            state: Some(vm_state),
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            common_cpuid,
        })
        .map_err(|e| MigratableError::Snapshot(e.into()))?;

        // Aggregate each component's snapshot as a nested section.
        vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?);
        vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?);

        // aarch64 additionally needs the vGIC state saved.
        #[cfg(target_arch = "aarch64")]
        self.add_vgic_snapshot_section(&mut vm_snapshot)
            .map_err(|e| MigratableError::Snapshot(e.into()))?;

        vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?);
        vm_snapshot.add_data_section(SnapshotDataSection {
            id: format!("{}-section", VM_SNAPSHOT_ID),
            snapshot: vm_snapshot_data,
        });

        event!("vm", "snapshotted");
        Ok(vm_snapshot)
    }

    /// Restore the VM from a snapshot. Components are restored in order:
    /// memory manager, CPU manager, device manager, (aarch64) vGIC, then
    /// the devices themselves, and finally the vCPUs are started.
    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        event!("vm", "restoring");

        let current_state = self
            .get_state()
            .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?;
        let new_state = VmState::Paused;
        current_state.valid_transition(new_state).map_err(|e| {
            MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e))
        })?;

        // Guest memory must be back before CPUs/devices reference it.
        if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
            self.memory_manager
                .lock()
                .unwrap()
                .restore(*memory_manager_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            )));
        }

        if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) {
            self.cpu_manager
                .lock()
                .unwrap()
                .restore(*cpu_manager_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing CPU manager snapshot"
            )));
        }

        if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
            self.device_manager
                .lock()
                .unwrap()
                .restore(*device_manager_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing device manager snapshot"
            )));
        }

        #[cfg(target_arch = "aarch64")]
        self.restore_vgic_and_enable_interrupt(&snapshot)?;

        // Second pass over the same device manager section: the device
        // manager itself is restored above, the devices it owns are
        // restored here — deliberately after the vGIC on aarch64.
        if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
            self.device_manager
                .lock()
                .unwrap()
                .restore_devices(*device_manager_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing device manager snapshot"
            )));
        }

        // Now we can start all vCPUs from here.
2427 self.cpu_manager 2428 .lock() 2429 .unwrap() 2430 .start_restored_vcpus() 2431 .map_err(|e| { 2432 MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e)) 2433 })?; 2434 2435 self.setup_signal_handler().map_err(|e| { 2436 MigratableError::Restore(anyhow!("Could not setup signal handler: {:#?}", e)) 2437 })?; 2438 self.setup_tty() 2439 .map_err(|e| MigratableError::Restore(anyhow!("Could not setup tty: {:#?}", e)))?; 2440 2441 let mut state = self 2442 .state 2443 .try_write() 2444 .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?; 2445 *state = new_state; 2446 2447 event!("vm", "restored"); 2448 Ok(()) 2449 } 2450 } 2451 2452 impl Transportable for Vm { 2453 fn send( 2454 &self, 2455 snapshot: &Snapshot, 2456 destination_url: &str, 2457 ) -> std::result::Result<(), MigratableError> { 2458 let mut vm_snapshot_path = url_to_path(destination_url)?; 2459 vm_snapshot_path.push(VM_SNAPSHOT_FILE); 2460 2461 // Create the snapshot file 2462 let mut vm_snapshot_file = OpenOptions::new() 2463 .read(true) 2464 .write(true) 2465 .create_new(true) 2466 .open(vm_snapshot_path) 2467 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2468 2469 // Serialize and write the snapshot 2470 let vm_snapshot = 2471 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2472 2473 vm_snapshot_file 2474 .write(&vm_snapshot) 2475 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2476 2477 // Tell the memory manager to also send/write its own snapshot. 
2478 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2479 self.memory_manager 2480 .lock() 2481 .unwrap() 2482 .send(&*memory_manager_snapshot.clone(), destination_url)?; 2483 } else { 2484 return Err(MigratableError::Restore(anyhow!( 2485 "Missing memory manager snapshot" 2486 ))); 2487 } 2488 2489 Ok(()) 2490 } 2491 } 2492 2493 impl Migratable for Vm { 2494 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2495 self.memory_manager.lock().unwrap().start_dirty_log()?; 2496 self.device_manager.lock().unwrap().start_dirty_log() 2497 } 2498 2499 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2500 self.memory_manager.lock().unwrap().stop_dirty_log()?; 2501 self.device_manager.lock().unwrap().stop_dirty_log() 2502 } 2503 2504 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2505 Ok(MemoryRangeTable::new_from_tables(vec![ 2506 self.memory_manager.lock().unwrap().dirty_log()?, 2507 self.device_manager.lock().unwrap().dirty_log()?, 2508 ])) 2509 } 2510 2511 fn complete_migration(&mut self) -> std::result::Result<(), MigratableError> { 2512 self.memory_manager.lock().unwrap().complete_migration()?; 2513 self.device_manager.lock().unwrap().complete_migration() 2514 } 2515 } 2516 2517 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2518 #[cfg(test)] 2519 mod tests { 2520 use super::*; 2521 2522 fn test_vm_state_transitions(state: VmState) { 2523 match state { 2524 VmState::Created => { 2525 // Check the transitions from Created 2526 assert!(state.valid_transition(VmState::Created).is_err()); 2527 assert!(state.valid_transition(VmState::Running).is_ok()); 2528 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2529 assert!(state.valid_transition(VmState::Paused).is_ok()); 2530 } 2531 VmState::Running => { 2532 // Check the transitions from Running 2533 assert!(state.valid_transition(VmState::Created).is_err()); 2534 
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::gic::kvm::create_gic;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use vm_memory::GuestAddress;

    // MMIO window size used to space out the fake devices below.
    const LEN: u64 = 4096;

    // Build an FDT for a minimal guest (1 vCPU, serial + virtio + RTC
    // devices) and check that generation succeeds.
    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(
            GuestAddress(layout::RAM_64BIT_START),
            (layout::FDT_MAX_SIZE + 0x1000) as usize,
        )];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        // Fake MMIO device layout: serial at 0, virtio at LEN, RTC at 2*LEN.
        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo { addr: LEN, irq: 34 },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = create_gic(&vm, 1).unwrap();
        assert!(create_fdt(
            &mem,
            &CString::new("console=tty0").unwrap(),
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &*gic,
            &None,
            &(0x1_0000_0000, 0x1_0000),
            &BTreeMap::new(),
        )
        .is_ok())
    }
}

// Smoke test: run a tiny real-mode guest that adds two registers and
// prints the digit via port 0x3f8, then halts.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor.
    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len() as u64,
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Flat real-mode segments so RIP addresses guest-physical directly.
    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    // Bit 1 of RFLAGS must always be set.
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    // Run until the hlt triggers a Reset exit; any other exit is a failure.
    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {:?}", r),
        }
    }
}