1 // Copyright © 2020, Oracle and/or its affiliates. 2 // 3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 // 5 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved. 6 // Use of this source code is governed by a BSD-style license that can be 7 // found in the LICENSE-BSD-3-Clause file. 8 // 9 // Copyright © 2019 Intel Corporation 10 // 11 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause 12 // 13 14 #[cfg(feature = "acpi")] 15 use crate::config::NumaConfig; 16 use crate::config::{ 17 DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, ValidationError, 18 VmConfig, VsockConfig, 19 }; 20 use crate::cpu; 21 use crate::device_manager::{ 22 self, get_win_size, Console, DeviceManager, DeviceManagerError, PtyPair, 23 }; 24 use crate::device_tree::DeviceTree; 25 use crate::memory_manager::{Error as MemoryManagerError, MemoryManager}; 26 use crate::migration::{get_vm_snapshot, url_to_path, VM_SNAPSHOT_FILE}; 27 use crate::seccomp_filters::{get_seccomp_filter, Thread}; 28 use crate::{GuestMemoryMmap, GuestRegionMmap}; 29 use crate::{ 30 PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID, 31 }; 32 use anyhow::anyhow; 33 use arch::get_host_cpu_phys_bits; 34 #[cfg(feature = "tdx")] 35 use arch::x86_64::tdx::TdvfSection; 36 #[cfg(target_arch = "x86_64")] 37 use arch::x86_64::SgxEpcSection; 38 use arch::EntryPoint; 39 use devices::AcpiNotificationFlags; 40 use hypervisor::vm::{HypervisorVmError, VmmOps}; 41 use linux_loader::cmdline::Cmdline; 42 #[cfg(target_arch = "x86_64")] 43 use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent; 44 #[cfg(target_arch = "aarch64")] 45 use linux_loader::loader::pe::Error::InvalidImageMagicNumber; 46 use linux_loader::loader::KernelLoader; 47 use seccomp::{SeccompAction, SeccompFilter}; 48 use signal_hook::{ 49 consts::{SIGINT, SIGTERM, SIGWINCH}, 50 iterator::backend::Handle, 51 iterator::Signals, 52 }; 53 use 
std::cmp; 54 use std::collections::{BTreeMap, HashMap}; 55 use std::convert::TryInto; 56 use std::ffi::CString; 57 #[cfg(target_arch = "x86_64")] 58 use std::fmt; 59 use std::fs::{File, OpenOptions}; 60 use std::io::{self, Read, Write}; 61 use std::io::{Seek, SeekFrom}; 62 use std::num::Wrapping; 63 use std::ops::Deref; 64 use std::sync::{Arc, Mutex, RwLock}; 65 use std::{result, str, thread}; 66 use vm_device::Bus; 67 use vm_memory::{ 68 Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, 69 GuestMemoryRegion, 70 }; 71 use vm_migration::{ 72 protocol::{MemoryRange, MemoryRangeTable}, 73 Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable, 74 Transportable, 75 }; 76 use vmm_sys_util::eventfd::EventFd; 77 use vmm_sys_util::terminal::Terminal; 78 79 #[cfg(target_arch = "aarch64")] 80 use arch::aarch64::gic::gicv3_its::kvm::{KvmGicV3Its, GIC_V3_ITS_SNAPSHOT_ID}; 81 #[cfg(target_arch = "aarch64")] 82 use arch::aarch64::gic::kvm::create_gic; 83 #[cfg(target_arch = "aarch64")] 84 use devices::interrupt_controller::{self, InterruptController}; 85 86 /// Errors associated with VM management 87 #[derive(Debug)] 88 pub enum Error { 89 /// Cannot open the kernel image 90 KernelFile(io::Error), 91 92 /// Cannot open the initramfs image 93 InitramfsFile(io::Error), 94 95 /// Cannot load the kernel in memory 96 KernelLoad(linux_loader::loader::Error), 97 98 #[cfg(target_arch = "aarch64")] 99 /// Cannot load the UEFI binary in memory 100 UefiLoad(arch::aarch64::uefi::Error), 101 102 /// Cannot load the initramfs in memory 103 InitramfsLoad, 104 105 /// Cannot load the command line in memory 106 LoadCmdLine(linux_loader::loader::Error), 107 108 /// Cannot modify the command line 109 CmdLineInsertStr(linux_loader::cmdline::Error), 110 111 /// Cannot convert command line into CString 112 CmdLineCString(std::ffi::NulError), 113 114 /// Cannot configure system 115 ConfigureSystem(arch::Error), 116 117 /// Cannot enable 
interrupt controller 118 #[cfg(target_arch = "aarch64")] 119 EnableInterruptController(interrupt_controller::Error), 120 121 PoisonedState, 122 123 /// Cannot create a device manager. 124 DeviceManager(DeviceManagerError), 125 126 /// Write to the console failed. 127 Console(vmm_sys_util::errno::Error), 128 129 /// Write to the pty console failed. 130 PtyConsole(io::Error), 131 132 /// Cannot setup terminal in raw mode. 133 SetTerminalRaw(vmm_sys_util::errno::Error), 134 135 /// Cannot setup terminal in canonical mode. 136 SetTerminalCanon(vmm_sys_util::errno::Error), 137 138 /// Memory is overflow 139 MemOverflow, 140 141 /// Cannot spawn a signal handler thread 142 SignalHandlerSpawn(io::Error), 143 144 /// Failed to join on vCPU threads 145 ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>), 146 147 /// VM config is missing. 148 VmMissingConfig, 149 150 /// VM is not created 151 VmNotCreated, 152 153 /// VM is already created 154 VmAlreadyCreated, 155 156 /// VM is not running 157 VmNotRunning, 158 159 /// Cannot clone EventFd. 
160 EventFdClone(io::Error), 161 162 /// Invalid VM state transition 163 InvalidStateTransition(VmState, VmState), 164 165 /// Error from CPU handling 166 CpuManager(cpu::Error), 167 168 /// Cannot pause devices 169 PauseDevices(MigratableError), 170 171 /// Cannot resume devices 172 ResumeDevices(MigratableError), 173 174 /// Cannot pause CPUs 175 PauseCpus(MigratableError), 176 177 /// Cannot resume cpus 178 ResumeCpus(MigratableError), 179 180 /// Cannot pause VM 181 Pause(MigratableError), 182 183 /// Cannot resume VM 184 Resume(MigratableError), 185 186 /// Memory manager error 187 MemoryManager(MemoryManagerError), 188 189 /// Eventfd write error 190 EventfdError(std::io::Error), 191 192 /// Cannot snapshot VM 193 Snapshot(MigratableError), 194 195 /// Cannot restore VM 196 Restore(MigratableError), 197 198 /// Cannot send VM snapshot 199 SnapshotSend(MigratableError), 200 201 /// Cannot convert source URL from Path into &str 202 RestoreSourceUrlPathToStr, 203 204 /// Failed to validate config 205 ConfigValidation(ValidationError), 206 207 /// No more that one virtio-vsock device 208 TooManyVsockDevices, 209 210 /// Failed serializing into JSON 211 SerializeJson(serde_json::Error), 212 213 /// Invalid configuration for NUMA. 214 InvalidNumaConfig, 215 216 /// Cannot create seccomp filter 217 CreateSeccompFilter(seccomp::SeccompError), 218 219 /// Cannot apply seccomp filter 220 ApplySeccompFilter(seccomp::Error), 221 222 /// Failed resizing a memory zone. 
223 ResizeZone, 224 225 /// Cannot activate virtio devices 226 ActivateVirtioDevices(device_manager::DeviceManagerError), 227 228 /// Power button not supported 229 PowerButtonNotSupported, 230 231 /// Error triggering power button 232 PowerButton(device_manager::DeviceManagerError), 233 234 /// Kernel lacks PVH header 235 KernelMissingPvhHeader, 236 237 /// Error doing I/O on TDX firmware file 238 #[cfg(feature = "tdx")] 239 LoadTdvf(std::io::Error), 240 241 /// Error parsing TDVF 242 #[cfg(feature = "tdx")] 243 ParseTdvf(arch::x86_64::tdx::TdvfError), 244 245 /// Error populating HOB 246 #[cfg(feature = "tdx")] 247 PopulateHob(arch::x86_64::tdx::TdvfError), 248 249 /// Error allocating TDVF memory 250 #[cfg(feature = "tdx")] 251 AllocatingTdvfMemory(crate::memory_manager::Error), 252 253 /// Error enabling TDX VM 254 #[cfg(feature = "tdx")] 255 InitializeTdxVm(hypervisor::HypervisorVmError), 256 257 /// Error enabling TDX memory region 258 #[cfg(feature = "tdx")] 259 InitializeTdxMemoryRegion(hypervisor::HypervisorVmError), 260 261 /// Error finalizing TDX setup 262 #[cfg(feature = "tdx")] 263 FinalizeTdx(hypervisor::HypervisorVmError), 264 } 265 pub type Result<T> = result::Result<T, Error>; 266 267 #[derive(Clone, Default)] 268 pub struct NumaNode { 269 memory_regions: Vec<Arc<GuestRegionMmap>>, 270 hotplug_regions: Vec<Arc<GuestRegionMmap>>, 271 cpus: Vec<u8>, 272 distances: BTreeMap<u32, u8>, 273 memory_zones: Vec<String>, 274 #[cfg(target_arch = "x86_64")] 275 sgx_epc_sections: Vec<SgxEpcSection>, 276 } 277 278 impl NumaNode { 279 pub fn memory_regions(&self) -> &Vec<Arc<GuestRegionMmap>> { 280 &self.memory_regions 281 } 282 283 pub fn hotplug_regions(&self) -> &Vec<Arc<GuestRegionMmap>> { 284 &self.hotplug_regions 285 } 286 287 pub fn cpus(&self) -> &Vec<u8> { 288 &self.cpus 289 } 290 291 pub fn distances(&self) -> &BTreeMap<u32, u8> { 292 &self.distances 293 } 294 295 pub fn memory_zones(&self) -> &Vec<String> { 296 &self.memory_zones 297 } 298 299 
#[cfg(target_arch = "x86_64")] 300 pub fn sgx_epc_sections(&self) -> &Vec<SgxEpcSection> { 301 &self.sgx_epc_sections 302 } 303 } 304 305 pub type NumaNodes = BTreeMap<u32, NumaNode>; 306 307 #[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq)] 308 pub enum VmState { 309 Created, 310 Running, 311 Shutdown, 312 Paused, 313 } 314 315 impl VmState { 316 fn valid_transition(self, new_state: VmState) -> Result<()> { 317 match self { 318 VmState::Created => match new_state { 319 VmState::Created | VmState::Shutdown => { 320 Err(Error::InvalidStateTransition(self, new_state)) 321 } 322 VmState::Running | VmState::Paused => Ok(()), 323 }, 324 325 VmState::Running => match new_state { 326 VmState::Created | VmState::Running => { 327 Err(Error::InvalidStateTransition(self, new_state)) 328 } 329 VmState::Paused | VmState::Shutdown => Ok(()), 330 }, 331 332 VmState::Shutdown => match new_state { 333 VmState::Paused | VmState::Created | VmState::Shutdown => { 334 Err(Error::InvalidStateTransition(self, new_state)) 335 } 336 VmState::Running => Ok(()), 337 }, 338 339 VmState::Paused => match new_state { 340 VmState::Created | VmState::Paused => { 341 Err(Error::InvalidStateTransition(self, new_state)) 342 } 343 VmState::Running | VmState::Shutdown => Ok(()), 344 }, 345 } 346 } 347 } 348 349 // Debug I/O port 350 #[cfg(target_arch = "x86_64")] 351 const DEBUG_IOPORT: u16 = 0x80; 352 #[cfg(target_arch = "x86_64")] 353 const DEBUG_IOPORT_PREFIX: &str = "Debug I/O port"; 354 355 #[cfg(target_arch = "x86_64")] 356 /// Debug I/O port, see: 357 /// https://www.intel.com/content/www/us/en/support/articles/000005500/boards-and-kits.html 358 /// 359 /// Since we're not a physical platform, we can freely assign code ranges for 360 /// debugging specific parts of our virtual platform. 
361 pub enum DebugIoPortRange { 362 Firmware, 363 Bootloader, 364 Kernel, 365 Userspace, 366 Custom, 367 } 368 #[cfg(target_arch = "x86_64")] 369 impl DebugIoPortRange { 370 fn from_u8(value: u8) -> DebugIoPortRange { 371 match value { 372 0x00..=0x1f => DebugIoPortRange::Firmware, 373 0x20..=0x3f => DebugIoPortRange::Bootloader, 374 0x40..=0x5f => DebugIoPortRange::Kernel, 375 0x60..=0x7f => DebugIoPortRange::Userspace, 376 _ => DebugIoPortRange::Custom, 377 } 378 } 379 } 380 381 #[cfg(target_arch = "x86_64")] 382 impl fmt::Display for DebugIoPortRange { 383 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 384 match self { 385 DebugIoPortRange::Firmware => write!(f, "{}: Firmware", DEBUG_IOPORT_PREFIX), 386 DebugIoPortRange::Bootloader => write!(f, "{}: Bootloader", DEBUG_IOPORT_PREFIX), 387 DebugIoPortRange::Kernel => write!(f, "{}: Kernel", DEBUG_IOPORT_PREFIX), 388 DebugIoPortRange::Userspace => write!(f, "{}: Userspace", DEBUG_IOPORT_PREFIX), 389 DebugIoPortRange::Custom => write!(f, "{}: Custom", DEBUG_IOPORT_PREFIX), 390 } 391 } 392 } 393 394 struct VmOps { 395 memory: GuestMemoryAtomic<GuestMemoryMmap>, 396 #[cfg(target_arch = "x86_64")] 397 io_bus: Arc<Bus>, 398 mmio_bus: Arc<Bus>, 399 #[cfg(target_arch = "x86_64")] 400 timestamp: std::time::Instant, 401 } 402 403 impl VmOps { 404 #[cfg(target_arch = "x86_64")] 405 // Log debug io port codes. 
406 fn log_debug_ioport(&self, code: u8) { 407 let elapsed = self.timestamp.elapsed(); 408 409 debug!( 410 "[{} code 0x{:x}] {}.{:>06} seconds", 411 DebugIoPortRange::from_u8(code), 412 code, 413 elapsed.as_secs(), 414 elapsed.as_micros() 415 ); 416 } 417 } 418 419 impl VmmOps for VmOps { 420 fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> hypervisor::vm::Result<usize> { 421 self.memory 422 .memory() 423 .write(buf, GuestAddress(gpa)) 424 .map_err(|e| HypervisorVmError::GuestMemWrite(e.into())) 425 } 426 427 fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> hypervisor::vm::Result<usize> { 428 self.memory 429 .memory() 430 .read(buf, GuestAddress(gpa)) 431 .map_err(|e| HypervisorVmError::GuestMemRead(e.into())) 432 } 433 434 fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> { 435 if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) { 436 warn!("Guest MMIO read to unregistered address 0x{:x}", gpa); 437 } 438 Ok(()) 439 } 440 441 fn mmio_write(&self, gpa: u64, data: &[u8]) -> hypervisor::vm::Result<()> { 442 match self.mmio_bus.write(gpa, data) { 443 Err(vm_device::BusError::MissingAddressRange) => { 444 warn!("Guest MMIO write to unregistered address 0x{:x}", gpa); 445 } 446 Ok(Some(barrier)) => { 447 info!("Waiting for barrier"); 448 barrier.wait(); 449 info!("Barrier released"); 450 } 451 _ => {} 452 }; 453 Ok(()) 454 } 455 456 #[cfg(target_arch = "x86_64")] 457 fn pio_read(&self, port: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> { 458 if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) { 459 warn!("Guest PIO read to unregistered address 0x{:x}", port); 460 } 461 Ok(()) 462 } 463 464 #[cfg(target_arch = "x86_64")] 465 fn pio_write(&self, port: u64, data: &[u8]) -> hypervisor::vm::Result<()> { 466 if port == DEBUG_IOPORT as u64 && data.len() == 1 { 467 self.log_debug_ioport(data[0]); 468 return Ok(()); 469 } 470 471 match self.io_bus.write(port, data) { 472 
Err(vm_device::BusError::MissingAddressRange) => { 473 warn!("Guest PIO write to unregistered address 0x{:x}", port); 474 } 475 Ok(Some(barrier)) => { 476 info!("Waiting for barrier"); 477 barrier.wait(); 478 info!("Barrier released"); 479 } 480 _ => {} 481 }; 482 Ok(()) 483 } 484 } 485 486 pub fn physical_bits(max_phys_bits: Option<u8>, #[cfg(feature = "tdx")] tdx_enabled: bool) -> u8 { 487 #[cfg(not(feature = "tdx"))] 488 let host_phys_bits = get_host_cpu_phys_bits(); 489 #[cfg(feature = "tdx")] 490 let mut host_phys_bits = get_host_cpu_phys_bits(); 491 492 #[cfg(feature = "tdx")] 493 if tdx_enabled { 494 // When running TDX guest, the Guest Physical Address space is limited 495 // by a shared bit that is located on bit 47 for 4 level paging, and on 496 // bit 51 for 5 level paging (when GPAW bit is 1). In order to keep 497 // things simple, and since a 47 bits address space is 128TiB large, we 498 // ensure to limit the physical addressable space to 47 bits when 499 // runnning TDX. 500 host_phys_bits = std::cmp::min(host_phys_bits, 47) 501 } 502 503 cmp::min(host_phys_bits, max_phys_bits.unwrap_or(host_phys_bits)) 504 } 505 506 pub struct Vm { 507 kernel: Option<File>, 508 initramfs: Option<File>, 509 threads: Vec<thread::JoinHandle<()>>, 510 device_manager: Arc<Mutex<DeviceManager>>, 511 config: Arc<Mutex<VmConfig>>, 512 on_tty: bool, 513 signals: Option<Handle>, 514 state: RwLock<VmState>, 515 cpu_manager: Arc<Mutex<cpu::CpuManager>>, 516 memory_manager: Arc<Mutex<MemoryManager>>, 517 #[cfg_attr(not(feature = "kvm"), allow(dead_code))] 518 // The hypervisor abstracted virtual machine. 
519 vm: Arc<dyn hypervisor::Vm>, 520 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 521 saved_clock: Option<hypervisor::ClockData>, 522 #[cfg(feature = "acpi")] 523 numa_nodes: NumaNodes, 524 seccomp_action: SeccompAction, 525 exit_evt: EventFd, 526 } 527 528 impl Vm { 529 #[allow(clippy::too_many_arguments)] 530 fn new_from_memory_manager( 531 config: Arc<Mutex<VmConfig>>, 532 memory_manager: Arc<Mutex<MemoryManager>>, 533 vm: Arc<dyn hypervisor::Vm>, 534 exit_evt: EventFd, 535 reset_evt: EventFd, 536 seccomp_action: &SeccompAction, 537 hypervisor: Arc<dyn hypervisor::Hypervisor>, 538 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] _saved_clock: Option< 539 hypervisor::ClockData, 540 >, 541 activate_evt: EventFd, 542 ) -> Result<Self> { 543 config 544 .lock() 545 .unwrap() 546 .validate() 547 .map_err(Error::ConfigValidation)?; 548 549 info!("Booting VM from config: {:?}", &config); 550 551 // Create NUMA nodes based on NumaConfig. 552 #[cfg(feature = "acpi")] 553 let numa_nodes = 554 Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?; 555 556 #[cfg(feature = "tdx")] 557 let force_iommu = config.lock().unwrap().tdx.is_some(); 558 #[cfg(not(feature = "tdx"))] 559 let force_iommu = false; 560 561 let device_manager = DeviceManager::new( 562 vm.clone(), 563 config.clone(), 564 memory_manager.clone(), 565 &exit_evt, 566 &reset_evt, 567 seccomp_action.clone(), 568 #[cfg(feature = "acpi")] 569 numa_nodes.clone(), 570 &activate_evt, 571 force_iommu, 572 ) 573 .map_err(Error::DeviceManager)?; 574 575 let memory = memory_manager.lock().unwrap().guest_memory(); 576 #[cfg(target_arch = "x86_64")] 577 let io_bus = Arc::clone(device_manager.lock().unwrap().io_bus()); 578 let mmio_bus = Arc::clone(device_manager.lock().unwrap().mmio_bus()); 579 // Create the VmOps structure, which implements the VmmOps trait. 580 // And send it to the hypervisor. 
581 let vm_ops: Arc<Box<dyn VmmOps>> = Arc::new(Box::new(VmOps { 582 memory, 583 #[cfg(target_arch = "x86_64")] 584 io_bus, 585 mmio_bus, 586 #[cfg(target_arch = "x86_64")] 587 timestamp: std::time::Instant::now(), 588 })); 589 590 let exit_evt_clone = exit_evt.try_clone().map_err(Error::EventFdClone)?; 591 #[cfg(feature = "tdx")] 592 let tdx_enabled = config.lock().unwrap().tdx.is_some(); 593 let cpu_manager = cpu::CpuManager::new( 594 &config.lock().unwrap().cpus.clone(), 595 &device_manager, 596 &memory_manager, 597 vm.clone(), 598 exit_evt_clone, 599 reset_evt, 600 hypervisor, 601 seccomp_action.clone(), 602 vm_ops, 603 #[cfg(feature = "tdx")] 604 tdx_enabled, 605 #[cfg(feature = "acpi")] 606 &numa_nodes, 607 ) 608 .map_err(Error::CpuManager)?; 609 610 let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO as i32) } != 0; 611 let kernel = config 612 .lock() 613 .unwrap() 614 .kernel 615 .as_ref() 616 .map(|k| File::open(&k.path)) 617 .transpose() 618 .map_err(Error::KernelFile)?; 619 620 let initramfs = config 621 .lock() 622 .unwrap() 623 .initramfs 624 .as_ref() 625 .map(|i| File::open(&i.path)) 626 .transpose() 627 .map_err(Error::InitramfsFile)?; 628 629 Ok(Vm { 630 kernel, 631 initramfs, 632 device_manager, 633 config, 634 on_tty, 635 threads: Vec::with_capacity(1), 636 signals: None, 637 state: RwLock::new(VmState::Created), 638 cpu_manager, 639 memory_manager, 640 vm, 641 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 642 saved_clock: _saved_clock, 643 #[cfg(feature = "acpi")] 644 numa_nodes, 645 seccomp_action: seccomp_action.clone(), 646 exit_evt, 647 }) 648 } 649 650 #[cfg(feature = "acpi")] 651 fn create_numa_nodes( 652 configs: Option<Vec<NumaConfig>>, 653 memory_manager: &Arc<Mutex<MemoryManager>>, 654 ) -> Result<NumaNodes> { 655 let mm = memory_manager.lock().unwrap(); 656 let mm_zones = mm.memory_zones(); 657 let mut numa_nodes = BTreeMap::new(); 658 659 if let Some(configs) = &configs { 660 for config in configs.iter() { 661 if 
numa_nodes.contains_key(&config.guest_numa_id) { 662 error!("Can't define twice the same NUMA node"); 663 return Err(Error::InvalidNumaConfig); 664 } 665 666 let mut node = NumaNode::default(); 667 668 if let Some(memory_zones) = &config.memory_zones { 669 for memory_zone in memory_zones.iter() { 670 if let Some(mm_zone) = mm_zones.get(memory_zone) { 671 node.memory_regions.extend(mm_zone.regions().clone()); 672 if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() { 673 node.hotplug_regions.push(virtiomem_zone.region().clone()); 674 } 675 node.memory_zones.push(memory_zone.clone()); 676 } else { 677 error!("Unknown memory zone '{}'", memory_zone); 678 return Err(Error::InvalidNumaConfig); 679 } 680 } 681 } 682 683 if let Some(cpus) = &config.cpus { 684 node.cpus.extend(cpus); 685 } 686 687 if let Some(distances) = &config.distances { 688 for distance in distances.iter() { 689 let dest = distance.destination; 690 let dist = distance.distance; 691 692 if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) { 693 error!("Unknown destination NUMA node {}", dest); 694 return Err(Error::InvalidNumaConfig); 695 } 696 697 if node.distances.contains_key(&dest) { 698 error!("Destination NUMA node {} has been already set", dest); 699 return Err(Error::InvalidNumaConfig); 700 } 701 702 node.distances.insert(dest, dist); 703 } 704 } 705 706 #[cfg(target_arch = "x86_64")] 707 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 708 if let Some(sgx_epc_region) = mm.sgx_epc_region() { 709 let mm_sections = sgx_epc_region.epc_sections(); 710 for sgx_epc_section in sgx_epc_sections.iter() { 711 if let Some(mm_section) = mm_sections.get(sgx_epc_section) { 712 node.sgx_epc_sections.push(mm_section.clone()); 713 } else { 714 error!("Unknown SGX EPC section '{}'", sgx_epc_section); 715 return Err(Error::InvalidNumaConfig); 716 } 717 } 718 } else { 719 error!("Missing SGX EPC region"); 720 return Err(Error::InvalidNumaConfig); 721 } 722 } 723 724 
numa_nodes.insert(config.guest_numa_id, node); 725 } 726 } 727 728 Ok(numa_nodes) 729 } 730 731 #[allow(clippy::too_many_arguments)] 732 pub fn new( 733 config: Arc<Mutex<VmConfig>>, 734 exit_evt: EventFd, 735 reset_evt: EventFd, 736 seccomp_action: &SeccompAction, 737 hypervisor: Arc<dyn hypervisor::Hypervisor>, 738 activate_evt: EventFd, 739 serial_pty: Option<PtyPair>, 740 console_pty: Option<PtyPair>, 741 ) -> Result<Self> { 742 #[cfg(feature = "tdx")] 743 let tdx_enabled = config.lock().unwrap().tdx.is_some(); 744 hypervisor.check_required_extensions().unwrap(); 745 #[cfg(feature = "tdx")] 746 let vm = hypervisor 747 .create_vm_with_type(if tdx_enabled { 748 2 // KVM_X86_TDX_VM 749 } else { 750 0 // KVM_X86_LEGACY_VM 751 }) 752 .unwrap(); 753 #[cfg(not(feature = "tdx"))] 754 let vm = hypervisor.create_vm().unwrap(); 755 756 #[cfg(target_arch = "x86_64")] 757 vm.enable_split_irq().unwrap(); 758 let phys_bits = physical_bits( 759 config.lock().unwrap().cpus.max_phys_bits, 760 #[cfg(feature = "tdx")] 761 tdx_enabled, 762 ); 763 let memory_manager = MemoryManager::new( 764 vm.clone(), 765 &config.lock().unwrap().memory.clone(), 766 false, 767 phys_bits, 768 #[cfg(feature = "tdx")] 769 tdx_enabled, 770 ) 771 .map_err(Error::MemoryManager)?; 772 773 #[cfg(target_arch = "x86_64")] 774 { 775 if let Some(sgx_epc_config) = config.lock().unwrap().sgx_epc.clone() { 776 memory_manager 777 .lock() 778 .unwrap() 779 .setup_sgx(sgx_epc_config, &vm) 780 .map_err(Error::MemoryManager)?; 781 } 782 } 783 784 let new_vm = Vm::new_from_memory_manager( 785 config, 786 memory_manager, 787 vm, 788 exit_evt, 789 reset_evt, 790 seccomp_action, 791 hypervisor, 792 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 793 None, 794 activate_evt, 795 )?; 796 797 // The device manager must create the devices from here as it is part 798 // of the regular code path creating everything from scratch. 
799 new_vm 800 .device_manager 801 .lock() 802 .unwrap() 803 .create_devices(serial_pty, console_pty) 804 .map_err(Error::DeviceManager)?; 805 Ok(new_vm) 806 } 807 808 #[allow(clippy::too_many_arguments)] 809 pub fn new_from_snapshot( 810 snapshot: &Snapshot, 811 exit_evt: EventFd, 812 reset_evt: EventFd, 813 source_url: Option<&str>, 814 prefault: bool, 815 seccomp_action: &SeccompAction, 816 hypervisor: Arc<dyn hypervisor::Hypervisor>, 817 activate_evt: EventFd, 818 ) -> Result<Self> { 819 hypervisor.check_required_extensions().unwrap(); 820 let vm = hypervisor.create_vm().unwrap(); 821 #[cfg(target_arch = "x86_64")] 822 vm.enable_split_irq().unwrap(); 823 let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?; 824 let config = vm_snapshot.config; 825 if let Some(state) = vm_snapshot.state { 826 vm.set_state(state) 827 .map_err(|e| Error::Restore(MigratableError::Restore(e.into())))?; 828 } 829 830 let memory_manager = if let Some(memory_manager_snapshot) = 831 snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) 832 { 833 let phys_bits = physical_bits( 834 config.lock().unwrap().cpus.max_phys_bits, 835 #[cfg(feature = "tdx")] 836 config.lock().unwrap().tdx.is_some(), 837 ); 838 MemoryManager::new_from_snapshot( 839 memory_manager_snapshot, 840 vm.clone(), 841 &config.lock().unwrap().memory.clone(), 842 source_url, 843 prefault, 844 phys_bits, 845 ) 846 .map_err(Error::MemoryManager)? 
847 } else { 848 return Err(Error::Restore(MigratableError::Restore(anyhow!( 849 "Missing memory manager snapshot" 850 )))); 851 }; 852 853 Vm::new_from_memory_manager( 854 config, 855 memory_manager, 856 vm, 857 exit_evt, 858 reset_evt, 859 seccomp_action, 860 hypervisor, 861 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 862 vm_snapshot.clock, 863 activate_evt, 864 ) 865 } 866 867 pub fn new_from_migration( 868 config: Arc<Mutex<VmConfig>>, 869 exit_evt: EventFd, 870 reset_evt: EventFd, 871 seccomp_action: &SeccompAction, 872 hypervisor: Arc<dyn hypervisor::Hypervisor>, 873 activate_evt: EventFd, 874 ) -> Result<Self> { 875 hypervisor.check_required_extensions().unwrap(); 876 let vm = hypervisor.create_vm().unwrap(); 877 #[cfg(target_arch = "x86_64")] 878 vm.enable_split_irq().unwrap(); 879 let phys_bits = physical_bits( 880 config.lock().unwrap().cpus.max_phys_bits, 881 #[cfg(feature = "tdx")] 882 config.lock().unwrap().tdx.is_some(), 883 ); 884 885 let memory_manager = MemoryManager::new( 886 vm.clone(), 887 &config.lock().unwrap().memory.clone(), 888 false, 889 phys_bits, 890 #[cfg(feature = "tdx")] 891 false, 892 ) 893 .map_err(Error::MemoryManager)?; 894 895 Vm::new_from_memory_manager( 896 config, 897 memory_manager, 898 vm, 899 exit_evt, 900 reset_evt, 901 seccomp_action, 902 hypervisor, 903 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 904 None, 905 activate_evt, 906 ) 907 } 908 909 fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> { 910 let mut initramfs = self.initramfs.as_ref().unwrap(); 911 let size: usize = initramfs 912 .seek(SeekFrom::End(0)) 913 .map_err(|_| Error::InitramfsLoad)? 
914 .try_into() 915 .unwrap(); 916 initramfs 917 .seek(SeekFrom::Start(0)) 918 .map_err(|_| Error::InitramfsLoad)?; 919 920 let address = 921 arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?; 922 let address = GuestAddress(address); 923 924 guest_mem 925 .read_from(address, &mut initramfs, size) 926 .map_err(|_| Error::InitramfsLoad)?; 927 928 info!("Initramfs loaded: address = 0x{:x}", address.0); 929 Ok(arch::InitramfsConfig { address, size }) 930 } 931 932 fn get_cmdline(&mut self) -> Result<CString> { 933 let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE); 934 cmdline 935 .insert_str(self.config.lock().unwrap().cmdline.args.clone()) 936 .map_err(Error::CmdLineInsertStr)?; 937 for entry in self.device_manager.lock().unwrap().cmdline_additions() { 938 cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?; 939 } 940 CString::new(cmdline).map_err(Error::CmdLineCString) 941 } 942 943 #[cfg(target_arch = "aarch64")] 944 fn load_kernel(&mut self) -> Result<EntryPoint> { 945 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 946 let mem = guest_memory.memory(); 947 let mut kernel = self.kernel.as_ref().unwrap(); 948 let entry_addr = match linux_loader::loader::pe::PE::load( 949 mem.deref(), 950 Some(GuestAddress(arch::get_kernel_start())), 951 &mut kernel, 952 None, 953 ) { 954 Ok(entry_addr) => entry_addr, 955 // Try to load the binary as kernel PE file at first. 956 // If failed, retry to load it as UEFI binary. 957 // As the UEFI binary is formatless, it must be the last option to try. 958 Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => { 959 arch::aarch64::uefi::load_uefi( 960 mem.deref(), 961 GuestAddress(arch::get_uefi_start()), 962 &mut kernel, 963 ) 964 .map_err(Error::UefiLoad)?; 965 // The entry point offset in UEFI image is always 0. 
966 return Ok(EntryPoint { 967 entry_addr: GuestAddress(arch::get_uefi_start()), 968 }); 969 } 970 Err(e) => { 971 return Err(Error::KernelLoad(e)); 972 } 973 }; 974 975 let entry_point_addr: GuestAddress = entry_addr.kernel_load; 976 977 Ok(EntryPoint { 978 entry_addr: entry_point_addr, 979 }) 980 } 981 982 #[cfg(target_arch = "x86_64")] 983 fn load_kernel(&mut self) -> Result<EntryPoint> { 984 info!("Loading kernel"); 985 let cmdline_cstring = self.get_cmdline()?; 986 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 987 let mem = guest_memory.memory(); 988 let mut kernel = self.kernel.as_ref().unwrap(); 989 let entry_addr = match linux_loader::loader::elf::Elf::load( 990 mem.deref(), 991 None, 992 &mut kernel, 993 Some(arch::layout::HIGH_RAM_START), 994 ) { 995 Ok(entry_addr) => entry_addr, 996 Err(e) => { 997 return Err(Error::KernelLoad(e)); 998 } 999 }; 1000 1001 linux_loader::loader::load_cmdline( 1002 mem.deref(), 1003 arch::layout::CMDLINE_START, 1004 &cmdline_cstring, 1005 ) 1006 .map_err(Error::LoadCmdLine)?; 1007 1008 if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap { 1009 // Use the PVH kernel entry point to boot the guest 1010 info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0); 1011 Ok(EntryPoint { entry_addr }) 1012 } else { 1013 Err(Error::KernelMissingPvhHeader) 1014 } 1015 } 1016 1017 #[cfg(target_arch = "x86_64")] 1018 fn configure_system(&mut self) -> Result<()> { 1019 info!("Configuring system"); 1020 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1021 1022 let initramfs_config = match self.initramfs { 1023 Some(_) => Some(self.load_initramfs(&mem)?), 1024 None => None, 1025 }; 1026 1027 let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus(); 1028 1029 #[allow(unused_mut, unused_assignments)] 1030 let mut rsdp_addr: Option<GuestAddress> = None; 1031 1032 #[cfg(feature = "acpi")] 1033 { 1034 rsdp_addr = Some(crate::acpi::create_acpi_tables( 1035 &mem, 1036 
&self.device_manager, 1037 &self.cpu_manager, 1038 &self.memory_manager, 1039 &self.numa_nodes, 1040 )); 1041 info!( 1042 "Created ACPI tables: rsdp_addr = 0x{:x}", 1043 rsdp_addr.unwrap().0 1044 ); 1045 } 1046 1047 let sgx_epc_region = self 1048 .memory_manager 1049 .lock() 1050 .unwrap() 1051 .sgx_epc_region() 1052 .as_ref() 1053 .cloned(); 1054 1055 arch::configure_system( 1056 &mem, 1057 arch::layout::CMDLINE_START, 1058 &initramfs_config, 1059 boot_vcpus, 1060 rsdp_addr, 1061 sgx_epc_region, 1062 ) 1063 .map_err(Error::ConfigureSystem)?; 1064 Ok(()) 1065 } 1066 1067 #[cfg(target_arch = "aarch64")] 1068 fn configure_system(&mut self) -> Result<()> { 1069 let cmdline_cstring = self.get_cmdline()?; 1070 let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs(); 1071 let mem = self.memory_manager.lock().unwrap().boot_guest_memory(); 1072 let initramfs_config = match self.initramfs { 1073 Some(_) => Some(self.load_initramfs(&mem)?), 1074 None => None, 1075 }; 1076 1077 let device_info = &self 1078 .device_manager 1079 .lock() 1080 .unwrap() 1081 .get_device_info() 1082 .clone(); 1083 1084 let pci_space_start: GuestAddress = self 1085 .memory_manager 1086 .lock() 1087 .as_ref() 1088 .unwrap() 1089 .start_of_device_area(); 1090 1091 let pci_space_end: GuestAddress = self 1092 .memory_manager 1093 .lock() 1094 .as_ref() 1095 .unwrap() 1096 .end_of_device_area(); 1097 1098 let pci_space_size = pci_space_end 1099 .checked_offset_from(pci_space_start) 1100 .ok_or(Error::MemOverflow)? 
1101 + 1; 1102 1103 let pci_space = (pci_space_start.0, pci_space_size); 1104 1105 #[cfg(feature = "acpi")] 1106 { 1107 let _ = crate::acpi::create_acpi_tables( 1108 &mem, 1109 &self.device_manager, 1110 &self.cpu_manager, 1111 &self.memory_manager, 1112 &self.numa_nodes, 1113 ); 1114 } 1115 1116 let gic_device = create_gic( 1117 &self.memory_manager.lock().as_ref().unwrap().vm, 1118 self.cpu_manager.lock().unwrap().boot_vcpus() as u64, 1119 ) 1120 .map_err(|e| { 1121 Error::ConfigureSystem(arch::Error::AArch64Setup(arch::aarch64::Error::SetupGic(e))) 1122 })?; 1123 1124 arch::configure_system( 1125 &mem, 1126 &cmdline_cstring, 1127 vcpu_mpidrs, 1128 device_info, 1129 &initramfs_config, 1130 &pci_space, 1131 &*gic_device, 1132 ) 1133 .map_err(Error::ConfigureSystem)?; 1134 1135 // Update the GIC entity in device manager 1136 self.device_manager 1137 .lock() 1138 .unwrap() 1139 .get_interrupt_controller() 1140 .unwrap() 1141 .lock() 1142 .unwrap() 1143 .set_gic_device(Arc::new(Mutex::new(gic_device))); 1144 1145 // Activate gic device 1146 self.device_manager 1147 .lock() 1148 .unwrap() 1149 .get_interrupt_controller() 1150 .unwrap() 1151 .lock() 1152 .unwrap() 1153 .enable() 1154 .map_err(Error::EnableInterruptController)?; 1155 1156 Ok(()) 1157 } 1158 1159 pub fn serial_pty(&self) -> Option<PtyPair> { 1160 self.device_manager.lock().unwrap().serial_pty() 1161 } 1162 1163 pub fn console_pty(&self) -> Option<PtyPair> { 1164 self.device_manager.lock().unwrap().console_pty() 1165 } 1166 1167 pub fn shutdown(&mut self) -> Result<()> { 1168 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 1169 let new_state = VmState::Shutdown; 1170 1171 state.valid_transition(new_state)?; 1172 1173 if self.on_tty { 1174 // Don't forget to set the terminal in canonical mode 1175 // before to exit. 
1176 io::stdin() 1177 .lock() 1178 .set_canon_mode() 1179 .map_err(Error::SetTerminalCanon)?; 1180 } 1181 1182 // Trigger the termination of the signal_handler thread 1183 if let Some(signals) = self.signals.take() { 1184 signals.close(); 1185 } 1186 1187 // Wake up the DeviceManager threads so they will get terminated cleanly 1188 self.device_manager 1189 .lock() 1190 .unwrap() 1191 .resume() 1192 .map_err(Error::Resume)?; 1193 1194 self.cpu_manager 1195 .lock() 1196 .unwrap() 1197 .shutdown() 1198 .map_err(Error::CpuManager)?; 1199 1200 // Wait for all the threads to finish 1201 for thread in self.threads.drain(..) { 1202 thread.join().map_err(Error::ThreadCleanup)? 1203 } 1204 *state = new_state; 1205 1206 event!("vm", "shutdown"); 1207 1208 Ok(()) 1209 } 1210 1211 pub fn resize( 1212 &mut self, 1213 desired_vcpus: Option<u8>, 1214 desired_memory: Option<u64>, 1215 desired_balloon: Option<u64>, 1216 ) -> Result<()> { 1217 event!("vm", "resizing"); 1218 1219 if let Some(desired_vcpus) = desired_vcpus { 1220 if self 1221 .cpu_manager 1222 .lock() 1223 .unwrap() 1224 .resize(desired_vcpus) 1225 .map_err(Error::CpuManager)? 
1226 { 1227 self.device_manager 1228 .lock() 1229 .unwrap() 1230 .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED) 1231 .map_err(Error::DeviceManager)?; 1232 } 1233 self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus; 1234 } 1235 1236 if let Some(desired_memory) = desired_memory { 1237 let new_region = self 1238 .memory_manager 1239 .lock() 1240 .unwrap() 1241 .resize(desired_memory) 1242 .map_err(Error::MemoryManager)?; 1243 1244 let mut memory_config = &mut self.config.lock().unwrap().memory; 1245 1246 if let Some(new_region) = &new_region { 1247 self.device_manager 1248 .lock() 1249 .unwrap() 1250 .update_memory(new_region) 1251 .map_err(Error::DeviceManager)?; 1252 1253 match memory_config.hotplug_method { 1254 HotplugMethod::Acpi => { 1255 self.device_manager 1256 .lock() 1257 .unwrap() 1258 .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED) 1259 .map_err(Error::DeviceManager)?; 1260 } 1261 HotplugMethod::VirtioMem => {} 1262 } 1263 } 1264 1265 // We update the VM config regardless of the actual guest resize 1266 // operation result (happened or not), so that if the VM reboots 1267 // it will be running with the last configure memory size. 1268 match memory_config.hotplug_method { 1269 HotplugMethod::Acpi => memory_config.size = desired_memory, 1270 HotplugMethod::VirtioMem => { 1271 if desired_memory > memory_config.size { 1272 memory_config.hotplugged_size = Some(desired_memory - memory_config.size); 1273 } else { 1274 memory_config.hotplugged_size = None; 1275 } 1276 } 1277 } 1278 } 1279 1280 if let Some(desired_balloon) = desired_balloon { 1281 self.device_manager 1282 .lock() 1283 .unwrap() 1284 .resize_balloon(desired_balloon) 1285 .map_err(Error::DeviceManager)?; 1286 1287 // Update the configuration value for the balloon size to ensure 1288 // a reboot would use the right value. 
1289 if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon { 1290 balloon_config.size = desired_balloon; 1291 } 1292 } 1293 1294 event!("vm", "resized"); 1295 1296 Ok(()) 1297 } 1298 1299 pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> { 1300 let memory_config = &mut self.config.lock().unwrap().memory; 1301 1302 if let Some(zones) = &mut memory_config.zones { 1303 for zone in zones.iter_mut() { 1304 if zone.id == id { 1305 if desired_memory >= zone.size { 1306 let hotplugged_size = desired_memory - zone.size; 1307 self.memory_manager 1308 .lock() 1309 .unwrap() 1310 .resize_zone(&id, desired_memory - zone.size) 1311 .map_err(Error::MemoryManager)?; 1312 // We update the memory zone config regardless of the 1313 // actual 'resize-zone' operation result (happened or 1314 // not), so that if the VM reboots it will be running 1315 // with the last configured memory zone size. 1316 zone.hotplugged_size = Some(hotplugged_size); 1317 1318 return Ok(()); 1319 } else { 1320 error!( 1321 "Invalid to ask less ({}) than boot RAM ({}) for \ 1322 this memory zone", 1323 desired_memory, zone.size, 1324 ); 1325 return Err(Error::ResizeZone); 1326 } 1327 } 1328 } 1329 } 1330 1331 error!("Could not find the memory zone {} for the resize", id); 1332 Err(Error::ResizeZone) 1333 } 1334 1335 fn add_to_config<T>(devices: &mut Option<Vec<T>>, device: T) { 1336 if let Some(devices) = devices { 1337 devices.push(device); 1338 } else { 1339 *devices = Some(vec![device]); 1340 } 1341 } 1342 1343 pub fn add_device(&mut self, mut _device_cfg: DeviceConfig) -> Result<PciDeviceInfo> { 1344 { 1345 // Validate on a clone of the config 1346 let mut config = self.config.lock().unwrap().clone(); 1347 Self::add_to_config(&mut config.devices, _device_cfg.clone()); 1348 config.validate().map_err(Error::ConfigValidation)?; 1349 } 1350 1351 let pci_device_info = self 1352 .device_manager 1353 .lock() 1354 .unwrap() 1355 .add_device(&mut _device_cfg) 1356 
.map_err(Error::DeviceManager)?; 1357 1358 // Update VmConfig by adding the new device. This is important to 1359 // ensure the device would be created in case of a reboot. 1360 { 1361 let mut config = self.config.lock().unwrap(); 1362 Self::add_to_config(&mut config.devices, _device_cfg); 1363 } 1364 1365 self.device_manager 1366 .lock() 1367 .unwrap() 1368 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1369 .map_err(Error::DeviceManager)?; 1370 1371 Ok(pci_device_info) 1372 } 1373 1374 pub fn remove_device(&mut self, _id: String) -> Result<()> { 1375 self.device_manager 1376 .lock() 1377 .unwrap() 1378 .remove_device(_id.clone()) 1379 .map_err(Error::DeviceManager)?; 1380 1381 // Update VmConfig by removing the device. This is important to 1382 // ensure the device would not be created in case of a reboot. 1383 let mut config = self.config.lock().unwrap(); 1384 1385 // Remove if VFIO device 1386 if let Some(devices) = config.devices.as_mut() { 1387 devices.retain(|dev| dev.id.as_ref() != Some(&_id)); 1388 } 1389 1390 // Remove if disk device 1391 if let Some(disks) = config.disks.as_mut() { 1392 disks.retain(|dev| dev.id.as_ref() != Some(&_id)); 1393 } 1394 1395 // Remove if net device 1396 if let Some(net) = config.net.as_mut() { 1397 net.retain(|dev| dev.id.as_ref() != Some(&_id)); 1398 } 1399 1400 // Remove if pmem device 1401 if let Some(pmem) = config.pmem.as_mut() { 1402 pmem.retain(|dev| dev.id.as_ref() != Some(&_id)); 1403 } 1404 1405 // Remove if vsock device 1406 if let Some(vsock) = config.vsock.as_ref() { 1407 if vsock.id.as_ref() == Some(&_id) { 1408 config.vsock = None; 1409 } 1410 } 1411 1412 self.device_manager 1413 .lock() 1414 .unwrap() 1415 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1416 .map_err(Error::DeviceManager)?; 1417 Ok(()) 1418 } 1419 1420 pub fn add_disk(&mut self, mut _disk_cfg: DiskConfig) -> Result<PciDeviceInfo> { 1421 { 1422 // Validate on a clone of the config 1423 let mut config = 
self.config.lock().unwrap().clone(); 1424 Self::add_to_config(&mut config.disks, _disk_cfg.clone()); 1425 config.validate().map_err(Error::ConfigValidation)?; 1426 } 1427 1428 let pci_device_info = self 1429 .device_manager 1430 .lock() 1431 .unwrap() 1432 .add_disk(&mut _disk_cfg) 1433 .map_err(Error::DeviceManager)?; 1434 1435 // Update VmConfig by adding the new device. This is important to 1436 // ensure the device would be created in case of a reboot. 1437 { 1438 let mut config = self.config.lock().unwrap(); 1439 Self::add_to_config(&mut config.disks, _disk_cfg); 1440 } 1441 1442 self.device_manager 1443 .lock() 1444 .unwrap() 1445 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1446 .map_err(Error::DeviceManager)?; 1447 1448 Ok(pci_device_info) 1449 } 1450 1451 pub fn add_fs(&mut self, mut _fs_cfg: FsConfig) -> Result<PciDeviceInfo> { 1452 { 1453 // Validate on a clone of the config 1454 let mut config = self.config.lock().unwrap().clone(); 1455 Self::add_to_config(&mut config.fs, _fs_cfg.clone()); 1456 config.validate().map_err(Error::ConfigValidation)?; 1457 } 1458 1459 let pci_device_info = self 1460 .device_manager 1461 .lock() 1462 .unwrap() 1463 .add_fs(&mut _fs_cfg) 1464 .map_err(Error::DeviceManager)?; 1465 1466 // Update VmConfig by adding the new device. This is important to 1467 // ensure the device would be created in case of a reboot. 
1468 { 1469 let mut config = self.config.lock().unwrap(); 1470 Self::add_to_config(&mut config.fs, _fs_cfg); 1471 } 1472 1473 self.device_manager 1474 .lock() 1475 .unwrap() 1476 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1477 .map_err(Error::DeviceManager)?; 1478 1479 Ok(pci_device_info) 1480 } 1481 1482 pub fn add_pmem(&mut self, mut _pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> { 1483 { 1484 // Validate on a clone of the config 1485 let mut config = self.config.lock().unwrap().clone(); 1486 Self::add_to_config(&mut config.pmem, _pmem_cfg.clone()); 1487 config.validate().map_err(Error::ConfigValidation)?; 1488 } 1489 1490 let pci_device_info = self 1491 .device_manager 1492 .lock() 1493 .unwrap() 1494 .add_pmem(&mut _pmem_cfg) 1495 .map_err(Error::DeviceManager)?; 1496 1497 // Update VmConfig by adding the new device. This is important to 1498 // ensure the device would be created in case of a reboot. 1499 { 1500 let mut config = self.config.lock().unwrap(); 1501 Self::add_to_config(&mut config.pmem, _pmem_cfg); 1502 } 1503 1504 self.device_manager 1505 .lock() 1506 .unwrap() 1507 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1508 .map_err(Error::DeviceManager)?; 1509 1510 Ok(pci_device_info) 1511 } 1512 1513 pub fn add_net(&mut self, mut _net_cfg: NetConfig) -> Result<PciDeviceInfo> { 1514 { 1515 // Validate on a clone of the config 1516 let mut config = self.config.lock().unwrap().clone(); 1517 Self::add_to_config(&mut config.net, _net_cfg.clone()); 1518 config.validate().map_err(Error::ConfigValidation)?; 1519 } 1520 1521 let pci_device_info = self 1522 .device_manager 1523 .lock() 1524 .unwrap() 1525 .add_net(&mut _net_cfg) 1526 .map_err(Error::DeviceManager)?; 1527 1528 // Update VmConfig by adding the new device. This is important to 1529 // ensure the device would be created in case of a reboot. 
1530 { 1531 let mut config = self.config.lock().unwrap(); 1532 Self::add_to_config(&mut config.net, _net_cfg); 1533 } 1534 1535 self.device_manager 1536 .lock() 1537 .unwrap() 1538 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1539 .map_err(Error::DeviceManager)?; 1540 1541 Ok(pci_device_info) 1542 } 1543 1544 pub fn add_vsock(&mut self, mut _vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> { 1545 if self.config.lock().unwrap().vsock.is_some() { 1546 return Err(Error::TooManyVsockDevices); 1547 } 1548 1549 { 1550 // Validate on a clone of the config 1551 let mut config = self.config.lock().unwrap().clone(); 1552 config.vsock = Some(_vsock_cfg.clone()); 1553 config.validate().map_err(Error::ConfigValidation)?; 1554 } 1555 1556 let pci_device_info = self 1557 .device_manager 1558 .lock() 1559 .unwrap() 1560 .add_vsock(&mut _vsock_cfg) 1561 .map_err(Error::DeviceManager)?; 1562 1563 // Update VmConfig by adding the new device. This is important to 1564 // ensure the device would be created in case of a reboot. 
1565 { 1566 let mut config = self.config.lock().unwrap(); 1567 config.vsock = Some(_vsock_cfg); 1568 } 1569 1570 self.device_manager 1571 .lock() 1572 .unwrap() 1573 .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED) 1574 .map_err(Error::DeviceManager)?; 1575 1576 Ok(pci_device_info) 1577 } 1578 1579 pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> { 1580 Ok(self.device_manager.lock().unwrap().counters()) 1581 } 1582 1583 fn os_signal_handler( 1584 mut signals: Signals, 1585 console_input_clone: Arc<Console>, 1586 on_tty: bool, 1587 exit_evt: EventFd, 1588 ) { 1589 for signal in signals.forever() { 1590 match signal { 1591 SIGWINCH => { 1592 let (col, row) = get_win_size(); 1593 console_input_clone.update_console_size(col, row); 1594 } 1595 SIGTERM | SIGINT => { 1596 if on_tty { 1597 io::stdin() 1598 .lock() 1599 .set_canon_mode() 1600 .expect("failed to restore terminal mode"); 1601 } 1602 if exit_evt.write(1).is_err() { 1603 std::process::exit(1); 1604 } 1605 } 1606 _ => (), 1607 } 1608 } 1609 } 1610 1611 #[cfg(feature = "tdx")] 1612 fn init_tdx(&mut self) -> Result<()> { 1613 let cpuid = self.cpu_manager.lock().unwrap().common_cpuid(); 1614 let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32; 1615 self.vm 1616 .tdx_init(&cpuid, max_vcpus) 1617 .map_err(Error::InitializeTdxVm)?; 1618 Ok(()) 1619 } 1620 1621 #[cfg(feature = "tdx")] 1622 fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> { 1623 use arch::x86_64::tdx::*; 1624 // The TDVF file contains a table of section as well as code 1625 let mut firmware_file = 1626 File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware) 1627 .map_err(Error::LoadTdvf)?; 1628 1629 // For all the sections allocate some RAM backing them 1630 parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf) 1631 } 1632 1633 #[cfg(feature = "tdx")] 1634 fn populate_tdx_sections(&mut self, sections: &[TdvfSection]) -> Result<Option<u64>> { 1635 
use arch::x86_64::tdx::*; 1636 // Get the memory end *before* we start adding TDVF ram regions 1637 let boot_guest_memory = self 1638 .memory_manager 1639 .lock() 1640 .as_ref() 1641 .unwrap() 1642 .boot_guest_memory(); 1643 for section in sections { 1644 // No need to allocate if the section falls within guest RAM ranges 1645 if boot_guest_memory.address_in_range(GuestAddress(section.address)) { 1646 info!( 1647 "Not allocating TDVF Section: {:x?} since it is already part of guest RAM", 1648 section 1649 ); 1650 continue; 1651 } 1652 1653 info!("Allocating TDVF Section: {:x?}", section); 1654 self.memory_manager 1655 .lock() 1656 .unwrap() 1657 .add_ram_region(GuestAddress(section.address), section.size as usize) 1658 .map_err(Error::AllocatingTdvfMemory)?; 1659 } 1660 1661 // The TDVF file contains a table of section as well as code 1662 let mut firmware_file = 1663 File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware) 1664 .map_err(Error::LoadTdvf)?; 1665 1666 // The guest memory at this point now has all the required regions so it 1667 // is safe to copy from the TDVF file into it. 
1668 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1669 let mem = guest_memory.memory(); 1670 let mut hob_offset = None; 1671 for section in sections { 1672 info!("Populating TDVF Section: {:x?}", section); 1673 match section.r#type { 1674 TdvfSectionType::Bfv | TdvfSectionType::Cfv => { 1675 info!("Copying section to guest memory"); 1676 firmware_file 1677 .seek(SeekFrom::Start(section.data_offset as u64)) 1678 .map_err(Error::LoadTdvf)?; 1679 mem.read_from( 1680 GuestAddress(section.address), 1681 &mut firmware_file, 1682 section.data_size as usize, 1683 ) 1684 .unwrap(); 1685 } 1686 TdvfSectionType::TdHob => { 1687 hob_offset = Some(section.address); 1688 } 1689 _ => {} 1690 } 1691 } 1692 1693 // Generate HOB 1694 let mut hob = TdHob::start(hob_offset.unwrap()); 1695 1696 let mut sorted_sections = sections.to_vec(); 1697 sorted_sections.retain(|section| { 1698 !matches!(section.r#type, TdvfSectionType::Bfv | TdvfSectionType::Cfv) 1699 }); 1700 sorted_sections.sort_by_key(|section| section.address); 1701 sorted_sections.reverse(); 1702 let mut current_section = sorted_sections.pop(); 1703 1704 // RAM regions interleaved with TDVF sections 1705 let mut next_start_addr = 0; 1706 for region in boot_guest_memory.iter() { 1707 let region_start = region.start_addr().0; 1708 let region_end = region.last_addr().0; 1709 if region_start > next_start_addr { 1710 next_start_addr = region_start; 1711 } 1712 1713 loop { 1714 let (start, size, ram) = if let Some(section) = ¤t_section { 1715 if section.address <= next_start_addr { 1716 (section.address, section.size, false) 1717 } else { 1718 let last_addr = std::cmp::min(section.address - 1, region_end); 1719 (next_start_addr, last_addr - next_start_addr + 1, true) 1720 } 1721 } else { 1722 (next_start_addr, region_end - next_start_addr + 1, true) 1723 }; 1724 1725 hob.add_memory_resource(&mem, start, size, ram) 1726 .map_err(Error::PopulateHob)?; 1727 1728 if !ram { 1729 current_section = 
sorted_sections.pop(); 1730 } 1731 1732 next_start_addr = start + size; 1733 1734 if next_start_addr > region_end { 1735 break; 1736 } 1737 } 1738 } 1739 1740 // MMIO regions 1741 hob.add_mmio_resource( 1742 &mem, 1743 arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1744 arch::layout::APIC_START.raw_value() 1745 - arch::layout::MEM_32BIT_DEVICES_START.raw_value(), 1746 ) 1747 .map_err(Error::PopulateHob)?; 1748 let start_of_device_area = self 1749 .memory_manager 1750 .lock() 1751 .unwrap() 1752 .start_of_device_area() 1753 .raw_value(); 1754 let end_of_device_area = self 1755 .memory_manager 1756 .lock() 1757 .unwrap() 1758 .end_of_device_area() 1759 .raw_value(); 1760 hob.add_mmio_resource( 1761 &mem, 1762 start_of_device_area, 1763 end_of_device_area - start_of_device_area, 1764 ) 1765 .map_err(Error::PopulateHob)?; 1766 1767 hob.finish(&mem).map_err(Error::PopulateHob)?; 1768 1769 Ok(hob_offset) 1770 } 1771 1772 #[cfg(feature = "tdx")] 1773 fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> { 1774 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 1775 let mem = guest_memory.memory(); 1776 1777 for section in sections { 1778 self.vm 1779 .tdx_init_memory_region( 1780 mem.get_host_address(GuestAddress(section.address)).unwrap() as u64, 1781 section.address, 1782 section.size, 1783 /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */ 1784 section.attributes == 1, 1785 ) 1786 .map_err(Error::InitializeTdxMemoryRegion)?; 1787 } 1788 Ok(()) 1789 } 1790 1791 pub fn boot(&mut self) -> Result<()> { 1792 info!("Booting VM"); 1793 event!("vm", "booting"); 1794 let current_state = self.get_state()?; 1795 if current_state == VmState::Paused { 1796 return self.resume().map_err(Error::Resume); 1797 } 1798 1799 let new_state = VmState::Running; 1800 current_state.valid_transition(new_state)?; 1801 1802 // Load kernel if configured 1803 let entry_point = if self.kernel.as_ref().is_some() { 1804 Some(self.load_kernel()?) 
1805 } else { 1806 None 1807 }; 1808 1809 // The initial TDX configuration must be done before the vCPUs are 1810 // created 1811 #[cfg(feature = "tdx")] 1812 if self.config.lock().unwrap().tdx.is_some() { 1813 self.init_tdx()?; 1814 } 1815 1816 // Create and configure vcpus 1817 self.cpu_manager 1818 .lock() 1819 .unwrap() 1820 .create_boot_vcpus(entry_point) 1821 .map_err(Error::CpuManager)?; 1822 1823 #[cfg(feature = "tdx")] 1824 let sections = self.extract_tdvf_sections()?; 1825 1826 // Configuring the TDX regions requires that the vCPUs are created 1827 #[cfg(feature = "tdx")] 1828 let hob_address = if self.config.lock().unwrap().tdx.is_some() { 1829 self.populate_tdx_sections(§ions)? 1830 } else { 1831 None 1832 }; 1833 1834 // Configure shared state based on loaded kernel 1835 entry_point.map(|_| self.configure_system()).transpose()?; 1836 1837 #[cfg(feature = "tdx")] 1838 if let Some(hob_address) = hob_address { 1839 // With the HOB address extracted the vCPUs can have 1840 // their TDX state configured. 
1841 self.cpu_manager 1842 .lock() 1843 .unwrap() 1844 .initialize_tdx(hob_address) 1845 .map_err(Error::CpuManager)?; 1846 self.init_tdx_memory(§ions)?; 1847 // With TDX memory and CPU state configured TDX setup is complete 1848 self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?; 1849 } 1850 1851 self.cpu_manager 1852 .lock() 1853 .unwrap() 1854 .start_boot_vcpus() 1855 .map_err(Error::CpuManager)?; 1856 1857 if self 1858 .device_manager 1859 .lock() 1860 .unwrap() 1861 .console() 1862 .input_enabled() 1863 { 1864 let console = self.device_manager.lock().unwrap().console().clone(); 1865 let signals = Signals::new(&[SIGWINCH, SIGINT, SIGTERM]); 1866 match signals { 1867 Ok(signals) => { 1868 self.signals = Some(signals.handle()); 1869 let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?; 1870 let on_tty = self.on_tty; 1871 let signal_handler_seccomp_filter = 1872 get_seccomp_filter(&self.seccomp_action, Thread::SignalHandler) 1873 .map_err(Error::CreateSeccompFilter)?; 1874 self.threads.push( 1875 thread::Builder::new() 1876 .name("signal_handler".to_string()) 1877 .spawn(move || { 1878 if let Err(e) = SeccompFilter::apply(signal_handler_seccomp_filter) 1879 .map_err(Error::ApplySeccompFilter) 1880 { 1881 error!("Error applying seccomp filter: {:?}", e); 1882 return; 1883 } 1884 1885 Vm::os_signal_handler(signals, console, on_tty, exit_evt); 1886 }) 1887 .map_err(Error::SignalHandlerSpawn)?, 1888 ); 1889 } 1890 Err(e) => error!("Signal not found {}", e), 1891 } 1892 1893 if self.on_tty { 1894 io::stdin() 1895 .lock() 1896 .set_raw_mode() 1897 .map_err(Error::SetTerminalRaw)?; 1898 } 1899 } 1900 1901 let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?; 1902 *state = new_state; 1903 event!("vm", "booted"); 1904 Ok(()) 1905 } 1906 1907 pub fn handle_pty(&self) -> Result<()> { 1908 // Could be a little dangerous, picks up a lock on device_manager 1909 // and goes into a blocking read. 
If the epoll loops starts to be 1910 // services by multiple threads likely need to revist this. 1911 let dm = self.device_manager.lock().unwrap(); 1912 let mut out = [0u8; 64]; 1913 if let Some(mut pty) = dm.serial_pty() { 1914 let count = pty.main.read(&mut out).map_err(Error::PtyConsole)?; 1915 let console = dm.console(); 1916 if console.input_enabled() { 1917 console 1918 .queue_input_bytes_serial(&out[..count]) 1919 .map_err(Error::Console)?; 1920 } 1921 }; 1922 let count = match dm.console_pty() { 1923 Some(mut pty) => pty.main.read(&mut out).map_err(Error::PtyConsole)?, 1924 None => return Ok(()), 1925 }; 1926 let console = dm.console(); 1927 if console.input_enabled() { 1928 console.queue_input_bytes_console(&out[..count]) 1929 } 1930 1931 Ok(()) 1932 } 1933 1934 pub fn handle_stdin(&self) -> Result<()> { 1935 let mut out = [0u8; 64]; 1936 let count = io::stdin() 1937 .lock() 1938 .read_raw(&mut out) 1939 .map_err(Error::Console)?; 1940 1941 // Replace "\n" with "\r" to deal with Windows SAC (#1170) 1942 if count == 1 && out[0] == 0x0a { 1943 out[0] = 0x0d; 1944 } 1945 1946 if self 1947 .device_manager 1948 .lock() 1949 .unwrap() 1950 .console() 1951 .input_enabled() 1952 { 1953 self.device_manager 1954 .lock() 1955 .unwrap() 1956 .console() 1957 .queue_input_bytes(&out[..count]) 1958 .map_err(Error::Console)?; 1959 } 1960 1961 Ok(()) 1962 } 1963 1964 /// Gets a thread-safe reference counted pointer to the VM configuration. 1965 pub fn get_config(&self) -> Arc<Mutex<VmConfig>> { 1966 Arc::clone(&self.config) 1967 } 1968 1969 /// Get the VM state. Returns an error if the state is poisoned. 
    pub fn get_state(&self) -> Result<VmState> {
        // try_read rather than read: a poisoned/contended lock becomes an
        // error instead of blocking the caller.
        self.state
            .try_read()
            .map_err(|_| Error::PoisonedState)
            .map(|state| *state)
    }

    /// Load saved clock from snapshot
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub fn load_clock_from_snapshot(
        &mut self,
        snapshot: &Snapshot,
    ) -> Result<Option<hypervisor::ClockData>> {
        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        // Cache the clock so resume() can re-apply it later.
        self.saved_clock = vm_snapshot.clock;
        Ok(self.saved_clock)
    }

    #[cfg(target_arch = "aarch64")]
    /// Add the vGIC section to the VM snapshot.
    fn add_vgic_snapshot_section(
        &self,
        vm_snapshot: &mut Snapshot,
    ) -> std::result::Result<(), MigratableError> {
        let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
        let gic_device = Arc::clone(
            self.device_manager
                .lock()
                .unwrap()
                .get_interrupt_controller()
                .unwrap()
                .lock()
                .unwrap()
                .get_gic_device()
                .unwrap(),
        );

        // Feed the saved vCPU states to the GIC before snapshotting it,
        // presumably so the redistributor (GICR_TYPER) state matches the
        // vCPUs — mirrors restore_vgic_and_enable_interrupt below.
        gic_device
            .lock()
            .unwrap()
            .set_gicr_typers(&saved_vcpu_states);

        // Downcast to the concrete KVM GICv3-ITS type to snapshot it.
        vm_snapshot.add_snapshot(
            gic_device
                .lock()
                .unwrap()
                .as_any_concrete_mut()
                .downcast_mut::<KvmGicV3Its>()
                .unwrap()
                .snapshot()?,
        );

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing.
    fn restore_vgic_and_enable_interrupt(
        &self,
        vm_snapshot: &Snapshot,
    ) -> std::result::Result<(), MigratableError> {
        let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
        // The number of vCPUs is the same as the number of saved vCPU states.
        let vcpu_numbers = saved_vcpu_states.len();

        // Creating a GIC device here, as the GIC will not be created when
        // restoring the device manager. Note that currently only the bare GICv3
        // without ITS is supported.
        let mut gic_device = create_gic(&self.vm, vcpu_numbers.try_into().unwrap())
            .map_err(|e| MigratableError::Restore(anyhow!("Could not create GIC: {:#?}", e)))?;

        // Here we prepare the GICR_TYPER registers from the restored vCPU states.
        gic_device.set_gicr_typers(&saved_vcpu_states);

        let gic_device = Arc::new(Mutex::new(gic_device));
        // Update the GIC entity in device manager
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gic_device(Arc::clone(&gic_device));

        // Restore GIC states.
        if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) {
            gic_device
                .lock()
                .unwrap()
                .as_any_concrete_mut()
                .downcast_mut::<KvmGicV3Its>()
                .unwrap()
                .restore(*gicv3_its_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing GicV3Its snapshot"
            )));
        }

        // Activate gic device
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .enable()
            .map_err(|e| {
                MigratableError::Restore(anyhow!(
                    "Could not enable interrupt controller routing: {:#?}",
                    e
                ))
            })?;

        Ok(())
    }

    /// Gets the actual size of the balloon.
2090 pub fn balloon_size(&self) -> u64 { 2091 self.device_manager.lock().unwrap().balloon_size() 2092 } 2093 2094 pub fn receive_memory_regions<F>( 2095 &mut self, 2096 ranges: &MemoryRangeTable, 2097 fd: &mut F, 2098 ) -> std::result::Result<(), MigratableError> 2099 where 2100 F: Read, 2101 { 2102 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2103 let mem = guest_memory.memory(); 2104 2105 for range in ranges.regions() { 2106 mem.read_exact_from(GuestAddress(range.gpa), fd, range.length as usize) 2107 .map_err(|e| { 2108 MigratableError::MigrateReceive(anyhow!( 2109 "Error transferring memory to socket: {}", 2110 e 2111 )) 2112 })?; 2113 } 2114 Ok(()) 2115 } 2116 2117 pub fn send_memory_regions<F>( 2118 &mut self, 2119 ranges: &MemoryRangeTable, 2120 fd: &mut F, 2121 ) -> std::result::Result<(), MigratableError> 2122 where 2123 F: Write, 2124 { 2125 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2126 let mem = guest_memory.memory(); 2127 2128 for range in ranges.regions() { 2129 mem.write_all_to(GuestAddress(range.gpa), fd, range.length as usize) 2130 .map_err(|e| { 2131 MigratableError::MigrateSend(anyhow!( 2132 "Error transferring memory to socket: {}", 2133 e 2134 )) 2135 })?; 2136 } 2137 2138 Ok(()) 2139 } 2140 2141 pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2142 let mut table = MemoryRangeTable::default(); 2143 let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory(); 2144 2145 for region in guest_memory.memory().iter() { 2146 table.push(MemoryRange { 2147 gpa: region.start_addr().raw_value(), 2148 length: region.len() as u64, 2149 }); 2150 } 2151 2152 Ok(table) 2153 } 2154 2155 pub fn start_memory_dirty_log(&self) -> std::result::Result<(), MigratableError> { 2156 self.memory_manager.lock().unwrap().start_memory_dirty_log() 2157 } 2158 2159 pub fn dirty_memory_range_table( 2160 &self, 2161 ) -> 
std::result::Result<MemoryRangeTable, MigratableError> { 2162 self.memory_manager 2163 .lock() 2164 .unwrap() 2165 .dirty_memory_range_table() 2166 } 2167 2168 pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> { 2169 self.device_manager.lock().unwrap().device_tree() 2170 } 2171 2172 pub fn activate_virtio_devices(&self) -> Result<()> { 2173 self.device_manager 2174 .lock() 2175 .unwrap() 2176 .activate_virtio_devices() 2177 .map_err(Error::ActivateVirtioDevices) 2178 } 2179 2180 #[cfg(target_arch = "x86_64")] 2181 pub fn power_button(&self) -> Result<()> { 2182 #[cfg(feature = "acpi")] 2183 return self 2184 .device_manager 2185 .lock() 2186 .unwrap() 2187 .notify_power_button() 2188 .map_err(Error::PowerButton); 2189 #[cfg(not(feature = "acpi"))] 2190 Err(Error::PowerButtonNotSupported) 2191 } 2192 2193 #[cfg(target_arch = "aarch64")] 2194 pub fn power_button(&self) -> Result<()> { 2195 self.device_manager 2196 .lock() 2197 .unwrap() 2198 .notify_power_button() 2199 .map_err(Error::PowerButton) 2200 } 2201 } 2202 2203 impl Pausable for Vm { 2204 fn pause(&mut self) -> std::result::Result<(), MigratableError> { 2205 event!("vm", "pausing"); 2206 let mut state = self 2207 .state 2208 .try_write() 2209 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?; 2210 let new_state = VmState::Paused; 2211 2212 state 2213 .valid_transition(new_state) 2214 .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?; 2215 2216 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2217 { 2218 let mut clock = self 2219 .vm 2220 .get_clock() 2221 .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?; 2222 // Reset clock flags. 
2223 clock.flags = 0; 2224 self.saved_clock = Some(clock); 2225 } 2226 self.cpu_manager.lock().unwrap().pause()?; 2227 self.device_manager.lock().unwrap().pause()?; 2228 2229 *state = new_state; 2230 2231 event!("vm", "paused"); 2232 Ok(()) 2233 } 2234 2235 fn resume(&mut self) -> std::result::Result<(), MigratableError> { 2236 event!("vm", "resuming"); 2237 let mut state = self 2238 .state 2239 .try_write() 2240 .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?; 2241 let new_state = VmState::Running; 2242 2243 state 2244 .valid_transition(new_state) 2245 .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?; 2246 2247 self.cpu_manager.lock().unwrap().resume()?; 2248 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2249 { 2250 if let Some(clock) = &self.saved_clock { 2251 self.vm.set_clock(clock).map_err(|e| { 2252 MigratableError::Resume(anyhow!("Could not set VM clock: {}", e)) 2253 })?; 2254 } 2255 } 2256 self.device_manager.lock().unwrap().resume()?; 2257 2258 // And we're back to the Running state. 
2259 *state = new_state; 2260 event!("vm", "resumed"); 2261 Ok(()) 2262 } 2263 } 2264 2265 #[derive(Serialize, Deserialize)] 2266 pub struct VmSnapshot { 2267 pub config: Arc<Mutex<VmConfig>>, 2268 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2269 pub clock: Option<hypervisor::ClockData>, 2270 pub state: Option<hypervisor::VmState>, 2271 } 2272 2273 pub const VM_SNAPSHOT_ID: &str = "vm"; 2274 impl Snapshottable for Vm { 2275 fn id(&self) -> String { 2276 VM_SNAPSHOT_ID.to_string() 2277 } 2278 2279 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 2280 event!("vm", "snapshotting"); 2281 2282 #[cfg(feature = "tdx")] 2283 { 2284 if self.config.lock().unwrap().tdx.is_some() { 2285 return Err(MigratableError::Snapshot(anyhow!( 2286 "Snapshot not possible with TDX VM" 2287 ))); 2288 } 2289 } 2290 2291 let current_state = self.get_state().unwrap(); 2292 if current_state != VmState::Paused { 2293 return Err(MigratableError::Snapshot(anyhow!( 2294 "Trying to snapshot while VM is running" 2295 ))); 2296 } 2297 2298 let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID); 2299 let vm_state = self 2300 .vm 2301 .state() 2302 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2303 let vm_snapshot_data = serde_json::to_vec(&VmSnapshot { 2304 config: self.get_config(), 2305 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2306 clock: self.saved_clock, 2307 state: Some(vm_state), 2308 }) 2309 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2310 2311 vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?); 2312 vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?); 2313 2314 #[cfg(target_arch = "aarch64")] 2315 self.add_vgic_snapshot_section(&mut vm_snapshot) 2316 .map_err(|e| MigratableError::Snapshot(e.into()))?; 2317 2318 vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?); 2319 vm_snapshot.add_data_section(SnapshotDataSection { 2320 id: format!("{}-section", VM_SNAPSHOT_ID), 2321 
snapshot: vm_snapshot_data, 2322 }); 2323 2324 event!("vm", "snapshotted"); 2325 Ok(vm_snapshot) 2326 } 2327 2328 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 2329 event!("vm", "restoring"); 2330 2331 let current_state = self 2332 .get_state() 2333 .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?; 2334 let new_state = VmState::Paused; 2335 current_state.valid_transition(new_state).map_err(|e| { 2336 MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e)) 2337 })?; 2338 2339 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2340 self.memory_manager 2341 .lock() 2342 .unwrap() 2343 .restore(*memory_manager_snapshot.clone())?; 2344 } else { 2345 return Err(MigratableError::Restore(anyhow!( 2346 "Missing memory manager snapshot" 2347 ))); 2348 } 2349 2350 if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) { 2351 self.cpu_manager 2352 .lock() 2353 .unwrap() 2354 .restore(*cpu_manager_snapshot.clone())?; 2355 } else { 2356 return Err(MigratableError::Restore(anyhow!( 2357 "Missing CPU manager snapshot" 2358 ))); 2359 } 2360 2361 if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) { 2362 self.device_manager 2363 .lock() 2364 .unwrap() 2365 .restore(*device_manager_snapshot.clone())?; 2366 } else { 2367 return Err(MigratableError::Restore(anyhow!( 2368 "Missing device manager snapshot" 2369 ))); 2370 } 2371 2372 #[cfg(target_arch = "aarch64")] 2373 self.restore_vgic_and_enable_interrupt(&snapshot)?; 2374 2375 if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) { 2376 self.device_manager 2377 .lock() 2378 .unwrap() 2379 .restore_devices(*device_manager_snapshot.clone())?; 2380 } else { 2381 return Err(MigratableError::Restore(anyhow!( 2382 "Missing device manager snapshot" 2383 ))); 2384 } 2385 2386 // Now we can start all vCPUs 
from here. 2387 self.cpu_manager 2388 .lock() 2389 .unwrap() 2390 .start_restored_vcpus() 2391 .map_err(|e| { 2392 MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e)) 2393 })?; 2394 2395 if self 2396 .device_manager 2397 .lock() 2398 .unwrap() 2399 .console() 2400 .input_enabled() 2401 { 2402 let console = self.device_manager.lock().unwrap().console().clone(); 2403 let signals = Signals::new(&[SIGWINCH, SIGINT, SIGTERM]); 2404 match signals { 2405 Ok(signals) => { 2406 self.signals = Some(signals.handle()); 2407 2408 let on_tty = self.on_tty; 2409 let signal_handler_seccomp_filter = 2410 get_seccomp_filter(&self.seccomp_action, Thread::SignalHandler).map_err( 2411 |e| { 2412 MigratableError::Restore(anyhow!( 2413 "Could not create seccomp filter: {:#?}", 2414 Error::CreateSeccompFilter(e) 2415 )) 2416 }, 2417 )?; 2418 let exit_evt = self.exit_evt.try_clone().map_err(|e| { 2419 MigratableError::Restore(anyhow!("Could not clone exit event fd: {:?}", e)) 2420 })?; 2421 2422 self.threads.push( 2423 thread::Builder::new() 2424 .name("signal_handler".to_string()) 2425 .spawn(move || { 2426 if let Err(e) = SeccompFilter::apply(signal_handler_seccomp_filter) 2427 .map_err(Error::ApplySeccompFilter) 2428 { 2429 error!("Error applying seccomp filter: {:?}", e); 2430 return; 2431 } 2432 2433 Vm::os_signal_handler(signals, console, on_tty, exit_evt) 2434 }) 2435 .map_err(|e| { 2436 MigratableError::Restore(anyhow!( 2437 "Could not start console signal thread: {:#?}", 2438 e 2439 )) 2440 })?, 2441 ); 2442 } 2443 Err(e) => error!("Signal not found {}", e), 2444 } 2445 2446 if self.on_tty { 2447 io::stdin().lock().set_raw_mode().map_err(|e| { 2448 MigratableError::Restore(anyhow!( 2449 "Could not set terminal in raw mode: {:#?}", 2450 e 2451 )) 2452 })?; 2453 } 2454 } 2455 2456 let mut state = self 2457 .state 2458 .try_write() 2459 .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?; 2460 *state = new_state; 2461 2462 
event!("vm", "restored"); 2463 Ok(()) 2464 } 2465 } 2466 2467 impl Transportable for Vm { 2468 fn send( 2469 &self, 2470 snapshot: &Snapshot, 2471 destination_url: &str, 2472 ) -> std::result::Result<(), MigratableError> { 2473 let mut vm_snapshot_path = url_to_path(destination_url)?; 2474 vm_snapshot_path.push(VM_SNAPSHOT_FILE); 2475 2476 // Create the snapshot file 2477 let mut vm_snapshot_file = OpenOptions::new() 2478 .read(true) 2479 .write(true) 2480 .create_new(true) 2481 .open(vm_snapshot_path) 2482 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2483 2484 // Serialize and write the snapshot 2485 let vm_snapshot = 2486 serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?; 2487 2488 vm_snapshot_file 2489 .write(&vm_snapshot) 2490 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2491 2492 // Tell the memory manager to also send/write its own snapshot. 2493 if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) { 2494 self.memory_manager 2495 .lock() 2496 .unwrap() 2497 .send(&*memory_manager_snapshot.clone(), destination_url)?; 2498 } else { 2499 return Err(MigratableError::Restore(anyhow!( 2500 "Missing memory manager snapshot" 2501 ))); 2502 } 2503 2504 Ok(()) 2505 } 2506 } 2507 impl Migratable for Vm {} 2508 2509 #[cfg(all(feature = "kvm", target_arch = "x86_64"))] 2510 #[cfg(test)] 2511 mod tests { 2512 use super::*; 2513 2514 fn test_vm_state_transitions(state: VmState) { 2515 match state { 2516 VmState::Created => { 2517 // Check the transitions from Created 2518 assert!(state.valid_transition(VmState::Created).is_err()); 2519 assert!(state.valid_transition(VmState::Running).is_ok()); 2520 assert!(state.valid_transition(VmState::Shutdown).is_err()); 2521 assert!(state.valid_transition(VmState::Paused).is_ok()); 2522 } 2523 VmState::Running => { 2524 // Check the transitions from Running 2525 assert!(state.valid_transition(VmState::Created).is_err()); 2526 
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::gic::kvm::create_gic;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use vm_memory::GuestAddress;

    const LEN: u64 = 4096;

    // Builds a minimal guest memory + MMIO device map and checks that an
    // FDT can be generated for it. NOTE(review): requires a working KVM
    // (hypervisor::new / create_vm) on the test host.
    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(
            GuestAddress(layout::RAM_64BIT_START),
            (layout::FDT_MAX_SIZE + 0x1000) as usize,
        )];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        // Three representative MMIO devices: serial, a virtio device, RTC.
        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo { addr: LEN, irq: 34 },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = create_gic(&vm, 1).unwrap();
        assert!(create_fdt(
            &mem,
            &CString::new("console=tty0").unwrap(),
            vec![0],
            &dev_info,
            &*gic,
            &None,
            &(0x1_0000_0000, 0x1_0000),
        )
        .is_ok())
    }
}

// Smoke test: boot a one-page real-mode guest that adds two registers and
// prints the digit via port 0x3f8, then halts. Requires KVM on the host.
#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{GuestMemory, GuestMemoryRegion};
    // This example based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    // Register every guest memory region with the hypervisor.
    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len() as u64,
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    // Flat real-mode segments: base 0 so RIP addresses guest-physical 0x1000.
    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    // Run until HLT (surfaces as VmExit::Reset); echo the port I/O writes.
    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {:?}", r),
        }
    }
}