// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
//

#[cfg(feature = "acpi")]
use crate::config::NumaConfig;
use crate::config::{
    DeviceConfig, DiskConfig, FsConfig, HotplugMethod, NetConfig, PmemConfig, ValidationError,
    VmConfig, VsockConfig,
};
use crate::cpu;
use crate::device_manager::{
    self, get_win_size, Console, DeviceManager, DeviceManagerError, PtyPair,
};
use crate::device_tree::DeviceTree;
use crate::memory_manager::{Error as MemoryManagerError, MemoryManager};
use crate::migration::{get_vm_snapshot, url_to_path, VM_SNAPSHOT_FILE};
use crate::seccomp_filters::{get_seccomp_filter, Thread};
use crate::{GuestMemoryMmap, GuestRegionMmap};
use crate::{
    PciDeviceInfo, CPU_MANAGER_SNAPSHOT_ID, DEVICE_MANAGER_SNAPSHOT_ID, MEMORY_MANAGER_SNAPSHOT_ID,
};
use anyhow::anyhow;
use arch::get_host_cpu_phys_bits;
#[cfg(feature = "tdx")]
use arch::x86_64::tdx::TdvfSection;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::SgxEpcSection;
use arch::EntryPoint;
use devices::AcpiNotificationFlags;
use hypervisor::vm::{HypervisorVmError, VmmOps};
use linux_loader::cmdline::Cmdline;
#[cfg(target_arch = "x86_64")]
use linux_loader::loader::elf::PvhBootCapability::PvhEntryPresent;
#[cfg(target_arch = "aarch64")]
use linux_loader::loader::pe::Error::InvalidImageMagicNumber;
use linux_loader::loader::KernelLoader;
use seccomp::{SeccompAction, SeccompFilter};
use signal_hook::{
    consts::{SIGINT, SIGTERM, SIGWINCH},
    iterator::backend::Handle,
    iterator::Signals,
};
use std::cmp;
use std::collections::{BTreeMap, HashMap};
use std::convert::TryInto;
use std::ffi::CString;
#[cfg(target_arch = "x86_64")]
use std::fmt;
use std::fs::{File, OpenOptions};
use std::io::{self, Read, Write};
use std::io::{Seek, SeekFrom};
use std::num::Wrapping;
use std::ops::Deref;
use std::sync::{Arc, Mutex, RwLock};
use std::{result, str, thread};
use vm_device::Bus;
use vm_memory::{
    Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
    GuestMemoryRegion,
};
use vm_migration::{
    protocol::{MemoryRange, MemoryRangeTable},
    Migratable, MigratableError, Pausable, Snapshot, SnapshotDataSection, Snapshottable,
    Transportable,
};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::terminal::Terminal;

#[cfg(target_arch = "aarch64")]
use arch::aarch64::gic::gicv3_its::kvm::{KvmGicV3Its, GIC_V3_ITS_SNAPSHOT_ID};
#[cfg(target_arch = "aarch64")]
use arch::aarch64::gic::kvm::create_gic;
#[cfg(target_arch = "aarch64")]
use devices::interrupt_controller::{self, InterruptController};

/// Errors associated with VM management
#[derive(Debug)]
pub enum Error {
    /// Cannot open the kernel image
    KernelFile(io::Error),

    /// Cannot open the initramfs image
    InitramfsFile(io::Error),

    /// Cannot load the kernel in memory
    KernelLoad(linux_loader::loader::Error),

    #[cfg(target_arch = "aarch64")]
    /// Cannot load the UEFI binary in memory
    UefiLoad(arch::aarch64::uefi::Error),

    /// Cannot load the initramfs in memory
    InitramfsLoad,

    /// Cannot load the command line in memory
    LoadCmdLine(linux_loader::loader::Error),

    /// Cannot modify the command line
    CmdLineInsertStr(linux_loader::cmdline::Error),

    /// Cannot convert command line into CString
    CmdLineCString(std::ffi::NulError),

    /// Cannot configure system
    ConfigureSystem(arch::Error),

    /// Cannot enable interrupt controller
    #[cfg(target_arch = "aarch64")]
    EnableInterruptController(interrupt_controller::Error),

    /// The VM state lock is poisoned
    PoisonedState,

    /// Cannot create a device manager.
    DeviceManager(DeviceManagerError),

    /// Write to the console failed.
    Console(vmm_sys_util::errno::Error),

    /// Write to the pty console failed.
    PtyConsole(io::Error),

    /// Cannot setup terminal in raw mode.
    SetTerminalRaw(vmm_sys_util::errno::Error),

    /// Cannot setup terminal in canonical mode.
    SetTerminalCanon(vmm_sys_util::errno::Error),

    /// Memory address overflow
    MemOverflow,

    /// Cannot spawn a signal handler thread
    SignalHandlerSpawn(io::Error),

    /// Failed to join on vCPU threads
    ThreadCleanup(std::boxed::Box<dyn std::any::Any + std::marker::Send>),

    /// VM config is missing.
    VmMissingConfig,

    /// VM is not created
    VmNotCreated,

    /// VM is already created
    VmAlreadyCreated,

    /// VM is not running
    VmNotRunning,

    /// Cannot clone EventFd.
    EventFdClone(io::Error),

    /// Invalid VM state transition
    InvalidStateTransition(VmState, VmState),

    /// Error from CPU handling
    CpuManager(cpu::Error),

    /// Cannot pause devices
    PauseDevices(MigratableError),

    /// Cannot resume devices
    ResumeDevices(MigratableError),

    /// Cannot pause CPUs
    PauseCpus(MigratableError),

    /// Cannot resume CPUs
    ResumeCpus(MigratableError),

    /// Cannot pause VM
    Pause(MigratableError),

    /// Cannot resume VM
    Resume(MigratableError),

    /// Memory manager error
    MemoryManager(MemoryManagerError),

    /// Eventfd write error
    EventfdError(std::io::Error),

    /// Cannot snapshot VM
    Snapshot(MigratableError),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot send VM snapshot
    SnapshotSend(MigratableError),

    /// Cannot convert source URL from Path into &str
    RestoreSourceUrlPathToStr,

    /// Failed to validate config
    ConfigValidation(ValidationError),

    /// No more than one virtio-vsock device
    TooManyVsockDevices,

    /// Failed serializing into JSON
    SerializeJson(serde_json::Error),

    /// Invalid configuration for NUMA.
    InvalidNumaConfig,

    /// Cannot create seccomp filter
    CreateSeccompFilter(seccomp::SeccompError),

    /// Cannot apply seccomp filter
    ApplySeccompFilter(seccomp::Error),

    /// Failed resizing a memory zone.
    ResizeZone,

    /// Cannot activate virtio devices
    ActivateVirtioDevices(device_manager::DeviceManagerError),

    /// Power button not supported
    PowerButtonNotSupported,

    /// Error triggering power button
    PowerButton(device_manager::DeviceManagerError),

    /// Kernel lacks PVH header
    KernelMissingPvhHeader,

    /// Error doing I/O on TDX firmware file
    #[cfg(feature = "tdx")]
    LoadTdvf(std::io::Error),

    /// Error parsing TDVF
    #[cfg(feature = "tdx")]
    ParseTdvf(arch::x86_64::tdx::TdvfError),

    /// Error populating HOB
    #[cfg(feature = "tdx")]
    PopulateHob(arch::x86_64::tdx::TdvfError),

    /// Error allocating TDVF memory
    #[cfg(feature = "tdx")]
    AllocatingTdvfMemory(crate::memory_manager::Error),

    /// Error enabling TDX VM
    #[cfg(feature = "tdx")]
    InitializeTdxVm(hypervisor::HypervisorVmError),

    /// Error enabling TDX memory region
    #[cfg(feature = "tdx")]
    InitializeTdxMemoryRegion(hypervisor::HypervisorVmError),

    /// Error finalizing TDX setup
    #[cfg(feature = "tdx")]
    FinalizeTdx(hypervisor::HypervisorVmError),
}
pub type Result<T> = result::Result<T, Error>;

#[derive(Clone, Default)]
pub struct NumaNode {
    memory_regions: Vec<Arc<GuestRegionMmap>>,
    hotplug_regions: Vec<Arc<GuestRegionMmap>>,
    cpus: Vec<u8>,
    distances: BTreeMap<u32, u8>,
    memory_zones: Vec<String>,
    #[cfg(target_arch = "x86_64")]
    sgx_epc_sections: Vec<SgxEpcSection>,
}

impl NumaNode {
    pub fn memory_regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
        &self.memory_regions
    }

    pub fn hotplug_regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
        &self.hotplug_regions
    }

    pub fn cpus(&self) -> &Vec<u8> {
        &self.cpus
    }

    pub fn distances(&self) -> &BTreeMap<u32, u8> {
        &self.distances
    }

    pub fn memory_zones(&self) -> &Vec<String> {
        &self.memory_zones
    }

    #[cfg(target_arch = "x86_64")]
    pub fn sgx_epc_sections(&self) -> &Vec<SgxEpcSection> {
        &self.sgx_epc_sections
    }
}

pub type NumaNodes = BTreeMap<u32, NumaNode>;

#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq)]
pub enum VmState {
    Created,
    Running,
    Shutdown,
    Paused,
}

impl VmState {
    fn valid_transition(self, new_state: VmState) -> Result<()> {
        match self {
            VmState::Created => match new_state {
                VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Paused => Ok(()),
            },

            VmState::Running => match new_state {
                VmState::Created | VmState::Running => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Paused | VmState::Shutdown => Ok(()),
            },

            VmState::Shutdown => match new_state {
                VmState::Paused | VmState::Created | VmState::Shutdown => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running => Ok(()),
            },

            VmState::Paused => match new_state {
                VmState::Created | VmState::Paused => {
                    Err(Error::InvalidStateTransition(self, new_state))
                }
                VmState::Running | VmState::Shutdown => Ok(()),
            },
        }
    }
}

// Debug I/O port
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT: u16 = 0x80;
#[cfg(target_arch = "x86_64")]
const DEBUG_IOPORT_PREFIX: &str = "Debug I/O port";

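// A single-byte guest write to DEBUG_IOPORT (0x80) is logged together with the
// range the value falls into and the time elapsed since the VmOps timestamp
// was taken. For example, a bootloader issuing `out 0x80, 0x2a` would be
// reported in the "Bootloader" range (0x20..=0x3f below); the value 0x2a is
// only an illustration, the ranges themselves are a convention of this VMM.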
#[cfg(target_arch = "x86_64")]
/// Debug I/O port, see:
/// https://www.intel.com/content/www/us/en/support/articles/000005500/boards-and-kits.html
///
/// Since we're not a physical platform, we can freely assign code ranges for
/// debugging specific parts of our virtual platform.
pub enum DebugIoPortRange {
    Firmware,
    Bootloader,
    Kernel,
    Userspace,
    Custom,
}
#[cfg(target_arch = "x86_64")]
impl DebugIoPortRange {
    fn from_u8(value: u8) -> DebugIoPortRange {
        match value {
            0x00..=0x1f => DebugIoPortRange::Firmware,
            0x20..=0x3f => DebugIoPortRange::Bootloader,
            0x40..=0x5f => DebugIoPortRange::Kernel,
            0x60..=0x7f => DebugIoPortRange::Userspace,
            _ => DebugIoPortRange::Custom,
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl fmt::Display for DebugIoPortRange {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            DebugIoPortRange::Firmware => write!(f, "{}: Firmware", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Bootloader => write!(f, "{}: Bootloader", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Kernel => write!(f, "{}: Kernel", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Userspace => write!(f, "{}: Userspace", DEBUG_IOPORT_PREFIX),
            DebugIoPortRange::Custom => write!(f, "{}: Custom", DEBUG_IOPORT_PREFIX),
        }
    }
}

struct VmOps {
    memory: GuestMemoryAtomic<GuestMemoryMmap>,
    #[cfg(target_arch = "x86_64")]
    io_bus: Arc<Bus>,
    mmio_bus: Arc<Bus>,
    #[cfg(target_arch = "x86_64")]
    timestamp: std::time::Instant,
}

impl VmOps {
    #[cfg(target_arch = "x86_64")]
    // Log debug io port codes.
    fn log_debug_ioport(&self, code: u8) {
        let elapsed = self.timestamp.elapsed();

        debug!(
            "[{} code 0x{:x}] {}.{:>06} seconds",
            DebugIoPortRange::from_u8(code),
            code,
            elapsed.as_secs(),
            elapsed.as_micros()
        );
    }
}

impl VmmOps for VmOps {
    fn guest_mem_write(&self, gpa: u64, buf: &[u8]) -> hypervisor::vm::Result<usize> {
        self.memory
            .memory()
            .write(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemWrite(e.into()))
    }

    fn guest_mem_read(&self, gpa: u64, buf: &mut [u8]) -> hypervisor::vm::Result<usize> {
        self.memory
            .memory()
            .read(buf, GuestAddress(gpa))
            .map_err(|e| HypervisorVmError::GuestMemRead(e.into()))
    }

    fn mmio_read(&self, gpa: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.mmio_bus.read(gpa, data) {
            warn!("Guest MMIO read to unregistered address 0x{:x}", gpa);
        }
        Ok(())
    }

    fn mmio_write(&self, gpa: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
        match self.mmio_bus.write(gpa, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest MMIO write to unregistered address 0x{:x}", gpa);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_read(&self, port: u64, data: &mut [u8]) -> hypervisor::vm::Result<()> {
        if let Err(vm_device::BusError::MissingAddressRange) = self.io_bus.read(port, data) {
            warn!("Guest PIO read to unregistered address 0x{:x}", port);
        }
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    fn pio_write(&self, port: u64, data: &[u8]) -> hypervisor::vm::Result<()> {
        if port == DEBUG_IOPORT as u64 && data.len() == 1 {
            self.log_debug_ioport(data[0]);
            return Ok(());
        }

        match self.io_bus.write(port, data) {
            Err(vm_device::BusError::MissingAddressRange) => {
                warn!("Guest PIO write to unregistered address 0x{:x}", port);
            }
            Ok(Some(barrier)) => {
                info!("Waiting for barrier");
                barrier.wait();
                info!("Barrier released");
            }
            _ => {}
        };
        Ok(())
    }
}

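// Illustrative example (values assumed): on a host whose CPU reports 52
// physical address bits, a `max_phys_bits` of `None` yields 52 while
// `Some(46)` yields 46. When the "tdx" feature is compiled in and TDX is
// enabled, the result is additionally capped at 47 bits, as explained in the
// comment inside the function.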
pub fn physical_bits(max_phys_bits: Option<u8>, #[cfg(feature = "tdx")] tdx_enabled: bool) -> u8 {
    #[cfg(not(feature = "tdx"))]
    let host_phys_bits = get_host_cpu_phys_bits();
    #[cfg(feature = "tdx")]
    let mut host_phys_bits = get_host_cpu_phys_bits();

    #[cfg(feature = "tdx")]
    if tdx_enabled {
        // When running a TDX guest, the Guest Physical Address space is
        // limited by a shared bit that is located at bit 47 for 4-level
        // paging, and at bit 51 for 5-level paging (when the GPAW bit is 1).
        // In order to keep things simple, and since a 47-bit address space is
        // 128TiB large, we limit the physical addressable space to 47 bits
        // when running TDX.
        host_phys_bits = std::cmp::min(host_phys_bits, 47)
    }

    cmp::min(host_phys_bits, max_phys_bits.unwrap_or(host_phys_bits))
}

pub struct Vm {
    kernel: Option<File>,
    initramfs: Option<File>,
    threads: Vec<thread::JoinHandle<()>>,
    device_manager: Arc<Mutex<DeviceManager>>,
    config: Arc<Mutex<VmConfig>>,
    on_tty: bool,
    signals: Option<Handle>,
    state: RwLock<VmState>,
    cpu_manager: Arc<Mutex<cpu::CpuManager>>,
    memory_manager: Arc<Mutex<MemoryManager>>,
    #[cfg_attr(not(feature = "kvm"), allow(dead_code))]
    // The hypervisor abstracted virtual machine.
    vm: Arc<dyn hypervisor::Vm>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    saved_clock: Option<hypervisor::ClockData>,
    #[cfg(feature = "acpi")]
    numa_nodes: NumaNodes,
    seccomp_action: SeccompAction,
    exit_evt: EventFd,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    hypervisor: Arc<dyn hypervisor::Hypervisor>,
}

impl Vm {
    #[allow(clippy::too_many_arguments)]
    fn new_from_memory_manager(
        config: Arc<Mutex<VmConfig>>,
        memory_manager: Arc<Mutex<MemoryManager>>,
        vm: Arc<dyn hypervisor::Vm>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))] _saved_clock: Option<
            hypervisor::ClockData,
        >,
        activate_evt: EventFd,
    ) -> Result<Self> {
        config
            .lock()
            .unwrap()
            .validate()
            .map_err(Error::ConfigValidation)?;

        info!("Booting VM from config: {:?}", &config);

        // Create NUMA nodes based on NumaConfig.
        #[cfg(feature = "acpi")]
        let numa_nodes =
            Self::create_numa_nodes(config.lock().unwrap().numa.clone(), &memory_manager)?;

        #[cfg(feature = "tdx")]
        let force_iommu = config.lock().unwrap().tdx.is_some();
        #[cfg(not(feature = "tdx"))]
        let force_iommu = false;

        let device_manager = DeviceManager::new(
            vm.clone(),
            config.clone(),
            memory_manager.clone(),
            &exit_evt,
            &reset_evt,
            seccomp_action.clone(),
            #[cfg(feature = "acpi")]
            numa_nodes.clone(),
            &activate_evt,
            force_iommu,
        )
        .map_err(Error::DeviceManager)?;

        let memory = memory_manager.lock().unwrap().guest_memory();
        #[cfg(target_arch = "x86_64")]
        let io_bus = Arc::clone(device_manager.lock().unwrap().io_bus());
        let mmio_bus = Arc::clone(device_manager.lock().unwrap().mmio_bus());
        // Create the VmOps structure, which implements the VmmOps trait, and
        // hand it over to the hypervisor.
        let vm_ops: Arc<dyn VmmOps> = Arc::new(VmOps {
            memory,
            #[cfg(target_arch = "x86_64")]
            io_bus,
            mmio_bus,
            #[cfg(target_arch = "x86_64")]
            timestamp: std::time::Instant::now(),
        });

        let exit_evt_clone = exit_evt.try_clone().map_err(Error::EventFdClone)?;
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        let cpu_manager = cpu::CpuManager::new(
            &config.lock().unwrap().cpus.clone(),
            &device_manager,
            &memory_manager,
            vm.clone(),
            exit_evt_clone,
            reset_evt,
            hypervisor.clone(),
            seccomp_action.clone(),
            vm_ops,
            #[cfg(feature = "tdx")]
            tdx_enabled,
            #[cfg(feature = "acpi")]
            &numa_nodes,
        )
        .map_err(Error::CpuManager)?;

        let on_tty = unsafe { libc::isatty(libc::STDIN_FILENO as i32) } != 0;
        let kernel = config
            .lock()
            .unwrap()
            .kernel
            .as_ref()
            .map(|k| File::open(&k.path))
            .transpose()
            .map_err(Error::KernelFile)?;

        let initramfs = config
            .lock()
            .unwrap()
            .initramfs
            .as_ref()
            .map(|i| File::open(&i.path))
            .transpose()
            .map_err(Error::InitramfsFile)?;

        Ok(Vm {
            kernel,
            initramfs,
            device_manager,
            config,
            on_tty,
            threads: Vec::with_capacity(1),
            signals: None,
            state: RwLock::new(VmState::Created),
            cpu_manager,
            memory_manager,
            vm,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            saved_clock: _saved_clock,
            #[cfg(feature = "acpi")]
            numa_nodes,
            seccomp_action: seccomp_action.clone(),
            exit_evt,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            hypervisor,
        })
    }

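    // Validation performed while building the NUMA nodes: a guest_numa_id may
    // only be defined once; every referenced memory zone must exist in the
    // MemoryManager; distance destinations must refer to NUMA nodes defined in
    // this same configuration and cannot repeat; and, on x86_64, referenced
    // SGX EPC sections must belong to the configured SGX EPC region. Any
    // violation is reported as Error::InvalidNumaConfig.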
    #[cfg(feature = "acpi")]
    fn create_numa_nodes(
        configs: Option<Vec<NumaConfig>>,
        memory_manager: &Arc<Mutex<MemoryManager>>,
    ) -> Result<NumaNodes> {
        let mm = memory_manager.lock().unwrap();
        let mm_zones = mm.memory_zones();
        let mut numa_nodes = BTreeMap::new();

        if let Some(configs) = &configs {
            for config in configs.iter() {
                if numa_nodes.contains_key(&config.guest_numa_id) {
                    error!("Can't define twice the same NUMA node");
                    return Err(Error::InvalidNumaConfig);
                }

                let mut node = NumaNode::default();

                if let Some(memory_zones) = &config.memory_zones {
                    for memory_zone in memory_zones.iter() {
                        if let Some(mm_zone) = mm_zones.get(memory_zone) {
                            node.memory_regions.extend(mm_zone.regions().clone());
                            if let Some(virtiomem_zone) = mm_zone.virtio_mem_zone() {
                                node.hotplug_regions.push(virtiomem_zone.region().clone());
                            }
                            node.memory_zones.push(memory_zone.clone());
                        } else {
                            error!("Unknown memory zone '{}'", memory_zone);
                            return Err(Error::InvalidNumaConfig);
                        }
                    }
                }

                if let Some(cpus) = &config.cpus {
                    node.cpus.extend(cpus);
                }

                if let Some(distances) = &config.distances {
                    for distance in distances.iter() {
                        let dest = distance.destination;
                        let dist = distance.distance;

                        if !configs.iter().any(|cfg| cfg.guest_numa_id == dest) {
                            error!("Unknown destination NUMA node {}", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        if node.distances.contains_key(&dest) {
                            error!("Destination NUMA node {} has been already set", dest);
                            return Err(Error::InvalidNumaConfig);
                        }

                        node.distances.insert(dest, dist);
                    }
                }

                #[cfg(target_arch = "x86_64")]
                if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
                    if let Some(sgx_epc_region) = mm.sgx_epc_region() {
                        let mm_sections = sgx_epc_region.epc_sections();
                        for sgx_epc_section in sgx_epc_sections.iter() {
                            if let Some(mm_section) = mm_sections.get(sgx_epc_section) {
                                node.sgx_epc_sections.push(mm_section.clone());
                            } else {
                                error!("Unknown SGX EPC section '{}'", sgx_epc_section);
                                return Err(Error::InvalidNumaConfig);
                            }
                        }
                    } else {
                        error!("Missing SGX EPC region");
                        return Err(Error::InvalidNumaConfig);
                    }
                }

                numa_nodes.insert(config.guest_numa_id, node);
            }
        }

        Ok(numa_nodes)
    }

    #[allow(clippy::too_many_arguments)]
    pub fn new(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
        serial_pty: Option<PtyPair>,
        console_pty: Option<PtyPair>,
    ) -> Result<Self> {
        #[cfg(feature = "tdx")]
        let tdx_enabled = config.lock().unwrap().tdx.is_some();
        hypervisor.check_required_extensions().unwrap();
        #[cfg(feature = "tdx")]
        let vm = hypervisor
            .create_vm_with_type(if tdx_enabled {
                2 // KVM_X86_TDX_VM
            } else {
                0 // KVM_X86_LEGACY_VM
            })
            .unwrap();
        #[cfg(not(feature = "tdx"))]
        let vm = hypervisor.create_vm().unwrap();

        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let phys_bits = physical_bits(
            config.lock().unwrap().cpus.max_phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        );
        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            false,
            phys_bits,
            #[cfg(feature = "tdx")]
            tdx_enabled,
        )
        .map_err(Error::MemoryManager)?;

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_config) = config.lock().unwrap().sgx_epc.clone() {
                memory_manager
                    .lock()
                    .unwrap()
                    .setup_sgx(sgx_epc_config, &vm)
                    .map_err(Error::MemoryManager)?;
            }
        }

        let new_vm = Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            None,
            activate_evt,
        )?;

        // The device manager must create the devices from here as it is part
        // of the regular code path creating everything from scratch.
        new_vm
            .device_manager
            .lock()
            .unwrap()
            .create_devices(serial_pty, console_pty)
            .map_err(Error::DeviceManager)?;
        Ok(new_vm)
    }

    #[allow(clippy::too_many_arguments)]
    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        exit_evt: EventFd,
        reset_evt: EventFd,
        source_url: Option<&str>,
        prefault: bool,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();
        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        let config = vm_snapshot.config;
        if let Some(state) = vm_snapshot.state {
            vm.set_state(state)
                .map_err(|e| Error::Restore(MigratableError::Restore(e.into())))?;
        }

        let memory_manager = if let Some(memory_manager_snapshot) =
            snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID)
        {
            let phys_bits = physical_bits(
                config.lock().unwrap().cpus.max_phys_bits,
                #[cfg(feature = "tdx")]
                config.lock().unwrap().tdx.is_some(),
            );
            MemoryManager::new_from_snapshot(
                memory_manager_snapshot,
                vm.clone(),
                &config.lock().unwrap().memory.clone(),
                source_url,
                prefault,
                phys_bits,
            )
            .map_err(Error::MemoryManager)?
        } else {
            return Err(Error::Restore(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            ))));
        };

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            vm_snapshot.clock,
            activate_evt,
        )
    }

    pub fn new_from_migration(
        config: Arc<Mutex<VmConfig>>,
        exit_evt: EventFd,
        reset_evt: EventFd,
        seccomp_action: &SeccompAction,
        hypervisor: Arc<dyn hypervisor::Hypervisor>,
        activate_evt: EventFd,
    ) -> Result<Self> {
        hypervisor.check_required_extensions().unwrap();
        let vm = hypervisor.create_vm().unwrap();
        #[cfg(target_arch = "x86_64")]
        vm.enable_split_irq().unwrap();
        let phys_bits = physical_bits(
            config.lock().unwrap().cpus.max_phys_bits,
            #[cfg(feature = "tdx")]
            config.lock().unwrap().tdx.is_some(),
        );

        let memory_manager = MemoryManager::new(
            vm.clone(),
            &config.lock().unwrap().memory.clone(),
            false,
            phys_bits,
            #[cfg(feature = "tdx")]
            false,
        )
        .map_err(Error::MemoryManager)?;

        Vm::new_from_memory_manager(
            config,
            memory_manager,
            vm,
            exit_evt,
            reset_evt,
            seccomp_action,
            hypervisor,
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            None,
            activate_evt,
        )
    }

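    // Sizing is done by seeking to the end of the initramfs file; the arch
    // layer (arch::initramfs_load_addr) then picks a guest address that can
    // hold it, and the file is streamed directly into guest memory from its
    // beginning.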
    fn load_initramfs(&mut self, guest_mem: &GuestMemoryMmap) -> Result<arch::InitramfsConfig> {
        let mut initramfs = self.initramfs.as_ref().unwrap();
        let size: usize = initramfs
            .seek(SeekFrom::End(0))
            .map_err(|_| Error::InitramfsLoad)?
            .try_into()
            .unwrap();
        initramfs
            .seek(SeekFrom::Start(0))
            .map_err(|_| Error::InitramfsLoad)?;

        let address =
            arch::initramfs_load_addr(guest_mem, size).map_err(|_| Error::InitramfsLoad)?;
        let address = GuestAddress(address);

        guest_mem
            .read_from(address, &mut initramfs, size)
            .map_err(|_| Error::InitramfsLoad)?;

        info!("Initramfs loaded: address = 0x{:x}", address.0);
        Ok(arch::InitramfsConfig { address, size })
    }

    fn get_cmdline(&mut self) -> Result<CString> {
        let mut cmdline = Cmdline::new(arch::CMDLINE_MAX_SIZE);
        cmdline
            .insert_str(self.config.lock().unwrap().cmdline.args.clone())
            .map_err(Error::CmdLineInsertStr)?;
        for entry in self.device_manager.lock().unwrap().cmdline_additions() {
            cmdline.insert_str(entry).map_err(Error::CmdLineInsertStr)?;
        }
        CString::new(cmdline).map_err(Error::CmdLineCString)
    }

    #[cfg(target_arch = "aarch64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        let entry_addr = match linux_loader::loader::pe::PE::load(
            mem.deref(),
            Some(GuestAddress(arch::get_kernel_start())),
            &mut kernel,
            None,
        ) {
            Ok(entry_addr) => entry_addr,
            // Try to load the binary as a PE-format kernel first. If that
            // fails, retry it as a UEFI binary. Since the UEFI binary has no
            // magic number to check against, it must be the last option tried.
            Err(linux_loader::loader::Error::Pe(InvalidImageMagicNumber)) => {
                arch::aarch64::uefi::load_uefi(
                    mem.deref(),
                    GuestAddress(arch::get_uefi_start()),
                    &mut kernel,
                )
                .map_err(Error::UefiLoad)?;
                // The entry point offset in a UEFI image is always 0.
                return Ok(EntryPoint {
                    entry_addr: GuestAddress(arch::get_uefi_start()),
                });
            }
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        let entry_point_addr: GuestAddress = entry_addr.kernel_load;

        Ok(EntryPoint {
            entry_addr: entry_point_addr,
        })
    }

    #[cfg(target_arch = "x86_64")]
    fn load_kernel(&mut self) -> Result<EntryPoint> {
        info!("Loading kernel");
        let cmdline_cstring = self.get_cmdline()?;
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut kernel = self.kernel.as_ref().unwrap();
        let entry_addr = match linux_loader::loader::elf::Elf::load(
            mem.deref(),
            None,
            &mut kernel,
            Some(arch::layout::HIGH_RAM_START),
        ) {
            Ok(entry_addr) => entry_addr,
            Err(e) => {
                return Err(Error::KernelLoad(e));
            }
        };

        linux_loader::loader::load_cmdline(
            mem.deref(),
            arch::layout::CMDLINE_START,
            &cmdline_cstring,
        )
        .map_err(Error::LoadCmdLine)?;

        if let PvhEntryPresent(entry_addr) = entry_addr.pvh_boot_cap {
            // Use the PVH kernel entry point to boot the guest
            info!("Kernel loaded: entry_addr = 0x{:x}", entry_addr.0);
            Ok(EntryPoint { entry_addr })
        } else {
            Err(Error::KernelMissingPvhHeader)
        }
    }

    #[cfg(target_arch = "x86_64")]
    fn configure_system(&mut self) -> Result<()> {
        info!("Configuring system");
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();

        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let boot_vcpus = self.cpu_manager.lock().unwrap().boot_vcpus();

        #[allow(unused_mut, unused_assignments)]
        let mut rsdp_addr: Option<GuestAddress> = None;

        #[cfg(feature = "acpi")]
        {
            rsdp_addr = Some(crate::acpi::create_acpi_tables(
                &mem,
                &self.device_manager,
                &self.cpu_manager,
                &self.memory_manager,
                &self.numa_nodes,
            ));
            info!(
                "Created ACPI tables: rsdp_addr = 0x{:x}",
                rsdp_addr.unwrap().0
            );
        }

        let sgx_epc_region = self
            .memory_manager
            .lock()
            .unwrap()
            .sgx_epc_region()
            .as_ref()
            .cloned();

        arch::configure_system(
            &mem,
            arch::layout::CMDLINE_START,
            &initramfs_config,
            boot_vcpus,
            rsdp_addr,
            sgx_epc_region,
        )
        .map_err(Error::ConfigureSystem)?;
        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn configure_system(&mut self) -> Result<()> {
        let cmdline_cstring = self.get_cmdline()?;
        let vcpu_mpidrs = self.cpu_manager.lock().unwrap().get_mpidrs();
        let vcpu_topology = self.cpu_manager.lock().unwrap().get_vcpu_topology();
        let mem = self.memory_manager.lock().unwrap().boot_guest_memory();
        let initramfs_config = match self.initramfs {
            Some(_) => Some(self.load_initramfs(&mem)?),
            None => None,
        };

        let device_info = &self
            .device_manager
            .lock()
            .unwrap()
            .get_device_info()
            .clone();

        let pci_space_start: GuestAddress = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .start_of_device_area();

        let pci_space_end: GuestAddress = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .end_of_device_area();

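        // `checked_offset_from` yields `last address - start address`; adding
        // 1 turns that inclusive distance into a size in bytes. If the device
        // area start were ever past its end, the subtraction would fail and be
        // reported as Error::MemOverflow.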
        let pci_space_size = pci_space_end
            .checked_offset_from(pci_space_start)
            .ok_or(Error::MemOverflow)?
            + 1;

        let pci_space = (pci_space_start.0, pci_space_size);

        #[cfg(feature = "acpi")]
        {
            let _ = crate::acpi::create_acpi_tables(
                &mem,
                &self.device_manager,
                &self.cpu_manager,
                &self.memory_manager,
                &self.numa_nodes,
            );
        }

        let gic_device = create_gic(
            &self.memory_manager.lock().as_ref().unwrap().vm,
            self.cpu_manager.lock().unwrap().boot_vcpus() as u64,
        )
        .map_err(|e| {
            Error::ConfigureSystem(arch::Error::AArch64Setup(arch::aarch64::Error::SetupGic(e)))
        })?;

        arch::configure_system(
            &mem,
            &cmdline_cstring,
            vcpu_mpidrs,
            vcpu_topology,
            device_info,
            &initramfs_config,
            &pci_space,
            &*gic_device,
        )
        .map_err(Error::ConfigureSystem)?;

        // Update the GIC entity in device manager
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gic_device(Arc::new(Mutex::new(gic_device)));

        // Activate gic device
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .enable()
            .map_err(Error::EnableInterruptController)?;

        Ok(())
    }

    pub fn serial_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().serial_pty()
    }

    pub fn console_pty(&self) -> Option<PtyPair> {
        self.device_manager.lock().unwrap().console_pty()
    }

    pub fn shutdown(&mut self) -> Result<()> {
        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        let new_state = VmState::Shutdown;

        state.valid_transition(new_state)?;

        if self.on_tty {
            // Don't forget to set the terminal back to canonical mode
            // before exiting.
            io::stdin()
                .lock()
                .set_canon_mode()
                .map_err(Error::SetTerminalCanon)?;
        }

        // Trigger the termination of the signal_handler thread
        if let Some(signals) = self.signals.take() {
            signals.close();
        }

        // Wake up the DeviceManager threads so they will get terminated cleanly
        self.device_manager
            .lock()
            .unwrap()
            .resume()
            .map_err(Error::Resume)?;

        self.cpu_manager
            .lock()
            .unwrap()
            .shutdown()
            .map_err(Error::CpuManager)?;

        // Wait for all the threads to finish
        for thread in self.threads.drain(..) {
            thread.join().map_err(Error::ThreadCleanup)?
        }
        *state = new_state;

        event!("vm", "shutdown");

        Ok(())
    }

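    // Hot-resize entry point: every parameter is optional, so any combination
    // of vCPU count, RAM size and balloon size can be changed in one call.
    // CPU and ACPI-based memory changes are signalled to the guest through
    // ACPI notifications, while virtio-mem sized changes are handled by the
    // device itself. The VmConfig is updated in all cases so a reboot keeps
    // the requested sizes.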
    pub fn resize(
        &mut self,
        desired_vcpus: Option<u8>,
        desired_memory: Option<u64>,
        desired_balloon: Option<u64>,
    ) -> Result<()> {
        event!("vm", "resizing");

        if let Some(desired_vcpus) = desired_vcpus {
            if self
                .cpu_manager
                .lock()
                .unwrap()
                .resize(desired_vcpus)
                .map_err(Error::CpuManager)?
            {
                self.device_manager
                    .lock()
                    .unwrap()
                    .notify_hotplug(AcpiNotificationFlags::CPU_DEVICES_CHANGED)
                    .map_err(Error::DeviceManager)?;
            }
            self.config.lock().unwrap().cpus.boot_vcpus = desired_vcpus;
        }

        if let Some(desired_memory) = desired_memory {
            let new_region = self
                .memory_manager
                .lock()
                .unwrap()
                .resize(desired_memory)
                .map_err(Error::MemoryManager)?;

            let mut memory_config = &mut self.config.lock().unwrap().memory;

            if let Some(new_region) = &new_region {
                self.device_manager
                    .lock()
                    .unwrap()
                    .update_memory(new_region)
                    .map_err(Error::DeviceManager)?;

                match memory_config.hotplug_method {
                    HotplugMethod::Acpi => {
                        self.device_manager
                            .lock()
                            .unwrap()
                            .notify_hotplug(AcpiNotificationFlags::MEMORY_DEVICES_CHANGED)
                            .map_err(Error::DeviceManager)?;
                    }
                    HotplugMethod::VirtioMem => {}
                }
            }

            // We update the VM config regardless of the actual guest resize
            // operation result (happened or not), so that if the VM reboots
            // it will be running with the last configured memory size.
            match memory_config.hotplug_method {
                HotplugMethod::Acpi => memory_config.size = desired_memory,
                HotplugMethod::VirtioMem => {
                    if desired_memory > memory_config.size {
                        memory_config.hotplugged_size = Some(desired_memory - memory_config.size);
                    } else {
                        memory_config.hotplugged_size = None;
                    }
                }
            }
        }

        if let Some(desired_balloon) = desired_balloon {
            self.device_manager
                .lock()
                .unwrap()
                .resize_balloon(desired_balloon)
                .map_err(Error::DeviceManager)?;

            // Update the configuration value for the balloon size to ensure
            // a reboot would use the right value.
            if let Some(balloon_config) = &mut self.config.lock().unwrap().balloon {
                balloon_config.size = desired_balloon;
            }
        }

        event!("vm", "resized");

        Ok(())
    }

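    // Resizes a single, named memory zone. Requests below the zone's boot
    // size are rejected; otherwise the difference from the boot size is
    // recorded as the zone's hotplugged_size so a reboot comes back with the
    // same amount of memory.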
    pub fn resize_zone(&mut self, id: String, desired_memory: u64) -> Result<()> {
        let memory_config = &mut self.config.lock().unwrap().memory;

        if let Some(zones) = &mut memory_config.zones {
            for zone in zones.iter_mut() {
                if zone.id == id {
                    if desired_memory >= zone.size {
                        let hotplugged_size = desired_memory - zone.size;
                        self.memory_manager
                            .lock()
                            .unwrap()
                            .resize_zone(&id, desired_memory - zone.size)
                            .map_err(Error::MemoryManager)?;
                        // We update the memory zone config regardless of the
                        // actual 'resize-zone' operation result (happened or
                        // not), so that if the VM reboots it will be running
                        // with the last configured memory zone size.
                        zone.hotplugged_size = Some(hotplugged_size);

                        return Ok(());
                    } else {
                        error!(
                            "Invalid to ask less ({}) than boot RAM ({}) for \
                            this memory zone",
                            desired_memory, zone.size,
                        );
                        return Err(Error::ResizeZone);
                    }
                }
            }
        }

        error!("Could not find the memory zone {} for the resize", id);
        Err(Error::ResizeZone)
    }

    fn add_to_config<T>(devices: &mut Option<Vec<T>>, device: T) {
        if let Some(devices) = devices {
            devices.push(device);
        } else {
            *devices = Some(vec![device]);
        }
    }

    pub fn add_device(&mut self, mut _device_cfg: DeviceConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.devices, _device_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_device(&mut _device_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.devices, _device_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn remove_device(&mut self, _id: String) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .remove_device(_id.clone())
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by removing the device. This is important to
        // ensure the device would not be created in case of a reboot.
        let mut config = self.config.lock().unwrap();

        // Remove if VFIO device
        if let Some(devices) = config.devices.as_mut() {
            devices.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if disk device
        if let Some(disks) = config.disks.as_mut() {
            disks.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if net device
        if let Some(net) = config.net.as_mut() {
            net.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if pmem device
        if let Some(pmem) = config.pmem.as_mut() {
            pmem.retain(|dev| dev.id.as_ref() != Some(&_id));
        }

        // Remove if vsock device
        if let Some(vsock) = config.vsock.as_ref() {
            if vsock.id.as_ref() == Some(&_id) {
                config.vsock = None;
            }
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;
        Ok(())
    }

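    // The add_disk/add_fs/add_pmem/add_net/add_vsock methods below follow the
    // same pattern as add_device above: validate the request against a clone
    // of the configuration, hot-add the device through the DeviceManager,
    // persist it into the live VmConfig so a reboot re-creates it, and notify
    // the guest that the PCI topology changed.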
    pub fn add_disk(&mut self, mut _disk_cfg: DiskConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.disks, _disk_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_disk(&mut _disk_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.disks, _disk_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_fs(&mut self, mut _fs_cfg: FsConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.fs, _fs_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_fs(&mut _fs_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.fs, _fs_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_pmem(&mut self, mut _pmem_cfg: PmemConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.pmem, _pmem_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_pmem(&mut _pmem_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.pmem, _pmem_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_net(&mut self, mut _net_cfg: NetConfig) -> Result<PciDeviceInfo> {
        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            Self::add_to_config(&mut config.net, _net_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_net(&mut _net_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            Self::add_to_config(&mut config.net, _net_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn add_vsock(&mut self, mut _vsock_cfg: VsockConfig) -> Result<PciDeviceInfo> {
        if self.config.lock().unwrap().vsock.is_some() {
            return Err(Error::TooManyVsockDevices);
        }

        {
            // Validate on a clone of the config
            let mut config = self.config.lock().unwrap().clone();
            config.vsock = Some(_vsock_cfg.clone());
            config.validate().map_err(Error::ConfigValidation)?;
        }

        let pci_device_info = self
            .device_manager
            .lock()
            .unwrap()
            .add_vsock(&mut _vsock_cfg)
            .map_err(Error::DeviceManager)?;

        // Update VmConfig by adding the new device. This is important to
        // ensure the device would be created in case of a reboot.
        {
            let mut config = self.config.lock().unwrap();
            config.vsock = Some(_vsock_cfg);
        }

        self.device_manager
            .lock()
            .unwrap()
            .notify_hotplug(AcpiNotificationFlags::PCI_DEVICES_CHANGED)
            .map_err(Error::DeviceManager)?;

        Ok(pci_device_info)
    }

    pub fn counters(&self) -> Result<HashMap<String, HashMap<&'static str, Wrapping<u64>>>> {
        Ok(self.device_manager.lock().unwrap().counters())
    }

    fn os_signal_handler(
        mut signals: Signals,
        console_input_clone: Arc<Console>,
        on_tty: bool,
        exit_evt: EventFd,
    ) {
        for signal in signals.forever() {
            match signal {
                SIGWINCH => {
                    let (col, row) = get_win_size();
                    console_input_clone.update_console_size(col, row);
                }
                SIGTERM | SIGINT => {
                    if on_tty {
                        io::stdin()
                            .lock()
                            .set_canon_mode()
                            .expect("failed to restore terminal mode");
                    }
                    if exit_evt.write(1).is_err() {
                        std::process::exit(1);
                    }
                }
                _ => (),
            }
        }
    }

    #[cfg(feature = "tdx")]
    fn init_tdx(&mut self) -> Result<()> {
        let cpuid = self.cpu_manager.lock().unwrap().common_cpuid();
        let max_vcpus = self.cpu_manager.lock().unwrap().max_vcpus() as u32;
        self.vm
            .tdx_init(&cpuid, max_vcpus)
            .map_err(Error::InitializeTdxVm)?;
        Ok(())
    }

    #[cfg(feature = "tdx")]
    fn extract_tdvf_sections(&mut self) -> Result<Vec<TdvfSection>> {
        use arch::x86_64::tdx::*;
        // The TDVF file contains a table of sections as well as code
        let mut firmware_file =
            File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
                .map_err(Error::LoadTdvf)?;

        // For all the sections allocate some RAM backing them
        parse_tdvf_sections(&mut firmware_file).map_err(Error::ParseTdvf)
    }

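    // populate_tdx_sections runs in three steps: first, any TDVF section that
    // does not already fall inside guest RAM gets a dedicated RAM region;
    // second, the BFV/CFV sections are copied from the firmware file into
    // guest memory and the TD_HOB section address is recorded; third, the HOB
    // itself is written, describing the RAM ranges (interleaved with the
    // remaining TDVF sections) and the two MMIO windows, before being
    // finalized. The returned value is the TD_HOB address, if one was found.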
    #[cfg(feature = "tdx")]
    fn populate_tdx_sections(&mut self, sections: &[TdvfSection]) -> Result<Option<u64>> {
        use arch::x86_64::tdx::*;
        // Get the memory end *before* we start adding TDVF ram regions
        let boot_guest_memory = self
            .memory_manager
            .lock()
            .as_ref()
            .unwrap()
            .boot_guest_memory();
        for section in sections {
            // No need to allocate if the section falls within guest RAM ranges
            if boot_guest_memory.address_in_range(GuestAddress(section.address)) {
                info!(
                    "Not allocating TDVF Section: {:x?} since it is already part of guest RAM",
                    section
                );
                continue;
            }

            info!("Allocating TDVF Section: {:x?}", section);
            self.memory_manager
                .lock()
                .unwrap()
                .add_ram_region(GuestAddress(section.address), section.size as usize)
                .map_err(Error::AllocatingTdvfMemory)?;
        }

        // The TDVF file contains a table of sections as well as code
        let mut firmware_file =
            File::open(&self.config.lock().unwrap().tdx.as_ref().unwrap().firmware)
                .map_err(Error::LoadTdvf)?;

        // The guest memory at this point now has all the required regions so it
        // is safe to copy from the TDVF file into it.
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();
        let mut hob_offset = None;
        for section in sections {
            info!("Populating TDVF Section: {:x?}", section);
            match section.r#type {
                TdvfSectionType::Bfv | TdvfSectionType::Cfv => {
                    info!("Copying section to guest memory");
                    firmware_file
                        .seek(SeekFrom::Start(section.data_offset as u64))
                        .map_err(Error::LoadTdvf)?;
                    mem.read_from(
                        GuestAddress(section.address),
                        &mut firmware_file,
                        section.data_size as usize,
                    )
                    .unwrap();
                }
                TdvfSectionType::TdHob => {
                    hob_offset = Some(section.address);
                }
                _ => {}
            }
        }

        // Generate HOB
        let mut hob = TdHob::start(hob_offset.unwrap());

        let mut sorted_sections = sections.to_vec();
        sorted_sections.retain(|section| {
            !matches!(section.r#type, TdvfSectionType::Bfv | TdvfSectionType::Cfv)
        });
        sorted_sections.sort_by_key(|section| section.address);
        sorted_sections.reverse();
        let mut current_section = sorted_sections.pop();

        // RAM regions interleaved with TDVF sections
        let mut next_start_addr = 0;
        for region in boot_guest_memory.iter() {
            let region_start = region.start_addr().0;
            let region_end = region.last_addr().0;
            if region_start > next_start_addr {
                next_start_addr = region_start;
            }

            loop {
                let (start, size, ram) = if let Some(section) = &current_section {
                    if section.address <= next_start_addr {
                        (section.address, section.size, false)
                    } else {
                        let last_addr = std::cmp::min(section.address - 1, region_end);
                        (next_start_addr, last_addr - next_start_addr + 1, true)
                    }
                } else {
                    (next_start_addr, region_end - next_start_addr + 1, true)
                };

                hob.add_memory_resource(&mem, start, size, ram)
                    .map_err(Error::PopulateHob)?;

                if !ram {
                    current_section = sorted_sections.pop();
                }

                next_start_addr = start + size;

                if next_start_addr > region_end {
                    break;
                }
            }
        }

        // MMIO regions
        hob.add_mmio_resource(
            &mem,
            arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
            arch::layout::APIC_START.raw_value()
                - arch::layout::MEM_32BIT_DEVICES_START.raw_value(),
        )
        .map_err(Error::PopulateHob)?;
        let start_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .start_of_device_area()
            .raw_value();
        let end_of_device_area = self
            .memory_manager
            .lock()
            .unwrap()
            .end_of_device_area()
            .raw_value();
        hob.add_mmio_resource(
            &mem,
            start_of_device_area,
            end_of_device_area - start_of_device_area,
        )
        .map_err(Error::PopulateHob)?;

        hob.finish(&mem).map_err(Error::PopulateHob)?;

        Ok(hob_offset)
    }

    #[cfg(feature = "tdx")]
    fn init_tdx_memory(&mut self, sections: &[TdvfSection]) -> Result<()> {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();

        for section in sections {
            self.vm
                .tdx_init_memory_region(
                    mem.get_host_address(GuestAddress(section.address)).unwrap() as u64,
                    section.address,
                    section.size,
                    /* TDVF_SECTION_ATTRIBUTES_EXTENDMR */
                    section.attributes == 1,
                )
                .map_err(Error::InitializeTdxMemoryRegion)?;
        }
        Ok(())
    }

    pub fn boot(&mut self) -> Result<()> {
        info!("Booting VM");
        event!("vm", "booting");
        let current_state = self.get_state()?;
        if current_state == VmState::Paused {
            return self.resume().map_err(Error::Resume);
        }

        let new_state = VmState::Running;
        current_state.valid_transition(new_state)?;

        // Load kernel if configured
        let entry_point = if self.kernel.as_ref().is_some() {
            Some(self.load_kernel()?)
        } else {
            None
        };

        // The initial TDX configuration must be done before the vCPUs are
        // created
        #[cfg(feature = "tdx")]
        if self.config.lock().unwrap().tdx.is_some() {
            self.init_tdx()?;
        }

        // Create and configure vcpus
        self.cpu_manager
            .lock()
            .unwrap()
            .create_boot_vcpus(entry_point)
            .map_err(Error::CpuManager)?;

        #[cfg(feature = "tdx")]
        let sections = self.extract_tdvf_sections()?;

        // Configuring the TDX regions requires that the vCPUs are created
        #[cfg(feature = "tdx")]
        let hob_address = if self.config.lock().unwrap().tdx.is_some() {
            self.populate_tdx_sections(&sections)?
        } else {
            None
        };

        // Configure shared state based on loaded kernel
        entry_point.map(|_| self.configure_system()).transpose()?;

        #[cfg(feature = "tdx")]
        if let Some(hob_address) = hob_address {
            // With the HOB address extracted the vCPUs can have
            // their TDX state configured.
            self.cpu_manager
                .lock()
                .unwrap()
                .initialize_tdx(hob_address)
                .map_err(Error::CpuManager)?;
            self.init_tdx_memory(&sections)?;
            // With TDX memory and CPU state configured TDX setup is complete
            self.vm.tdx_finalize().map_err(Error::FinalizeTdx)?;
        }

        self.cpu_manager
            .lock()
            .unwrap()
            .start_boot_vcpus()
            .map_err(Error::CpuManager)?;

        if self
            .device_manager
            .lock()
            .unwrap()
            .console()
            .input_enabled()
        {
            let console = self.device_manager.lock().unwrap().console().clone();
            let signals = Signals::new(&[SIGWINCH, SIGINT, SIGTERM]);
            match signals {
                Ok(signals) => {
                    self.signals = Some(signals.handle());
                    let exit_evt = self.exit_evt.try_clone().map_err(Error::EventFdClone)?;
                    let on_tty = self.on_tty;
                    let signal_handler_seccomp_filter =
                        get_seccomp_filter(&self.seccomp_action, Thread::SignalHandler)
                            .map_err(Error::CreateSeccompFilter)?;
                    self.threads.push(
                        thread::Builder::new()
                            .name("signal_handler".to_string())
                            .spawn(move || {
                                if let Err(e) = SeccompFilter::apply(signal_handler_seccomp_filter)
                                    .map_err(Error::ApplySeccompFilter)
                                {
                                    error!("Error applying seccomp filter: {:?}", e);
                                    return;
                                }

                                Vm::os_signal_handler(signals, console, on_tty, exit_evt);
                            })
                            .map_err(Error::SignalHandlerSpawn)?,
                    );
                }
                Err(e) => error!("Signal not found {}", e),
            }

            if self.on_tty {
                io::stdin()
                    .lock()
                    .set_raw_mode()
                    .map_err(Error::SetTerminalRaw)?;
            }
        }

        let mut state = self.state.try_write().map_err(|_| Error::PoisonedState)?;
        *state = new_state;
        event!("vm", "booted");
        Ok(())
    }

    pub fn handle_pty(&self) -> Result<()> {
        // Could be a little dangerous, picks up a lock on device_manager
        // and goes into a blocking read. If the epoll loop starts to be
        // serviced by multiple threads this will likely need to be revisited.
        let dm = self.device_manager.lock().unwrap();
        let mut out = [0u8; 64];
        if let Some(mut pty) = dm.serial_pty() {
            let count = pty.main.read(&mut out).map_err(Error::PtyConsole)?;
            let console = dm.console();
            if console.input_enabled() {
                console
                    .queue_input_bytes_serial(&out[..count])
                    .map_err(Error::Console)?;
            }
        };
        let count = match dm.console_pty() {
            Some(mut pty) => pty.main.read(&mut out).map_err(Error::PtyConsole)?,
            None => return Ok(()),
        };
        let console = dm.console();
        if console.input_enabled() {
            console.queue_input_bytes_console(&out[..count])
        }

        Ok(())
    }

    pub fn handle_stdin(&self) -> Result<()> {
        let mut out = [0u8; 64];
        let count = io::stdin()
            .lock()
            .read_raw(&mut out)
            .map_err(Error::Console)?;

        // Replace "\n" with "\r" to deal with Windows SAC (#1170)
        if count == 1 && out[0] == 0x0a {
            out[0] = 0x0d;
        }

        if self
            .device_manager
            .lock()
            .unwrap()
            .console()
            .input_enabled()
        {
            self.device_manager
                .lock()
                .unwrap()
                .console()
                .queue_input_bytes(&out[..count])
                .map_err(Error::Console)?;
        }

        Ok(())
    }

    /// Gets a thread-safe reference counted pointer to the VM configuration.
    pub fn get_config(&self) -> Arc<Mutex<VmConfig>> {
        Arc::clone(&self.config)
    }

    /// Get the VM state. Returns an error if the state is poisoned.
    pub fn get_state(&self) -> Result<VmState> {
        self.state
            .try_read()
            .map_err(|_| Error::PoisonedState)
            .map(|state| *state)
    }

    /// Load saved clock from snapshot
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub fn load_clock_from_snapshot(
        &mut self,
        snapshot: &Snapshot,
    ) -> Result<Option<hypervisor::ClockData>> {
        let vm_snapshot = get_vm_snapshot(snapshot).map_err(Error::Restore)?;
        self.saved_clock = vm_snapshot.clock;
        Ok(self.saved_clock)
    }

    #[cfg(target_arch = "aarch64")]
    /// Add the vGIC section to the VM snapshot.
    fn add_vgic_snapshot_section(
        &self,
        vm_snapshot: &mut Snapshot,
    ) -> std::result::Result<(), MigratableError> {
        let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
        let gic_device = Arc::clone(
            self.device_manager
                .lock()
                .unwrap()
                .get_interrupt_controller()
                .unwrap()
                .lock()
                .unwrap()
                .get_gic_device()
                .unwrap(),
        );

        gic_device
            .lock()
            .unwrap()
            .set_gicr_typers(&saved_vcpu_states);

        vm_snapshot.add_snapshot(
            gic_device
                .lock()
                .unwrap()
                .as_any_concrete_mut()
                .downcast_mut::<KvmGicV3Its>()
                .unwrap()
                .snapshot()?,
        );

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    /// Restore the vGIC from the VM snapshot and enable the interrupt controller routing.
    fn restore_vgic_and_enable_interrupt(
        &self,
        vm_snapshot: &Snapshot,
    ) -> std::result::Result<(), MigratableError> {
        let saved_vcpu_states = self.cpu_manager.lock().unwrap().get_saved_states();
        // The number of vCPUs is the same as the number of saved vCPU states.
        let vcpu_numbers = saved_vcpu_states.len();

        // Creating a GIC device here, as the GIC will not be created when
        // restoring the device manager. Note that currently only the bare GICv3
        // without ITS is supported.
        let mut gic_device = create_gic(&self.vm, vcpu_numbers.try_into().unwrap())
            .map_err(|e| MigratableError::Restore(anyhow!("Could not create GIC: {:#?}", e)))?;

        // Here we prepare the GICR_TYPER registers from the restored vCPU states.
        gic_device.set_gicr_typers(&saved_vcpu_states);

        let gic_device = Arc::new(Mutex::new(gic_device));
        // Update the GIC entity in device manager
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .set_gic_device(Arc::clone(&gic_device));

        // Restore GIC states.
        if let Some(gicv3_its_snapshot) = vm_snapshot.snapshots.get(GIC_V3_ITS_SNAPSHOT_ID) {
            gic_device
                .lock()
                .unwrap()
                .as_any_concrete_mut()
                .downcast_mut::<KvmGicV3Its>()
                .unwrap()
                .restore(*gicv3_its_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing GicV3Its snapshot"
            )));
        }

        // Activate gic device
        self.device_manager
            .lock()
            .unwrap()
            .get_interrupt_controller()
            .unwrap()
            .lock()
            .unwrap()
            .enable()
            .map_err(|e| {
                MigratableError::Restore(anyhow!(
                    "Could not enable interrupt controller routing: {:#?}",
                    e
                ))
            })?;

        Ok(())
    }

    pub fn balloon_size(&self) -> u64 {
        self.device_manager.lock().unwrap().balloon_size()
    }

    pub fn receive_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: Read,
    {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            mem.read_exact_from(GuestAddress(range.gpa), fd, range.length as usize)
                .map_err(|e| {
                    MigratableError::MigrateReceive(anyhow!(
                        "Error transferring memory from socket: {}",
                        e
                    ))
                })?;
        }
        Ok(())
    }

    pub fn send_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: Write,
    {
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            mem.write_all_to(GuestAddress(range.gpa), fd, range.length as usize)
                .map_err(|e| {
                    MigratableError::MigrateSend(anyhow!(
                        "Error transferring memory to socket: {}",
                        e
                    ))
                })?;
        }

        Ok(())
    }

    pub fn memory_range_table(&self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();
        let guest_memory = self.memory_manager.lock().as_ref().unwrap().guest_memory();

        for region in guest_memory.memory().iter() {
            table.push(MemoryRange {
                gpa: region.start_addr().raw_value(),
                length: region.len() as u64,
            });
        }

        Ok(table)
    }

    pub fn device_tree(&self) -> Arc<Mutex<DeviceTree>> {
        self.device_manager.lock().unwrap().device_tree()
    }

    pub fn activate_virtio_devices(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .activate_virtio_devices()
            .map_err(Error::ActivateVirtioDevices)
    }

    #[cfg(target_arch = "x86_64")]
    pub fn power_button(&self) -> Result<()> {
        #[cfg(feature = "acpi")]
        return self
            .device_manager
            .lock()
            .unwrap()
            .notify_power_button()
            .map_err(Error::PowerButton);
        #[cfg(not(feature = "acpi"))]
        Err(Error::PowerButtonNotSupported)
    }

    #[cfg(target_arch = "aarch64")]
    pub fn power_button(&self) -> Result<()> {
        self.device_manager
            .lock()
            .unwrap()
            .notify_power_button()
            .map_err(Error::PowerButton)
    }
}

impl Pausable for Vm {
    fn pause(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "pausing");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Paused;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Pause(anyhow!("Invalid transition: {:?}", e)))?;

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            let mut clock = self
                .vm
                .get_clock()
                .map_err(|e| MigratableError::Pause(anyhow!("Could not get VM clock: {}", e)))?;
            // Reset clock flags.
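            // The flags returned by get_clock() (e.g. KVM_CLOCK_TSC_STABLE) describe
            // the host-side clock and are likely not meaningful when the value is fed
            // back through set_clock() on resume/restore, so only the raw clock value
            // is preserved here.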
            clock.flags = 0;
            self.saved_clock = Some(clock);
        }
        self.cpu_manager.lock().unwrap().pause()?;
        self.device_manager.lock().unwrap().pause()?;

        *state = new_state;

        event!("vm", "paused");
        Ok(())
    }

    fn resume(&mut self) -> std::result::Result<(), MigratableError> {
        event!("vm", "resuming");
        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Resume(anyhow!("Could not get VM state: {}", e)))?;
        let new_state = VmState::Running;

        state
            .valid_transition(new_state)
            .map_err(|e| MigratableError::Resume(anyhow!("Invalid transition: {:?}", e)))?;

        self.cpu_manager.lock().unwrap().resume()?;
        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        {
            if let Some(clock) = &self.saved_clock {
                self.vm.set_clock(clock).map_err(|e| {
                    MigratableError::Resume(anyhow!("Could not set VM clock: {}", e))
                })?;
            }
        }
        self.device_manager.lock().unwrap().resume()?;

        // And we're back to the Running state.
        *state = new_state;
        event!("vm", "resumed");
        Ok(())
    }
}

#[derive(Serialize, Deserialize)]
pub struct VmSnapshot {
    pub config: Arc<Mutex<VmConfig>>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub clock: Option<hypervisor::ClockData>,
    pub state: Option<hypervisor::VmState>,
    #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
    pub common_cpuid: hypervisor::CpuId,
}

pub const VM_SNAPSHOT_ID: &str = "vm";
impl Snapshottable for Vm {
    fn id(&self) -> String {
        VM_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        event!("vm", "snapshotting");

        #[cfg(feature = "tdx")]
        {
            if self.config.lock().unwrap().tdx.is_some() {
                return Err(MigratableError::Snapshot(anyhow!(
                    "Snapshot not possible with TDX VM"
                )));
            }
        }

        let current_state = self.get_state().unwrap();
        if current_state != VmState::Paused {
            return Err(MigratableError::Snapshot(anyhow!(
                "Trying to snapshot while VM is running"
            )));
        }

        #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
        let common_cpuid = {
            #[cfg(feature = "tdx")]
            let tdx_enabled = self.config.lock().unwrap().tdx.is_some();
            let phys_bits = physical_bits(
                self.config.lock().unwrap().cpus.max_phys_bits,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            );
            arch::generate_common_cpuid(
                self.hypervisor.clone(),
                None,
                None,
                phys_bits,
                self.config.lock().unwrap().cpus.kvm_hyperv,
                #[cfg(feature = "tdx")]
                tdx_enabled,
            )
            .map_err(|e| {
                MigratableError::MigrateReceive(anyhow!("Error generating common cpuid: {:?}", e))
            })?
        };

        let mut vm_snapshot = Snapshot::new(VM_SNAPSHOT_ID);
        let vm_state = self
            .vm
            .state()
            .map_err(|e| MigratableError::Snapshot(e.into()))?;
        let vm_snapshot_data = serde_json::to_vec(&VmSnapshot {
            config: self.get_config(),
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            clock: self.saved_clock,
            state: Some(vm_state),
            #[cfg(all(feature = "kvm", target_arch = "x86_64"))]
            common_cpuid,
        })
        .map_err(|e| MigratableError::Snapshot(e.into()))?;

        vm_snapshot.add_snapshot(self.cpu_manager.lock().unwrap().snapshot()?);
        vm_snapshot.add_snapshot(self.memory_manager.lock().unwrap().snapshot()?);

        #[cfg(target_arch = "aarch64")]
        self.add_vgic_snapshot_section(&mut vm_snapshot)
            .map_err(|e| MigratableError::Snapshot(e.into()))?;

        vm_snapshot.add_snapshot(self.device_manager.lock().unwrap().snapshot()?);
        vm_snapshot.add_data_section(SnapshotDataSection {
            id: format!("{}-section", VM_SNAPSHOT_ID),
            snapshot: vm_snapshot_data,
        });

        event!("vm", "snapshotted");
        Ok(vm_snapshot)
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        event!("vm", "restoring");

        let current_state = self
            .get_state()
            .map_err(|e| MigratableError::Restore(anyhow!("Could not get VM state: {:#?}", e)))?;
        let new_state = VmState::Paused;
        current_state.valid_transition(new_state).map_err(|e| {
            MigratableError::Restore(anyhow!("Could not restore VM state: {:#?}", e))
        })?;

        if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
            self.memory_manager
                .lock()
                .unwrap()
                .restore(*memory_manager_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing memory manager snapshot"
            )));
        }

        if let Some(cpu_manager_snapshot) = snapshot.snapshots.get(CPU_MANAGER_SNAPSHOT_ID) {
            self.cpu_manager
                .lock()
                .unwrap()
                .restore(*cpu_manager_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing CPU manager snapshot"
            )));
        }

        if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
            self.device_manager
                .lock()
                .unwrap()
                .restore(*device_manager_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing device manager snapshot"
            )));
        }

        #[cfg(target_arch = "aarch64")]
        self.restore_vgic_and_enable_interrupt(&snapshot)?;

        if let Some(device_manager_snapshot) = snapshot.snapshots.get(DEVICE_MANAGER_SNAPSHOT_ID) {
            self.device_manager
                .lock()
                .unwrap()
                .restore_devices(*device_manager_snapshot.clone())?;
        } else {
            return Err(MigratableError::Restore(anyhow!(
                "Missing device manager snapshot"
            )));
        }

        // Now we can start all vCPUs from here.
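        // Memory, CPU and device manager state, and on aarch64 the vGIC, have all
        // been restored above, so the vCPUs can safely resume execution.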
        self.cpu_manager
            .lock()
            .unwrap()
            .start_restored_vcpus()
            .map_err(|e| {
                MigratableError::Restore(anyhow!("Cannot start restored vCPUs: {:#?}", e))
            })?;

        if self
            .device_manager
            .lock()
            .unwrap()
            .console()
            .input_enabled()
        {
            let console = self.device_manager.lock().unwrap().console().clone();
            let signals = Signals::new(&[SIGWINCH, SIGINT, SIGTERM]);
            match signals {
                Ok(signals) => {
                    self.signals = Some(signals.handle());

                    let on_tty = self.on_tty;
                    let signal_handler_seccomp_filter =
                        get_seccomp_filter(&self.seccomp_action, Thread::SignalHandler).map_err(
                            |e| {
                                MigratableError::Restore(anyhow!(
                                    "Could not create seccomp filter: {:#?}",
                                    Error::CreateSeccompFilter(e)
                                ))
                            },
                        )?;
                    let exit_evt = self.exit_evt.try_clone().map_err(|e| {
                        MigratableError::Restore(anyhow!("Could not clone exit event fd: {:?}", e))
                    })?;

                    self.threads.push(
                        thread::Builder::new()
                            .name("signal_handler".to_string())
                            .spawn(move || {
                                if let Err(e) = SeccompFilter::apply(signal_handler_seccomp_filter)
                                    .map_err(Error::ApplySeccompFilter)
                                {
                                    error!("Error applying seccomp filter: {:?}", e);
                                    return;
                                }

                                Vm::os_signal_handler(signals, console, on_tty, exit_evt)
                            })
                            .map_err(|e| {
                                MigratableError::Restore(anyhow!(
                                    "Could not start console signal thread: {:#?}",
                                    e
                                ))
                            })?,
                    );
                }
                Err(e) => error!("Signal not found {}", e),
            }

            if self.on_tty {
                io::stdin().lock().set_raw_mode().map_err(|e| {
                    MigratableError::Restore(anyhow!(
                        "Could not set terminal in raw mode: {:#?}",
                        e
                    ))
                })?;
            }
        }

        let mut state = self
            .state
            .try_write()
            .map_err(|e| MigratableError::Restore(anyhow!("Could not set VM state: {:#?}", e)))?;
        *state = new_state;

        event!("vm", "restored");
        Ok(())
    }
}

impl Transportable for Vm {
    fn send(
        &self,
        snapshot: &Snapshot,
        destination_url: &str,
    ) -> std::result::Result<(), MigratableError> {
        let mut vm_snapshot_path = url_to_path(destination_url)?;
        vm_snapshot_path.push(VM_SNAPSHOT_FILE);

        // Create the snapshot file
        let mut vm_snapshot_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(vm_snapshot_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Serialize and write the snapshot
        let vm_snapshot =
            serde_json::to_vec(snapshot).map_err(|e| MigratableError::MigrateSend(e.into()))?;

        vm_snapshot_file
            .write_all(&vm_snapshot)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        // Tell the memory manager to also send/write its own snapshot.
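        // The file written above does not carry the guest RAM contents; the memory
        // manager writes those separately at the same destination URL.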
        if let Some(memory_manager_snapshot) = snapshot.snapshots.get(MEMORY_MANAGER_SNAPSHOT_ID) {
            self.memory_manager
                .lock()
                .unwrap()
                .send(&*memory_manager_snapshot.clone(), destination_url)?;
        } else {
            return Err(MigratableError::MigrateSend(anyhow!(
                "Missing memory manager snapshot"
            )));
        }

        Ok(())
    }
}

impl Migratable for Vm {
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().start_dirty_log()?;
        self.device_manager.lock().unwrap().start_dirty_log()
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.memory_manager.lock().unwrap().stop_dirty_log()?;
        self.device_manager.lock().unwrap().stop_dirty_log()
    }

    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        Ok(MemoryRangeTable::new_from_tables(vec![
            self.memory_manager.lock().unwrap().dirty_log()?,
            self.device_manager.lock().unwrap().dirty_log()?,
        ]))
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[cfg(test)]
mod tests {
    use super::*;

    fn test_vm_state_transitions(state: VmState) {
        match state {
            VmState::Created => {
                // Check the transitions from Created
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_ok());
            }
            VmState::Running => {
                // Check the transitions from Running
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_err());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_ok());
            }
            VmState::Shutdown => {
                // Check the transitions from Shutdown
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_err());
                assert!(state.valid_transition(VmState::Paused).is_err());
            }
            VmState::Paused => {
                // Check the transitions from Paused
                assert!(state.valid_transition(VmState::Created).is_err());
                assert!(state.valid_transition(VmState::Running).is_ok());
                assert!(state.valid_transition(VmState::Shutdown).is_ok());
                assert!(state.valid_transition(VmState::Paused).is_err());
            }
        }
    }

    #[test]
    fn test_vm_created_transitions() {
        test_vm_state_transitions(VmState::Created);
    }

    #[test]
    fn test_vm_running_transitions() {
        test_vm_state_transitions(VmState::Running);
    }

    #[test]
    fn test_vm_shutdown_transitions() {
        test_vm_state_transitions(VmState::Shutdown);
    }

    #[test]
    fn test_vm_paused_transitions() {
        test_vm_state_transitions(VmState::Paused);
    }
}

#[cfg(target_arch = "aarch64")]
#[cfg(test)]
mod tests {
    use super::*;
    use crate::GuestMemoryMmap;
    use arch::aarch64::fdt::create_fdt;
    use arch::aarch64::gic::kvm::create_gic;
    use arch::aarch64::layout;
    use arch::{DeviceType, MmioDeviceInfo};
    use vm_memory::GuestAddress;

    const LEN: u64 = 4096;

    #[test]
    fn test_create_fdt_with_devices() {
        let regions = vec![(
            GuestAddress(layout::RAM_64BIT_START),
            (layout::FDT_MAX_SIZE + 0x1000) as usize,
        )];
        let mem = GuestMemoryMmap::from_ranges(&regions).expect("Cannot initialize memory");

        let dev_info: HashMap<(DeviceType, std::string::String), MmioDeviceInfo> = [
            (
                (DeviceType::Serial, DeviceType::Serial.to_string()),
                MmioDeviceInfo {
                    addr: 0x00,
                    irq: 33,
                },
            ),
            (
                (DeviceType::Virtio(1), "virtio".to_string()),
                MmioDeviceInfo { addr: LEN, irq: 34 },
            ),
            (
                (DeviceType::Rtc, "rtc".to_string()),
                MmioDeviceInfo {
                    addr: 2 * LEN,
                    irq: 35,
                },
            ),
        ]
        .iter()
        .cloned()
        .collect();

        let hv = hypervisor::new().unwrap();
        let vm = hv.create_vm().unwrap();
        let gic = create_gic(&vm, 1).unwrap();
        assert!(create_fdt(
            &mem,
            &CString::new("console=tty0").unwrap(),
            vec![0],
            Some((0, 0, 0)),
            &dev_info,
            &*gic,
            &None,
            &(0x1_0000_0000, 0x1_0000),
        )
        .is_ok())
    }
}

#[cfg(all(feature = "kvm", target_arch = "x86_64"))]
#[test]
pub fn test_vm() {
    use hypervisor::VmExit;
    use vm_memory::{GuestMemory, GuestMemoryRegion};
    // This example is based on https://lwn.net/Articles/658511/
    let code = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        0xb0, b'\n', /* mov $'\n', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* hlt */
    ];

    let mem_size = 0x1000;
    let load_addr = GuestAddress(0x1000);
    let mem = GuestMemoryMmap::from_ranges(&[(load_addr, mem_size)]).unwrap();

    let hv = hypervisor::new().unwrap();
    let vm = hv.create_vm().expect("new VM creation failed");

    for (index, region) in mem.iter().enumerate() {
        let mem_region = vm.make_user_memory_region(
            index as u32,
            region.start_addr().raw_value(),
            region.len() as u64,
            region.as_ptr() as u64,
            false,
            false,
        );

        vm.create_user_memory_region(mem_region)
            .expect("Cannot configure guest memory");
    }
    mem.write_slice(&code, load_addr)
        .expect("Writing code to memory failed");

    let vcpu = vm.create_vcpu(0, None).expect("new Vcpu failed");

    let mut vcpu_sregs = vcpu.get_sregs().expect("get sregs failed");
    vcpu_sregs.cs.base = 0;
    vcpu_sregs.cs.selector = 0;
    vcpu.set_sregs(&vcpu_sregs).expect("set sregs failed");

    let mut vcpu_regs = vcpu.get_regs().expect("get regs failed");
    vcpu_regs.rip = 0x1000;
    vcpu_regs.rax = 2;
    vcpu_regs.rbx = 3;
    vcpu_regs.rflags = 2;
    vcpu.set_regs(&vcpu_regs).expect("set regs failed");

    loop {
        match vcpu.run().expect("run failed") {
            VmExit::IoOut(addr, data) => {
                println!(
                    "IO out -- addr: {:#x} data [{:?}]",
                    addr,
                    str::from_utf8(data).unwrap()
                );
            }
            VmExit::Reset => {
                println!("HLT");
                break;
            }
            r => panic!("unexpected exit reason: {:?}", r),
        }
    }
}
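
// A minimal sanity check that the VM snapshot data section id keeps the
// "<id>-section" shape used by Snapshottable::snapshot() above. It only relies
// on the VM_SNAPSHOT_ID constant and std; the module and test names below are
// newly introduced for this check and are not part of the upstream test suite.
#[cfg(test)]
mod vm_snapshot_id_tests {
    use super::*;

    #[test]
    fn test_vm_snapshot_section_id() {
        assert_eq!(VM_SNAPSHOT_ID, "vm");
        assert_eq!(format!("{}-section", VM_SNAPSHOT_ID), "vm-section");
    }
}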