1 // Copyright © 2019 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 // 5 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 6 use std::collections::BTreeMap; 7 use std::collections::HashMap; 8 use std::fs::{File, OpenOptions}; 9 use std::io::{self}; 10 use std::ops::{BitAnd, Deref, Not, Sub}; 11 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 12 use std::os::fd::AsFd; 13 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; 14 use std::path::PathBuf; 15 use std::result; 16 use std::sync::{Arc, Barrier, Mutex}; 17 use std::{ffi, thread}; 18 19 use acpi_tables::{aml, Aml}; 20 use anyhow::anyhow; 21 #[cfg(target_arch = "x86_64")] 22 use arch::x86_64::{SgxEpcRegion, SgxEpcSection}; 23 use arch::RegionType; 24 #[cfg(target_arch = "x86_64")] 25 use devices::ioapic; 26 #[cfg(target_arch = "aarch64")] 27 use hypervisor::HypervisorVmError; 28 use libc::_SC_NPROCESSORS_ONLN; 29 #[cfg(target_arch = "x86_64")] 30 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE}; 31 use serde::{Deserialize, Serialize}; 32 use tracer::trace_scoped; 33 use virtio_devices::BlocksState; 34 #[cfg(target_arch = "x86_64")] 35 use vm_allocator::GsiApic; 36 use vm_allocator::{AddressAllocator, SystemAllocator}; 37 use vm_device::BusDevice; 38 use vm_memory::bitmap::AtomicBitmap; 39 use vm_memory::guest_memory::FileOffset; 40 use vm_memory::{ 41 mmap::MmapRegionError, Address, Error as MmapError, GuestAddress, GuestAddressSpace, 42 GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, 43 ReadVolatile, 44 }; 45 use vm_migration::{ 46 protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable, 47 Snapshot, SnapshotData, Snapshottable, Transportable, 48 }; 49 50 #[cfg(target_arch = "x86_64")] 51 use crate::config::SgxEpcConfig; 52 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig}; 53 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 54 use crate::coredump::{ 55 CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError, 56 }; 57 use crate::migration::url_to_path; 58 use crate::MEMORY_MANAGER_SNAPSHOT_ID; 59 use crate::{GuestMemoryMmap, GuestRegionMmap}; 60 61 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18; 62 63 const DEFAULT_MEMORY_ZONE: &str = "mem0"; 64 65 const SNAPSHOT_FILENAME: &str = "memory-ranges"; 66 67 #[cfg(target_arch = "x86_64")] 68 const X86_64_IRQ_BASE: u32 = 5; 69 70 #[cfg(target_arch = "x86_64")] 71 const SGX_PAGE_SIZE: u64 = 1 << 12; 72 73 const HOTPLUG_COUNT: usize = 8; 74 75 // Memory policy constants 76 const MPOL_BIND: u32 = 2; 77 const MPOL_MF_STRICT: u32 = 1; 78 const MPOL_MF_MOVE: u32 = 1 << 1; 79 80 // Reserve 1 MiB for platform MMIO devices (e.g. 
ACPI control devices) 81 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20; 82 83 const MAX_PREFAULT_THREAD_COUNT: usize = 16; 84 85 #[derive(Clone, Default, Serialize, Deserialize)] 86 struct HotPlugState { 87 base: u64, 88 length: u64, 89 active: bool, 90 inserting: bool, 91 removing: bool, 92 } 93 94 pub struct VirtioMemZone { 95 region: Arc<GuestRegionMmap>, 96 virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>, 97 hotplugged_size: u64, 98 hugepages: bool, 99 blocks_state: Arc<Mutex<BlocksState>>, 100 } 101 102 impl VirtioMemZone { 103 pub fn region(&self) -> &Arc<GuestRegionMmap> { 104 &self.region 105 } 106 pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) { 107 self.virtio_device = Some(virtio_device); 108 } 109 pub fn hotplugged_size(&self) -> u64 { 110 self.hotplugged_size 111 } 112 pub fn hugepages(&self) -> bool { 113 self.hugepages 114 } 115 pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> { 116 &self.blocks_state 117 } 118 pub fn plugged_ranges(&self) -> MemoryRangeTable { 119 self.blocks_state 120 .lock() 121 .unwrap() 122 .memory_ranges(self.region.start_addr().raw_value(), true) 123 } 124 } 125 126 #[derive(Default)] 127 pub struct MemoryZone { 128 regions: Vec<Arc<GuestRegionMmap>>, 129 virtio_mem_zone: Option<VirtioMemZone>, 130 } 131 132 impl MemoryZone { 133 pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> { 134 &self.regions 135 } 136 pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> { 137 &self.virtio_mem_zone 138 } 139 pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> { 140 self.virtio_mem_zone.as_mut() 141 } 142 } 143 144 pub type MemoryZones = HashMap<String, MemoryZone>; 145 146 #[derive(Clone, Serialize, Deserialize)] 147 struct GuestRamMapping { 148 slot: u32, 149 gpa: u64, 150 size: u64, 151 zone_id: String, 152 virtio_mem: bool, 153 file_offset: u64, 154 } 155 156 #[derive(Clone, Serialize, Deserialize)] 157 struct ArchMemRegion { 158 base: u64, 159 size: usize, 160 r_type: RegionType, 161 } 162 163 pub struct MemoryManager { 164 boot_guest_memory: GuestMemoryMmap, 165 guest_memory: GuestMemoryAtomic<GuestMemoryMmap>, 166 next_memory_slot: u32, 167 start_of_device_area: GuestAddress, 168 end_of_device_area: GuestAddress, 169 end_of_ram_area: GuestAddress, 170 pub vm: Arc<dyn hypervisor::Vm>, 171 hotplug_slots: Vec<HotPlugState>, 172 selected_slot: usize, 173 mergeable: bool, 174 allocator: Arc<Mutex<SystemAllocator>>, 175 hotplug_method: HotplugMethod, 176 boot_ram: u64, 177 current_ram: u64, 178 next_hotplug_slot: usize, 179 shared: bool, 180 hugepages: bool, 181 hugepage_size: Option<u64>, 182 prefault: bool, 183 thp: bool, 184 #[cfg(target_arch = "x86_64")] 185 sgx_epc_region: Option<SgxEpcRegion>, 186 user_provided_zones: bool, 187 snapshot_memory_ranges: MemoryRangeTable, 188 memory_zones: MemoryZones, 189 log_dirty: bool, // Enable dirty logging for created RAM regions 190 arch_mem_regions: Vec<ArchMemRegion>, 191 ram_allocator: AddressAllocator, 192 dynamic: bool, 193 194 // Keep track of calls to create_userspace_mapping() for guest RAM. 195 // This is useful for getting the dirty pages as we need to know the 196 // slots that the mapping is created in. 197 guest_ram_mappings: Vec<GuestRamMapping>, 198 199 pub acpi_address: Option<GuestAddress>, 200 #[cfg(target_arch = "aarch64")] 201 uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>, 202 } 203 204 #[derive(Debug)] 205 pub enum Error { 206 /// Failed to create shared file. 
    SharedFileCreate(io::Error),

    /// Failed to set shared file length.
    SharedFileSetLen(io::Error),

    /// Mmap backed guest memory error
    GuestMemory(MmapError),

    /// Failed to allocate a memory range.
    MemoryRangeAllocation,

    /// Error from region creation
    GuestMemoryRegion(MmapRegionError),

    /// No ACPI slot available
    NoSlotAvailable,

    /// Not enough space in the hotplug RAM region
    InsufficientHotplugRam,

    /// The requested hotplug memory addition is not a valid size
    InvalidSize,

    /// Failed to create the user memory region.
    CreateUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to remove the user memory region.
    RemoveUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to create EventFd.
    EventFdFail(io::Error),

    /// Eventfd write error
    EventfdError(io::Error),

    /// Failed to resize virtio-mem
    VirtioMemResizeFail(virtio_devices::mem::Error),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot restore VM because source URL is missing
    RestoreMissingSourceUrl,

    /// Cannot create the system allocator
    CreateSystemAllocator,

    /// Invalid SGX EPC section size
    #[cfg(target_arch = "x86_64")]
    EpcSectionSizeInvalid,

    /// Failed allocating SGX EPC region
    #[cfg(target_arch = "x86_64")]
    SgxEpcRangeAllocation,

    /// Failed opening SGX virtual EPC device
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcOpen(io::Error),

    /// Failed setting the SGX virtual EPC section size
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcFileSetLen(io::Error),

    /// Failed opening SGX provisioning device
    #[cfg(target_arch = "x86_64")]
    SgxProvisionOpen(io::Error),

    /// Failed enabling SGX provisioning
    #[cfg(target_arch = "x86_64")]
    SgxEnableProvisioning(hypervisor::HypervisorVmError),

    /// Failed creating a new MmapRegion instance.
    #[cfg(target_arch = "x86_64")]
    NewMmapRegion(vm_memory::mmap::MmapRegionError),

    /// No memory zones found.
    MissingMemoryZones,

    /// Memory configuration is not valid.
    InvalidMemoryParameters,

    /// Forbidden operation. Impossible to resize guest memory if it is
    /// backed by user defined memory regions.
    InvalidResizeWithMemoryZones,

    /// It's invalid to try applying a NUMA policy to a memory zone that is
    /// memory mapped with MAP_SHARED.
    InvalidSharedMemoryZoneWithHostNuma,

    /// Failed applying NUMA memory policy.
    ApplyNumaPolicy(io::Error),

    /// Memory zone identifier is not unique.
    DuplicateZoneId,

    /// No virtio-mem resizing handler found.
    MissingVirtioMemHandler,

    /// Unknown memory zone.
    UnknownMemoryZone,

    /// Invalid size for resizing. It can be anything except 0.
    InvalidHotplugSize,

    /// Invalid hotplug method associated with memory zones resizing capability.
    InvalidHotplugMethodWithMemoryZones,

    /// Could not find the specified memory zone identifier in the hash map.
    MissingZoneIdentifier,

    /// Resizing the memory zone failed.
    ResizeZone,

    /// Guest address overflow
    GuestAddressOverFlow,

    /// Error opening snapshot file
    SnapshotOpen(io::Error),

    /// Error copying snapshot into region
    SnapshotCopy(GuestMemoryError),

    /// Failed to allocate MMIO address
    AllocateMmioAddress,

    #[cfg(target_arch = "aarch64")]
    /// Failed to create UEFI flash
    CreateUefiFlash(HypervisorVmError),

    /// Using a directory as a backing file for memory is not supported
    DirectoryAsBackingFileForMemory,

    /// Failed to stat filesystem
    GetFileSystemBlockSize(io::Error),

    /// Memory size is misaligned with default page size or its hugepage size
    MisalignedMemorySize,
}

const ENABLE_FLAG: usize = 0;
const INSERTING_FLAG: usize = 1;
const REMOVING_FLAG: usize = 2;
const EJECT_FLAG: usize = 3;

const BASE_OFFSET_LOW: u64 = 0;
const BASE_OFFSET_HIGH: u64 = 0x4;
const LENGTH_OFFSET_LOW: u64 = 0x8;
const LENGTH_OFFSET_HIGH: u64 = 0xC;
const STATUS_OFFSET: u64 = 0x14;
const SELECTION_OFFSET: u64 = 0;

// The MMIO address space size is reduced by 64 KiB. This is done for the
// following reasons:
// - Reduce the addressable space size by at least 4 KiB to work around a
//   Linux bug when the VMM allocates devices at the end of the addressable
//   space
// - Windows requires the addressable space size to be 64 KiB aligned
fn mmio_address_space_size(phys_bits: u8) -> u64 {
    (1 << phys_bits) - (1 << 16)
}

// The `statfs` call reports information about a hugetlbfs mount, with the
// hugepage size returned in the `f_bsize` field.
//
// See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
    let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
    let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();

    // SAFETY: FFI call with a valid path and buffer
    let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
    if ret != 0 {
        return Err(Error::GetFileSystemBlockSize(
            std::io::Error::last_os_error(),
        ));
    }

    // SAFETY: `buf` is valid at this point
    // Because this value is always positive, just convert it directly.
    // Note that `f_bsize` is `i64` in glibc and `u64` in musl, so using `as u64`
    // triggers a `clippy` warning on musl targets. To avoid the warning, use
    // `as _` instead of `as u64`.
    let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
    Ok(bsize)
}

fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
    // SAFETY: FFI call. Trivially safe.
    let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };

    // There is no backing file and `hugepages` is disabled, so just use the
    // system page size.
    if zone.file.is_none() && !zone.hugepages {
        return Ok(page_size);
    }

    // `hugepages` is enabled and `hugepage_size` is specified, so use it directly.
    if zone.hugepages && zone.hugepage_size.is_some() {
        return Ok(zone.hugepage_size.unwrap());
    }

    // There are two scenarios here:
    // - `hugepages` is enabled but `hugepage_size` is not specified:
    //   Call `statfs` on `/dev/hugepages` to get the default hugepage size
    // - The backing file is specified:
    //   Call `statfs` for the file and get its `f_bsize`.
If the value is larger than the page 411 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the 412 // value is less than or equal to the page size, just use the page size. 413 let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| { 414 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters) 415 })?; 416 417 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?); 418 419 Ok(align_size) 420 } 421 422 #[inline] 423 fn align_down<T>(val: T, align: T) -> T 424 where 425 T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>, 426 { 427 val & !(align - 1u8.into()) 428 } 429 430 #[inline] 431 fn is_aligned<T>(val: T, align: T) -> bool 432 where 433 T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq, 434 { 435 (val & (align - 1u8.into())) == 0u8.into() 436 } 437 438 impl BusDevice for MemoryManager { 439 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 440 if self.selected_slot < self.hotplug_slots.len() { 441 let state = &self.hotplug_slots[self.selected_slot]; 442 match offset { 443 BASE_OFFSET_LOW => { 444 data.copy_from_slice(&state.base.to_le_bytes()[..4]); 445 } 446 BASE_OFFSET_HIGH => { 447 data.copy_from_slice(&state.base.to_le_bytes()[4..]); 448 } 449 LENGTH_OFFSET_LOW => { 450 data.copy_from_slice(&state.length.to_le_bytes()[..4]); 451 } 452 LENGTH_OFFSET_HIGH => { 453 data.copy_from_slice(&state.length.to_le_bytes()[4..]); 454 } 455 STATUS_OFFSET => { 456 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 457 data.fill(0); 458 if state.active { 459 data[0] |= 1 << ENABLE_FLAG; 460 } 461 if state.inserting { 462 data[0] |= 1 << INSERTING_FLAG; 463 } 464 if state.removing { 465 data[0] |= 1 << REMOVING_FLAG; 466 } 467 } 468 _ => { 469 warn!( 470 "Unexpected offset for accessing memory manager device: {:#}", 471 offset 472 ); 473 } 474 } 475 } else { 476 warn!("Out of range memory slot: {}", self.selected_slot); 477 } 478 } 479 480 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 481 match offset { 482 SELECTION_OFFSET => { 483 self.selected_slot = usize::from(data[0]); 484 } 485 STATUS_OFFSET => { 486 if self.selected_slot < self.hotplug_slots.len() { 487 let state = &mut self.hotplug_slots[self.selected_slot]; 488 // The ACPI code writes back a 1 to acknowledge the insertion 489 if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting { 490 state.inserting = false; 491 } 492 // Ditto for removal 493 if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing { 494 state.removing = false; 495 } 496 // Trigger removal of "DIMM" 497 if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG { 498 warn!("Ejection of memory not currently supported"); 499 } 500 } else { 501 warn!("Out of range memory slot: {}", self.selected_slot); 502 } 503 } 504 _ => { 505 warn!( 506 "Unexpected offset for accessing memory manager device: {:#}", 507 offset 508 ); 509 } 510 }; 511 None 512 } 513 } 514 515 impl MemoryManager { 516 /// Creates all memory regions based on the available RAM ranges defined 517 /// by `ram_regions`, and based on the description of the memory zones. 518 /// In practice, this function can perform multiple memory mappings of the 519 /// same backing file if there's a hole in the address space between two 520 /// RAM ranges. 521 /// 522 /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G) 523 /// and zones containing two zones (size 1G and size 4G). 
524 /// 525 /// This function will create 3 resulting memory regions: 526 /// - First one mapping entirely the first memory zone on 0-1G range 527 /// - Second one mapping partially the second memory zone on 1G-3G range 528 /// - Third one mapping partially the second memory zone on 4G-6G range 529 /// 530 /// Also, all memory regions are page-size aligned (e.g. their sizes must 531 /// be multiple of page-size), which may leave an additional hole in the 532 /// address space when hugepage is used. 533 fn create_memory_regions_from_zones( 534 ram_regions: &[(GuestAddress, usize)], 535 zones: &[MemoryZoneConfig], 536 prefault: Option<bool>, 537 thp: bool, 538 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 539 let mut zone_iter = zones.iter(); 540 let mut mem_regions = Vec::new(); 541 let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?; 542 let mut zone_align_size = memory_zone_get_align_size(zone)?; 543 let mut zone_offset = 0u64; 544 let mut memory_zones = HashMap::new(); 545 546 if !is_aligned(zone.size, zone_align_size) { 547 return Err(Error::MisalignedMemorySize); 548 } 549 550 // Add zone id to the list of memory zones. 551 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 552 553 for ram_region in ram_regions.iter() { 554 let mut ram_region_offset = 0; 555 let mut exit = false; 556 557 loop { 558 let mut ram_region_consumed = false; 559 let mut pull_next_zone = false; 560 561 let ram_region_available_size = 562 align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size); 563 if ram_region_available_size == 0 { 564 break; 565 } 566 let zone_sub_size = zone.size - zone_offset; 567 568 let file_offset = zone_offset; 569 let region_start = ram_region 570 .0 571 .checked_add(ram_region_offset) 572 .ok_or(Error::GuestAddressOverFlow)?; 573 let region_size = if zone_sub_size <= ram_region_available_size { 574 if zone_sub_size == ram_region_available_size { 575 ram_region_consumed = true; 576 } 577 578 ram_region_offset += zone_sub_size; 579 pull_next_zone = true; 580 581 zone_sub_size 582 } else { 583 zone_offset += ram_region_available_size; 584 ram_region_consumed = true; 585 586 ram_region_available_size 587 }; 588 589 info!( 590 "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}", 591 zone.id, 592 region_start.raw_value(), 593 region_size 594 ); 595 let region = MemoryManager::create_ram_region( 596 &zone.file, 597 file_offset, 598 region_start, 599 region_size as usize, 600 prefault.unwrap_or(zone.prefault), 601 zone.shared, 602 zone.hugepages, 603 zone.hugepage_size, 604 zone.host_numa_node, 605 None, 606 thp, 607 )?; 608 609 // Add region to the list of regions associated with the 610 // current memory zone. 611 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 612 memory_zone.regions.push(region.clone()); 613 } 614 615 mem_regions.push(region); 616 617 if pull_next_zone { 618 // Get the next zone and reset the offset. 619 zone_offset = 0; 620 if let Some(z) = zone_iter.next() { 621 zone = z; 622 } else { 623 exit = true; 624 break; 625 } 626 zone_align_size = memory_zone_get_align_size(zone)?; 627 if !is_aligned(zone.size, zone_align_size) { 628 return Err(Error::MisalignedMemorySize); 629 } 630 631 // Check if zone id already exist. In case it does, throw 632 // an error as we need unique identifiers. Otherwise, add 633 // the new zone id to the list of memory zones. 634 if memory_zones.contains_key(&zone.id) { 635 error!( 636 "Memory zone identifier '{}' found more than once. 
\ 637 It must be unique", 638 zone.id, 639 ); 640 return Err(Error::DuplicateZoneId); 641 } 642 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 643 } 644 645 if ram_region_consumed { 646 break; 647 } 648 } 649 650 if exit { 651 break; 652 } 653 } 654 655 Ok((mem_regions, memory_zones)) 656 } 657 658 // Restore both GuestMemory regions along with MemoryZone zones. 659 fn restore_memory_regions_and_zones( 660 guest_ram_mappings: &[GuestRamMapping], 661 zones_config: &[MemoryZoneConfig], 662 prefault: Option<bool>, 663 mut existing_memory_files: HashMap<u32, File>, 664 thp: bool, 665 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 666 let mut memory_regions = Vec::new(); 667 let mut memory_zones = HashMap::new(); 668 669 for zone_config in zones_config { 670 memory_zones.insert(zone_config.id.clone(), MemoryZone::default()); 671 } 672 673 for guest_ram_mapping in guest_ram_mappings { 674 for zone_config in zones_config { 675 if guest_ram_mapping.zone_id == zone_config.id { 676 let region = MemoryManager::create_ram_region( 677 if guest_ram_mapping.virtio_mem { 678 &None 679 } else { 680 &zone_config.file 681 }, 682 guest_ram_mapping.file_offset, 683 GuestAddress(guest_ram_mapping.gpa), 684 guest_ram_mapping.size as usize, 685 prefault.unwrap_or(zone_config.prefault), 686 zone_config.shared, 687 zone_config.hugepages, 688 zone_config.hugepage_size, 689 zone_config.host_numa_node, 690 existing_memory_files.remove(&guest_ram_mapping.slot), 691 thp, 692 )?; 693 memory_regions.push(Arc::clone(®ion)); 694 if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) { 695 if guest_ram_mapping.virtio_mem { 696 let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0); 697 let region_size = region.len(); 698 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 699 region, 700 virtio_device: None, 701 hotplugged_size, 702 hugepages: zone_config.hugepages, 703 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 704 }); 705 } else { 706 memory_zone.regions.push(region); 707 } 708 } 709 } 710 } 711 } 712 713 memory_regions.sort_by_key(|x| x.start_addr()); 714 715 Ok((memory_regions, memory_zones)) 716 } 717 718 fn fill_saved_regions( 719 &mut self, 720 file_path: PathBuf, 721 saved_regions: MemoryRangeTable, 722 ) -> Result<(), Error> { 723 if saved_regions.is_empty() { 724 return Ok(()); 725 } 726 727 // Open (read only) the snapshot file. 728 let mut memory_file = OpenOptions::new() 729 .read(true) 730 .open(file_path) 731 .map_err(Error::SnapshotOpen)?; 732 733 let guest_memory = self.guest_memory.memory(); 734 for range in saved_regions.regions() { 735 let mut offset: u64 = 0; 736 // Here we are manually handling the retry in case we can't write 737 // the whole region at once because we can't use the implementation 738 // from vm-memory::GuestMemory of read_exact_from() as it is not 739 // following the correct behavior. 
For more info about this issue 740 // see: https://github.com/rust-vmm/vm-memory/issues/174 741 loop { 742 let bytes_read = guest_memory 743 .read_volatile_from( 744 GuestAddress(range.gpa + offset), 745 &mut memory_file, 746 (range.length - offset) as usize, 747 ) 748 .map_err(Error::SnapshotCopy)?; 749 offset += bytes_read as u64; 750 751 if offset == range.length { 752 break; 753 } 754 } 755 } 756 757 Ok(()) 758 } 759 760 fn validate_memory_config( 761 config: &MemoryConfig, 762 user_provided_zones: bool, 763 ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> { 764 let mut allow_mem_hotplug = false; 765 766 if !user_provided_zones { 767 if config.zones.is_some() { 768 error!( 769 "User defined memory regions can't be provided if the \ 770 memory size is not 0" 771 ); 772 return Err(Error::InvalidMemoryParameters); 773 } 774 775 if config.hotplug_size.is_some() { 776 allow_mem_hotplug = true; 777 } 778 779 if let Some(hotplugged_size) = config.hotplugged_size { 780 if let Some(hotplug_size) = config.hotplug_size { 781 if hotplugged_size > hotplug_size { 782 error!( 783 "'hotplugged_size' {} can't be bigger than \ 784 'hotplug_size' {}", 785 hotplugged_size, hotplug_size, 786 ); 787 return Err(Error::InvalidMemoryParameters); 788 } 789 } else { 790 error!( 791 "Invalid to define 'hotplugged_size' when there is\ 792 no 'hotplug_size'" 793 ); 794 return Err(Error::InvalidMemoryParameters); 795 } 796 if config.hotplug_method == HotplugMethod::Acpi { 797 error!( 798 "Invalid to define 'hotplugged_size' with hotplug \ 799 method 'acpi'" 800 ); 801 return Err(Error::InvalidMemoryParameters); 802 } 803 } 804 805 // Create a single zone from the global memory config. This lets 806 // us reuse the codepath for user defined memory zones. 807 let zones = vec![MemoryZoneConfig { 808 id: String::from(DEFAULT_MEMORY_ZONE), 809 size: config.size, 810 file: None, 811 shared: config.shared, 812 hugepages: config.hugepages, 813 hugepage_size: config.hugepage_size, 814 host_numa_node: None, 815 hotplug_size: config.hotplug_size, 816 hotplugged_size: config.hotplugged_size, 817 prefault: config.prefault, 818 }]; 819 820 Ok((config.size, zones, allow_mem_hotplug)) 821 } else { 822 if config.zones.is_none() { 823 error!( 824 "User defined memory regions must be provided if the \ 825 memory size is 0" 826 ); 827 return Err(Error::MissingMemoryZones); 828 } 829 830 // Safe to unwrap as we checked right above there were some 831 // regions. 
832 let zones = config.zones.clone().unwrap(); 833 if zones.is_empty() { 834 return Err(Error::MissingMemoryZones); 835 } 836 837 let mut total_ram_size: u64 = 0; 838 for zone in zones.iter() { 839 total_ram_size += zone.size; 840 841 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() { 842 error!( 843 "Invalid to set host NUMA policy for a memory zone \ 844 backed by a regular file and mapped as 'shared'" 845 ); 846 return Err(Error::InvalidSharedMemoryZoneWithHostNuma); 847 } 848 849 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi { 850 error!("Invalid to set ACPI hotplug method for memory zones"); 851 return Err(Error::InvalidHotplugMethodWithMemoryZones); 852 } 853 854 if let Some(hotplugged_size) = zone.hotplugged_size { 855 if let Some(hotplug_size) = zone.hotplug_size { 856 if hotplugged_size > hotplug_size { 857 error!( 858 "'hotplugged_size' {} can't be bigger than \ 859 'hotplug_size' {}", 860 hotplugged_size, hotplug_size, 861 ); 862 return Err(Error::InvalidMemoryParameters); 863 } 864 } else { 865 error!( 866 "Invalid to define 'hotplugged_size' when there is\ 867 no 'hotplug_size' for a memory zone" 868 ); 869 return Err(Error::InvalidMemoryParameters); 870 } 871 if config.hotplug_method == HotplugMethod::Acpi { 872 error!( 873 "Invalid to define 'hotplugged_size' with hotplug \ 874 method 'acpi'" 875 ); 876 return Err(Error::InvalidMemoryParameters); 877 } 878 } 879 } 880 881 Ok((total_ram_size, zones, allow_mem_hotplug)) 882 } 883 } 884 885 pub fn allocate_address_space(&mut self) -> Result<(), Error> { 886 let mut list = Vec::new(); 887 888 for (zone_id, memory_zone) in self.memory_zones.iter() { 889 let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> = 890 memory_zone 891 .regions() 892 .iter() 893 .map(|r| (r.clone(), false)) 894 .collect(); 895 896 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 897 regions.push((virtio_mem_zone.region().clone(), true)); 898 } 899 900 list.push((zone_id.clone(), regions)); 901 } 902 903 for (zone_id, regions) in list { 904 for (region, virtio_mem) in regions { 905 let slot = self.create_userspace_mapping( 906 region.start_addr().raw_value(), 907 region.len(), 908 region.as_ptr() as u64, 909 self.mergeable, 910 false, 911 self.log_dirty, 912 )?; 913 914 let file_offset = if let Some(file_offset) = region.file_offset() { 915 file_offset.start() 916 } else { 917 0 918 }; 919 920 self.guest_ram_mappings.push(GuestRamMapping { 921 gpa: region.start_addr().raw_value(), 922 size: region.len(), 923 slot, 924 zone_id: zone_id.clone(), 925 virtio_mem, 926 file_offset, 927 }); 928 self.ram_allocator 929 .allocate(Some(region.start_addr()), region.len(), None) 930 .ok_or(Error::MemoryRangeAllocation)?; 931 } 932 } 933 934 // Allocate SubRegion and Reserved address ranges. 935 for region in self.arch_mem_regions.iter() { 936 if region.r_type == RegionType::Ram { 937 // Ignore the RAM type since ranges have already been allocated 938 // based on the GuestMemory regions. 939 continue; 940 } 941 self.ram_allocator 942 .allocate( 943 Some(GuestAddress(region.base)), 944 region.size as GuestUsize, 945 None, 946 ) 947 .ok_or(Error::MemoryRangeAllocation)?; 948 } 949 950 Ok(()) 951 } 952 953 #[cfg(target_arch = "aarch64")] 954 fn add_uefi_flash(&mut self) -> Result<(), Error> { 955 // On AArch64, the UEFI binary requires a flash device at address 0. 956 // 4 MiB memory is mapped to simulate the flash. 
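        // Note that the flash region created below is registered with the
        // hypervisor as its own user memory region and kept in
        // `self.uefi_flash`; it is not inserted into `guest_memory`, so it is
        // not treated as regular guest RAM.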
957 let uefi_mem_slot = self.allocate_memory_slot(); 958 let uefi_region = GuestRegionMmap::new( 959 MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(), 960 arch::layout::UEFI_START, 961 ) 962 .unwrap(); 963 let uefi_mem_region = self.vm.make_user_memory_region( 964 uefi_mem_slot, 965 uefi_region.start_addr().raw_value(), 966 uefi_region.len(), 967 uefi_region.as_ptr() as u64, 968 false, 969 false, 970 ); 971 self.vm 972 .create_user_memory_region(uefi_mem_region) 973 .map_err(Error::CreateUefiFlash)?; 974 975 let uefi_flash = 976 GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap()); 977 978 self.uefi_flash = Some(uefi_flash); 979 980 Ok(()) 981 } 982 983 #[allow(clippy::too_many_arguments)] 984 pub fn new( 985 vm: Arc<dyn hypervisor::Vm>, 986 config: &MemoryConfig, 987 prefault: Option<bool>, 988 phys_bits: u8, 989 #[cfg(feature = "tdx")] tdx_enabled: bool, 990 restore_data: Option<&MemoryManagerSnapshotData>, 991 existing_memory_files: Option<HashMap<u32, File>>, 992 #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>, 993 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 994 trace_scoped!("MemoryManager::new"); 995 996 let user_provided_zones = config.size == 0; 997 998 let mmio_address_space_size = mmio_address_space_size(phys_bits); 999 debug_assert_eq!( 1000 (((mmio_address_space_size) >> 16) << 16), 1001 mmio_address_space_size 1002 ); 1003 let start_of_platform_device_area = 1004 GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE); 1005 let end_of_device_area = start_of_platform_device_area.unchecked_sub(1); 1006 1007 let (ram_size, zones, allow_mem_hotplug) = 1008 Self::validate_memory_config(config, user_provided_zones)?; 1009 1010 let ( 1011 start_of_device_area, 1012 boot_ram, 1013 current_ram, 1014 arch_mem_regions, 1015 memory_zones, 1016 guest_memory, 1017 boot_guest_memory, 1018 hotplug_slots, 1019 next_memory_slot, 1020 selected_slot, 1021 next_hotplug_slot, 1022 ) = if let Some(data) = restore_data { 1023 let (regions, memory_zones) = Self::restore_memory_regions_and_zones( 1024 &data.guest_ram_mappings, 1025 &zones, 1026 prefault, 1027 existing_memory_files.unwrap_or_default(), 1028 config.thp, 1029 )?; 1030 let guest_memory = 1031 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?; 1032 let boot_guest_memory = guest_memory.clone(); 1033 ( 1034 GuestAddress(data.start_of_device_area), 1035 data.boot_ram, 1036 data.current_ram, 1037 data.arch_mem_regions.clone(), 1038 memory_zones, 1039 guest_memory, 1040 boot_guest_memory, 1041 data.hotplug_slots.clone(), 1042 data.next_memory_slot, 1043 data.selected_slot, 1044 data.next_hotplug_slot, 1045 ) 1046 } else { 1047 // Init guest memory 1048 let arch_mem_regions = arch::arch_memory_regions(); 1049 1050 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions 1051 .iter() 1052 .filter(|r| r.2 == RegionType::Ram) 1053 .map(|r| (r.0, r.1)) 1054 .collect(); 1055 1056 let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions 1057 .iter() 1058 .map(|(a, b, c)| ArchMemRegion { 1059 base: a.0, 1060 size: *b, 1061 r_type: *c, 1062 }) 1063 .collect(); 1064 1065 let (mem_regions, mut memory_zones) = 1066 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?; 1067 1068 let mut guest_memory = 1069 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?; 1070 1071 let boot_guest_memory = guest_memory.clone(); 1072 1073 let mut start_of_device_area = 1074 
MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?; 1075 1076 // Update list of memory zones for resize. 1077 for zone in zones.iter() { 1078 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 1079 if let Some(hotplug_size) = zone.hotplug_size { 1080 if hotplug_size == 0 { 1081 error!("'hotplug_size' can't be 0"); 1082 return Err(Error::InvalidHotplugSize); 1083 } 1084 1085 if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi { 1086 start_of_device_area = start_of_device_area 1087 .checked_add(hotplug_size) 1088 .ok_or(Error::GuestAddressOverFlow)?; 1089 } else { 1090 // Alignment must be "natural" i.e. same as size of block 1091 let start_addr = GuestAddress( 1092 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE 1093 - 1) 1094 / virtio_devices::VIRTIO_MEM_ALIGN_SIZE 1095 * virtio_devices::VIRTIO_MEM_ALIGN_SIZE, 1096 ); 1097 1098 // When `prefault` is set by vm_restore, memory manager 1099 // will create ram region with `prefault` option in 1100 // restore config rather than same option in zone 1101 let region = MemoryManager::create_ram_region( 1102 &None, 1103 0, 1104 start_addr, 1105 hotplug_size as usize, 1106 prefault.unwrap_or(zone.prefault), 1107 zone.shared, 1108 zone.hugepages, 1109 zone.hugepage_size, 1110 zone.host_numa_node, 1111 None, 1112 config.thp, 1113 )?; 1114 1115 guest_memory = guest_memory 1116 .insert_region(Arc::clone(®ion)) 1117 .map_err(Error::GuestMemory)?; 1118 1119 let hotplugged_size = zone.hotplugged_size.unwrap_or(0); 1120 let region_size = region.len(); 1121 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 1122 region, 1123 virtio_device: None, 1124 hotplugged_size, 1125 hugepages: zone.hugepages, 1126 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 1127 }); 1128 1129 start_of_device_area = start_addr 1130 .checked_add(hotplug_size) 1131 .ok_or(Error::GuestAddressOverFlow)?; 1132 } 1133 } 1134 } else { 1135 return Err(Error::MissingZoneIdentifier); 1136 } 1137 } 1138 1139 let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT); 1140 hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default); 1141 1142 ( 1143 start_of_device_area, 1144 ram_size, 1145 ram_size, 1146 arch_mem_regions, 1147 memory_zones, 1148 guest_memory, 1149 boot_guest_memory, 1150 hotplug_slots, 1151 0, 1152 0, 1153 0, 1154 ) 1155 }; 1156 1157 let guest_memory = GuestMemoryAtomic::new(guest_memory); 1158 1159 // Both MMIO and PIO address spaces start at address 0. 
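        // A rough sketch of what is handed to the SystemAllocator below (the
        // PIO and GsiApic arguments are compiled out on non-x86_64 targets):
        //   - PIO space: ports 0x0000..0x1_0000 (1 << 16)
        //   - Platform MMIO space: start_of_platform_device_area ..
        //     start_of_platform_device_area + PLATFORM_DEVICE_AREA_SIZE
        //   - A GsiApic covering legacy IRQs from X86_64_IRQ_BASE up to the
        //     IOAPIC pin count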
1160 let allocator = Arc::new(Mutex::new( 1161 SystemAllocator::new( 1162 #[cfg(target_arch = "x86_64")] 1163 { 1164 GuestAddress(0) 1165 }, 1166 #[cfg(target_arch = "x86_64")] 1167 { 1168 1 << 16 1169 }, 1170 start_of_platform_device_area, 1171 PLATFORM_DEVICE_AREA_SIZE, 1172 #[cfg(target_arch = "x86_64")] 1173 vec![GsiApic::new( 1174 X86_64_IRQ_BASE, 1175 ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, 1176 )], 1177 ) 1178 .ok_or(Error::CreateSystemAllocator)?, 1179 )); 1180 1181 #[cfg(not(feature = "tdx"))] 1182 let dynamic = true; 1183 #[cfg(feature = "tdx")] 1184 let dynamic = !tdx_enabled; 1185 1186 let acpi_address = if dynamic 1187 && config.hotplug_method == HotplugMethod::Acpi 1188 && (config.hotplug_size.unwrap_or_default() > 0) 1189 { 1190 Some( 1191 allocator 1192 .lock() 1193 .unwrap() 1194 .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None) 1195 .ok_or(Error::AllocateMmioAddress)?, 1196 ) 1197 } else { 1198 None 1199 }; 1200 1201 // If running on SGX the start of device area and RAM area may diverge but 1202 // at this point they are next to each other. 1203 let end_of_ram_area = start_of_device_area.unchecked_sub(1); 1204 let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap(); 1205 1206 let mut memory_manager = MemoryManager { 1207 boot_guest_memory, 1208 guest_memory, 1209 next_memory_slot, 1210 start_of_device_area, 1211 end_of_device_area, 1212 end_of_ram_area, 1213 vm, 1214 hotplug_slots, 1215 selected_slot, 1216 mergeable: config.mergeable, 1217 allocator, 1218 hotplug_method: config.hotplug_method, 1219 boot_ram, 1220 current_ram, 1221 next_hotplug_slot, 1222 shared: config.shared, 1223 hugepages: config.hugepages, 1224 hugepage_size: config.hugepage_size, 1225 prefault: config.prefault, 1226 #[cfg(target_arch = "x86_64")] 1227 sgx_epc_region: None, 1228 user_provided_zones, 1229 snapshot_memory_ranges: MemoryRangeTable::default(), 1230 memory_zones, 1231 guest_ram_mappings: Vec::new(), 1232 acpi_address, 1233 log_dirty: dynamic, // Cannot log dirty pages on a TD 1234 arch_mem_regions, 1235 ram_allocator, 1236 dynamic, 1237 #[cfg(target_arch = "aarch64")] 1238 uefi_flash: None, 1239 thp: config.thp, 1240 }; 1241 1242 #[cfg(target_arch = "aarch64")] 1243 { 1244 // For Aarch64 we cannot lazily allocate the address space like we 1245 // do for x86, because while restoring a VM from snapshot we would 1246 // need the address space to be allocated to properly restore VGIC. 1247 // And the restore of VGIC happens before we attempt to run the vCPUs 1248 // for the first time, thus we need to allocate the address space 1249 // beforehand. 
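            // Concretely, allocate_address_space() registers every RAM region
            // with the hypervisor and records it in `guest_ram_mappings` and
            // the RAM allocator, and add_uefi_flash() then maps the flash
            // region expected by the UEFI firmware.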
1250 memory_manager.allocate_address_space()?; 1251 memory_manager.add_uefi_flash()?; 1252 } 1253 1254 #[cfg(target_arch = "x86_64")] 1255 if let Some(sgx_epc_config) = sgx_epc_config { 1256 memory_manager.setup_sgx(sgx_epc_config)?; 1257 } 1258 1259 Ok(Arc::new(Mutex::new(memory_manager))) 1260 } 1261 1262 pub fn new_from_snapshot( 1263 snapshot: &Snapshot, 1264 vm: Arc<dyn hypervisor::Vm>, 1265 config: &MemoryConfig, 1266 source_url: Option<&str>, 1267 prefault: bool, 1268 phys_bits: u8, 1269 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 1270 if let Some(source_url) = source_url { 1271 let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?; 1272 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 1273 1274 let mem_snapshot: MemoryManagerSnapshotData = 1275 snapshot.to_state().map_err(Error::Restore)?; 1276 1277 let mm = MemoryManager::new( 1278 vm, 1279 config, 1280 Some(prefault), 1281 phys_bits, 1282 #[cfg(feature = "tdx")] 1283 false, 1284 Some(&mem_snapshot), 1285 None, 1286 #[cfg(target_arch = "x86_64")] 1287 None, 1288 )?; 1289 1290 mm.lock() 1291 .unwrap() 1292 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?; 1293 1294 Ok(mm) 1295 } else { 1296 Err(Error::RestoreMissingSourceUrl) 1297 } 1298 } 1299 1300 fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> { 1301 // SAFETY: FFI call with correct arguments 1302 let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) }; 1303 1304 if res < 0 { 1305 Err(io::Error::last_os_error()) 1306 } else { 1307 Ok(res as RawFd) 1308 } 1309 } 1310 1311 fn mbind( 1312 addr: *mut u8, 1313 len: u64, 1314 mode: u32, 1315 nodemask: Vec<u64>, 1316 maxnode: u64, 1317 flags: u32, 1318 ) -> Result<(), io::Error> { 1319 // SAFETY: FFI call with correct arguments 1320 let res = unsafe { 1321 libc::syscall( 1322 libc::SYS_mbind, 1323 addr as *mut libc::c_void, 1324 len, 1325 mode, 1326 nodemask.as_ptr(), 1327 maxnode, 1328 flags, 1329 ) 1330 }; 1331 1332 if res < 0 { 1333 Err(io::Error::last_os_error()) 1334 } else { 1335 Ok(()) 1336 } 1337 } 1338 1339 fn create_anonymous_file( 1340 size: usize, 1341 hugepages: bool, 1342 hugepage_size: Option<u64>, 1343 ) -> Result<FileOffset, Error> { 1344 let fd = Self::memfd_create( 1345 &ffi::CString::new("ch_ram").unwrap(), 1346 libc::MFD_CLOEXEC 1347 | if hugepages { 1348 libc::MFD_HUGETLB 1349 | if let Some(hugepage_size) = hugepage_size { 1350 /* 1351 * From the Linux kernel: 1352 * Several system calls take a flag to request "hugetlb" huge pages. 1353 * Without further specification, these system calls will use the 1354 * system's default huge page size. If a system supports multiple 1355 * huge page sizes, the desired huge page size can be specified in 1356 * bits [26:31] of the flag arguments. The value in these 6 bits 1357 * will encode the log2 of the huge page size. 
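                             *
                             * For example (illustrative values): with a
                             * hugepage_size of 2 MiB, trailing_zeros() yields
                             * 21, so the expression below contributes 21 << 26
                             * to the flags, which matches the kernel's
                             * MFD_HUGE_2MB encoding.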
1358 */ 1359 1360 hugepage_size.trailing_zeros() << 26 1361 } else { 1362 // Use the system default huge page size 1363 0 1364 } 1365 } else { 1366 0 1367 }, 1368 ) 1369 .map_err(Error::SharedFileCreate)?; 1370 1371 // SAFETY: fd is valid 1372 let f = unsafe { File::from_raw_fd(fd) }; 1373 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?; 1374 1375 Ok(FileOffset::new(f, 0)) 1376 } 1377 1378 fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> { 1379 if backing_file.is_dir() { 1380 Err(Error::DirectoryAsBackingFileForMemory) 1381 } else { 1382 let f = OpenOptions::new() 1383 .read(true) 1384 .write(true) 1385 .open(backing_file) 1386 .map_err(Error::SharedFileCreate)?; 1387 1388 Ok(FileOffset::new(f, file_offset)) 1389 } 1390 } 1391 1392 #[allow(clippy::too_many_arguments)] 1393 pub fn create_ram_region( 1394 backing_file: &Option<PathBuf>, 1395 file_offset: u64, 1396 start_addr: GuestAddress, 1397 size: usize, 1398 prefault: bool, 1399 shared: bool, 1400 hugepages: bool, 1401 hugepage_size: Option<u64>, 1402 host_numa_node: Option<u32>, 1403 existing_memory_file: Option<File>, 1404 thp: bool, 1405 ) -> Result<Arc<GuestRegionMmap>, Error> { 1406 let mut mmap_flags = libc::MAP_NORESERVE; 1407 1408 // The duplication of mmap_flags ORing here is unfortunate but it also makes 1409 // the complexity of the handling clear. 1410 let fo = if let Some(f) = existing_memory_file { 1411 // It must be MAP_SHARED as we wouldn't already have an FD 1412 mmap_flags |= libc::MAP_SHARED; 1413 Some(FileOffset::new(f, file_offset)) 1414 } else if let Some(backing_file) = backing_file { 1415 if shared { 1416 mmap_flags |= libc::MAP_SHARED; 1417 } else { 1418 mmap_flags |= libc::MAP_PRIVATE; 1419 } 1420 Some(Self::open_backing_file(backing_file, file_offset)?) 1421 } else if shared || hugepages { 1422 // For hugepages we must also MAP_SHARED otherwise we will trigger #4805 1423 // because the MAP_PRIVATE will trigger CoW against the backing file with 1424 // the VFIO pinning 1425 mmap_flags |= libc::MAP_SHARED; 1426 Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?) 1427 } else { 1428 mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS; 1429 None 1430 }; 1431 1432 let region = GuestRegionMmap::new( 1433 MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags) 1434 .map_err(Error::GuestMemoryRegion)?, 1435 start_addr, 1436 ) 1437 .map_err(Error::GuestMemory)?; 1438 1439 // Apply NUMA policy if needed. 1440 if let Some(node) = host_numa_node { 1441 let addr = region.deref().as_ptr(); 1442 let len = region.deref().size() as u64; 1443 let mode = MPOL_BIND; 1444 let mut nodemask: Vec<u64> = Vec::new(); 1445 let flags = MPOL_MF_STRICT | MPOL_MF_MOVE; 1446 1447 // Linux is kind of buggy in the way it interprets maxnode as it 1448 // will cut off the last node. That's why we have to add 1 to what 1449 // we would consider as the proper maxnode value. 1450 let maxnode = node as u64 + 1 + 1; 1451 1452 // Allocate the right size for the vector. 1453 nodemask.resize((node as usize / 64) + 1, 0); 1454 1455 // Fill the global bitmask through the nodemask vector. 1456 let idx = (node / 64) as usize; 1457 let shift = node % 64; 1458 nodemask[idx] |= 1u64 << shift; 1459 1460 // Policies are enforced by using MPOL_MF_MOVE flag as it will 1461 // force the kernel to move all pages that might have been already 1462 // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is 1463 // used to throw an error if MPOL_MF_MOVE didn't succeed. 
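            // As an illustration (assuming host_numa_node = 3): nodemask is a
            // single u64 with bit 3 set (0b1000), and maxnode is passed as
            // 3 + 1 + 1 = 5 because of the kernel quirk described above.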
1464 // MPOL_BIND is the selected mode as it specifies a strict policy 1465 // that restricts memory allocation to the nodes specified in the 1466 // nodemask. 1467 Self::mbind(addr, len, mode, nodemask, maxnode, flags) 1468 .map_err(Error::ApplyNumaPolicy)?; 1469 } 1470 1471 // Prefault the region if needed, in parallel. 1472 if prefault { 1473 let page_size = 1474 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize; 1475 1476 if !is_aligned(size, page_size) { 1477 warn!( 1478 "Prefaulting memory size {} misaligned with page size {}", 1479 size, page_size 1480 ); 1481 } 1482 1483 let num_pages = size / page_size; 1484 1485 let num_threads = Self::get_prefault_num_threads(page_size, num_pages); 1486 1487 let pages_per_thread = num_pages / num_threads; 1488 let remainder = num_pages % num_threads; 1489 1490 let barrier = Arc::new(Barrier::new(num_threads)); 1491 thread::scope(|s| { 1492 let r = ®ion; 1493 for i in 0..num_threads { 1494 let barrier = Arc::clone(&barrier); 1495 s.spawn(move || { 1496 // Wait until all threads have been spawned to avoid contention 1497 // over mmap_sem between thread stack allocation and page faulting. 1498 barrier.wait(); 1499 let pages = pages_per_thread + if i < remainder { 1 } else { 0 }; 1500 let offset = 1501 page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder)); 1502 // SAFETY: FFI call with correct arguments 1503 let ret = unsafe { 1504 let addr = r.as_ptr().add(offset); 1505 libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE) 1506 }; 1507 if ret != 0 { 1508 let e = io::Error::last_os_error(); 1509 warn!("Failed to prefault pages: {}", e); 1510 } 1511 }); 1512 } 1513 }); 1514 } 1515 1516 if region.file_offset().is_none() && thp { 1517 info!( 1518 "Anonymous mapping at 0x{:x} (size = 0x{:x})", 1519 region.as_ptr() as u64, 1520 size 1521 ); 1522 // SAFETY: FFI call with correct arguments 1523 let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) }; 1524 if ret != 0 { 1525 let e = io::Error::last_os_error(); 1526 warn!("Failed to mark pages as THP eligible: {}", e); 1527 } 1528 } 1529 1530 Ok(Arc::new(region)) 1531 } 1532 1533 // Duplicate of `memory_zone_get_align_size` that does not require a `zone` 1534 fn get_prefault_align_size( 1535 backing_file: &Option<PathBuf>, 1536 hugepages: bool, 1537 hugepage_size: Option<u64>, 1538 ) -> Result<u64, Error> { 1539 // SAFETY: FFI call. Trivially safe. 1540 let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 }; 1541 match (hugepages, hugepage_size, backing_file) { 1542 (false, _, _) => Ok(page_size), 1543 (true, Some(hugepage_size), _) => Ok(hugepage_size), 1544 (true, None, _) => { 1545 // There are two scenarios here: 1546 // - `hugepages` is enabled but `hugepage_size` is not specified: 1547 // Call `statfs` for `/dev/hugepages` for getting the default size of hugepage 1548 // - The backing file is specified: 1549 // Call `statfs` for the file and get its `f_bsize`. If the value is larger than the page 1550 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the 1551 // value is less than or equal to the page size, just use the page size. 
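                // Illustrative example (assumed values): on a hugetlbfs mount
                // with 2 MiB pages, statfs() reports f_bsize = 0x20_0000, so
                // with a 4 KiB base page the alignment below becomes
                // max(0x1000, 0x20_0000) = 2 MiB.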
1552 let path = backing_file 1553 .as_ref() 1554 .map_or(Ok("/dev/hugepages"), |pathbuf| { 1555 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters) 1556 })?; 1557 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?); 1558 Ok(align_size) 1559 } 1560 } 1561 } 1562 1563 fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize { 1564 let mut n: usize = 1; 1565 1566 // Do not create more threads than processors available. 1567 // SAFETY: FFI call. Trivially safe. 1568 let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) }; 1569 if procs > 0 { 1570 n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT); 1571 } 1572 1573 // Do not create more threads than pages being allocated. 1574 n = std::cmp::min(n, num_pages); 1575 1576 // Do not create threads to allocate less than 64 MiB of memory. 1577 n = std::cmp::min( 1578 n, 1579 std::cmp::max(1, page_size * num_pages / (64 * (1 << 26))), 1580 ); 1581 1582 n 1583 } 1584 1585 // Update the GuestMemoryMmap with the new range 1586 fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> { 1587 let guest_memory = self 1588 .guest_memory 1589 .memory() 1590 .insert_region(region) 1591 .map_err(Error::GuestMemory)?; 1592 self.guest_memory.lock().unwrap().replace(guest_memory); 1593 1594 Ok(()) 1595 } 1596 1597 // 1598 // Calculate the start address of an area next to RAM. 1599 // 1600 // If memory hotplug is allowed, the start address needs to be aligned 1601 // (rounded-up) to 128MiB boundary. 1602 // If memory hotplug is not allowed, there is no alignment required. 1603 // And it must also start at the 64bit start. 1604 fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> { 1605 let mut start_addr = if allow_mem_hotplug { 1606 GuestAddress(mem_end.0 | ((128 << 20) - 1)) 1607 } else { 1608 mem_end 1609 }; 1610 1611 start_addr = start_addr 1612 .checked_add(1) 1613 .ok_or(Error::GuestAddressOverFlow)?; 1614 1615 if mem_end < arch::layout::MEM_32BIT_RESERVED_START { 1616 return Ok(arch::layout::RAM_64BIT_START); 1617 } 1618 1619 Ok(start_addr) 1620 } 1621 1622 pub fn add_ram_region( 1623 &mut self, 1624 start_addr: GuestAddress, 1625 size: usize, 1626 ) -> Result<Arc<GuestRegionMmap>, Error> { 1627 // Allocate memory for the region 1628 let region = MemoryManager::create_ram_region( 1629 &None, 1630 0, 1631 start_addr, 1632 size, 1633 self.prefault, 1634 self.shared, 1635 self.hugepages, 1636 self.hugepage_size, 1637 None, 1638 None, 1639 self.thp, 1640 )?; 1641 1642 // Map it into the guest 1643 let slot = self.create_userspace_mapping( 1644 region.start_addr().0, 1645 region.len(), 1646 region.as_ptr() as u64, 1647 self.mergeable, 1648 false, 1649 self.log_dirty, 1650 )?; 1651 self.guest_ram_mappings.push(GuestRamMapping { 1652 gpa: region.start_addr().raw_value(), 1653 size: region.len(), 1654 slot, 1655 zone_id: DEFAULT_MEMORY_ZONE.to_string(), 1656 virtio_mem: false, 1657 file_offset: 0, 1658 }); 1659 1660 self.add_region(Arc::clone(®ion))?; 1661 1662 Ok(region) 1663 } 1664 1665 fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> { 1666 info!("Hotplugging new RAM: {}", size); 1667 1668 // Check that there is a free slot 1669 if self.next_hotplug_slot >= HOTPLUG_COUNT { 1670 return Err(Error::NoSlotAvailable); 1671 } 1672 1673 // "Inserted" DIMM must have a size that is a multiple of 128MiB 1674 if size % (128 << 20) != 0 { 1675 return Err(Error::InvalidSize); 1676 } 1677 1678 let start_addr = 
MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?; 1679 1680 if start_addr 1681 .checked_add((size - 1).try_into().unwrap()) 1682 .unwrap() 1683 > self.end_of_ram_area 1684 { 1685 return Err(Error::InsufficientHotplugRam); 1686 } 1687 1688 let region = self.add_ram_region(start_addr, size)?; 1689 1690 // Add region to the list of regions associated with the default 1691 // memory zone. 1692 if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) { 1693 memory_zone.regions.push(Arc::clone(®ion)); 1694 } 1695 1696 // Tell the allocator 1697 self.ram_allocator 1698 .allocate(Some(start_addr), size as GuestUsize, None) 1699 .ok_or(Error::MemoryRangeAllocation)?; 1700 1701 // Update the slot so that it can be queried via the I/O port 1702 let slot = &mut self.hotplug_slots[self.next_hotplug_slot]; 1703 slot.active = true; 1704 slot.inserting = true; 1705 slot.base = region.start_addr().0; 1706 slot.length = region.len(); 1707 1708 self.next_hotplug_slot += 1; 1709 1710 Ok(region) 1711 } 1712 1713 pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 1714 self.guest_memory.clone() 1715 } 1716 1717 pub fn boot_guest_memory(&self) -> GuestMemoryMmap { 1718 self.boot_guest_memory.clone() 1719 } 1720 1721 pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> { 1722 self.allocator.clone() 1723 } 1724 1725 pub fn start_of_device_area(&self) -> GuestAddress { 1726 self.start_of_device_area 1727 } 1728 1729 pub fn end_of_device_area(&self) -> GuestAddress { 1730 self.end_of_device_area 1731 } 1732 1733 pub fn allocate_memory_slot(&mut self) -> u32 { 1734 let slot_id = self.next_memory_slot; 1735 self.next_memory_slot += 1; 1736 slot_id 1737 } 1738 1739 pub fn create_userspace_mapping( 1740 &mut self, 1741 guest_phys_addr: u64, 1742 memory_size: u64, 1743 userspace_addr: u64, 1744 mergeable: bool, 1745 readonly: bool, 1746 log_dirty: bool, 1747 ) -> Result<u32, Error> { 1748 let slot = self.allocate_memory_slot(); 1749 let mem_region = self.vm.make_user_memory_region( 1750 slot, 1751 guest_phys_addr, 1752 memory_size, 1753 userspace_addr, 1754 readonly, 1755 log_dirty, 1756 ); 1757 1758 info!( 1759 "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}", 1760 guest_phys_addr, userspace_addr, memory_size, slot 1761 ); 1762 1763 self.vm 1764 .create_user_memory_region(mem_region) 1765 .map_err(Error::CreateUserMemoryRegion)?; 1766 1767 // SAFETY: the address and size are valid since the 1768 // mmap succeeded. 1769 let ret = unsafe { 1770 libc::madvise( 1771 userspace_addr as *mut libc::c_void, 1772 memory_size as libc::size_t, 1773 libc::MADV_DONTDUMP, 1774 ) 1775 }; 1776 if ret != 0 { 1777 let e = io::Error::last_os_error(); 1778 warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e); 1779 } 1780 1781 // Mark the pages as mergeable if explicitly asked for. 1782 if mergeable { 1783 // SAFETY: the address and size are valid since the 1784 // mmap succeeded. 1785 let ret = unsafe { 1786 libc::madvise( 1787 userspace_addr as *mut libc::c_void, 1788 memory_size as libc::size_t, 1789 libc::MADV_MERGEABLE, 1790 ) 1791 }; 1792 if ret != 0 { 1793 let err = io::Error::last_os_error(); 1794 // Safe to unwrap because the error is constructed with 1795 // last_os_error(), which ensures the output will be Some(). 
1796 let errno = err.raw_os_error().unwrap(); 1797 if errno == libc::EINVAL { 1798 warn!("kernel not configured with CONFIG_KSM"); 1799 } else { 1800 warn!("madvise error: {}", err); 1801 } 1802 warn!("failed to mark pages as mergeable"); 1803 } 1804 } 1805 1806 info!( 1807 "Created userspace mapping: {:x} -> {:x} {:x}", 1808 guest_phys_addr, userspace_addr, memory_size 1809 ); 1810 1811 Ok(slot) 1812 } 1813 1814 pub fn remove_userspace_mapping( 1815 &mut self, 1816 guest_phys_addr: u64, 1817 memory_size: u64, 1818 userspace_addr: u64, 1819 mergeable: bool, 1820 slot: u32, 1821 ) -> Result<(), Error> { 1822 let mem_region = self.vm.make_user_memory_region( 1823 slot, 1824 guest_phys_addr, 1825 memory_size, 1826 userspace_addr, 1827 false, /* readonly -- don't care */ 1828 false, /* log dirty */ 1829 ); 1830 1831 self.vm 1832 .remove_user_memory_region(mem_region) 1833 .map_err(Error::RemoveUserMemoryRegion)?; 1834 1835 // Mark the pages as unmergeable if there were previously marked as 1836 // mergeable. 1837 if mergeable { 1838 // SAFETY: the address and size are valid as the region was 1839 // previously advised. 1840 let ret = unsafe { 1841 libc::madvise( 1842 userspace_addr as *mut libc::c_void, 1843 memory_size as libc::size_t, 1844 libc::MADV_UNMERGEABLE, 1845 ) 1846 }; 1847 if ret != 0 { 1848 let err = io::Error::last_os_error(); 1849 // Safe to unwrap because the error is constructed with 1850 // last_os_error(), which ensures the output will be Some(). 1851 let errno = err.raw_os_error().unwrap(); 1852 if errno == libc::EINVAL { 1853 warn!("kernel not configured with CONFIG_KSM"); 1854 } else { 1855 warn!("madvise error: {}", err); 1856 } 1857 warn!("failed to mark pages as unmergeable"); 1858 } 1859 } 1860 1861 info!( 1862 "Removed userspace mapping: {:x} -> {:x} {:x}", 1863 guest_phys_addr, userspace_addr, memory_size 1864 ); 1865 1866 Ok(()) 1867 } 1868 1869 pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> { 1870 if let Some(memory_zone) = self.memory_zones.get_mut(id) { 1871 if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone { 1872 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() { 1873 virtio_mem_device 1874 .lock() 1875 .unwrap() 1876 .resize(size) 1877 .map_err(Error::VirtioMemResizeFail)?; 1878 } 1879 1880 // Keep the hotplugged_size up to date. 1881 virtio_mem_zone.hotplugged_size = size; 1882 } else { 1883 error!("Failed resizing virtio-mem region: No virtio-mem handler"); 1884 return Err(Error::MissingVirtioMemHandler); 1885 } 1886 1887 return Ok(()); 1888 } 1889 1890 error!("Failed resizing virtio-mem region: Unknown memory zone"); 1891 Err(Error::UnknownMemoryZone) 1892 } 1893 1894 /// In case this function resulted in adding a new memory region to the 1895 /// guest memory, the new region is returned to the caller. The virtio-mem 1896 /// use case never adds a new region as the whole hotpluggable memory has 1897 /// already been allocated at boot time. 1898 pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> { 1899 if self.user_provided_zones { 1900 error!( 1901 "Not allowed to resize guest memory when backed with user \ 1902 defined memory zones." 
            );
            return Err(Error::InvalidResizeWithMemoryZones);
        }

        let mut region: Option<Arc<GuestRegionMmap>> = None;
        match self.hotplug_method {
            HotplugMethod::VirtioMem => {
                if desired_ram >= self.boot_ram {
                    if !self.dynamic {
                        return Ok(region);
                    }

                    self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
                    self.current_ram = desired_ram;
                }
            }
            HotplugMethod::Acpi => {
                if desired_ram > self.current_ram {
                    if !self.dynamic {
                        return Ok(region);
                    }

                    region =
                        Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
                    self.current_ram = desired_ram;
                }
            }
        }
        Ok(region)
    }

    pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
        if !self.user_provided_zones {
            error!(
                "Not allowed to resize guest memory zone when no zone is \
                defined."
            );
            return Err(Error::ResizeZone);
        }

        self.virtio_mem_resize(id, virtio_mem_size)
    }

    #[cfg(target_arch = "x86_64")]
    pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
        let file = OpenOptions::new()
            .read(true)
            .open("/dev/sgx_provision")
            .map_err(Error::SgxProvisionOpen)?;
        self.vm
            .enable_sgx_attribute(file)
            .map_err(Error::SgxEnableProvisioning)?;

        // Go over each EPC section and verify its size is a 4k multiple. At
        // the same time, calculate the total size needed for the contiguous
        // EPC region.
        let mut epc_region_size = 0;
        for epc_section in sgx_epc_config.iter() {
            if epc_section.size == 0 {
                return Err(Error::EpcSectionSizeInvalid);
            }
            if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
                return Err(Error::EpcSectionSizeInvalid);
            }

            epc_region_size += epc_section.size;
        }

        // Place the SGX EPC region on a 4k boundary between the RAM and the device area
        let epc_region_start = GuestAddress(
            ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
        );

        self.start_of_device_area = epc_region_start
            .checked_add(epc_region_size)
            .ok_or(Error::GuestAddressOverFlow)?;

        let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
        info!(
            "SGX EPC region: 0x{:x} (0x{:x})",
            epc_region_start.0, epc_region_size
        );

        // Each section can be memory mapped into the allocated region.
        let mut epc_section_start = epc_region_start.raw_value();
        for epc_section in sgx_epc_config.iter() {
            let file = OpenOptions::new()
                .read(true)
                .write(true)
                .open("/dev/sgx_vepc")
                .map_err(Error::SgxVirtEpcOpen)?;

            let prot = PROT_READ | PROT_WRITE;
            let mut flags = MAP_NORESERVE | MAP_SHARED;
            if epc_section.prefault {
                flags |= MAP_POPULATE;
            }

            // We can't use the vm-memory crate to perform the memory mapping
            // here as it would try to ensure the size of the backing file
            // matches the size of the expected mapping. The /dev/sgx_vepc
            // device does not work that way: it provides a file descriptor
            // whose size does not match the mapping size, as it's just a way
            // to let KVM know that an EPC section is being created for the
            // guest.
            // SAFETY: FFI call with correct arguments
            let host_addr = unsafe {
                libc::mmap(
                    std::ptr::null_mut(),
                    epc_section.size as usize,
                    prot,
                    flags,
                    file.as_raw_fd(),
                    0,
                )
            } as u64;

            info!(
                "Adding SGX EPC section: 0x{:x} (0x{:x})",
                epc_section_start, epc_section.size
            );

            let _mem_slot = self.create_userspace_mapping(
                epc_section_start,
                epc_section.size,
                host_addr,
                false,
                false,
                false,
            )?;

            sgx_epc_region.insert(
                epc_section.id.clone(),
                SgxEpcSection::new(
                    GuestAddress(epc_section_start),
                    epc_section.size as GuestUsize,
                ),
            );

            epc_section_start += epc_section.size;
        }

        self.sgx_epc_region = Some(sgx_epc_region);

        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
        &self.sgx_epc_region
    }

    pub fn is_hardlink(f: &File) -> bool {
        let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
        // SAFETY: FFI call with correct arguments
        let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
        if ret != 0 {
            error!("Couldn't fstat the backing file");
            return false;
        }

        // SAFETY: stat is valid
        unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
    }

    pub fn memory_zones(&self) -> &MemoryZones {
        &self.memory_zones
    }

    pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
        &mut self.memory_zones
    }

    pub fn memory_range_table(
        &self,
        snapshot: bool,
    ) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();

        for memory_zone in self.memory_zones.values() {
            if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
                table.extend(virtio_mem_zone.plugged_ranges());
            }

            for region in memory_zone.regions() {
                if snapshot {
                    if let Some(file_offset) = region.file_offset() {
                        if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
                            && Self::is_hardlink(file_offset.file())
                        {
                            // In this very specific case, we know the memory
                            // region is backed by a file on the host filesystem
                            // that can be accessed by the user, and additionally
                            // the mapping is shared, which means that modifications
                            // to the content are written to the actual file.
                            // When meeting these conditions, we can skip the
                            // copy of the memory content for this specific region,
                            // as we can assume the user will have it saved through
                            // the backing file already.
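                            //
                            // For example (illustrative values, not taken from
                            // a specific configuration): a zone whose
                            // MemoryZoneConfig sets `shared: true` and
                            // `file: Some("/var/lib/ch/zone0.img")` is mapped
                            // MAP_SHARED over that file, so guest writes
                            // already land in zone0.img and the snapshot only
                            // needs to record the range, not copy its content.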
                            continue;
                        }
                    }
                }

                table.push(MemoryRange {
                    gpa: region.start_addr().raw_value(),
                    length: region.len(),
                });
            }
        }

        Ok(table)
    }

    pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
        MemoryManagerSnapshotData {
            memory_ranges: self.snapshot_memory_ranges.clone(),
            guest_ram_mappings: self.guest_ram_mappings.clone(),
            start_of_device_area: self.start_of_device_area.0,
            boot_ram: self.boot_ram,
            current_ram: self.current_ram,
            arch_mem_regions: self.arch_mem_regions.clone(),
            hotplug_slots: self.hotplug_slots.clone(),
            next_memory_slot: self.next_memory_slot,
            selected_slot: self.selected_slot,
            next_hotplug_slot: self.next_hotplug_slot,
        }
    }

    pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
        let mut memory_slot_fds = HashMap::new();
        for guest_ram_mapping in &self.guest_ram_mappings {
            let slot = guest_ram_mapping.slot;
            let guest_memory = self.guest_memory.memory();
            let file = guest_memory
                .find_region(GuestAddress(guest_ram_mapping.gpa))
                .unwrap()
                .file_offset()
                .unwrap()
                .file();
            memory_slot_fds.insert(slot, file.as_raw_fd());
        }
        memory_slot_fds
    }

    pub fn acpi_address(&self) -> Option<GuestAddress> {
        self.acpi_address
    }

    pub fn num_guest_ram_mappings(&self) -> u32 {
        self.guest_ram_mappings.len() as u32
    }

    #[cfg(target_arch = "aarch64")]
    pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
        self.uefi_flash.as_ref().unwrap().clone()
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
        let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
        mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);

        let mut mem_offset_in_elf = mem_offset;
        let mut ram_maps = BTreeMap::new();
        for mapping in mapping_sorted_by_gpa.iter() {
            ram_maps.insert(
                mapping.gpa,
                CoredumpMemoryRegion {
                    mem_offset_in_elf,
                    mem_size: mapping.size,
                },
            );
            mem_offset_in_elf += mapping.size;
        }

        CoredumpMemoryRegions { ram_maps }
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    pub fn coredump_iterate_save_mem(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let snapshot_memory_ranges = self
            .memory_range_table(false)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        if snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let coredump_file = dump_state.file.as_ref().unwrap();

        let guest_memory = self.guest_memory.memory();
        let mut total_bytes: u64 = 0;

        for range in snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            loop {
                let bytes_written = guest_memory
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        &mut coredump_file.as_fd(),
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
                offset += bytes_written as u64;
                total_bytes += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        debug!("coredump total bytes {}", total_bytes);
        Ok(())
    }

    pub fn receive_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: ReadVolatile,
    {
        let guest_memory = self.guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't read
            // the whole region at once, because we can't use the implementation
            // of read_exact_from() from vm-memory::GuestMemory as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = mem
                    .read_volatile_from(
                        GuestAddress(range.gpa + offset),
                        fd,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| {
                        MigratableError::MigrateReceive(anyhow!(
                            "Error receiving memory from socket: {}",
                            e
                        ))
                    })?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }
}

struct MemoryNotify {
    slot_id: usize,
}

impl Aml for MemoryNotify {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        let object = aml::Path::new(&format!("M{:03}", self.slot_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.slot_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes(sink)
    }
}

struct MemorySlot {
    slot_id: usize,
}

impl Aml for MemorySlot {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        aml::Device::new(
            format!("M{:03}", self.slot_id).as_str().into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
                &aml::Name::new("_UID".into(), &self.slot_id),
                /*
                _STA return value:
                Bit [0] – Set if the device is present.
                Bit [1] – Set if the device is enabled and decoding its resources.
                Bit [2] – Set if the device should be shown in the UI.
                Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                Bit [4] – Set if the battery is present.
                Bits [31:5] – Reserved (must be cleared).
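
                For example, a hotplug slot that is present, enabled, visible
                in the UI and functioning reports 0xF; the MSTA helper invoked
                below returns exactly 0xF for an enabled slot and 0x0 otherwise.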
                */
                &aml::Method::new(
                    "_STA".into(),
                    0,
                    false,
                    // Call into MSTA method which will interrogate device
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MSTA".into(),
                        vec![&self.slot_id],
                    ))],
                ),
                // Get details of memory
                &aml::Method::new(
                    "_CRS".into(),
                    0,
                    false,
                    // Call into MCRS which provides actual memory details
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MCRS".into(),
                        vec![&self.slot_id],
                    ))],
                ),
            ],
        )
        .to_aml_bytes(sink)
    }
}

struct MemorySlots {
    slots: usize,
}

impl Aml for MemorySlots {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        for slot_id in 0..self.slots {
            MemorySlot { slot_id }.to_aml_bytes(sink);
        }
    }
}

struct MemoryMethods {
    slots: usize,
}

impl Aml for MemoryMethods {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        // Add "MTFY" notification method
        let mut memory_notifies = Vec::new();
        for slot_id in 0..self.slots {
            memory_notifies.push(MemoryNotify { slot_id });
        }

        let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
        for memory_notifier in memory_notifies.iter() {
            memory_notifies_refs.push(memory_notifier);
        }

        aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);

        // MSCN method
        aml::Method::new(
            "MSCN".into(),
            0,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                &aml::While::new(
                    &aml::LessThan::new(&aml::Local(0), &self.slots),
                    vec![
                        // Write the current slot number (loop counter) to the selector field
                        &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
                        // Check if MINS bit is set (inserting)
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            // Notify device if it is
                            vec![
                                &aml::MethodCall::new(
                                    "MTFY".into(),
                                    vec![&aml::Local(0), &aml::ONE],
                                ),
                                // Reset MINS bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            ],
                        ),
                        // Check if MRMV bit is set
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            // Notify device if it is (with the eject constant 0x3)
                            vec![
                                &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
                                // Reset MRMV bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            ],
                        ),
                        &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                    ],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
            ],
        )
        .to_aml_bytes(sink);

        // Memory status method
        aml::Method::new(
            "MSTA".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                // Check if MEN_ bit is set; if so, make the local variable 0xf (see _STA for details of meaning)
                &aml::If::new(
                    &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
                    vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
                // Return 0 or 0xf
                &aml::Return::new(&aml::Local(0)),
            ],
        )
        .to_aml_bytes(sink);

        // Memory range method
        aml::Method::new(
            "MCRS".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Name::new(
                    "MR64".into(),
                    &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                        aml::AddressSpaceCacheable::Cacheable,
                        true,
                        0x0000_0000_0000_0000u64,
                        0xFFFF_FFFF_FFFF_FFFEu64,
                        None,
                    )]),
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("MINL"),
                    &aml::Path::new("MR64"),
                    &14usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("MINH"),
                    &aml::Path::new("MR64"),
                    &18usize,
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MR64"),
                    &22usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MR64"),
                    &26usize,
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("LENL"),
                    &aml::Path::new("MR64"),
                    &38usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("LENH"),
                    &aml::Path::new("MR64"),
                    &42usize,
                ),
                &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
                &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
                &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
                &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
                &aml::Add::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MINL"),
                    &aml::Path::new("LENL"),
                ),
                &aml::Add::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MINH"),
                    &aml::Path::new("LENH"),
                ),
                &aml::If::new(
                    &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
                    vec![&aml::Add::new(
                        &aml::Path::new("MAXH"),
                        &aml::ONE,
                        &aml::Path::new("MAXH"),
                    )],
                ),
                &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
                // Release lock
                &aml::Release::new("MLCK".into()),
                &aml::Return::new(&aml::Path::new("MR64")),
            ],
        )
        .to_aml_bytes(sink)
    }
}

impl Aml for MemoryManager {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        if let Some(acpi_address) = self.acpi_address {
            // Memory Hotplug Controller
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose slot and then read back status
                    &aml::Mutex::new("MLCK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCacheable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
                            None,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "MHPR".into(),
                        aml::OpRegionSpace::SystemMemory,
                        &(acpi_address.0 as usize),
                        &MEMORY_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
                            aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Reserved(128),
                            aml::FieldEntry::Named(*b"MHPX", 32), // PXM
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(160),
                            aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
                            aml::FieldEntry::Named(*b"MINS", 1), // Inserting
                            aml::FieldEntry::Named(*b"MRMV", 1), // Removing
                            aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MSEL", 32), // Selector
                            aml::FieldEntry::Named(*b"MOEV", 32), // Event
                            aml::FieldEntry::Named(*b"MOSC", 32), // OSC
                        ],
                    ),
                    &MemoryMethods {
                        slots: self.hotplug_slots.len(),
                    },
                    &MemorySlots {
                        slots: self.hotplug_slots.len(),
                    },
                ],
            )
            .to_aml_bytes(sink);
        } else {
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Empty MSCN for GED
                    &aml::Method::new("MSCN".into(), 0, true, vec![]),
                ],
            )
            .to_aml_bytes(sink);
        }

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_region) = &self.sgx_epc_region {
                let min = sgx_epc_region.start().raw_value();
                let max = min + sgx_epc_region.size() - 1;
                // SGX EPC region
                aml::Device::new(
                    "_SB_.EPC_".into(),
                    vec![
                        &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
                        // QWORD describing the EPC region start and size
                        &aml::Name::new(
                            "_CRS".into(),
                            &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                                aml::AddressSpaceCacheable::NotCacheable,
                                true,
                                min,
                                max,
                                None,
                            )]),
                        ),
                        &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
                    ],
                )
                .to_aml_bytes(sink);
            }
        }
    }
}

impl Pausable for MemoryManager {}

#[derive(Clone, Serialize, Deserialize)]
pub struct MemoryManagerSnapshotData {
    memory_ranges: MemoryRangeTable,
    guest_ram_mappings: Vec<GuestRamMapping>,
    start_of_device_area: u64,
    boot_ram: u64,
    current_ram: u64,
    arch_mem_regions: Vec<ArchMemRegion>,
    hotplug_slots: Vec<HotPlugState>,
    next_memory_slot: u32,
    selected_slot: usize,
    next_hotplug_slot: usize,
}

impl Snapshottable for MemoryManager {
    fn id(&self) -> String {
        MEMORY_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
        let memory_ranges = self.memory_range_table(true)?;

        // Store locally this list of ranges as it will be used through the
        // Transportable::send() implementation.
        // The point is to avoid duplicating the work of building the list of
        // memory regions: the 'snapshot' step creates the list, including the
        // information about whether a given region needs its content copied.
        // This saves the 'send' step from having to go through the same
        // process, so it can directly proceed with storing the memory range
        // content for the ranges requiring it.
        self.snapshot_memory_ranges = memory_ranges;

        Ok(Snapshot::from_data(SnapshotData::new_from_state(
            &self.snapshot_data(),
        )?))
    }
}

impl Transportable for MemoryManager {
    fn send(
        &self,
        _snapshot: &Snapshot,
        destination_url: &str,
    ) -> result::Result<(), MigratableError> {
        if self.snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let mut memory_file_path = url_to_path(destination_url)?;
        memory_file_path.push(String::from(SNAPSHOT_FILENAME));

        // Create the snapshot file for the entire memory
        let mut memory_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(memory_file_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let guest_memory = self.guest_memory.memory();

        for range in self.snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't write
            // the whole region at once, because we can't use the implementation
            // of write_all_to() from vm-memory::GuestMemory as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = guest_memory
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }
        Ok(())
    }
}

impl Migratable for MemoryManager {
    // Start the dirty log in the hypervisor (kvm/mshv).
    // Also, reset the dirty bitmap logged by the vmm.
    // Just before we do a bulk copy we want to start/clear the dirty log so that
    // pages touched during our bulk copy are tracked.
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.start_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
        })?;

        for r in self.guest_memory.memory().iter() {
            r.bitmap().reset();
        }

        Ok(())
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.stop_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
        })?;

        Ok(())
    }

    // Generate a table for the pages that are dirty. The dirty pages are collapsed
    // together in the table if they are contiguous.
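    //
    // Illustrative example (assuming MemoryRangeTable::from_bitmap treats bit
    // N of each 64-bit word as page N): with 4096-byte pages, a combined
    // bitmap word of 0b1111 for a mapping starting at GPA 0x1_0000_0000
    // collapses into a single entry { gpa: 0x1_0000_0000, length: 16384 },
    // while 0b1011 yields two entries: an 8 KiB range at the start and a
    // 4 KiB range one page beyond the end of the first.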
    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();
        for r in &self.guest_ram_mappings {
            let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
                MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
            })?;
            let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
            {
                Some(region) => {
                    assert!(region.start_addr().raw_value() == r.gpa);
                    assert!(region.len() == r.size);
                    region.bitmap().get_and_reset()
                }
                None => {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Error finding 'guest memory region' with address {:x}",
                        r.gpa
                    )))
                }
            };

            let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
                .iter()
                .zip(vmm_dirty_bitmap.iter())
                .map(|(x, y)| x | y)
                .collect();

            let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);

            if sub_table.regions().is_empty() {
                info!("Dirty Memory Range Table is empty");
            } else {
                info!("Dirty Memory Range Table:");
                for range in sub_table.regions() {
                    info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
                }
            }

            table.extend(sub_table);
        }
        Ok(table)
    }
}
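
// Rough, hedged sketch of how the Migratable methods above are meant to be
// sequenced by the migration code (simplified; `send_ranges` and
// `another_pass_is_worthwhile` are hypothetical helpers, not part of this
// crate, and `mm` stands for an already-locked MemoryManager):
//
//     // mm.start_dirty_log()?;                         // begin tracking guest writes
//     // send_ranges(&mm.memory_range_table(false)?)?;  // bulk copy of guest RAM
//     // while another_pass_is_worthwhile() {
//     //     send_ranges(&mm.dirty_log()?)?;            // resend pages dirtied meanwhile
//     // }
//     // mm.stop_dirty_log()?;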