1 // Copyright © 2019 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 // 5 #[cfg(target_arch = "x86_64")] 6 use crate::config::SgxEpcConfig; 7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig}; 8 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 9 use crate::coredump::{ 10 CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError, 11 }; 12 use crate::migration::url_to_path; 13 use crate::MEMORY_MANAGER_SNAPSHOT_ID; 14 use crate::{GuestMemoryMmap, GuestRegionMmap}; 15 use acpi_tables::{aml, Aml}; 16 use anyhow::anyhow; 17 #[cfg(target_arch = "x86_64")] 18 use arch::x86_64::{SgxEpcRegion, SgxEpcSection}; 19 use arch::RegionType; 20 #[cfg(target_arch = "x86_64")] 21 use devices::ioapic; 22 #[cfg(target_arch = "aarch64")] 23 use hypervisor::HypervisorVmError; 24 use libc::_SC_NPROCESSORS_ONLN; 25 #[cfg(target_arch = "x86_64")] 26 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE}; 27 use serde::{Deserialize, Serialize}; 28 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 29 use std::collections::BTreeMap; 30 use std::collections::HashMap; 31 use std::fs::{File, OpenOptions}; 32 use std::io::{self}; 33 use std::ops::{BitAnd, Deref, Not, Sub}; 34 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 35 use std::os::fd::AsFd; 36 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; 37 use std::path::PathBuf; 38 use std::result; 39 use std::sync::{Arc, Barrier, Mutex}; 40 use std::{ffi, thread}; 41 use tracer::trace_scoped; 42 use virtio_devices::BlocksState; 43 #[cfg(target_arch = "x86_64")] 44 use vm_allocator::GsiApic; 45 use vm_allocator::{AddressAllocator, SystemAllocator}; 46 use vm_device::BusDevice; 47 use vm_memory::bitmap::AtomicBitmap; 48 use vm_memory::guest_memory::FileOffset; 49 use vm_memory::{ 50 mmap::MmapRegionError, Address, Error as MmapError, GuestAddress, GuestAddressSpace, 51 GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, 52 ReadVolatile, 53 }; 54 use vm_migration::{ 55 protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable, 56 Snapshot, SnapshotData, Snapshottable, Transportable, 57 }; 58 59 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18; 60 61 const DEFAULT_MEMORY_ZONE: &str = "mem0"; 62 63 const SNAPSHOT_FILENAME: &str = "memory-ranges"; 64 65 #[cfg(target_arch = "x86_64")] 66 const X86_64_IRQ_BASE: u32 = 5; 67 68 #[cfg(target_arch = "x86_64")] 69 const SGX_PAGE_SIZE: u64 = 1 << 12; 70 71 const HOTPLUG_COUNT: usize = 8; 72 73 // Memory policy constants 74 const MPOL_BIND: u32 = 2; 75 const MPOL_MF_STRICT: u32 = 1; 76 const MPOL_MF_MOVE: u32 = 1 << 1; 77 78 // Reserve 1 MiB for platform MMIO devices (e.g. 
ACPI control devices) 79 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20; 80 81 const MAX_PREFAULT_THREAD_COUNT: usize = 16; 82 83 #[derive(Clone, Default, Serialize, Deserialize)] 84 struct HotPlugState { 85 base: u64, 86 length: u64, 87 active: bool, 88 inserting: bool, 89 removing: bool, 90 } 91 92 pub struct VirtioMemZone { 93 region: Arc<GuestRegionMmap>, 94 virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>, 95 hotplugged_size: u64, 96 hugepages: bool, 97 blocks_state: Arc<Mutex<BlocksState>>, 98 } 99 100 impl VirtioMemZone { 101 pub fn region(&self) -> &Arc<GuestRegionMmap> { 102 &self.region 103 } 104 pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) { 105 self.virtio_device = Some(virtio_device); 106 } 107 pub fn hotplugged_size(&self) -> u64 { 108 self.hotplugged_size 109 } 110 pub fn hugepages(&self) -> bool { 111 self.hugepages 112 } 113 pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> { 114 &self.blocks_state 115 } 116 pub fn plugged_ranges(&self) -> MemoryRangeTable { 117 self.blocks_state 118 .lock() 119 .unwrap() 120 .memory_ranges(self.region.start_addr().raw_value(), true) 121 } 122 } 123 124 #[derive(Default)] 125 pub struct MemoryZone { 126 regions: Vec<Arc<GuestRegionMmap>>, 127 virtio_mem_zone: Option<VirtioMemZone>, 128 } 129 130 impl MemoryZone { 131 pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> { 132 &self.regions 133 } 134 pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> { 135 &self.virtio_mem_zone 136 } 137 pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> { 138 self.virtio_mem_zone.as_mut() 139 } 140 } 141 142 pub type MemoryZones = HashMap<String, MemoryZone>; 143 144 #[derive(Clone, Serialize, Deserialize)] 145 struct GuestRamMapping { 146 slot: u32, 147 gpa: u64, 148 size: u64, 149 zone_id: String, 150 virtio_mem: bool, 151 file_offset: u64, 152 } 153 154 #[derive(Clone, Serialize, Deserialize)] 155 struct ArchMemRegion { 156 base: u64, 157 size: usize, 158 r_type: RegionType, 159 } 160 161 pub struct MemoryManager { 162 boot_guest_memory: GuestMemoryMmap, 163 guest_memory: GuestMemoryAtomic<GuestMemoryMmap>, 164 next_memory_slot: u32, 165 start_of_device_area: GuestAddress, 166 end_of_device_area: GuestAddress, 167 end_of_ram_area: GuestAddress, 168 pub vm: Arc<dyn hypervisor::Vm>, 169 hotplug_slots: Vec<HotPlugState>, 170 selected_slot: usize, 171 mergeable: bool, 172 allocator: Arc<Mutex<SystemAllocator>>, 173 hotplug_method: HotplugMethod, 174 boot_ram: u64, 175 current_ram: u64, 176 next_hotplug_slot: usize, 177 shared: bool, 178 hugepages: bool, 179 hugepage_size: Option<u64>, 180 prefault: bool, 181 thp: bool, 182 #[cfg(target_arch = "x86_64")] 183 sgx_epc_region: Option<SgxEpcRegion>, 184 user_provided_zones: bool, 185 snapshot_memory_ranges: MemoryRangeTable, 186 memory_zones: MemoryZones, 187 log_dirty: bool, // Enable dirty logging for created RAM regions 188 arch_mem_regions: Vec<ArchMemRegion>, 189 ram_allocator: AddressAllocator, 190 dynamic: bool, 191 192 // Keep track of calls to create_userspace_mapping() for guest RAM. 193 // This is useful for getting the dirty pages as we need to know the 194 // slots that the mapping is created in. 195 guest_ram_mappings: Vec<GuestRamMapping>, 196 197 pub acpi_address: Option<GuestAddress>, 198 #[cfg(target_arch = "aarch64")] 199 uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>, 200 } 201 202 #[derive(Debug)] 203 pub enum Error { 204 /// Failed to create shared file. 
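    /// (This covers both creating the anonymous memfd backing guest RAM and
    /// opening a user-provided backing file.)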
    SharedFileCreate(io::Error),

    /// Failed to set shared file length.
    SharedFileSetLen(io::Error),

    /// Mmap backed guest memory error
    GuestMemory(MmapError),

    /// Failed to allocate a memory range.
    MemoryRangeAllocation,

    /// Error from region creation
    GuestMemoryRegion(MmapRegionError),

    /// No ACPI slot available
    NoSlotAvailable,

    /// Not enough space in the hotplug RAM region
    InsufficientHotplugRam,

    /// The requested hotplug memory addition is not a valid size
    InvalidSize,

    /// Failed to create the user memory region.
    CreateUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to remove the user memory region.
    RemoveUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to create EventFd.
    EventFdFail(io::Error),

    /// Eventfd write error
    EventfdError(io::Error),

    /// Failed to resize the virtio-mem region
    VirtioMemResizeFail(virtio_devices::mem::Error),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot restore VM because source URL is missing
    RestoreMissingSourceUrl,

    /// Cannot create the system allocator
    CreateSystemAllocator,

    /// Invalid SGX EPC section size
    #[cfg(target_arch = "x86_64")]
    EpcSectionSizeInvalid,

    /// Failed allocating SGX EPC region
    #[cfg(target_arch = "x86_64")]
    SgxEpcRangeAllocation,

    /// Failed opening SGX virtual EPC device
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcOpen(io::Error),

    /// Failed setting the SGX virtual EPC section size
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcFileSetLen(io::Error),

    /// Failed opening SGX provisioning device
    #[cfg(target_arch = "x86_64")]
    SgxProvisionOpen(io::Error),

    /// Failed enabling SGX provisioning
    #[cfg(target_arch = "x86_64")]
    SgxEnableProvisioning(hypervisor::HypervisorVmError),

    /// Failed creating a new MmapRegion instance.
    #[cfg(target_arch = "x86_64")]
    NewMmapRegion(vm_memory::mmap::MmapRegionError),

    /// No memory zones found.
    MissingMemoryZones,

    /// Memory configuration is not valid.
    InvalidMemoryParameters,

    /// Forbidden operation. Impossible to resize guest memory if it is
    /// backed by user defined memory regions.
    InvalidResizeWithMemoryZones,

    /// It's invalid to try applying a NUMA policy to a memory zone that is
    /// memory mapped with MAP_SHARED.
    InvalidSharedMemoryZoneWithHostNuma,

    /// Failed applying NUMA memory policy.
    ApplyNumaPolicy(io::Error),

    /// Memory zone identifier is not unique.
    DuplicateZoneId,

    /// No virtio-mem resizing handler found.
    MissingVirtioMemHandler,

    /// Unknown memory zone.
    UnknownMemoryZone,

    /// Invalid size for resizing. The size can be anything except 0.
    InvalidHotplugSize,

    /// Invalid hotplug method associated with memory zones resizing capability.
    InvalidHotplugMethodWithMemoryZones,

    /// Could not find the specified memory zone identifier in the hash map.
    MissingZoneIdentifier,

    /// Resizing the memory zone failed.
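    /// (Returned by resize_zone() when resizing is requested without any
    /// user-defined memory zones.)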
    ResizeZone,

    /// Guest address overflow
    GuestAddressOverFlow,

    /// Error opening snapshot file
    SnapshotOpen(io::Error),

    /// Error copying snapshot into region
    SnapshotCopy(GuestMemoryError),

    /// Failed to allocate MMIO address
    AllocateMmioAddress,

    #[cfg(target_arch = "aarch64")]
    /// Failed to create UEFI flash
    CreateUefiFlash(HypervisorVmError),

    /// Using a directory as a backing file for memory is not supported
    DirectoryAsBackingFileForMemory,

    /// Failed to stat filesystem
    GetFileSystemBlockSize(io::Error),

    /// Memory size is misaligned with default page size or its hugepage size
    MisalignedMemorySize,
}

const ENABLE_FLAG: usize = 0;
const INSERTING_FLAG: usize = 1;
const REMOVING_FLAG: usize = 2;
const EJECT_FLAG: usize = 3;

const BASE_OFFSET_LOW: u64 = 0;
const BASE_OFFSET_HIGH: u64 = 0x4;
const LENGTH_OFFSET_LOW: u64 = 0x8;
const LENGTH_OFFSET_HIGH: u64 = 0xC;
const STATUS_OFFSET: u64 = 0x14;
const SELECTION_OFFSET: u64 = 0;

// The MMIO address space size is reduced by 64k. This is done for the
// following reasons:
//  - Reduce the addressable space size by at least 4k to work around a Linux
//    bug when the VMM allocates devices at the end of the addressable space
//  - Windows requires the addressable space size to be 64k aligned
fn mmio_address_space_size(phys_bits: u8) -> u64 {
    (1 << phys_bits) - (1 << 16)
}

// The `statfs` call reports information about a hugetlbfs mount, with the
// hugepage size returned in the `f_bsize` field.
//
// See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
    let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
    let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();

    // SAFETY: FFI call with a valid path and buffer
    let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
    if ret != 0 {
        return Err(Error::GetFileSystemBlockSize(
            std::io::Error::last_os_error(),
        ));
    }

    // SAFETY: `buf` is valid at this point
    // Because this value is always positive, just convert it directly.
    // Note that `f_bsize` is `i64` in glibc and `u64` in musl, so using `as u64` would be
    // flagged by `clippy` on musl targets. To avoid the warning, `as _` is used instead of
    // `as u64`.
    let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
    Ok(bsize)
}

fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
    // SAFETY: FFI call. Trivially safe.
    let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };

    // If there is no backing file and `hugepages` is disabled, just use the system page size.
    if zone.file.is_none() && !zone.hugepages {
        return Ok(page_size);
    }

    // If `hugepages` is enabled and `hugepage_size` is specified, just use it directly.
    if zone.hugepages && zone.hugepage_size.is_some() {
        return Ok(zone.hugepage_size.unwrap());
    }

    // There are two scenarios here:
    //  - `hugepages` is enabled but `hugepage_size` is not specified:
    //    Call `statfs` on `/dev/hugepages` to get the default hugepage size
    //  - The backing file is specified:
    //    Call `statfs` on the file and get its `f_bsize`.
If the value is larger than the page 409 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the 410 // value is less than or equal to the page size, just use the page size. 411 let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| { 412 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters) 413 })?; 414 415 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?); 416 417 Ok(align_size) 418 } 419 420 #[inline] 421 fn align_down<T>(val: T, align: T) -> T 422 where 423 T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>, 424 { 425 val & !(align - 1u8.into()) 426 } 427 428 #[inline] 429 fn is_aligned<T>(val: T, align: T) -> bool 430 where 431 T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq, 432 { 433 (val & (align - 1u8.into())) == 0u8.into() 434 } 435 436 impl BusDevice for MemoryManager { 437 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 438 if self.selected_slot < self.hotplug_slots.len() { 439 let state = &self.hotplug_slots[self.selected_slot]; 440 match offset { 441 BASE_OFFSET_LOW => { 442 data.copy_from_slice(&state.base.to_le_bytes()[..4]); 443 } 444 BASE_OFFSET_HIGH => { 445 data.copy_from_slice(&state.base.to_le_bytes()[4..]); 446 } 447 LENGTH_OFFSET_LOW => { 448 data.copy_from_slice(&state.length.to_le_bytes()[..4]); 449 } 450 LENGTH_OFFSET_HIGH => { 451 data.copy_from_slice(&state.length.to_le_bytes()[4..]); 452 } 453 STATUS_OFFSET => { 454 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 455 data.fill(0); 456 if state.active { 457 data[0] |= 1 << ENABLE_FLAG; 458 } 459 if state.inserting { 460 data[0] |= 1 << INSERTING_FLAG; 461 } 462 if state.removing { 463 data[0] |= 1 << REMOVING_FLAG; 464 } 465 } 466 _ => { 467 warn!( 468 "Unexpected offset for accessing memory manager device: {:#}", 469 offset 470 ); 471 } 472 } 473 } else { 474 warn!("Out of range memory slot: {}", self.selected_slot); 475 } 476 } 477 478 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 479 match offset { 480 SELECTION_OFFSET => { 481 self.selected_slot = usize::from(data[0]); 482 } 483 STATUS_OFFSET => { 484 if self.selected_slot < self.hotplug_slots.len() { 485 let state = &mut self.hotplug_slots[self.selected_slot]; 486 // The ACPI code writes back a 1 to acknowledge the insertion 487 if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting { 488 state.inserting = false; 489 } 490 // Ditto for removal 491 if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing { 492 state.removing = false; 493 } 494 // Trigger removal of "DIMM" 495 if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG { 496 warn!("Ejection of memory not currently supported"); 497 } 498 } else { 499 warn!("Out of range memory slot: {}", self.selected_slot); 500 } 501 } 502 _ => { 503 warn!( 504 "Unexpected offset for accessing memory manager device: {:#}", 505 offset 506 ); 507 } 508 }; 509 None 510 } 511 } 512 513 impl MemoryManager { 514 /// Creates all memory regions based on the available RAM ranges defined 515 /// by `ram_regions`, and based on the description of the memory zones. 516 /// In practice, this function can perform multiple memory mappings of the 517 /// same backing file if there's a hole in the address space between two 518 /// RAM ranges. 519 /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G) 520 /// and zones containing two zones (size 1G and size 4G). 
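    /// (Zones are consumed in their configured order, so the 1G zone is mapped
    /// first and the 4G zone fills the remaining RAM ranges.)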
521 /// This function will create 3 resulting memory regions: 522 /// - First one mapping entirely the first memory zone on 0-1G range 523 /// - Second one mapping partially the second memory zone on 1G-3G range 524 /// - Third one mapping partially the second memory zone on 4G-6G range 525 /// Also, all memory regions are page-size aligned (e.g. their sizes must 526 /// be multiple of page-size), which may leave an additional hole in the 527 /// address space when hugepage is used. 528 fn create_memory_regions_from_zones( 529 ram_regions: &[(GuestAddress, usize)], 530 zones: &[MemoryZoneConfig], 531 prefault: Option<bool>, 532 thp: bool, 533 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 534 let mut zone_iter = zones.iter(); 535 let mut mem_regions = Vec::new(); 536 let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?; 537 let mut zone_align_size = memory_zone_get_align_size(zone)?; 538 let mut zone_offset = 0u64; 539 let mut memory_zones = HashMap::new(); 540 541 if !is_aligned(zone.size, zone_align_size) { 542 return Err(Error::MisalignedMemorySize); 543 } 544 545 // Add zone id to the list of memory zones. 546 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 547 548 for ram_region in ram_regions.iter() { 549 let mut ram_region_offset = 0; 550 let mut exit = false; 551 552 loop { 553 let mut ram_region_consumed = false; 554 let mut pull_next_zone = false; 555 556 let ram_region_available_size = 557 align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size); 558 if ram_region_available_size == 0 { 559 break; 560 } 561 let zone_sub_size = zone.size - zone_offset; 562 563 let file_offset = zone_offset; 564 let region_start = ram_region 565 .0 566 .checked_add(ram_region_offset) 567 .ok_or(Error::GuestAddressOverFlow)?; 568 let region_size = if zone_sub_size <= ram_region_available_size { 569 if zone_sub_size == ram_region_available_size { 570 ram_region_consumed = true; 571 } 572 573 ram_region_offset += zone_sub_size; 574 pull_next_zone = true; 575 576 zone_sub_size 577 } else { 578 zone_offset += ram_region_available_size; 579 ram_region_consumed = true; 580 581 ram_region_available_size 582 }; 583 584 info!( 585 "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}", 586 zone.id, 587 region_start.raw_value(), 588 region_size 589 ); 590 let region = MemoryManager::create_ram_region( 591 &zone.file, 592 file_offset, 593 region_start, 594 region_size as usize, 595 prefault.unwrap_or(zone.prefault), 596 zone.shared, 597 zone.hugepages, 598 zone.hugepage_size, 599 zone.host_numa_node, 600 None, 601 thp, 602 )?; 603 604 // Add region to the list of regions associated with the 605 // current memory zone. 606 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 607 memory_zone.regions.push(region.clone()); 608 } 609 610 mem_regions.push(region); 611 612 if pull_next_zone { 613 // Get the next zone and reset the offset. 614 zone_offset = 0; 615 if let Some(z) = zone_iter.next() { 616 zone = z; 617 } else { 618 exit = true; 619 break; 620 } 621 zone_align_size = memory_zone_get_align_size(zone)?; 622 if !is_aligned(zone.size, zone_align_size) { 623 return Err(Error::MisalignedMemorySize); 624 } 625 626 // Check if zone id already exist. In case it does, throw 627 // an error as we need unique identifiers. Otherwise, add 628 // the new zone id to the list of memory zones. 629 if memory_zones.contains_key(&zone.id) { 630 error!( 631 "Memory zone identifier '{}' found more than once. 
\ 632 It must be unique", 633 zone.id, 634 ); 635 return Err(Error::DuplicateZoneId); 636 } 637 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 638 } 639 640 if ram_region_consumed { 641 break; 642 } 643 } 644 645 if exit { 646 break; 647 } 648 } 649 650 Ok((mem_regions, memory_zones)) 651 } 652 653 // Restore both GuestMemory regions along with MemoryZone zones. 654 fn restore_memory_regions_and_zones( 655 guest_ram_mappings: &[GuestRamMapping], 656 zones_config: &[MemoryZoneConfig], 657 prefault: Option<bool>, 658 mut existing_memory_files: HashMap<u32, File>, 659 thp: bool, 660 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 661 let mut memory_regions = Vec::new(); 662 let mut memory_zones = HashMap::new(); 663 664 for zone_config in zones_config { 665 memory_zones.insert(zone_config.id.clone(), MemoryZone::default()); 666 } 667 668 for guest_ram_mapping in guest_ram_mappings { 669 for zone_config in zones_config { 670 if guest_ram_mapping.zone_id == zone_config.id { 671 let region = MemoryManager::create_ram_region( 672 if guest_ram_mapping.virtio_mem { 673 &None 674 } else { 675 &zone_config.file 676 }, 677 guest_ram_mapping.file_offset, 678 GuestAddress(guest_ram_mapping.gpa), 679 guest_ram_mapping.size as usize, 680 prefault.unwrap_or(zone_config.prefault), 681 zone_config.shared, 682 zone_config.hugepages, 683 zone_config.hugepage_size, 684 zone_config.host_numa_node, 685 existing_memory_files.remove(&guest_ram_mapping.slot), 686 thp, 687 )?; 688 memory_regions.push(Arc::clone(®ion)); 689 if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) { 690 if guest_ram_mapping.virtio_mem { 691 let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0); 692 let region_size = region.len(); 693 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 694 region, 695 virtio_device: None, 696 hotplugged_size, 697 hugepages: zone_config.hugepages, 698 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 699 }); 700 } else { 701 memory_zone.regions.push(region); 702 } 703 } 704 } 705 } 706 } 707 708 memory_regions.sort_by_key(|x| x.start_addr()); 709 710 Ok((memory_regions, memory_zones)) 711 } 712 713 fn fill_saved_regions( 714 &mut self, 715 file_path: PathBuf, 716 saved_regions: MemoryRangeTable, 717 ) -> Result<(), Error> { 718 if saved_regions.is_empty() { 719 return Ok(()); 720 } 721 722 // Open (read only) the snapshot file. 723 let mut memory_file = OpenOptions::new() 724 .read(true) 725 .open(file_path) 726 .map_err(Error::SnapshotOpen)?; 727 728 let guest_memory = self.guest_memory.memory(); 729 for range in saved_regions.regions() { 730 let mut offset: u64 = 0; 731 // Here we are manually handling the retry in case we can't write 732 // the whole region at once because we can't use the implementation 733 // from vm-memory::GuestMemory of read_exact_from() as it is not 734 // following the correct behavior. 
For more info about this issue 735 // see: https://github.com/rust-vmm/vm-memory/issues/174 736 loop { 737 let bytes_read = guest_memory 738 .read_volatile_from( 739 GuestAddress(range.gpa + offset), 740 &mut memory_file, 741 (range.length - offset) as usize, 742 ) 743 .map_err(Error::SnapshotCopy)?; 744 offset += bytes_read as u64; 745 746 if offset == range.length { 747 break; 748 } 749 } 750 } 751 752 Ok(()) 753 } 754 755 fn validate_memory_config( 756 config: &MemoryConfig, 757 user_provided_zones: bool, 758 ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> { 759 let mut allow_mem_hotplug = false; 760 761 if !user_provided_zones { 762 if config.zones.is_some() { 763 error!( 764 "User defined memory regions can't be provided if the \ 765 memory size is not 0" 766 ); 767 return Err(Error::InvalidMemoryParameters); 768 } 769 770 if config.hotplug_size.is_some() { 771 allow_mem_hotplug = true; 772 } 773 774 if let Some(hotplugged_size) = config.hotplugged_size { 775 if let Some(hotplug_size) = config.hotplug_size { 776 if hotplugged_size > hotplug_size { 777 error!( 778 "'hotplugged_size' {} can't be bigger than \ 779 'hotplug_size' {}", 780 hotplugged_size, hotplug_size, 781 ); 782 return Err(Error::InvalidMemoryParameters); 783 } 784 } else { 785 error!( 786 "Invalid to define 'hotplugged_size' when there is\ 787 no 'hotplug_size'" 788 ); 789 return Err(Error::InvalidMemoryParameters); 790 } 791 if config.hotplug_method == HotplugMethod::Acpi { 792 error!( 793 "Invalid to define 'hotplugged_size' with hotplug \ 794 method 'acpi'" 795 ); 796 return Err(Error::InvalidMemoryParameters); 797 } 798 } 799 800 // Create a single zone from the global memory config. This lets 801 // us reuse the codepath for user defined memory zones. 802 let zones = vec![MemoryZoneConfig { 803 id: String::from(DEFAULT_MEMORY_ZONE), 804 size: config.size, 805 file: None, 806 shared: config.shared, 807 hugepages: config.hugepages, 808 hugepage_size: config.hugepage_size, 809 host_numa_node: None, 810 hotplug_size: config.hotplug_size, 811 hotplugged_size: config.hotplugged_size, 812 prefault: config.prefault, 813 }]; 814 815 Ok((config.size, zones, allow_mem_hotplug)) 816 } else { 817 if config.zones.is_none() { 818 error!( 819 "User defined memory regions must be provided if the \ 820 memory size is 0" 821 ); 822 return Err(Error::MissingMemoryZones); 823 } 824 825 // Safe to unwrap as we checked right above there were some 826 // regions. 
827 let zones = config.zones.clone().unwrap(); 828 if zones.is_empty() { 829 return Err(Error::MissingMemoryZones); 830 } 831 832 let mut total_ram_size: u64 = 0; 833 for zone in zones.iter() { 834 total_ram_size += zone.size; 835 836 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() { 837 error!( 838 "Invalid to set host NUMA policy for a memory zone \ 839 backed by a regular file and mapped as 'shared'" 840 ); 841 return Err(Error::InvalidSharedMemoryZoneWithHostNuma); 842 } 843 844 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi { 845 error!("Invalid to set ACPI hotplug method for memory zones"); 846 return Err(Error::InvalidHotplugMethodWithMemoryZones); 847 } 848 849 if let Some(hotplugged_size) = zone.hotplugged_size { 850 if let Some(hotplug_size) = zone.hotplug_size { 851 if hotplugged_size > hotplug_size { 852 error!( 853 "'hotplugged_size' {} can't be bigger than \ 854 'hotplug_size' {}", 855 hotplugged_size, hotplug_size, 856 ); 857 return Err(Error::InvalidMemoryParameters); 858 } 859 } else { 860 error!( 861 "Invalid to define 'hotplugged_size' when there is\ 862 no 'hotplug_size' for a memory zone" 863 ); 864 return Err(Error::InvalidMemoryParameters); 865 } 866 if config.hotplug_method == HotplugMethod::Acpi { 867 error!( 868 "Invalid to define 'hotplugged_size' with hotplug \ 869 method 'acpi'" 870 ); 871 return Err(Error::InvalidMemoryParameters); 872 } 873 } 874 } 875 876 Ok((total_ram_size, zones, allow_mem_hotplug)) 877 } 878 } 879 880 pub fn allocate_address_space(&mut self) -> Result<(), Error> { 881 let mut list = Vec::new(); 882 883 for (zone_id, memory_zone) in self.memory_zones.iter() { 884 let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> = 885 memory_zone 886 .regions() 887 .iter() 888 .map(|r| (r.clone(), false)) 889 .collect(); 890 891 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 892 regions.push((virtio_mem_zone.region().clone(), true)); 893 } 894 895 list.push((zone_id.clone(), regions)); 896 } 897 898 for (zone_id, regions) in list { 899 for (region, virtio_mem) in regions { 900 let slot = self.create_userspace_mapping( 901 region.start_addr().raw_value(), 902 region.len(), 903 region.as_ptr() as u64, 904 self.mergeable, 905 false, 906 self.log_dirty, 907 )?; 908 909 let file_offset = if let Some(file_offset) = region.file_offset() { 910 file_offset.start() 911 } else { 912 0 913 }; 914 915 self.guest_ram_mappings.push(GuestRamMapping { 916 gpa: region.start_addr().raw_value(), 917 size: region.len(), 918 slot, 919 zone_id: zone_id.clone(), 920 virtio_mem, 921 file_offset, 922 }); 923 self.ram_allocator 924 .allocate(Some(region.start_addr()), region.len(), None) 925 .ok_or(Error::MemoryRangeAllocation)?; 926 } 927 } 928 929 // Allocate SubRegion and Reserved address ranges. 930 for region in self.arch_mem_regions.iter() { 931 if region.r_type == RegionType::Ram { 932 // Ignore the RAM type since ranges have already been allocated 933 // based on the GuestMemory regions. 934 continue; 935 } 936 self.ram_allocator 937 .allocate( 938 Some(GuestAddress(region.base)), 939 region.size as GuestUsize, 940 None, 941 ) 942 .ok_or(Error::MemoryRangeAllocation)?; 943 } 944 945 Ok(()) 946 } 947 948 #[cfg(target_arch = "aarch64")] 949 fn add_uefi_flash(&mut self) -> Result<(), Error> { 950 // On AArch64, the UEFI binary requires a flash device at address 0. 951 // 4 MiB memory is mapped to simulate the flash. 
952 let uefi_mem_slot = self.allocate_memory_slot(); 953 let uefi_region = GuestRegionMmap::new( 954 MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(), 955 arch::layout::UEFI_START, 956 ) 957 .unwrap(); 958 let uefi_mem_region = self.vm.make_user_memory_region( 959 uefi_mem_slot, 960 uefi_region.start_addr().raw_value(), 961 uefi_region.len(), 962 uefi_region.as_ptr() as u64, 963 false, 964 false, 965 ); 966 self.vm 967 .create_user_memory_region(uefi_mem_region) 968 .map_err(Error::CreateUefiFlash)?; 969 970 let uefi_flash = 971 GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap()); 972 973 self.uefi_flash = Some(uefi_flash); 974 975 Ok(()) 976 } 977 978 #[allow(clippy::too_many_arguments)] 979 pub fn new( 980 vm: Arc<dyn hypervisor::Vm>, 981 config: &MemoryConfig, 982 prefault: Option<bool>, 983 phys_bits: u8, 984 #[cfg(feature = "tdx")] tdx_enabled: bool, 985 restore_data: Option<&MemoryManagerSnapshotData>, 986 existing_memory_files: Option<HashMap<u32, File>>, 987 #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>, 988 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 989 trace_scoped!("MemoryManager::new"); 990 991 let user_provided_zones = config.size == 0; 992 993 let mmio_address_space_size = mmio_address_space_size(phys_bits); 994 debug_assert_eq!( 995 (((mmio_address_space_size) >> 16) << 16), 996 mmio_address_space_size 997 ); 998 let start_of_platform_device_area = 999 GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE); 1000 let end_of_device_area = start_of_platform_device_area.unchecked_sub(1); 1001 1002 let (ram_size, zones, allow_mem_hotplug) = 1003 Self::validate_memory_config(config, user_provided_zones)?; 1004 1005 let ( 1006 start_of_device_area, 1007 boot_ram, 1008 current_ram, 1009 arch_mem_regions, 1010 memory_zones, 1011 guest_memory, 1012 boot_guest_memory, 1013 hotplug_slots, 1014 next_memory_slot, 1015 selected_slot, 1016 next_hotplug_slot, 1017 ) = if let Some(data) = restore_data { 1018 let (regions, memory_zones) = Self::restore_memory_regions_and_zones( 1019 &data.guest_ram_mappings, 1020 &zones, 1021 prefault, 1022 existing_memory_files.unwrap_or_default(), 1023 config.thp, 1024 )?; 1025 let guest_memory = 1026 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?; 1027 let boot_guest_memory = guest_memory.clone(); 1028 ( 1029 GuestAddress(data.start_of_device_area), 1030 data.boot_ram, 1031 data.current_ram, 1032 data.arch_mem_regions.clone(), 1033 memory_zones, 1034 guest_memory, 1035 boot_guest_memory, 1036 data.hotplug_slots.clone(), 1037 data.next_memory_slot, 1038 data.selected_slot, 1039 data.next_hotplug_slot, 1040 ) 1041 } else { 1042 // Init guest memory 1043 let arch_mem_regions = arch::arch_memory_regions(); 1044 1045 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions 1046 .iter() 1047 .filter(|r| r.2 == RegionType::Ram) 1048 .map(|r| (r.0, r.1)) 1049 .collect(); 1050 1051 let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions 1052 .iter() 1053 .map(|(a, b, c)| ArchMemRegion { 1054 base: a.0, 1055 size: *b, 1056 r_type: *c, 1057 }) 1058 .collect(); 1059 1060 let (mem_regions, mut memory_zones) = 1061 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?; 1062 1063 let mut guest_memory = 1064 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?; 1065 1066 let boot_guest_memory = guest_memory.clone(); 1067 1068 let mut start_of_device_area = 1069 
MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?; 1070 1071 // Update list of memory zones for resize. 1072 for zone in zones.iter() { 1073 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 1074 if let Some(hotplug_size) = zone.hotplug_size { 1075 if hotplug_size == 0 { 1076 error!("'hotplug_size' can't be 0"); 1077 return Err(Error::InvalidHotplugSize); 1078 } 1079 1080 if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi { 1081 start_of_device_area = start_of_device_area 1082 .checked_add(hotplug_size) 1083 .ok_or(Error::GuestAddressOverFlow)?; 1084 } else { 1085 // Alignment must be "natural" i.e. same as size of block 1086 let start_addr = GuestAddress( 1087 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE 1088 - 1) 1089 / virtio_devices::VIRTIO_MEM_ALIGN_SIZE 1090 * virtio_devices::VIRTIO_MEM_ALIGN_SIZE, 1091 ); 1092 1093 // When `prefault` is set by vm_restore, memory manager 1094 // will create ram region with `prefault` option in 1095 // restore config rather than same option in zone 1096 let region = MemoryManager::create_ram_region( 1097 &None, 1098 0, 1099 start_addr, 1100 hotplug_size as usize, 1101 prefault.unwrap_or(zone.prefault), 1102 zone.shared, 1103 zone.hugepages, 1104 zone.hugepage_size, 1105 zone.host_numa_node, 1106 None, 1107 config.thp, 1108 )?; 1109 1110 guest_memory = guest_memory 1111 .insert_region(Arc::clone(®ion)) 1112 .map_err(Error::GuestMemory)?; 1113 1114 let hotplugged_size = zone.hotplugged_size.unwrap_or(0); 1115 let region_size = region.len(); 1116 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 1117 region, 1118 virtio_device: None, 1119 hotplugged_size, 1120 hugepages: zone.hugepages, 1121 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 1122 }); 1123 1124 start_of_device_area = start_addr 1125 .checked_add(hotplug_size) 1126 .ok_or(Error::GuestAddressOverFlow)?; 1127 } 1128 } 1129 } else { 1130 return Err(Error::MissingZoneIdentifier); 1131 } 1132 } 1133 1134 let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT); 1135 hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default); 1136 1137 ( 1138 start_of_device_area, 1139 ram_size, 1140 ram_size, 1141 arch_mem_regions, 1142 memory_zones, 1143 guest_memory, 1144 boot_guest_memory, 1145 hotplug_slots, 1146 0, 1147 0, 1148 0, 1149 ) 1150 }; 1151 1152 let guest_memory = GuestMemoryAtomic::new(guest_memory); 1153 1154 // Both MMIO and PIO address spaces start at address 0. 
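        // On x86_64 the PIO space handed to the SystemAllocator below covers
        // the full 64 KiB of I/O ports (the `1 << 16` argument).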
1155 let allocator = Arc::new(Mutex::new( 1156 SystemAllocator::new( 1157 #[cfg(target_arch = "x86_64")] 1158 { 1159 GuestAddress(0) 1160 }, 1161 #[cfg(target_arch = "x86_64")] 1162 { 1163 1 << 16 1164 }, 1165 start_of_platform_device_area, 1166 PLATFORM_DEVICE_AREA_SIZE, 1167 #[cfg(target_arch = "x86_64")] 1168 vec![GsiApic::new( 1169 X86_64_IRQ_BASE, 1170 ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, 1171 )], 1172 ) 1173 .ok_or(Error::CreateSystemAllocator)?, 1174 )); 1175 1176 #[cfg(not(feature = "tdx"))] 1177 let dynamic = true; 1178 #[cfg(feature = "tdx")] 1179 let dynamic = !tdx_enabled; 1180 1181 let acpi_address = if dynamic 1182 && config.hotplug_method == HotplugMethod::Acpi 1183 && (config.hotplug_size.unwrap_or_default() > 0) 1184 { 1185 Some( 1186 allocator 1187 .lock() 1188 .unwrap() 1189 .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None) 1190 .ok_or(Error::AllocateMmioAddress)?, 1191 ) 1192 } else { 1193 None 1194 }; 1195 1196 // If running on SGX the start of device area and RAM area may diverge but 1197 // at this point they are next to each other. 1198 let end_of_ram_area = start_of_device_area.unchecked_sub(1); 1199 let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap(); 1200 1201 let mut memory_manager = MemoryManager { 1202 boot_guest_memory, 1203 guest_memory, 1204 next_memory_slot, 1205 start_of_device_area, 1206 end_of_device_area, 1207 end_of_ram_area, 1208 vm, 1209 hotplug_slots, 1210 selected_slot, 1211 mergeable: config.mergeable, 1212 allocator, 1213 hotplug_method: config.hotplug_method, 1214 boot_ram, 1215 current_ram, 1216 next_hotplug_slot, 1217 shared: config.shared, 1218 hugepages: config.hugepages, 1219 hugepage_size: config.hugepage_size, 1220 prefault: config.prefault, 1221 #[cfg(target_arch = "x86_64")] 1222 sgx_epc_region: None, 1223 user_provided_zones, 1224 snapshot_memory_ranges: MemoryRangeTable::default(), 1225 memory_zones, 1226 guest_ram_mappings: Vec::new(), 1227 acpi_address, 1228 log_dirty: dynamic, // Cannot log dirty pages on a TD 1229 arch_mem_regions, 1230 ram_allocator, 1231 dynamic, 1232 #[cfg(target_arch = "aarch64")] 1233 uefi_flash: None, 1234 thp: config.thp, 1235 }; 1236 1237 #[cfg(target_arch = "aarch64")] 1238 { 1239 // For Aarch64 we cannot lazily allocate the address space like we 1240 // do for x86, because while restoring a VM from snapshot we would 1241 // need the address space to be allocated to properly restore VGIC. 1242 // And the restore of VGIC happens before we attempt to run the vCPUs 1243 // for the first time, thus we need to allocate the address space 1244 // beforehand. 
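            // add_uefi_flash() below likewise has to register the 4 MiB flash
            // region with the hypervisor before the guest starts.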
1245 memory_manager.allocate_address_space()?; 1246 memory_manager.add_uefi_flash()?; 1247 } 1248 1249 #[cfg(target_arch = "x86_64")] 1250 if let Some(sgx_epc_config) = sgx_epc_config { 1251 memory_manager.setup_sgx(sgx_epc_config)?; 1252 } 1253 1254 Ok(Arc::new(Mutex::new(memory_manager))) 1255 } 1256 1257 pub fn new_from_snapshot( 1258 snapshot: &Snapshot, 1259 vm: Arc<dyn hypervisor::Vm>, 1260 config: &MemoryConfig, 1261 source_url: Option<&str>, 1262 prefault: bool, 1263 phys_bits: u8, 1264 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 1265 if let Some(source_url) = source_url { 1266 let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?; 1267 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 1268 1269 let mem_snapshot: MemoryManagerSnapshotData = 1270 snapshot.to_state().map_err(Error::Restore)?; 1271 1272 let mm = MemoryManager::new( 1273 vm, 1274 config, 1275 Some(prefault), 1276 phys_bits, 1277 #[cfg(feature = "tdx")] 1278 false, 1279 Some(&mem_snapshot), 1280 None, 1281 #[cfg(target_arch = "x86_64")] 1282 None, 1283 )?; 1284 1285 mm.lock() 1286 .unwrap() 1287 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?; 1288 1289 Ok(mm) 1290 } else { 1291 Err(Error::RestoreMissingSourceUrl) 1292 } 1293 } 1294 1295 fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> { 1296 // SAFETY: FFI call with correct arguments 1297 let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) }; 1298 1299 if res < 0 { 1300 Err(io::Error::last_os_error()) 1301 } else { 1302 Ok(res as RawFd) 1303 } 1304 } 1305 1306 fn mbind( 1307 addr: *mut u8, 1308 len: u64, 1309 mode: u32, 1310 nodemask: Vec<u64>, 1311 maxnode: u64, 1312 flags: u32, 1313 ) -> Result<(), io::Error> { 1314 // SAFETY: FFI call with correct arguments 1315 let res = unsafe { 1316 libc::syscall( 1317 libc::SYS_mbind, 1318 addr as *mut libc::c_void, 1319 len, 1320 mode, 1321 nodemask.as_ptr(), 1322 maxnode, 1323 flags, 1324 ) 1325 }; 1326 1327 if res < 0 { 1328 Err(io::Error::last_os_error()) 1329 } else { 1330 Ok(()) 1331 } 1332 } 1333 1334 fn create_anonymous_file( 1335 size: usize, 1336 hugepages: bool, 1337 hugepage_size: Option<u64>, 1338 ) -> Result<FileOffset, Error> { 1339 let fd = Self::memfd_create( 1340 &ffi::CString::new("ch_ram").unwrap(), 1341 libc::MFD_CLOEXEC 1342 | if hugepages { 1343 libc::MFD_HUGETLB 1344 | if let Some(hugepage_size) = hugepage_size { 1345 /* 1346 * From the Linux kernel: 1347 * Several system calls take a flag to request "hugetlb" huge pages. 1348 * Without further specification, these system calls will use the 1349 * system's default huge page size. If a system supports multiple 1350 * huge page sizes, the desired huge page size can be specified in 1351 * bits [26:31] of the flag arguments. The value in these 6 bits 1352 * will encode the log2 of the huge page size. 
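                         * Example (illustrative): with 2 MiB huge pages,
                         * log2(2 MiB) = 21, so `hugepage_size.trailing_zeros() << 26`
                         * below places 21 into bits [26:31], i.e. the MFD_HUGE_2MB encoding.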
1353 */ 1354 1355 hugepage_size.trailing_zeros() << 26 1356 } else { 1357 // Use the system default huge page size 1358 0 1359 } 1360 } else { 1361 0 1362 }, 1363 ) 1364 .map_err(Error::SharedFileCreate)?; 1365 1366 // SAFETY: fd is valid 1367 let f = unsafe { File::from_raw_fd(fd) }; 1368 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?; 1369 1370 Ok(FileOffset::new(f, 0)) 1371 } 1372 1373 fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> { 1374 if backing_file.is_dir() { 1375 Err(Error::DirectoryAsBackingFileForMemory) 1376 } else { 1377 let f = OpenOptions::new() 1378 .read(true) 1379 .write(true) 1380 .open(backing_file) 1381 .map_err(Error::SharedFileCreate)?; 1382 1383 Ok(FileOffset::new(f, file_offset)) 1384 } 1385 } 1386 1387 #[allow(clippy::too_many_arguments)] 1388 pub fn create_ram_region( 1389 backing_file: &Option<PathBuf>, 1390 file_offset: u64, 1391 start_addr: GuestAddress, 1392 size: usize, 1393 prefault: bool, 1394 shared: bool, 1395 hugepages: bool, 1396 hugepage_size: Option<u64>, 1397 host_numa_node: Option<u32>, 1398 existing_memory_file: Option<File>, 1399 thp: bool, 1400 ) -> Result<Arc<GuestRegionMmap>, Error> { 1401 let mut mmap_flags = libc::MAP_NORESERVE; 1402 1403 // The duplication of mmap_flags ORing here is unfortunate but it also makes 1404 // the complexity of the handling clear. 1405 let fo = if let Some(f) = existing_memory_file { 1406 // It must be MAP_SHARED as we wouldn't already have an FD 1407 mmap_flags |= libc::MAP_SHARED; 1408 Some(FileOffset::new(f, file_offset)) 1409 } else if let Some(backing_file) = backing_file { 1410 if shared { 1411 mmap_flags |= libc::MAP_SHARED; 1412 } else { 1413 mmap_flags |= libc::MAP_PRIVATE; 1414 } 1415 Some(Self::open_backing_file(backing_file, file_offset)?) 1416 } else if shared || hugepages { 1417 // For hugepages we must also MAP_SHARED otherwise we will trigger #4805 1418 // because the MAP_PRIVATE will trigger CoW against the backing file with 1419 // the VFIO pinning 1420 mmap_flags |= libc::MAP_SHARED; 1421 Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?) 1422 } else { 1423 mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS; 1424 None 1425 }; 1426 1427 let region = GuestRegionMmap::new( 1428 MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags) 1429 .map_err(Error::GuestMemoryRegion)?, 1430 start_addr, 1431 ) 1432 .map_err(Error::GuestMemory)?; 1433 1434 // Apply NUMA policy if needed. 1435 if let Some(node) = host_numa_node { 1436 let addr = region.deref().as_ptr(); 1437 let len = region.deref().size() as u64; 1438 let mode = MPOL_BIND; 1439 let mut nodemask: Vec<u64> = Vec::new(); 1440 let flags = MPOL_MF_STRICT | MPOL_MF_MOVE; 1441 1442 // Linux is kind of buggy in the way it interprets maxnode as it 1443 // will cut off the last node. That's why we have to add 1 to what 1444 // we would consider as the proper maxnode value. 1445 let maxnode = node as u64 + 1 + 1; 1446 1447 // Allocate the right size for the vector. 1448 nodemask.resize((node as usize / 64) + 1, 0); 1449 1450 // Fill the global bitmask through the nodemask vector. 1451 let idx = (node / 64) as usize; 1452 let shift = node % 64; 1453 nodemask[idx] |= 1u64 << shift; 1454 1455 // Policies are enforced by using MPOL_MF_MOVE flag as it will 1456 // force the kernel to move all pages that might have been already 1457 // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is 1458 // used to throw an error if MPOL_MF_MOVE didn't succeed. 
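        // As an illustration, with host_numa_node = 3: idx = 0, shift = 3, so
        // nodemask[0] = 0b1000 and maxnode = 3 + 1 + 1 = 5.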
1459 // MPOL_BIND is the selected mode as it specifies a strict policy 1460 // that restricts memory allocation to the nodes specified in the 1461 // nodemask. 1462 Self::mbind(addr, len, mode, nodemask, maxnode, flags) 1463 .map_err(Error::ApplyNumaPolicy)?; 1464 } 1465 1466 // Prefault the region if needed, in parallel. 1467 if prefault { 1468 let page_size = 1469 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize; 1470 1471 if !is_aligned(size, page_size) { 1472 warn!( 1473 "Prefaulting memory size {} misaligned with page size {}", 1474 size, page_size 1475 ); 1476 } 1477 1478 let num_pages = size / page_size; 1479 1480 let num_threads = Self::get_prefault_num_threads(page_size, num_pages); 1481 1482 let pages_per_thread = num_pages / num_threads; 1483 let remainder = num_pages % num_threads; 1484 1485 let barrier = Arc::new(Barrier::new(num_threads)); 1486 thread::scope(|s| { 1487 let r = ®ion; 1488 for i in 0..num_threads { 1489 let barrier = Arc::clone(&barrier); 1490 s.spawn(move || { 1491 // Wait until all threads have been spawned to avoid contention 1492 // over mmap_sem between thread stack allocation and page faulting. 1493 barrier.wait(); 1494 let pages = pages_per_thread + if i < remainder { 1 } else { 0 }; 1495 let offset = 1496 page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder)); 1497 // SAFETY: FFI call with correct arguments 1498 let ret = unsafe { 1499 let addr = r.as_ptr().add(offset); 1500 libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE) 1501 }; 1502 if ret != 0 { 1503 let e = io::Error::last_os_error(); 1504 warn!("Failed to prefault pages: {}", e); 1505 } 1506 }); 1507 } 1508 }); 1509 } 1510 1511 if region.file_offset().is_none() && thp { 1512 info!( 1513 "Anonymous mapping at 0x{:x} (size = 0x{:x})", 1514 region.as_ptr() as u64, 1515 size 1516 ); 1517 // SAFETY: FFI call with correct arguments 1518 let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) }; 1519 if ret != 0 { 1520 let e = io::Error::last_os_error(); 1521 warn!("Failed to mark pages as THP eligible: {}", e); 1522 } 1523 } 1524 1525 Ok(Arc::new(region)) 1526 } 1527 1528 // Duplicate of `memory_zone_get_align_size` that does not require a `zone` 1529 fn get_prefault_align_size( 1530 backing_file: &Option<PathBuf>, 1531 hugepages: bool, 1532 hugepage_size: Option<u64>, 1533 ) -> Result<u64, Error> { 1534 // SAFETY: FFI call. Trivially safe. 1535 let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 }; 1536 match (hugepages, hugepage_size, backing_file) { 1537 (false, _, _) => Ok(page_size), 1538 (true, Some(hugepage_size), _) => Ok(hugepage_size), 1539 (true, None, _) => { 1540 // There are two scenarios here: 1541 // - `hugepages` is enabled but `hugepage_size` is not specified: 1542 // Call `statfs` for `/dev/hugepages` for getting the default size of hugepage 1543 // - The backing file is specified: 1544 // Call `statfs` for the file and get its `f_bsize`. If the value is larger than the page 1545 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the 1546 // value is less than or equal to the page size, just use the page size. 
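                // (On a hugetlbfs mount, `f_bsize` is the huge page size, commonly
                // 2 MiB on x86_64, which is why the `max()` below prefers it over
                // the base page size.)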
1547 let path = backing_file 1548 .as_ref() 1549 .map_or(Ok("/dev/hugepages"), |pathbuf| { 1550 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters) 1551 })?; 1552 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?); 1553 Ok(align_size) 1554 } 1555 } 1556 } 1557 1558 fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize { 1559 let mut n: usize = 1; 1560 1561 // Do not create more threads than processors available. 1562 // SAFETY: FFI call. Trivially safe. 1563 let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) }; 1564 if procs > 0 { 1565 n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT); 1566 } 1567 1568 // Do not create more threads than pages being allocated. 1569 n = std::cmp::min(n, num_pages); 1570 1571 // Do not create threads to allocate less than 64 MiB of memory. 1572 n = std::cmp::min( 1573 n, 1574 std::cmp::max(1, page_size * num_pages / (64 * (1 << 26))), 1575 ); 1576 1577 n 1578 } 1579 1580 // Update the GuestMemoryMmap with the new range 1581 fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> { 1582 let guest_memory = self 1583 .guest_memory 1584 .memory() 1585 .insert_region(region) 1586 .map_err(Error::GuestMemory)?; 1587 self.guest_memory.lock().unwrap().replace(guest_memory); 1588 1589 Ok(()) 1590 } 1591 1592 // 1593 // Calculate the start address of an area next to RAM. 1594 // 1595 // If memory hotplug is allowed, the start address needs to be aligned 1596 // (rounded-up) to 128MiB boundary. 1597 // If memory hotplug is not allowed, there is no alignment required. 1598 // And it must also start at the 64bit start. 1599 fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> { 1600 let mut start_addr = if allow_mem_hotplug { 1601 GuestAddress(mem_end.0 | ((128 << 20) - 1)) 1602 } else { 1603 mem_end 1604 }; 1605 1606 start_addr = start_addr 1607 .checked_add(1) 1608 .ok_or(Error::GuestAddressOverFlow)?; 1609 1610 if mem_end < arch::layout::MEM_32BIT_RESERVED_START { 1611 return Ok(arch::layout::RAM_64BIT_START); 1612 } 1613 1614 Ok(start_addr) 1615 } 1616 1617 pub fn add_ram_region( 1618 &mut self, 1619 start_addr: GuestAddress, 1620 size: usize, 1621 ) -> Result<Arc<GuestRegionMmap>, Error> { 1622 // Allocate memory for the region 1623 let region = MemoryManager::create_ram_region( 1624 &None, 1625 0, 1626 start_addr, 1627 size, 1628 self.prefault, 1629 self.shared, 1630 self.hugepages, 1631 self.hugepage_size, 1632 None, 1633 None, 1634 self.thp, 1635 )?; 1636 1637 // Map it into the guest 1638 let slot = self.create_userspace_mapping( 1639 region.start_addr().0, 1640 region.len(), 1641 region.as_ptr() as u64, 1642 self.mergeable, 1643 false, 1644 self.log_dirty, 1645 )?; 1646 self.guest_ram_mappings.push(GuestRamMapping { 1647 gpa: region.start_addr().raw_value(), 1648 size: region.len(), 1649 slot, 1650 zone_id: DEFAULT_MEMORY_ZONE.to_string(), 1651 virtio_mem: false, 1652 file_offset: 0, 1653 }); 1654 1655 self.add_region(Arc::clone(®ion))?; 1656 1657 Ok(region) 1658 } 1659 1660 fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> { 1661 info!("Hotplugging new RAM: {}", size); 1662 1663 // Check that there is a free slot 1664 if self.next_hotplug_slot >= HOTPLUG_COUNT { 1665 return Err(Error::NoSlotAvailable); 1666 } 1667 1668 // "Inserted" DIMM must have a size that is a multiple of 128MiB 1669 if size % (128 << 20) != 0 { 1670 return Err(Error::InvalidSize); 1671 } 1672 1673 let start_addr = 
MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?; 1674 1675 if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area { 1676 return Err(Error::InsufficientHotplugRam); 1677 } 1678 1679 let region = self.add_ram_region(start_addr, size)?; 1680 1681 // Add region to the list of regions associated with the default 1682 // memory zone. 1683 if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) { 1684 memory_zone.regions.push(Arc::clone(®ion)); 1685 } 1686 1687 // Tell the allocator 1688 self.ram_allocator 1689 .allocate(Some(start_addr), size as GuestUsize, None) 1690 .ok_or(Error::MemoryRangeAllocation)?; 1691 1692 // Update the slot so that it can be queried via the I/O port 1693 let slot = &mut self.hotplug_slots[self.next_hotplug_slot]; 1694 slot.active = true; 1695 slot.inserting = true; 1696 slot.base = region.start_addr().0; 1697 slot.length = region.len(); 1698 1699 self.next_hotplug_slot += 1; 1700 1701 Ok(region) 1702 } 1703 1704 pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 1705 self.guest_memory.clone() 1706 } 1707 1708 pub fn boot_guest_memory(&self) -> GuestMemoryMmap { 1709 self.boot_guest_memory.clone() 1710 } 1711 1712 pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> { 1713 self.allocator.clone() 1714 } 1715 1716 pub fn start_of_device_area(&self) -> GuestAddress { 1717 self.start_of_device_area 1718 } 1719 1720 pub fn end_of_device_area(&self) -> GuestAddress { 1721 self.end_of_device_area 1722 } 1723 1724 pub fn allocate_memory_slot(&mut self) -> u32 { 1725 let slot_id = self.next_memory_slot; 1726 self.next_memory_slot += 1; 1727 slot_id 1728 } 1729 1730 pub fn create_userspace_mapping( 1731 &mut self, 1732 guest_phys_addr: u64, 1733 memory_size: u64, 1734 userspace_addr: u64, 1735 mergeable: bool, 1736 readonly: bool, 1737 log_dirty: bool, 1738 ) -> Result<u32, Error> { 1739 let slot = self.allocate_memory_slot(); 1740 let mem_region = self.vm.make_user_memory_region( 1741 slot, 1742 guest_phys_addr, 1743 memory_size, 1744 userspace_addr, 1745 readonly, 1746 log_dirty, 1747 ); 1748 1749 info!( 1750 "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}", 1751 guest_phys_addr, userspace_addr, memory_size, slot 1752 ); 1753 1754 self.vm 1755 .create_user_memory_region(mem_region) 1756 .map_err(Error::CreateUserMemoryRegion)?; 1757 1758 // SAFETY: the address and size are valid since the 1759 // mmap succeeded. 1760 let ret = unsafe { 1761 libc::madvise( 1762 userspace_addr as *mut libc::c_void, 1763 memory_size as libc::size_t, 1764 libc::MADV_DONTDUMP, 1765 ) 1766 }; 1767 if ret != 0 { 1768 let e = io::Error::last_os_error(); 1769 warn!("Failed to mark mappin as MADV_DONTDUMP: {}", e); 1770 } 1771 1772 // Mark the pages as mergeable if explicitly asked for. 1773 if mergeable { 1774 // SAFETY: the address and size are valid since the 1775 // mmap succeeded. 1776 let ret = unsafe { 1777 libc::madvise( 1778 userspace_addr as *mut libc::c_void, 1779 memory_size as libc::size_t, 1780 libc::MADV_MERGEABLE, 1781 ) 1782 }; 1783 if ret != 0 { 1784 let err = io::Error::last_os_error(); 1785 // Safe to unwrap because the error is constructed with 1786 // last_os_error(), which ensures the output will be Some(). 
1787 let errno = err.raw_os_error().unwrap(); 1788 if errno == libc::EINVAL { 1789 warn!("kernel not configured with CONFIG_KSM"); 1790 } else { 1791 warn!("madvise error: {}", err); 1792 } 1793 warn!("failed to mark pages as mergeable"); 1794 } 1795 } 1796 1797 info!( 1798 "Created userspace mapping: {:x} -> {:x} {:x}", 1799 guest_phys_addr, userspace_addr, memory_size 1800 ); 1801 1802 Ok(slot) 1803 } 1804 1805 pub fn remove_userspace_mapping( 1806 &mut self, 1807 guest_phys_addr: u64, 1808 memory_size: u64, 1809 userspace_addr: u64, 1810 mergeable: bool, 1811 slot: u32, 1812 ) -> Result<(), Error> { 1813 let mem_region = self.vm.make_user_memory_region( 1814 slot, 1815 guest_phys_addr, 1816 memory_size, 1817 userspace_addr, 1818 false, /* readonly -- don't care */ 1819 false, /* log dirty */ 1820 ); 1821 1822 self.vm 1823 .remove_user_memory_region(mem_region) 1824 .map_err(Error::RemoveUserMemoryRegion)?; 1825 1826 // Mark the pages as unmergeable if there were previously marked as 1827 // mergeable. 1828 if mergeable { 1829 // SAFETY: the address and size are valid as the region was 1830 // previously advised. 1831 let ret = unsafe { 1832 libc::madvise( 1833 userspace_addr as *mut libc::c_void, 1834 memory_size as libc::size_t, 1835 libc::MADV_UNMERGEABLE, 1836 ) 1837 }; 1838 if ret != 0 { 1839 let err = io::Error::last_os_error(); 1840 // Safe to unwrap because the error is constructed with 1841 // last_os_error(), which ensures the output will be Some(). 1842 let errno = err.raw_os_error().unwrap(); 1843 if errno == libc::EINVAL { 1844 warn!("kernel not configured with CONFIG_KSM"); 1845 } else { 1846 warn!("madvise error: {}", err); 1847 } 1848 warn!("failed to mark pages as unmergeable"); 1849 } 1850 } 1851 1852 info!( 1853 "Removed userspace mapping: {:x} -> {:x} {:x}", 1854 guest_phys_addr, userspace_addr, memory_size 1855 ); 1856 1857 Ok(()) 1858 } 1859 1860 pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> { 1861 if let Some(memory_zone) = self.memory_zones.get_mut(id) { 1862 if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone { 1863 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() { 1864 virtio_mem_device 1865 .lock() 1866 .unwrap() 1867 .resize(size) 1868 .map_err(Error::VirtioMemResizeFail)?; 1869 } 1870 1871 // Keep the hotplugged_size up to date. 1872 virtio_mem_zone.hotplugged_size = size; 1873 } else { 1874 error!("Failed resizing virtio-mem region: No virtio-mem handler"); 1875 return Err(Error::MissingVirtioMemHandler); 1876 } 1877 1878 return Ok(()); 1879 } 1880 1881 error!("Failed resizing virtio-mem region: Unknown memory zone"); 1882 Err(Error::UnknownMemoryZone) 1883 } 1884 1885 /// In case this function resulted in adding a new memory region to the 1886 /// guest memory, the new region is returned to the caller. The virtio-mem 1887 /// use case never adds a new region as the whole hotpluggable memory has 1888 /// already been allocated at boot time. 1889 pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> { 1890 if self.user_provided_zones { 1891 error!( 1892 "Not allowed to resize guest memory when backed with user \ 1893 defined memory zones." 
    pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
        if let Some(memory_zone) = self.memory_zones.get_mut(id) {
            if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
                if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
                    virtio_mem_device
                        .lock()
                        .unwrap()
                        .resize(size)
                        .map_err(Error::VirtioMemResizeFail)?;
                }

                // Keep the hotplugged_size up to date.
                virtio_mem_zone.hotplugged_size = size;
            } else {
                error!("Failed resizing virtio-mem region: No virtio-mem handler");
                return Err(Error::MissingVirtioMemHandler);
            }

            return Ok(());
        }

        error!("Failed resizing virtio-mem region: Unknown memory zone");
        Err(Error::UnknownMemoryZone)
    }

    /// In case this function resulted in adding a new memory region to the
    /// guest memory, the new region is returned to the caller. The virtio-mem
    /// use case never adds a new region as the whole hotpluggable memory has
    /// already been allocated at boot time.
    pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
        if self.user_provided_zones {
            error!(
                "Not allowed to resize guest memory when backed with user \
                defined memory zones."
            );
            return Err(Error::InvalidResizeWithMemoryZones);
        }

        let mut region: Option<Arc<GuestRegionMmap>> = None;
        match self.hotplug_method {
            HotplugMethod::VirtioMem => {
                if desired_ram >= self.boot_ram {
                    if !self.dynamic {
                        return Ok(region);
                    }

                    self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
                    self.current_ram = desired_ram;
                }
            }
            HotplugMethod::Acpi => {
                if desired_ram > self.current_ram {
                    if !self.dynamic {
                        return Ok(region);
                    }

                    region =
                        Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
                    self.current_ram = desired_ram;
                }
            }
        }
        Ok(region)
    }

    pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
        if !self.user_provided_zones {
            error!(
                "Not allowed to resize guest memory zone when no zone is \
                defined."
            );
            return Err(Error::ResizeZone);
        }

        self.virtio_mem_resize(id, virtio_mem_size)
    }

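    /// Carve a contiguous SGX EPC region out of the space between the end of
    /// RAM and the device area, then mmap each configured section from
    /// /dev/sgx_vepc and register it with the hypervisor.
    ///
    /// The region start is aligned up to SGX_PAGE_SIZE; for example (numbers
    /// purely illustrative), a start_of_device_area of 0x1_0000_0123 would be
    /// rounded up to 0x1_0000_1000 with 4 KiB pages.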
    #[cfg(target_arch = "x86_64")]
    pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
        let file = OpenOptions::new()
            .read(true)
            .open("/dev/sgx_provision")
            .map_err(Error::SgxProvisionOpen)?;
        self.vm
            .enable_sgx_attribute(file)
            .map_err(Error::SgxEnableProvisioning)?;

        // Go over each EPC section and verify its size is a 4k multiple. At
        // the same time, calculate the total size needed for the contiguous
        // EPC region.
        let mut epc_region_size = 0;
        for epc_section in sgx_epc_config.iter() {
            if epc_section.size == 0 {
                return Err(Error::EpcSectionSizeInvalid);
            }
            if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
                return Err(Error::EpcSectionSizeInvalid);
            }

            epc_region_size += epc_section.size;
        }

        // Place the SGX EPC region on a 4k boundary between the RAM and the device area
        let epc_region_start = GuestAddress(
            ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
        );

        self.start_of_device_area = epc_region_start
            .checked_add(epc_region_size)
            .ok_or(Error::GuestAddressOverFlow)?;

        let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
        info!(
            "SGX EPC region: 0x{:x} (0x{:x})",
            epc_region_start.0, epc_region_size
        );

        // Each section can be memory mapped into the allocated region.
        let mut epc_section_start = epc_region_start.raw_value();
        for epc_section in sgx_epc_config.iter() {
            let file = OpenOptions::new()
                .read(true)
                .write(true)
                .open("/dev/sgx_vepc")
                .map_err(Error::SgxVirtEpcOpen)?;

            let prot = PROT_READ | PROT_WRITE;
            let mut flags = MAP_NORESERVE | MAP_SHARED;
            if epc_section.prefault {
                flags |= MAP_POPULATE;
            }

            // We can't use the vm-memory crate to perform the memory mapping
            // here as it would try to ensure the size of the backing file is
            // matching the size of the expected mapping. The /dev/sgx_vepc
            // device does not work that way, it provides a file descriptor
            // which is not matching the mapping size, as it's just a way to
            // let KVM know that an EPC section is being created for the guest.
            // SAFETY: FFI call with correct arguments
            let host_addr = unsafe {
                libc::mmap(
                    std::ptr::null_mut(),
                    epc_section.size as usize,
                    prot,
                    flags,
                    file.as_raw_fd(),
                    0,
                )
            } as u64;

            info!(
                "Adding SGX EPC section: 0x{:x} (0x{:x})",
                epc_section_start, epc_section.size
            );

            let _mem_slot = self.create_userspace_mapping(
                epc_section_start,
                epc_section.size,
                host_addr,
                false,
                false,
                false,
            )?;

            sgx_epc_region.insert(
                epc_section.id.clone(),
                SgxEpcSection::new(
                    GuestAddress(epc_section_start),
                    epc_section.size as GuestUsize,
                ),
            );

            epc_section_start += epc_section.size;
        }

        self.sgx_epc_region = Some(sgx_epc_region);

        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
        &self.sgx_epc_region
    }

    pub fn is_hardlink(f: &File) -> bool {
        let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
        // SAFETY: FFI call with correct arguments
        let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
        if ret != 0 {
            error!("Couldn't fstat the backing file");
            return false;
        }

        // SAFETY: stat is valid
        unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
    }

    pub fn memory_zones(&self) -> &MemoryZones {
        &self.memory_zones
    }

    pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
        &mut self.memory_zones
    }

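    /// Build the table of guest memory ranges to be copied out.
    ///
    /// When `snapshot` is true, regions backed by a shared mapping of a
    /// still-linked host file are skipped, as their content is already
    /// persisted through the backing file and does not need to be duplicated.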
    pub fn memory_range_table(
        &self,
        snapshot: bool,
    ) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();

        for memory_zone in self.memory_zones.values() {
            if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
                table.extend(virtio_mem_zone.plugged_ranges());
            }

            for region in memory_zone.regions() {
                if snapshot {
                    if let Some(file_offset) = region.file_offset() {
                        if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
                            && Self::is_hardlink(file_offset.file())
                        {
                            // In this very specific case, we know the memory
                            // region is backed by a file on the host filesystem
                            // that can be accessed by the user, and additionally
                            // the mapping is shared, which means that modifications
                            // to the content are written to the actual file.
                            // When meeting these conditions, we can skip the
                            // copy of the memory content for this specific region,
                            // as we can assume the user will have it saved through
                            // the backing file already.
                            continue;
                        }
                    }
                }

                table.push(MemoryRange {
                    gpa: region.start_addr().raw_value(),
                    length: region.len(),
                });
            }
        }

        Ok(table)
    }

    pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
        MemoryManagerSnapshotData {
            memory_ranges: self.snapshot_memory_ranges.clone(),
            guest_ram_mappings: self.guest_ram_mappings.clone(),
            start_of_device_area: self.start_of_device_area.0,
            boot_ram: self.boot_ram,
            current_ram: self.current_ram,
            arch_mem_regions: self.arch_mem_regions.clone(),
            hotplug_slots: self.hotplug_slots.clone(),
            next_memory_slot: self.next_memory_slot,
            selected_slot: self.selected_slot,
            next_hotplug_slot: self.next_hotplug_slot,
        }
    }

    pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
        let mut memory_slot_fds = HashMap::new();
        for guest_ram_mapping in &self.guest_ram_mappings {
            let slot = guest_ram_mapping.slot;
            let guest_memory = self.guest_memory.memory();
            let file = guest_memory
                .find_region(GuestAddress(guest_ram_mapping.gpa))
                .unwrap()
                .file_offset()
                .unwrap()
                .file();
            memory_slot_fds.insert(slot, file.as_raw_fd());
        }
        memory_slot_fds
    }

    pub fn acpi_address(&self) -> Option<GuestAddress> {
        self.acpi_address
    }

    pub fn num_guest_ram_mappings(&self) -> u32 {
        self.guest_ram_mappings.len() as u32
    }

    #[cfg(target_arch = "aarch64")]
    pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
        self.uefi_flash.as_ref().unwrap().clone()
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
        let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
        mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);

        let mut mem_offset_in_elf = mem_offset;
        let mut ram_maps = BTreeMap::new();
        for mapping in mapping_sorted_by_gpa.iter() {
            ram_maps.insert(
                mapping.gpa,
                CoredumpMemoryRegion {
                    mem_offset_in_elf,
                    mem_size: mapping.size,
                },
            );
            mem_offset_in_elf += mapping.size;
        }

        CoredumpMemoryRegions { ram_maps }
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    pub fn coredump_iterate_save_mem(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let snapshot_memory_ranges = self
            .memory_range_table(false)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        if snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let coredump_file = dump_state.file.as_ref().unwrap();

        let guest_memory = self.guest_memory.memory();
        let mut total_bytes: u64 = 0;

        for range in snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            loop {
                let bytes_written = guest_memory
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        &mut coredump_file.as_fd(),
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
                offset += bytes_written as u64;
                total_bytes += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        debug!("coredump total bytes {}", total_bytes);
        Ok(())
    }

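    /// Populate guest memory from an incoming migration stream.
    ///
    /// A hedged sketch, assuming `table` describes the ranges announced by the
    /// source and `stream` is any reader implementing `ReadVolatile` (for
    /// instance the migration socket):
    ///
    /// ```ignore
    /// mm.receive_memory_regions(&table, &mut stream)?;
    /// ```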
    pub fn receive_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: ReadVolatile,
    {
        let guest_memory = self.guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't read
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of read_exact_from() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = mem
                    .read_volatile_from(
                        GuestAddress(range.gpa + offset),
                        fd,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| {
                        MigratableError::MigrateReceive(anyhow!(
                            "Error receiving memory from socket: {}",
                            e
                        ))
                    })?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }
}

struct MemoryNotify {
    slot_id: usize,
}

impl Aml for MemoryNotify {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        let object = aml::Path::new(&format!("M{:03}", self.slot_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.slot_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes(sink)
    }
}

struct MemorySlot {
    slot_id: usize,
}

impl Aml for MemorySlot {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        aml::Device::new(
            format!("M{:03}", self.slot_id).as_str().into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
                &aml::Name::new("_UID".into(), &self.slot_id),
                /*
                _STA return value:
                Bit [0] – Set if the device is present.
                Bit [1] – Set if the device is enabled and decoding its resources.
                Bit [2] – Set if the device should be shown in the UI.
                Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                Bit [4] – Set if the battery is present.
                Bits [31:5] – Reserved (must be cleared).
                */
                &aml::Method::new(
                    "_STA".into(),
                    0,
                    false,
                    // Call into MSTA method which will interrogate device
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MSTA".into(),
                        vec![&self.slot_id],
                    ))],
                ),
                // Get details of memory
                &aml::Method::new(
                    "_CRS".into(),
                    0,
                    false,
                    // Call into MCRS which provides actual memory details
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MCRS".into(),
                        vec![&self.slot_id],
                    ))],
                ),
            ],
        )
        .to_aml_bytes(sink)
    }
}

struct MemorySlots {
    slots: usize,
}

impl Aml for MemorySlots {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        for slot_id in 0..self.slots {
            MemorySlot { slot_id }.to_aml_bytes(sink);
        }
    }
}

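// MemoryMethods emits the AML methods shared by all hotplug slots:
// - MTFY dispatches a Notify to the matching per-slot device (M000, M001, ...)
// - MSCN scans every slot and notifies the guest about insertions and removals
// - MSTA returns the _STA value (0 or 0xf) for the selected slot
// - MCRS returns the _CRS resource descriptor for the selected slot
// All of them serialize access to the slot selector through the MLCK mutex.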
struct MemoryMethods {
    slots: usize,
}

impl Aml for MemoryMethods {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        // Add "MTFY" notification method
        let mut memory_notifies = Vec::new();
        for slot_id in 0..self.slots {
            memory_notifies.push(MemoryNotify { slot_id });
        }

        let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
        for memory_notifier in memory_notifies.iter() {
            memory_notifies_refs.push(memory_notifier);
        }

        aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);

        // MSCN method
        aml::Method::new(
            "MSCN".into(),
            0,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                &aml::While::new(
                    &aml::LessThan::new(&aml::Local(0), &self.slots),
                    vec![
                        // Write slot number (loop index in Local0) to the slot selector field
                        &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
                        // Check if MINS bit is set (inserting)
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            // Notify device if it is
                            vec![
                                &aml::MethodCall::new(
                                    "MTFY".into(),
                                    vec![&aml::Local(0), &aml::ONE],
                                ),
                                // Reset MINS bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            ],
                        ),
                        // Check if MRMV bit is set
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            // Notify device if it is (with the eject constant 0x3)
                            vec![
                                &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
                                // Reset MRMV bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            ],
                        ),
                        &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                    ],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
            ],
        )
        .to_aml_bytes(sink);

        // Memory status method
        aml::Method::new(
            "MSTA".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                &aml::If::new(
                    &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
                    vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
                // Return 0 or 0xf
                &aml::Return::new(&aml::Local(0)),
            ],
        )
        .to_aml_bytes(sink);

        // Memory range method
        aml::Method::new(
            "MCRS".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Name::new(
                    "MR64".into(),
                    &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                        aml::AddressSpaceCacheable::Cacheable,
                        true,
                        0x0000_0000_0000_0000u64,
                        0xFFFF_FFFF_FFFF_FFFEu64,
                        None,
                    )]),
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("MINL"),
                    &aml::Path::new("MR64"),
                    &14usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("MINH"),
                    &aml::Path::new("MR64"),
                    &18usize,
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MR64"),
                    &22usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MR64"),
                    &26usize,
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("LENL"),
                    &aml::Path::new("MR64"),
                    &38usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("LENH"),
                    &aml::Path::new("MR64"),
                    &42usize,
                ),
                &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
                &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
                &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
                &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
                &aml::Add::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MINL"),
                    &aml::Path::new("LENL"),
                ),
                &aml::Add::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MINH"),
                    &aml::Path::new("LENH"),
                ),
                &aml::If::new(
                    &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
                    vec![&aml::Add::new(
                        &aml::Path::new("MAXH"),
                        &aml::ONE,
                        &aml::Path::new("MAXH"),
                    )],
                ),
                &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
                // Release lock
                &aml::Release::new("MLCK".into()),
                &aml::Return::new(&aml::Path::new("MR64")),
            ],
        )
        .to_aml_bytes(sink)
    }
}

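// The MemoryManager presents itself to the guest as the "MHPC" memory hotplug
// controller. When an ACPI MMIO window has been allocated, the full controller
// is emitted (op-region, fields, per-slot devices and methods); otherwise only
// a stub device with an empty MSCN method is generated so that GED handlers
// still resolve.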
impl Aml for MemoryManager {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        if let Some(acpi_address) = self.acpi_address {
            // Memory Hotplug Controller
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose slot and then read back status
                    &aml::Mutex::new("MLCK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCacheable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
                            None,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "MHPR".into(),
                        aml::OpRegionSpace::SystemMemory,
                        &(acpi_address.0 as usize),
                        &MEMORY_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
                            aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Reserved(128),
                            aml::FieldEntry::Named(*b"MHPX", 32), // PXM
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(160),
                            aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
                            aml::FieldEntry::Named(*b"MINS", 1), // Inserting
                            aml::FieldEntry::Named(*b"MRMV", 1), // Removing
                            aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MSEL", 32), // Selector
                            aml::FieldEntry::Named(*b"MOEV", 32), // Event
                            aml::FieldEntry::Named(*b"MOSC", 32), // OSC
                        ],
                    ),
                    &MemoryMethods {
                        slots: self.hotplug_slots.len(),
                    },
                    &MemorySlots {
                        slots: self.hotplug_slots.len(),
                    },
                ],
            )
            .to_aml_bytes(sink);
        } else {
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Empty MSCN for GED
                    &aml::Method::new("MSCN".into(), 0, true, vec![]),
                ],
            )
            .to_aml_bytes(sink);
        }

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_region) = &self.sgx_epc_region {
                let min = sgx_epc_region.start().raw_value();
                let max = min + sgx_epc_region.size() - 1;
                // SGX EPC region
                aml::Device::new(
                    "_SB_.EPC_".into(),
                    vec![
                        &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
                        // QWORD describing the EPC region start and size
                        &aml::Name::new(
                            "_CRS".into(),
                            &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                                aml::AddressSpaceCacheable::NotCacheable,
                                true,
                                min,
                                max,
                                None,
                            )]),
                        ),
                        &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
                    ],
                )
                .to_aml_bytes(sink);
            }
        }
    }
}

impl Pausable for MemoryManager {}

#[derive(Clone, Serialize, Deserialize)]
pub struct MemoryManagerSnapshotData {
    memory_ranges: MemoryRangeTable,
    guest_ram_mappings: Vec<GuestRamMapping>,
    start_of_device_area: u64,
    boot_ram: u64,
    current_ram: u64,
    arch_mem_regions: Vec<ArchMemRegion>,
    hotplug_slots: Vec<HotPlugState>,
    next_memory_slot: u32,
    selected_slot: usize,
    next_hotplug_slot: usize,
}

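// Snapshotting is split in two steps: Snapshottable::snapshot() computes and
// caches the list of memory ranges that actually need to be copied, while
// Transportable::send() later streams those ranges into the "memory-ranges"
// file at the destination URL.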
impl Snapshottable for MemoryManager {
    fn id(&self) -> String {
        MEMORY_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
        let memory_ranges = self.memory_range_table(true)?;

        // Store locally this list of ranges as it will be used through the
        // Transportable::send() implementation. The point is to avoid the
        // duplication of code regarding the creation of the path for each
        // region. The 'snapshot' step creates the list of memory regions,
        // including information about the need to copy a memory region or
        // not. This saves the 'send' step having to go through the same
        // process, and instead it can directly proceed with storing the
        // memory range content for the ranges requiring it.
        self.snapshot_memory_ranges = memory_ranges;

        Ok(Snapshot::from_data(SnapshotData::new_from_state(
            &self.snapshot_data(),
        )?))
    }
}

impl Transportable for MemoryManager {
    fn send(
        &self,
        _snapshot: &Snapshot,
        destination_url: &str,
    ) -> result::Result<(), MigratableError> {
        if self.snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let mut memory_file_path = url_to_path(destination_url)?;
        memory_file_path.push(String::from(SNAPSHOT_FILENAME));

        // Create the snapshot file for the entire memory
        let mut memory_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(memory_file_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let guest_memory = self.guest_memory.memory();

        for range in self.snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't read
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of write_all_to() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = guest_memory
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }
        Ok(())
    }
}

impl Migratable for MemoryManager {
    // Start the dirty log in the hypervisor (kvm/mshv).
    // Also, reset the dirty bitmap logged by the vmm.
    // Just before we do a bulk copy we want to start/clear the dirty log so that
    // pages touched during our bulk copy are tracked.
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.start_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
        })?;

        for r in self.guest_memory.memory().iter() {
            r.bitmap().reset();
        }

        Ok(())
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.stop_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
        })?;

        Ok(())
    }

    // Generate a table for the pages that are dirty. The dirty pages are collapsed
    // together in the table if they are contiguous.
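    // The per-slot bitmap reported by the hypervisor is OR-ed with the VMM's
    // own write-access bitmap, and the merged bitmap is then converted into
    // ranges at 4 KiB granularity (bit n covering guest address
    // r.gpa + n * 4096).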
    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();
        for r in &self.guest_ram_mappings {
            let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
                MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
            })?;
            let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
            {
                Some(region) => {
                    assert!(region.start_addr().raw_value() == r.gpa);
                    assert!(region.len() == r.size);
                    region.bitmap().get_and_reset()
                }
                None => {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Error finding 'guest memory region' with address {:x}",
                        r.gpa
                    )))
                }
            };

            let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
                .iter()
                .zip(vmm_dirty_bitmap.iter())
                .map(|(x, y)| x | y)
                .collect();

            let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);

            if sub_table.regions().is_empty() {
                info!("Dirty Memory Range Table is empty");
            } else {
                info!("Dirty Memory Range Table:");
                for range in sub_table.regions() {
                    info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
                }
            }

            table.extend(sub_table);
        }
        Ok(table)
    }
}