// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::fs::{File, OpenOptions};
use std::io::{self};
use std::ops::{BitAnd, Deref, Not, Sub};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::os::fd::AsFd;
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
use std::path::PathBuf;
use std::sync::{Arc, Barrier, Mutex};
use std::{ffi, result, thread};

use acpi_tables::{aml, Aml};
use anyhow::anyhow;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
use arch::RegionType;
#[cfg(target_arch = "x86_64")]
use devices::ioapic;
#[cfg(target_arch = "aarch64")]
use hypervisor::HypervisorVmError;
use libc::_SC_NPROCESSORS_ONLN;
#[cfg(target_arch = "x86_64")]
use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
use serde::{Deserialize, Serialize};
use tracer::trace_scoped;
use virtio_devices::BlocksState;
#[cfg(target_arch = "x86_64")]
use vm_allocator::GsiApic;
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::BusDevice;
use vm_memory::bitmap::AtomicBitmap;
use vm_memory::guest_memory::FileOffset;
use vm_memory::mmap::MmapRegionError;
use vm_memory::{
    Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
    GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile,
};
use vm_migration::protocol::{MemoryRange, MemoryRangeTable};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, Transportable,
};

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
};
use crate::migration::url_to_path;
#[cfg(target_arch = "x86_64")]
use crate::vm_config::SgxEpcConfig;
use crate::vm_config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID};

pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;

const DEFAULT_MEMORY_ZONE: &str = "mem0";

const SNAPSHOT_FILENAME: &str = "memory-ranges";

#[cfg(target_arch = "x86_64")]
const X86_64_IRQ_BASE: u32 = 5;

#[cfg(target_arch = "x86_64")]
const SGX_PAGE_SIZE: u64 = 1 << 12;

const HOTPLUG_COUNT: usize = 8;

// Memory policy constants
const MPOL_BIND: u32 = 2;
const MPOL_MF_STRICT: u32 = 1;
const MPOL_MF_MOVE: u32 = 1 << 1;

// Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;

const MAX_PREFAULT_THREAD_COUNT: usize = 16;

#[derive(Clone, Default, Serialize, Deserialize)]
struct HotPlugState {
    base: u64,
    length: u64,
    active: bool,
    inserting: bool,
    removing: bool,
}

pub struct VirtioMemZone {
    region: Arc<GuestRegionMmap>,
    virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
    hotplugged_size: u64,
    hugepages: bool,
    blocks_state: Arc<Mutex<BlocksState>>,
}

impl VirtioMemZone {
    pub fn region(&self) -> &Arc<GuestRegionMmap> {
        &self.region
    }
    pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
        self.virtio_device = Some(virtio_device);
    }
    pub fn hotplugged_size(&self) -> u64 {
        self.hotplugged_size
    }
    pub fn hugepages(&self) -> bool {
        self.hugepages
    }
    pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
        &self.blocks_state
    }
    pub fn plugged_ranges(&self) -> MemoryRangeTable {
        self.blocks_state
            .lock()
            .unwrap()
            .memory_ranges(self.region.start_addr().raw_value(), true)
    }
}

#[derive(Default)]
pub struct MemoryZone {
    regions: Vec<Arc<GuestRegionMmap>>,
    virtio_mem_zone: Option<VirtioMemZone>,
}

impl MemoryZone {
    pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
        &self.regions
    }
    pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
        &self.virtio_mem_zone
    }
    pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
        self.virtio_mem_zone.as_mut()
    }
}

pub type MemoryZones = HashMap<String, MemoryZone>;

#[derive(Clone, Serialize, Deserialize)]
struct GuestRamMapping {
    slot: u32,
    gpa: u64,
    size: u64,
    zone_id: String,
    virtio_mem: bool,
    file_offset: u64,
}

#[derive(Clone, Serialize, Deserialize)]
struct ArchMemRegion {
    base: u64,
    size: usize,
    r_type: RegionType,
}

pub struct MemoryManager {
    boot_guest_memory: GuestMemoryMmap,
    guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    next_memory_slot: u32,
    start_of_device_area: GuestAddress,
    end_of_device_area: GuestAddress,
    end_of_ram_area: GuestAddress,
    pub vm: Arc<dyn hypervisor::Vm>,
    hotplug_slots: Vec<HotPlugState>,
    selected_slot: usize,
    mergeable: bool,
    allocator: Arc<Mutex<SystemAllocator>>,
    hotplug_method: HotplugMethod,
    boot_ram: u64,
    current_ram: u64,
    next_hotplug_slot: usize,
    shared: bool,
    hugepages: bool,
    hugepage_size: Option<u64>,
    prefault: bool,
    thp: bool,
    #[cfg(target_arch = "x86_64")]
    sgx_epc_region: Option<SgxEpcRegion>,
    user_provided_zones: bool,
    snapshot_memory_ranges: MemoryRangeTable,
    memory_zones: MemoryZones,
    log_dirty: bool, // Enable dirty logging for created RAM regions
    arch_mem_regions: Vec<ArchMemRegion>,
    ram_allocator: AddressAllocator,
    dynamic: bool,

    // Keep track of calls to create_userspace_mapping() for guest RAM.
    // This is useful for getting the dirty pages as we need to know the
    // slots that the mapping is created in.
    guest_ram_mappings: Vec<GuestRamMapping>,

    pub acpi_address: Option<GuestAddress>,
    #[cfg(target_arch = "aarch64")]
    uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
}

#[derive(Debug)]
pub enum Error {
    /// Failed to create shared file.
    SharedFileCreate(io::Error),

    /// Failed to set shared file length.
    SharedFileSetLen(io::Error),

    /// Mmap backed guest memory error
    GuestMemory(MmapError),

    /// Failed to allocate a memory range.
    MemoryRangeAllocation,

    /// Error from region creation
    GuestMemoryRegion(MmapRegionError),

    /// No ACPI slot available
    NoSlotAvailable,

    /// Not enough space in the hotplug RAM region
    InsufficientHotplugRam,

    /// The requested hotplug memory addition is not a valid size
    InvalidSize,

    /// Failed to create the user memory region.
    CreateUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to remove the user memory region.
    RemoveUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to create an EventFd.
    EventFdFail(io::Error),

    /// Eventfd write error
    EventfdError(io::Error),

    /// Failed to resize virtio-mem
    VirtioMemResizeFail(virtio_devices::mem::Error),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot restore VM because source URL is missing
    RestoreMissingSourceUrl,

    /// Cannot create the system allocator
    CreateSystemAllocator,

    /// Invalid SGX EPC section size
    #[cfg(target_arch = "x86_64")]
    EpcSectionSizeInvalid,

    /// Failed allocating SGX EPC region
    #[cfg(target_arch = "x86_64")]
    SgxEpcRangeAllocation,

    /// Failed opening SGX virtual EPC device
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcOpen(io::Error),

    /// Failed setting the SGX virtual EPC section size
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcFileSetLen(io::Error),

    /// Failed opening SGX provisioning device
    #[cfg(target_arch = "x86_64")]
    SgxProvisionOpen(io::Error),

    /// Failed enabling SGX provisioning
    #[cfg(target_arch = "x86_64")]
    SgxEnableProvisioning(hypervisor::HypervisorVmError),

    /// Failed creating a new MmapRegion instance.
    #[cfg(target_arch = "x86_64")]
    NewMmapRegion(vm_memory::mmap::MmapRegionError),

    /// No memory zones found.
    MissingMemoryZones,

    /// Memory configuration is not valid.
    InvalidMemoryParameters,

    /// Forbidden operation. Impossible to resize guest memory if it is
    /// backed by user defined memory regions.
    InvalidResizeWithMemoryZones,

    /// It's invalid to try applying a NUMA policy to a memory zone that is
    /// memory mapped with MAP_SHARED.
    InvalidSharedMemoryZoneWithHostNuma,

    /// Failed applying NUMA memory policy.
    ApplyNumaPolicy(io::Error),

    /// Memory zone identifier is not unique.
    DuplicateZoneId,

    /// No virtio-mem resizing handler found.
    MissingVirtioMemHandler,

    /// Unknown memory zone.
    UnknownMemoryZone,

    /// Invalid size for resizing. Can be anything except 0.
    InvalidHotplugSize,

    /// Invalid hotplug method associated with memory zones resizing capability.
    InvalidHotplugMethodWithMemoryZones,

    /// Could not find specified memory zone identifier from hash map.
    MissingZoneIdentifier,

    /// Resizing the memory zone failed.
    ResizeZone,

    /// Guest address overflow
    GuestAddressOverFlow,

    /// Error opening snapshot file
    SnapshotOpen(io::Error),

    /// Error copying snapshot into region
    SnapshotCopy(GuestMemoryError),

    /// Failed to allocate MMIO address
    AllocateMmioAddress,

    #[cfg(target_arch = "aarch64")]
    /// Failed to create UEFI flash
    CreateUefiFlash(HypervisorVmError),

    /// Using a directory as a backing file for memory is not supported
    DirectoryAsBackingFileForMemory,

    /// Failed to stat filesystem
    GetFileSystemBlockSize(io::Error),

    /// Memory size is misaligned with default page size or its hugepage size
    MisalignedMemorySize,
}

const ENABLE_FLAG: usize = 0;
const INSERTING_FLAG: usize = 1;
const REMOVING_FLAG: usize = 2;
const EJECT_FLAG: usize = 3;

const BASE_OFFSET_LOW: u64 = 0;
const BASE_OFFSET_HIGH: u64 = 0x4;
const LENGTH_OFFSET_LOW: u64 = 0x8;
const LENGTH_OFFSET_HIGH: u64 = 0xC;
const STATUS_OFFSET: u64 = 0x14;
const SELECTION_OFFSET: u64 = 0;

// The MMIO address space size is reduced by 64k. This is done for the
// following reasons:
//  - Reduce the addressable space size by at least 4k to work around a Linux
//    bug when the VMM allocates devices at the end of the addressable space
//  - Windows requires the addressable space size to be 64k aligned
fn mmio_address_space_size(phys_bits: u8) -> u64 {
    (1 << phys_bits) - (1 << 16)
}

// The `statfs` function can get information about hugetlbfs, and the hugepage size is in the
// `f_bsize` field.
//
// See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
    let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
    let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();

    // SAFETY: FFI call with a valid path and buffer
    let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
    if ret != 0 {
        return Err(Error::GetFileSystemBlockSize(
            std::io::Error::last_os_error(),
        ));
    }

    // SAFETY: `buf` is valid at this point
    // Because this value is always positive, just convert it directly.
    // Note that the `f_bsize` is `i64` in glibc and `u64` in musl, using `as u64` will be warned
    // by `clippy` on musl target. To avoid the warning, there should be `as _` instead of
    // `as u64`.
    let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
    Ok(bsize)
}

fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
    // SAFETY: FFI call. Trivially safe.
    let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };

    // There is no backing file and `hugepages` is disabled, just use the system page size.
    if zone.file.is_none() && !zone.hugepages {
        return Ok(page_size);
    }

    // `hugepages` is enabled and `hugepage_size` is specified, just use it directly.
    if zone.hugepages && zone.hugepage_size.is_some() {
        return Ok(zone.hugepage_size.unwrap());
    }

    // There are two scenarios here:
    //  - `hugepages` is enabled but `hugepage_size` is not specified:
    //    Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
    //  - The backing file is specified:
    //    Call `statfs` for the file and get its `f_bsize`.
    //    If the value is larger than the page size of normal page, just use the `f_bsize`
    //    because the file is in a hugetlbfs. If the value is less than or equal to the page
    //    size, just use the page size.
    let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| {
        pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
    })?;

    let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);

    Ok(align_size)
}

#[inline]
fn align_down<T>(val: T, align: T) -> T
where
    T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>,
{
    val & !(align - 1u8.into())
}

#[inline]
fn is_aligned<T>(val: T, align: T) -> bool
where
    T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq,
{
    (val & (align - 1u8.into())) == 0u8.into()
}

impl BusDevice for MemoryManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        if self.selected_slot < self.hotplug_slots.len() {
            let state = &self.hotplug_slots[self.selected_slot];
            match offset {
                BASE_OFFSET_LOW => {
                    data.copy_from_slice(&state.base.to_le_bytes()[..4]);
                }
                BASE_OFFSET_HIGH => {
                    data.copy_from_slice(&state.base.to_le_bytes()[4..]);
                }
                LENGTH_OFFSET_LOW => {
                    data.copy_from_slice(&state.length.to_le_bytes()[..4]);
                }
                LENGTH_OFFSET_HIGH => {
                    data.copy_from_slice(&state.length.to_le_bytes()[4..]);
                }
                STATUS_OFFSET => {
                    // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
                    data.fill(0);
                    if state.active {
                        data[0] |= 1 << ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << REMOVING_FLAG;
                    }
                }
                _ => {
                    warn!(
                        "Unexpected offset for accessing memory manager device: {:#}",
                        offset
                    );
                }
            }
        } else {
            warn!("Out of range memory slot: {}", self.selected_slot);
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            SELECTION_OFFSET => {
                self.selected_slot = usize::from(data[0]);
            }
            STATUS_OFFSET => {
                if self.selected_slot < self.hotplug_slots.len() {
                    let state = &mut self.hotplug_slots[self.selected_slot];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
                        state.removing = false;
                    }
                    // Trigger removal of "DIMM"
                    if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
                        warn!("Ejection of memory not currently supported");
                    }
                } else {
                    warn!("Out of range memory slot: {}", self.selected_slot);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing memory manager device: {:#}",
                    offset
                );
            }
        };
        None
    }
}

impl MemoryManager {
    /// Creates all memory regions based on the available RAM ranges defined
    /// by `ram_regions`, and based on the description of the memory zones.
    /// In practice, this function can perform multiple memory mappings of the
    /// same backing file if there's a hole in the address space between two
    /// RAM ranges.
    ///
    /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G)
    /// and zones containing two zones (size 1G and size 4G).
    ///
    /// This function will create 3 resulting memory regions:
    /// - First one mapping entirely the first memory zone on 0-1G range
    /// - Second one mapping partially the second memory zone on 1G-3G range
    /// - Third one mapping partially the second memory zone on 4G-6G range
    ///
    /// Also, all memory regions are page-size aligned (i.e. their sizes must
    /// be a multiple of the page size), which may leave an additional hole in
    /// the address space when hugepages are used.
    fn create_memory_regions_from_zones(
        ram_regions: &[(GuestAddress, usize)],
        zones: &[MemoryZoneConfig],
        prefault: Option<bool>,
        thp: bool,
    ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
        let mut zone_iter = zones.iter();
        let mut mem_regions = Vec::new();
        let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?;
        let mut zone_align_size = memory_zone_get_align_size(zone)?;
        let mut zone_offset = 0u64;
        let mut memory_zones = HashMap::new();

        if !is_aligned(zone.size, zone_align_size) {
            return Err(Error::MisalignedMemorySize);
        }

        // Add zone id to the list of memory zones.
        memory_zones.insert(zone.id.clone(), MemoryZone::default());

        for ram_region in ram_regions.iter() {
            let mut ram_region_offset = 0;
            let mut exit = false;

            loop {
                let mut ram_region_consumed = false;
                let mut pull_next_zone = false;

                let ram_region_available_size =
                    align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size);
                if ram_region_available_size == 0 {
                    break;
                }
                let zone_sub_size = zone.size - zone_offset;

                let file_offset = zone_offset;
                let region_start = ram_region
                    .0
                    .checked_add(ram_region_offset)
                    .ok_or(Error::GuestAddressOverFlow)?;
                let region_size = if zone_sub_size <= ram_region_available_size {
                    if zone_sub_size == ram_region_available_size {
                        ram_region_consumed = true;
                    }

                    ram_region_offset += zone_sub_size;
                    pull_next_zone = true;

                    zone_sub_size
                } else {
                    zone_offset += ram_region_available_size;
                    ram_region_consumed = true;

                    ram_region_available_size
                };

                info!(
                    "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}",
                    zone.id,
                    region_start.raw_value(),
                    region_size
                );
                let region = MemoryManager::create_ram_region(
                    &zone.file,
                    file_offset,
                    region_start,
                    region_size as usize,
                    prefault.unwrap_or(zone.prefault),
                    zone.shared,
                    zone.hugepages,
                    zone.hugepage_size,
                    zone.host_numa_node,
                    None,
                    thp,
                )?;

                // Add region to the list of regions associated with the
                // current memory zone.
                if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
                    memory_zone.regions.push(region.clone());
                }

                mem_regions.push(region);

                if pull_next_zone {
                    // Get the next zone and reset the offset.
                    zone_offset = 0;
                    if let Some(z) = zone_iter.next() {
                        zone = z;
                    } else {
                        exit = true;
                        break;
                    }
                    zone_align_size = memory_zone_get_align_size(zone)?;
                    if !is_aligned(zone.size, zone_align_size) {
                        return Err(Error::MisalignedMemorySize);
                    }

                    // Check if the zone id already exists. In case it does, throw
                    // an error as we need unique identifiers. Otherwise, add
                    // the new zone id to the list of memory zones.
                    if memory_zones.contains_key(&zone.id) {
                        error!(
                            "Memory zone identifier '{}' found more than once. \
                            It must be unique",
                            zone.id,
                        );
                        return Err(Error::DuplicateZoneId);
                    }
                    memory_zones.insert(zone.id.clone(), MemoryZone::default());
                }

                if ram_region_consumed {
                    break;
                }
            }

            if exit {
                break;
            }
        }

        Ok((mem_regions, memory_zones))
    }

    // Restore both GuestMemory regions along with MemoryZone zones.
    fn restore_memory_regions_and_zones(
        guest_ram_mappings: &[GuestRamMapping],
        zones_config: &[MemoryZoneConfig],
        prefault: Option<bool>,
        mut existing_memory_files: HashMap<u32, File>,
        thp: bool,
    ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
        let mut memory_regions = Vec::new();
        let mut memory_zones = HashMap::new();

        for zone_config in zones_config {
            memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
        }

        for guest_ram_mapping in guest_ram_mappings {
            for zone_config in zones_config {
                if guest_ram_mapping.zone_id == zone_config.id {
                    let region = MemoryManager::create_ram_region(
                        if guest_ram_mapping.virtio_mem {
                            &None
                        } else {
                            &zone_config.file
                        },
                        guest_ram_mapping.file_offset,
                        GuestAddress(guest_ram_mapping.gpa),
                        guest_ram_mapping.size as usize,
                        prefault.unwrap_or(zone_config.prefault),
                        zone_config.shared,
                        zone_config.hugepages,
                        zone_config.hugepage_size,
                        zone_config.host_numa_node,
                        existing_memory_files.remove(&guest_ram_mapping.slot),
                        thp,
                    )?;
                    memory_regions.push(Arc::clone(&region));
                    if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
                        if guest_ram_mapping.virtio_mem {
                            let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
                            let region_size = region.len();
                            memory_zone.virtio_mem_zone = Some(VirtioMemZone {
                                region,
                                virtio_device: None,
                                hotplugged_size,
                                hugepages: zone_config.hugepages,
                                blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
                            });
                        } else {
                            memory_zone.regions.push(region);
                        }
                    }
                }
            }
        }

        memory_regions.sort_by_key(|x| x.start_addr());

        Ok((memory_regions, memory_zones))
    }

    fn fill_saved_regions(
        &mut self,
        file_path: PathBuf,
        saved_regions: MemoryRangeTable,
    ) -> Result<(), Error> {
        if saved_regions.is_empty() {
            return Ok(());
        }

        // Open (read only) the snapshot file.
        let mut memory_file = OpenOptions::new()
            .read(true)
            .open(file_path)
            .map_err(Error::SnapshotOpen)?;

        let guest_memory = self.guest_memory.memory();
        for range in saved_regions.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't write
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of read_exact_from() as it is not
            // following the correct behavior.
            // For more info about this issue, see:
            // https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = guest_memory
                    .read_volatile_from(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(Error::SnapshotCopy)?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }

    fn validate_memory_config(
        config: &MemoryConfig,
        user_provided_zones: bool,
    ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
        let mut allow_mem_hotplug = false;

        if !user_provided_zones {
            if config.zones.is_some() {
                error!(
                    "User defined memory regions can't be provided if the \
                    memory size is not 0"
                );
                return Err(Error::InvalidMemoryParameters);
            }

            if config.hotplug_size.is_some() {
                allow_mem_hotplug = true;
            }

            if let Some(hotplugged_size) = config.hotplugged_size {
                if let Some(hotplug_size) = config.hotplug_size {
                    if hotplugged_size > hotplug_size {
                        error!(
                            "'hotplugged_size' {} can't be bigger than \
                            'hotplug_size' {}",
                            hotplugged_size, hotplug_size,
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                } else {
                    error!(
                        "Invalid to define 'hotplugged_size' when there is \
                        no 'hotplug_size'"
                    );
                    return Err(Error::InvalidMemoryParameters);
                }
                if config.hotplug_method == HotplugMethod::Acpi {
                    error!(
                        "Invalid to define 'hotplugged_size' with hotplug \
                        method 'acpi'"
                    );
                    return Err(Error::InvalidMemoryParameters);
                }
            }

            // Create a single zone from the global memory config. This lets
            // us reuse the codepath for user defined memory zones.
            let zones = vec![MemoryZoneConfig {
                id: String::from(DEFAULT_MEMORY_ZONE),
                size: config.size,
                file: None,
                shared: config.shared,
                hugepages: config.hugepages,
                hugepage_size: config.hugepage_size,
                host_numa_node: None,
                hotplug_size: config.hotplug_size,
                hotplugged_size: config.hotplugged_size,
                prefault: config.prefault,
            }];

            Ok((config.size, zones, allow_mem_hotplug))
        } else {
            if config.zones.is_none() {
                error!(
                    "User defined memory regions must be provided if the \
                    memory size is 0"
                );
                return Err(Error::MissingMemoryZones);
            }

            // Safe to unwrap as we checked right above there were some
            // regions.
            let zones = config.zones.clone().unwrap();
            if zones.is_empty() {
                return Err(Error::MissingMemoryZones);
            }

            let mut total_ram_size: u64 = 0;
            for zone in zones.iter() {
                total_ram_size += zone.size;

                if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
                    error!(
                        "Invalid to set host NUMA policy for a memory zone \
                        backed by a regular file and mapped as 'shared'"
                    );
                    return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
                }

                if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
                    error!("Invalid to set ACPI hotplug method for memory zones");
                    return Err(Error::InvalidHotplugMethodWithMemoryZones);
                }

                if let Some(hotplugged_size) = zone.hotplugged_size {
                    if let Some(hotplug_size) = zone.hotplug_size {
                        if hotplugged_size > hotplug_size {
                            error!(
                                "'hotplugged_size' {} can't be bigger than \
                                'hotplug_size' {}",
                                hotplugged_size, hotplug_size,
                            );
                            return Err(Error::InvalidMemoryParameters);
                        }
                    } else {
                        error!(
                            "Invalid to define 'hotplugged_size' when there is \
                            no 'hotplug_size' for a memory zone"
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                    if config.hotplug_method == HotplugMethod::Acpi {
                        error!(
                            "Invalid to define 'hotplugged_size' with hotplug \
                            method 'acpi'"
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                }
            }

            Ok((total_ram_size, zones, allow_mem_hotplug))
        }
    }

    pub fn allocate_address_space(&mut self) -> Result<(), Error> {
        let mut list = Vec::new();

        for (zone_id, memory_zone) in self.memory_zones.iter() {
            let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
                memory_zone
                    .regions()
                    .iter()
                    .map(|r| (r.clone(), false))
                    .collect();

            if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
                regions.push((virtio_mem_zone.region().clone(), true));
            }

            list.push((zone_id.clone(), regions));
        }

        for (zone_id, regions) in list {
            for (region, virtio_mem) in regions {
                let slot = self.create_userspace_mapping(
                    region.start_addr().raw_value(),
                    region.len(),
                    region.as_ptr() as u64,
                    self.mergeable,
                    false,
                    self.log_dirty,
                )?;

                let file_offset = if let Some(file_offset) = region.file_offset() {
                    file_offset.start()
                } else {
                    0
                };

                self.guest_ram_mappings.push(GuestRamMapping {
                    gpa: region.start_addr().raw_value(),
                    size: region.len(),
                    slot,
                    zone_id: zone_id.clone(),
                    virtio_mem,
                    file_offset,
                });
                self.ram_allocator
                    .allocate(Some(region.start_addr()), region.len(), None)
                    .ok_or(Error::MemoryRangeAllocation)?;
            }
        }

        // Allocate SubRegion and Reserved address ranges.
        for region in self.arch_mem_regions.iter() {
            if region.r_type == RegionType::Ram {
                // Ignore the RAM type since ranges have already been allocated
                // based on the GuestMemory regions.
                continue;
            }
            self.ram_allocator
                .allocate(
                    Some(GuestAddress(region.base)),
                    region.size as GuestUsize,
                    None,
                )
                .ok_or(Error::MemoryRangeAllocation)?;
        }

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn add_uefi_flash(&mut self) -> Result<(), Error> {
        // On AArch64, the UEFI binary requires a flash device at address 0.
        // 4 MiB memory is mapped to simulate the flash.
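        // Note: as the code below shows, the "flash" here is simply an anonymous
        // mapping of arch::layout::UEFI_SIZE bytes placed at arch::layout::UEFI_START
        // and registered with the hypervisor as a writable user memory region.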
        let uefi_mem_slot = self.allocate_memory_slot();
        let uefi_region = GuestRegionMmap::new(
            MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
            arch::layout::UEFI_START,
        )
        .unwrap();
        let uefi_mem_region = self.vm.make_user_memory_region(
            uefi_mem_slot,
            uefi_region.start_addr().raw_value(),
            uefi_region.len(),
            uefi_region.as_ptr() as u64,
            false,
            false,
        );
        self.vm
            .create_user_memory_region(uefi_mem_region)
            .map_err(Error::CreateUefiFlash)?;

        let uefi_flash =
            GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());

        self.uefi_flash = Some(uefi_flash);

        Ok(())
    }

    #[allow(clippy::too_many_arguments)]
    pub fn new(
        vm: Arc<dyn hypervisor::Vm>,
        config: &MemoryConfig,
        prefault: Option<bool>,
        phys_bits: u8,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        restore_data: Option<&MemoryManagerSnapshotData>,
        existing_memory_files: Option<HashMap<u32, File>>,
        #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
    ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
        trace_scoped!("MemoryManager::new");

        let user_provided_zones = config.size == 0;

        let mmio_address_space_size = mmio_address_space_size(phys_bits);
        debug_assert_eq!(
            (((mmio_address_space_size) >> 16) << 16),
            mmio_address_space_size
        );
        let start_of_platform_device_area =
            GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
        let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);

        let (ram_size, zones, allow_mem_hotplug) =
            Self::validate_memory_config(config, user_provided_zones)?;

        let (
            start_of_device_area,
            boot_ram,
            current_ram,
            arch_mem_regions,
            memory_zones,
            guest_memory,
            boot_guest_memory,
            hotplug_slots,
            next_memory_slot,
            selected_slot,
            next_hotplug_slot,
        ) = if let Some(data) = restore_data {
            let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
                &data.guest_ram_mappings,
                &zones,
                prefault,
                existing_memory_files.unwrap_or_default(),
                config.thp,
            )?;
            let guest_memory =
                GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
            let boot_guest_memory = guest_memory.clone();
            (
                GuestAddress(data.start_of_device_area),
                data.boot_ram,
                data.current_ram,
                data.arch_mem_regions.clone(),
                memory_zones,
                guest_memory,
                boot_guest_memory,
                data.hotplug_slots.clone(),
                data.next_memory_slot,
                data.selected_slot,
                data.next_hotplug_slot,
            )
        } else {
            // Init guest memory
            let arch_mem_regions = arch::arch_memory_regions();

            let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
                .iter()
                .filter(|r| r.2 == RegionType::Ram)
                .map(|r| (r.0, r.1))
                .collect();

            let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
                .iter()
                .map(|(a, b, c)| ArchMemRegion {
                    base: a.0,
                    size: *b,
                    r_type: *c,
                })
                .collect();

            let (mem_regions, mut memory_zones) =
                Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;

            let mut guest_memory =
                GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;

            let boot_guest_memory = guest_memory.clone();

            let mut start_of_device_area =
                MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;

            // Update list of memory zones for resize.
            for zone in zones.iter() {
                if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
                    if let Some(hotplug_size) = zone.hotplug_size {
                        if hotplug_size == 0 {
                            error!("'hotplug_size' can't be 0");
                            return Err(Error::InvalidHotplugSize);
                        }

                        if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
                            start_of_device_area = start_of_device_area
                                .checked_add(hotplug_size)
                                .ok_or(Error::GuestAddressOverFlow)?;
                        } else {
                            // Alignment must be "natural" i.e. same as size of block
                            let start_addr = GuestAddress(
                                (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
                                    - 1)
                                    / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
                                    * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
                            );

                            // When `prefault` is set by vm_restore, memory manager
                            // will create ram region with `prefault` option in
                            // restore config rather than same option in zone
                            let region = MemoryManager::create_ram_region(
                                &None,
                                0,
                                start_addr,
                                hotplug_size as usize,
                                prefault.unwrap_or(zone.prefault),
                                zone.shared,
                                zone.hugepages,
                                zone.hugepage_size,
                                zone.host_numa_node,
                                None,
                                config.thp,
                            )?;

                            guest_memory = guest_memory
                                .insert_region(Arc::clone(&region))
                                .map_err(Error::GuestMemory)?;

                            let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
                            let region_size = region.len();
                            memory_zone.virtio_mem_zone = Some(VirtioMemZone {
                                region,
                                virtio_device: None,
                                hotplugged_size,
                                hugepages: zone.hugepages,
                                blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
                            });

                            start_of_device_area = start_addr
                                .checked_add(hotplug_size)
                                .ok_or(Error::GuestAddressOverFlow)?;
                        }
                    }
                } else {
                    return Err(Error::MissingZoneIdentifier);
                }
            }

            let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
            hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);

            (
                start_of_device_area,
                ram_size,
                ram_size,
                arch_mem_regions,
                memory_zones,
                guest_memory,
                boot_guest_memory,
                hotplug_slots,
                0,
                0,
                0,
            )
        };

        let guest_memory = GuestMemoryAtomic::new(guest_memory);

        // Both MMIO and PIO address spaces start at address 0.
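        // On x86_64 the allocator below is also handed the PIO space, which starts
        // at port 0 and spans 64 KiB (1 << 16) of I/O ports.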
        let allocator = Arc::new(Mutex::new(
            SystemAllocator::new(
                #[cfg(target_arch = "x86_64")]
                {
                    GuestAddress(0)
                },
                #[cfg(target_arch = "x86_64")]
                {
                    1 << 16
                },
                start_of_platform_device_area,
                PLATFORM_DEVICE_AREA_SIZE,
                #[cfg(target_arch = "x86_64")]
                vec![GsiApic::new(
                    X86_64_IRQ_BASE,
                    ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
                )],
            )
            .ok_or(Error::CreateSystemAllocator)?,
        ));

        #[cfg(not(feature = "tdx"))]
        let dynamic = true;
        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;

        let acpi_address = if dynamic
            && config.hotplug_method == HotplugMethod::Acpi
            && (config.hotplug_size.unwrap_or_default() > 0)
        {
            Some(
                allocator
                    .lock()
                    .unwrap()
                    .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
                    .ok_or(Error::AllocateMmioAddress)?,
            )
        } else {
            None
        };

        // If running on SGX the start of device area and RAM area may diverge but
        // at this point they are next to each other.
        let end_of_ram_area = start_of_device_area.unchecked_sub(1);
        let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();

        let mut memory_manager = MemoryManager {
            boot_guest_memory,
            guest_memory,
            next_memory_slot,
            start_of_device_area,
            end_of_device_area,
            end_of_ram_area,
            vm,
            hotplug_slots,
            selected_slot,
            mergeable: config.mergeable,
            allocator,
            hotplug_method: config.hotplug_method,
            boot_ram,
            current_ram,
            next_hotplug_slot,
            shared: config.shared,
            hugepages: config.hugepages,
            hugepage_size: config.hugepage_size,
            prefault: config.prefault,
            #[cfg(target_arch = "x86_64")]
            sgx_epc_region: None,
            user_provided_zones,
            snapshot_memory_ranges: MemoryRangeTable::default(),
            memory_zones,
            guest_ram_mappings: Vec::new(),
            acpi_address,
            log_dirty: dynamic, // Cannot log dirty pages on a TD
            arch_mem_regions,
            ram_allocator,
            dynamic,
            #[cfg(target_arch = "aarch64")]
            uefi_flash: None,
            thp: config.thp,
        };

        #[cfg(target_arch = "aarch64")]
        {
            // For Aarch64 we cannot lazily allocate the address space like we
            // do for x86, because while restoring a VM from snapshot we would
            // need the address space to be allocated to properly restore VGIC.
            // And the restore of VGIC happens before we attempt to run the vCPUs
            // for the first time, thus we need to allocate the address space
            // beforehand.
            memory_manager.allocate_address_space()?;
            memory_manager.add_uefi_flash()?;
        }

        #[cfg(target_arch = "x86_64")]
        if let Some(sgx_epc_config) = sgx_epc_config {
            memory_manager.setup_sgx(sgx_epc_config)?;
        }

        Ok(Arc::new(Mutex::new(memory_manager)))
    }

    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        vm: Arc<dyn hypervisor::Vm>,
        config: &MemoryConfig,
        source_url: Option<&str>,
        prefault: bool,
        phys_bits: u8,
    ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
        if let Some(source_url) = source_url {
            let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
            memory_file_path.push(String::from(SNAPSHOT_FILENAME));

            let mem_snapshot: MemoryManagerSnapshotData =
                snapshot.to_state().map_err(Error::Restore)?;

            let mm = MemoryManager::new(
                vm,
                config,
                Some(prefault),
                phys_bits,
                #[cfg(feature = "tdx")]
                false,
                Some(&mem_snapshot),
                None,
                #[cfg(target_arch = "x86_64")]
                None,
            )?;

            mm.lock()
                .unwrap()
                .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;

            Ok(mm)
        } else {
            Err(Error::RestoreMissingSourceUrl)
        }
    }

    fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
        // SAFETY: FFI call with correct arguments
        let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };

        if res < 0 {
            Err(io::Error::last_os_error())
        } else {
            Ok(res as RawFd)
        }
    }

    fn mbind(
        addr: *mut u8,
        len: u64,
        mode: u32,
        nodemask: Vec<u64>,
        maxnode: u64,
        flags: u32,
    ) -> Result<(), io::Error> {
        // SAFETY: FFI call with correct arguments
        let res = unsafe {
            libc::syscall(
                libc::SYS_mbind,
                addr as *mut libc::c_void,
                len,
                mode,
                nodemask.as_ptr(),
                maxnode,
                flags,
            )
        };

        if res < 0 {
            Err(io::Error::last_os_error())
        } else {
            Ok(())
        }
    }

    fn create_anonymous_file(
        size: usize,
        hugepages: bool,
        hugepage_size: Option<u64>,
    ) -> Result<FileOffset, Error> {
        let fd = Self::memfd_create(
            &ffi::CString::new("ch_ram").unwrap(),
            libc::MFD_CLOEXEC
                | if hugepages {
                    libc::MFD_HUGETLB
                        | if let Some(hugepage_size) = hugepage_size {
                            /*
                             * From the Linux kernel:
                             * Several system calls take a flag to request "hugetlb" huge pages.
                             * Without further specification, these system calls will use the
                             * system's default huge page size. If a system supports multiple
                             * huge page sizes, the desired huge page size can be specified in
                             * bits [26:31] of the flag arguments. The value in these 6 bits
                             * will encode the log2 of the huge page size.
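                             *
                             * Illustrative example: with 2 MiB huge pages,
                             * hugepage_size is 1 << 21, trailing_zeros() is 21,
                             * and the expression below contributes 21 << 26,
                             * i.e. log2(2 MiB) encoded in bits [26:31].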
                             */

                            hugepage_size.trailing_zeros() << 26
                        } else {
                            // Use the system default huge page size
                            0
                        }
                } else {
                    0
                },
        )
        .map_err(Error::SharedFileCreate)?;

        // SAFETY: fd is valid
        let f = unsafe { File::from_raw_fd(fd) };
        f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;

        Ok(FileOffset::new(f, 0))
    }

    fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> {
        if backing_file.is_dir() {
            Err(Error::DirectoryAsBackingFileForMemory)
        } else {
            let f = OpenOptions::new()
                .read(true)
                .write(true)
                .open(backing_file)
                .map_err(Error::SharedFileCreate)?;

            Ok(FileOffset::new(f, file_offset))
        }
    }

    #[allow(clippy::too_many_arguments)]
    pub fn create_ram_region(
        backing_file: &Option<PathBuf>,
        file_offset: u64,
        start_addr: GuestAddress,
        size: usize,
        prefault: bool,
        shared: bool,
        hugepages: bool,
        hugepage_size: Option<u64>,
        host_numa_node: Option<u32>,
        existing_memory_file: Option<File>,
        thp: bool,
    ) -> Result<Arc<GuestRegionMmap>, Error> {
        let mut mmap_flags = libc::MAP_NORESERVE;

        // The duplication of mmap_flags ORing here is unfortunate but it also makes
        // the complexity of the handling clear.
        let fo = if let Some(f) = existing_memory_file {
            // It must be MAP_SHARED as we wouldn't already have an FD
            mmap_flags |= libc::MAP_SHARED;
            Some(FileOffset::new(f, file_offset))
        } else if let Some(backing_file) = backing_file {
            if shared {
                mmap_flags |= libc::MAP_SHARED;
            } else {
                mmap_flags |= libc::MAP_PRIVATE;
            }
            Some(Self::open_backing_file(backing_file, file_offset)?)
        } else if shared || hugepages {
            // For hugepages we must also MAP_SHARED otherwise we will trigger #4805
            // because the MAP_PRIVATE will trigger CoW against the backing file with
            // the VFIO pinning
            mmap_flags |= libc::MAP_SHARED;
            Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
        } else {
            mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
            None
        };

        let region = GuestRegionMmap::new(
            MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
                .map_err(Error::GuestMemoryRegion)?,
            start_addr,
        )
        .map_err(Error::GuestMemory)?;

        // Apply NUMA policy if needed.
        if let Some(node) = host_numa_node {
            let addr = region.deref().as_ptr();
            let len = region.deref().size() as u64;
            let mode = MPOL_BIND;
            let mut nodemask: Vec<u64> = Vec::new();
            let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;

            // Linux is kind of buggy in the way it interprets maxnode as it
            // will cut off the last node. That's why we have to add 1 to what
            // we would consider as the proper maxnode value.
            let maxnode = node as u64 + 1 + 1;

            // Allocate the right size for the vector.
            nodemask.resize((node as usize / 64) + 1, 0);

            // Fill the global bitmask through the nodemask vector.
            let idx = (node / 64) as usize;
            let shift = node % 64;
            nodemask[idx] |= 1u64 << shift;

            // Policies are enforced by using MPOL_MF_MOVE flag as it will
            // force the kernel to move all pages that might have been already
            // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
            // used to throw an error if MPOL_MF_MOVE didn't succeed.
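            //
            // Illustrative example: for host_numa_node 1, the nodemask vector
            // built above is [0b10] and maxnode ends up as 3 (node index + 1,
            // plus the extra 1 for the kernel quirk described above).
            //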
            // MPOL_BIND is the selected mode as it specifies a strict policy
            // that restricts memory allocation to the nodes specified in the
            // nodemask.
            Self::mbind(addr, len, mode, nodemask, maxnode, flags)
                .map_err(Error::ApplyNumaPolicy)?;
        }

        // Prefault the region if needed, in parallel.
        if prefault {
            let page_size =
                Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize;

            if !is_aligned(size, page_size) {
                warn!(
                    "Prefaulting memory size {} misaligned with page size {}",
                    size, page_size
                );
            }

            let num_pages = size / page_size;

            let num_threads = Self::get_prefault_num_threads(page_size, num_pages);

            let pages_per_thread = num_pages / num_threads;
            let remainder = num_pages % num_threads;

            let barrier = Arc::new(Barrier::new(num_threads));
            thread::scope(|s| {
                let r = &region;
                for i in 0..num_threads {
                    let barrier = Arc::clone(&barrier);
                    s.spawn(move || {
                        // Wait until all threads have been spawned to avoid contention
                        // over mmap_sem between thread stack allocation and page faulting.
                        barrier.wait();
                        let pages = pages_per_thread + if i < remainder { 1 } else { 0 };
                        let offset =
                            page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
                        // SAFETY: FFI call with correct arguments
                        let ret = unsafe {
                            let addr = r.as_ptr().add(offset);
                            libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE)
                        };
                        if ret != 0 {
                            let e = io::Error::last_os_error();
                            warn!("Failed to prefault pages: {}", e);
                        }
                    });
                }
            });
        }

        if region.file_offset().is_none() && thp {
            info!(
                "Anonymous mapping at 0x{:x} (size = 0x{:x})",
                region.as_ptr() as u64,
                size
            );
            // SAFETY: FFI call with correct arguments
            let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
            if ret != 0 {
                let e = io::Error::last_os_error();
                warn!("Failed to mark pages as THP eligible: {}", e);
            }
        }

        Ok(Arc::new(region))
    }

    // Duplicate of `memory_zone_get_align_size` that does not require a `zone`
    fn get_prefault_align_size(
        backing_file: &Option<PathBuf>,
        hugepages: bool,
        hugepage_size: Option<u64>,
    ) -> Result<u64, Error> {
        // SAFETY: FFI call. Trivially safe.
        let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
        match (hugepages, hugepage_size, backing_file) {
            (false, _, _) => Ok(page_size),
            (true, Some(hugepage_size), _) => Ok(hugepage_size),
            (true, None, _) => {
                // There are two scenarios here:
                //  - `hugepages` is enabled but `hugepage_size` is not specified:
                //    Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
                //  - The backing file is specified:
                //    Call `statfs` for the file and get its `f_bsize`. If the value is larger
                //    than the page size of normal page, just use the `f_bsize` because the file
                //    is in a hugetlbfs. If the value is less than or equal to the page size,
                //    just use the page size.
                let path = backing_file
                    .as_ref()
                    .map_or(Ok("/dev/hugepages"), |pathbuf| {
                        pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
                    })?;
                let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
                Ok(align_size)
            }
        }
    }

    fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize {
        let mut n: usize = 1;

        // Do not create more threads than processors available.
        // SAFETY: FFI call. Trivially safe.
        let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) };
        if procs > 0 {
            n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT);
        }

        // Do not create more threads than pages being allocated.
        n = std::cmp::min(n, num_pages);

        // Do not create threads to allocate less than 64 MiB of memory.
        n = std::cmp::min(
            n,
            std::cmp::max(1, page_size * num_pages / (64 * (1 << 26))),
        );

        n
    }

    // Update the GuestMemoryMmap with the new range
    fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
        let guest_memory = self
            .guest_memory
            .memory()
            .insert_region(region)
            .map_err(Error::GuestMemory)?;
        self.guest_memory.lock().unwrap().replace(guest_memory);

        Ok(())
    }

    //
    // Calculate the start address of an area next to RAM.
    //
    // If memory hotplug is allowed, the start address needs to be aligned
    // (rounded-up) to 128MiB boundary.
    // If memory hotplug is not allowed, there is no alignment required.
    // And it must also start at the 64bit start.
    fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
        let mut start_addr = if allow_mem_hotplug {
            GuestAddress(mem_end.0 | ((128 << 20) - 1))
        } else {
            mem_end
        };

        start_addr = start_addr
            .checked_add(1)
            .ok_or(Error::GuestAddressOverFlow)?;

        if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
            return Ok(arch::layout::RAM_64BIT_START);
        }

        Ok(start_addr)
    }

    pub fn add_ram_region(
        &mut self,
        start_addr: GuestAddress,
        size: usize,
    ) -> Result<Arc<GuestRegionMmap>, Error> {
        // Allocate memory for the region
        let region = MemoryManager::create_ram_region(
            &None,
            0,
            start_addr,
            size,
            self.prefault,
            self.shared,
            self.hugepages,
            self.hugepage_size,
            None,
            None,
            self.thp,
        )?;

        // Map it into the guest
        let slot = self.create_userspace_mapping(
            region.start_addr().0,
            region.len(),
            region.as_ptr() as u64,
            self.mergeable,
            false,
            self.log_dirty,
        )?;
        self.guest_ram_mappings.push(GuestRamMapping {
            gpa: region.start_addr().raw_value(),
            size: region.len(),
            slot,
            zone_id: DEFAULT_MEMORY_ZONE.to_string(),
            virtio_mem: false,
            file_offset: 0,
        });

        self.add_region(Arc::clone(&region))?;

        Ok(region)
    }

    fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
        info!("Hotplugging new RAM: {}", size);

        // Check that there is a free slot
        if self.next_hotplug_slot >= HOTPLUG_COUNT {
            return Err(Error::NoSlotAvailable);
        }

        // "Inserted" DIMM must have a size that is a multiple of 128MiB
        if size % (128 << 20) != 0 {
            return Err(Error::InvalidSize);
        }

        let start_addr =
            MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;

        if start_addr
            .checked_add((size - 1).try_into().unwrap())
            .unwrap()
            > self.end_of_ram_area
        {
            return Err(Error::InsufficientHotplugRam);
        }

        let region = self.add_ram_region(start_addr, size)?;

        // Add region to the list of regions associated with the default
        // memory zone.
        if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
            memory_zone.regions.push(Arc::clone(&region));
        }

        // Tell the allocator
        self.ram_allocator
            .allocate(Some(start_addr), size as GuestUsize, None)
            .ok_or(Error::MemoryRangeAllocation)?;

        // Update the slot so that it can be queried via the I/O port
        let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
        slot.active = true;
        slot.inserting = true;
        slot.base = region.start_addr().0;
        slot.length = region.len();

        self.next_hotplug_slot += 1;

        Ok(region)
    }

    pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
        self.guest_memory.clone()
    }

    pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
        self.boot_guest_memory.clone()
    }

    pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
        self.allocator.clone()
    }

    pub fn start_of_device_area(&self) -> GuestAddress {
        self.start_of_device_area
    }

    pub fn end_of_device_area(&self) -> GuestAddress {
        self.end_of_device_area
    }

    pub fn allocate_memory_slot(&mut self) -> u32 {
        let slot_id = self.next_memory_slot;
        self.next_memory_slot += 1;
        slot_id
    }

    pub fn create_userspace_mapping(
        &mut self,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        mergeable: bool,
        readonly: bool,
        log_dirty: bool,
    ) -> Result<u32, Error> {
        let slot = self.allocate_memory_slot();
        let mem_region = self.vm.make_user_memory_region(
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            readonly,
            log_dirty,
        );

        info!(
            "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
            guest_phys_addr, userspace_addr, memory_size, slot
        );

        self.vm
            .create_user_memory_region(mem_region)
            .map_err(Error::CreateUserMemoryRegion)?;

        // SAFETY: the address and size are valid since the
        // mmap succeeded.
        let ret = unsafe {
            libc::madvise(
                userspace_addr as *mut libc::c_void,
                memory_size as libc::size_t,
                libc::MADV_DONTDUMP,
            )
        };
        if ret != 0 {
            let e = io::Error::last_os_error();
            warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e);
        }

        // Mark the pages as mergeable if explicitly asked for.
        if mergeable {
            // SAFETY: the address and size are valid since the
            // mmap succeeded.
            let ret = unsafe {
                libc::madvise(
                    userspace_addr as *mut libc::c_void,
                    memory_size as libc::size_t,
                    libc::MADV_MERGEABLE,
                )
            };
            if ret != 0 {
                let err = io::Error::last_os_error();
                // Safe to unwrap because the error is constructed with
                // last_os_error(), which ensures the output will be Some().
                let errno = err.raw_os_error().unwrap();
                if errno == libc::EINVAL {
                    warn!("kernel not configured with CONFIG_KSM");
                } else {
                    warn!("madvise error: {}", err);
                }
                warn!("failed to mark pages as mergeable");
            }
        }

        info!(
            "Created userspace mapping: {:x} -> {:x} {:x}",
            guest_phys_addr, userspace_addr, memory_size
        );

        Ok(slot)
    }

    pub fn remove_userspace_mapping(
        &mut self,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        mergeable: bool,
        slot: u32,
    ) -> Result<(), Error> {
        let mem_region = self.vm.make_user_memory_region(
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            false, /* readonly -- don't care */
            false, /* log dirty */
        );

        self.vm
            .remove_user_memory_region(mem_region)
            .map_err(Error::RemoveUserMemoryRegion)?;

        // Mark the pages as unmergeable if they were previously marked as
        // mergeable.
        if mergeable {
            // SAFETY: the address and size are valid as the region was
            // previously advised.
            let ret = unsafe {
                libc::madvise(
                    userspace_addr as *mut libc::c_void,
                    memory_size as libc::size_t,
                    libc::MADV_UNMERGEABLE,
                )
            };
            if ret != 0 {
                let err = io::Error::last_os_error();
                // Safe to unwrap because the error is constructed with
                // last_os_error(), which ensures the output will be Some().
                let errno = err.raw_os_error().unwrap();
                if errno == libc::EINVAL {
                    warn!("kernel not configured with CONFIG_KSM");
                } else {
                    warn!("madvise error: {}", err);
                }
                warn!("failed to mark pages as unmergeable");
            }
        }

        info!(
            "Removed userspace mapping: {:x} -> {:x} {:x}",
            guest_phys_addr, userspace_addr, memory_size
        );

        Ok(())
    }

    pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
        if let Some(memory_zone) = self.memory_zones.get_mut(id) {
            if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
                if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
                    virtio_mem_device
                        .lock()
                        .unwrap()
                        .resize(size)
                        .map_err(Error::VirtioMemResizeFail)?;
                }

                // Keep the hotplugged_size up to date.
                virtio_mem_zone.hotplugged_size = size;
            } else {
                error!("Failed resizing virtio-mem region: No virtio-mem handler");
                return Err(Error::MissingVirtioMemHandler);
            }

            return Ok(());
        }

        error!("Failed resizing virtio-mem region: Unknown memory zone");
        Err(Error::UnknownMemoryZone)
    }

    /// In case this function resulted in adding a new memory region to the
    /// guest memory, the new region is returned to the caller. The virtio-mem
    /// use case never adds a new region as the whole hotpluggable memory has
    /// already been allocated at boot time.
    pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
        if self.user_provided_zones {
            error!(
                "Not allowed to resize guest memory when backed with user \
                defined memory zones."
1902 ); 1903 return Err(Error::InvalidResizeWithMemoryZones); 1904 } 1905 1906 let mut region: Option<Arc<GuestRegionMmap>> = None; 1907 match self.hotplug_method { 1908 HotplugMethod::VirtioMem => { 1909 if desired_ram >= self.boot_ram { 1910 if !self.dynamic { 1911 return Ok(region); 1912 } 1913 1914 self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?; 1915 self.current_ram = desired_ram; 1916 } 1917 } 1918 HotplugMethod::Acpi => { 1919 if desired_ram > self.current_ram { 1920 if !self.dynamic { 1921 return Ok(region); 1922 } 1923 1924 region = 1925 Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?); 1926 self.current_ram = desired_ram; 1927 } 1928 } 1929 } 1930 Ok(region) 1931 } 1932 1933 pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> { 1934 if !self.user_provided_zones { 1935 error!( 1936 "Not allowed to resize guest memory zone when no zone is \ 1937 defined." 1938 ); 1939 return Err(Error::ResizeZone); 1940 } 1941 1942 self.virtio_mem_resize(id, virtio_mem_size) 1943 } 1944 1945 #[cfg(target_arch = "x86_64")] 1946 pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> { 1947 let file = OpenOptions::new() 1948 .read(true) 1949 .open("/dev/sgx_provision") 1950 .map_err(Error::SgxProvisionOpen)?; 1951 self.vm 1952 .enable_sgx_attribute(file) 1953 .map_err(Error::SgxEnableProvisioning)?; 1954 1955 // Go over each EPC section and verify its size is a 4k multiple. At 1956 // the same time, calculate the total size needed for the contiguous 1957 // EPC region. 1958 let mut epc_region_size = 0; 1959 for epc_section in sgx_epc_config.iter() { 1960 if epc_section.size == 0 { 1961 return Err(Error::EpcSectionSizeInvalid); 1962 } 1963 if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 { 1964 return Err(Error::EpcSectionSizeInvalid); 1965 } 1966 1967 epc_region_size += epc_section.size; 1968 } 1969 1970 // Place the SGX EPC region on a 4k boundary between the RAM and the device area 1971 let epc_region_start = GuestAddress( 1972 ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE, 1973 ); 1974 1975 self.start_of_device_area = epc_region_start 1976 .checked_add(epc_region_size) 1977 .ok_or(Error::GuestAddressOverFlow)?; 1978 1979 let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize); 1980 info!( 1981 "SGX EPC region: 0x{:x} (0x{:x})", 1982 epc_region_start.0, epc_region_size 1983 ); 1984 1985 // Each section can be memory mapped into the allocated region. 1986 let mut epc_section_start = epc_region_start.raw_value(); 1987 for epc_section in sgx_epc_config.iter() { 1988 let file = OpenOptions::new() 1989 .read(true) 1990 .write(true) 1991 .open("/dev/sgx_vepc") 1992 .map_err(Error::SgxVirtEpcOpen)?; 1993 1994 let prot = PROT_READ | PROT_WRITE; 1995 let mut flags = MAP_NORESERVE | MAP_SHARED; 1996 if epc_section.prefault { 1997 flags |= MAP_POPULATE; 1998 } 1999 2000 // We can't use the vm-memory crate to perform the memory mapping 2001 // here as it would try to ensure the size of the backing file is 2002 // matching the size of the expected mapping. The /dev/sgx_vepc 2003 // device does not work that way, it provides a file descriptor 2004 // which is not matching the mapping size, as it's a just a way to 2005 // let KVM know that an EPC section is being created for the guest. 
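            //
            // Note on the flags used below: MAP_NORESERVE avoids reserving
            // swap space for the mapping, and MAP_POPULATE (added only when
            // `prefault` is requested for the section) asks the kernel to
            // populate the pages at mmap time rather than on first access.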
            // SAFETY: FFI call with correct arguments
            let host_addr = unsafe {
                libc::mmap(
                    std::ptr::null_mut(),
                    epc_section.size as usize,
                    prot,
                    flags,
                    file.as_raw_fd(),
                    0,
                )
            } as u64;

            info!(
                "Adding SGX EPC section: 0x{:x} (0x{:x})",
                epc_section_start, epc_section.size
            );

            let _mem_slot = self.create_userspace_mapping(
                epc_section_start,
                epc_section.size,
                host_addr,
                false,
                false,
                false,
            )?;

            sgx_epc_region.insert(
                epc_section.id.clone(),
                SgxEpcSection::new(
                    GuestAddress(epc_section_start),
                    epc_section.size as GuestUsize,
                ),
            );

            epc_section_start += epc_section.size;
        }

        self.sgx_epc_region = Some(sgx_epc_region);

        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
        &self.sgx_epc_region
    }

    pub fn is_hardlink(f: &File) -> bool {
        let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
        // SAFETY: FFI call with correct arguments
        let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
        if ret != 0 {
            error!("Couldn't fstat the backing file");
            return false;
        }

        // SAFETY: stat is valid
        unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
    }

    pub fn memory_zones(&self) -> &MemoryZones {
        &self.memory_zones
    }

    pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
        &mut self.memory_zones
    }

    pub fn memory_range_table(
        &self,
        snapshot: bool,
    ) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();

        for memory_zone in self.memory_zones.values() {
            if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
                table.extend(virtio_mem_zone.plugged_ranges());
            }

            for region in memory_zone.regions() {
                if snapshot {
                    if let Some(file_offset) = region.file_offset() {
                        if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
                            && Self::is_hardlink(file_offset.file())
                        {
                            // In this very specific case, we know the memory
                            // region is backed by a file on the host filesystem
                            // that can be accessed by the user, and additionally
                            // the mapping is shared, which means that modifications
                            // to the content are written to the actual file.
                            // When meeting these conditions, we can skip the
                            // copy of the memory content for this specific region,
                            // as we can assume the user will have it saved through
                            // the backing file already.
                            continue;
                        }
                    }
                }

                table.push(MemoryRange {
                    gpa: region.start_addr().raw_value(),
                    length: region.len(),
                });
            }
        }

        Ok(table)
    }

    pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
        MemoryManagerSnapshotData {
            memory_ranges: self.snapshot_memory_ranges.clone(),
            guest_ram_mappings: self.guest_ram_mappings.clone(),
            start_of_device_area: self.start_of_device_area.0,
            boot_ram: self.boot_ram,
            current_ram: self.current_ram,
            arch_mem_regions: self.arch_mem_regions.clone(),
            hotplug_slots: self.hotplug_slots.clone(),
            next_memory_slot: self.next_memory_slot,
            selected_slot: self.selected_slot,
            next_hotplug_slot: self.next_hotplug_slot,
        }
    }

    pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
        let mut memory_slot_fds = HashMap::new();
        for guest_ram_mapping in &self.guest_ram_mappings {
            let slot = guest_ram_mapping.slot;
            let guest_memory = self.guest_memory.memory();
            let file = guest_memory
                .find_region(GuestAddress(guest_ram_mapping.gpa))
                .unwrap()
                .file_offset()
                .unwrap()
                .file();
            memory_slot_fds.insert(slot, file.as_raw_fd());
        }
        memory_slot_fds
    }

    pub fn acpi_address(&self) -> Option<GuestAddress> {
        self.acpi_address
    }

    pub fn num_guest_ram_mappings(&self) -> u32 {
        self.guest_ram_mappings.len() as u32
    }

    #[cfg(target_arch = "aarch64")]
    pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
        self.uefi_flash.as_ref().unwrap().clone()
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
        let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
        mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);

        let mut mem_offset_in_elf = mem_offset;
        let mut ram_maps = BTreeMap::new();
        for mapping in mapping_sorted_by_gpa.iter() {
            ram_maps.insert(
                mapping.gpa,
                CoredumpMemoryRegion {
                    mem_offset_in_elf,
                    mem_size: mapping.size,
                },
            );
            mem_offset_in_elf += mapping.size;
        }

        CoredumpMemoryRegions { ram_maps }
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    pub fn coredump_iterate_save_mem(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let snapshot_memory_ranges = self
            .memory_range_table(false)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        if snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let coredump_file = dump_state.file.as_ref().unwrap();

        let guest_memory = self.guest_memory.memory();
        let mut total_bytes: u64 = 0;

        for range in snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            loop {
                let bytes_written = guest_memory
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        &mut coredump_file.as_fd(),
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
                offset += bytes_written as u64;
                total_bytes += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        debug!("coredump total bytes {}", total_bytes);
        Ok(())
    }

    pub fn receive_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: ReadVolatile,
    {
        let guest_memory = self.guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't read
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of read_exact_from() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = mem
                    .read_volatile_from(
                        GuestAddress(range.gpa + offset),
                        fd,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| {
                        MigratableError::MigrateReceive(anyhow!(
                            "Error receiving memory from socket: {}",
                            e
                        ))
                    })?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }
}

struct MemoryNotify {
    slot_id: usize,
}

impl Aml for MemoryNotify {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        let object = aml::Path::new(&format!("M{:03}", self.slot_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.slot_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes(sink)
    }
}

struct MemorySlot {
    slot_id: usize,
}

impl Aml for MemorySlot {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        aml::Device::new(
            format!("M{:03}", self.slot_id).as_str().into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
                &aml::Name::new("_UID".into(), &self.slot_id),
                /*
                _STA return value:
                Bit [0] – Set if the device is present.
                Bit [1] – Set if the device is enabled and decoding its resources.
                Bit [2] – Set if the device should be shown in the UI.
                Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                Bit [4] – Set if the battery is present.
                Bits [31:5] – Reserved (must be cleared).
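                In this implementation, the MSTA method backing _STA returns
                either 0x0 (slot not enabled) or 0xF (present, enabled, shown
                in UI and functioning), based on the controller's MEN_ field.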
                */
                &aml::Method::new(
                    "_STA".into(),
                    0,
                    false,
                    // Call into MSTA method which will interrogate device
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MSTA".into(),
                        vec![&self.slot_id],
                    ))],
                ),
                // Get details of memory
                &aml::Method::new(
                    "_CRS".into(),
                    0,
                    false,
                    // Call into MCRS which provides actual memory details
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MCRS".into(),
                        vec![&self.slot_id],
                    ))],
                ),
            ],
        )
        .to_aml_bytes(sink)
    }
}

struct MemorySlots {
    slots: usize,
}

impl Aml for MemorySlots {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        for slot_id in 0..self.slots {
            MemorySlot { slot_id }.to_aml_bytes(sink);
        }
    }
}

struct MemoryMethods {
    slots: usize,
}

impl Aml for MemoryMethods {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        // Add "MTFY" notification method
        let mut memory_notifies = Vec::new();
        for slot_id in 0..self.slots {
            memory_notifies.push(MemoryNotify { slot_id });
        }

        let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
        for memory_notifier in memory_notifies.iter() {
            memory_notifies_refs.push(memory_notifier);
        }

        aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);

        // MSCN method
        aml::Method::new(
            "MSCN".into(),
            0,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                &aml::While::new(
                    &aml::LessThan::new(&aml::Local(0), &self.slots),
                    vec![
                        // Write slot number (in first argument) to I/O port via field
                        &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
                        // Check if MINS bit is set (inserting)
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            // Notify device if it is
                            vec![
                                &aml::MethodCall::new(
                                    "MTFY".into(),
                                    vec![&aml::Local(0), &aml::ONE],
                                ),
                                // Reset MINS bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            ],
                        ),
                        // Check if MRMV bit is set
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            // Notify device if it is (with the eject constant 0x3)
                            vec![
                                &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
                                // Reset MRMV bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            ],
                        ),
                        &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                    ],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
            ],
        )
        .to_aml_bytes(sink);

        // Memory status method
        aml::Method::new(
            "MSTA".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                &aml::If::new(
                    &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
                    vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
                // Return 0 or 0xf
                &aml::Return::new(&aml::Local(0)),
            ],
        )
        .to_aml_bytes(sink);

        // Memory range method
        aml::Method::new(
            "MCRS".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Name::new(
                    "MR64".into(),
                    &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                        aml::AddressSpaceCacheable::Cacheable,
                        true,
                        0x0000_0000_0000_0000u64,
                        0xFFFF_FFFF_FFFF_FFFEu64,
                        None,
                    )]),
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("MINL"),
                    &aml::Path::new("MR64"),
                    &14usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("MINH"),
                    &aml::Path::new("MR64"),
                    &18usize,
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MR64"),
                    &22usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MR64"),
                    &26usize,
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("LENL"),
                    &aml::Path::new("MR64"),
                    &38usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("LENH"),
                    &aml::Path::new("MR64"),
                    &42usize,
                ),
                &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
                &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
                &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
                &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
                &aml::Add::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MINL"),
                    &aml::Path::new("LENL"),
                ),
                &aml::Add::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MINH"),
                    &aml::Path::new("LENH"),
                ),
                &aml::If::new(
                    &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
                    vec![&aml::Add::new(
                        &aml::Path::new("MAXH"),
                        &aml::ONE,
                        &aml::Path::new("MAXH"),
                    )],
                ),
                &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
                // Release lock
                &aml::Release::new("MLCK".into()),
                &aml::Return::new(&aml::Path::new("MR64")),
            ],
        )
        .to_aml_bytes(sink)
    }
}

impl Aml for MemoryManager {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        if let Some(acpi_address) = self.acpi_address {
            // Memory Hotplug Controller
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose slot and then read back status
                    &aml::Mutex::new("MLCK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCacheable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
                            None,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "MHPR".into(),
                        aml::OpRegionSpace::SystemMemory,
                        &(acpi_address.0 as usize),
                        &MEMORY_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
                            aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Reserved(128),
                            aml::FieldEntry::Named(*b"MHPX", 32), // PXM
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(160),
                            aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
                            aml::FieldEntry::Named(*b"MINS", 1), // Inserting
                            aml::FieldEntry::Named(*b"MRMV", 1), // Removing
                            aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MSEL", 32), // Selector
                            aml::FieldEntry::Named(*b"MOEV", 32), // Event
                            aml::FieldEntry::Named(*b"MOSC", 32), // OSC
                        ],
                    ),
                    &MemoryMethods {
                        slots: self.hotplug_slots.len(),
                    },
                    &MemorySlots {
                        slots: self.hotplug_slots.len(),
                    },
                ],
            )
            .to_aml_bytes(sink);
        } else {
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Empty MSCN for GED
                    &aml::Method::new("MSCN".into(), 0, true, vec![]),
                ],
            )
            .to_aml_bytes(sink);
        }

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_region) = &self.sgx_epc_region {
                let min = sgx_epc_region.start().raw_value();
                let max = min + sgx_epc_region.size() - 1;
                // SGX EPC region
                aml::Device::new(
                    "_SB_.EPC_".into(),
                    vec![
                        &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
                        // QWORD describing the EPC region start and size
                        &aml::Name::new(
                            "_CRS".into(),
                            &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                                aml::AddressSpaceCacheable::NotCacheable,
                                true,
                                min,
                                max,
                                None,
                            )]),
                        ),
                        &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
                    ],
                )
                .to_aml_bytes(sink);
            }
        }
    }
}

impl Pausable for MemoryManager {}

#[derive(Clone, Serialize, Deserialize)]
pub struct MemoryManagerSnapshotData {
    memory_ranges: MemoryRangeTable,
    guest_ram_mappings: Vec<GuestRamMapping>,
    start_of_device_area: u64,
    boot_ram: u64,
    current_ram: u64,
    arch_mem_regions: Vec<ArchMemRegion>,
    hotplug_slots: Vec<HotPlugState>,
    next_memory_slot: u32,
    selected_slot: usize,
    next_hotplug_slot: usize,
}

impl Snapshottable for MemoryManager {
    fn id(&self) -> String {
        MEMORY_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
        let memory_ranges = self.memory_range_table(true)?;

        // Store locally this list of ranges as it will be used through the
        // Transportable::send() implementation. The point is to avoid the
        // duplication of code regarding the creation of the path for each
        // region. The 'snapshot' step creates the list of memory regions,
        // including information about the need to copy a memory region or
        // not. This saves the 'send' step having to go through the same
        // process, and instead it can directly proceed with storing the
        // memory range content for the ranges requiring it.
        self.snapshot_memory_ranges = memory_ranges;

        Ok(Snapshot::from_data(SnapshotData::new_from_state(
            &self.snapshot_data(),
        )?))
    }
}

impl Transportable for MemoryManager {
    fn send(
        &self,
        _snapshot: &Snapshot,
        destination_url: &str,
    ) -> result::Result<(), MigratableError> {
        if self.snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let mut memory_file_path = url_to_path(destination_url)?;
        memory_file_path.push(String::from(SNAPSHOT_FILENAME));

        // Create the snapshot file for the entire memory
        let mut memory_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(memory_file_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let guest_memory = self.guest_memory.memory();

        for range in self.snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't read
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of write_all_to() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = guest_memory
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }
        Ok(())
    }
}

impl Migratable for MemoryManager {
    // Start the dirty log in the hypervisor (kvm/mshv).
    // Also, reset the dirty bitmap logged by the vmm.
    // Just before we do a bulk copy we want to start/clear the dirty log so that
    // pages touched during our bulk copy are tracked.
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.start_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
        })?;

        for r in self.guest_memory.memory().iter() {
            r.bitmap().reset();
        }

        Ok(())
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.stop_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
        })?;

        Ok(())
    }

    // Generate a table for the pages that are dirty. The dirty pages are collapsed
    // together in the table if they are contiguous.
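    // The per-slot bitmap returned by the hypervisor is OR-ed with the VMM's
    // own per-region bitmap, so pages dirtied by either side are reported.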
    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();
        for r in &self.guest_ram_mappings {
            let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
                MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
            })?;
            let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
            {
                Some(region) => {
                    assert!(region.start_addr().raw_value() == r.gpa);
                    assert!(region.len() == r.size);
                    region.bitmap().get_and_reset()
                }
                None => {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Error finding 'guest memory region' with address {:x}",
                        r.gpa
                    )))
                }
            };

            let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
                .iter()
                .zip(vmm_dirty_bitmap.iter())
                .map(|(x, y)| x | y)
                .collect();

            let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);

            if sub_table.regions().is_empty() {
                info!("Dirty Memory Range Table is empty");
            } else {
                info!("Dirty Memory Range Table:");
                for range in sub_table.regions() {
                    info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
                }
            }

            table.extend(sub_table);
        }
        Ok(table)
    }
}
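
// A minimal, illustrative sketch rather than part of the production logic: it
// shows how a MemoryRangeTable is assembled from individual MemoryRange
// entries and merged with extend(), mirroring what memory_range_table() and
// dirty_log() do for guest regions. The module name, GPAs and sizes below are
// arbitrary example values.
#[cfg(test)]
mod memory_range_table_tests {
    use super::*;

    #[test]
    fn build_and_extend_memory_range_table() {
        // One 1 MiB range starting at GPA 0.
        let mut table = MemoryRangeTable::default();
        table.push(MemoryRange {
            gpa: 0,
            length: 0x10_0000,
        });

        // A second, non-contiguous 1 MiB range coming from another table.
        let mut other = MemoryRangeTable::default();
        other.push(MemoryRange {
            gpa: 0x20_0000,
            length: 0x10_0000,
        });
        table.extend(other);

        // Regardless of how ranges are stored internally, the table must
        // describe 2 MiB of guest memory in total.
        let total: u64 = table.regions().iter().map(|r| r.length).sum();
        assert_eq!(total, 0x20_0000);
        assert!(!table.regions().is_empty());
    }
}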