// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
#[cfg(target_arch = "x86_64")]
use crate::config::SgxEpcConfig;
use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
};
use crate::migration::url_to_path;
use crate::MEMORY_MANAGER_SNAPSHOT_ID;
use crate::{GuestMemoryMmap, GuestRegionMmap};
use acpi_tables::{aml, Aml};
use anyhow::anyhow;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
use arch::RegionType;
#[cfg(target_arch = "x86_64")]
use devices::ioapic;
#[cfg(target_arch = "aarch64")]
use hypervisor::HypervisorVmError;
use libc::_SC_NPROCESSORS_ONLN;
#[cfg(target_arch = "x86_64")]
use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
use serde::{Deserialize, Serialize};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::fs::{File, OpenOptions};
use std::io::{self};
use std::ops::{BitAnd, Deref, Not, Sub};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::os::fd::AsFd;
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
use std::path::PathBuf;
use std::result;
use std::sync::{Arc, Barrier, Mutex};
use std::{ffi, thread};
use tracer::trace_scoped;
use virtio_devices::BlocksState;
#[cfg(target_arch = "x86_64")]
use vm_allocator::GsiApic;
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::BusDevice;
use vm_memory::bitmap::AtomicBitmap;
use vm_memory::guest_memory::FileOffset;
use vm_memory::{
    mmap::MmapRegionError, Address, Error as MmapError, GuestAddress, GuestAddressSpace,
    GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
    ReadVolatile,
};
use vm_migration::{
    protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
    Snapshot, SnapshotData, Snapshottable, Transportable,
};

pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;

const DEFAULT_MEMORY_ZONE: &str = "mem0";

const SNAPSHOT_FILENAME: &str = "memory-ranges";

#[cfg(target_arch = "x86_64")]
const X86_64_IRQ_BASE: u32 = 5;

#[cfg(target_arch = "x86_64")]
const SGX_PAGE_SIZE: u64 = 1 << 12;

const HOTPLUG_COUNT: usize = 8;

// Memory policy constants
const MPOL_BIND: u32 = 2;
const MPOL_MF_STRICT: u32 = 1;
const MPOL_MF_MOVE: u32 = 1 << 1;
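// Illustrative example (added for clarity, not part of the original source):
// how these mode and flag constants are combined for the mbind(2) call issued
// from create_ram_region() when a zone requests host NUMA node 3:
//
//     let mode = MPOL_BIND;                      // strict binding to the nodemask
//     let flags = MPOL_MF_STRICT | MPOL_MF_MOVE; // move already-faulted pages, fail if impossible
//     let mut nodemask = vec![0u64; 1];          // (3 / 64) + 1 = 1 element
//     nodemask[0] |= 1u64 << 3;                  // bit 3 set -> node 3
//     let maxnode = 3 + 1 + 1;                   // extra +1 because Linux cuts off the last node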
// Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;

const MAX_PREFAULT_THREAD_COUNT: usize = 16;

#[derive(Clone, Default, Serialize, Deserialize)]
struct HotPlugState {
    base: u64,
    length: u64,
    active: bool,
    inserting: bool,
    removing: bool,
}

pub struct VirtioMemZone {
    region: Arc<GuestRegionMmap>,
    virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
    hotplugged_size: u64,
    hugepages: bool,
    blocks_state: Arc<Mutex<BlocksState>>,
}

impl VirtioMemZone {
    pub fn region(&self) -> &Arc<GuestRegionMmap> {
        &self.region
    }
    pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
        self.virtio_device = Some(virtio_device);
    }
    pub fn hotplugged_size(&self) -> u64 {
        self.hotplugged_size
    }
    pub fn hugepages(&self) -> bool {
        self.hugepages
    }
    pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
        &self.blocks_state
    }
    pub fn plugged_ranges(&self) -> MemoryRangeTable {
        self.blocks_state
            .lock()
            .unwrap()
            .memory_ranges(self.region.start_addr().raw_value(), true)
    }
}

#[derive(Default)]
pub struct MemoryZone {
    regions: Vec<Arc<GuestRegionMmap>>,
    virtio_mem_zone: Option<VirtioMemZone>,
}

impl MemoryZone {
    pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
        &self.regions
    }
    pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
        &self.virtio_mem_zone
    }
    pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
        self.virtio_mem_zone.as_mut()
    }
}

pub type MemoryZones = HashMap<String, MemoryZone>;

#[derive(Clone, Serialize, Deserialize)]
struct GuestRamMapping {
    slot: u32,
    gpa: u64,
    size: u64,
    zone_id: String,
    virtio_mem: bool,
    file_offset: u64,
}

#[derive(Clone, Serialize, Deserialize)]
struct ArchMemRegion {
    base: u64,
    size: usize,
    r_type: RegionType,
}

pub struct MemoryManager {
    boot_guest_memory: GuestMemoryMmap,
    guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    next_memory_slot: u32,
    start_of_device_area: GuestAddress,
    end_of_device_area: GuestAddress,
    end_of_ram_area: GuestAddress,
    pub vm: Arc<dyn hypervisor::Vm>,
    hotplug_slots: Vec<HotPlugState>,
    selected_slot: usize,
    mergeable: bool,
    allocator: Arc<Mutex<SystemAllocator>>,
    hotplug_method: HotplugMethod,
    boot_ram: u64,
    current_ram: u64,
    next_hotplug_slot: usize,
    shared: bool,
    hugepages: bool,
    hugepage_size: Option<u64>,
    prefault: bool,
    thp: bool,
    #[cfg(target_arch = "x86_64")]
    sgx_epc_region: Option<SgxEpcRegion>,
    user_provided_zones: bool,
    snapshot_memory_ranges: MemoryRangeTable,
    memory_zones: MemoryZones,
    log_dirty: bool, // Enable dirty logging for created RAM regions
    arch_mem_regions: Vec<ArchMemRegion>,
    ram_allocator: AddressAllocator,
    dynamic: bool,

    // Keep track of calls to create_userspace_mapping() for guest RAM.
    // This is useful for getting the dirty pages as we need to know the
    // slots that the mapping is created in.
    guest_ram_mappings: Vec<GuestRamMapping>,

    pub acpi_address: Option<GuestAddress>,
    #[cfg(target_arch = "aarch64")]
    uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
}

#[derive(Debug)]
pub enum Error {
    /// Failed to create shared file.
    SharedFileCreate(io::Error),

    /// Failed to set shared file length.
    SharedFileSetLen(io::Error),

    /// Mmap backed guest memory error
    GuestMemory(MmapError),

    /// Failed to allocate a memory range.
    MemoryRangeAllocation,

    /// Error from region creation
    GuestMemoryRegion(MmapRegionError),

    /// No ACPI slot available
    NoSlotAvailable,

    /// Not enough space in the hotplug RAM region
    InsufficientHotplugRam,

    /// The requested hotplug memory addition is not a valid size
    InvalidSize,

    /// Failed to create the user memory region.
    CreateUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to remove the user memory region.
    RemoveUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to create EventFd.
    EventFdFail(io::Error),

    /// Eventfd write error
    EventfdError(io::Error),

    /// Failed to resize the virtio-mem device
    VirtioMemResizeFail(virtio_devices::mem::Error),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot restore VM because source URL is missing
    RestoreMissingSourceUrl,

    /// Cannot create the system allocator
    CreateSystemAllocator,

    /// Invalid SGX EPC section size
    #[cfg(target_arch = "x86_64")]
    EpcSectionSizeInvalid,

    /// Failed allocating SGX EPC region
    #[cfg(target_arch = "x86_64")]
    SgxEpcRangeAllocation,

    /// Failed opening SGX virtual EPC device
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcOpen(io::Error),

    /// Failed setting the SGX virtual EPC section size
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcFileSetLen(io::Error),

    /// Failed opening SGX provisioning device
    #[cfg(target_arch = "x86_64")]
    SgxProvisionOpen(io::Error),

    /// Failed enabling SGX provisioning
    #[cfg(target_arch = "x86_64")]
    SgxEnableProvisioning(hypervisor::HypervisorVmError),

    /// Failed creating a new MmapRegion instance.
    #[cfg(target_arch = "x86_64")]
    NewMmapRegion(vm_memory::mmap::MmapRegionError),

    /// No memory zones found.
    MissingMemoryZones,

    /// Memory configuration is not valid.
    InvalidMemoryParameters,

    /// Forbidden operation. Impossible to resize guest memory if it is
    /// backed by user defined memory regions.
    InvalidResizeWithMemoryZones,

    /// It's invalid to try applying a NUMA policy to a memory zone that is
    /// memory mapped with MAP_SHARED.
    InvalidSharedMemoryZoneWithHostNuma,

    /// Failed applying NUMA memory policy.
    ApplyNumaPolicy(io::Error),

    /// Memory zone identifier is not unique.
    DuplicateZoneId,

    /// No virtio-mem resizing handler found.
    MissingVirtioMemHandler,

    /// Unknown memory zone.
    UnknownMemoryZone,

    /// Invalid size for resizing. The size can be any value except 0.
    InvalidHotplugSize,

    /// Invalid hotplug method associated with memory zones resizing capability.
    InvalidHotplugMethodWithMemoryZones,

    /// Could not find specified memory zone identifier from hash map.
    MissingZoneIdentifier,

    /// Resizing the memory zone failed.
    ResizeZone,

    /// Guest address overflow
    GuestAddressOverFlow,

    /// Error opening snapshot file
    SnapshotOpen(io::Error),

    /// Error copying snapshot into region
    SnapshotCopy(GuestMemoryError),

    /// Failed to allocate MMIO address
    AllocateMmioAddress,

    #[cfg(target_arch = "aarch64")]
    /// Failed to create UEFI flash
    CreateUefiFlash(HypervisorVmError),

    /// Using a directory as a backing file for memory is not supported
    DirectoryAsBackingFileForMemory,

    /// Failed to stat filesystem
    GetFileSystemBlockSize(io::Error),

    /// Memory size is misaligned with the default page size or the hugepage size
    MisalignedMemorySize,
}

const ENABLE_FLAG: usize = 0;
const INSERTING_FLAG: usize = 1;
const REMOVING_FLAG: usize = 2;
const EJECT_FLAG: usize = 3;

const BASE_OFFSET_LOW: u64 = 0;
const BASE_OFFSET_HIGH: u64 = 0x4;
const LENGTH_OFFSET_LOW: u64 = 0x8;
const LENGTH_OFFSET_HIGH: u64 = 0xC;
const STATUS_OFFSET: u64 = 0x14;
const SELECTION_OFFSET: u64 = 0;

// The MMIO address space size is reduced by 64 KiB. This is done for the
// following reasons:
// - Reduce the addressable space size by at least 4 KiB to work around a
//   Linux bug when the VMM allocates devices at the end of the addressable
//   space
// - Windows requires the addressable space size to be 64 KiB aligned
fn mmio_address_space_size(phys_bits: u8) -> u64 {
    (1 << phys_bits) - (1 << 16)
}

// The `statfs` function can return information about a hugetlbfs mount, and
// the hugepage size is in the `f_bsize` field.
//
// See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
    let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
    let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();

    // SAFETY: FFI call with a valid path and buffer
    let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
    if ret != 0 {
        return Err(Error::GetFileSystemBlockSize(
            std::io::Error::last_os_error(),
        ));
    }

    // SAFETY: `buf` is valid at this point
    // Because this value is always positive, just convert it directly.
    // Note that `f_bsize` is `i64` in glibc and `u64` in musl, so using `as u64`
    // would trigger a `clippy` warning on musl targets. To avoid the warning,
    // `as _` is used instead of `as u64`.
    let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
    Ok(bsize)
}

fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
    // SAFETY: FFI call. Trivially safe.
    let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };

    // There is no backing file and `hugepages` is disabled, so just use the
    // system page size.
    if zone.file.is_none() && !zone.hugepages {
        return Ok(page_size);
    }

    // `hugepages` is enabled and `hugepage_size` is specified, so use it
    // directly.
    if zone.hugepages && zone.hugepage_size.is_some() {
        return Ok(zone.hugepage_size.unwrap());
    }

    // There are two scenarios here:
    // - `hugepages` is enabled but `hugepage_size` is not specified:
    //   Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
    // - The backing file is specified:
    //   Call `statfs` for the file and get its `f_bsize`.
If the value is larger than the page 409 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the 410 // value is less than or equal to the page size, just use the page size. 411 let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| { 412 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters) 413 })?; 414 415 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?); 416 417 Ok(align_size) 418 } 419 420 #[inline] 421 fn align_down<T>(val: T, align: T) -> T 422 where 423 T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>, 424 { 425 val & !(align - 1u8.into()) 426 } 427 428 #[inline] 429 fn is_aligned<T>(val: T, align: T) -> bool 430 where 431 T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq, 432 { 433 (val & (align - 1u8.into())) == 0u8.into() 434 } 435 436 impl BusDevice for MemoryManager { 437 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 438 if self.selected_slot < self.hotplug_slots.len() { 439 let state = &self.hotplug_slots[self.selected_slot]; 440 match offset { 441 BASE_OFFSET_LOW => { 442 data.copy_from_slice(&state.base.to_le_bytes()[..4]); 443 } 444 BASE_OFFSET_HIGH => { 445 data.copy_from_slice(&state.base.to_le_bytes()[4..]); 446 } 447 LENGTH_OFFSET_LOW => { 448 data.copy_from_slice(&state.length.to_le_bytes()[..4]); 449 } 450 LENGTH_OFFSET_HIGH => { 451 data.copy_from_slice(&state.length.to_le_bytes()[4..]); 452 } 453 STATUS_OFFSET => { 454 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 455 data.fill(0); 456 if state.active { 457 data[0] |= 1 << ENABLE_FLAG; 458 } 459 if state.inserting { 460 data[0] |= 1 << INSERTING_FLAG; 461 } 462 if state.removing { 463 data[0] |= 1 << REMOVING_FLAG; 464 } 465 } 466 _ => { 467 warn!( 468 "Unexpected offset for accessing memory manager device: {:#}", 469 offset 470 ); 471 } 472 } 473 } else { 474 warn!("Out of range memory slot: {}", self.selected_slot); 475 } 476 } 477 478 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 479 match offset { 480 SELECTION_OFFSET => { 481 self.selected_slot = usize::from(data[0]); 482 } 483 STATUS_OFFSET => { 484 if self.selected_slot < self.hotplug_slots.len() { 485 let state = &mut self.hotplug_slots[self.selected_slot]; 486 // The ACPI code writes back a 1 to acknowledge the insertion 487 if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting { 488 state.inserting = false; 489 } 490 // Ditto for removal 491 if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing { 492 state.removing = false; 493 } 494 // Trigger removal of "DIMM" 495 if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG { 496 warn!("Ejection of memory not currently supported"); 497 } 498 } else { 499 warn!("Out of range memory slot: {}", self.selected_slot); 500 } 501 } 502 _ => { 503 warn!( 504 "Unexpected offset for accessing memory manager device: {:#}", 505 offset 506 ); 507 } 508 }; 509 None 510 } 511 } 512 513 impl MemoryManager { 514 /// Creates all memory regions based on the available RAM ranges defined 515 /// by `ram_regions`, and based on the description of the memory zones. 516 /// In practice, this function can perform multiple memory mappings of the 517 /// same backing file if there's a hole in the address space between two 518 /// RAM ranges. 519 /// 520 /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G) 521 /// and zones containing two zones (size 1G and size 4G). 
522 /// 523 /// This function will create 3 resulting memory regions: 524 /// - First one mapping entirely the first memory zone on 0-1G range 525 /// - Second one mapping partially the second memory zone on 1G-3G range 526 /// - Third one mapping partially the second memory zone on 4G-6G range 527 /// 528 /// Also, all memory regions are page-size aligned (e.g. their sizes must 529 /// be multiple of page-size), which may leave an additional hole in the 530 /// address space when hugepage is used. 531 fn create_memory_regions_from_zones( 532 ram_regions: &[(GuestAddress, usize)], 533 zones: &[MemoryZoneConfig], 534 prefault: Option<bool>, 535 thp: bool, 536 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 537 let mut zone_iter = zones.iter(); 538 let mut mem_regions = Vec::new(); 539 let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?; 540 let mut zone_align_size = memory_zone_get_align_size(zone)?; 541 let mut zone_offset = 0u64; 542 let mut memory_zones = HashMap::new(); 543 544 if !is_aligned(zone.size, zone_align_size) { 545 return Err(Error::MisalignedMemorySize); 546 } 547 548 // Add zone id to the list of memory zones. 549 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 550 551 for ram_region in ram_regions.iter() { 552 let mut ram_region_offset = 0; 553 let mut exit = false; 554 555 loop { 556 let mut ram_region_consumed = false; 557 let mut pull_next_zone = false; 558 559 let ram_region_available_size = 560 align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size); 561 if ram_region_available_size == 0 { 562 break; 563 } 564 let zone_sub_size = zone.size - zone_offset; 565 566 let file_offset = zone_offset; 567 let region_start = ram_region 568 .0 569 .checked_add(ram_region_offset) 570 .ok_or(Error::GuestAddressOverFlow)?; 571 let region_size = if zone_sub_size <= ram_region_available_size { 572 if zone_sub_size == ram_region_available_size { 573 ram_region_consumed = true; 574 } 575 576 ram_region_offset += zone_sub_size; 577 pull_next_zone = true; 578 579 zone_sub_size 580 } else { 581 zone_offset += ram_region_available_size; 582 ram_region_consumed = true; 583 584 ram_region_available_size 585 }; 586 587 info!( 588 "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}", 589 zone.id, 590 region_start.raw_value(), 591 region_size 592 ); 593 let region = MemoryManager::create_ram_region( 594 &zone.file, 595 file_offset, 596 region_start, 597 region_size as usize, 598 prefault.unwrap_or(zone.prefault), 599 zone.shared, 600 zone.hugepages, 601 zone.hugepage_size, 602 zone.host_numa_node, 603 None, 604 thp, 605 )?; 606 607 // Add region to the list of regions associated with the 608 // current memory zone. 609 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 610 memory_zone.regions.push(region.clone()); 611 } 612 613 mem_regions.push(region); 614 615 if pull_next_zone { 616 // Get the next zone and reset the offset. 617 zone_offset = 0; 618 if let Some(z) = zone_iter.next() { 619 zone = z; 620 } else { 621 exit = true; 622 break; 623 } 624 zone_align_size = memory_zone_get_align_size(zone)?; 625 if !is_aligned(zone.size, zone_align_size) { 626 return Err(Error::MisalignedMemorySize); 627 } 628 629 // Check if zone id already exist. In case it does, throw 630 // an error as we need unique identifiers. Otherwise, add 631 // the new zone id to the list of memory zones. 632 if memory_zones.contains_key(&zone.id) { 633 error!( 634 "Memory zone identifier '{}' found more than once. 
\ 635 It must be unique", 636 zone.id, 637 ); 638 return Err(Error::DuplicateZoneId); 639 } 640 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 641 } 642 643 if ram_region_consumed { 644 break; 645 } 646 } 647 648 if exit { 649 break; 650 } 651 } 652 653 Ok((mem_regions, memory_zones)) 654 } 655 656 // Restore both GuestMemory regions along with MemoryZone zones. 657 fn restore_memory_regions_and_zones( 658 guest_ram_mappings: &[GuestRamMapping], 659 zones_config: &[MemoryZoneConfig], 660 prefault: Option<bool>, 661 mut existing_memory_files: HashMap<u32, File>, 662 thp: bool, 663 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 664 let mut memory_regions = Vec::new(); 665 let mut memory_zones = HashMap::new(); 666 667 for zone_config in zones_config { 668 memory_zones.insert(zone_config.id.clone(), MemoryZone::default()); 669 } 670 671 for guest_ram_mapping in guest_ram_mappings { 672 for zone_config in zones_config { 673 if guest_ram_mapping.zone_id == zone_config.id { 674 let region = MemoryManager::create_ram_region( 675 if guest_ram_mapping.virtio_mem { 676 &None 677 } else { 678 &zone_config.file 679 }, 680 guest_ram_mapping.file_offset, 681 GuestAddress(guest_ram_mapping.gpa), 682 guest_ram_mapping.size as usize, 683 prefault.unwrap_or(zone_config.prefault), 684 zone_config.shared, 685 zone_config.hugepages, 686 zone_config.hugepage_size, 687 zone_config.host_numa_node, 688 existing_memory_files.remove(&guest_ram_mapping.slot), 689 thp, 690 )?; 691 memory_regions.push(Arc::clone(®ion)); 692 if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) { 693 if guest_ram_mapping.virtio_mem { 694 let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0); 695 let region_size = region.len(); 696 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 697 region, 698 virtio_device: None, 699 hotplugged_size, 700 hugepages: zone_config.hugepages, 701 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 702 }); 703 } else { 704 memory_zone.regions.push(region); 705 } 706 } 707 } 708 } 709 } 710 711 memory_regions.sort_by_key(|x| x.start_addr()); 712 713 Ok((memory_regions, memory_zones)) 714 } 715 716 fn fill_saved_regions( 717 &mut self, 718 file_path: PathBuf, 719 saved_regions: MemoryRangeTable, 720 ) -> Result<(), Error> { 721 if saved_regions.is_empty() { 722 return Ok(()); 723 } 724 725 // Open (read only) the snapshot file. 726 let mut memory_file = OpenOptions::new() 727 .read(true) 728 .open(file_path) 729 .map_err(Error::SnapshotOpen)?; 730 731 let guest_memory = self.guest_memory.memory(); 732 for range in saved_regions.regions() { 733 let mut offset: u64 = 0; 734 // Here we are manually handling the retry in case we can't write 735 // the whole region at once because we can't use the implementation 736 // from vm-memory::GuestMemory of read_exact_from() as it is not 737 // following the correct behavior. 
For more info about this issue 738 // see: https://github.com/rust-vmm/vm-memory/issues/174 739 loop { 740 let bytes_read = guest_memory 741 .read_volatile_from( 742 GuestAddress(range.gpa + offset), 743 &mut memory_file, 744 (range.length - offset) as usize, 745 ) 746 .map_err(Error::SnapshotCopy)?; 747 offset += bytes_read as u64; 748 749 if offset == range.length { 750 break; 751 } 752 } 753 } 754 755 Ok(()) 756 } 757 758 fn validate_memory_config( 759 config: &MemoryConfig, 760 user_provided_zones: bool, 761 ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> { 762 let mut allow_mem_hotplug = false; 763 764 if !user_provided_zones { 765 if config.zones.is_some() { 766 error!( 767 "User defined memory regions can't be provided if the \ 768 memory size is not 0" 769 ); 770 return Err(Error::InvalidMemoryParameters); 771 } 772 773 if config.hotplug_size.is_some() { 774 allow_mem_hotplug = true; 775 } 776 777 if let Some(hotplugged_size) = config.hotplugged_size { 778 if let Some(hotplug_size) = config.hotplug_size { 779 if hotplugged_size > hotplug_size { 780 error!( 781 "'hotplugged_size' {} can't be bigger than \ 782 'hotplug_size' {}", 783 hotplugged_size, hotplug_size, 784 ); 785 return Err(Error::InvalidMemoryParameters); 786 } 787 } else { 788 error!( 789 "Invalid to define 'hotplugged_size' when there is\ 790 no 'hotplug_size'" 791 ); 792 return Err(Error::InvalidMemoryParameters); 793 } 794 if config.hotplug_method == HotplugMethod::Acpi { 795 error!( 796 "Invalid to define 'hotplugged_size' with hotplug \ 797 method 'acpi'" 798 ); 799 return Err(Error::InvalidMemoryParameters); 800 } 801 } 802 803 // Create a single zone from the global memory config. This lets 804 // us reuse the codepath for user defined memory zones. 805 let zones = vec![MemoryZoneConfig { 806 id: String::from(DEFAULT_MEMORY_ZONE), 807 size: config.size, 808 file: None, 809 shared: config.shared, 810 hugepages: config.hugepages, 811 hugepage_size: config.hugepage_size, 812 host_numa_node: None, 813 hotplug_size: config.hotplug_size, 814 hotplugged_size: config.hotplugged_size, 815 prefault: config.prefault, 816 }]; 817 818 Ok((config.size, zones, allow_mem_hotplug)) 819 } else { 820 if config.zones.is_none() { 821 error!( 822 "User defined memory regions must be provided if the \ 823 memory size is 0" 824 ); 825 return Err(Error::MissingMemoryZones); 826 } 827 828 // Safe to unwrap as we checked right above there were some 829 // regions. 
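            // Illustrative summary (added, not part of the original source) of the
            // hotplug sizing rules this function enforces, with made-up sizes:
            //
            //     hotplug_size = Some(1 GiB), hotplugged_size = Some(512 MiB) -> accepted
            //     hotplug_size = Some(1 GiB), hotplugged_size = Some(2 GiB)   -> InvalidMemoryParameters
            //     hotplug_size = None,        hotplugged_size = Some(512 MiB) -> InvalidMemoryParameters
            //     hotplugged_size set while hotplug_method == Acpi            -> InvalidMemoryParameters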
830 let zones = config.zones.clone().unwrap(); 831 if zones.is_empty() { 832 return Err(Error::MissingMemoryZones); 833 } 834 835 let mut total_ram_size: u64 = 0; 836 for zone in zones.iter() { 837 total_ram_size += zone.size; 838 839 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() { 840 error!( 841 "Invalid to set host NUMA policy for a memory zone \ 842 backed by a regular file and mapped as 'shared'" 843 ); 844 return Err(Error::InvalidSharedMemoryZoneWithHostNuma); 845 } 846 847 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi { 848 error!("Invalid to set ACPI hotplug method for memory zones"); 849 return Err(Error::InvalidHotplugMethodWithMemoryZones); 850 } 851 852 if let Some(hotplugged_size) = zone.hotplugged_size { 853 if let Some(hotplug_size) = zone.hotplug_size { 854 if hotplugged_size > hotplug_size { 855 error!( 856 "'hotplugged_size' {} can't be bigger than \ 857 'hotplug_size' {}", 858 hotplugged_size, hotplug_size, 859 ); 860 return Err(Error::InvalidMemoryParameters); 861 } 862 } else { 863 error!( 864 "Invalid to define 'hotplugged_size' when there is\ 865 no 'hotplug_size' for a memory zone" 866 ); 867 return Err(Error::InvalidMemoryParameters); 868 } 869 if config.hotplug_method == HotplugMethod::Acpi { 870 error!( 871 "Invalid to define 'hotplugged_size' with hotplug \ 872 method 'acpi'" 873 ); 874 return Err(Error::InvalidMemoryParameters); 875 } 876 } 877 } 878 879 Ok((total_ram_size, zones, allow_mem_hotplug)) 880 } 881 } 882 883 pub fn allocate_address_space(&mut self) -> Result<(), Error> { 884 let mut list = Vec::new(); 885 886 for (zone_id, memory_zone) in self.memory_zones.iter() { 887 let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> = 888 memory_zone 889 .regions() 890 .iter() 891 .map(|r| (r.clone(), false)) 892 .collect(); 893 894 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 895 regions.push((virtio_mem_zone.region().clone(), true)); 896 } 897 898 list.push((zone_id.clone(), regions)); 899 } 900 901 for (zone_id, regions) in list { 902 for (region, virtio_mem) in regions { 903 let slot = self.create_userspace_mapping( 904 region.start_addr().raw_value(), 905 region.len(), 906 region.as_ptr() as u64, 907 self.mergeable, 908 false, 909 self.log_dirty, 910 )?; 911 912 let file_offset = if let Some(file_offset) = region.file_offset() { 913 file_offset.start() 914 } else { 915 0 916 }; 917 918 self.guest_ram_mappings.push(GuestRamMapping { 919 gpa: region.start_addr().raw_value(), 920 size: region.len(), 921 slot, 922 zone_id: zone_id.clone(), 923 virtio_mem, 924 file_offset, 925 }); 926 self.ram_allocator 927 .allocate(Some(region.start_addr()), region.len(), None) 928 .ok_or(Error::MemoryRangeAllocation)?; 929 } 930 } 931 932 // Allocate SubRegion and Reserved address ranges. 933 for region in self.arch_mem_regions.iter() { 934 if region.r_type == RegionType::Ram { 935 // Ignore the RAM type since ranges have already been allocated 936 // based on the GuestMemory regions. 937 continue; 938 } 939 self.ram_allocator 940 .allocate( 941 Some(GuestAddress(region.base)), 942 region.size as GuestUsize, 943 None, 944 ) 945 .ok_or(Error::MemoryRangeAllocation)?; 946 } 947 948 Ok(()) 949 } 950 951 #[cfg(target_arch = "aarch64")] 952 fn add_uefi_flash(&mut self) -> Result<(), Error> { 953 // On AArch64, the UEFI binary requires a flash device at address 0. 954 // 4 MiB memory is mapped to simulate the flash. 
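        // Added for clarity (not part of the original source): the values implied
        // by the comment above, i.e. a 4 MiB anonymous mapping placed at guest
        // address 0. The actual constants are defined in `arch::layout`:
        //
        //     assert_eq!(arch::layout::UEFI_START.raw_value(), 0);
        //     assert_eq!(arch::layout::UEFI_SIZE, 4 << 20); // 4 MiB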
955 let uefi_mem_slot = self.allocate_memory_slot(); 956 let uefi_region = GuestRegionMmap::new( 957 MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(), 958 arch::layout::UEFI_START, 959 ) 960 .unwrap(); 961 let uefi_mem_region = self.vm.make_user_memory_region( 962 uefi_mem_slot, 963 uefi_region.start_addr().raw_value(), 964 uefi_region.len(), 965 uefi_region.as_ptr() as u64, 966 false, 967 false, 968 ); 969 self.vm 970 .create_user_memory_region(uefi_mem_region) 971 .map_err(Error::CreateUefiFlash)?; 972 973 let uefi_flash = 974 GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap()); 975 976 self.uefi_flash = Some(uefi_flash); 977 978 Ok(()) 979 } 980 981 #[allow(clippy::too_many_arguments)] 982 pub fn new( 983 vm: Arc<dyn hypervisor::Vm>, 984 config: &MemoryConfig, 985 prefault: Option<bool>, 986 phys_bits: u8, 987 #[cfg(feature = "tdx")] tdx_enabled: bool, 988 restore_data: Option<&MemoryManagerSnapshotData>, 989 existing_memory_files: Option<HashMap<u32, File>>, 990 #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>, 991 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 992 trace_scoped!("MemoryManager::new"); 993 994 let user_provided_zones = config.size == 0; 995 996 let mmio_address_space_size = mmio_address_space_size(phys_bits); 997 debug_assert_eq!( 998 (((mmio_address_space_size) >> 16) << 16), 999 mmio_address_space_size 1000 ); 1001 let start_of_platform_device_area = 1002 GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE); 1003 let end_of_device_area = start_of_platform_device_area.unchecked_sub(1); 1004 1005 let (ram_size, zones, allow_mem_hotplug) = 1006 Self::validate_memory_config(config, user_provided_zones)?; 1007 1008 let ( 1009 start_of_device_area, 1010 boot_ram, 1011 current_ram, 1012 arch_mem_regions, 1013 memory_zones, 1014 guest_memory, 1015 boot_guest_memory, 1016 hotplug_slots, 1017 next_memory_slot, 1018 selected_slot, 1019 next_hotplug_slot, 1020 ) = if let Some(data) = restore_data { 1021 let (regions, memory_zones) = Self::restore_memory_regions_and_zones( 1022 &data.guest_ram_mappings, 1023 &zones, 1024 prefault, 1025 existing_memory_files.unwrap_or_default(), 1026 config.thp, 1027 )?; 1028 let guest_memory = 1029 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?; 1030 let boot_guest_memory = guest_memory.clone(); 1031 ( 1032 GuestAddress(data.start_of_device_area), 1033 data.boot_ram, 1034 data.current_ram, 1035 data.arch_mem_regions.clone(), 1036 memory_zones, 1037 guest_memory, 1038 boot_guest_memory, 1039 data.hotplug_slots.clone(), 1040 data.next_memory_slot, 1041 data.selected_slot, 1042 data.next_hotplug_slot, 1043 ) 1044 } else { 1045 // Init guest memory 1046 let arch_mem_regions = arch::arch_memory_regions(); 1047 1048 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions 1049 .iter() 1050 .filter(|r| r.2 == RegionType::Ram) 1051 .map(|r| (r.0, r.1)) 1052 .collect(); 1053 1054 let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions 1055 .iter() 1056 .map(|(a, b, c)| ArchMemRegion { 1057 base: a.0, 1058 size: *b, 1059 r_type: *c, 1060 }) 1061 .collect(); 1062 1063 let (mem_regions, mut memory_zones) = 1064 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?; 1065 1066 let mut guest_memory = 1067 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?; 1068 1069 let boot_guest_memory = guest_memory.clone(); 1070 1071 let mut start_of_device_area = 1072 
MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?; 1073 1074 // Update list of memory zones for resize. 1075 for zone in zones.iter() { 1076 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 1077 if let Some(hotplug_size) = zone.hotplug_size { 1078 if hotplug_size == 0 { 1079 error!("'hotplug_size' can't be 0"); 1080 return Err(Error::InvalidHotplugSize); 1081 } 1082 1083 if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi { 1084 start_of_device_area = start_of_device_area 1085 .checked_add(hotplug_size) 1086 .ok_or(Error::GuestAddressOverFlow)?; 1087 } else { 1088 // Alignment must be "natural" i.e. same as size of block 1089 let start_addr = GuestAddress( 1090 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE 1091 - 1) 1092 / virtio_devices::VIRTIO_MEM_ALIGN_SIZE 1093 * virtio_devices::VIRTIO_MEM_ALIGN_SIZE, 1094 ); 1095 1096 // When `prefault` is set by vm_restore, memory manager 1097 // will create ram region with `prefault` option in 1098 // restore config rather than same option in zone 1099 let region = MemoryManager::create_ram_region( 1100 &None, 1101 0, 1102 start_addr, 1103 hotplug_size as usize, 1104 prefault.unwrap_or(zone.prefault), 1105 zone.shared, 1106 zone.hugepages, 1107 zone.hugepage_size, 1108 zone.host_numa_node, 1109 None, 1110 config.thp, 1111 )?; 1112 1113 guest_memory = guest_memory 1114 .insert_region(Arc::clone(®ion)) 1115 .map_err(Error::GuestMemory)?; 1116 1117 let hotplugged_size = zone.hotplugged_size.unwrap_or(0); 1118 let region_size = region.len(); 1119 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 1120 region, 1121 virtio_device: None, 1122 hotplugged_size, 1123 hugepages: zone.hugepages, 1124 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 1125 }); 1126 1127 start_of_device_area = start_addr 1128 .checked_add(hotplug_size) 1129 .ok_or(Error::GuestAddressOverFlow)?; 1130 } 1131 } 1132 } else { 1133 return Err(Error::MissingZoneIdentifier); 1134 } 1135 } 1136 1137 let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT); 1138 hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default); 1139 1140 ( 1141 start_of_device_area, 1142 ram_size, 1143 ram_size, 1144 arch_mem_regions, 1145 memory_zones, 1146 guest_memory, 1147 boot_guest_memory, 1148 hotplug_slots, 1149 0, 1150 0, 1151 0, 1152 ) 1153 }; 1154 1155 let guest_memory = GuestMemoryAtomic::new(guest_memory); 1156 1157 // Both MMIO and PIO address spaces start at address 0. 
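        // Added note (not part of the original source): on x86_64 the two literal
        // arguments below describe the PIO space, which starts at GuestAddress(0)
        // and spans 1 << 16 = 64 KiB (the full x86 I/O port range). The platform
        // MMIO area handed to the allocator sits at the very top of the MMIO
        // space, PLATFORM_DEVICE_AREA_SIZE (1 MiB) below its end.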
1158 let allocator = Arc::new(Mutex::new( 1159 SystemAllocator::new( 1160 #[cfg(target_arch = "x86_64")] 1161 { 1162 GuestAddress(0) 1163 }, 1164 #[cfg(target_arch = "x86_64")] 1165 { 1166 1 << 16 1167 }, 1168 start_of_platform_device_area, 1169 PLATFORM_DEVICE_AREA_SIZE, 1170 #[cfg(target_arch = "x86_64")] 1171 vec![GsiApic::new( 1172 X86_64_IRQ_BASE, 1173 ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, 1174 )], 1175 ) 1176 .ok_or(Error::CreateSystemAllocator)?, 1177 )); 1178 1179 #[cfg(not(feature = "tdx"))] 1180 let dynamic = true; 1181 #[cfg(feature = "tdx")] 1182 let dynamic = !tdx_enabled; 1183 1184 let acpi_address = if dynamic 1185 && config.hotplug_method == HotplugMethod::Acpi 1186 && (config.hotplug_size.unwrap_or_default() > 0) 1187 { 1188 Some( 1189 allocator 1190 .lock() 1191 .unwrap() 1192 .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None) 1193 .ok_or(Error::AllocateMmioAddress)?, 1194 ) 1195 } else { 1196 None 1197 }; 1198 1199 // If running on SGX the start of device area and RAM area may diverge but 1200 // at this point they are next to each other. 1201 let end_of_ram_area = start_of_device_area.unchecked_sub(1); 1202 let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap(); 1203 1204 let mut memory_manager = MemoryManager { 1205 boot_guest_memory, 1206 guest_memory, 1207 next_memory_slot, 1208 start_of_device_area, 1209 end_of_device_area, 1210 end_of_ram_area, 1211 vm, 1212 hotplug_slots, 1213 selected_slot, 1214 mergeable: config.mergeable, 1215 allocator, 1216 hotplug_method: config.hotplug_method, 1217 boot_ram, 1218 current_ram, 1219 next_hotplug_slot, 1220 shared: config.shared, 1221 hugepages: config.hugepages, 1222 hugepage_size: config.hugepage_size, 1223 prefault: config.prefault, 1224 #[cfg(target_arch = "x86_64")] 1225 sgx_epc_region: None, 1226 user_provided_zones, 1227 snapshot_memory_ranges: MemoryRangeTable::default(), 1228 memory_zones, 1229 guest_ram_mappings: Vec::new(), 1230 acpi_address, 1231 log_dirty: dynamic, // Cannot log dirty pages on a TD 1232 arch_mem_regions, 1233 ram_allocator, 1234 dynamic, 1235 #[cfg(target_arch = "aarch64")] 1236 uefi_flash: None, 1237 thp: config.thp, 1238 }; 1239 1240 #[cfg(target_arch = "aarch64")] 1241 { 1242 // For Aarch64 we cannot lazily allocate the address space like we 1243 // do for x86, because while restoring a VM from snapshot we would 1244 // need the address space to be allocated to properly restore VGIC. 1245 // And the restore of VGIC happens before we attempt to run the vCPUs 1246 // for the first time, thus we need to allocate the address space 1247 // beforehand. 
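            // For reference (added, not part of the original source): for every
            // region of every memory zone, allocate_address_space() roughly does
            //
            //     let slot = self.create_userspace_mapping(gpa, size, host_addr,
            //                                              self.mergeable, false,
            //                                              self.log_dirty)?;
            //     self.guest_ram_mappings.push(GuestRamMapping { slot, gpa, size, /* ... */ });
            //     self.ram_allocator.allocate(Some(GuestAddress(gpa)), size, None);
            //
            // so the whole guest RAM layout is registered with the hypervisor
            // before the VGIC state is restored.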
1248 memory_manager.allocate_address_space()?; 1249 memory_manager.add_uefi_flash()?; 1250 } 1251 1252 #[cfg(target_arch = "x86_64")] 1253 if let Some(sgx_epc_config) = sgx_epc_config { 1254 memory_manager.setup_sgx(sgx_epc_config)?; 1255 } 1256 1257 Ok(Arc::new(Mutex::new(memory_manager))) 1258 } 1259 1260 pub fn new_from_snapshot( 1261 snapshot: &Snapshot, 1262 vm: Arc<dyn hypervisor::Vm>, 1263 config: &MemoryConfig, 1264 source_url: Option<&str>, 1265 prefault: bool, 1266 phys_bits: u8, 1267 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 1268 if let Some(source_url) = source_url { 1269 let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?; 1270 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 1271 1272 let mem_snapshot: MemoryManagerSnapshotData = 1273 snapshot.to_state().map_err(Error::Restore)?; 1274 1275 let mm = MemoryManager::new( 1276 vm, 1277 config, 1278 Some(prefault), 1279 phys_bits, 1280 #[cfg(feature = "tdx")] 1281 false, 1282 Some(&mem_snapshot), 1283 None, 1284 #[cfg(target_arch = "x86_64")] 1285 None, 1286 )?; 1287 1288 mm.lock() 1289 .unwrap() 1290 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?; 1291 1292 Ok(mm) 1293 } else { 1294 Err(Error::RestoreMissingSourceUrl) 1295 } 1296 } 1297 1298 fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> { 1299 // SAFETY: FFI call with correct arguments 1300 let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) }; 1301 1302 if res < 0 { 1303 Err(io::Error::last_os_error()) 1304 } else { 1305 Ok(res as RawFd) 1306 } 1307 } 1308 1309 fn mbind( 1310 addr: *mut u8, 1311 len: u64, 1312 mode: u32, 1313 nodemask: Vec<u64>, 1314 maxnode: u64, 1315 flags: u32, 1316 ) -> Result<(), io::Error> { 1317 // SAFETY: FFI call with correct arguments 1318 let res = unsafe { 1319 libc::syscall( 1320 libc::SYS_mbind, 1321 addr as *mut libc::c_void, 1322 len, 1323 mode, 1324 nodemask.as_ptr(), 1325 maxnode, 1326 flags, 1327 ) 1328 }; 1329 1330 if res < 0 { 1331 Err(io::Error::last_os_error()) 1332 } else { 1333 Ok(()) 1334 } 1335 } 1336 1337 fn create_anonymous_file( 1338 size: usize, 1339 hugepages: bool, 1340 hugepage_size: Option<u64>, 1341 ) -> Result<FileOffset, Error> { 1342 let fd = Self::memfd_create( 1343 &ffi::CString::new("ch_ram").unwrap(), 1344 libc::MFD_CLOEXEC 1345 | if hugepages { 1346 libc::MFD_HUGETLB 1347 | if let Some(hugepage_size) = hugepage_size { 1348 /* 1349 * From the Linux kernel: 1350 * Several system calls take a flag to request "hugetlb" huge pages. 1351 * Without further specification, these system calls will use the 1352 * system's default huge page size. If a system supports multiple 1353 * huge page sizes, the desired huge page size can be specified in 1354 * bits [26:31] of the flag arguments. The value in these 6 bits 1355 * will encode the log2 of the huge page size. 
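                     *
                     * Illustrative example (added, not part of the kernel text
                     * quoted above): a 2 MiB huge page has log2(2 MiB) = 21, so
                     * the encoded value is 21 << 26, which is exactly what
                     * `hugepage_size.trailing_zeros() << 26` below computes for
                     * hugepage_size = 2 MiB.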
1356 */ 1357 1358 hugepage_size.trailing_zeros() << 26 1359 } else { 1360 // Use the system default huge page size 1361 0 1362 } 1363 } else { 1364 0 1365 }, 1366 ) 1367 .map_err(Error::SharedFileCreate)?; 1368 1369 // SAFETY: fd is valid 1370 let f = unsafe { File::from_raw_fd(fd) }; 1371 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?; 1372 1373 Ok(FileOffset::new(f, 0)) 1374 } 1375 1376 fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> { 1377 if backing_file.is_dir() { 1378 Err(Error::DirectoryAsBackingFileForMemory) 1379 } else { 1380 let f = OpenOptions::new() 1381 .read(true) 1382 .write(true) 1383 .open(backing_file) 1384 .map_err(Error::SharedFileCreate)?; 1385 1386 Ok(FileOffset::new(f, file_offset)) 1387 } 1388 } 1389 1390 #[allow(clippy::too_many_arguments)] 1391 pub fn create_ram_region( 1392 backing_file: &Option<PathBuf>, 1393 file_offset: u64, 1394 start_addr: GuestAddress, 1395 size: usize, 1396 prefault: bool, 1397 shared: bool, 1398 hugepages: bool, 1399 hugepage_size: Option<u64>, 1400 host_numa_node: Option<u32>, 1401 existing_memory_file: Option<File>, 1402 thp: bool, 1403 ) -> Result<Arc<GuestRegionMmap>, Error> { 1404 let mut mmap_flags = libc::MAP_NORESERVE; 1405 1406 // The duplication of mmap_flags ORing here is unfortunate but it also makes 1407 // the complexity of the handling clear. 1408 let fo = if let Some(f) = existing_memory_file { 1409 // It must be MAP_SHARED as we wouldn't already have an FD 1410 mmap_flags |= libc::MAP_SHARED; 1411 Some(FileOffset::new(f, file_offset)) 1412 } else if let Some(backing_file) = backing_file { 1413 if shared { 1414 mmap_flags |= libc::MAP_SHARED; 1415 } else { 1416 mmap_flags |= libc::MAP_PRIVATE; 1417 } 1418 Some(Self::open_backing_file(backing_file, file_offset)?) 1419 } else if shared || hugepages { 1420 // For hugepages we must also MAP_SHARED otherwise we will trigger #4805 1421 // because the MAP_PRIVATE will trigger CoW against the backing file with 1422 // the VFIO pinning 1423 mmap_flags |= libc::MAP_SHARED; 1424 Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?) 1425 } else { 1426 mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS; 1427 None 1428 }; 1429 1430 let region = GuestRegionMmap::new( 1431 MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags) 1432 .map_err(Error::GuestMemoryRegion)?, 1433 start_addr, 1434 ) 1435 .map_err(Error::GuestMemory)?; 1436 1437 // Apply NUMA policy if needed. 1438 if let Some(node) = host_numa_node { 1439 let addr = region.deref().as_ptr(); 1440 let len = region.deref().size() as u64; 1441 let mode = MPOL_BIND; 1442 let mut nodemask: Vec<u64> = Vec::new(); 1443 let flags = MPOL_MF_STRICT | MPOL_MF_MOVE; 1444 1445 // Linux is kind of buggy in the way it interprets maxnode as it 1446 // will cut off the last node. That's why we have to add 1 to what 1447 // we would consider as the proper maxnode value. 1448 let maxnode = node as u64 + 1 + 1; 1449 1450 // Allocate the right size for the vector. 1451 nodemask.resize((node as usize / 64) + 1, 0); 1452 1453 // Fill the global bitmask through the nodemask vector. 1454 let idx = (node / 64) as usize; 1455 let shift = node % 64; 1456 nodemask[idx] |= 1u64 << shift; 1457 1458 // Policies are enforced by using MPOL_MF_MOVE flag as it will 1459 // force the kernel to move all pages that might have been already 1460 // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is 1461 // used to throw an error if MPOL_MF_MOVE didn't succeed. 
1462 // MPOL_BIND is the selected mode as it specifies a strict policy 1463 // that restricts memory allocation to the nodes specified in the 1464 // nodemask. 1465 Self::mbind(addr, len, mode, nodemask, maxnode, flags) 1466 .map_err(Error::ApplyNumaPolicy)?; 1467 } 1468 1469 // Prefault the region if needed, in parallel. 1470 if prefault { 1471 let page_size = 1472 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize; 1473 1474 if !is_aligned(size, page_size) { 1475 warn!( 1476 "Prefaulting memory size {} misaligned with page size {}", 1477 size, page_size 1478 ); 1479 } 1480 1481 let num_pages = size / page_size; 1482 1483 let num_threads = Self::get_prefault_num_threads(page_size, num_pages); 1484 1485 let pages_per_thread = num_pages / num_threads; 1486 let remainder = num_pages % num_threads; 1487 1488 let barrier = Arc::new(Barrier::new(num_threads)); 1489 thread::scope(|s| { 1490 let r = ®ion; 1491 for i in 0..num_threads { 1492 let barrier = Arc::clone(&barrier); 1493 s.spawn(move || { 1494 // Wait until all threads have been spawned to avoid contention 1495 // over mmap_sem between thread stack allocation and page faulting. 1496 barrier.wait(); 1497 let pages = pages_per_thread + if i < remainder { 1 } else { 0 }; 1498 let offset = 1499 page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder)); 1500 // SAFETY: FFI call with correct arguments 1501 let ret = unsafe { 1502 let addr = r.as_ptr().add(offset); 1503 libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE) 1504 }; 1505 if ret != 0 { 1506 let e = io::Error::last_os_error(); 1507 warn!("Failed to prefault pages: {}", e); 1508 } 1509 }); 1510 } 1511 }); 1512 } 1513 1514 if region.file_offset().is_none() && thp { 1515 info!( 1516 "Anonymous mapping at 0x{:x} (size = 0x{:x})", 1517 region.as_ptr() as u64, 1518 size 1519 ); 1520 // SAFETY: FFI call with correct arguments 1521 let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) }; 1522 if ret != 0 { 1523 let e = io::Error::last_os_error(); 1524 warn!("Failed to mark pages as THP eligible: {}", e); 1525 } 1526 } 1527 1528 Ok(Arc::new(region)) 1529 } 1530 1531 // Duplicate of `memory_zone_get_align_size` that does not require a `zone` 1532 fn get_prefault_align_size( 1533 backing_file: &Option<PathBuf>, 1534 hugepages: bool, 1535 hugepage_size: Option<u64>, 1536 ) -> Result<u64, Error> { 1537 // SAFETY: FFI call. Trivially safe. 1538 let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 }; 1539 match (hugepages, hugepage_size, backing_file) { 1540 (false, _, _) => Ok(page_size), 1541 (true, Some(hugepage_size), _) => Ok(hugepage_size), 1542 (true, None, _) => { 1543 // There are two scenarios here: 1544 // - `hugepages` is enabled but `hugepage_size` is not specified: 1545 // Call `statfs` for `/dev/hugepages` for getting the default size of hugepage 1546 // - The backing file is specified: 1547 // Call `statfs` for the file and get its `f_bsize`. If the value is larger than the page 1548 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the 1549 // value is less than or equal to the page size, just use the page size. 
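                // Illustrative example (added, not part of the original source),
                // assuming a 4 KiB base page size:
                //
                //     backing file on hugetlbfs (2 MiB pages)    -> f_bsize = 2 MiB  -> align = 2 MiB
                //     backing file on ext4/tmpfs (typical 4 KiB) -> f_bsize <= 4 KiB -> align = page size
                //     hugepages enabled, no explicit size        -> statfs("/dev/hugepages") default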
1550 let path = backing_file 1551 .as_ref() 1552 .map_or(Ok("/dev/hugepages"), |pathbuf| { 1553 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters) 1554 })?; 1555 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?); 1556 Ok(align_size) 1557 } 1558 } 1559 } 1560 1561 fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize { 1562 let mut n: usize = 1; 1563 1564 // Do not create more threads than processors available. 1565 // SAFETY: FFI call. Trivially safe. 1566 let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) }; 1567 if procs > 0 { 1568 n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT); 1569 } 1570 1571 // Do not create more threads than pages being allocated. 1572 n = std::cmp::min(n, num_pages); 1573 1574 // Do not create threads to allocate less than 64 MiB of memory. 1575 n = std::cmp::min( 1576 n, 1577 std::cmp::max(1, page_size * num_pages / (64 * (1 << 26))), 1578 ); 1579 1580 n 1581 } 1582 1583 // Update the GuestMemoryMmap with the new range 1584 fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> { 1585 let guest_memory = self 1586 .guest_memory 1587 .memory() 1588 .insert_region(region) 1589 .map_err(Error::GuestMemory)?; 1590 self.guest_memory.lock().unwrap().replace(guest_memory); 1591 1592 Ok(()) 1593 } 1594 1595 // 1596 // Calculate the start address of an area next to RAM. 1597 // 1598 // If memory hotplug is allowed, the start address needs to be aligned 1599 // (rounded-up) to 128MiB boundary. 1600 // If memory hotplug is not allowed, there is no alignment required. 1601 // And it must also start at the 64bit start. 1602 fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> { 1603 let mut start_addr = if allow_mem_hotplug { 1604 GuestAddress(mem_end.0 | ((128 << 20) - 1)) 1605 } else { 1606 mem_end 1607 }; 1608 1609 start_addr = start_addr 1610 .checked_add(1) 1611 .ok_or(Error::GuestAddressOverFlow)?; 1612 1613 if mem_end < arch::layout::MEM_32BIT_RESERVED_START { 1614 return Ok(arch::layout::RAM_64BIT_START); 1615 } 1616 1617 Ok(start_addr) 1618 } 1619 1620 pub fn add_ram_region( 1621 &mut self, 1622 start_addr: GuestAddress, 1623 size: usize, 1624 ) -> Result<Arc<GuestRegionMmap>, Error> { 1625 // Allocate memory for the region 1626 let region = MemoryManager::create_ram_region( 1627 &None, 1628 0, 1629 start_addr, 1630 size, 1631 self.prefault, 1632 self.shared, 1633 self.hugepages, 1634 self.hugepage_size, 1635 None, 1636 None, 1637 self.thp, 1638 )?; 1639 1640 // Map it into the guest 1641 let slot = self.create_userspace_mapping( 1642 region.start_addr().0, 1643 region.len(), 1644 region.as_ptr() as u64, 1645 self.mergeable, 1646 false, 1647 self.log_dirty, 1648 )?; 1649 self.guest_ram_mappings.push(GuestRamMapping { 1650 gpa: region.start_addr().raw_value(), 1651 size: region.len(), 1652 slot, 1653 zone_id: DEFAULT_MEMORY_ZONE.to_string(), 1654 virtio_mem: false, 1655 file_offset: 0, 1656 }); 1657 1658 self.add_region(Arc::clone(®ion))?; 1659 1660 Ok(region) 1661 } 1662 1663 fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> { 1664 info!("Hotplugging new RAM: {}", size); 1665 1666 // Check that there is a free slot 1667 if self.next_hotplug_slot >= HOTPLUG_COUNT { 1668 return Err(Error::NoSlotAvailable); 1669 } 1670 1671 // "Inserted" DIMM must have a size that is a multiple of 128MiB 1672 if size % (128 << 20) != 0 { 1673 return Err(Error::InvalidSize); 1674 } 1675 1676 let start_addr = 
MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?; 1677 1678 if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area { 1679 return Err(Error::InsufficientHotplugRam); 1680 } 1681 1682 let region = self.add_ram_region(start_addr, size)?; 1683 1684 // Add region to the list of regions associated with the default 1685 // memory zone. 1686 if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) { 1687 memory_zone.regions.push(Arc::clone(®ion)); 1688 } 1689 1690 // Tell the allocator 1691 self.ram_allocator 1692 .allocate(Some(start_addr), size as GuestUsize, None) 1693 .ok_or(Error::MemoryRangeAllocation)?; 1694 1695 // Update the slot so that it can be queried via the I/O port 1696 let slot = &mut self.hotplug_slots[self.next_hotplug_slot]; 1697 slot.active = true; 1698 slot.inserting = true; 1699 slot.base = region.start_addr().0; 1700 slot.length = region.len(); 1701 1702 self.next_hotplug_slot += 1; 1703 1704 Ok(region) 1705 } 1706 1707 pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 1708 self.guest_memory.clone() 1709 } 1710 1711 pub fn boot_guest_memory(&self) -> GuestMemoryMmap { 1712 self.boot_guest_memory.clone() 1713 } 1714 1715 pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> { 1716 self.allocator.clone() 1717 } 1718 1719 pub fn start_of_device_area(&self) -> GuestAddress { 1720 self.start_of_device_area 1721 } 1722 1723 pub fn end_of_device_area(&self) -> GuestAddress { 1724 self.end_of_device_area 1725 } 1726 1727 pub fn allocate_memory_slot(&mut self) -> u32 { 1728 let slot_id = self.next_memory_slot; 1729 self.next_memory_slot += 1; 1730 slot_id 1731 } 1732 1733 pub fn create_userspace_mapping( 1734 &mut self, 1735 guest_phys_addr: u64, 1736 memory_size: u64, 1737 userspace_addr: u64, 1738 mergeable: bool, 1739 readonly: bool, 1740 log_dirty: bool, 1741 ) -> Result<u32, Error> { 1742 let slot = self.allocate_memory_slot(); 1743 let mem_region = self.vm.make_user_memory_region( 1744 slot, 1745 guest_phys_addr, 1746 memory_size, 1747 userspace_addr, 1748 readonly, 1749 log_dirty, 1750 ); 1751 1752 info!( 1753 "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}", 1754 guest_phys_addr, userspace_addr, memory_size, slot 1755 ); 1756 1757 self.vm 1758 .create_user_memory_region(mem_region) 1759 .map_err(Error::CreateUserMemoryRegion)?; 1760 1761 // SAFETY: the address and size are valid since the 1762 // mmap succeeded. 1763 let ret = unsafe { 1764 libc::madvise( 1765 userspace_addr as *mut libc::c_void, 1766 memory_size as libc::size_t, 1767 libc::MADV_DONTDUMP, 1768 ) 1769 }; 1770 if ret != 0 { 1771 let e = io::Error::last_os_error(); 1772 warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e); 1773 } 1774 1775 // Mark the pages as mergeable if explicitly asked for. 1776 if mergeable { 1777 // SAFETY: the address and size are valid since the 1778 // mmap succeeded. 1779 let ret = unsafe { 1780 libc::madvise( 1781 userspace_addr as *mut libc::c_void, 1782 memory_size as libc::size_t, 1783 libc::MADV_MERGEABLE, 1784 ) 1785 }; 1786 if ret != 0 { 1787 let err = io::Error::last_os_error(); 1788 // Safe to unwrap because the error is constructed with 1789 // last_os_error(), which ensures the output will be Some(). 
1790 let errno = err.raw_os_error().unwrap(); 1791 if errno == libc::EINVAL { 1792 warn!("kernel not configured with CONFIG_KSM"); 1793 } else { 1794 warn!("madvise error: {}", err); 1795 } 1796 warn!("failed to mark pages as mergeable"); 1797 } 1798 } 1799 1800 info!( 1801 "Created userspace mapping: {:x} -> {:x} {:x}", 1802 guest_phys_addr, userspace_addr, memory_size 1803 ); 1804 1805 Ok(slot) 1806 } 1807 1808 pub fn remove_userspace_mapping( 1809 &mut self, 1810 guest_phys_addr: u64, 1811 memory_size: u64, 1812 userspace_addr: u64, 1813 mergeable: bool, 1814 slot: u32, 1815 ) -> Result<(), Error> { 1816 let mem_region = self.vm.make_user_memory_region( 1817 slot, 1818 guest_phys_addr, 1819 memory_size, 1820 userspace_addr, 1821 false, /* readonly -- don't care */ 1822 false, /* log dirty */ 1823 ); 1824 1825 self.vm 1826 .remove_user_memory_region(mem_region) 1827 .map_err(Error::RemoveUserMemoryRegion)?; 1828 1829 // Mark the pages as unmergeable if there were previously marked as 1830 // mergeable. 1831 if mergeable { 1832 // SAFETY: the address and size are valid as the region was 1833 // previously advised. 1834 let ret = unsafe { 1835 libc::madvise( 1836 userspace_addr as *mut libc::c_void, 1837 memory_size as libc::size_t, 1838 libc::MADV_UNMERGEABLE, 1839 ) 1840 }; 1841 if ret != 0 { 1842 let err = io::Error::last_os_error(); 1843 // Safe to unwrap because the error is constructed with 1844 // last_os_error(), which ensures the output will be Some(). 1845 let errno = err.raw_os_error().unwrap(); 1846 if errno == libc::EINVAL { 1847 warn!("kernel not configured with CONFIG_KSM"); 1848 } else { 1849 warn!("madvise error: {}", err); 1850 } 1851 warn!("failed to mark pages as unmergeable"); 1852 } 1853 } 1854 1855 info!( 1856 "Removed userspace mapping: {:x} -> {:x} {:x}", 1857 guest_phys_addr, userspace_addr, memory_size 1858 ); 1859 1860 Ok(()) 1861 } 1862 1863 pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> { 1864 if let Some(memory_zone) = self.memory_zones.get_mut(id) { 1865 if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone { 1866 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() { 1867 virtio_mem_device 1868 .lock() 1869 .unwrap() 1870 .resize(size) 1871 .map_err(Error::VirtioMemResizeFail)?; 1872 } 1873 1874 // Keep the hotplugged_size up to date. 1875 virtio_mem_zone.hotplugged_size = size; 1876 } else { 1877 error!("Failed resizing virtio-mem region: No virtio-mem handler"); 1878 return Err(Error::MissingVirtioMemHandler); 1879 } 1880 1881 return Ok(()); 1882 } 1883 1884 error!("Failed resizing virtio-mem region: Unknown memory zone"); 1885 Err(Error::UnknownMemoryZone) 1886 } 1887 1888 /// In case this function resulted in adding a new memory region to the 1889 /// guest memory, the new region is returned to the caller. The virtio-mem 1890 /// use case never adds a new region as the whole hotpluggable memory has 1891 /// already been allocated at boot time. 1892 pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> { 1893 if self.user_provided_zones { 1894 error!( 1895 "Not allowed to resize guest memory when backed with user \ 1896 defined memory zones." 
            );
            return Err(Error::InvalidResizeWithMemoryZones);
        }

        let mut region: Option<Arc<GuestRegionMmap>> = None;
        match self.hotplug_method {
            HotplugMethod::VirtioMem => {
                if desired_ram >= self.boot_ram {
                    if !self.dynamic {
                        return Ok(region);
                    }

                    self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
                    self.current_ram = desired_ram;
                }
            }
            HotplugMethod::Acpi => {
                if desired_ram > self.current_ram {
                    if !self.dynamic {
                        return Ok(region);
                    }

                    region =
                        Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
                    self.current_ram = desired_ram;
                }
            }
        }
        Ok(region)
    }

    pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
        if !self.user_provided_zones {
            error!(
                "Not allowed to resize guest memory zone when no zone is \
                defined."
            );
            return Err(Error::ResizeZone);
        }

        self.virtio_mem_resize(id, virtio_mem_size)
    }

    #[cfg(target_arch = "x86_64")]
    pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
        let file = OpenOptions::new()
            .read(true)
            .open("/dev/sgx_provision")
            .map_err(Error::SgxProvisionOpen)?;
        self.vm
            .enable_sgx_attribute(file)
            .map_err(Error::SgxEnableProvisioning)?;

        // Go over each EPC section and verify its size is a 4k multiple. At
        // the same time, calculate the total size needed for the contiguous
        // EPC region.
        let mut epc_region_size = 0;
        for epc_section in sgx_epc_config.iter() {
            if epc_section.size == 0 {
                return Err(Error::EpcSectionSizeInvalid);
            }
            if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
                return Err(Error::EpcSectionSizeInvalid);
            }

            epc_region_size += epc_section.size;
        }

        // Place the SGX EPC region on a 4k boundary between the RAM and the device area
        let epc_region_start = GuestAddress(
            ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
        );

        self.start_of_device_area = epc_region_start
            .checked_add(epc_region_size)
            .ok_or(Error::GuestAddressOverFlow)?;

        let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
        info!(
            "SGX EPC region: 0x{:x} (0x{:x})",
            epc_region_start.0, epc_region_size
        );

        // Each section can be memory mapped into the allocated region.
        let mut epc_section_start = epc_region_start.raw_value();
        for epc_section in sgx_epc_config.iter() {
            let file = OpenOptions::new()
                .read(true)
                .write(true)
                .open("/dev/sgx_vepc")
                .map_err(Error::SgxVirtEpcOpen)?;

            let prot = PROT_READ | PROT_WRITE;
            let mut flags = MAP_NORESERVE | MAP_SHARED;
            if epc_section.prefault {
                flags |= MAP_POPULATE;
            }

            // We can't use the vm-memory crate to perform the memory mapping
            // here as it would try to ensure the size of the backing file is
            // matching the size of the expected mapping. The /dev/sgx_vepc
            // device does not work that way, it provides a file descriptor
            // whose size does not match the mapping size, as it's just a way to
            // let KVM know that an EPC section is being created for the guest.
            // SAFETY: FFI call with correct arguments
            let host_addr = unsafe {
                libc::mmap(
                    std::ptr::null_mut(),
                    epc_section.size as usize,
                    prot,
                    flags,
                    file.as_raw_fd(),
                    0,
                )
            } as u64;

            info!(
                "Adding SGX EPC section: 0x{:x} (0x{:x})",
                epc_section_start, epc_section.size
            );

            let _mem_slot = self.create_userspace_mapping(
                epc_section_start,
                epc_section.size,
                host_addr,
                false,
                false,
                false,
            )?;

            sgx_epc_region.insert(
                epc_section.id.clone(),
                SgxEpcSection::new(
                    GuestAddress(epc_section_start),
                    epc_section.size as GuestUsize,
                ),
            );

            epc_section_start += epc_section.size;
        }

        self.sgx_epc_region = Some(sgx_epc_region);

        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
        &self.sgx_epc_region
    }

    pub fn is_hardlink(f: &File) -> bool {
        let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
        // SAFETY: FFI call with correct arguments
        let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
        if ret != 0 {
            error!("Couldn't fstat the backing file");
            return false;
        }

        // SAFETY: stat is valid
        unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
    }

    pub fn memory_zones(&self) -> &MemoryZones {
        &self.memory_zones
    }

    pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
        &mut self.memory_zones
    }

    pub fn memory_range_table(
        &self,
        snapshot: bool,
    ) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();

        for memory_zone in self.memory_zones.values() {
            if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
                table.extend(virtio_mem_zone.plugged_ranges());
            }

            for region in memory_zone.regions() {
                if snapshot {
                    if let Some(file_offset) = region.file_offset() {
                        if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
                            && Self::is_hardlink(file_offset.file())
                        {
                            // In this very specific case, we know the memory
                            // region is backed by a file on the host filesystem
                            // that can be accessed by the user, and additionally
                            // the mapping is shared, which means that modifications
                            // to the content are written to the actual file.
                            // When these conditions are met, we can skip the
                            // copy of the memory content for this specific region,
                            // as we can assume the user will have it saved through
                            // the backing file already.
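                            // (Anonymous backings such as memfd regions typically
                            // have a link count of 0 and fail the is_hardlink()
                            // check above, so they are copied as usual.)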
                            continue;
                        }
                    }
                }

                table.push(MemoryRange {
                    gpa: region.start_addr().raw_value(),
                    length: region.len(),
                });
            }
        }

        Ok(table)
    }

    pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
        MemoryManagerSnapshotData {
            memory_ranges: self.snapshot_memory_ranges.clone(),
            guest_ram_mappings: self.guest_ram_mappings.clone(),
            start_of_device_area: self.start_of_device_area.0,
            boot_ram: self.boot_ram,
            current_ram: self.current_ram,
            arch_mem_regions: self.arch_mem_regions.clone(),
            hotplug_slots: self.hotplug_slots.clone(),
            next_memory_slot: self.next_memory_slot,
            selected_slot: self.selected_slot,
            next_hotplug_slot: self.next_hotplug_slot,
        }
    }

    pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
        let mut memory_slot_fds = HashMap::new();
        for guest_ram_mapping in &self.guest_ram_mappings {
            let slot = guest_ram_mapping.slot;
            let guest_memory = self.guest_memory.memory();
            let file = guest_memory
                .find_region(GuestAddress(guest_ram_mapping.gpa))
                .unwrap()
                .file_offset()
                .unwrap()
                .file();
            memory_slot_fds.insert(slot, file.as_raw_fd());
        }
        memory_slot_fds
    }

    pub fn acpi_address(&self) -> Option<GuestAddress> {
        self.acpi_address
    }

    pub fn num_guest_ram_mappings(&self) -> u32 {
        self.guest_ram_mappings.len() as u32
    }

    #[cfg(target_arch = "aarch64")]
    pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
        self.uefi_flash.as_ref().unwrap().clone()
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
        let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
        mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);

        let mut mem_offset_in_elf = mem_offset;
        let mut ram_maps = BTreeMap::new();
        for mapping in mapping_sorted_by_gpa.iter() {
            ram_maps.insert(
                mapping.gpa,
                CoredumpMemoryRegion {
                    mem_offset_in_elf,
                    mem_size: mapping.size,
                },
            );
            mem_offset_in_elf += mapping.size;
        }

        CoredumpMemoryRegions { ram_maps }
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    pub fn coredump_iterate_save_mem(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let snapshot_memory_ranges = self
            .memory_range_table(false)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        if snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let coredump_file = dump_state.file.as_ref().unwrap();

        let guest_memory = self.guest_memory.memory();
        let mut total_bytes: u64 = 0;

        for range in snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            loop {
                let bytes_written = guest_memory
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        &mut coredump_file.as_fd(),
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
                offset += bytes_written as u64;
                total_bytes += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        debug!("coredump total bytes {}", total_bytes);
        Ok(())
    }

    pub fn receive_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: ReadVolatile,
    {
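        // Counterpart of the Transportable::send() path below: the content of
        // each received range is read from the given source (e.g. the
        // migration socket) directly into guest memory.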
        let guest_memory = self.guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't read
            // the whole region at once, because we can't rely on the
            // read_exact_from() implementation from vm-memory::GuestMemory,
            // as it is not following the correct behavior. For more info
            // about this issue see:
            // https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = mem
                    .read_volatile_from(
                        GuestAddress(range.gpa + offset),
                        fd,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| {
                        MigratableError::MigrateReceive(anyhow!(
                            "Error receiving memory from socket: {}",
                            e
                        ))
                    })?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }
}

struct MemoryNotify {
    slot_id: usize,
}

impl Aml for MemoryNotify {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        let object = aml::Path::new(&format!("M{:03}", self.slot_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.slot_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes(sink)
    }
}

struct MemorySlot {
    slot_id: usize,
}

impl Aml for MemorySlot {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        aml::Device::new(
            format!("M{:03}", self.slot_id).as_str().into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
                &aml::Name::new("_UID".into(), &self.slot_id),
                /*
                _STA return value:
                Bit [0] – Set if the device is present.
                Bit [1] – Set if the device is enabled and decoding its resources.
                Bit [2] – Set if the device should be shown in the UI.
                Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                Bit [4] – Set if the battery is present.
                Bits [31:5] – Reserved (must be cleared).
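                The MSTA method defined in MemoryMethods returns 0xF here
                (present | enabled | shown | functioning) when the slot's MEN_
                bit is set, and 0 otherwise.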
                */
                &aml::Method::new(
                    "_STA".into(),
                    0,
                    false,
                    // Call into MSTA method which will interrogate device
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MSTA".into(),
                        vec![&self.slot_id],
                    ))],
                ),
                // Get details of memory
                &aml::Method::new(
                    "_CRS".into(),
                    0,
                    false,
                    // Call into MCRS which provides actual memory details
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MCRS".into(),
                        vec![&self.slot_id],
                    ))],
                ),
            ],
        )
        .to_aml_bytes(sink)
    }
}

struct MemorySlots {
    slots: usize,
}

impl Aml for MemorySlots {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        for slot_id in 0..self.slots {
            MemorySlot { slot_id }.to_aml_bytes(sink);
        }
    }
}

struct MemoryMethods {
    slots: usize,
}

impl Aml for MemoryMethods {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        // Add "MTFY" notification method
        let mut memory_notifies = Vec::new();
        for slot_id in 0..self.slots {
            memory_notifies.push(MemoryNotify { slot_id });
        }

        let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
        for memory_notifier in memory_notifies.iter() {
            memory_notifies_refs.push(memory_notifier);
        }

        aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);

        // MSCN method
        aml::Method::new(
            "MSCN".into(),
            0,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                &aml::While::new(
                    &aml::LessThan::new(&aml::Local(0), &self.slots),
                    vec![
                        // Write the current slot number (loop counter in Local0)
                        // to the I/O port via field
                        &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
                        // Check if MINS bit is set (inserting)
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            // Notify device if it is
                            vec![
                                &aml::MethodCall::new(
                                    "MTFY".into(),
                                    vec![&aml::Local(0), &aml::ONE],
                                ),
                                // Reset MINS bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            ],
                        ),
                        // Check if MRMV bit is set
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            // Notify device if it is (with the eject constant 0x3)
                            vec![
                                &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
                                // Reset MRMV bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            ],
                        ),
                        &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                    ],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
            ],
        )
        .to_aml_bytes(sink);

        // Memory status method
        aml::Method::new(
            "MSTA".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                &aml::If::new(
                    &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
                    vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
                // Return 0 or 0xf
                &aml::Return::new(&aml::Local(0)),
            ],
        )
        .to_aml_bytes(sink);

        // Memory range method
        aml::Method::new(
            "MCRS".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Name::new(
                    "MR64".into(),
                    &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                        aml::AddressSpaceCacheable::Cacheable,
                        true,
                        0x0000_0000_0000_0000u64,
                        0xFFFF_FFFF_FFFF_FFFEu64,
                        None,
                    )]),
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("MINL"),
                    &aml::Path::new("MR64"),
                    &14usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("MINH"),
                    &aml::Path::new("MR64"),
                    &18usize,
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MR64"),
                    &22usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MR64"),
                    &26usize,
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("LENL"),
                    &aml::Path::new("MR64"),
                    &38usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("LENH"),
                    &aml::Path::new("MR64"),
                    &42usize,
                ),
                &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
                &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
                &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
                &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
                &aml::Add::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MINL"),
                    &aml::Path::new("LENL"),
                ),
                &aml::Add::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MINH"),
                    &aml::Path::new("LENH"),
                ),
                &aml::If::new(
                    &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
                    vec![&aml::Add::new(
                        &aml::Path::new("MAXH"),
                        &aml::ONE,
                        &aml::Path::new("MAXH"),
                    )],
                ),
                &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
                // Release lock
                &aml::Release::new("MLCK".into()),
                &aml::Return::new(&aml::Path::new("MR64")),
            ],
        )
        .to_aml_bytes(sink)
    }
}

impl Aml for MemoryManager {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        if let Some(acpi_address) = self.acpi_address {
            // Memory Hotplug Controller
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose slot and then read back status
                    &aml::Mutex::new("MLCK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCacheable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
                            None,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "MHPR".into(),
                        aml::OpRegionSpace::SystemMemory,
                        &(acpi_address.0 as usize),
                        &MEMORY_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
                            aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Reserved(128),
                            aml::FieldEntry::Named(*b"MHPX", 32), // PXM
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(160),
                            aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
                            aml::FieldEntry::Named(*b"MINS", 1), // Inserting
                            aml::FieldEntry::Named(*b"MRMV", 1), // Removing
                            aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MSEL", 32), // Selector
                            aml::FieldEntry::Named(*b"MOEV", 32), // Event
                            aml::FieldEntry::Named(*b"MOSC", 32), // OSC
                        ],
                    ),
                    &MemoryMethods {
                        slots: self.hotplug_slots.len(),
                    },
                    &MemorySlots {
                        slots: self.hotplug_slots.len(),
                    },
                ],
            )
            .to_aml_bytes(sink);
        } else {
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Empty MSCN for GED
                    &aml::Method::new("MSCN".into(), 0, true, vec![]),
                ],
            )
            .to_aml_bytes(sink);
        }

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_region) = &self.sgx_epc_region {
                let min = sgx_epc_region.start().raw_value();
                let max = min + sgx_epc_region.size() - 1;
                // SGX EPC region
                aml::Device::new(
                    "_SB_.EPC_".into(),
                    vec![
                        &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
                        // QWORD describing the EPC region start and size
                        &aml::Name::new(
                            "_CRS".into(),
                            &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                                aml::AddressSpaceCacheable::NotCacheable,
                                true,
                                min,
                                max,
                                None,
                            )]),
                        ),
                        &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
                    ],
                )
                .to_aml_bytes(sink);
            }
        }
    }
}

impl Pausable for MemoryManager {}

#[derive(Clone, Serialize, Deserialize)]
pub struct MemoryManagerSnapshotData {
    memory_ranges: MemoryRangeTable,
    guest_ram_mappings: Vec<GuestRamMapping>,
    start_of_device_area: u64,
    boot_ram: u64,
    current_ram: u64,
    arch_mem_regions: Vec<ArchMemRegion>,
    hotplug_slots: Vec<HotPlugState>,
    next_memory_slot: u32,
    selected_slot: usize,
    next_hotplug_slot: usize,
}

impl Snapshottable for MemoryManager {
    fn id(&self) -> String {
        MEMORY_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
        let memory_ranges = self.memory_range_table(true)?;

        // Store locally this list of ranges as it will be used through the
        // Transportable::send() implementation. The point is to avoid the
        // duplication of code regarding the creation of the path for each
        // region. The 'snapshot' step creates the list of memory regions,
        // including information about the need to copy a memory region or
        // not. This saves the 'send' step having to go through the same
        // process, and instead it can directly proceed with storing the
        // memory range content for the ranges requiring it.
        self.snapshot_memory_ranges = memory_ranges;

        Ok(Snapshot::from_data(SnapshotData::new_from_state(
            &self.snapshot_data(),
        )?))
    }
}

impl Transportable for MemoryManager {
    fn send(
        &self,
        _snapshot: &Snapshot,
        destination_url: &str,
    ) -> result::Result<(), MigratableError> {
        if self.snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let mut memory_file_path = url_to_path(destination_url)?;
        memory_file_path.push(String::from(SNAPSHOT_FILENAME));

        // Create the snapshot file for the entire memory
        let mut memory_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(memory_file_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let guest_memory = self.guest_memory.memory();

        for range in self.snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't write
            // the whole region at once, because we can't rely on the
            // write_all_to() implementation from vm-memory::GuestMemory,
            // as it is not following the correct behavior. For more info
            // about this issue see:
            // https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = guest_memory
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }
        Ok(())
    }
}

impl Migratable for MemoryManager {
    // Start the dirty log in the hypervisor (kvm/mshv).
    // Also, reset the dirty bitmap logged by the vmm.
    // Just before we do a bulk copy we want to start/clear the dirty log so that
    // pages touched during our bulk copy are tracked.
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.start_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
        })?;

        for r in self.guest_memory.memory().iter() {
            r.bitmap().reset();
        }

        Ok(())
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.stop_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
        })?;

        Ok(())
    }

    // Generate a table for the pages that are dirty. The dirty pages are collapsed
    // together in the table if they are contiguous.
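    // For each guest RAM mapping, the hypervisor's dirty log for the slot is
    // OR'ed with the VMM's own dirty bitmap (tracking writes performed by the
    // VMM itself), and the combined bitmap is converted into ranges at 4 KiB
    // granularity.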
    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();
        for r in &self.guest_ram_mappings {
            let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
                MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
            })?;
            let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
            {
                Some(region) => {
                    assert!(region.start_addr().raw_value() == r.gpa);
                    assert!(region.len() == r.size);
                    region.bitmap().get_and_reset()
                }
                None => {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Error finding 'guest memory region' with address {:x}",
                        r.gpa
                    )))
                }
            };

            let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
                .iter()
                .zip(vmm_dirty_bitmap.iter())
                .map(|(x, y)| x | y)
                .collect();

            let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);

            if sub_table.regions().is_empty() {
                info!("Dirty Memory Range Table is empty");
            } else {
                info!("Dirty Memory Range Table:");
                for range in sub_table.regions() {
                    info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
                }
            }

            table.extend(sub_table);
        }
        Ok(table)
    }
}