1 // Copyright © 2019 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 // 5 #[cfg(target_arch = "x86_64")] 6 use crate::config::SgxEpcConfig; 7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig}; 8 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 9 use crate::coredump::{ 10 CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError, 11 }; 12 use crate::migration::url_to_path; 13 use crate::MEMORY_MANAGER_SNAPSHOT_ID; 14 use crate::{GuestMemoryMmap, GuestRegionMmap}; 15 use acpi_tables::{aml, Aml}; 16 use anyhow::anyhow; 17 #[cfg(target_arch = "x86_64")] 18 use arch::x86_64::{SgxEpcRegion, SgxEpcSection}; 19 use arch::RegionType; 20 #[cfg(target_arch = "x86_64")] 21 use devices::ioapic; 22 #[cfg(target_arch = "aarch64")] 23 use hypervisor::HypervisorVmError; 24 use libc::_SC_NPROCESSORS_ONLN; 25 #[cfg(target_arch = "x86_64")] 26 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE}; 27 use serde::{Deserialize, Serialize}; 28 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 29 use std::collections::BTreeMap; 30 use std::collections::HashMap; 31 use std::fs::{File, OpenOptions}; 32 use std::io::{self}; 33 use std::ops::{BitAnd, Deref, Not, Sub}; 34 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 35 use std::os::fd::AsFd; 36 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; 37 use std::path::PathBuf; 38 use std::result; 39 use std::sync::{Arc, Barrier, Mutex}; 40 use std::{ffi, thread}; 41 use tracer::trace_scoped; 42 use versionize::{VersionMap, Versionize, VersionizeResult}; 43 use versionize_derive::Versionize; 44 use virtio_devices::BlocksState; 45 #[cfg(target_arch = "x86_64")] 46 use vm_allocator::GsiApic; 47 use vm_allocator::{AddressAllocator, SystemAllocator}; 48 use vm_device::BusDevice; 49 use vm_memory::bitmap::AtomicBitmap; 50 use vm_memory::guest_memory::FileOffset; 51 use vm_memory::{ 52 mmap::MmapRegionError, Address, Error as MmapError, GuestAddress, GuestAddressSpace, 53 GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, 54 ReadVolatile, 55 }; 56 use vm_migration::{ 57 protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable, 58 Snapshot, SnapshotData, Snapshottable, Transportable, VersionMapped, 59 }; 60 61 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18; 62 63 const DEFAULT_MEMORY_ZONE: &str = "mem0"; 64 65 const SNAPSHOT_FILENAME: &str = "memory-ranges"; 66 67 #[cfg(target_arch = "x86_64")] 68 const X86_64_IRQ_BASE: u32 = 5; 69 70 #[cfg(target_arch = "x86_64")] 71 const SGX_PAGE_SIZE: u64 = 1 << 12; 72 73 const HOTPLUG_COUNT: usize = 8; 74 75 // Memory policy constants 76 const MPOL_BIND: u32 = 2; 77 const MPOL_MF_STRICT: u32 = 1; 78 const MPOL_MF_MOVE: u32 = 1 << 1; 79 80 // Reserve 1 MiB for platform MMIO devices (e.g. 
ACPI control devices) 81 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20; 82 83 const MAX_PREFAULT_THREAD_COUNT: usize = 16; 84 85 #[derive(Clone, Default, Serialize, Deserialize, Versionize)] 86 struct HotPlugState { 87 base: u64, 88 length: u64, 89 active: bool, 90 inserting: bool, 91 removing: bool, 92 } 93 94 pub struct VirtioMemZone { 95 region: Arc<GuestRegionMmap>, 96 virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>, 97 hotplugged_size: u64, 98 hugepages: bool, 99 blocks_state: Arc<Mutex<BlocksState>>, 100 } 101 102 impl VirtioMemZone { 103 pub fn region(&self) -> &Arc<GuestRegionMmap> { 104 &self.region 105 } 106 pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) { 107 self.virtio_device = Some(virtio_device); 108 } 109 pub fn hotplugged_size(&self) -> u64 { 110 self.hotplugged_size 111 } 112 pub fn hugepages(&self) -> bool { 113 self.hugepages 114 } 115 pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> { 116 &self.blocks_state 117 } 118 pub fn plugged_ranges(&self) -> MemoryRangeTable { 119 self.blocks_state 120 .lock() 121 .unwrap() 122 .memory_ranges(self.region.start_addr().raw_value(), true) 123 } 124 } 125 126 #[derive(Default)] 127 pub struct MemoryZone { 128 regions: Vec<Arc<GuestRegionMmap>>, 129 virtio_mem_zone: Option<VirtioMemZone>, 130 } 131 132 impl MemoryZone { 133 pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> { 134 &self.regions 135 } 136 pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> { 137 &self.virtio_mem_zone 138 } 139 pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> { 140 self.virtio_mem_zone.as_mut() 141 } 142 } 143 144 pub type MemoryZones = HashMap<String, MemoryZone>; 145 146 #[derive(Clone, Serialize, Deserialize, Versionize)] 147 struct GuestRamMapping { 148 slot: u32, 149 gpa: u64, 150 size: u64, 151 zone_id: String, 152 virtio_mem: bool, 153 file_offset: u64, 154 } 155 156 #[derive(Clone, Serialize, Deserialize, Versionize)] 157 struct ArchMemRegion { 158 base: u64, 159 size: usize, 160 r_type: RegionType, 161 } 162 163 pub struct MemoryManager { 164 boot_guest_memory: GuestMemoryMmap, 165 guest_memory: GuestMemoryAtomic<GuestMemoryMmap>, 166 next_memory_slot: u32, 167 start_of_device_area: GuestAddress, 168 end_of_device_area: GuestAddress, 169 end_of_ram_area: GuestAddress, 170 pub vm: Arc<dyn hypervisor::Vm>, 171 hotplug_slots: Vec<HotPlugState>, 172 selected_slot: usize, 173 mergeable: bool, 174 allocator: Arc<Mutex<SystemAllocator>>, 175 hotplug_method: HotplugMethod, 176 boot_ram: u64, 177 current_ram: u64, 178 next_hotplug_slot: usize, 179 shared: bool, 180 hugepages: bool, 181 hugepage_size: Option<u64>, 182 prefault: bool, 183 thp: bool, 184 #[cfg(target_arch = "x86_64")] 185 sgx_epc_region: Option<SgxEpcRegion>, 186 user_provided_zones: bool, 187 snapshot_memory_ranges: MemoryRangeTable, 188 memory_zones: MemoryZones, 189 log_dirty: bool, // Enable dirty logging for created RAM regions 190 arch_mem_regions: Vec<ArchMemRegion>, 191 ram_allocator: AddressAllocator, 192 dynamic: bool, 193 194 // Keep track of calls to create_userspace_mapping() for guest RAM. 195 // This is useful for getting the dirty pages as we need to know the 196 // slots that the mapping is created in. 197 guest_ram_mappings: Vec<GuestRamMapping>, 198 199 pub acpi_address: Option<GuestAddress>, 200 #[cfg(target_arch = "aarch64")] 201 uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>, 202 } 203 204 #[derive(Debug)] 205 pub enum Error { 206 /// Failed to create shared file. 
207 SharedFileCreate(io::Error), 208 209 /// Failed to set shared file length. 210 SharedFileSetLen(io::Error), 211 212 /// Mmap backed guest memory error 213 GuestMemory(MmapError), 214 215 /// Failed to allocate a memory range. 216 MemoryRangeAllocation, 217 218 /// Error from region creation 219 GuestMemoryRegion(MmapRegionError), 220 221 /// No ACPI slot available 222 NoSlotAvailable, 223 224 /// Not enough space in the hotplug RAM region 225 InsufficientHotplugRam, 226 227 /// The requested hotplug memory addition is not a valid size 228 InvalidSize, 229 230 /// Failed to create the user memory region. 231 CreateUserMemoryRegion(hypervisor::HypervisorVmError), 232 233 /// Failed to remove the user memory region. 234 RemoveUserMemoryRegion(hypervisor::HypervisorVmError), 235 236 /// Failed to EventFd. 237 EventFdFail(io::Error), 238 239 /// Eventfd write error 240 EventfdError(io::Error), 241 242 /// Failed to virtio-mem resize 243 VirtioMemResizeFail(virtio_devices::mem::Error), 244 245 /// Cannot restore VM 246 Restore(MigratableError), 247 248 /// Cannot restore VM because source URL is missing 249 RestoreMissingSourceUrl, 250 251 /// Cannot create the system allocator 252 CreateSystemAllocator, 253 254 /// Invalid SGX EPC section size 255 #[cfg(target_arch = "x86_64")] 256 EpcSectionSizeInvalid, 257 258 /// Failed allocating SGX EPC region 259 #[cfg(target_arch = "x86_64")] 260 SgxEpcRangeAllocation, 261 262 /// Failed opening SGX virtual EPC device 263 #[cfg(target_arch = "x86_64")] 264 SgxVirtEpcOpen(io::Error), 265 266 /// Failed setting the SGX virtual EPC section size 267 #[cfg(target_arch = "x86_64")] 268 SgxVirtEpcFileSetLen(io::Error), 269 270 /// Failed opening SGX provisioning device 271 #[cfg(target_arch = "x86_64")] 272 SgxProvisionOpen(io::Error), 273 274 /// Failed enabling SGX provisioning 275 #[cfg(target_arch = "x86_64")] 276 SgxEnableProvisioning(hypervisor::HypervisorVmError), 277 278 /// Failed creating a new MmapRegion instance. 279 #[cfg(target_arch = "x86_64")] 280 NewMmapRegion(vm_memory::mmap::MmapRegionError), 281 282 /// No memory zones found. 283 MissingMemoryZones, 284 285 /// Memory configuration is not valid. 286 InvalidMemoryParameters, 287 288 /// Forbidden operation. Impossible to resize guest memory if it is 289 /// backed by user defined memory regions. 290 InvalidResizeWithMemoryZones, 291 292 /// It's invalid to try applying a NUMA policy to a memory zone that is 293 /// memory mapped with MAP_SHARED. 294 InvalidSharedMemoryZoneWithHostNuma, 295 296 /// Failed applying NUMA memory policy. 297 ApplyNumaPolicy(io::Error), 298 299 /// Memory zone identifier is not unique. 300 DuplicateZoneId, 301 302 /// No virtio-mem resizing handler found. 303 MissingVirtioMemHandler, 304 305 /// Unknown memory zone. 306 UnknownMemoryZone, 307 308 /// Invalid size for resizing. Can be anything except 0. 309 InvalidHotplugSize, 310 311 /// Invalid hotplug method associated with memory zones resizing capability. 312 InvalidHotplugMethodWithMemoryZones, 313 314 /// Could not find specified memory zone identifier from hash map. 315 MissingZoneIdentifier, 316 317 /// Resizing the memory zone failed. 
318 ResizeZone, 319 320 /// Guest address overflow 321 GuestAddressOverFlow, 322 323 /// Error opening snapshot file 324 SnapshotOpen(io::Error), 325 326 // Error copying snapshot into region 327 SnapshotCopy(GuestMemoryError), 328 329 /// Failed to allocate MMIO address 330 AllocateMmioAddress, 331 332 #[cfg(target_arch = "aarch64")] 333 /// Failed to create UEFI flash 334 CreateUefiFlash(HypervisorVmError), 335 336 /// Using a directory as a backing file for memory is not supported 337 DirectoryAsBackingFileForMemory, 338 339 /// Failed to stat filesystem 340 GetFileSystemBlockSize(io::Error), 341 342 /// Memory size is misaligned with default page size or its hugepage size 343 MisalignedMemorySize, 344 } 345 346 const ENABLE_FLAG: usize = 0; 347 const INSERTING_FLAG: usize = 1; 348 const REMOVING_FLAG: usize = 2; 349 const EJECT_FLAG: usize = 3; 350 351 const BASE_OFFSET_LOW: u64 = 0; 352 const BASE_OFFSET_HIGH: u64 = 0x4; 353 const LENGTH_OFFSET_LOW: u64 = 0x8; 354 const LENGTH_OFFSET_HIGH: u64 = 0xC; 355 const STATUS_OFFSET: u64 = 0x14; 356 const SELECTION_OFFSET: u64 = 0; 357 358 // The MMIO address space size is subtracted with 64k. This is done for the 359 // following reasons: 360 // - Reduce the addressable space size by at least 4k to workaround a Linux 361 // bug when the VMM allocates devices at the end of the addressable space 362 // - Windows requires the addressable space size to be 64k aligned 363 fn mmio_address_space_size(phys_bits: u8) -> u64 { 364 (1 << phys_bits) - (1 << 16) 365 } 366 367 // The `statfs` function can get information of hugetlbfs, and the hugepage size is in the 368 // `f_bsize` field. 369 // 370 // See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169 371 fn statfs_get_bsize(path: &str) -> Result<u64, Error> { 372 let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?; 373 let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit(); 374 375 // SAFETY: FFI call with a valid path and buffer 376 let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) }; 377 if ret != 0 { 378 return Err(Error::GetFileSystemBlockSize( 379 std::io::Error::last_os_error(), 380 )); 381 } 382 383 // SAFETY: `buf` is valid at this point 384 // Because this value is always positive, just convert it directly. 385 // Note that the `f_bsize` is `i64` in glibc and `u64` in musl, using `as u64` will be warned 386 // by `clippy` on musl target. To avoid the warning, there should be `as _` instead of 387 // `as u64`. 388 let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _; 389 Ok(bsize) 390 } 391 392 fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> { 393 // SAFETY: FFI call. Trivially safe. 394 let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 }; 395 396 // There is no backend file and the `hugepages` is disabled, just use system page size. 397 if zone.file.is_none() && !zone.hugepages { 398 return Ok(page_size); 399 } 400 401 // The `hugepages` is enabled and the `hugepage_size` is specified, just use it directly. 402 if zone.hugepages && zone.hugepage_size.is_some() { 403 return Ok(zone.hugepage_size.unwrap()); 404 } 405 406 // There are two scenarios here: 407 // - `hugepages` is enabled but `hugepage_size` is not specified: 408 // Call `statfs` for `/dev/hugepages` for getting the default size of hugepage 409 // - The backing file is specified: 410 // Call `statfs` for the file and get its `f_bsize`. 
If the value is larger than the page 411 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the 412 // value is less than or equal to the page size, just use the page size. 413 let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| { 414 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters) 415 })?; 416 417 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?); 418 419 Ok(align_size) 420 } 421 422 #[inline] 423 fn align_down<T>(val: T, align: T) -> T 424 where 425 T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>, 426 { 427 val & !(align - 1u8.into()) 428 } 429 430 #[inline] 431 fn is_aligned<T>(val: T, align: T) -> bool 432 where 433 T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq, 434 { 435 (val & (align - 1u8.into())) == 0u8.into() 436 } 437 438 impl BusDevice for MemoryManager { 439 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 440 if self.selected_slot < self.hotplug_slots.len() { 441 let state = &self.hotplug_slots[self.selected_slot]; 442 match offset { 443 BASE_OFFSET_LOW => { 444 data.copy_from_slice(&state.base.to_le_bytes()[..4]); 445 } 446 BASE_OFFSET_HIGH => { 447 data.copy_from_slice(&state.base.to_le_bytes()[4..]); 448 } 449 LENGTH_OFFSET_LOW => { 450 data.copy_from_slice(&state.length.to_le_bytes()[..4]); 451 } 452 LENGTH_OFFSET_HIGH => { 453 data.copy_from_slice(&state.length.to_le_bytes()[4..]); 454 } 455 STATUS_OFFSET => { 456 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 457 data.fill(0); 458 if state.active { 459 data[0] |= 1 << ENABLE_FLAG; 460 } 461 if state.inserting { 462 data[0] |= 1 << INSERTING_FLAG; 463 } 464 if state.removing { 465 data[0] |= 1 << REMOVING_FLAG; 466 } 467 } 468 _ => { 469 warn!( 470 "Unexpected offset for accessing memory manager device: {:#}", 471 offset 472 ); 473 } 474 } 475 } else { 476 warn!("Out of range memory slot: {}", self.selected_slot); 477 } 478 } 479 480 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 481 match offset { 482 SELECTION_OFFSET => { 483 self.selected_slot = usize::from(data[0]); 484 } 485 STATUS_OFFSET => { 486 if self.selected_slot < self.hotplug_slots.len() { 487 let state = &mut self.hotplug_slots[self.selected_slot]; 488 // The ACPI code writes back a 1 to acknowledge the insertion 489 if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting { 490 state.inserting = false; 491 } 492 // Ditto for removal 493 if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing { 494 state.removing = false; 495 } 496 // Trigger removal of "DIMM" 497 if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG { 498 warn!("Ejection of memory not currently supported"); 499 } 500 } else { 501 warn!("Out of range memory slot: {}", self.selected_slot); 502 } 503 } 504 _ => { 505 warn!( 506 "Unexpected offset for accessing memory manager device: {:#}", 507 offset 508 ); 509 } 510 }; 511 None 512 } 513 } 514 515 impl MemoryManager { 516 /// Creates all memory regions based on the available RAM ranges defined 517 /// by `ram_regions`, and based on the description of the memory zones. 518 /// In practice, this function can perform multiple memory mappings of the 519 /// same backing file if there's a hole in the address space between two 520 /// RAM ranges. 521 /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G) 522 /// and zones containing two zones (size 1G and size 4G). 
523 /// This function will create 3 resulting memory regions: 524 /// - First one mapping entirely the first memory zone on 0-1G range 525 /// - Second one mapping partially the second memory zone on 1G-3G range 526 /// - Third one mapping partially the second memory zone on 4G-6G range 527 /// Also, all memory regions are page-size aligned (e.g. their sizes must 528 /// be multiple of page-size), which may leave an additional hole in the 529 /// address space when hugepage is used. 530 fn create_memory_regions_from_zones( 531 ram_regions: &[(GuestAddress, usize)], 532 zones: &[MemoryZoneConfig], 533 prefault: Option<bool>, 534 thp: bool, 535 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 536 let mut zone_iter = zones.iter(); 537 let mut mem_regions = Vec::new(); 538 let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?; 539 let mut zone_align_size = memory_zone_get_align_size(zone)?; 540 let mut zone_offset = 0u64; 541 let mut memory_zones = HashMap::new(); 542 543 if !is_aligned(zone.size, zone_align_size) { 544 return Err(Error::MisalignedMemorySize); 545 } 546 547 // Add zone id to the list of memory zones. 548 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 549 550 for ram_region in ram_regions.iter() { 551 let mut ram_region_offset = 0; 552 let mut exit = false; 553 554 loop { 555 let mut ram_region_consumed = false; 556 let mut pull_next_zone = false; 557 558 let ram_region_available_size = 559 align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size); 560 if ram_region_available_size == 0 { 561 break; 562 } 563 let zone_sub_size = zone.size - zone_offset; 564 565 let file_offset = zone_offset; 566 let region_start = ram_region 567 .0 568 .checked_add(ram_region_offset) 569 .ok_or(Error::GuestAddressOverFlow)?; 570 let region_size = if zone_sub_size <= ram_region_available_size { 571 if zone_sub_size == ram_region_available_size { 572 ram_region_consumed = true; 573 } 574 575 ram_region_offset += zone_sub_size; 576 pull_next_zone = true; 577 578 zone_sub_size 579 } else { 580 zone_offset += ram_region_available_size; 581 ram_region_consumed = true; 582 583 ram_region_available_size 584 }; 585 586 info!( 587 "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}", 588 zone.id, 589 region_start.raw_value(), 590 region_size 591 ); 592 let region = MemoryManager::create_ram_region( 593 &zone.file, 594 file_offset, 595 region_start, 596 region_size as usize, 597 prefault.unwrap_or(zone.prefault), 598 zone.shared, 599 zone.hugepages, 600 zone.hugepage_size, 601 zone.host_numa_node, 602 None, 603 thp, 604 )?; 605 606 // Add region to the list of regions associated with the 607 // current memory zone. 608 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 609 memory_zone.regions.push(region.clone()); 610 } 611 612 mem_regions.push(region); 613 614 if pull_next_zone { 615 // Get the next zone and reset the offset. 616 zone_offset = 0; 617 if let Some(z) = zone_iter.next() { 618 zone = z; 619 } else { 620 exit = true; 621 break; 622 } 623 zone_align_size = memory_zone_get_align_size(zone)?; 624 if !is_aligned(zone.size, zone_align_size) { 625 return Err(Error::MisalignedMemorySize); 626 } 627 628 // Check if zone id already exist. In case it does, throw 629 // an error as we need unique identifiers. Otherwise, add 630 // the new zone id to the list of memory zones. 631 if memory_zones.contains_key(&zone.id) { 632 error!( 633 "Memory zone identifier '{}' found more than once. 
\ 634 It must be unique", 635 zone.id, 636 ); 637 return Err(Error::DuplicateZoneId); 638 } 639 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 640 } 641 642 if ram_region_consumed { 643 break; 644 } 645 } 646 647 if exit { 648 break; 649 } 650 } 651 652 Ok((mem_regions, memory_zones)) 653 } 654 655 // Restore both GuestMemory regions along with MemoryZone zones. 656 fn restore_memory_regions_and_zones( 657 guest_ram_mappings: &[GuestRamMapping], 658 zones_config: &[MemoryZoneConfig], 659 prefault: Option<bool>, 660 mut existing_memory_files: HashMap<u32, File>, 661 thp: bool, 662 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 663 let mut memory_regions = Vec::new(); 664 let mut memory_zones = HashMap::new(); 665 666 for zone_config in zones_config { 667 memory_zones.insert(zone_config.id.clone(), MemoryZone::default()); 668 } 669 670 for guest_ram_mapping in guest_ram_mappings { 671 for zone_config in zones_config { 672 if guest_ram_mapping.zone_id == zone_config.id { 673 let region = MemoryManager::create_ram_region( 674 if guest_ram_mapping.virtio_mem { 675 &None 676 } else { 677 &zone_config.file 678 }, 679 guest_ram_mapping.file_offset, 680 GuestAddress(guest_ram_mapping.gpa), 681 guest_ram_mapping.size as usize, 682 prefault.unwrap_or(zone_config.prefault), 683 zone_config.shared, 684 zone_config.hugepages, 685 zone_config.hugepage_size, 686 zone_config.host_numa_node, 687 existing_memory_files.remove(&guest_ram_mapping.slot), 688 thp, 689 )?; 690 memory_regions.push(Arc::clone(®ion)); 691 if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) { 692 if guest_ram_mapping.virtio_mem { 693 let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0); 694 let region_size = region.len(); 695 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 696 region, 697 virtio_device: None, 698 hotplugged_size, 699 hugepages: zone_config.hugepages, 700 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 701 }); 702 } else { 703 memory_zone.regions.push(region); 704 } 705 } 706 } 707 } 708 } 709 710 memory_regions.sort_by_key(|x| x.start_addr()); 711 712 Ok((memory_regions, memory_zones)) 713 } 714 715 fn fill_saved_regions( 716 &mut self, 717 file_path: PathBuf, 718 saved_regions: MemoryRangeTable, 719 ) -> Result<(), Error> { 720 if saved_regions.is_empty() { 721 return Ok(()); 722 } 723 724 // Open (read only) the snapshot file. 725 let mut memory_file = OpenOptions::new() 726 .read(true) 727 .open(file_path) 728 .map_err(Error::SnapshotOpen)?; 729 730 let guest_memory = self.guest_memory.memory(); 731 for range in saved_regions.regions() { 732 let mut offset: u64 = 0; 733 // Here we are manually handling the retry in case we can't write 734 // the whole region at once because we can't use the implementation 735 // from vm-memory::GuestMemory of read_exact_from() as it is not 736 // following the correct behavior. 
For more info about this issue 737 // see: https://github.com/rust-vmm/vm-memory/issues/174 738 loop { 739 let bytes_read = guest_memory 740 .read_volatile_from( 741 GuestAddress(range.gpa + offset), 742 &mut memory_file, 743 (range.length - offset) as usize, 744 ) 745 .map_err(Error::SnapshotCopy)?; 746 offset += bytes_read as u64; 747 748 if offset == range.length { 749 break; 750 } 751 } 752 } 753 754 Ok(()) 755 } 756 757 fn validate_memory_config( 758 config: &MemoryConfig, 759 user_provided_zones: bool, 760 ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> { 761 let mut allow_mem_hotplug = false; 762 763 if !user_provided_zones { 764 if config.zones.is_some() { 765 error!( 766 "User defined memory regions can't be provided if the \ 767 memory size is not 0" 768 ); 769 return Err(Error::InvalidMemoryParameters); 770 } 771 772 if config.hotplug_size.is_some() { 773 allow_mem_hotplug = true; 774 } 775 776 if let Some(hotplugged_size) = config.hotplugged_size { 777 if let Some(hotplug_size) = config.hotplug_size { 778 if hotplugged_size > hotplug_size { 779 error!( 780 "'hotplugged_size' {} can't be bigger than \ 781 'hotplug_size' {}", 782 hotplugged_size, hotplug_size, 783 ); 784 return Err(Error::InvalidMemoryParameters); 785 } 786 } else { 787 error!( 788 "Invalid to define 'hotplugged_size' when there is\ 789 no 'hotplug_size'" 790 ); 791 return Err(Error::InvalidMemoryParameters); 792 } 793 if config.hotplug_method == HotplugMethod::Acpi { 794 error!( 795 "Invalid to define 'hotplugged_size' with hotplug \ 796 method 'acpi'" 797 ); 798 return Err(Error::InvalidMemoryParameters); 799 } 800 } 801 802 // Create a single zone from the global memory config. This lets 803 // us reuse the codepath for user defined memory zones. 804 let zones = vec![MemoryZoneConfig { 805 id: String::from(DEFAULT_MEMORY_ZONE), 806 size: config.size, 807 file: None, 808 shared: config.shared, 809 hugepages: config.hugepages, 810 hugepage_size: config.hugepage_size, 811 host_numa_node: None, 812 hotplug_size: config.hotplug_size, 813 hotplugged_size: config.hotplugged_size, 814 prefault: config.prefault, 815 }]; 816 817 Ok((config.size, zones, allow_mem_hotplug)) 818 } else { 819 if config.zones.is_none() { 820 error!( 821 "User defined memory regions must be provided if the \ 822 memory size is 0" 823 ); 824 return Err(Error::MissingMemoryZones); 825 } 826 827 // Safe to unwrap as we checked right above there were some 828 // regions. 
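            // To make the checks below concrete, here is a minimal sketch of the
            // kind of input this branch validates (the zone ids and sizes are made
            // up for illustration, not taken from a real configuration):
            /*
                let zones = vec![
                    MemoryZoneConfig {
                        id: "fast".to_string(),
                        size: 1 << 30, // 1 GiB
                        file: None,
                        shared: false,
                        hugepages: false,
                        hugepage_size: None,
                        host_numa_node: Some(0),
                        hotplug_size: None,
                        hotplugged_size: None,
                        prefault: false,
                    },
                    MemoryZoneConfig {
                        id: "slow".to_string(),
                        size: 2 << 30, // 2 GiB
                        file: None,
                        shared: false,
                        hugepages: false,
                        hugepage_size: None,
                        host_numa_node: Some(1),
                        hotplug_size: Some(1 << 30),
                        hotplugged_size: None,
                        prefault: false,
                    },
                ];
                // With hotplug_method = HotplugMethod::VirtioMem this passes and the
                // function returns total_ram_size = 3 GiB. With HotplugMethod::Acpi,
                // the per-zone 'hotplug_size' is rejected with
                // Error::InvalidHotplugMethodWithMemoryZones.
            */
            // (The unwrap just below remains covered by the check mentioned above.)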
829 let zones = config.zones.clone().unwrap(); 830 if zones.is_empty() { 831 return Err(Error::MissingMemoryZones); 832 } 833 834 let mut total_ram_size: u64 = 0; 835 for zone in zones.iter() { 836 total_ram_size += zone.size; 837 838 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() { 839 error!( 840 "Invalid to set host NUMA policy for a memory zone \ 841 backed by a regular file and mapped as 'shared'" 842 ); 843 return Err(Error::InvalidSharedMemoryZoneWithHostNuma); 844 } 845 846 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi { 847 error!("Invalid to set ACPI hotplug method for memory zones"); 848 return Err(Error::InvalidHotplugMethodWithMemoryZones); 849 } 850 851 if let Some(hotplugged_size) = zone.hotplugged_size { 852 if let Some(hotplug_size) = zone.hotplug_size { 853 if hotplugged_size > hotplug_size { 854 error!( 855 "'hotplugged_size' {} can't be bigger than \ 856 'hotplug_size' {}", 857 hotplugged_size, hotplug_size, 858 ); 859 return Err(Error::InvalidMemoryParameters); 860 } 861 } else { 862 error!( 863 "Invalid to define 'hotplugged_size' when there is\ 864 no 'hotplug_size' for a memory zone" 865 ); 866 return Err(Error::InvalidMemoryParameters); 867 } 868 if config.hotplug_method == HotplugMethod::Acpi { 869 error!( 870 "Invalid to define 'hotplugged_size' with hotplug \ 871 method 'acpi'" 872 ); 873 return Err(Error::InvalidMemoryParameters); 874 } 875 } 876 } 877 878 Ok((total_ram_size, zones, allow_mem_hotplug)) 879 } 880 } 881 882 pub fn allocate_address_space(&mut self) -> Result<(), Error> { 883 let mut list = Vec::new(); 884 885 for (zone_id, memory_zone) in self.memory_zones.iter() { 886 let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> = 887 memory_zone 888 .regions() 889 .iter() 890 .map(|r| (r.clone(), false)) 891 .collect(); 892 893 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 894 regions.push((virtio_mem_zone.region().clone(), true)); 895 } 896 897 list.push((zone_id.clone(), regions)); 898 } 899 900 for (zone_id, regions) in list { 901 for (region, virtio_mem) in regions { 902 let slot = self.create_userspace_mapping( 903 region.start_addr().raw_value(), 904 region.len(), 905 region.as_ptr() as u64, 906 self.mergeable, 907 false, 908 self.log_dirty, 909 )?; 910 911 let file_offset = if let Some(file_offset) = region.file_offset() { 912 file_offset.start() 913 } else { 914 0 915 }; 916 917 self.guest_ram_mappings.push(GuestRamMapping { 918 gpa: region.start_addr().raw_value(), 919 size: region.len(), 920 slot, 921 zone_id: zone_id.clone(), 922 virtio_mem, 923 file_offset, 924 }); 925 self.ram_allocator 926 .allocate(Some(region.start_addr()), region.len(), None) 927 .ok_or(Error::MemoryRangeAllocation)?; 928 } 929 } 930 931 // Allocate SubRegion and Reserved address ranges. 932 for region in self.arch_mem_regions.iter() { 933 if region.r_type == RegionType::Ram { 934 // Ignore the RAM type since ranges have already been allocated 935 // based on the GuestMemory regions. 936 continue; 937 } 938 self.ram_allocator 939 .allocate( 940 Some(GuestAddress(region.base)), 941 region.size as GuestUsize, 942 None, 943 ) 944 .ok_or(Error::MemoryRangeAllocation)?; 945 } 946 947 Ok(()) 948 } 949 950 #[cfg(target_arch = "aarch64")] 951 fn add_uefi_flash(&mut self) -> Result<(), Error> { 952 // On AArch64, the UEFI binary requires a flash device at address 0. 953 // 4 MiB memory is mapped to simulate the flash. 
954 let uefi_mem_slot = self.allocate_memory_slot(); 955 let uefi_region = GuestRegionMmap::new( 956 MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(), 957 arch::layout::UEFI_START, 958 ) 959 .unwrap(); 960 let uefi_mem_region = self.vm.make_user_memory_region( 961 uefi_mem_slot, 962 uefi_region.start_addr().raw_value(), 963 uefi_region.len(), 964 uefi_region.as_ptr() as u64, 965 false, 966 false, 967 ); 968 self.vm 969 .create_user_memory_region(uefi_mem_region) 970 .map_err(Error::CreateUefiFlash)?; 971 972 let uefi_flash = 973 GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap()); 974 975 self.uefi_flash = Some(uefi_flash); 976 977 Ok(()) 978 } 979 980 #[allow(clippy::too_many_arguments)] 981 pub fn new( 982 vm: Arc<dyn hypervisor::Vm>, 983 config: &MemoryConfig, 984 prefault: Option<bool>, 985 phys_bits: u8, 986 #[cfg(feature = "tdx")] tdx_enabled: bool, 987 restore_data: Option<&MemoryManagerSnapshotData>, 988 existing_memory_files: Option<HashMap<u32, File>>, 989 #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>, 990 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 991 trace_scoped!("MemoryManager::new"); 992 993 let user_provided_zones = config.size == 0; 994 995 let mmio_address_space_size = mmio_address_space_size(phys_bits); 996 debug_assert_eq!( 997 (((mmio_address_space_size) >> 16) << 16), 998 mmio_address_space_size 999 ); 1000 let start_of_platform_device_area = 1001 GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE); 1002 let end_of_device_area = start_of_platform_device_area.unchecked_sub(1); 1003 1004 let (ram_size, zones, allow_mem_hotplug) = 1005 Self::validate_memory_config(config, user_provided_zones)?; 1006 1007 let ( 1008 start_of_device_area, 1009 boot_ram, 1010 current_ram, 1011 arch_mem_regions, 1012 memory_zones, 1013 guest_memory, 1014 boot_guest_memory, 1015 hotplug_slots, 1016 next_memory_slot, 1017 selected_slot, 1018 next_hotplug_slot, 1019 ) = if let Some(data) = restore_data { 1020 let (regions, memory_zones) = Self::restore_memory_regions_and_zones( 1021 &data.guest_ram_mappings, 1022 &zones, 1023 prefault, 1024 existing_memory_files.unwrap_or_default(), 1025 config.thp, 1026 )?; 1027 let guest_memory = 1028 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?; 1029 let boot_guest_memory = guest_memory.clone(); 1030 ( 1031 GuestAddress(data.start_of_device_area), 1032 data.boot_ram, 1033 data.current_ram, 1034 data.arch_mem_regions.clone(), 1035 memory_zones, 1036 guest_memory, 1037 boot_guest_memory, 1038 data.hotplug_slots.clone(), 1039 data.next_memory_slot, 1040 data.selected_slot, 1041 data.next_hotplug_slot, 1042 ) 1043 } else { 1044 // Init guest memory 1045 let arch_mem_regions = arch::arch_memory_regions(); 1046 1047 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions 1048 .iter() 1049 .filter(|r| r.2 == RegionType::Ram) 1050 .map(|r| (r.0, r.1)) 1051 .collect(); 1052 1053 let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions 1054 .iter() 1055 .map(|(a, b, c)| ArchMemRegion { 1056 base: a.0, 1057 size: *b, 1058 r_type: *c, 1059 }) 1060 .collect(); 1061 1062 let (mem_regions, mut memory_zones) = 1063 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?; 1064 1065 let mut guest_memory = 1066 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?; 1067 1068 let boot_guest_memory = guest_memory.clone(); 1069 1070 let mut start_of_device_area = 1071 
MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?; 1072 1073 // Update list of memory zones for resize. 1074 for zone in zones.iter() { 1075 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 1076 if let Some(hotplug_size) = zone.hotplug_size { 1077 if hotplug_size == 0 { 1078 error!("'hotplug_size' can't be 0"); 1079 return Err(Error::InvalidHotplugSize); 1080 } 1081 1082 if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi { 1083 start_of_device_area = start_of_device_area 1084 .checked_add(hotplug_size) 1085 .ok_or(Error::GuestAddressOverFlow)?; 1086 } else { 1087 // Alignment must be "natural" i.e. same as size of block 1088 let start_addr = GuestAddress( 1089 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE 1090 - 1) 1091 / virtio_devices::VIRTIO_MEM_ALIGN_SIZE 1092 * virtio_devices::VIRTIO_MEM_ALIGN_SIZE, 1093 ); 1094 1095 // When `prefault` is set by vm_restore, memory manager 1096 // will create ram region with `prefault` option in 1097 // restore config rather than same option in zone 1098 let region = MemoryManager::create_ram_region( 1099 &None, 1100 0, 1101 start_addr, 1102 hotplug_size as usize, 1103 prefault.unwrap_or(zone.prefault), 1104 zone.shared, 1105 zone.hugepages, 1106 zone.hugepage_size, 1107 zone.host_numa_node, 1108 None, 1109 config.thp, 1110 )?; 1111 1112 guest_memory = guest_memory 1113 .insert_region(Arc::clone(®ion)) 1114 .map_err(Error::GuestMemory)?; 1115 1116 let hotplugged_size = zone.hotplugged_size.unwrap_or(0); 1117 let region_size = region.len(); 1118 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 1119 region, 1120 virtio_device: None, 1121 hotplugged_size, 1122 hugepages: zone.hugepages, 1123 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 1124 }); 1125 1126 start_of_device_area = start_addr 1127 .checked_add(hotplug_size) 1128 .ok_or(Error::GuestAddressOverFlow)?; 1129 } 1130 } 1131 } else { 1132 return Err(Error::MissingZoneIdentifier); 1133 } 1134 } 1135 1136 let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT); 1137 hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default); 1138 1139 ( 1140 start_of_device_area, 1141 ram_size, 1142 ram_size, 1143 arch_mem_regions, 1144 memory_zones, 1145 guest_memory, 1146 boot_guest_memory, 1147 hotplug_slots, 1148 0, 1149 0, 1150 0, 1151 ) 1152 }; 1153 1154 let guest_memory = GuestMemoryAtomic::new(guest_memory); 1155 1156 // Both MMIO and PIO address spaces start at address 0. 
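        // Before the allocator is built, a worked example of the layout computed
        // above, assuming phys_bits = 40 (a sketch with illustrative numbers, not
        // an invariant of the code):
        /*
            let phys_bits: u8 = 40;
            // The top 64 KiB of the addressable space is left unused (see
            // mmio_address_space_size() above).
            let mmio_size = mmio_address_space_size(phys_bits);
            assert_eq!(mmio_size, 0xff_ffff_0000);
            // The last 1 MiB below that boundary is reserved for platform devices.
            let platform_start = mmio_size - PLATFORM_DEVICE_AREA_SIZE;
            assert_eq!(platform_start, 0xff_feff_0000);
            // Regular devices may be placed up to and including this address.
            assert_eq!(platform_start - 1, 0xff_fefe_ffff);
        */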
1157 let allocator = Arc::new(Mutex::new( 1158 SystemAllocator::new( 1159 #[cfg(target_arch = "x86_64")] 1160 { 1161 GuestAddress(0) 1162 }, 1163 #[cfg(target_arch = "x86_64")] 1164 { 1165 1 << 16 1166 }, 1167 start_of_platform_device_area, 1168 PLATFORM_DEVICE_AREA_SIZE, 1169 #[cfg(target_arch = "x86_64")] 1170 vec![GsiApic::new( 1171 X86_64_IRQ_BASE, 1172 ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, 1173 )], 1174 ) 1175 .ok_or(Error::CreateSystemAllocator)?, 1176 )); 1177 1178 #[cfg(not(feature = "tdx"))] 1179 let dynamic = true; 1180 #[cfg(feature = "tdx")] 1181 let dynamic = !tdx_enabled; 1182 1183 let acpi_address = if dynamic 1184 && config.hotplug_method == HotplugMethod::Acpi 1185 && (config.hotplug_size.unwrap_or_default() > 0) 1186 { 1187 Some( 1188 allocator 1189 .lock() 1190 .unwrap() 1191 .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None) 1192 .ok_or(Error::AllocateMmioAddress)?, 1193 ) 1194 } else { 1195 None 1196 }; 1197 1198 // If running on SGX the start of device area and RAM area may diverge but 1199 // at this point they are next to each other. 1200 let end_of_ram_area = start_of_device_area.unchecked_sub(1); 1201 let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap(); 1202 1203 let mut memory_manager = MemoryManager { 1204 boot_guest_memory, 1205 guest_memory, 1206 next_memory_slot, 1207 start_of_device_area, 1208 end_of_device_area, 1209 end_of_ram_area, 1210 vm, 1211 hotplug_slots, 1212 selected_slot, 1213 mergeable: config.mergeable, 1214 allocator, 1215 hotplug_method: config.hotplug_method, 1216 boot_ram, 1217 current_ram, 1218 next_hotplug_slot, 1219 shared: config.shared, 1220 hugepages: config.hugepages, 1221 hugepage_size: config.hugepage_size, 1222 prefault: config.prefault, 1223 #[cfg(target_arch = "x86_64")] 1224 sgx_epc_region: None, 1225 user_provided_zones, 1226 snapshot_memory_ranges: MemoryRangeTable::default(), 1227 memory_zones, 1228 guest_ram_mappings: Vec::new(), 1229 acpi_address, 1230 log_dirty: dynamic, // Cannot log dirty pages on a TD 1231 arch_mem_regions, 1232 ram_allocator, 1233 dynamic, 1234 #[cfg(target_arch = "aarch64")] 1235 uefi_flash: None, 1236 thp: config.thp, 1237 }; 1238 1239 #[cfg(target_arch = "aarch64")] 1240 { 1241 // For Aarch64 we cannot lazily allocate the address space like we 1242 // do for x86, because while restoring a VM from snapshot we would 1243 // need the address space to be allocated to properly restore VGIC. 1244 // And the restore of VGIC happens before we attempt to run the vCPUs 1245 // for the first time, thus we need to allocate the address space 1246 // beforehand. 
1247 memory_manager.allocate_address_space()?; 1248 memory_manager.add_uefi_flash()?; 1249 } 1250 1251 #[cfg(target_arch = "x86_64")] 1252 if let Some(sgx_epc_config) = sgx_epc_config { 1253 memory_manager.setup_sgx(sgx_epc_config)?; 1254 } 1255 1256 Ok(Arc::new(Mutex::new(memory_manager))) 1257 } 1258 1259 pub fn new_from_snapshot( 1260 snapshot: &Snapshot, 1261 vm: Arc<dyn hypervisor::Vm>, 1262 config: &MemoryConfig, 1263 source_url: Option<&str>, 1264 prefault: bool, 1265 phys_bits: u8, 1266 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 1267 if let Some(source_url) = source_url { 1268 let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?; 1269 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 1270 1271 let mem_snapshot: MemoryManagerSnapshotData = 1272 snapshot.to_versioned_state().map_err(Error::Restore)?; 1273 1274 let mm = MemoryManager::new( 1275 vm, 1276 config, 1277 Some(prefault), 1278 phys_bits, 1279 #[cfg(feature = "tdx")] 1280 false, 1281 Some(&mem_snapshot), 1282 None, 1283 #[cfg(target_arch = "x86_64")] 1284 None, 1285 )?; 1286 1287 mm.lock() 1288 .unwrap() 1289 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?; 1290 1291 Ok(mm) 1292 } else { 1293 Err(Error::RestoreMissingSourceUrl) 1294 } 1295 } 1296 1297 fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> { 1298 // SAFETY: FFI call with correct arguments 1299 let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) }; 1300 1301 if res < 0 { 1302 Err(io::Error::last_os_error()) 1303 } else { 1304 Ok(res as RawFd) 1305 } 1306 } 1307 1308 fn mbind( 1309 addr: *mut u8, 1310 len: u64, 1311 mode: u32, 1312 nodemask: Vec<u64>, 1313 maxnode: u64, 1314 flags: u32, 1315 ) -> Result<(), io::Error> { 1316 // SAFETY: FFI call with correct arguments 1317 let res = unsafe { 1318 libc::syscall( 1319 libc::SYS_mbind, 1320 addr as *mut libc::c_void, 1321 len, 1322 mode, 1323 nodemask.as_ptr(), 1324 maxnode, 1325 flags, 1326 ) 1327 }; 1328 1329 if res < 0 { 1330 Err(io::Error::last_os_error()) 1331 } else { 1332 Ok(()) 1333 } 1334 } 1335 1336 fn create_anonymous_file( 1337 size: usize, 1338 hugepages: bool, 1339 hugepage_size: Option<u64>, 1340 ) -> Result<FileOffset, Error> { 1341 let fd = Self::memfd_create( 1342 &ffi::CString::new("ch_ram").unwrap(), 1343 libc::MFD_CLOEXEC 1344 | if hugepages { 1345 libc::MFD_HUGETLB 1346 | if let Some(hugepage_size) = hugepage_size { 1347 /* 1348 * From the Linux kernel: 1349 * Several system calls take a flag to request "hugetlb" huge pages. 1350 * Without further specification, these system calls will use the 1351 * system's default huge page size. If a system supports multiple 1352 * huge page sizes, the desired huge page size can be specified in 1353 * bits [26:31] of the flag arguments. The value in these 6 bits 1354 * will encode the log2 of the huge page size. 
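                     *
                     * As a concrete illustration (example values, not from the kernel
                     * text): a 2 MiB huge page is 1 << 21 bytes, so
                     * hugepage_size.trailing_zeros() is 21 and the expression below
                     * ORs 21 << 26 into the memfd_create() flags; a 1 GiB huge page
                     * (1 << 30) encodes 30 in the same bit range.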
1355 */ 1356 1357 hugepage_size.trailing_zeros() << 26 1358 } else { 1359 // Use the system default huge page size 1360 0 1361 } 1362 } else { 1363 0 1364 }, 1365 ) 1366 .map_err(Error::SharedFileCreate)?; 1367 1368 // SAFETY: fd is valid 1369 let f = unsafe { File::from_raw_fd(fd) }; 1370 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?; 1371 1372 Ok(FileOffset::new(f, 0)) 1373 } 1374 1375 fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> { 1376 if backing_file.is_dir() { 1377 Err(Error::DirectoryAsBackingFileForMemory) 1378 } else { 1379 let f = OpenOptions::new() 1380 .read(true) 1381 .write(true) 1382 .open(backing_file) 1383 .map_err(Error::SharedFileCreate)?; 1384 1385 Ok(FileOffset::new(f, file_offset)) 1386 } 1387 } 1388 1389 #[allow(clippy::too_many_arguments)] 1390 pub fn create_ram_region( 1391 backing_file: &Option<PathBuf>, 1392 file_offset: u64, 1393 start_addr: GuestAddress, 1394 size: usize, 1395 prefault: bool, 1396 shared: bool, 1397 hugepages: bool, 1398 hugepage_size: Option<u64>, 1399 host_numa_node: Option<u32>, 1400 existing_memory_file: Option<File>, 1401 thp: bool, 1402 ) -> Result<Arc<GuestRegionMmap>, Error> { 1403 let mut mmap_flags = libc::MAP_NORESERVE; 1404 1405 // The duplication of mmap_flags ORing here is unfortunate but it also makes 1406 // the complexity of the handling clear. 1407 let fo = if let Some(f) = existing_memory_file { 1408 // It must be MAP_SHARED as we wouldn't already have an FD 1409 mmap_flags |= libc::MAP_SHARED; 1410 Some(FileOffset::new(f, file_offset)) 1411 } else if let Some(backing_file) = backing_file { 1412 if shared { 1413 mmap_flags |= libc::MAP_SHARED; 1414 } else { 1415 mmap_flags |= libc::MAP_PRIVATE; 1416 } 1417 Some(Self::open_backing_file(backing_file, file_offset)?) 1418 } else if shared || hugepages { 1419 // For hugepages we must also MAP_SHARED otherwise we will trigger #4805 1420 // because the MAP_PRIVATE will trigger CoW against the backing file with 1421 // the VFIO pinning 1422 mmap_flags |= libc::MAP_SHARED; 1423 Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?) 1424 } else { 1425 mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS; 1426 None 1427 }; 1428 1429 let region = GuestRegionMmap::new( 1430 MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags) 1431 .map_err(Error::GuestMemoryRegion)?, 1432 start_addr, 1433 ) 1434 .map_err(Error::GuestMemory)?; 1435 1436 // Apply NUMA policy if needed. 1437 if let Some(node) = host_numa_node { 1438 let addr = region.deref().as_ptr(); 1439 let len = region.deref().size() as u64; 1440 let mode = MPOL_BIND; 1441 let mut nodemask: Vec<u64> = Vec::new(); 1442 let flags = MPOL_MF_STRICT | MPOL_MF_MOVE; 1443 1444 // Linux is kind of buggy in the way it interprets maxnode as it 1445 // will cut off the last node. That's why we have to add 1 to what 1446 // we would consider as the proper maxnode value. 1447 let maxnode = node as u64 + 1 + 1; 1448 1449 // Allocate the right size for the vector. 1450 nodemask.resize((node as usize / 64) + 1, 0); 1451 1452 // Fill the global bitmask through the nodemask vector. 1453 let idx = (node / 64) as usize; 1454 let shift = node % 64; 1455 nodemask[idx] |= 1u64 << shift; 1456 1457 // Policies are enforced by using MPOL_MF_MOVE flag as it will 1458 // force the kernel to move all pages that might have been already 1459 // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is 1460 // used to throw an error if MPOL_MF_MOVE didn't succeed. 
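        // As a concrete illustration (node number chosen for this example only):
        // for host_numa_node = Some(67), idx = 1 and shift = 3 above, so the
        // nodemask becomes [0x0, 0x8] (two u64 words with only bit 67 set), and
        // maxnode is 67 + 1 + 1 = 69 to compensate for the kernel cutting off
        // the last node.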
1461 // MPOL_BIND is the selected mode as it specifies a strict policy 1462 // that restricts memory allocation to the nodes specified in the 1463 // nodemask. 1464 Self::mbind(addr, len, mode, nodemask, maxnode, flags) 1465 .map_err(Error::ApplyNumaPolicy)?; 1466 } 1467 1468 // Prefault the region if needed, in parallel. 1469 if prefault { 1470 let page_size = 1471 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize; 1472 1473 if !is_aligned(size, page_size) { 1474 warn!( 1475 "Prefaulting memory size {} misaligned with page size {}", 1476 size, page_size 1477 ); 1478 } 1479 1480 let num_pages = size / page_size; 1481 1482 let num_threads = Self::get_prefault_num_threads(page_size, num_pages); 1483 1484 let pages_per_thread = num_pages / num_threads; 1485 let remainder = num_pages % num_threads; 1486 1487 let barrier = Arc::new(Barrier::new(num_threads)); 1488 thread::scope(|s| { 1489 let r = ®ion; 1490 for i in 0..num_threads { 1491 let barrier = Arc::clone(&barrier); 1492 s.spawn(move || { 1493 // Wait until all threads have been spawned to avoid contention 1494 // over mmap_sem between thread stack allocation and page faulting. 1495 barrier.wait(); 1496 let pages = pages_per_thread + if i < remainder { 1 } else { 0 }; 1497 let offset = 1498 page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder)); 1499 // SAFETY: FFI call with correct arguments 1500 let ret = unsafe { 1501 let addr = r.as_ptr().add(offset); 1502 libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE) 1503 }; 1504 if ret != 0 { 1505 let e = io::Error::last_os_error(); 1506 warn!("Failed to prefault pages: {}", e); 1507 } 1508 }); 1509 } 1510 }); 1511 } 1512 1513 if region.file_offset().is_none() && thp { 1514 info!( 1515 "Anonymous mapping at 0x{:x} (size = 0x{:x})", 1516 region.as_ptr() as u64, 1517 size 1518 ); 1519 // SAFETY: FFI call with correct arguments 1520 let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) }; 1521 if ret != 0 { 1522 let e = io::Error::last_os_error(); 1523 warn!("Failed to mark pages as THP eligible: {}", e); 1524 } 1525 } 1526 1527 Ok(Arc::new(region)) 1528 } 1529 1530 // Duplicate of `memory_zone_get_align_size` that does not require a `zone` 1531 fn get_prefault_align_size( 1532 backing_file: &Option<PathBuf>, 1533 hugepages: bool, 1534 hugepage_size: Option<u64>, 1535 ) -> Result<u64, Error> { 1536 // SAFETY: FFI call. Trivially safe. 1537 let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 }; 1538 match (hugepages, hugepage_size, backing_file) { 1539 (false, _, _) => Ok(page_size), 1540 (true, Some(hugepage_size), _) => Ok(hugepage_size), 1541 (true, None, _) => { 1542 // There are two scenarios here: 1543 // - `hugepages` is enabled but `hugepage_size` is not specified: 1544 // Call `statfs` for `/dev/hugepages` for getting the default size of hugepage 1545 // - The backing file is specified: 1546 // Call `statfs` for the file and get its `f_bsize`. If the value is larger than the page 1547 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the 1548 // value is less than or equal to the page size, just use the page size. 
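                // Typical outcomes (illustrative, depends on the host): on a
                // hugetlbfs mount backed by 2 MiB pages, statfs() reports
                // f_bsize = 0x200000 and the alignment becomes 2 MiB, while a
                // backing file on ext4 usually reports f_bsize = 4096, so
                // max(page_size, f_bsize) falls back to the 4 KiB page size.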
1549 let path = backing_file 1550 .as_ref() 1551 .map_or(Ok("/dev/hugepages"), |pathbuf| { 1552 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters) 1553 })?; 1554 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?); 1555 Ok(align_size) 1556 } 1557 } 1558 } 1559 1560 fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize { 1561 let mut n: usize = 1; 1562 1563 // Do not create more threads than processors available. 1564 // SAFETY: FFI call. Trivially safe. 1565 let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) }; 1566 if procs > 0 { 1567 n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT); 1568 } 1569 1570 // Do not create more threads than pages being allocated. 1571 n = std::cmp::min(n, num_pages); 1572 1573 // Do not create threads to allocate less than 64 MiB of memory. 1574 n = std::cmp::min( 1575 n, 1576 std::cmp::max(1, page_size * num_pages / (64 * (1 << 26))), 1577 ); 1578 1579 n 1580 } 1581 1582 // Update the GuestMemoryMmap with the new range 1583 fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> { 1584 let guest_memory = self 1585 .guest_memory 1586 .memory() 1587 .insert_region(region) 1588 .map_err(Error::GuestMemory)?; 1589 self.guest_memory.lock().unwrap().replace(guest_memory); 1590 1591 Ok(()) 1592 } 1593 1594 // 1595 // Calculate the start address of an area next to RAM. 1596 // 1597 // If memory hotplug is allowed, the start address needs to be aligned 1598 // (rounded-up) to 128MiB boundary. 1599 // If memory hotplug is not allowed, there is no alignment required. 1600 // And it must also start at the 64bit start. 1601 fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> { 1602 let mut start_addr = if allow_mem_hotplug { 1603 GuestAddress(mem_end.0 | ((128 << 20) - 1)) 1604 } else { 1605 mem_end 1606 }; 1607 1608 start_addr = start_addr 1609 .checked_add(1) 1610 .ok_or(Error::GuestAddressOverFlow)?; 1611 1612 if mem_end < arch::layout::MEM_32BIT_RESERVED_START { 1613 return Ok(arch::layout::RAM_64BIT_START); 1614 } 1615 1616 Ok(start_addr) 1617 } 1618 1619 pub fn add_ram_region( 1620 &mut self, 1621 start_addr: GuestAddress, 1622 size: usize, 1623 ) -> Result<Arc<GuestRegionMmap>, Error> { 1624 // Allocate memory for the region 1625 let region = MemoryManager::create_ram_region( 1626 &None, 1627 0, 1628 start_addr, 1629 size, 1630 self.prefault, 1631 self.shared, 1632 self.hugepages, 1633 self.hugepage_size, 1634 None, 1635 None, 1636 self.thp, 1637 )?; 1638 1639 // Map it into the guest 1640 let slot = self.create_userspace_mapping( 1641 region.start_addr().0, 1642 region.len(), 1643 region.as_ptr() as u64, 1644 self.mergeable, 1645 false, 1646 self.log_dirty, 1647 )?; 1648 self.guest_ram_mappings.push(GuestRamMapping { 1649 gpa: region.start_addr().raw_value(), 1650 size: region.len(), 1651 slot, 1652 zone_id: DEFAULT_MEMORY_ZONE.to_string(), 1653 virtio_mem: false, 1654 file_offset: 0, 1655 }); 1656 1657 self.add_region(Arc::clone(®ion))?; 1658 1659 Ok(region) 1660 } 1661 1662 fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> { 1663 info!("Hotplugging new RAM: {}", size); 1664 1665 // Check that there is a free slot 1666 if self.next_hotplug_slot >= HOTPLUG_COUNT { 1667 return Err(Error::NoSlotAvailable); 1668 } 1669 1670 // "Inserted" DIMM must have a size that is a multiple of 128MiB 1671 if size % (128 << 20) != 0 { 1672 return Err(Error::InvalidSize); 1673 } 1674 1675 let start_addr = 
            MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;

        if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area {
            return Err(Error::InsufficientHotplugRam);
        }

        let region = self.add_ram_region(start_addr, size)?;

        // Add region to the list of regions associated with the default
        // memory zone.
        if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
            memory_zone.regions.push(Arc::clone(&region));
        }

        // Tell the allocator
        self.ram_allocator
            .allocate(Some(start_addr), size as GuestUsize, None)
            .ok_or(Error::MemoryRangeAllocation)?;

        // Update the slot so that it can be queried via the I/O port
        let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
        slot.active = true;
        slot.inserting = true;
        slot.base = region.start_addr().0;
        slot.length = region.len();

        self.next_hotplug_slot += 1;

        Ok(region)
    }

    pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
        self.guest_memory.clone()
    }

    pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
        self.boot_guest_memory.clone()
    }

    pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
        self.allocator.clone()
    }

    pub fn start_of_device_area(&self) -> GuestAddress {
        self.start_of_device_area
    }

    pub fn end_of_device_area(&self) -> GuestAddress {
        self.end_of_device_area
    }

    pub fn allocate_memory_slot(&mut self) -> u32 {
        let slot_id = self.next_memory_slot;
        self.next_memory_slot += 1;
        slot_id
    }

    pub fn create_userspace_mapping(
        &mut self,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        mergeable: bool,
        readonly: bool,
        log_dirty: bool,
    ) -> Result<u32, Error> {
        let slot = self.allocate_memory_slot();
        let mem_region = self.vm.make_user_memory_region(
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            readonly,
            log_dirty,
        );

        info!(
            "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
            guest_phys_addr, userspace_addr, memory_size, slot
        );

        self.vm
            .create_user_memory_region(mem_region)
            .map_err(Error::CreateUserMemoryRegion)?;

        // SAFETY: the address and size are valid since the
        // mmap succeeded.
        let ret = unsafe {
            libc::madvise(
                userspace_addr as *mut libc::c_void,
                memory_size as libc::size_t,
                libc::MADV_DONTDUMP,
            )
        };
        if ret != 0 {
            let e = io::Error::last_os_error();
            warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e);
        }

        // Mark the pages as mergeable if explicitly asked for.
        if mergeable {
            // SAFETY: the address and size are valid since the
            // mmap succeeded.
            let ret = unsafe {
                libc::madvise(
                    userspace_addr as *mut libc::c_void,
                    memory_size as libc::size_t,
                    libc::MADV_MERGEABLE,
                )
            };
            if ret != 0 {
                let err = io::Error::last_os_error();
                // Safe to unwrap because the error is constructed with
                // last_os_error(), which ensures the output will be Some().
1789 let errno = err.raw_os_error().unwrap(); 1790 if errno == libc::EINVAL { 1791 warn!("kernel not configured with CONFIG_KSM"); 1792 } else { 1793 warn!("madvise error: {}", err); 1794 } 1795 warn!("failed to mark pages as mergeable"); 1796 } 1797 } 1798 1799 info!( 1800 "Created userspace mapping: {:x} -> {:x} {:x}", 1801 guest_phys_addr, userspace_addr, memory_size 1802 ); 1803 1804 Ok(slot) 1805 } 1806 1807 pub fn remove_userspace_mapping( 1808 &mut self, 1809 guest_phys_addr: u64, 1810 memory_size: u64, 1811 userspace_addr: u64, 1812 mergeable: bool, 1813 slot: u32, 1814 ) -> Result<(), Error> { 1815 let mem_region = self.vm.make_user_memory_region( 1816 slot, 1817 guest_phys_addr, 1818 memory_size, 1819 userspace_addr, 1820 false, /* readonly -- don't care */ 1821 false, /* log dirty */ 1822 ); 1823 1824 self.vm 1825 .remove_user_memory_region(mem_region) 1826 .map_err(Error::RemoveUserMemoryRegion)?; 1827 1828 // Mark the pages as unmergeable if there were previously marked as 1829 // mergeable. 1830 if mergeable { 1831 // SAFETY: the address and size are valid as the region was 1832 // previously advised. 1833 let ret = unsafe { 1834 libc::madvise( 1835 userspace_addr as *mut libc::c_void, 1836 memory_size as libc::size_t, 1837 libc::MADV_UNMERGEABLE, 1838 ) 1839 }; 1840 if ret != 0 { 1841 let err = io::Error::last_os_error(); 1842 // Safe to unwrap because the error is constructed with 1843 // last_os_error(), which ensures the output will be Some(). 1844 let errno = err.raw_os_error().unwrap(); 1845 if errno == libc::EINVAL { 1846 warn!("kernel not configured with CONFIG_KSM"); 1847 } else { 1848 warn!("madvise error: {}", err); 1849 } 1850 warn!("failed to mark pages as unmergeable"); 1851 } 1852 } 1853 1854 info!( 1855 "Removed userspace mapping: {:x} -> {:x} {:x}", 1856 guest_phys_addr, userspace_addr, memory_size 1857 ); 1858 1859 Ok(()) 1860 } 1861 1862 pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> { 1863 if let Some(memory_zone) = self.memory_zones.get_mut(id) { 1864 if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone { 1865 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() { 1866 virtio_mem_device 1867 .lock() 1868 .unwrap() 1869 .resize(size) 1870 .map_err(Error::VirtioMemResizeFail)?; 1871 } 1872 1873 // Keep the hotplugged_size up to date. 1874 virtio_mem_zone.hotplugged_size = size; 1875 } else { 1876 error!("Failed resizing virtio-mem region: No virtio-mem handler"); 1877 return Err(Error::MissingVirtioMemHandler); 1878 } 1879 1880 return Ok(()); 1881 } 1882 1883 error!("Failed resizing virtio-mem region: Unknown memory zone"); 1884 Err(Error::UnknownMemoryZone) 1885 } 1886 1887 /// In case this function resulted in adding a new memory region to the 1888 /// guest memory, the new region is returned to the caller. The virtio-mem 1889 /// use case never adds a new region as the whole hotpluggable memory has 1890 /// already been allocated at boot time. 1891 pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> { 1892 if self.user_provided_zones { 1893 error!( 1894 "Not allowed to resize guest memory when backed with user \ 1895 defined memory zones." 
            );
            return Err(Error::InvalidResizeWithMemoryZones);
        }

        let mut region: Option<Arc<GuestRegionMmap>> = None;
        match self.hotplug_method {
            HotplugMethod::VirtioMem => {
                if desired_ram >= self.boot_ram {
                    if !self.dynamic {
                        return Ok(region);
                    }

                    self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
                    self.current_ram = desired_ram;
                }
            }
            HotplugMethod::Acpi => {
                if desired_ram > self.current_ram {
                    if !self.dynamic {
                        return Ok(region);
                    }

                    region =
                        Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
                    self.current_ram = desired_ram;
                }
            }
        }
        Ok(region)
    }

    pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
        if !self.user_provided_zones {
            error!(
                "Not allowed to resize guest memory zone when no zone is \
                defined."
            );
            return Err(Error::ResizeZone);
        }

        self.virtio_mem_resize(id, virtio_mem_size)
    }

    #[cfg(target_arch = "x86_64")]
    pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
        let file = OpenOptions::new()
            .read(true)
            .open("/dev/sgx_provision")
            .map_err(Error::SgxProvisionOpen)?;
        self.vm
            .enable_sgx_attribute(file)
            .map_err(Error::SgxEnableProvisioning)?;

        // Go over each EPC section and verify its size is a 4k multiple. At
        // the same time, calculate the total size needed for the contiguous
        // EPC region.
        let mut epc_region_size = 0;
        for epc_section in sgx_epc_config.iter() {
            if epc_section.size == 0 {
                return Err(Error::EpcSectionSizeInvalid);
            }
            if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
                return Err(Error::EpcSectionSizeInvalid);
            }

            epc_region_size += epc_section.size;
        }

        // Place the SGX EPC region on a 4k boundary between the RAM and the device area
        let epc_region_start = GuestAddress(
            ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
        );

        self.start_of_device_area = epc_region_start
            .checked_add(epc_region_size)
            .ok_or(Error::GuestAddressOverFlow)?;

        let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
        info!(
            "SGX EPC region: 0x{:x} (0x{:x})",
            epc_region_start.0, epc_region_size
        );

        // Each section can be memory mapped into the allocated region.
        let mut epc_section_start = epc_region_start.raw_value();
        for epc_section in sgx_epc_config.iter() {
            let file = OpenOptions::new()
                .read(true)
                .write(true)
                .open("/dev/sgx_vepc")
                .map_err(Error::SgxVirtEpcOpen)?;

            let prot = PROT_READ | PROT_WRITE;
            let mut flags = MAP_NORESERVE | MAP_SHARED;
            if epc_section.prefault {
                flags |= MAP_POPULATE;
            }

            // We can't use the vm-memory crate to perform the memory mapping
            // here as it would try to ensure the size of the backing file is
            // matching the size of the expected mapping. The /dev/sgx_vepc
            // device does not work that way, it provides a file descriptor
            // which is not matching the mapping size, as it's just a way to
            // let KVM know that an EPC section is being created for the guest.
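            // The raw mmap() below therefore maps `epc_section.size` bytes of
            // the vEPC file at offset 0, and the resulting host address is
            // handed to create_userspace_mapping() further down, just like a
            // regular RAM region.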
            // SAFETY: FFI call with correct arguments
            let host_addr = unsafe {
                libc::mmap(
                    std::ptr::null_mut(),
                    epc_section.size as usize,
                    prot,
                    flags,
                    file.as_raw_fd(),
                    0,
                )
            } as u64;

            info!(
                "Adding SGX EPC section: 0x{:x} (0x{:x})",
                epc_section_start, epc_section.size
            );

            let _mem_slot = self.create_userspace_mapping(
                epc_section_start,
                epc_section.size,
                host_addr,
                false,
                false,
                false,
            )?;

            sgx_epc_region.insert(
                epc_section.id.clone(),
                SgxEpcSection::new(
                    GuestAddress(epc_section_start),
                    epc_section.size as GuestUsize,
                ),
            );

            epc_section_start += epc_section.size;
        }

        self.sgx_epc_region = Some(sgx_epc_region);

        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
        &self.sgx_epc_region
    }

    pub fn is_hardlink(f: &File) -> bool {
        let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
        // SAFETY: FFI call with correct arguments
        let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
        if ret != 0 {
            error!("Couldn't fstat the backing file");
            return false;
        }

        // SAFETY: stat is valid
        unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
    }

    pub fn memory_zones(&self) -> &MemoryZones {
        &self.memory_zones
    }

    pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
        &mut self.memory_zones
    }

    pub fn memory_range_table(
        &self,
        snapshot: bool,
    ) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();

        for memory_zone in self.memory_zones.values() {
            if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
                table.extend(virtio_mem_zone.plugged_ranges());
            }

            for region in memory_zone.regions() {
                if snapshot {
                    if let Some(file_offset) = region.file_offset() {
                        if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
                            && Self::is_hardlink(file_offset.file())
                        {
                            // In this very specific case, we know the memory
                            // region is backed by a file on the host filesystem
                            // that can be accessed by the user, and additionally
                            // the mapping is shared, which means that modifications
                            // to the content are written to the actual file.
                            // When meeting these conditions, we can skip the
                            // copy of the memory content for this specific region,
                            // as we can assume the user will have it saved through
                            // the backing file already.
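                            // For example, a zone explicitly backed by a
                            // regular file on the host and mapped shared falls
                            // in this category, while anonymous or
                            // memfd-backed regions do not (memfd files carry a
                            // link count of zero, which is what is_hardlink()
                            // filters on).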
                            continue;
                        }
                    }
                }

                table.push(MemoryRange {
                    gpa: region.start_addr().raw_value(),
                    length: region.len(),
                });
            }
        }

        Ok(table)
    }

    pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
        MemoryManagerSnapshotData {
            memory_ranges: self.snapshot_memory_ranges.clone(),
            guest_ram_mappings: self.guest_ram_mappings.clone(),
            start_of_device_area: self.start_of_device_area.0,
            boot_ram: self.boot_ram,
            current_ram: self.current_ram,
            arch_mem_regions: self.arch_mem_regions.clone(),
            hotplug_slots: self.hotplug_slots.clone(),
            next_memory_slot: self.next_memory_slot,
            selected_slot: self.selected_slot,
            next_hotplug_slot: self.next_hotplug_slot,
        }
    }

    pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
        let mut memory_slot_fds = HashMap::new();
        for guest_ram_mapping in &self.guest_ram_mappings {
            let slot = guest_ram_mapping.slot;
            let guest_memory = self.guest_memory.memory();
            let file = guest_memory
                .find_region(GuestAddress(guest_ram_mapping.gpa))
                .unwrap()
                .file_offset()
                .unwrap()
                .file();
            memory_slot_fds.insert(slot, file.as_raw_fd());
        }
        memory_slot_fds
    }

    pub fn acpi_address(&self) -> Option<GuestAddress> {
        self.acpi_address
    }

    pub fn num_guest_ram_mappings(&self) -> u32 {
        self.guest_ram_mappings.len() as u32
    }

    #[cfg(target_arch = "aarch64")]
    pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
        self.uefi_flash.as_ref().unwrap().clone()
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
        let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
        mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);

        let mut mem_offset_in_elf = mem_offset;
        let mut ram_maps = BTreeMap::new();
        for mapping in mapping_sorted_by_gpa.iter() {
            ram_maps.insert(
                mapping.gpa,
                CoredumpMemoryRegion {
                    mem_offset_in_elf,
                    mem_size: mapping.size,
                },
            );
            mem_offset_in_elf += mapping.size;
        }

        CoredumpMemoryRegions { ram_maps }
    }

    #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
    pub fn coredump_iterate_save_mem(
        &mut self,
        dump_state: &DumpState,
    ) -> std::result::Result<(), GuestDebuggableError> {
        let snapshot_memory_ranges = self
            .memory_range_table(false)
            .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;

        if snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let coredump_file = dump_state.file.as_ref().unwrap();

        let guest_memory = self.guest_memory.memory();
        let mut total_bytes: u64 = 0;

        for range in snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            loop {
                let bytes_written = guest_memory
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        &mut coredump_file.as_fd(),
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
                offset += bytes_written as u64;
                total_bytes += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        debug!("coredump total bytes {}", total_bytes);
        Ok(())
    }

    pub fn receive_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: ReadVolatile,
    {
        let guest_memory = self.guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't read
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of read_exact_from() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = mem
                    .read_volatile_from(
                        GuestAddress(range.gpa + offset),
                        fd,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| {
                        MigratableError::MigrateReceive(anyhow!(
                            "Error receiving memory from socket: {}",
                            e
                        ))
                    })?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }
}

struct MemoryNotify {
    slot_id: usize,
}

impl Aml for MemoryNotify {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        let object = aml::Path::new(&format!("M{:03}", self.slot_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.slot_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .to_aml_bytes(sink)
    }
}

struct MemorySlot {
    slot_id: usize,
}

impl Aml for MemorySlot {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        aml::Device::new(
            format!("M{:03}", self.slot_id).as_str().into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
                &aml::Name::new("_UID".into(), &self.slot_id),
                /*
                _STA return value:
                Bit [0] – Set if the device is present.
                Bit [1] – Set if the device is enabled and decoding its resources.
                Bit [2] – Set if the device should be shown in the UI.
                Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                Bit [4] – Set if the battery is present.
                Bits [31:5] – Reserved (must be cleared).
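                A slot that is present, enabled, visible in the UI and
                functioning therefore reports 0xF, which is what the MSTA
                method (called from _STA below) returns when the selected
                slot's MEN_ bit is set.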
                */
                &aml::Method::new(
                    "_STA".into(),
                    0,
                    false,
                    // Call into MSTA method which will interrogate device
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MSTA".into(),
                        vec![&self.slot_id],
                    ))],
                ),
                // Get details of memory
                &aml::Method::new(
                    "_CRS".into(),
                    0,
                    false,
                    // Call into MCRS which provides actual memory details
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MCRS".into(),
                        vec![&self.slot_id],
                    ))],
                ),
            ],
        )
        .to_aml_bytes(sink)
    }
}

struct MemorySlots {
    slots: usize,
}

impl Aml for MemorySlots {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        for slot_id in 0..self.slots {
            MemorySlot { slot_id }.to_aml_bytes(sink);
        }
    }
}

struct MemoryMethods {
    slots: usize,
}

impl Aml for MemoryMethods {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        // Add "MTFY" notification method
        let mut memory_notifies = Vec::new();
        for slot_id in 0..self.slots {
            memory_notifies.push(MemoryNotify { slot_id });
        }

        let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
        for memory_notifier in memory_notifies.iter() {
            memory_notifies_refs.push(memory_notifier);
        }

        aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);

        // MSCN method
        aml::Method::new(
            "MSCN".into(),
            0,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                &aml::While::new(
                    &aml::LessThan::new(&aml::Local(0), &self.slots),
                    vec![
                        // Write the slot number (held in Local0) to the selector field
                        &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
                        // Check if MINS bit is set (inserting)
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            // Notify device if it is
                            vec![
                                &aml::MethodCall::new(
                                    "MTFY".into(),
                                    vec![&aml::Local(0), &aml::ONE],
                                ),
                                // Reset MINS bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            ],
                        ),
                        // Check if MRMV bit is set
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            // Notify device if it is (with the eject constant 0x3)
                            vec![
                                &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
                                // Reset MRMV bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            ],
                        ),
                        &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                    ],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
            ],
        )
        .to_aml_bytes(sink);

        // Memory status method
        aml::Method::new(
            "MSTA".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to the selector field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                &aml::If::new(
                    &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
                    vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
                // Return 0 or 0xf
                &aml::Return::new(&aml::Local(0)),
            ],
        )
        .to_aml_bytes(sink);

        // Memory range method
        aml::Method::new(
            "MCRS".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to the selector field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Name::new(
                    "MR64".into(),
                    &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                        aml::AddressSpaceCacheable::Cacheable,
                        true,
                        0x0000_0000_0000_0000u64,
                        0xFFFF_FFFF_FFFF_FFFEu64,
                        None,
                    )]),
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("MINL"),
                    &aml::Path::new("MR64"),
                    &14usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("MINH"),
                    &aml::Path::new("MR64"),
                    &18usize,
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MR64"),
                    &22usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MR64"),
                    &26usize,
                ),
                &aml::CreateQWordField::new(
                    &aml::Path::new("LENL"),
                    &aml::Path::new("MR64"),
                    &38usize,
                ),
                &aml::CreateDWordField::new(
                    &aml::Path::new("LENH"),
                    &aml::Path::new("MR64"),
                    &42usize,
                ),
                &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
                &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
                &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
                &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
                &aml::Add::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MINL"),
                    &aml::Path::new("LENL"),
                ),
                &aml::Add::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MINH"),
                    &aml::Path::new("LENH"),
                ),
                &aml::If::new(
                    &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
                    vec![&aml::Add::new(
                        &aml::Path::new("MAXH"),
                        &aml::ONE,
                        &aml::Path::new("MAXH"),
                    )],
                ),
                &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
                // Release lock
                &aml::Release::new("MLCK".into()),
                &aml::Return::new(&aml::Path::new("MR64")),
            ],
        )
        .to_aml_bytes(sink)
    }
}

impl Aml for MemoryManager {
    fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
        if let Some(acpi_address) = self.acpi_address {
            // Memory Hotplug Controller
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose slot and then read back status
                    &aml::Mutex::new("MLCK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCacheable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
                            None,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "MHPR".into(),
                        aml::OpRegionSpace::SystemMemory,
                        &(acpi_address.0 as usize),
                        &MEMORY_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
                            aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Reserved(128),
                            aml::FieldEntry::Named(*b"MHPX", 32), // PXM
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(160),
                            aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
                            aml::FieldEntry::Named(*b"MINS", 1), // Inserting
                            aml::FieldEntry::Named(*b"MRMV", 1), // Removing
                            aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldLockRule::NoLock,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MSEL", 32), // Selector
                            aml::FieldEntry::Named(*b"MOEV", 32), // Event
                            aml::FieldEntry::Named(*b"MOSC", 32), // OSC
                        ],
                    ),
                    &MemoryMethods {
                        slots: self.hotplug_slots.len(),
                    },
                    &MemorySlots {
                        slots: self.hotplug_slots.len(),
                    },
                ],
            )
            .to_aml_bytes(sink);
        } else {
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Empty MSCN for GED
                    &aml::Method::new("MSCN".into(), 0, true, vec![]),
                ],
            )
            .to_aml_bytes(sink);
        }

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_region) = &self.sgx_epc_region {
                let min = sgx_epc_region.start().raw_value();
                let max = min + sgx_epc_region.size() - 1;
                // SGX EPC region
                aml::Device::new(
                    "_SB_.EPC_".into(),
                    vec![
                        &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
                        // QWORD describing the EPC region start and size
                        &aml::Name::new(
                            "_CRS".into(),
                            &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                                aml::AddressSpaceCacheable::NotCacheable,
                                true,
                                min,
                                max,
                                None,
                            )]),
                        ),
                        &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
                    ],
                )
                .to_aml_bytes(sink);
            }
        }
    }
}

impl Pausable for MemoryManager {}

#[derive(Clone, Serialize, Deserialize, Versionize)]
pub struct MemoryManagerSnapshotData {
    memory_ranges: MemoryRangeTable,
    guest_ram_mappings: Vec<GuestRamMapping>,
    start_of_device_area: u64,
    boot_ram: u64,
    current_ram: u64,
    arch_mem_regions: Vec<ArchMemRegion>,
    hotplug_slots: Vec<HotPlugState>,
    next_memory_slot: u32,
    selected_slot: usize,
    next_hotplug_slot: usize,
}

impl VersionMapped for MemoryManagerSnapshotData {}

impl Snapshottable for MemoryManager {
    fn id(&self) -> String {
        MEMORY_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
        let memory_ranges = self.memory_range_table(true)?;

        // Store locally this list of ranges as it will be used through the
        // Transportable::send() implementation. The point is to avoid the
        // duplication of code regarding the creation of the path for each
        // region. The 'snapshot' step creates the list of memory regions,
        // including information about the need to copy a memory region or
        // not. This saves the 'send' step having to go through the same
        // process, and instead it can directly proceed with storing the
        // memory range content for the ranges requiring it.
        self.snapshot_memory_ranges = memory_ranges;

        Ok(Snapshot::from_data(SnapshotData::new_from_versioned_state(
            &self.snapshot_data(),
        )?))
    }
}

impl Transportable for MemoryManager {
    fn send(
        &self,
        _snapshot: &Snapshot,
        destination_url: &str,
    ) -> result::Result<(), MigratableError> {
        if self.snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let mut memory_file_path = url_to_path(destination_url)?;
        memory_file_path.push(String::from(SNAPSHOT_FILENAME));

        // Create the snapshot file for the entire memory
        let mut memory_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(memory_file_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let guest_memory = self.guest_memory.memory();

        for range in self.snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't copy
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of write_all_to() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = guest_memory
                    .write_volatile_to(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }
        Ok(())
    }
}

impl Migratable for MemoryManager {
    // Start the dirty log in the hypervisor (kvm/mshv).
    // Also, reset the dirty bitmap logged by the vmm.
    // Just before we do a bulk copy we want to start/clear the dirty log so that
    // pages touched during our bulk copy are tracked.
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.start_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
        })?;

        for r in self.guest_memory.memory().iter() {
            r.bitmap().reset();
        }

        Ok(())
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.stop_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
        })?;

        Ok(())
    }

    // Generate a table for the pages that are dirty. The dirty pages are collapsed
    // together in the table if they are contiguous.
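    // For every guest RAM mapping, the bitmap reported by the hypervisor is
    // OR-ed with the bitmap the VMM keeps for its own writes to guest memory,
    // so pages dirtied from either side end up in the resulting table, which
    // is built at a 4 KiB page granularity.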
    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();
        for r in &self.guest_ram_mappings {
            let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
                MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
            })?;
            let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
            {
                Some(region) => {
                    assert!(region.start_addr().raw_value() == r.gpa);
                    assert!(region.len() == r.size);
                    region.bitmap().get_and_reset()
                }
                None => {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Error finding 'guest memory region' with address {:x}",
                        r.gpa
                    )))
                }
            };

            let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
                .iter()
                .zip(vmm_dirty_bitmap.iter())
                .map(|(x, y)| x | y)
                .collect();

            let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);

            if sub_table.regions().is_empty() {
                info!("Dirty Memory Range Table is empty");
            } else {
                info!("Dirty Memory Range Table:");
                for range in sub_table.regions() {
                    info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
                }
            }

            table.extend(sub_table);
        }
        Ok(table)
    }
}
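
// Illustrative sketch (not part of this module): one way a caller could drive
// the Migratable implementation above during the iterative phase of a live
// migration. `mm` is assumed to be a `MemoryManager` and `send_ranges` a
// hypothetical helper that streams the content of each returned range to the
// destination.
//
//     mm.start_dirty_log()?;
//     // ... perform the initial bulk copy of guest memory ...
//     loop {
//         let table = mm.dirty_log()?;
//         if table.regions().is_empty() {
//             break;
//         }
//         send_ranges(&table)?;
//     }
//     mm.stop_dirty_log()?;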