1 // Copyright © 2019 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 // 5 6 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 7 use std::collections::BTreeMap; 8 use std::collections::HashMap; 9 use std::fs::{File, OpenOptions}; 10 use std::io::{self}; 11 use std::ops::{BitAnd, Deref, Not, Sub}; 12 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 13 use std::os::fd::AsFd; 14 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; 15 use std::path::PathBuf; 16 use std::sync::atomic::{AtomicU32, Ordering}; 17 use std::sync::{Arc, Barrier, Mutex}; 18 use std::{ffi, result, thread}; 19 20 use acpi_tables::{aml, Aml}; 21 use anyhow::anyhow; 22 #[cfg(target_arch = "x86_64")] 23 use arch::x86_64::{SgxEpcRegion, SgxEpcSection}; 24 use arch::RegionType; 25 #[cfg(target_arch = "x86_64")] 26 use devices::ioapic; 27 #[cfg(target_arch = "aarch64")] 28 use hypervisor::HypervisorVmError; 29 use libc::_SC_NPROCESSORS_ONLN; 30 #[cfg(target_arch = "x86_64")] 31 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE}; 32 use serde::{Deserialize, Serialize}; 33 use tracer::trace_scoped; 34 use virtio_devices::BlocksState; 35 #[cfg(target_arch = "x86_64")] 36 use vm_allocator::GsiApic; 37 use vm_allocator::{AddressAllocator, MemorySlotAllocator, SystemAllocator}; 38 use vm_device::BusDevice; 39 use vm_memory::bitmap::AtomicBitmap; 40 use vm_memory::guest_memory::FileOffset; 41 use vm_memory::mmap::MmapRegionError; 42 use vm_memory::{ 43 Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, 44 GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile, 45 }; 46 use vm_migration::protocol::{MemoryRange, MemoryRangeTable}; 47 use vm_migration::{ 48 Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, Transportable, 49 }; 50 51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 52 use crate::coredump::{ 53 CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError, 54 }; 55 use crate::migration::url_to_path; 56 #[cfg(target_arch = "x86_64")] 57 use crate::vm_config::SgxEpcConfig; 58 use crate::vm_config::{HotplugMethod, MemoryConfig, MemoryZoneConfig}; 59 use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID}; 60 61 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18; 62 63 const DEFAULT_MEMORY_ZONE: &str = "mem0"; 64 65 const SNAPSHOT_FILENAME: &str = "memory-ranges"; 66 67 #[cfg(target_arch = "x86_64")] 68 const X86_64_IRQ_BASE: u32 = 5; 69 70 #[cfg(target_arch = "x86_64")] 71 const SGX_PAGE_SIZE: u64 = 1 << 12; 72 73 const HOTPLUG_COUNT: usize = 8; 74 75 // Memory policy constants 76 const MPOL_BIND: u32 = 2; 77 const MPOL_MF_STRICT: u32 = 1; 78 const MPOL_MF_MOVE: u32 = 1 << 1; 79 80 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices) 81 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20; 82 83 const MAX_PREFAULT_THREAD_COUNT: usize = 16; 84 85 #[derive(Clone, Default, Serialize, Deserialize)] 86 struct HotPlugState { 87 base: u64, 88 length: u64, 89 active: bool, 90 inserting: bool, 91 removing: bool, 92 } 93 94 pub struct VirtioMemZone { 95 region: Arc<GuestRegionMmap>, 96 virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>, 97 hotplugged_size: u64, 98 hugepages: bool, 99 blocks_state: Arc<Mutex<BlocksState>>, 100 } 101 102 impl VirtioMemZone { 103 pub fn region(&self) -> &Arc<GuestRegionMmap> { 104 &self.region 105 } 106 pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) { 107 self.virtio_device = Some(virtio_device); 108 } 109 pub fn hotplugged_size(&self) -> u64 { 110 self.hotplugged_size 111 } 112 pub fn hugepages(&self) -> bool { 113 self.hugepages 114 } 115 pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> { 116 &self.blocks_state 117 } 118 pub fn plugged_ranges(&self) -> MemoryRangeTable { 119 self.blocks_state 120 .lock() 121 .unwrap() 122 .memory_ranges(self.region.start_addr().raw_value(), true) 123 } 124 } 125 126 #[derive(Default)] 127 pub struct MemoryZone { 128 regions: Vec<Arc<GuestRegionMmap>>, 129 virtio_mem_zone: Option<VirtioMemZone>, 130 } 131 132 impl MemoryZone { 133 pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> { 134 &self.regions 135 } 136 pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> { 137 &self.virtio_mem_zone 138 } 139 pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> { 140 self.virtio_mem_zone.as_mut() 141 } 142 } 143 144 pub type MemoryZones = HashMap<String, MemoryZone>; 145 146 #[derive(Clone, Serialize, Deserialize)] 147 struct GuestRamMapping { 148 slot: u32, 149 gpa: u64, 150 size: u64, 151 zone_id: String, 152 virtio_mem: bool, 153 file_offset: u64, 154 } 155 156 #[derive(Clone, Serialize, Deserialize)] 157 struct ArchMemRegion { 158 base: u64, 159 size: usize, 160 r_type: RegionType, 161 } 162 163 pub struct MemoryManager { 164 boot_guest_memory: GuestMemoryMmap, 165 guest_memory: GuestMemoryAtomic<GuestMemoryMmap>, 166 next_memory_slot: Arc<AtomicU32>, 167 memory_slot_free_list: Arc<Mutex<Vec<u32>>>, 168 start_of_device_area: GuestAddress, 169 end_of_device_area: GuestAddress, 170 end_of_ram_area: GuestAddress, 171 pub vm: Arc<dyn hypervisor::Vm>, 172 hotplug_slots: Vec<HotPlugState>, 173 selected_slot: usize, 174 mergeable: bool, 175 allocator: Arc<Mutex<SystemAllocator>>, 176 hotplug_method: HotplugMethod, 177 boot_ram: u64, 178 current_ram: u64, 179 next_hotplug_slot: usize, 180 shared: bool, 181 hugepages: bool, 182 hugepage_size: Option<u64>, 183 prefault: bool, 184 thp: bool, 185 #[cfg(target_arch = "x86_64")] 186 sgx_epc_region: Option<SgxEpcRegion>, 187 user_provided_zones: bool, 188 snapshot_memory_ranges: MemoryRangeTable, 189 memory_zones: MemoryZones, 190 log_dirty: bool, // Enable dirty logging for created RAM regions 191 arch_mem_regions: Vec<ArchMemRegion>, 192 ram_allocator: AddressAllocator, 193 dynamic: bool, 194 195 // Keep track of calls to create_userspace_mapping() for guest RAM. 196 // This is useful for getting the dirty pages as we need to know the 197 // slots that the mapping is created in. 198 guest_ram_mappings: Vec<GuestRamMapping>, 199 200 pub acpi_address: Option<GuestAddress>, 201 #[cfg(target_arch = "aarch64")] 202 uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>, 203 } 204 205 #[derive(Debug)] 206 pub enum Error { 207 /// Failed to create shared file. 208 SharedFileCreate(io::Error), 209 210 /// Failed to set shared file length. 211 SharedFileSetLen(io::Error), 212 213 /// Mmap backed guest memory error 214 GuestMemory(MmapError), 215 216 /// Failed to allocate a memory range. 217 MemoryRangeAllocation, 218 219 /// Error from region creation 220 GuestMemoryRegion(MmapRegionError), 221 222 /// No ACPI slot available 223 NoSlotAvailable, 224 225 /// Not enough space in the hotplug RAM region 226 InsufficientHotplugRam, 227 228 /// The requested hotplug memory addition is not a valid size 229 InvalidSize, 230 231 /// Failed to create the user memory region. 232 CreateUserMemoryRegion(hypervisor::HypervisorVmError), 233 234 /// Failed to remove the user memory region. 235 RemoveUserMemoryRegion(hypervisor::HypervisorVmError), 236 237 /// Failed to EventFd. 238 EventFdFail(io::Error), 239 240 /// Eventfd write error 241 EventfdError(io::Error), 242 243 /// Failed to virtio-mem resize 244 VirtioMemResizeFail(virtio_devices::mem::Error), 245 246 /// Cannot restore VM 247 Restore(MigratableError), 248 249 /// Cannot restore VM because source URL is missing 250 RestoreMissingSourceUrl, 251 252 /// Cannot create the system allocator 253 CreateSystemAllocator, 254 255 /// Invalid SGX EPC section size 256 #[cfg(target_arch = "x86_64")] 257 EpcSectionSizeInvalid, 258 259 /// Failed allocating SGX EPC region 260 #[cfg(target_arch = "x86_64")] 261 SgxEpcRangeAllocation, 262 263 /// Failed opening SGX virtual EPC device 264 #[cfg(target_arch = "x86_64")] 265 SgxVirtEpcOpen(io::Error), 266 267 /// Failed setting the SGX virtual EPC section size 268 #[cfg(target_arch = "x86_64")] 269 SgxVirtEpcFileSetLen(io::Error), 270 271 /// Failed opening SGX provisioning device 272 #[cfg(target_arch = "x86_64")] 273 SgxProvisionOpen(io::Error), 274 275 /// Failed enabling SGX provisioning 276 #[cfg(target_arch = "x86_64")] 277 SgxEnableProvisioning(hypervisor::HypervisorVmError), 278 279 /// Failed creating a new MmapRegion instance. 280 #[cfg(target_arch = "x86_64")] 281 NewMmapRegion(vm_memory::mmap::MmapRegionError), 282 283 /// No memory zones found. 284 MissingMemoryZones, 285 286 /// Memory configuration is not valid. 287 InvalidMemoryParameters, 288 289 /// Forbidden operation. Impossible to resize guest memory if it is 290 /// backed by user defined memory regions. 291 InvalidResizeWithMemoryZones, 292 293 /// It's invalid to try applying a NUMA policy to a memory zone that is 294 /// memory mapped with MAP_SHARED. 295 InvalidSharedMemoryZoneWithHostNuma, 296 297 /// Failed applying NUMA memory policy. 298 ApplyNumaPolicy(io::Error), 299 300 /// Memory zone identifier is not unique. 301 DuplicateZoneId, 302 303 /// No virtio-mem resizing handler found. 304 MissingVirtioMemHandler, 305 306 /// Unknown memory zone. 307 UnknownMemoryZone, 308 309 /// Invalid size for resizing. Can be anything except 0. 310 InvalidHotplugSize, 311 312 /// Invalid hotplug method associated with memory zones resizing capability. 313 InvalidHotplugMethodWithMemoryZones, 314 315 /// Could not find specified memory zone identifier from hash map. 316 MissingZoneIdentifier, 317 318 /// Resizing the memory zone failed. 319 ResizeZone, 320 321 /// Guest address overflow 322 GuestAddressOverFlow, 323 324 /// Error opening snapshot file 325 SnapshotOpen(io::Error), 326 327 // Error copying snapshot into region 328 SnapshotCopy(GuestMemoryError), 329 330 /// Failed to allocate MMIO address 331 AllocateMmioAddress, 332 333 #[cfg(target_arch = "aarch64")] 334 /// Failed to create UEFI flash 335 CreateUefiFlash(HypervisorVmError), 336 337 /// Using a directory as a backing file for memory is not supported 338 DirectoryAsBackingFileForMemory, 339 340 /// Failed to stat filesystem 341 GetFileSystemBlockSize(io::Error), 342 343 /// Memory size is misaligned with default page size or its hugepage size 344 MisalignedMemorySize, 345 } 346 347 const ENABLE_FLAG: usize = 0; 348 const INSERTING_FLAG: usize = 1; 349 const REMOVING_FLAG: usize = 2; 350 const EJECT_FLAG: usize = 3; 351 352 const BASE_OFFSET_LOW: u64 = 0; 353 const BASE_OFFSET_HIGH: u64 = 0x4; 354 const LENGTH_OFFSET_LOW: u64 = 0x8; 355 const LENGTH_OFFSET_HIGH: u64 = 0xC; 356 const STATUS_OFFSET: u64 = 0x14; 357 const SELECTION_OFFSET: u64 = 0; 358 359 // The MMIO address space size is subtracted with 64k. This is done for the 360 // following reasons: 361 // - Reduce the addressable space size by at least 4k to workaround a Linux 362 // bug when the VMM allocates devices at the end of the addressable space 363 // - Windows requires the addressable space size to be 64k aligned 364 fn mmio_address_space_size(phys_bits: u8) -> u64 { 365 (1 << phys_bits) - (1 << 16) 366 } 367 368 // The `statfs` function can get information of hugetlbfs, and the hugepage size is in the 369 // `f_bsize` field. 370 // 371 // See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169 372 fn statfs_get_bsize(path: &str) -> Result<u64, Error> { 373 let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?; 374 let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit(); 375 376 // SAFETY: FFI call with a valid path and buffer 377 let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) }; 378 if ret != 0 { 379 return Err(Error::GetFileSystemBlockSize( 380 std::io::Error::last_os_error(), 381 )); 382 } 383 384 // SAFETY: `buf` is valid at this point 385 // Because this value is always positive, just convert it directly. 386 // Note that the `f_bsize` is `i64` in glibc and `u64` in musl, using `as u64` will be warned 387 // by `clippy` on musl target. To avoid the warning, there should be `as _` instead of 388 // `as u64`. 389 let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _; 390 Ok(bsize) 391 } 392 393 fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> { 394 // SAFETY: FFI call. Trivially safe. 395 let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 }; 396 397 // There is no backend file and the `hugepages` is disabled, just use system page size. 398 if zone.file.is_none() && !zone.hugepages { 399 return Ok(page_size); 400 } 401 402 // The `hugepages` is enabled and the `hugepage_size` is specified, just use it directly. 403 if zone.hugepages && zone.hugepage_size.is_some() { 404 return Ok(zone.hugepage_size.unwrap()); 405 } 406 407 // There are two scenarios here: 408 // - `hugepages` is enabled but `hugepage_size` is not specified: 409 // Call `statfs` for `/dev/hugepages` for getting the default size of hugepage 410 // - The backing file is specified: 411 // Call `statfs` for the file and get its `f_bsize`. If the value is larger than the page 412 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the 413 // value is less than or equal to the page size, just use the page size. 414 let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| { 415 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters) 416 })?; 417 418 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?); 419 420 Ok(align_size) 421 } 422 423 #[inline] 424 fn align_down<T>(val: T, align: T) -> T 425 where 426 T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>, 427 { 428 val & !(align - 1u8.into()) 429 } 430 431 #[inline] 432 fn is_aligned<T>(val: T, align: T) -> bool 433 where 434 T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq, 435 { 436 (val & (align - 1u8.into())) == 0u8.into() 437 } 438 439 impl BusDevice for MemoryManager { 440 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 441 if self.selected_slot < self.hotplug_slots.len() { 442 let state = &self.hotplug_slots[self.selected_slot]; 443 match offset { 444 BASE_OFFSET_LOW => { 445 data.copy_from_slice(&state.base.to_le_bytes()[..4]); 446 } 447 BASE_OFFSET_HIGH => { 448 data.copy_from_slice(&state.base.to_le_bytes()[4..]); 449 } 450 LENGTH_OFFSET_LOW => { 451 data.copy_from_slice(&state.length.to_le_bytes()[..4]); 452 } 453 LENGTH_OFFSET_HIGH => { 454 data.copy_from_slice(&state.length.to_le_bytes()[4..]); 455 } 456 STATUS_OFFSET => { 457 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 458 data.fill(0); 459 if state.active { 460 data[0] |= 1 << ENABLE_FLAG; 461 } 462 if state.inserting { 463 data[0] |= 1 << INSERTING_FLAG; 464 } 465 if state.removing { 466 data[0] |= 1 << REMOVING_FLAG; 467 } 468 } 469 _ => { 470 warn!( 471 "Unexpected offset for accessing memory manager device: {:#}", 472 offset 473 ); 474 } 475 } 476 } else { 477 warn!("Out of range memory slot: {}", self.selected_slot); 478 } 479 } 480 481 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 482 match offset { 483 SELECTION_OFFSET => { 484 self.selected_slot = usize::from(data[0]); 485 } 486 STATUS_OFFSET => { 487 if self.selected_slot < self.hotplug_slots.len() { 488 let state = &mut self.hotplug_slots[self.selected_slot]; 489 // The ACPI code writes back a 1 to acknowledge the insertion 490 if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting { 491 state.inserting = false; 492 } 493 // Ditto for removal 494 if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing { 495 state.removing = false; 496 } 497 // Trigger removal of "DIMM" 498 if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG { 499 warn!("Ejection of memory not currently supported"); 500 } 501 } else { 502 warn!("Out of range memory slot: {}", self.selected_slot); 503 } 504 } 505 _ => { 506 warn!( 507 "Unexpected offset for accessing memory manager device: {:#}", 508 offset 509 ); 510 } 511 }; 512 None 513 } 514 } 515 516 impl MemoryManager { 517 /// Creates all memory regions based on the available RAM ranges defined 518 /// by `ram_regions`, and based on the description of the memory zones. 519 /// In practice, this function can perform multiple memory mappings of the 520 /// same backing file if there's a hole in the address space between two 521 /// RAM ranges. 522 /// 523 /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G) 524 /// and zones containing two zones (size 1G and size 4G). 525 /// 526 /// This function will create 3 resulting memory regions: 527 /// - First one mapping entirely the first memory zone on 0-1G range 528 /// - Second one mapping partially the second memory zone on 1G-3G range 529 /// - Third one mapping partially the second memory zone on 4G-6G range 530 /// 531 /// Also, all memory regions are page-size aligned (e.g. their sizes must 532 /// be multiple of page-size), which may leave an additional hole in the 533 /// address space when hugepage is used. 534 fn create_memory_regions_from_zones( 535 ram_regions: &[(GuestAddress, usize)], 536 zones: &[MemoryZoneConfig], 537 prefault: Option<bool>, 538 thp: bool, 539 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 540 let mut zone_iter = zones.iter(); 541 let mut mem_regions = Vec::new(); 542 let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?; 543 let mut zone_align_size = memory_zone_get_align_size(zone)?; 544 let mut zone_offset = 0u64; 545 let mut memory_zones = HashMap::new(); 546 547 if !is_aligned(zone.size, zone_align_size) { 548 return Err(Error::MisalignedMemorySize); 549 } 550 551 // Add zone id to the list of memory zones. 552 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 553 554 for ram_region in ram_regions.iter() { 555 let mut ram_region_offset = 0; 556 let mut exit = false; 557 558 loop { 559 let mut ram_region_consumed = false; 560 let mut pull_next_zone = false; 561 562 let ram_region_available_size = 563 align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size); 564 if ram_region_available_size == 0 { 565 break; 566 } 567 let zone_sub_size = zone.size - zone_offset; 568 569 let file_offset = zone_offset; 570 let region_start = ram_region 571 .0 572 .checked_add(ram_region_offset) 573 .ok_or(Error::GuestAddressOverFlow)?; 574 let region_size = if zone_sub_size <= ram_region_available_size { 575 if zone_sub_size == ram_region_available_size { 576 ram_region_consumed = true; 577 } 578 579 ram_region_offset += zone_sub_size; 580 pull_next_zone = true; 581 582 zone_sub_size 583 } else { 584 zone_offset += ram_region_available_size; 585 ram_region_consumed = true; 586 587 ram_region_available_size 588 }; 589 590 info!( 591 "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}", 592 zone.id, 593 region_start.raw_value(), 594 region_size 595 ); 596 let region = MemoryManager::create_ram_region( 597 &zone.file, 598 file_offset, 599 region_start, 600 region_size as usize, 601 prefault.unwrap_or(zone.prefault), 602 zone.shared, 603 zone.hugepages, 604 zone.hugepage_size, 605 zone.host_numa_node, 606 None, 607 thp, 608 )?; 609 610 // Add region to the list of regions associated with the 611 // current memory zone. 612 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 613 memory_zone.regions.push(region.clone()); 614 } 615 616 mem_regions.push(region); 617 618 if pull_next_zone { 619 // Get the next zone and reset the offset. 620 zone_offset = 0; 621 if let Some(z) = zone_iter.next() { 622 zone = z; 623 } else { 624 exit = true; 625 break; 626 } 627 zone_align_size = memory_zone_get_align_size(zone)?; 628 if !is_aligned(zone.size, zone_align_size) { 629 return Err(Error::MisalignedMemorySize); 630 } 631 632 // Check if zone id already exist. In case it does, throw 633 // an error as we need unique identifiers. Otherwise, add 634 // the new zone id to the list of memory zones. 635 if memory_zones.contains_key(&zone.id) { 636 error!( 637 "Memory zone identifier '{}' found more than once. \ 638 It must be unique", 639 zone.id, 640 ); 641 return Err(Error::DuplicateZoneId); 642 } 643 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 644 } 645 646 if ram_region_consumed { 647 break; 648 } 649 } 650 651 if exit { 652 break; 653 } 654 } 655 656 Ok((mem_regions, memory_zones)) 657 } 658 659 // Restore both GuestMemory regions along with MemoryZone zones. 660 fn restore_memory_regions_and_zones( 661 guest_ram_mappings: &[GuestRamMapping], 662 zones_config: &[MemoryZoneConfig], 663 prefault: Option<bool>, 664 mut existing_memory_files: HashMap<u32, File>, 665 thp: bool, 666 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 667 let mut memory_regions = Vec::new(); 668 let mut memory_zones = HashMap::new(); 669 670 for zone_config in zones_config { 671 memory_zones.insert(zone_config.id.clone(), MemoryZone::default()); 672 } 673 674 for guest_ram_mapping in guest_ram_mappings { 675 for zone_config in zones_config { 676 if guest_ram_mapping.zone_id == zone_config.id { 677 let region = MemoryManager::create_ram_region( 678 if guest_ram_mapping.virtio_mem { 679 &None 680 } else { 681 &zone_config.file 682 }, 683 guest_ram_mapping.file_offset, 684 GuestAddress(guest_ram_mapping.gpa), 685 guest_ram_mapping.size as usize, 686 prefault.unwrap_or(zone_config.prefault), 687 zone_config.shared, 688 zone_config.hugepages, 689 zone_config.hugepage_size, 690 zone_config.host_numa_node, 691 existing_memory_files.remove(&guest_ram_mapping.slot), 692 thp, 693 )?; 694 memory_regions.push(Arc::clone(®ion)); 695 if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) { 696 if guest_ram_mapping.virtio_mem { 697 let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0); 698 let region_size = region.len(); 699 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 700 region, 701 virtio_device: None, 702 hotplugged_size, 703 hugepages: zone_config.hugepages, 704 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 705 }); 706 } else { 707 memory_zone.regions.push(region); 708 } 709 } 710 } 711 } 712 } 713 714 memory_regions.sort_by_key(|x| x.start_addr()); 715 716 Ok((memory_regions, memory_zones)) 717 } 718 719 fn fill_saved_regions( 720 &mut self, 721 file_path: PathBuf, 722 saved_regions: MemoryRangeTable, 723 ) -> Result<(), Error> { 724 if saved_regions.is_empty() { 725 return Ok(()); 726 } 727 728 // Open (read only) the snapshot file. 729 let mut memory_file = OpenOptions::new() 730 .read(true) 731 .open(file_path) 732 .map_err(Error::SnapshotOpen)?; 733 734 let guest_memory = self.guest_memory.memory(); 735 for range in saved_regions.regions() { 736 let mut offset: u64 = 0; 737 // Here we are manually handling the retry in case we can't write 738 // the whole region at once because we can't use the implementation 739 // from vm-memory::GuestMemory of read_exact_from() as it is not 740 // following the correct behavior. For more info about this issue 741 // see: https://github.com/rust-vmm/vm-memory/issues/174 742 loop { 743 let bytes_read = guest_memory 744 .read_volatile_from( 745 GuestAddress(range.gpa + offset), 746 &mut memory_file, 747 (range.length - offset) as usize, 748 ) 749 .map_err(Error::SnapshotCopy)?; 750 offset += bytes_read as u64; 751 752 if offset == range.length { 753 break; 754 } 755 } 756 } 757 758 Ok(()) 759 } 760 761 fn validate_memory_config( 762 config: &MemoryConfig, 763 user_provided_zones: bool, 764 ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> { 765 let mut allow_mem_hotplug = false; 766 767 if !user_provided_zones { 768 if config.zones.is_some() { 769 error!( 770 "User defined memory regions can't be provided if the \ 771 memory size is not 0" 772 ); 773 return Err(Error::InvalidMemoryParameters); 774 } 775 776 if config.hotplug_size.is_some() { 777 allow_mem_hotplug = true; 778 } 779 780 if let Some(hotplugged_size) = config.hotplugged_size { 781 if let Some(hotplug_size) = config.hotplug_size { 782 if hotplugged_size > hotplug_size { 783 error!( 784 "'hotplugged_size' {} can't be bigger than \ 785 'hotplug_size' {}", 786 hotplugged_size, hotplug_size, 787 ); 788 return Err(Error::InvalidMemoryParameters); 789 } 790 } else { 791 error!( 792 "Invalid to define 'hotplugged_size' when there is\ 793 no 'hotplug_size'" 794 ); 795 return Err(Error::InvalidMemoryParameters); 796 } 797 if config.hotplug_method == HotplugMethod::Acpi { 798 error!( 799 "Invalid to define 'hotplugged_size' with hotplug \ 800 method 'acpi'" 801 ); 802 return Err(Error::InvalidMemoryParameters); 803 } 804 } 805 806 // Create a single zone from the global memory config. This lets 807 // us reuse the codepath for user defined memory zones. 808 let zones = vec![MemoryZoneConfig { 809 id: String::from(DEFAULT_MEMORY_ZONE), 810 size: config.size, 811 file: None, 812 shared: config.shared, 813 hugepages: config.hugepages, 814 hugepage_size: config.hugepage_size, 815 host_numa_node: None, 816 hotplug_size: config.hotplug_size, 817 hotplugged_size: config.hotplugged_size, 818 prefault: config.prefault, 819 }]; 820 821 Ok((config.size, zones, allow_mem_hotplug)) 822 } else { 823 if config.zones.is_none() { 824 error!( 825 "User defined memory regions must be provided if the \ 826 memory size is 0" 827 ); 828 return Err(Error::MissingMemoryZones); 829 } 830 831 // Safe to unwrap as we checked right above there were some 832 // regions. 833 let zones = config.zones.clone().unwrap(); 834 if zones.is_empty() { 835 return Err(Error::MissingMemoryZones); 836 } 837 838 let mut total_ram_size: u64 = 0; 839 for zone in zones.iter() { 840 total_ram_size += zone.size; 841 842 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() { 843 error!( 844 "Invalid to set host NUMA policy for a memory zone \ 845 backed by a regular file and mapped as 'shared'" 846 ); 847 return Err(Error::InvalidSharedMemoryZoneWithHostNuma); 848 } 849 850 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi { 851 error!("Invalid to set ACPI hotplug method for memory zones"); 852 return Err(Error::InvalidHotplugMethodWithMemoryZones); 853 } 854 855 if let Some(hotplugged_size) = zone.hotplugged_size { 856 if let Some(hotplug_size) = zone.hotplug_size { 857 if hotplugged_size > hotplug_size { 858 error!( 859 "'hotplugged_size' {} can't be bigger than \ 860 'hotplug_size' {}", 861 hotplugged_size, hotplug_size, 862 ); 863 return Err(Error::InvalidMemoryParameters); 864 } 865 } else { 866 error!( 867 "Invalid to define 'hotplugged_size' when there is\ 868 no 'hotplug_size' for a memory zone" 869 ); 870 return Err(Error::InvalidMemoryParameters); 871 } 872 if config.hotplug_method == HotplugMethod::Acpi { 873 error!( 874 "Invalid to define 'hotplugged_size' with hotplug \ 875 method 'acpi'" 876 ); 877 return Err(Error::InvalidMemoryParameters); 878 } 879 } 880 } 881 882 Ok((total_ram_size, zones, allow_mem_hotplug)) 883 } 884 } 885 886 pub fn allocate_address_space(&mut self) -> Result<(), Error> { 887 let mut list = Vec::new(); 888 889 for (zone_id, memory_zone) in self.memory_zones.iter() { 890 let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> = 891 memory_zone 892 .regions() 893 .iter() 894 .map(|r| (r.clone(), false)) 895 .collect(); 896 897 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 898 regions.push((virtio_mem_zone.region().clone(), true)); 899 } 900 901 list.push((zone_id.clone(), regions)); 902 } 903 904 for (zone_id, regions) in list { 905 for (region, virtio_mem) in regions { 906 let slot = self.create_userspace_mapping( 907 region.start_addr().raw_value(), 908 region.len(), 909 region.as_ptr() as u64, 910 self.mergeable, 911 false, 912 self.log_dirty, 913 )?; 914 915 let file_offset = if let Some(file_offset) = region.file_offset() { 916 file_offset.start() 917 } else { 918 0 919 }; 920 921 self.guest_ram_mappings.push(GuestRamMapping { 922 gpa: region.start_addr().raw_value(), 923 size: region.len(), 924 slot, 925 zone_id: zone_id.clone(), 926 virtio_mem, 927 file_offset, 928 }); 929 self.ram_allocator 930 .allocate(Some(region.start_addr()), region.len(), None) 931 .ok_or(Error::MemoryRangeAllocation)?; 932 } 933 } 934 935 // Allocate SubRegion and Reserved address ranges. 936 for region in self.arch_mem_regions.iter() { 937 if region.r_type == RegionType::Ram { 938 // Ignore the RAM type since ranges have already been allocated 939 // based on the GuestMemory regions. 940 continue; 941 } 942 self.ram_allocator 943 .allocate( 944 Some(GuestAddress(region.base)), 945 region.size as GuestUsize, 946 None, 947 ) 948 .ok_or(Error::MemoryRangeAllocation)?; 949 } 950 951 Ok(()) 952 } 953 954 #[cfg(target_arch = "aarch64")] 955 fn add_uefi_flash(&mut self) -> Result<(), Error> { 956 // On AArch64, the UEFI binary requires a flash device at address 0. 957 // 4 MiB memory is mapped to simulate the flash. 958 let uefi_mem_slot = self.allocate_memory_slot(); 959 let uefi_region = GuestRegionMmap::new( 960 MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(), 961 arch::layout::UEFI_START, 962 ) 963 .unwrap(); 964 let uefi_mem_region = self.vm.make_user_memory_region( 965 uefi_mem_slot, 966 uefi_region.start_addr().raw_value(), 967 uefi_region.len(), 968 uefi_region.as_ptr() as u64, 969 false, 970 false, 971 ); 972 self.vm 973 .create_user_memory_region(uefi_mem_region) 974 .map_err(Error::CreateUefiFlash)?; 975 976 let uefi_flash = 977 GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap()); 978 979 self.uefi_flash = Some(uefi_flash); 980 981 Ok(()) 982 } 983 984 #[allow(clippy::too_many_arguments)] 985 pub fn new( 986 vm: Arc<dyn hypervisor::Vm>, 987 config: &MemoryConfig, 988 prefault: Option<bool>, 989 phys_bits: u8, 990 #[cfg(feature = "tdx")] tdx_enabled: bool, 991 restore_data: Option<&MemoryManagerSnapshotData>, 992 existing_memory_files: Option<HashMap<u32, File>>, 993 #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>, 994 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 995 trace_scoped!("MemoryManager::new"); 996 997 let user_provided_zones = config.size == 0; 998 999 let mmio_address_space_size = mmio_address_space_size(phys_bits); 1000 debug_assert_eq!( 1001 (((mmio_address_space_size) >> 16) << 16), 1002 mmio_address_space_size 1003 ); 1004 let start_of_platform_device_area = 1005 GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE); 1006 let end_of_device_area = start_of_platform_device_area.unchecked_sub(1); 1007 1008 let (ram_size, zones, allow_mem_hotplug) = 1009 Self::validate_memory_config(config, user_provided_zones)?; 1010 1011 let ( 1012 start_of_device_area, 1013 boot_ram, 1014 current_ram, 1015 arch_mem_regions, 1016 memory_zones, 1017 guest_memory, 1018 boot_guest_memory, 1019 hotplug_slots, 1020 next_memory_slot, 1021 selected_slot, 1022 next_hotplug_slot, 1023 ) = if let Some(data) = restore_data { 1024 let (regions, memory_zones) = Self::restore_memory_regions_and_zones( 1025 &data.guest_ram_mappings, 1026 &zones, 1027 prefault, 1028 existing_memory_files.unwrap_or_default(), 1029 config.thp, 1030 )?; 1031 let guest_memory = 1032 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?; 1033 let boot_guest_memory = guest_memory.clone(); 1034 ( 1035 GuestAddress(data.start_of_device_area), 1036 data.boot_ram, 1037 data.current_ram, 1038 data.arch_mem_regions.clone(), 1039 memory_zones, 1040 guest_memory, 1041 boot_guest_memory, 1042 data.hotplug_slots.clone(), 1043 data.next_memory_slot, 1044 data.selected_slot, 1045 data.next_hotplug_slot, 1046 ) 1047 } else { 1048 // Init guest memory 1049 let arch_mem_regions = arch::arch_memory_regions(); 1050 1051 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions 1052 .iter() 1053 .filter(|r| r.2 == RegionType::Ram) 1054 .map(|r| (r.0, r.1)) 1055 .collect(); 1056 1057 let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions 1058 .iter() 1059 .map(|(a, b, c)| ArchMemRegion { 1060 base: a.0, 1061 size: *b, 1062 r_type: *c, 1063 }) 1064 .collect(); 1065 1066 let (mem_regions, mut memory_zones) = 1067 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?; 1068 1069 let mut guest_memory = 1070 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?; 1071 1072 let boot_guest_memory = guest_memory.clone(); 1073 1074 let mut start_of_device_area = 1075 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?; 1076 1077 // Update list of memory zones for resize. 1078 for zone in zones.iter() { 1079 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 1080 if let Some(hotplug_size) = zone.hotplug_size { 1081 if hotplug_size == 0 { 1082 error!("'hotplug_size' can't be 0"); 1083 return Err(Error::InvalidHotplugSize); 1084 } 1085 1086 if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi { 1087 start_of_device_area = start_of_device_area 1088 .checked_add(hotplug_size) 1089 .ok_or(Error::GuestAddressOverFlow)?; 1090 } else { 1091 // Alignment must be "natural" i.e. same as size of block 1092 let start_addr = GuestAddress( 1093 start_of_device_area 1094 .0 1095 .div_ceil(virtio_devices::VIRTIO_MEM_ALIGN_SIZE) 1096 * virtio_devices::VIRTIO_MEM_ALIGN_SIZE, 1097 ); 1098 1099 // When `prefault` is set by vm_restore, memory manager 1100 // will create ram region with `prefault` option in 1101 // restore config rather than same option in zone 1102 let region = MemoryManager::create_ram_region( 1103 &None, 1104 0, 1105 start_addr, 1106 hotplug_size as usize, 1107 prefault.unwrap_or(zone.prefault), 1108 zone.shared, 1109 zone.hugepages, 1110 zone.hugepage_size, 1111 zone.host_numa_node, 1112 None, 1113 config.thp, 1114 )?; 1115 1116 guest_memory = guest_memory 1117 .insert_region(Arc::clone(®ion)) 1118 .map_err(Error::GuestMemory)?; 1119 1120 let hotplugged_size = zone.hotplugged_size.unwrap_or(0); 1121 let region_size = region.len(); 1122 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 1123 region, 1124 virtio_device: None, 1125 hotplugged_size, 1126 hugepages: zone.hugepages, 1127 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 1128 }); 1129 1130 start_of_device_area = start_addr 1131 .checked_add(hotplug_size) 1132 .ok_or(Error::GuestAddressOverFlow)?; 1133 } 1134 } 1135 } else { 1136 return Err(Error::MissingZoneIdentifier); 1137 } 1138 } 1139 1140 let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT); 1141 hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default); 1142 1143 ( 1144 start_of_device_area, 1145 ram_size, 1146 ram_size, 1147 arch_mem_regions, 1148 memory_zones, 1149 guest_memory, 1150 boot_guest_memory, 1151 hotplug_slots, 1152 0, 1153 0, 1154 0, 1155 ) 1156 }; 1157 1158 let guest_memory = GuestMemoryAtomic::new(guest_memory); 1159 1160 // Both MMIO and PIO address spaces start at address 0. 1161 let allocator = Arc::new(Mutex::new( 1162 SystemAllocator::new( 1163 #[cfg(target_arch = "x86_64")] 1164 { 1165 GuestAddress(0) 1166 }, 1167 #[cfg(target_arch = "x86_64")] 1168 { 1169 1 << 16 1170 }, 1171 start_of_platform_device_area, 1172 PLATFORM_DEVICE_AREA_SIZE, 1173 #[cfg(target_arch = "x86_64")] 1174 vec![GsiApic::new( 1175 X86_64_IRQ_BASE, 1176 ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, 1177 )], 1178 ) 1179 .ok_or(Error::CreateSystemAllocator)?, 1180 )); 1181 1182 #[cfg(not(feature = "tdx"))] 1183 let dynamic = true; 1184 #[cfg(feature = "tdx")] 1185 let dynamic = !tdx_enabled; 1186 1187 let acpi_address = if dynamic 1188 && config.hotplug_method == HotplugMethod::Acpi 1189 && (config.hotplug_size.unwrap_or_default() > 0) 1190 { 1191 Some( 1192 allocator 1193 .lock() 1194 .unwrap() 1195 .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None) 1196 .ok_or(Error::AllocateMmioAddress)?, 1197 ) 1198 } else { 1199 None 1200 }; 1201 1202 // If running on SGX the start of device area and RAM area may diverge but 1203 // at this point they are next to each other. 1204 let end_of_ram_area = start_of_device_area.unchecked_sub(1); 1205 let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap(); 1206 1207 let mut memory_manager = MemoryManager { 1208 boot_guest_memory, 1209 guest_memory, 1210 next_memory_slot: Arc::new(AtomicU32::new(next_memory_slot)), 1211 memory_slot_free_list: Arc::new(Mutex::new(Vec::new())), 1212 start_of_device_area, 1213 end_of_device_area, 1214 end_of_ram_area, 1215 vm, 1216 hotplug_slots, 1217 selected_slot, 1218 mergeable: config.mergeable, 1219 allocator, 1220 hotplug_method: config.hotplug_method, 1221 boot_ram, 1222 current_ram, 1223 next_hotplug_slot, 1224 shared: config.shared, 1225 hugepages: config.hugepages, 1226 hugepage_size: config.hugepage_size, 1227 prefault: config.prefault, 1228 #[cfg(target_arch = "x86_64")] 1229 sgx_epc_region: None, 1230 user_provided_zones, 1231 snapshot_memory_ranges: MemoryRangeTable::default(), 1232 memory_zones, 1233 guest_ram_mappings: Vec::new(), 1234 acpi_address, 1235 log_dirty: dynamic, // Cannot log dirty pages on a TD 1236 arch_mem_regions, 1237 ram_allocator, 1238 dynamic, 1239 #[cfg(target_arch = "aarch64")] 1240 uefi_flash: None, 1241 thp: config.thp, 1242 }; 1243 1244 #[cfg(target_arch = "aarch64")] 1245 { 1246 // For Aarch64 we cannot lazily allocate the address space like we 1247 // do for x86, because while restoring a VM from snapshot we would 1248 // need the address space to be allocated to properly restore VGIC. 1249 // And the restore of VGIC happens before we attempt to run the vCPUs 1250 // for the first time, thus we need to allocate the address space 1251 // beforehand. 1252 memory_manager.allocate_address_space()?; 1253 memory_manager.add_uefi_flash()?; 1254 } 1255 1256 #[cfg(target_arch = "x86_64")] 1257 if let Some(sgx_epc_config) = sgx_epc_config { 1258 memory_manager.setup_sgx(sgx_epc_config)?; 1259 } 1260 1261 Ok(Arc::new(Mutex::new(memory_manager))) 1262 } 1263 1264 pub fn new_from_snapshot( 1265 snapshot: &Snapshot, 1266 vm: Arc<dyn hypervisor::Vm>, 1267 config: &MemoryConfig, 1268 source_url: Option<&str>, 1269 prefault: bool, 1270 phys_bits: u8, 1271 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 1272 if let Some(source_url) = source_url { 1273 let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?; 1274 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 1275 1276 let mem_snapshot: MemoryManagerSnapshotData = 1277 snapshot.to_state().map_err(Error::Restore)?; 1278 1279 let mm = MemoryManager::new( 1280 vm, 1281 config, 1282 Some(prefault), 1283 phys_bits, 1284 #[cfg(feature = "tdx")] 1285 false, 1286 Some(&mem_snapshot), 1287 None, 1288 #[cfg(target_arch = "x86_64")] 1289 None, 1290 )?; 1291 1292 mm.lock() 1293 .unwrap() 1294 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?; 1295 1296 Ok(mm) 1297 } else { 1298 Err(Error::RestoreMissingSourceUrl) 1299 } 1300 } 1301 1302 fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> { 1303 // SAFETY: FFI call with correct arguments 1304 let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) }; 1305 1306 if res < 0 { 1307 Err(io::Error::last_os_error()) 1308 } else { 1309 Ok(res as RawFd) 1310 } 1311 } 1312 1313 fn mbind( 1314 addr: *mut u8, 1315 len: u64, 1316 mode: u32, 1317 nodemask: Vec<u64>, 1318 maxnode: u64, 1319 flags: u32, 1320 ) -> Result<(), io::Error> { 1321 // SAFETY: FFI call with correct arguments 1322 let res = unsafe { 1323 libc::syscall( 1324 libc::SYS_mbind, 1325 addr as *mut libc::c_void, 1326 len, 1327 mode, 1328 nodemask.as_ptr(), 1329 maxnode, 1330 flags, 1331 ) 1332 }; 1333 1334 if res < 0 { 1335 Err(io::Error::last_os_error()) 1336 } else { 1337 Ok(()) 1338 } 1339 } 1340 1341 fn create_anonymous_file( 1342 size: usize, 1343 hugepages: bool, 1344 hugepage_size: Option<u64>, 1345 ) -> Result<FileOffset, Error> { 1346 let fd = Self::memfd_create( 1347 &ffi::CString::new("ch_ram").unwrap(), 1348 libc::MFD_CLOEXEC 1349 | if hugepages { 1350 libc::MFD_HUGETLB 1351 | if let Some(hugepage_size) = hugepage_size { 1352 /* 1353 * From the Linux kernel: 1354 * Several system calls take a flag to request "hugetlb" huge pages. 1355 * Without further specification, these system calls will use the 1356 * system's default huge page size. If a system supports multiple 1357 * huge page sizes, the desired huge page size can be specified in 1358 * bits [26:31] of the flag arguments. The value in these 6 bits 1359 * will encode the log2 of the huge page size. 1360 */ 1361 1362 hugepage_size.trailing_zeros() << 26 1363 } else { 1364 // Use the system default huge page size 1365 0 1366 } 1367 } else { 1368 0 1369 }, 1370 ) 1371 .map_err(Error::SharedFileCreate)?; 1372 1373 // SAFETY: fd is valid 1374 let f = unsafe { File::from_raw_fd(fd) }; 1375 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?; 1376 1377 Ok(FileOffset::new(f, 0)) 1378 } 1379 1380 fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> { 1381 if backing_file.is_dir() { 1382 Err(Error::DirectoryAsBackingFileForMemory) 1383 } else { 1384 let f = OpenOptions::new() 1385 .read(true) 1386 .write(true) 1387 .open(backing_file) 1388 .map_err(Error::SharedFileCreate)?; 1389 1390 Ok(FileOffset::new(f, file_offset)) 1391 } 1392 } 1393 1394 #[allow(clippy::too_many_arguments)] 1395 pub fn create_ram_region( 1396 backing_file: &Option<PathBuf>, 1397 file_offset: u64, 1398 start_addr: GuestAddress, 1399 size: usize, 1400 prefault: bool, 1401 shared: bool, 1402 hugepages: bool, 1403 hugepage_size: Option<u64>, 1404 host_numa_node: Option<u32>, 1405 existing_memory_file: Option<File>, 1406 thp: bool, 1407 ) -> Result<Arc<GuestRegionMmap>, Error> { 1408 let mut mmap_flags = libc::MAP_NORESERVE; 1409 1410 // The duplication of mmap_flags ORing here is unfortunate but it also makes 1411 // the complexity of the handling clear. 1412 let fo = if let Some(f) = existing_memory_file { 1413 // It must be MAP_SHARED as we wouldn't already have an FD 1414 mmap_flags |= libc::MAP_SHARED; 1415 Some(FileOffset::new(f, file_offset)) 1416 } else if let Some(backing_file) = backing_file { 1417 if shared { 1418 mmap_flags |= libc::MAP_SHARED; 1419 } else { 1420 mmap_flags |= libc::MAP_PRIVATE; 1421 } 1422 Some(Self::open_backing_file(backing_file, file_offset)?) 1423 } else if shared || hugepages { 1424 // For hugepages we must also MAP_SHARED otherwise we will trigger #4805 1425 // because the MAP_PRIVATE will trigger CoW against the backing file with 1426 // the VFIO pinning 1427 mmap_flags |= libc::MAP_SHARED; 1428 Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?) 1429 } else { 1430 mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS; 1431 None 1432 }; 1433 1434 let region = GuestRegionMmap::new( 1435 MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags) 1436 .map_err(Error::GuestMemoryRegion)?, 1437 start_addr, 1438 ) 1439 .map_err(Error::GuestMemory)?; 1440 1441 // Apply NUMA policy if needed. 1442 if let Some(node) = host_numa_node { 1443 let addr = region.deref().as_ptr(); 1444 let len = region.deref().size() as u64; 1445 let mode = MPOL_BIND; 1446 let mut nodemask: Vec<u64> = Vec::new(); 1447 let flags = MPOL_MF_STRICT | MPOL_MF_MOVE; 1448 1449 // Linux is kind of buggy in the way it interprets maxnode as it 1450 // will cut off the last node. That's why we have to add 1 to what 1451 // we would consider as the proper maxnode value. 1452 let maxnode = node as u64 + 1 + 1; 1453 1454 // Allocate the right size for the vector. 1455 nodemask.resize((node as usize / 64) + 1, 0); 1456 1457 // Fill the global bitmask through the nodemask vector. 1458 let idx = (node / 64) as usize; 1459 let shift = node % 64; 1460 nodemask[idx] |= 1u64 << shift; 1461 1462 // Policies are enforced by using MPOL_MF_MOVE flag as it will 1463 // force the kernel to move all pages that might have been already 1464 // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is 1465 // used to throw an error if MPOL_MF_MOVE didn't succeed. 1466 // MPOL_BIND is the selected mode as it specifies a strict policy 1467 // that restricts memory allocation to the nodes specified in the 1468 // nodemask. 1469 Self::mbind(addr, len, mode, nodemask, maxnode, flags) 1470 .map_err(Error::ApplyNumaPolicy)?; 1471 } 1472 1473 // Prefault the region if needed, in parallel. 1474 if prefault { 1475 let page_size = 1476 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize; 1477 1478 if !is_aligned(size, page_size) { 1479 warn!( 1480 "Prefaulting memory size {} misaligned with page size {}", 1481 size, page_size 1482 ); 1483 } 1484 1485 let num_pages = size / page_size; 1486 1487 let num_threads = Self::get_prefault_num_threads(page_size, num_pages); 1488 1489 let pages_per_thread = num_pages / num_threads; 1490 let remainder = num_pages % num_threads; 1491 1492 let barrier = Arc::new(Barrier::new(num_threads)); 1493 thread::scope(|s| { 1494 let r = ®ion; 1495 for i in 0..num_threads { 1496 let barrier = Arc::clone(&barrier); 1497 s.spawn(move || { 1498 // Wait until all threads have been spawned to avoid contention 1499 // over mmap_sem between thread stack allocation and page faulting. 1500 barrier.wait(); 1501 let pages = pages_per_thread + if i < remainder { 1 } else { 0 }; 1502 let offset = 1503 page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder)); 1504 // SAFETY: FFI call with correct arguments 1505 let ret = unsafe { 1506 let addr = r.as_ptr().add(offset); 1507 libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE) 1508 }; 1509 if ret != 0 { 1510 let e = io::Error::last_os_error(); 1511 warn!("Failed to prefault pages: {}", e); 1512 } 1513 }); 1514 } 1515 }); 1516 } 1517 1518 if region.file_offset().is_none() && thp { 1519 info!( 1520 "Anonymous mapping at 0x{:x} (size = 0x{:x})", 1521 region.as_ptr() as u64, 1522 size 1523 ); 1524 // SAFETY: FFI call with correct arguments 1525 let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) }; 1526 if ret != 0 { 1527 let e = io::Error::last_os_error(); 1528 warn!("Failed to mark pages as THP eligible: {}", e); 1529 } 1530 } 1531 1532 Ok(Arc::new(region)) 1533 } 1534 1535 // Duplicate of `memory_zone_get_align_size` that does not require a `zone` 1536 fn get_prefault_align_size( 1537 backing_file: &Option<PathBuf>, 1538 hugepages: bool, 1539 hugepage_size: Option<u64>, 1540 ) -> Result<u64, Error> { 1541 // SAFETY: FFI call. Trivially safe. 1542 let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 }; 1543 match (hugepages, hugepage_size, backing_file) { 1544 (false, _, _) => Ok(page_size), 1545 (true, Some(hugepage_size), _) => Ok(hugepage_size), 1546 (true, None, _) => { 1547 // There are two scenarios here: 1548 // - `hugepages` is enabled but `hugepage_size` is not specified: 1549 // Call `statfs` for `/dev/hugepages` for getting the default size of hugepage 1550 // - The backing file is specified: 1551 // Call `statfs` for the file and get its `f_bsize`. If the value is larger than the page 1552 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the 1553 // value is less than or equal to the page size, just use the page size. 1554 let path = backing_file 1555 .as_ref() 1556 .map_or(Ok("/dev/hugepages"), |pathbuf| { 1557 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters) 1558 })?; 1559 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?); 1560 Ok(align_size) 1561 } 1562 } 1563 } 1564 1565 fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize { 1566 let mut n: usize = 1; 1567 1568 // Do not create more threads than processors available. 1569 // SAFETY: FFI call. Trivially safe. 1570 let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) }; 1571 if procs > 0 { 1572 n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT); 1573 } 1574 1575 // Do not create more threads than pages being allocated. 1576 n = std::cmp::min(n, num_pages); 1577 1578 // Do not create threads to allocate less than 64 MiB of memory. 1579 n = std::cmp::min( 1580 n, 1581 std::cmp::max(1, page_size * num_pages / (64 * (1 << 26))), 1582 ); 1583 1584 n 1585 } 1586 1587 // Update the GuestMemoryMmap with the new range 1588 fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> { 1589 let guest_memory = self 1590 .guest_memory 1591 .memory() 1592 .insert_region(region) 1593 .map_err(Error::GuestMemory)?; 1594 self.guest_memory.lock().unwrap().replace(guest_memory); 1595 1596 Ok(()) 1597 } 1598 1599 // 1600 // Calculate the start address of an area next to RAM. 1601 // 1602 // If memory hotplug is allowed, the start address needs to be aligned 1603 // (rounded-up) to 128MiB boundary. 1604 // If memory hotplug is not allowed, there is no alignment required. 1605 // And it must also start at the 64bit start. 1606 fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> { 1607 let mut start_addr = if allow_mem_hotplug { 1608 GuestAddress(mem_end.0 | ((128 << 20) - 1)) 1609 } else { 1610 mem_end 1611 }; 1612 1613 start_addr = start_addr 1614 .checked_add(1) 1615 .ok_or(Error::GuestAddressOverFlow)?; 1616 1617 if mem_end < arch::layout::MEM_32BIT_RESERVED_START { 1618 return Ok(arch::layout::RAM_64BIT_START); 1619 } 1620 1621 Ok(start_addr) 1622 } 1623 1624 pub fn add_ram_region( 1625 &mut self, 1626 start_addr: GuestAddress, 1627 size: usize, 1628 ) -> Result<Arc<GuestRegionMmap>, Error> { 1629 // Allocate memory for the region 1630 let region = MemoryManager::create_ram_region( 1631 &None, 1632 0, 1633 start_addr, 1634 size, 1635 self.prefault, 1636 self.shared, 1637 self.hugepages, 1638 self.hugepage_size, 1639 None, 1640 None, 1641 self.thp, 1642 )?; 1643 1644 // Map it into the guest 1645 let slot = self.create_userspace_mapping( 1646 region.start_addr().0, 1647 region.len(), 1648 region.as_ptr() as u64, 1649 self.mergeable, 1650 false, 1651 self.log_dirty, 1652 )?; 1653 self.guest_ram_mappings.push(GuestRamMapping { 1654 gpa: region.start_addr().raw_value(), 1655 size: region.len(), 1656 slot, 1657 zone_id: DEFAULT_MEMORY_ZONE.to_string(), 1658 virtio_mem: false, 1659 file_offset: 0, 1660 }); 1661 1662 self.add_region(Arc::clone(®ion))?; 1663 1664 Ok(region) 1665 } 1666 1667 fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> { 1668 info!("Hotplugging new RAM: {}", size); 1669 1670 // Check that there is a free slot 1671 if self.next_hotplug_slot >= HOTPLUG_COUNT { 1672 return Err(Error::NoSlotAvailable); 1673 } 1674 1675 // "Inserted" DIMM must have a size that is a multiple of 128MiB 1676 if size % (128 << 20) != 0 { 1677 return Err(Error::InvalidSize); 1678 } 1679 1680 let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?; 1681 1682 if start_addr 1683 .checked_add((size - 1).try_into().unwrap()) 1684 .unwrap() 1685 > self.end_of_ram_area 1686 { 1687 return Err(Error::InsufficientHotplugRam); 1688 } 1689 1690 let region = self.add_ram_region(start_addr, size)?; 1691 1692 // Add region to the list of regions associated with the default 1693 // memory zone. 1694 if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) { 1695 memory_zone.regions.push(Arc::clone(®ion)); 1696 } 1697 1698 // Tell the allocator 1699 self.ram_allocator 1700 .allocate(Some(start_addr), size as GuestUsize, None) 1701 .ok_or(Error::MemoryRangeAllocation)?; 1702 1703 // Update the slot so that it can be queried via the I/O port 1704 let slot = &mut self.hotplug_slots[self.next_hotplug_slot]; 1705 slot.active = true; 1706 slot.inserting = true; 1707 slot.base = region.start_addr().0; 1708 slot.length = region.len(); 1709 1710 self.next_hotplug_slot += 1; 1711 1712 Ok(region) 1713 } 1714 1715 pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 1716 self.guest_memory.clone() 1717 } 1718 1719 pub fn boot_guest_memory(&self) -> GuestMemoryMmap { 1720 self.boot_guest_memory.clone() 1721 } 1722 1723 pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> { 1724 self.allocator.clone() 1725 } 1726 1727 pub fn start_of_device_area(&self) -> GuestAddress { 1728 self.start_of_device_area 1729 } 1730 1731 pub fn end_of_device_area(&self) -> GuestAddress { 1732 self.end_of_device_area 1733 } 1734 1735 pub fn memory_slot_allocator(&mut self) -> MemorySlotAllocator { 1736 let memory_slot_free_list = Arc::clone(&self.memory_slot_free_list); 1737 let next_memory_slot = Arc::clone(&self.next_memory_slot); 1738 MemorySlotAllocator::new(next_memory_slot, memory_slot_free_list) 1739 } 1740 1741 pub fn allocate_memory_slot(&mut self) -> u32 { 1742 self.memory_slot_allocator().next_memory_slot() 1743 } 1744 1745 pub fn create_userspace_mapping( 1746 &mut self, 1747 guest_phys_addr: u64, 1748 memory_size: u64, 1749 userspace_addr: u64, 1750 mergeable: bool, 1751 readonly: bool, 1752 log_dirty: bool, 1753 ) -> Result<u32, Error> { 1754 let slot = self.allocate_memory_slot(); 1755 let mem_region = self.vm.make_user_memory_region( 1756 slot, 1757 guest_phys_addr, 1758 memory_size, 1759 userspace_addr, 1760 readonly, 1761 log_dirty, 1762 ); 1763 1764 info!( 1765 "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}", 1766 guest_phys_addr, userspace_addr, memory_size, slot 1767 ); 1768 1769 self.vm 1770 .create_user_memory_region(mem_region) 1771 .map_err(Error::CreateUserMemoryRegion)?; 1772 1773 // SAFETY: the address and size are valid since the 1774 // mmap succeeded. 1775 let ret = unsafe { 1776 libc::madvise( 1777 userspace_addr as *mut libc::c_void, 1778 memory_size as libc::size_t, 1779 libc::MADV_DONTDUMP, 1780 ) 1781 }; 1782 if ret != 0 { 1783 let e = io::Error::last_os_error(); 1784 warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e); 1785 } 1786 1787 // Mark the pages as mergeable if explicitly asked for. 1788 if mergeable { 1789 // SAFETY: the address and size are valid since the 1790 // mmap succeeded. 1791 let ret = unsafe { 1792 libc::madvise( 1793 userspace_addr as *mut libc::c_void, 1794 memory_size as libc::size_t, 1795 libc::MADV_MERGEABLE, 1796 ) 1797 }; 1798 if ret != 0 { 1799 let err = io::Error::last_os_error(); 1800 // Safe to unwrap because the error is constructed with 1801 // last_os_error(), which ensures the output will be Some(). 1802 let errno = err.raw_os_error().unwrap(); 1803 if errno == libc::EINVAL { 1804 warn!("kernel not configured with CONFIG_KSM"); 1805 } else { 1806 warn!("madvise error: {}", err); 1807 } 1808 warn!("failed to mark pages as mergeable"); 1809 } 1810 } 1811 1812 info!( 1813 "Created userspace mapping: {:x} -> {:x} {:x}", 1814 guest_phys_addr, userspace_addr, memory_size 1815 ); 1816 1817 Ok(slot) 1818 } 1819 1820 pub fn remove_userspace_mapping( 1821 &mut self, 1822 guest_phys_addr: u64, 1823 memory_size: u64, 1824 userspace_addr: u64, 1825 mergeable: bool, 1826 slot: u32, 1827 ) -> Result<(), Error> { 1828 let mem_region = self.vm.make_user_memory_region( 1829 slot, 1830 guest_phys_addr, 1831 memory_size, 1832 userspace_addr, 1833 false, /* readonly -- don't care */ 1834 false, /* log dirty */ 1835 ); 1836 1837 self.vm 1838 .remove_user_memory_region(mem_region) 1839 .map_err(Error::RemoveUserMemoryRegion)?; 1840 1841 // Mark the pages as unmergeable if there were previously marked as 1842 // mergeable. 1843 if mergeable { 1844 // SAFETY: the address and size are valid as the region was 1845 // previously advised. 1846 let ret = unsafe { 1847 libc::madvise( 1848 userspace_addr as *mut libc::c_void, 1849 memory_size as libc::size_t, 1850 libc::MADV_UNMERGEABLE, 1851 ) 1852 }; 1853 if ret != 0 { 1854 let err = io::Error::last_os_error(); 1855 // Safe to unwrap because the error is constructed with 1856 // last_os_error(), which ensures the output will be Some(). 1857 let errno = err.raw_os_error().unwrap(); 1858 if errno == libc::EINVAL { 1859 warn!("kernel not configured with CONFIG_KSM"); 1860 } else { 1861 warn!("madvise error: {}", err); 1862 } 1863 warn!("failed to mark pages as unmergeable"); 1864 } 1865 } 1866 1867 info!( 1868 "Removed userspace mapping: {:x} -> {:x} {:x}", 1869 guest_phys_addr, userspace_addr, memory_size 1870 ); 1871 1872 Ok(()) 1873 } 1874 1875 pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> { 1876 if let Some(memory_zone) = self.memory_zones.get_mut(id) { 1877 if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone { 1878 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() { 1879 virtio_mem_device 1880 .lock() 1881 .unwrap() 1882 .resize(size) 1883 .map_err(Error::VirtioMemResizeFail)?; 1884 } 1885 1886 // Keep the hotplugged_size up to date. 1887 virtio_mem_zone.hotplugged_size = size; 1888 } else { 1889 error!("Failed resizing virtio-mem region: No virtio-mem handler"); 1890 return Err(Error::MissingVirtioMemHandler); 1891 } 1892 1893 return Ok(()); 1894 } 1895 1896 error!("Failed resizing virtio-mem region: Unknown memory zone"); 1897 Err(Error::UnknownMemoryZone) 1898 } 1899 1900 /// In case this function resulted in adding a new memory region to the 1901 /// guest memory, the new region is returned to the caller. The virtio-mem 1902 /// use case never adds a new region as the whole hotpluggable memory has 1903 /// already been allocated at boot time. 1904 pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> { 1905 if self.user_provided_zones { 1906 error!( 1907 "Not allowed to resize guest memory when backed with user \ 1908 defined memory zones." 1909 ); 1910 return Err(Error::InvalidResizeWithMemoryZones); 1911 } 1912 1913 let mut region: Option<Arc<GuestRegionMmap>> = None; 1914 match self.hotplug_method { 1915 HotplugMethod::VirtioMem => { 1916 if desired_ram >= self.boot_ram { 1917 if !self.dynamic { 1918 return Ok(region); 1919 } 1920 1921 self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?; 1922 self.current_ram = desired_ram; 1923 } 1924 } 1925 HotplugMethod::Acpi => { 1926 if desired_ram > self.current_ram { 1927 if !self.dynamic { 1928 return Ok(region); 1929 } 1930 1931 region = 1932 Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?); 1933 self.current_ram = desired_ram; 1934 } 1935 } 1936 } 1937 Ok(region) 1938 } 1939 1940 pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> { 1941 if !self.user_provided_zones { 1942 error!( 1943 "Not allowed to resize guest memory zone when no zone is \ 1944 defined." 1945 ); 1946 return Err(Error::ResizeZone); 1947 } 1948 1949 self.virtio_mem_resize(id, virtio_mem_size) 1950 } 1951 1952 #[cfg(target_arch = "x86_64")] 1953 pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> { 1954 let file = OpenOptions::new() 1955 .read(true) 1956 .open("/dev/sgx_provision") 1957 .map_err(Error::SgxProvisionOpen)?; 1958 self.vm 1959 .enable_sgx_attribute(file) 1960 .map_err(Error::SgxEnableProvisioning)?; 1961 1962 // Go over each EPC section and verify its size is a 4k multiple. At 1963 // the same time, calculate the total size needed for the contiguous 1964 // EPC region. 1965 let mut epc_region_size = 0; 1966 for epc_section in sgx_epc_config.iter() { 1967 if epc_section.size == 0 { 1968 return Err(Error::EpcSectionSizeInvalid); 1969 } 1970 if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 { 1971 return Err(Error::EpcSectionSizeInvalid); 1972 } 1973 1974 epc_region_size += epc_section.size; 1975 } 1976 1977 // Place the SGX EPC region on a 4k boundary between the RAM and the device area 1978 let epc_region_start = 1979 GuestAddress(self.start_of_device_area.0.div_ceil(SGX_PAGE_SIZE) * SGX_PAGE_SIZE); 1980 1981 self.start_of_device_area = epc_region_start 1982 .checked_add(epc_region_size) 1983 .ok_or(Error::GuestAddressOverFlow)?; 1984 1985 let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize); 1986 info!( 1987 "SGX EPC region: 0x{:x} (0x{:x})", 1988 epc_region_start.0, epc_region_size 1989 ); 1990 1991 // Each section can be memory mapped into the allocated region. 1992 let mut epc_section_start = epc_region_start.raw_value(); 1993 for epc_section in sgx_epc_config.iter() { 1994 let file = OpenOptions::new() 1995 .read(true) 1996 .write(true) 1997 .open("/dev/sgx_vepc") 1998 .map_err(Error::SgxVirtEpcOpen)?; 1999 2000 let prot = PROT_READ | PROT_WRITE; 2001 let mut flags = MAP_NORESERVE | MAP_SHARED; 2002 if epc_section.prefault { 2003 flags |= MAP_POPULATE; 2004 } 2005 2006 // We can't use the vm-memory crate to perform the memory mapping 2007 // here as it would try to ensure the size of the backing file is 2008 // matching the size of the expected mapping. The /dev/sgx_vepc 2009 // device does not work that way, it provides a file descriptor 2010 // which is not matching the mapping size, as it's a just a way to 2011 // let KVM know that an EPC section is being created for the guest. 2012 // SAFETY: FFI call with correct arguments 2013 let host_addr = unsafe { 2014 libc::mmap( 2015 std::ptr::null_mut(), 2016 epc_section.size as usize, 2017 prot, 2018 flags, 2019 file.as_raw_fd(), 2020 0, 2021 ) 2022 } as u64; 2023 2024 info!( 2025 "Adding SGX EPC section: 0x{:x} (0x{:x})", 2026 epc_section_start, epc_section.size 2027 ); 2028 2029 let _mem_slot = self.create_userspace_mapping( 2030 epc_section_start, 2031 epc_section.size, 2032 host_addr, 2033 false, 2034 false, 2035 false, 2036 )?; 2037 2038 sgx_epc_region.insert( 2039 epc_section.id.clone(), 2040 SgxEpcSection::new( 2041 GuestAddress(epc_section_start), 2042 epc_section.size as GuestUsize, 2043 ), 2044 ); 2045 2046 epc_section_start += epc_section.size; 2047 } 2048 2049 self.sgx_epc_region = Some(sgx_epc_region); 2050 2051 Ok(()) 2052 } 2053 2054 #[cfg(target_arch = "x86_64")] 2055 pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> { 2056 &self.sgx_epc_region 2057 } 2058 2059 pub fn is_hardlink(f: &File) -> bool { 2060 let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit(); 2061 // SAFETY: FFI call with correct arguments 2062 let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) }; 2063 if ret != 0 { 2064 error!("Couldn't fstat the backing file"); 2065 return false; 2066 } 2067 2068 // SAFETY: stat is valid 2069 unsafe { (*stat.as_ptr()).st_nlink as usize > 0 } 2070 } 2071 2072 pub fn memory_zones(&self) -> &MemoryZones { 2073 &self.memory_zones 2074 } 2075 2076 pub fn memory_zones_mut(&mut self) -> &mut MemoryZones { 2077 &mut self.memory_zones 2078 } 2079 2080 pub fn memory_range_table( 2081 &self, 2082 snapshot: bool, 2083 ) -> std::result::Result<MemoryRangeTable, MigratableError> { 2084 let mut table = MemoryRangeTable::default(); 2085 2086 for memory_zone in self.memory_zones.values() { 2087 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 2088 table.extend(virtio_mem_zone.plugged_ranges()); 2089 } 2090 2091 for region in memory_zone.regions() { 2092 if snapshot { 2093 if let Some(file_offset) = region.file_offset() { 2094 if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED) 2095 && Self::is_hardlink(file_offset.file()) 2096 { 2097 // In this very specific case, we know the memory 2098 // region is backed by a file on the host filesystem 2099 // that can be accessed by the user, and additionally 2100 // the mapping is shared, which means that modifications 2101 // to the content are written to the actual file. 2102 // When meeting these conditions, we can skip the 2103 // copy of the memory content for this specific region, 2104 // as we can assume the user will have it saved through 2105 // the backing file already. 2106 continue; 2107 } 2108 } 2109 } 2110 2111 table.push(MemoryRange { 2112 gpa: region.start_addr().raw_value(), 2113 length: region.len(), 2114 }); 2115 } 2116 } 2117 2118 Ok(table) 2119 } 2120 2121 pub fn snapshot_data(&self) -> MemoryManagerSnapshotData { 2122 MemoryManagerSnapshotData { 2123 memory_ranges: self.snapshot_memory_ranges.clone(), 2124 guest_ram_mappings: self.guest_ram_mappings.clone(), 2125 start_of_device_area: self.start_of_device_area.0, 2126 boot_ram: self.boot_ram, 2127 current_ram: self.current_ram, 2128 arch_mem_regions: self.arch_mem_regions.clone(), 2129 hotplug_slots: self.hotplug_slots.clone(), 2130 next_memory_slot: self.next_memory_slot.load(Ordering::SeqCst), 2131 selected_slot: self.selected_slot, 2132 next_hotplug_slot: self.next_hotplug_slot, 2133 } 2134 } 2135 2136 pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> { 2137 let mut memory_slot_fds = HashMap::new(); 2138 for guest_ram_mapping in &self.guest_ram_mappings { 2139 let slot = guest_ram_mapping.slot; 2140 let guest_memory = self.guest_memory.memory(); 2141 let file = guest_memory 2142 .find_region(GuestAddress(guest_ram_mapping.gpa)) 2143 .unwrap() 2144 .file_offset() 2145 .unwrap() 2146 .file(); 2147 memory_slot_fds.insert(slot, file.as_raw_fd()); 2148 } 2149 memory_slot_fds 2150 } 2151 2152 pub fn acpi_address(&self) -> Option<GuestAddress> { 2153 self.acpi_address 2154 } 2155 2156 pub fn num_guest_ram_mappings(&self) -> u32 { 2157 self.guest_ram_mappings.len() as u32 2158 } 2159 2160 #[cfg(target_arch = "aarch64")] 2161 pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 2162 self.uefi_flash.as_ref().unwrap().clone() 2163 } 2164 2165 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2166 pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions { 2167 let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone(); 2168 mapping_sorted_by_gpa.sort_by_key(|m| m.gpa); 2169 2170 let mut mem_offset_in_elf = mem_offset; 2171 let mut ram_maps = BTreeMap::new(); 2172 for mapping in mapping_sorted_by_gpa.iter() { 2173 ram_maps.insert( 2174 mapping.gpa, 2175 CoredumpMemoryRegion { 2176 mem_offset_in_elf, 2177 mem_size: mapping.size, 2178 }, 2179 ); 2180 mem_offset_in_elf += mapping.size; 2181 } 2182 2183 CoredumpMemoryRegions { ram_maps } 2184 } 2185 2186 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2187 pub fn coredump_iterate_save_mem( 2188 &mut self, 2189 dump_state: &DumpState, 2190 ) -> std::result::Result<(), GuestDebuggableError> { 2191 let snapshot_memory_ranges = self 2192 .memory_range_table(false) 2193 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2194 2195 if snapshot_memory_ranges.is_empty() { 2196 return Ok(()); 2197 } 2198 2199 let coredump_file = dump_state.file.as_ref().unwrap(); 2200 2201 let guest_memory = self.guest_memory.memory(); 2202 let mut total_bytes: u64 = 0; 2203 2204 for range in snapshot_memory_ranges.regions() { 2205 let mut offset: u64 = 0; 2206 loop { 2207 let bytes_written = guest_memory 2208 .write_volatile_to( 2209 GuestAddress(range.gpa + offset), 2210 &mut coredump_file.as_fd(), 2211 (range.length - offset) as usize, 2212 ) 2213 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2214 offset += bytes_written as u64; 2215 total_bytes += bytes_written as u64; 2216 2217 if offset == range.length { 2218 break; 2219 } 2220 } 2221 } 2222 2223 debug!("coredump total bytes {}", total_bytes); 2224 Ok(()) 2225 } 2226 2227 pub fn receive_memory_regions<F>( 2228 &mut self, 2229 ranges: &MemoryRangeTable, 2230 fd: &mut F, 2231 ) -> std::result::Result<(), MigratableError> 2232 where 2233 F: ReadVolatile, 2234 { 2235 let guest_memory = self.guest_memory(); 2236 let mem = guest_memory.memory(); 2237 2238 for range in ranges.regions() { 2239 let mut offset: u64 = 0; 2240 // Here we are manually handling the retry in case we can't the 2241 // whole region at once because we can't use the implementation 2242 // from vm-memory::GuestMemory of read_exact_from() as it is not 2243 // following the correct behavior. For more info about this issue 2244 // see: https://github.com/rust-vmm/vm-memory/issues/174 2245 loop { 2246 let bytes_read = mem 2247 .read_volatile_from( 2248 GuestAddress(range.gpa + offset), 2249 fd, 2250 (range.length - offset) as usize, 2251 ) 2252 .map_err(|e| { 2253 MigratableError::MigrateReceive(anyhow!( 2254 "Error receiving memory from socket: {}", 2255 e 2256 )) 2257 })?; 2258 offset += bytes_read as u64; 2259 2260 if offset == range.length { 2261 break; 2262 } 2263 } 2264 } 2265 2266 Ok(()) 2267 } 2268 } 2269 2270 struct MemoryNotify { 2271 slot_id: usize, 2272 } 2273 2274 impl Aml for MemoryNotify { 2275 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2276 let object = aml::Path::new(&format!("M{:03}", self.slot_id)); 2277 aml::If::new( 2278 &aml::Equal::new(&aml::Arg(0), &self.slot_id), 2279 vec![&aml::Notify::new(&object, &aml::Arg(1))], 2280 ) 2281 .to_aml_bytes(sink) 2282 } 2283 } 2284 2285 struct MemorySlot { 2286 slot_id: usize, 2287 } 2288 2289 impl Aml for MemorySlot { 2290 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2291 aml::Device::new( 2292 format!("M{:03}", self.slot_id).as_str().into(), 2293 vec![ 2294 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")), 2295 &aml::Name::new("_UID".into(), &self.slot_id), 2296 /* 2297 _STA return value: 2298 Bit [0] – Set if the device is present. 2299 Bit [1] – Set if the device is enabled and decoding its resources. 2300 Bit [2] – Set if the device should be shown in the UI. 2301 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 2302 Bit [4] – Set if the battery is present. 2303 Bits [31:5] – Reserved (must be cleared). 2304 */ 2305 &aml::Method::new( 2306 "_STA".into(), 2307 0, 2308 false, 2309 // Call into MSTA method which will interrogate device 2310 vec![&aml::Return::new(&aml::MethodCall::new( 2311 "MSTA".into(), 2312 vec![&self.slot_id], 2313 ))], 2314 ), 2315 // Get details of memory 2316 &aml::Method::new( 2317 "_CRS".into(), 2318 0, 2319 false, 2320 // Call into MCRS which provides actual memory details 2321 vec![&aml::Return::new(&aml::MethodCall::new( 2322 "MCRS".into(), 2323 vec![&self.slot_id], 2324 ))], 2325 ), 2326 ], 2327 ) 2328 .to_aml_bytes(sink) 2329 } 2330 } 2331 2332 struct MemorySlots { 2333 slots: usize, 2334 } 2335 2336 impl Aml for MemorySlots { 2337 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2338 for slot_id in 0..self.slots { 2339 MemorySlot { slot_id }.to_aml_bytes(sink); 2340 } 2341 } 2342 } 2343 2344 struct MemoryMethods { 2345 slots: usize, 2346 } 2347 2348 impl Aml for MemoryMethods { 2349 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2350 // Add "MTFY" notification method 2351 let mut memory_notifies = Vec::new(); 2352 for slot_id in 0..self.slots { 2353 memory_notifies.push(MemoryNotify { slot_id }); 2354 } 2355 2356 let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new(); 2357 for memory_notifier in memory_notifies.iter() { 2358 memory_notifies_refs.push(memory_notifier); 2359 } 2360 2361 aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink); 2362 2363 // MSCN method 2364 aml::Method::new( 2365 "MSCN".into(), 2366 0, 2367 true, 2368 vec![ 2369 // Take lock defined above 2370 &aml::Acquire::new("MLCK".into(), 0xffff), 2371 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2372 &aml::While::new( 2373 &aml::LessThan::new(&aml::Local(0), &self.slots), 2374 vec![ 2375 // Write slot number (in first argument) to I/O port via field 2376 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)), 2377 // Check if MINS bit is set (inserting) 2378 &aml::If::new( 2379 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE), 2380 // Notify device if it is 2381 vec![ 2382 &aml::MethodCall::new( 2383 "MTFY".into(), 2384 vec![&aml::Local(0), &aml::ONE], 2385 ), 2386 // Reset MINS bit 2387 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE), 2388 ], 2389 ), 2390 // Check if MRMV bit is set 2391 &aml::If::new( 2392 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE), 2393 // Notify device if it is (with the eject constant 0x3) 2394 vec![ 2395 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]), 2396 // Reset MRMV bit 2397 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE), 2398 ], 2399 ), 2400 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 2401 ], 2402 ), 2403 // Release lock 2404 &aml::Release::new("MLCK".into()), 2405 ], 2406 ) 2407 .to_aml_bytes(sink); 2408 2409 // Memory status method 2410 aml::Method::new( 2411 "MSTA".into(), 2412 1, 2413 true, 2414 vec![ 2415 // Take lock defined above 2416 &aml::Acquire::new("MLCK".into(), 0xffff), 2417 // Write slot number (in first argument) to I/O port via field 2418 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)), 2419 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2420 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning) 2421 &aml::If::new( 2422 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE), 2423 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 2424 ), 2425 // Release lock 2426 &aml::Release::new("MLCK".into()), 2427 // Return 0 or 0xf 2428 &aml::Return::new(&aml::Local(0)), 2429 ], 2430 ) 2431 .to_aml_bytes(sink); 2432 2433 // Memory range method 2434 aml::Method::new( 2435 "MCRS".into(), 2436 1, 2437 true, 2438 vec![ 2439 // Take lock defined above 2440 &aml::Acquire::new("MLCK".into(), 0xffff), 2441 // Write slot number (in first argument) to I/O port via field 2442 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)), 2443 &aml::Name::new( 2444 "MR64".into(), 2445 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2446 aml::AddressSpaceCacheable::Cacheable, 2447 true, 2448 0x0000_0000_0000_0000u64, 2449 0xFFFF_FFFF_FFFF_FFFEu64, 2450 None, 2451 )]), 2452 ), 2453 &aml::CreateQWordField::new( 2454 &aml::Path::new("MINL"), 2455 &aml::Path::new("MR64"), 2456 &14usize, 2457 ), 2458 &aml::CreateDWordField::new( 2459 &aml::Path::new("MINH"), 2460 &aml::Path::new("MR64"), 2461 &18usize, 2462 ), 2463 &aml::CreateQWordField::new( 2464 &aml::Path::new("MAXL"), 2465 &aml::Path::new("MR64"), 2466 &22usize, 2467 ), 2468 &aml::CreateDWordField::new( 2469 &aml::Path::new("MAXH"), 2470 &aml::Path::new("MR64"), 2471 &26usize, 2472 ), 2473 &aml::CreateQWordField::new( 2474 &aml::Path::new("LENL"), 2475 &aml::Path::new("MR64"), 2476 &38usize, 2477 ), 2478 &aml::CreateDWordField::new( 2479 &aml::Path::new("LENH"), 2480 &aml::Path::new("MR64"), 2481 &42usize, 2482 ), 2483 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")), 2484 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")), 2485 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")), 2486 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")), 2487 &aml::Add::new( 2488 &aml::Path::new("MAXL"), 2489 &aml::Path::new("MINL"), 2490 &aml::Path::new("LENL"), 2491 ), 2492 &aml::Add::new( 2493 &aml::Path::new("MAXH"), 2494 &aml::Path::new("MINH"), 2495 &aml::Path::new("LENH"), 2496 ), 2497 &aml::If::new( 2498 &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")), 2499 vec![&aml::Add::new( 2500 &aml::Path::new("MAXH"), 2501 &aml::ONE, 2502 &aml::Path::new("MAXH"), 2503 )], 2504 ), 2505 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE), 2506 // Release lock 2507 &aml::Release::new("MLCK".into()), 2508 &aml::Return::new(&aml::Path::new("MR64")), 2509 ], 2510 ) 2511 .to_aml_bytes(sink) 2512 } 2513 } 2514 2515 impl Aml for MemoryManager { 2516 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2517 if let Some(acpi_address) = self.acpi_address { 2518 // Memory Hotplug Controller 2519 aml::Device::new( 2520 "_SB_.MHPC".into(), 2521 vec![ 2522 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2523 &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"), 2524 // Mutex to protect concurrent access as we write to choose slot and then read back status 2525 &aml::Mutex::new("MLCK".into(), 0), 2526 &aml::Name::new( 2527 "_CRS".into(), 2528 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2529 aml::AddressSpaceCacheable::NotCacheable, 2530 true, 2531 acpi_address.0, 2532 acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1, 2533 None, 2534 )]), 2535 ), 2536 // OpRegion and Fields map MMIO range into individual field values 2537 &aml::OpRegion::new( 2538 "MHPR".into(), 2539 aml::OpRegionSpace::SystemMemory, 2540 &(acpi_address.0 as usize), 2541 &MEMORY_MANAGER_ACPI_SIZE, 2542 ), 2543 &aml::Field::new( 2544 "MHPR".into(), 2545 aml::FieldAccessType::DWord, 2546 aml::FieldLockRule::NoLock, 2547 aml::FieldUpdateRule::Preserve, 2548 vec![ 2549 aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes) 2550 aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes) 2551 aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes) 2552 aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes) 2553 ], 2554 ), 2555 &aml::Field::new( 2556 "MHPR".into(), 2557 aml::FieldAccessType::DWord, 2558 aml::FieldLockRule::NoLock, 2559 aml::FieldUpdateRule::Preserve, 2560 vec![ 2561 aml::FieldEntry::Reserved(128), 2562 aml::FieldEntry::Named(*b"MHPX", 32), // PXM 2563 ], 2564 ), 2565 &aml::Field::new( 2566 "MHPR".into(), 2567 aml::FieldAccessType::Byte, 2568 aml::FieldLockRule::NoLock, 2569 aml::FieldUpdateRule::WriteAsZeroes, 2570 vec![ 2571 aml::FieldEntry::Reserved(160), 2572 aml::FieldEntry::Named(*b"MEN_", 1), // Enabled 2573 aml::FieldEntry::Named(*b"MINS", 1), // Inserting 2574 aml::FieldEntry::Named(*b"MRMV", 1), // Removing 2575 aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting 2576 ], 2577 ), 2578 &aml::Field::new( 2579 "MHPR".into(), 2580 aml::FieldAccessType::DWord, 2581 aml::FieldLockRule::NoLock, 2582 aml::FieldUpdateRule::Preserve, 2583 vec![ 2584 aml::FieldEntry::Named(*b"MSEL", 32), // Selector 2585 aml::FieldEntry::Named(*b"MOEV", 32), // Event 2586 aml::FieldEntry::Named(*b"MOSC", 32), // OSC 2587 ], 2588 ), 2589 &MemoryMethods { 2590 slots: self.hotplug_slots.len(), 2591 }, 2592 &MemorySlots { 2593 slots: self.hotplug_slots.len(), 2594 }, 2595 ], 2596 ) 2597 .to_aml_bytes(sink); 2598 } else { 2599 aml::Device::new( 2600 "_SB_.MHPC".into(), 2601 vec![ 2602 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2603 &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"), 2604 // Empty MSCN for GED 2605 &aml::Method::new("MSCN".into(), 0, true, vec![]), 2606 ], 2607 ) 2608 .to_aml_bytes(sink); 2609 } 2610 2611 #[cfg(target_arch = "x86_64")] 2612 { 2613 if let Some(sgx_epc_region) = &self.sgx_epc_region { 2614 let min = sgx_epc_region.start().raw_value(); 2615 let max = min + sgx_epc_region.size() - 1; 2616 // SGX EPC region 2617 aml::Device::new( 2618 "_SB_.EPC_".into(), 2619 vec![ 2620 &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")), 2621 // QWORD describing the EPC region start and size 2622 &aml::Name::new( 2623 "_CRS".into(), 2624 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2625 aml::AddressSpaceCacheable::NotCacheable, 2626 true, 2627 min, 2628 max, 2629 None, 2630 )]), 2631 ), 2632 &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]), 2633 ], 2634 ) 2635 .to_aml_bytes(sink); 2636 } 2637 } 2638 } 2639 } 2640 2641 impl Pausable for MemoryManager {} 2642 2643 #[derive(Clone, Serialize, Deserialize)] 2644 pub struct MemoryManagerSnapshotData { 2645 memory_ranges: MemoryRangeTable, 2646 guest_ram_mappings: Vec<GuestRamMapping>, 2647 start_of_device_area: u64, 2648 boot_ram: u64, 2649 current_ram: u64, 2650 arch_mem_regions: Vec<ArchMemRegion>, 2651 hotplug_slots: Vec<HotPlugState>, 2652 next_memory_slot: u32, 2653 selected_slot: usize, 2654 next_hotplug_slot: usize, 2655 } 2656 2657 impl Snapshottable for MemoryManager { 2658 fn id(&self) -> String { 2659 MEMORY_MANAGER_SNAPSHOT_ID.to_string() 2660 } 2661 2662 fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> { 2663 let memory_ranges = self.memory_range_table(true)?; 2664 2665 // Store locally this list of ranges as it will be used through the 2666 // Transportable::send() implementation. The point is to avoid the 2667 // duplication of code regarding the creation of the path for each 2668 // region. The 'snapshot' step creates the list of memory regions, 2669 // including information about the need to copy a memory region or 2670 // not. This saves the 'send' step having to go through the same 2671 // process, and instead it can directly proceed with storing the 2672 // memory range content for the ranges requiring it. 2673 self.snapshot_memory_ranges = memory_ranges; 2674 2675 Ok(Snapshot::from_data(SnapshotData::new_from_state( 2676 &self.snapshot_data(), 2677 )?)) 2678 } 2679 } 2680 2681 impl Transportable for MemoryManager { 2682 fn send( 2683 &self, 2684 _snapshot: &Snapshot, 2685 destination_url: &str, 2686 ) -> result::Result<(), MigratableError> { 2687 if self.snapshot_memory_ranges.is_empty() { 2688 return Ok(()); 2689 } 2690 2691 let mut memory_file_path = url_to_path(destination_url)?; 2692 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 2693 2694 // Create the snapshot file for the entire memory 2695 let mut memory_file = OpenOptions::new() 2696 .read(true) 2697 .write(true) 2698 .create_new(true) 2699 .open(memory_file_path) 2700 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2701 2702 let guest_memory = self.guest_memory.memory(); 2703 2704 for range in self.snapshot_memory_ranges.regions() { 2705 let mut offset: u64 = 0; 2706 // Here we are manually handling the retry in case we can't read 2707 // the whole region at once because we can't use the implementation 2708 // from vm-memory::GuestMemory of write_all_to() as it is not 2709 // following the correct behavior. For more info about this issue 2710 // see: https://github.com/rust-vmm/vm-memory/issues/174 2711 loop { 2712 let bytes_written = guest_memory 2713 .write_volatile_to( 2714 GuestAddress(range.gpa + offset), 2715 &mut memory_file, 2716 (range.length - offset) as usize, 2717 ) 2718 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2719 offset += bytes_written as u64; 2720 2721 if offset == range.length { 2722 break; 2723 } 2724 } 2725 } 2726 Ok(()) 2727 } 2728 } 2729 2730 impl Migratable for MemoryManager { 2731 // Start the dirty log in the hypervisor (kvm/mshv). 2732 // Also, reset the dirty bitmap logged by the vmm. 2733 // Just before we do a bulk copy we want to start/clear the dirty log so that 2734 // pages touched during our bulk copy are tracked. 2735 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2736 self.vm.start_dirty_log().map_err(|e| { 2737 MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e)) 2738 })?; 2739 2740 for r in self.guest_memory.memory().iter() { 2741 r.bitmap().reset(); 2742 } 2743 2744 Ok(()) 2745 } 2746 2747 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2748 self.vm.stop_dirty_log().map_err(|e| { 2749 MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e)) 2750 })?; 2751 2752 Ok(()) 2753 } 2754 2755 // Generate a table for the pages that are dirty. The dirty pages are collapsed 2756 // together in the table if they are contiguous. 2757 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2758 let mut table = MemoryRangeTable::default(); 2759 for r in &self.guest_ram_mappings { 2760 let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| { 2761 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e)) 2762 })?; 2763 let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa)) 2764 { 2765 Some(region) => { 2766 assert!(region.start_addr().raw_value() == r.gpa); 2767 assert!(region.len() == r.size); 2768 region.bitmap().get_and_reset() 2769 } 2770 None => { 2771 return Err(MigratableError::MigrateSend(anyhow!( 2772 "Error finding 'guest memory region' with address {:x}", 2773 r.gpa 2774 ))) 2775 } 2776 }; 2777 2778 let dirty_bitmap: Vec<u64> = vm_dirty_bitmap 2779 .iter() 2780 .zip(vmm_dirty_bitmap.iter()) 2781 .map(|(x, y)| x | y) 2782 .collect(); 2783 2784 let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096); 2785 2786 if sub_table.regions().is_empty() { 2787 info!("Dirty Memory Range Table is empty"); 2788 } else { 2789 info!("Dirty Memory Range Table:"); 2790 for range in sub_table.regions() { 2791 info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024); 2792 } 2793 } 2794 2795 table.extend(sub_table); 2796 } 2797 Ok(table) 2798 } 2799 } 2800