1 // Copyright © 2019 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 // 5 #[cfg(target_arch = "x86_64")] 6 use crate::config::SgxEpcConfig; 7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig}; 8 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 9 use crate::coredump::{ 10 CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError, 11 }; 12 use crate::migration::url_to_path; 13 use crate::MEMORY_MANAGER_SNAPSHOT_ID; 14 use crate::{GuestMemoryMmap, GuestRegionMmap}; 15 use acpi_tables::{aml, Aml}; 16 use anyhow::anyhow; 17 #[cfg(target_arch = "x86_64")] 18 use arch::x86_64::{SgxEpcRegion, SgxEpcSection}; 19 use arch::RegionType; 20 #[cfg(target_arch = "x86_64")] 21 use devices::ioapic; 22 #[cfg(target_arch = "aarch64")] 23 use hypervisor::HypervisorVmError; 24 use libc::_SC_NPROCESSORS_ONLN; 25 #[cfg(target_arch = "x86_64")] 26 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE}; 27 use serde::{Deserialize, Serialize}; 28 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 29 use std::collections::BTreeMap; 30 use std::collections::HashMap; 31 use std::convert::TryInto; 32 use std::fs::{File, OpenOptions}; 33 use std::io::{self}; 34 use std::ops::{BitAnd, Deref, Not, Sub}; 35 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 36 use std::os::fd::AsFd; 37 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; 38 use std::path::PathBuf; 39 use std::result; 40 use std::sync::{Arc, Barrier, Mutex}; 41 use std::{ffi, thread}; 42 use tracer::trace_scoped; 43 use versionize::{VersionMap, Versionize, VersionizeResult}; 44 use versionize_derive::Versionize; 45 use virtio_devices::BlocksState; 46 #[cfg(target_arch = "x86_64")] 47 use vm_allocator::GsiApic; 48 use vm_allocator::{AddressAllocator, SystemAllocator}; 49 use vm_device::BusDevice; 50 use vm_memory::bitmap::AtomicBitmap; 51 use vm_memory::guest_memory::FileOffset; 52 use vm_memory::{ 53 mmap::MmapRegionError, Address, Error as MmapError, GuestAddress, GuestAddressSpace, 54 GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, 55 ReadVolatile, 56 }; 57 use vm_migration::{ 58 protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable, 59 Snapshot, SnapshotData, Snapshottable, Transportable, VersionMapped, 60 }; 61 62 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18; 63 64 const DEFAULT_MEMORY_ZONE: &str = "mem0"; 65 66 const SNAPSHOT_FILENAME: &str = "memory-ranges"; 67 68 #[cfg(target_arch = "x86_64")] 69 const X86_64_IRQ_BASE: u32 = 5; 70 71 #[cfg(target_arch = "x86_64")] 72 const SGX_PAGE_SIZE: u64 = 1 << 12; 73 74 const HOTPLUG_COUNT: usize = 8; 75 76 // Memory policy constants 77 const MPOL_BIND: u32 = 2; 78 const MPOL_MF_STRICT: u32 = 1; 79 const MPOL_MF_MOVE: u32 = 1 << 1; 80 81 // Reserve 1 MiB for platform MMIO devices (e.g. 
ACPI control devices) 82 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20; 83 84 const MAX_PREFAULT_THREAD_COUNT: usize = 16; 85 86 #[derive(Clone, Default, Serialize, Deserialize, Versionize)] 87 struct HotPlugState { 88 base: u64, 89 length: u64, 90 active: bool, 91 inserting: bool, 92 removing: bool, 93 } 94 95 pub struct VirtioMemZone { 96 region: Arc<GuestRegionMmap>, 97 virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>, 98 hotplugged_size: u64, 99 hugepages: bool, 100 blocks_state: Arc<Mutex<BlocksState>>, 101 } 102 103 impl VirtioMemZone { 104 pub fn region(&self) -> &Arc<GuestRegionMmap> { 105 &self.region 106 } 107 pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) { 108 self.virtio_device = Some(virtio_device); 109 } 110 pub fn hotplugged_size(&self) -> u64 { 111 self.hotplugged_size 112 } 113 pub fn hugepages(&self) -> bool { 114 self.hugepages 115 } 116 pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> { 117 &self.blocks_state 118 } 119 pub fn plugged_ranges(&self) -> MemoryRangeTable { 120 self.blocks_state 121 .lock() 122 .unwrap() 123 .memory_ranges(self.region.start_addr().raw_value(), true) 124 } 125 } 126 127 #[derive(Default)] 128 pub struct MemoryZone { 129 regions: Vec<Arc<GuestRegionMmap>>, 130 virtio_mem_zone: Option<VirtioMemZone>, 131 } 132 133 impl MemoryZone { 134 pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> { 135 &self.regions 136 } 137 pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> { 138 &self.virtio_mem_zone 139 } 140 pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> { 141 self.virtio_mem_zone.as_mut() 142 } 143 } 144 145 pub type MemoryZones = HashMap<String, MemoryZone>; 146 147 #[derive(Clone, Serialize, Deserialize, Versionize)] 148 struct GuestRamMapping { 149 slot: u32, 150 gpa: u64, 151 size: u64, 152 zone_id: String, 153 virtio_mem: bool, 154 file_offset: u64, 155 } 156 157 #[derive(Clone, Serialize, Deserialize, Versionize)] 158 struct ArchMemRegion { 159 base: u64, 160 size: usize, 161 r_type: RegionType, 162 } 163 164 pub struct MemoryManager { 165 boot_guest_memory: GuestMemoryMmap, 166 guest_memory: GuestMemoryAtomic<GuestMemoryMmap>, 167 next_memory_slot: u32, 168 start_of_device_area: GuestAddress, 169 end_of_device_area: GuestAddress, 170 end_of_ram_area: GuestAddress, 171 pub vm: Arc<dyn hypervisor::Vm>, 172 hotplug_slots: Vec<HotPlugState>, 173 selected_slot: usize, 174 mergeable: bool, 175 allocator: Arc<Mutex<SystemAllocator>>, 176 hotplug_method: HotplugMethod, 177 boot_ram: u64, 178 current_ram: u64, 179 next_hotplug_slot: usize, 180 shared: bool, 181 hugepages: bool, 182 hugepage_size: Option<u64>, 183 prefault: bool, 184 thp: bool, 185 #[cfg(target_arch = "x86_64")] 186 sgx_epc_region: Option<SgxEpcRegion>, 187 user_provided_zones: bool, 188 snapshot_memory_ranges: MemoryRangeTable, 189 memory_zones: MemoryZones, 190 log_dirty: bool, // Enable dirty logging for created RAM regions 191 arch_mem_regions: Vec<ArchMemRegion>, 192 ram_allocator: AddressAllocator, 193 dynamic: bool, 194 195 // Keep track of calls to create_userspace_mapping() for guest RAM. 196 // This is useful for getting the dirty pages as we need to know the 197 // slots that the mapping is created in. 198 guest_ram_mappings: Vec<GuestRamMapping>, 199 200 pub acpi_address: Option<GuestAddress>, 201 #[cfg(target_arch = "aarch64")] 202 uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>, 203 } 204 205 #[derive(Debug)] 206 pub enum Error { 207 /// Failed to create shared file. 
208 SharedFileCreate(io::Error), 209 210 /// Failed to set shared file length. 211 SharedFileSetLen(io::Error), 212 213 /// Mmap backed guest memory error 214 GuestMemory(MmapError), 215 216 /// Failed to allocate a memory range. 217 MemoryRangeAllocation, 218 219 /// Error from region creation 220 GuestMemoryRegion(MmapRegionError), 221 222 /// No ACPI slot available 223 NoSlotAvailable, 224 225 /// Not enough space in the hotplug RAM region 226 InsufficientHotplugRam, 227 228 /// The requested hotplug memory addition is not a valid size 229 InvalidSize, 230 231 /// Failed to create the user memory region. 232 CreateUserMemoryRegion(hypervisor::HypervisorVmError), 233 234 /// Failed to remove the user memory region. 235 RemoveUserMemoryRegion(hypervisor::HypervisorVmError), 236 237 /// Failed to EventFd. 238 EventFdFail(io::Error), 239 240 /// Eventfd write error 241 EventfdError(io::Error), 242 243 /// Failed to virtio-mem resize 244 VirtioMemResizeFail(virtio_devices::mem::Error), 245 246 /// Cannot restore VM 247 Restore(MigratableError), 248 249 /// Cannot restore VM because source URL is missing 250 RestoreMissingSourceUrl, 251 252 /// Cannot create the system allocator 253 CreateSystemAllocator, 254 255 /// Invalid SGX EPC section size 256 #[cfg(target_arch = "x86_64")] 257 EpcSectionSizeInvalid, 258 259 /// Failed allocating SGX EPC region 260 #[cfg(target_arch = "x86_64")] 261 SgxEpcRangeAllocation, 262 263 /// Failed opening SGX virtual EPC device 264 #[cfg(target_arch = "x86_64")] 265 SgxVirtEpcOpen(io::Error), 266 267 /// Failed setting the SGX virtual EPC section size 268 #[cfg(target_arch = "x86_64")] 269 SgxVirtEpcFileSetLen(io::Error), 270 271 /// Failed opening SGX provisioning device 272 #[cfg(target_arch = "x86_64")] 273 SgxProvisionOpen(io::Error), 274 275 /// Failed enabling SGX provisioning 276 #[cfg(target_arch = "x86_64")] 277 SgxEnableProvisioning(hypervisor::HypervisorVmError), 278 279 /// Failed creating a new MmapRegion instance. 280 #[cfg(target_arch = "x86_64")] 281 NewMmapRegion(vm_memory::mmap::MmapRegionError), 282 283 /// No memory zones found. 284 MissingMemoryZones, 285 286 /// Memory configuration is not valid. 287 InvalidMemoryParameters, 288 289 /// Forbidden operation. Impossible to resize guest memory if it is 290 /// backed by user defined memory regions. 291 InvalidResizeWithMemoryZones, 292 293 /// It's invalid to try applying a NUMA policy to a memory zone that is 294 /// memory mapped with MAP_SHARED. 295 InvalidSharedMemoryZoneWithHostNuma, 296 297 /// Failed applying NUMA memory policy. 298 ApplyNumaPolicy(io::Error), 299 300 /// Memory zone identifier is not unique. 301 DuplicateZoneId, 302 303 /// No virtio-mem resizing handler found. 304 MissingVirtioMemHandler, 305 306 /// Unknown memory zone. 307 UnknownMemoryZone, 308 309 /// Invalid size for resizing. Can be anything except 0. 310 InvalidHotplugSize, 311 312 /// Invalid hotplug method associated with memory zones resizing capability. 313 InvalidHotplugMethodWithMemoryZones, 314 315 /// Could not find specified memory zone identifier from hash map. 316 MissingZoneIdentifier, 317 318 /// Resizing the memory zone failed. 
319 ResizeZone, 320 321 /// Guest address overflow 322 GuestAddressOverFlow, 323 324 /// Error opening snapshot file 325 SnapshotOpen(io::Error), 326 327 // Error copying snapshot into region 328 SnapshotCopy(GuestMemoryError), 329 330 /// Failed to allocate MMIO address 331 AllocateMmioAddress, 332 333 #[cfg(target_arch = "aarch64")] 334 /// Failed to create UEFI flash 335 CreateUefiFlash(HypervisorVmError), 336 337 /// Using a directory as a backing file for memory is not supported 338 DirectoryAsBackingFileForMemory, 339 340 /// Failed to stat filesystem 341 GetFileSystemBlockSize(io::Error), 342 343 /// Memory size is misaligned with default page size or its hugepage size 344 MisalignedMemorySize, 345 } 346 347 const ENABLE_FLAG: usize = 0; 348 const INSERTING_FLAG: usize = 1; 349 const REMOVING_FLAG: usize = 2; 350 const EJECT_FLAG: usize = 3; 351 352 const BASE_OFFSET_LOW: u64 = 0; 353 const BASE_OFFSET_HIGH: u64 = 0x4; 354 const LENGTH_OFFSET_LOW: u64 = 0x8; 355 const LENGTH_OFFSET_HIGH: u64 = 0xC; 356 const STATUS_OFFSET: u64 = 0x14; 357 const SELECTION_OFFSET: u64 = 0; 358 359 // The MMIO address space size is subtracted with 64k. This is done for the 360 // following reasons: 361 // - Reduce the addressable space size by at least 4k to workaround a Linux 362 // bug when the VMM allocates devices at the end of the addressable space 363 // - Windows requires the addressable space size to be 64k aligned 364 fn mmio_address_space_size(phys_bits: u8) -> u64 { 365 (1 << phys_bits) - (1 << 16) 366 } 367 368 // The `statfs` function can get information of hugetlbfs, and the hugepage size is in the 369 // `f_bsize` field. 370 // 371 // See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169 372 fn statfs_get_bsize(path: &str) -> Result<u64, Error> { 373 let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?; 374 let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit(); 375 376 // SAFETY: FFI call with a valid path and buffer 377 let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) }; 378 if ret != 0 { 379 return Err(Error::GetFileSystemBlockSize( 380 std::io::Error::last_os_error(), 381 )); 382 } 383 384 // SAFETY: `buf` is valid at this point 385 // Because this value is always positive, just convert it directly. 386 // Note that the `f_bsize` is `i64` in glibc and `u64` in musl, using `as u64` will be warned 387 // by `clippy` on musl target. To avoid the warning, there should be `as _` instead of 388 // `as u64`. 389 let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _; 390 Ok(bsize) 391 } 392 393 fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> { 394 // SAFETY: FFI call. Trivially safe. 395 let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 }; 396 397 // There is no backend file and the `hugepages` is disabled, just use system page size. 398 if zone.file.is_none() && !zone.hugepages { 399 return Ok(page_size); 400 } 401 402 // The `hugepages` is enabled and the `hugepage_size` is specified, just use it directly. 403 if zone.hugepages && zone.hugepage_size.is_some() { 404 return Ok(zone.hugepage_size.unwrap()); 405 } 406 407 // There are two scenarios here: 408 // - `hugepages` is enabled but `hugepage_size` is not specified: 409 // Call `statfs` for `/dev/hugepages` for getting the default size of hugepage 410 // - The backing file is specified: 411 // Call `statfs` for the file and get its `f_bsize`. 
If the value is larger than the page 412 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the 413 // value is less than or equal to the page size, just use the page size. 414 let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| { 415 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters) 416 })?; 417 418 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?); 419 420 Ok(align_size) 421 } 422 423 #[inline] 424 fn align_down<T>(val: T, align: T) -> T 425 where 426 T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>, 427 { 428 val & !(align - 1u8.into()) 429 } 430 431 #[inline] 432 fn is_aligned<T>(val: T, align: T) -> bool 433 where 434 T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq, 435 { 436 (val & (align - 1u8.into())) == 0u8.into() 437 } 438 439 impl BusDevice for MemoryManager { 440 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 441 if self.selected_slot < self.hotplug_slots.len() { 442 let state = &self.hotplug_slots[self.selected_slot]; 443 match offset { 444 BASE_OFFSET_LOW => { 445 data.copy_from_slice(&state.base.to_le_bytes()[..4]); 446 } 447 BASE_OFFSET_HIGH => { 448 data.copy_from_slice(&state.base.to_le_bytes()[4..]); 449 } 450 LENGTH_OFFSET_LOW => { 451 data.copy_from_slice(&state.length.to_le_bytes()[..4]); 452 } 453 LENGTH_OFFSET_HIGH => { 454 data.copy_from_slice(&state.length.to_le_bytes()[4..]); 455 } 456 STATUS_OFFSET => { 457 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 458 data.fill(0); 459 if state.active { 460 data[0] |= 1 << ENABLE_FLAG; 461 } 462 if state.inserting { 463 data[0] |= 1 << INSERTING_FLAG; 464 } 465 if state.removing { 466 data[0] |= 1 << REMOVING_FLAG; 467 } 468 } 469 _ => { 470 warn!( 471 "Unexpected offset for accessing memory manager device: {:#}", 472 offset 473 ); 474 } 475 } 476 } else { 477 warn!("Out of range memory slot: {}", self.selected_slot); 478 } 479 } 480 481 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 482 match offset { 483 SELECTION_OFFSET => { 484 self.selected_slot = usize::from(data[0]); 485 } 486 STATUS_OFFSET => { 487 if self.selected_slot < self.hotplug_slots.len() { 488 let state = &mut self.hotplug_slots[self.selected_slot]; 489 // The ACPI code writes back a 1 to acknowledge the insertion 490 if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting { 491 state.inserting = false; 492 } 493 // Ditto for removal 494 if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing { 495 state.removing = false; 496 } 497 // Trigger removal of "DIMM" 498 if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG { 499 warn!("Ejection of memory not currently supported"); 500 } 501 } else { 502 warn!("Out of range memory slot: {}", self.selected_slot); 503 } 504 } 505 _ => { 506 warn!( 507 "Unexpected offset for accessing memory manager device: {:#}", 508 offset 509 ); 510 } 511 }; 512 None 513 } 514 } 515 516 impl MemoryManager { 517 /// Creates all memory regions based on the available RAM ranges defined 518 /// by `ram_regions`, and based on the description of the memory zones. 519 /// In practice, this function can perform multiple memory mappings of the 520 /// same backing file if there's a hole in the address space between two 521 /// RAM ranges. 522 /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G) 523 /// and zones containing two zones (size 1G and size 4G). 
524 /// This function will create 3 resulting memory regions: 525 /// - First one mapping entirely the first memory zone on 0-1G range 526 /// - Second one mapping partially the second memory zone on 1G-3G range 527 /// - Third one mapping partially the second memory zone on 4G-6G range 528 /// Also, all memory regions are page-size aligned (e.g. their sizes must 529 /// be multiple of page-size), which may leave an additional hole in the 530 /// address space when hugepage is used. 531 fn create_memory_regions_from_zones( 532 ram_regions: &[(GuestAddress, usize)], 533 zones: &[MemoryZoneConfig], 534 prefault: Option<bool>, 535 thp: bool, 536 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 537 let mut zone_iter = zones.iter(); 538 let mut mem_regions = Vec::new(); 539 let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?; 540 let mut zone_align_size = memory_zone_get_align_size(zone)?; 541 let mut zone_offset = 0u64; 542 let mut memory_zones = HashMap::new(); 543 544 if !is_aligned(zone.size, zone_align_size) { 545 return Err(Error::MisalignedMemorySize); 546 } 547 548 // Add zone id to the list of memory zones. 549 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 550 551 for ram_region in ram_regions.iter() { 552 let mut ram_region_offset = 0; 553 let mut exit = false; 554 555 loop { 556 let mut ram_region_consumed = false; 557 let mut pull_next_zone = false; 558 559 let ram_region_available_size = 560 align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size); 561 if ram_region_available_size == 0 { 562 break; 563 } 564 let zone_sub_size = zone.size - zone_offset; 565 566 let file_offset = zone_offset; 567 let region_start = ram_region 568 .0 569 .checked_add(ram_region_offset) 570 .ok_or(Error::GuestAddressOverFlow)?; 571 let region_size = if zone_sub_size <= ram_region_available_size { 572 if zone_sub_size == ram_region_available_size { 573 ram_region_consumed = true; 574 } 575 576 ram_region_offset += zone_sub_size; 577 pull_next_zone = true; 578 579 zone_sub_size 580 } else { 581 zone_offset += ram_region_available_size; 582 ram_region_consumed = true; 583 584 ram_region_available_size 585 }; 586 587 info!( 588 "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}", 589 zone.id, 590 region_start.raw_value(), 591 region_size 592 ); 593 let region = MemoryManager::create_ram_region( 594 &zone.file, 595 file_offset, 596 region_start, 597 region_size as usize, 598 prefault.unwrap_or(zone.prefault), 599 zone.shared, 600 zone.hugepages, 601 zone.hugepage_size, 602 zone.host_numa_node, 603 None, 604 thp, 605 )?; 606 607 // Add region to the list of regions associated with the 608 // current memory zone. 609 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 610 memory_zone.regions.push(region.clone()); 611 } 612 613 mem_regions.push(region); 614 615 if pull_next_zone { 616 // Get the next zone and reset the offset. 617 zone_offset = 0; 618 if let Some(z) = zone_iter.next() { 619 zone = z; 620 } else { 621 exit = true; 622 break; 623 } 624 zone_align_size = memory_zone_get_align_size(zone)?; 625 if !is_aligned(zone.size, zone_align_size) { 626 return Err(Error::MisalignedMemorySize); 627 } 628 629 // Check if zone id already exist. In case it does, throw 630 // an error as we need unique identifiers. Otherwise, add 631 // the new zone id to the list of memory zones. 632 if memory_zones.contains_key(&zone.id) { 633 error!( 634 "Memory zone identifier '{}' found more than once. 
\ 635 It must be unique", 636 zone.id, 637 ); 638 return Err(Error::DuplicateZoneId); 639 } 640 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 641 } 642 643 if ram_region_consumed { 644 break; 645 } 646 } 647 648 if exit { 649 break; 650 } 651 } 652 653 Ok((mem_regions, memory_zones)) 654 } 655 656 // Restore both GuestMemory regions along with MemoryZone zones. 657 fn restore_memory_regions_and_zones( 658 guest_ram_mappings: &[GuestRamMapping], 659 zones_config: &[MemoryZoneConfig], 660 prefault: Option<bool>, 661 mut existing_memory_files: HashMap<u32, File>, 662 thp: bool, 663 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 664 let mut memory_regions = Vec::new(); 665 let mut memory_zones = HashMap::new(); 666 667 for zone_config in zones_config { 668 memory_zones.insert(zone_config.id.clone(), MemoryZone::default()); 669 } 670 671 for guest_ram_mapping in guest_ram_mappings { 672 for zone_config in zones_config { 673 if guest_ram_mapping.zone_id == zone_config.id { 674 let region = MemoryManager::create_ram_region( 675 &zone_config.file, 676 guest_ram_mapping.file_offset, 677 GuestAddress(guest_ram_mapping.gpa), 678 guest_ram_mapping.size as usize, 679 prefault.unwrap_or(zone_config.prefault), 680 zone_config.shared, 681 zone_config.hugepages, 682 zone_config.hugepage_size, 683 zone_config.host_numa_node, 684 existing_memory_files.remove(&guest_ram_mapping.slot), 685 thp, 686 )?; 687 memory_regions.push(Arc::clone(®ion)); 688 if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) { 689 if guest_ram_mapping.virtio_mem { 690 let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0); 691 let region_size = region.len(); 692 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 693 region, 694 virtio_device: None, 695 hotplugged_size, 696 hugepages: zone_config.hugepages, 697 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 698 }); 699 } else { 700 memory_zone.regions.push(region); 701 } 702 } 703 } 704 } 705 } 706 707 memory_regions.sort_by_key(|x| x.start_addr()); 708 709 Ok((memory_regions, memory_zones)) 710 } 711 712 fn fill_saved_regions( 713 &mut self, 714 file_path: PathBuf, 715 saved_regions: MemoryRangeTable, 716 ) -> Result<(), Error> { 717 if saved_regions.is_empty() { 718 return Ok(()); 719 } 720 721 // Open (read only) the snapshot file. 722 let mut memory_file = OpenOptions::new() 723 .read(true) 724 .open(file_path) 725 .map_err(Error::SnapshotOpen)?; 726 727 let guest_memory = self.guest_memory.memory(); 728 for range in saved_regions.regions() { 729 let mut offset: u64 = 0; 730 // Here we are manually handling the retry in case we can't write 731 // the whole region at once because we can't use the implementation 732 // from vm-memory::GuestMemory of read_exact_from() as it is not 733 // following the correct behavior. 
            // For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = guest_memory
                    .read_volatile_from(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(Error::SnapshotCopy)?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }

    fn validate_memory_config(
        config: &MemoryConfig,
        user_provided_zones: bool,
    ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
        let mut allow_mem_hotplug = false;

        if !user_provided_zones {
            if config.zones.is_some() {
                error!(
                    "User defined memory regions can't be provided if the \
                    memory size is not 0"
                );
                return Err(Error::InvalidMemoryParameters);
            }

            if config.hotplug_size.is_some() {
                allow_mem_hotplug = true;
            }

            if let Some(hotplugged_size) = config.hotplugged_size {
                if let Some(hotplug_size) = config.hotplug_size {
                    if hotplugged_size > hotplug_size {
                        error!(
                            "'hotplugged_size' {} can't be bigger than \
                            'hotplug_size' {}",
                            hotplugged_size, hotplug_size,
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                } else {
                    error!(
                        "Invalid to define 'hotplugged_size' when there is \
                        no 'hotplug_size'"
                    );
                    return Err(Error::InvalidMemoryParameters);
                }
                if config.hotplug_method == HotplugMethod::Acpi {
                    error!(
                        "Invalid to define 'hotplugged_size' with hotplug \
                        method 'acpi'"
                    );
                    return Err(Error::InvalidMemoryParameters);
                }
            }

            // Create a single zone from the global memory config. This lets
            // us reuse the codepath for user defined memory zones.
            let zones = vec![MemoryZoneConfig {
                id: String::from(DEFAULT_MEMORY_ZONE),
                size: config.size,
                file: None,
                shared: config.shared,
                hugepages: config.hugepages,
                hugepage_size: config.hugepage_size,
                host_numa_node: None,
                hotplug_size: config.hotplug_size,
                hotplugged_size: config.hotplugged_size,
                prefault: config.prefault,
            }];

            Ok((config.size, zones, allow_mem_hotplug))
        } else {
            if config.zones.is_none() {
                error!(
                    "User defined memory regions must be provided if the \
                    memory size is 0"
                );
                return Err(Error::MissingMemoryZones);
            }

            // Safe to unwrap as we checked right above that zones were
            // provided.
            let zones = config.zones.clone().unwrap();
            if zones.is_empty() {
                return Err(Error::MissingMemoryZones);
            }

            let mut total_ram_size: u64 = 0;
            for zone in zones.iter() {
                total_ram_size += zone.size;

                if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
                    error!(
                        "Invalid to set host NUMA policy for a memory zone \
                        backed by a regular file and mapped as 'shared'"
                    );
                    return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
                }

                if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
                    error!("Invalid to set ACPI hotplug method for memory zones");
                    return Err(Error::InvalidHotplugMethodWithMemoryZones);
                }

                if let Some(hotplugged_size) = zone.hotplugged_size {
                    if let Some(hotplug_size) = zone.hotplug_size {
                        if hotplugged_size > hotplug_size {
                            error!(
                                "'hotplugged_size' {} can't be bigger than \
                                'hotplug_size' {}",
                                hotplugged_size, hotplug_size,
                            );
                            return Err(Error::InvalidMemoryParameters);
                        }
                    } else {
                        error!(
                            "Invalid to define 'hotplugged_size' when there is \
                            no 'hotplug_size' for a memory zone"
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                    if config.hotplug_method == HotplugMethod::Acpi {
                        error!(
                            "Invalid to define 'hotplugged_size' with hotplug \
                            method 'acpi'"
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                }
            }

            Ok((total_ram_size, zones, allow_mem_hotplug))
        }
    }

    pub fn allocate_address_space(&mut self) -> Result<(), Error> {
        let mut list = Vec::new();

        for (zone_id, memory_zone) in self.memory_zones.iter() {
            let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
                memory_zone
                    .regions()
                    .iter()
                    .map(|r| (r.clone(), false))
                    .collect();

            if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
                regions.push((virtio_mem_zone.region().clone(), true));
            }

            list.push((zone_id.clone(), regions));
        }

        for (zone_id, regions) in list {
            for (region, virtio_mem) in regions {
                let slot = self.create_userspace_mapping(
                    region.start_addr().raw_value(),
                    region.len(),
                    region.as_ptr() as u64,
                    self.mergeable,
                    false,
                    self.log_dirty,
                )?;

                let file_offset = if let Some(file_offset) = region.file_offset() {
                    file_offset.start()
                } else {
                    0
                };

                self.guest_ram_mappings.push(GuestRamMapping {
                    gpa: region.start_addr().raw_value(),
                    size: region.len(),
                    slot,
                    zone_id: zone_id.clone(),
                    virtio_mem,
                    file_offset,
                });
                self.ram_allocator
                    .allocate(Some(region.start_addr()), region.len(), None)
                    .ok_or(Error::MemoryRangeAllocation)?;
            }
        }

        // Allocate SubRegion and Reserved address ranges.
        for region in self.arch_mem_regions.iter() {
            if region.r_type == RegionType::Ram {
                // Ignore the RAM type since ranges have already been allocated
                // based on the GuestMemory regions.
                continue;
            }
            self.ram_allocator
                .allocate(
                    Some(GuestAddress(region.base)),
                    region.size as GuestUsize,
                    None,
                )
                .ok_or(Error::MemoryRangeAllocation)?;
        }

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn add_uefi_flash(&mut self) -> Result<(), Error> {
        // On AArch64, the UEFI binary requires a flash device at address 0.
        // 4 MiB memory is mapped to simulate the flash.
951 let uefi_mem_slot = self.allocate_memory_slot(); 952 let uefi_region = GuestRegionMmap::new( 953 MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(), 954 arch::layout::UEFI_START, 955 ) 956 .unwrap(); 957 let uefi_mem_region = self.vm.make_user_memory_region( 958 uefi_mem_slot, 959 uefi_region.start_addr().raw_value(), 960 uefi_region.len(), 961 uefi_region.as_ptr() as u64, 962 false, 963 false, 964 ); 965 self.vm 966 .create_user_memory_region(uefi_mem_region) 967 .map_err(Error::CreateUefiFlash)?; 968 969 let uefi_flash = 970 GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap()); 971 972 self.uefi_flash = Some(uefi_flash); 973 974 Ok(()) 975 } 976 977 #[allow(clippy::too_many_arguments)] 978 pub fn new( 979 vm: Arc<dyn hypervisor::Vm>, 980 config: &MemoryConfig, 981 prefault: Option<bool>, 982 phys_bits: u8, 983 #[cfg(feature = "tdx")] tdx_enabled: bool, 984 restore_data: Option<&MemoryManagerSnapshotData>, 985 existing_memory_files: Option<HashMap<u32, File>>, 986 #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>, 987 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 988 trace_scoped!("MemoryManager::new"); 989 990 let user_provided_zones = config.size == 0; 991 992 let mmio_address_space_size = mmio_address_space_size(phys_bits); 993 debug_assert_eq!( 994 (((mmio_address_space_size) >> 16) << 16), 995 mmio_address_space_size 996 ); 997 let start_of_platform_device_area = 998 GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE); 999 let end_of_device_area = start_of_platform_device_area.unchecked_sub(1); 1000 1001 let (ram_size, zones, allow_mem_hotplug) = 1002 Self::validate_memory_config(config, user_provided_zones)?; 1003 1004 let ( 1005 start_of_device_area, 1006 boot_ram, 1007 current_ram, 1008 arch_mem_regions, 1009 memory_zones, 1010 guest_memory, 1011 boot_guest_memory, 1012 hotplug_slots, 1013 next_memory_slot, 1014 selected_slot, 1015 next_hotplug_slot, 1016 ) = if let Some(data) = restore_data { 1017 let (regions, memory_zones) = Self::restore_memory_regions_and_zones( 1018 &data.guest_ram_mappings, 1019 &zones, 1020 prefault, 1021 existing_memory_files.unwrap_or_default(), 1022 config.thp, 1023 )?; 1024 let guest_memory = 1025 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?; 1026 let boot_guest_memory = guest_memory.clone(); 1027 ( 1028 GuestAddress(data.start_of_device_area), 1029 data.boot_ram, 1030 data.current_ram, 1031 data.arch_mem_regions.clone(), 1032 memory_zones, 1033 guest_memory, 1034 boot_guest_memory, 1035 data.hotplug_slots.clone(), 1036 data.next_memory_slot, 1037 data.selected_slot, 1038 data.next_hotplug_slot, 1039 ) 1040 } else { 1041 // Init guest memory 1042 let arch_mem_regions = arch::arch_memory_regions(); 1043 1044 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions 1045 .iter() 1046 .filter(|r| r.2 == RegionType::Ram) 1047 .map(|r| (r.0, r.1)) 1048 .collect(); 1049 1050 let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions 1051 .iter() 1052 .map(|(a, b, c)| ArchMemRegion { 1053 base: a.0, 1054 size: *b, 1055 r_type: *c, 1056 }) 1057 .collect(); 1058 1059 let (mem_regions, mut memory_zones) = 1060 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?; 1061 1062 let mut guest_memory = 1063 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?; 1064 1065 let boot_guest_memory = guest_memory.clone(); 1066 1067 let mut start_of_device_area = 1068 
MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?; 1069 1070 // Update list of memory zones for resize. 1071 for zone in zones.iter() { 1072 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 1073 if let Some(hotplug_size) = zone.hotplug_size { 1074 if hotplug_size == 0 { 1075 error!("'hotplug_size' can't be 0"); 1076 return Err(Error::InvalidHotplugSize); 1077 } 1078 1079 if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi { 1080 start_of_device_area = start_of_device_area 1081 .checked_add(hotplug_size) 1082 .ok_or(Error::GuestAddressOverFlow)?; 1083 } else { 1084 // Alignment must be "natural" i.e. same as size of block 1085 let start_addr = GuestAddress( 1086 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE 1087 - 1) 1088 / virtio_devices::VIRTIO_MEM_ALIGN_SIZE 1089 * virtio_devices::VIRTIO_MEM_ALIGN_SIZE, 1090 ); 1091 1092 // When `prefault` is set by vm_restore, memory manager 1093 // will create ram region with `prefault` option in 1094 // restore config rather than same option in zone 1095 let region = MemoryManager::create_ram_region( 1096 &None, 1097 0, 1098 start_addr, 1099 hotplug_size as usize, 1100 prefault.unwrap_or(zone.prefault), 1101 zone.shared, 1102 zone.hugepages, 1103 zone.hugepage_size, 1104 zone.host_numa_node, 1105 None, 1106 config.thp, 1107 )?; 1108 1109 guest_memory = guest_memory 1110 .insert_region(Arc::clone(®ion)) 1111 .map_err(Error::GuestMemory)?; 1112 1113 let hotplugged_size = zone.hotplugged_size.unwrap_or(0); 1114 let region_size = region.len(); 1115 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 1116 region, 1117 virtio_device: None, 1118 hotplugged_size, 1119 hugepages: zone.hugepages, 1120 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 1121 }); 1122 1123 start_of_device_area = start_addr 1124 .checked_add(hotplug_size) 1125 .ok_or(Error::GuestAddressOverFlow)?; 1126 } 1127 } 1128 } else { 1129 return Err(Error::MissingZoneIdentifier); 1130 } 1131 } 1132 1133 let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT); 1134 hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default); 1135 1136 ( 1137 start_of_device_area, 1138 ram_size, 1139 ram_size, 1140 arch_mem_regions, 1141 memory_zones, 1142 guest_memory, 1143 boot_guest_memory, 1144 hotplug_slots, 1145 0, 1146 0, 1147 0, 1148 ) 1149 }; 1150 1151 let guest_memory = GuestMemoryAtomic::new(guest_memory); 1152 1153 // Both MMIO and PIO address spaces start at address 0. 
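        // (On x86_64, the 1 << 16 passed below is the size of the 16-bit I/O
        // port space, i.e. 65536 ports starting at port 0.)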
1154 let allocator = Arc::new(Mutex::new( 1155 SystemAllocator::new( 1156 #[cfg(target_arch = "x86_64")] 1157 { 1158 GuestAddress(0) 1159 }, 1160 #[cfg(target_arch = "x86_64")] 1161 { 1162 1 << 16 1163 }, 1164 start_of_platform_device_area, 1165 PLATFORM_DEVICE_AREA_SIZE, 1166 #[cfg(target_arch = "x86_64")] 1167 vec![GsiApic::new( 1168 X86_64_IRQ_BASE, 1169 ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, 1170 )], 1171 ) 1172 .ok_or(Error::CreateSystemAllocator)?, 1173 )); 1174 1175 #[cfg(not(feature = "tdx"))] 1176 let dynamic = true; 1177 #[cfg(feature = "tdx")] 1178 let dynamic = !tdx_enabled; 1179 1180 let acpi_address = if dynamic 1181 && config.hotplug_method == HotplugMethod::Acpi 1182 && (config.hotplug_size.unwrap_or_default() > 0) 1183 { 1184 Some( 1185 allocator 1186 .lock() 1187 .unwrap() 1188 .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None) 1189 .ok_or(Error::AllocateMmioAddress)?, 1190 ) 1191 } else { 1192 None 1193 }; 1194 1195 // If running on SGX the start of device area and RAM area may diverge but 1196 // at this point they are next to each other. 1197 let end_of_ram_area = start_of_device_area.unchecked_sub(1); 1198 let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap(); 1199 1200 let mut memory_manager = MemoryManager { 1201 boot_guest_memory, 1202 guest_memory, 1203 next_memory_slot, 1204 start_of_device_area, 1205 end_of_device_area, 1206 end_of_ram_area, 1207 vm, 1208 hotplug_slots, 1209 selected_slot, 1210 mergeable: config.mergeable, 1211 allocator, 1212 hotplug_method: config.hotplug_method, 1213 boot_ram, 1214 current_ram, 1215 next_hotplug_slot, 1216 shared: config.shared, 1217 hugepages: config.hugepages, 1218 hugepage_size: config.hugepage_size, 1219 prefault: config.prefault, 1220 #[cfg(target_arch = "x86_64")] 1221 sgx_epc_region: None, 1222 user_provided_zones, 1223 snapshot_memory_ranges: MemoryRangeTable::default(), 1224 memory_zones, 1225 guest_ram_mappings: Vec::new(), 1226 acpi_address, 1227 log_dirty: dynamic, // Cannot log dirty pages on a TD 1228 arch_mem_regions, 1229 ram_allocator, 1230 dynamic, 1231 #[cfg(target_arch = "aarch64")] 1232 uefi_flash: None, 1233 thp: config.thp, 1234 }; 1235 1236 #[cfg(target_arch = "aarch64")] 1237 { 1238 // For Aarch64 we cannot lazily allocate the address space like we 1239 // do for x86, because while restoring a VM from snapshot we would 1240 // need the address space to be allocated to properly restore VGIC. 1241 // And the restore of VGIC happens before we attempt to run the vCPUs 1242 // for the first time, thus we need to allocate the address space 1243 // beforehand. 
1244 memory_manager.allocate_address_space()?; 1245 memory_manager.add_uefi_flash()?; 1246 } 1247 1248 #[cfg(target_arch = "x86_64")] 1249 if let Some(sgx_epc_config) = sgx_epc_config { 1250 memory_manager.setup_sgx(sgx_epc_config)?; 1251 } 1252 1253 Ok(Arc::new(Mutex::new(memory_manager))) 1254 } 1255 1256 pub fn new_from_snapshot( 1257 snapshot: &Snapshot, 1258 vm: Arc<dyn hypervisor::Vm>, 1259 config: &MemoryConfig, 1260 source_url: Option<&str>, 1261 prefault: bool, 1262 phys_bits: u8, 1263 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 1264 if let Some(source_url) = source_url { 1265 let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?; 1266 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 1267 1268 let mem_snapshot: MemoryManagerSnapshotData = 1269 snapshot.to_versioned_state().map_err(Error::Restore)?; 1270 1271 let mm = MemoryManager::new( 1272 vm, 1273 config, 1274 Some(prefault), 1275 phys_bits, 1276 #[cfg(feature = "tdx")] 1277 false, 1278 Some(&mem_snapshot), 1279 None, 1280 #[cfg(target_arch = "x86_64")] 1281 None, 1282 )?; 1283 1284 mm.lock() 1285 .unwrap() 1286 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?; 1287 1288 Ok(mm) 1289 } else { 1290 Err(Error::RestoreMissingSourceUrl) 1291 } 1292 } 1293 1294 fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> { 1295 // SAFETY: FFI call with correct arguments 1296 let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) }; 1297 1298 if res < 0 { 1299 Err(io::Error::last_os_error()) 1300 } else { 1301 Ok(res as RawFd) 1302 } 1303 } 1304 1305 fn mbind( 1306 addr: *mut u8, 1307 len: u64, 1308 mode: u32, 1309 nodemask: Vec<u64>, 1310 maxnode: u64, 1311 flags: u32, 1312 ) -> Result<(), io::Error> { 1313 // SAFETY: FFI call with correct arguments 1314 let res = unsafe { 1315 libc::syscall( 1316 libc::SYS_mbind, 1317 addr as *mut libc::c_void, 1318 len, 1319 mode, 1320 nodemask.as_ptr(), 1321 maxnode, 1322 flags, 1323 ) 1324 }; 1325 1326 if res < 0 { 1327 Err(io::Error::last_os_error()) 1328 } else { 1329 Ok(()) 1330 } 1331 } 1332 1333 fn create_anonymous_file( 1334 size: usize, 1335 hugepages: bool, 1336 hugepage_size: Option<u64>, 1337 ) -> Result<FileOffset, Error> { 1338 let fd = Self::memfd_create( 1339 &ffi::CString::new("ch_ram").unwrap(), 1340 libc::MFD_CLOEXEC 1341 | if hugepages { 1342 libc::MFD_HUGETLB 1343 | if let Some(hugepage_size) = hugepage_size { 1344 /* 1345 * From the Linux kernel: 1346 * Several system calls take a flag to request "hugetlb" huge pages. 1347 * Without further specification, these system calls will use the 1348 * system's default huge page size. If a system supports multiple 1349 * huge page sizes, the desired huge page size can be specified in 1350 * bits [26:31] of the flag arguments. The value in these 6 bits 1351 * will encode the log2 of the huge page size. 
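                         * For example, a 2 MiB huge page has log2(2 MiB) = 21,
                         * so the flag value is 21 << 26, which is what
                         * hugepage_size.trailing_zeros() << 26 below computes.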
1352 */ 1353 1354 hugepage_size.trailing_zeros() << 26 1355 } else { 1356 // Use the system default huge page size 1357 0 1358 } 1359 } else { 1360 0 1361 }, 1362 ) 1363 .map_err(Error::SharedFileCreate)?; 1364 1365 // SAFETY: fd is valid 1366 let f = unsafe { File::from_raw_fd(fd) }; 1367 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?; 1368 1369 Ok(FileOffset::new(f, 0)) 1370 } 1371 1372 fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> { 1373 if backing_file.is_dir() { 1374 Err(Error::DirectoryAsBackingFileForMemory) 1375 } else { 1376 let f = OpenOptions::new() 1377 .read(true) 1378 .write(true) 1379 .open(backing_file) 1380 .map_err(Error::SharedFileCreate)?; 1381 1382 Ok(FileOffset::new(f, file_offset)) 1383 } 1384 } 1385 1386 #[allow(clippy::too_many_arguments)] 1387 pub fn create_ram_region( 1388 backing_file: &Option<PathBuf>, 1389 file_offset: u64, 1390 start_addr: GuestAddress, 1391 size: usize, 1392 prefault: bool, 1393 shared: bool, 1394 hugepages: bool, 1395 hugepage_size: Option<u64>, 1396 host_numa_node: Option<u32>, 1397 existing_memory_file: Option<File>, 1398 thp: bool, 1399 ) -> Result<Arc<GuestRegionMmap>, Error> { 1400 let mut mmap_flags = libc::MAP_NORESERVE; 1401 1402 // The duplication of mmap_flags ORing here is unfortunate but it also makes 1403 // the complexity of the handling clear. 1404 let fo = if let Some(f) = existing_memory_file { 1405 // It must be MAP_SHARED as we wouldn't already have an FD 1406 mmap_flags |= libc::MAP_SHARED; 1407 Some(FileOffset::new(f, file_offset)) 1408 } else if let Some(backing_file) = backing_file { 1409 if shared { 1410 mmap_flags |= libc::MAP_SHARED; 1411 } else { 1412 mmap_flags |= libc::MAP_PRIVATE; 1413 } 1414 Some(Self::open_backing_file(backing_file, file_offset)?) 1415 } else if shared || hugepages { 1416 // For hugepages we must also MAP_SHARED otherwise we will trigger #4805 1417 // because the MAP_PRIVATE will trigger CoW against the backing file with 1418 // the VFIO pinning 1419 mmap_flags |= libc::MAP_SHARED; 1420 Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?) 1421 } else { 1422 mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS; 1423 None 1424 }; 1425 1426 let region = GuestRegionMmap::new( 1427 MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags) 1428 .map_err(Error::GuestMemoryRegion)?, 1429 start_addr, 1430 ) 1431 .map_err(Error::GuestMemory)?; 1432 1433 // Apply NUMA policy if needed. 1434 if let Some(node) = host_numa_node { 1435 let addr = region.deref().as_ptr(); 1436 let len = region.deref().size() as u64; 1437 let mode = MPOL_BIND; 1438 let mut nodemask: Vec<u64> = Vec::new(); 1439 let flags = MPOL_MF_STRICT | MPOL_MF_MOVE; 1440 1441 // Linux is kind of buggy in the way it interprets maxnode as it 1442 // will cut off the last node. That's why we have to add 1 to what 1443 // we would consider as the proper maxnode value. 1444 let maxnode = node as u64 + 1 + 1; 1445 1446 // Allocate the right size for the vector. 1447 nodemask.resize((node as usize / 64) + 1, 0); 1448 1449 // Fill the global bitmask through the nodemask vector. 1450 let idx = (node / 64) as usize; 1451 let shift = node % 64; 1452 nodemask[idx] |= 1u64 << shift; 1453 1454 // Policies are enforced by using MPOL_MF_MOVE flag as it will 1455 // force the kernel to move all pages that might have been already 1456 // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is 1457 // used to throw an error if MPOL_MF_MOVE didn't succeed. 
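            // As a worked example: for host node 1, the call below passes
            // nodemask = [0b10] (bit 1 set) and maxnode = 1 + 1 + 1 = 3.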
1458 // MPOL_BIND is the selected mode as it specifies a strict policy 1459 // that restricts memory allocation to the nodes specified in the 1460 // nodemask. 1461 Self::mbind(addr, len, mode, nodemask, maxnode, flags) 1462 .map_err(Error::ApplyNumaPolicy)?; 1463 } 1464 1465 // Prefault the region if needed, in parallel. 1466 if prefault { 1467 let page_size = 1468 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize; 1469 1470 if !is_aligned(size, page_size) { 1471 warn!( 1472 "Prefaulting memory size {} misaligned with page size {}", 1473 size, page_size 1474 ); 1475 } 1476 1477 let num_pages = size / page_size; 1478 1479 let num_threads = Self::get_prefault_num_threads(page_size, num_pages); 1480 1481 let pages_per_thread = num_pages / num_threads; 1482 let remainder = num_pages % num_threads; 1483 1484 let barrier = Arc::new(Barrier::new(num_threads)); 1485 thread::scope(|s| { 1486 let r = ®ion; 1487 for i in 0..num_threads { 1488 let barrier = Arc::clone(&barrier); 1489 s.spawn(move || { 1490 // Wait until all threads have been spawned to avoid contention 1491 // over mmap_sem between thread stack allocation and page faulting. 1492 barrier.wait(); 1493 let pages = pages_per_thread + if i < remainder { 1 } else { 0 }; 1494 let offset = 1495 page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder)); 1496 // SAFETY: FFI call with correct arguments 1497 let ret = unsafe { 1498 let addr = r.as_ptr().add(offset); 1499 libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE) 1500 }; 1501 if ret != 0 { 1502 let e = io::Error::last_os_error(); 1503 warn!("Failed to prefault pages: {}", e); 1504 } 1505 }); 1506 } 1507 }); 1508 } 1509 1510 if region.file_offset().is_none() && thp { 1511 info!( 1512 "Anonymous mapping at 0x{:x} (size = 0x{:x})", 1513 region.as_ptr() as u64, 1514 size 1515 ); 1516 // SAFETY: FFI call with correct arguments 1517 let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) }; 1518 if ret != 0 { 1519 let e = io::Error::last_os_error(); 1520 warn!("Failed to mark pages as THP eligible: {}", e); 1521 } 1522 } 1523 1524 Ok(Arc::new(region)) 1525 } 1526 1527 // Duplicate of `memory_zone_get_align_size` that does not require a `zone` 1528 fn get_prefault_align_size( 1529 backing_file: &Option<PathBuf>, 1530 hugepages: bool, 1531 hugepage_size: Option<u64>, 1532 ) -> Result<u64, Error> { 1533 // SAFETY: FFI call. Trivially safe. 1534 let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 }; 1535 match (hugepages, hugepage_size, backing_file) { 1536 (false, _, _) => Ok(page_size), 1537 (true, Some(hugepage_size), _) => Ok(hugepage_size), 1538 (true, None, _) => { 1539 // There are two scenarios here: 1540 // - `hugepages` is enabled but `hugepage_size` is not specified: 1541 // Call `statfs` for `/dev/hugepages` for getting the default size of hugepage 1542 // - The backing file is specified: 1543 // Call `statfs` for the file and get its `f_bsize`. If the value is larger than the page 1544 // size of normal page, just use the `f_bsize` because the file is in a hugetlbfs. If the 1545 // value is less than or equal to the page size, just use the page size. 
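                // For instance, a file on a hugetlbfs mount with 2 MiB pages
                // reports f_bsize = 2 MiB, which wins over the 4 KiB base page
                // size, while regular filesystems typically report a block size
                // no larger than the base page size, so the page size is used.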
1546 let path = backing_file 1547 .as_ref() 1548 .map_or(Ok("/dev/hugepages"), |pathbuf| { 1549 pathbuf.to_str().ok_or(Error::InvalidMemoryParameters) 1550 })?; 1551 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?); 1552 Ok(align_size) 1553 } 1554 } 1555 } 1556 1557 fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize { 1558 let mut n: usize = 1; 1559 1560 // Do not create more threads than processors available. 1561 // SAFETY: FFI call. Trivially safe. 1562 let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) }; 1563 if procs > 0 { 1564 n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT); 1565 } 1566 1567 // Do not create more threads than pages being allocated. 1568 n = std::cmp::min(n, num_pages); 1569 1570 // Do not create threads to allocate less than 64 MiB of memory. 1571 n = std::cmp::min( 1572 n, 1573 std::cmp::max(1, page_size * num_pages / (64 * (1 << 26))), 1574 ); 1575 1576 n 1577 } 1578 1579 // Update the GuestMemoryMmap with the new range 1580 fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> { 1581 let guest_memory = self 1582 .guest_memory 1583 .memory() 1584 .insert_region(region) 1585 .map_err(Error::GuestMemory)?; 1586 self.guest_memory.lock().unwrap().replace(guest_memory); 1587 1588 Ok(()) 1589 } 1590 1591 // 1592 // Calculate the start address of an area next to RAM. 1593 // 1594 // If memory hotplug is allowed, the start address needs to be aligned 1595 // (rounded-up) to 128MiB boundary. 1596 // If memory hotplug is not allowed, there is no alignment required. 1597 // And it must also start at the 64bit start. 1598 fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> { 1599 let mut start_addr = if allow_mem_hotplug { 1600 GuestAddress(mem_end.0 | ((128 << 20) - 1)) 1601 } else { 1602 mem_end 1603 }; 1604 1605 start_addr = start_addr 1606 .checked_add(1) 1607 .ok_or(Error::GuestAddressOverFlow)?; 1608 1609 if mem_end < arch::layout::MEM_32BIT_RESERVED_START { 1610 return Ok(arch::layout::RAM_64BIT_START); 1611 } 1612 1613 Ok(start_addr) 1614 } 1615 1616 pub fn add_ram_region( 1617 &mut self, 1618 start_addr: GuestAddress, 1619 size: usize, 1620 ) -> Result<Arc<GuestRegionMmap>, Error> { 1621 // Allocate memory for the region 1622 let region = MemoryManager::create_ram_region( 1623 &None, 1624 0, 1625 start_addr, 1626 size, 1627 self.prefault, 1628 self.shared, 1629 self.hugepages, 1630 self.hugepage_size, 1631 None, 1632 None, 1633 self.thp, 1634 )?; 1635 1636 // Map it into the guest 1637 let slot = self.create_userspace_mapping( 1638 region.start_addr().0, 1639 region.len(), 1640 region.as_ptr() as u64, 1641 self.mergeable, 1642 false, 1643 self.log_dirty, 1644 )?; 1645 self.guest_ram_mappings.push(GuestRamMapping { 1646 gpa: region.start_addr().raw_value(), 1647 size: region.len(), 1648 slot, 1649 zone_id: DEFAULT_MEMORY_ZONE.to_string(), 1650 virtio_mem: false, 1651 file_offset: 0, 1652 }); 1653 1654 self.add_region(Arc::clone(®ion))?; 1655 1656 Ok(region) 1657 } 1658 1659 fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> { 1660 info!("Hotplugging new RAM: {}", size); 1661 1662 // Check that there is a free slot 1663 if self.next_hotplug_slot >= HOTPLUG_COUNT { 1664 return Err(Error::NoSlotAvailable); 1665 } 1666 1667 // "Inserted" DIMM must have a size that is a multiple of 128MiB 1668 if size % (128 << 20) != 0 { 1669 return Err(Error::InvalidSize); 1670 } 1671 1672 let start_addr = 
            MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;

        if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area {
            return Err(Error::InsufficientHotplugRam);
        }

        let region = self.add_ram_region(start_addr, size)?;

        // Add region to the list of regions associated with the default
        // memory zone.
        if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
            memory_zone.regions.push(Arc::clone(&region));
        }

        // Tell the allocator
        self.ram_allocator
            .allocate(Some(start_addr), size as GuestUsize, None)
            .ok_or(Error::MemoryRangeAllocation)?;

        // Update the slot so that it can be queried via the I/O port
        let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
        slot.active = true;
        slot.inserting = true;
        slot.base = region.start_addr().0;
        slot.length = region.len();

        self.next_hotplug_slot += 1;

        Ok(region)
    }

    pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
        self.guest_memory.clone()
    }

    pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
        self.boot_guest_memory.clone()
    }

    pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
        self.allocator.clone()
    }

    pub fn start_of_device_area(&self) -> GuestAddress {
        self.start_of_device_area
    }

    pub fn end_of_device_area(&self) -> GuestAddress {
        self.end_of_device_area
    }

    pub fn allocate_memory_slot(&mut self) -> u32 {
        let slot_id = self.next_memory_slot;
        self.next_memory_slot += 1;
        slot_id
    }

    pub fn create_userspace_mapping(
        &mut self,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        mergeable: bool,
        readonly: bool,
        log_dirty: bool,
    ) -> Result<u32, Error> {
        let slot = self.allocate_memory_slot();
        let mem_region = self.vm.make_user_memory_region(
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            readonly,
            log_dirty,
        );

        info!(
            "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
            guest_phys_addr, userspace_addr, memory_size, slot
        );

        self.vm
            .create_user_memory_region(mem_region)
            .map_err(Error::CreateUserMemoryRegion)?;

        // SAFETY: the address and size are valid since the
        // mmap succeeded.
        let ret = unsafe {
            libc::madvise(
                userspace_addr as *mut libc::c_void,
                memory_size as libc::size_t,
                libc::MADV_DONTDUMP,
            )
        };
        if ret != 0 {
            let e = io::Error::last_os_error();
            warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e);
        }

        // Mark the pages as mergeable if explicitly asked for.
        if mergeable {
            // SAFETY: the address and size are valid since the
            // mmap succeeded.
            let ret = unsafe {
                libc::madvise(
                    userspace_addr as *mut libc::c_void,
                    memory_size as libc::size_t,
                    libc::MADV_MERGEABLE,
                )
            };
            if ret != 0 {
                let err = io::Error::last_os_error();
                // Safe to unwrap because the error is constructed with
                // last_os_error(), which ensures the output will be Some().
                let errno = err.raw_os_error().unwrap();
                if errno == libc::EINVAL {
                    warn!("kernel not configured with CONFIG_KSM");
                } else {
                    warn!("madvise error: {}", err);
                }
                warn!("failed to mark pages as mergeable");
            }
        }

        info!(
            "Created userspace mapping: {:x} -> {:x} {:x}",
            guest_phys_addr, userspace_addr, memory_size
        );

        Ok(slot)
    }

    pub fn remove_userspace_mapping(
        &mut self,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        mergeable: bool,
        slot: u32,
    ) -> Result<(), Error> {
        let mem_region = self.vm.make_user_memory_region(
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            false, /* readonly -- don't care */
            false, /* log dirty */
        );

        self.vm
            .remove_user_memory_region(mem_region)
            .map_err(Error::RemoveUserMemoryRegion)?;

        // Mark the pages as unmergeable if they were previously marked as
        // mergeable.
        if mergeable {
            // SAFETY: the address and size are valid as the region was
            // previously advised.
            let ret = unsafe {
                libc::madvise(
                    userspace_addr as *mut libc::c_void,
                    memory_size as libc::size_t,
                    libc::MADV_UNMERGEABLE,
                )
            };
            if ret != 0 {
                let err = io::Error::last_os_error();
                // Safe to unwrap because the error is constructed with
                // last_os_error(), which ensures the output will be Some().
                let errno = err.raw_os_error().unwrap();
                if errno == libc::EINVAL {
                    warn!("kernel not configured with CONFIG_KSM");
                } else {
                    warn!("madvise error: {}", err);
                }
                warn!("failed to mark pages as unmergeable");
            }
        }

        info!(
            "Removed userspace mapping: {:x} -> {:x} {:x}",
            guest_phys_addr, userspace_addr, memory_size
        );

        Ok(())
    }

    pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
        if let Some(memory_zone) = self.memory_zones.get_mut(id) {
            if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
                if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
                    virtio_mem_device
                        .lock()
                        .unwrap()
                        .resize(size)
                        .map_err(Error::VirtioMemResizeFail)?;
                }

                // Keep the hotplugged_size up to date.
                virtio_mem_zone.hotplugged_size = size;
            } else {
                error!("Failed resizing virtio-mem region: No virtio-mem handler");
                return Err(Error::MissingVirtioMemHandler);
            }

            return Ok(());
        }

        error!("Failed resizing virtio-mem region: Unknown memory zone");
        Err(Error::UnknownMemoryZone)
    }

    /// If this function results in adding a new memory region to the guest
    /// memory, the new region is returned to the caller. The virtio-mem use
    /// case never adds a new region as the whole hotpluggable memory has
    /// already been allocated at boot time.
    pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
        if self.user_provided_zones {
            error!(
                "Not allowed to resize guest memory when backed with user \
                defined memory zones."
1893             );
1894             return Err(Error::InvalidResizeWithMemoryZones);
1895         }
1896
1897         let mut region: Option<Arc<GuestRegionMmap>> = None;
1898         match self.hotplug_method {
1899             HotplugMethod::VirtioMem => {
1900                 if desired_ram >= self.boot_ram {
1901                     if !self.dynamic {
1902                         return Ok(region);
1903                     }
1904
1905                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1906                     self.current_ram = desired_ram;
1907                 }
1908             }
1909             HotplugMethod::Acpi => {
1910                 if desired_ram > self.current_ram {
1911                     if !self.dynamic {
1912                         return Ok(region);
1913                     }
1914
1915                     region =
1916                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1917                     self.current_ram = desired_ram;
1918                 }
1919             }
1920         }
1921         Ok(region)
1922     }
1923
1924     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1925         if !self.user_provided_zones {
1926             error!(
1927                 "Not allowed to resize guest memory zone when no zone is \
1928                 defined."
1929             );
1930             return Err(Error::ResizeZone);
1931         }
1932
1933         self.virtio_mem_resize(id, virtio_mem_size)
1934     }
1935
1936     #[cfg(target_arch = "x86_64")]
1937     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1938         let file = OpenOptions::new()
1939             .read(true)
1940             .open("/dev/sgx_provision")
1941             .map_err(Error::SgxProvisionOpen)?;
1942         self.vm
1943             .enable_sgx_attribute(file)
1944             .map_err(Error::SgxEnableProvisioning)?;
1945
1946         // Go over each EPC section and verify its size is a 4k multiple. At
1947         // the same time, calculate the total size needed for the contiguous
1948         // EPC region.
1949         let mut epc_region_size = 0;
1950         for epc_section in sgx_epc_config.iter() {
1951             if epc_section.size == 0 {
1952                 return Err(Error::EpcSectionSizeInvalid);
1953             }
1954             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1955                 return Err(Error::EpcSectionSizeInvalid);
1956             }
1957
1958             epc_region_size += epc_section.size;
1959         }
1960
1961         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1962         let epc_region_start = GuestAddress(
1963             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1964         );
1965
1966         self.start_of_device_area = epc_region_start
1967             .checked_add(epc_region_size)
1968             .ok_or(Error::GuestAddressOverFlow)?;
1969
1970         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1971         info!(
1972             "SGX EPC region: 0x{:x} (0x{:x})",
1973             epc_region_start.0, epc_region_size
1974         );
1975
1976         // Each section can be memory mapped into the allocated region.
1977         let mut epc_section_start = epc_region_start.raw_value();
1978         for epc_section in sgx_epc_config.iter() {
1979             let file = OpenOptions::new()
1980                 .read(true)
1981                 .write(true)
1982                 .open("/dev/sgx_vepc")
1983                 .map_err(Error::SgxVirtEpcOpen)?;
1984
1985             let prot = PROT_READ | PROT_WRITE;
1986             let mut flags = MAP_NORESERVE | MAP_SHARED;
1987             if epc_section.prefault {
1988                 flags |= MAP_POPULATE;
1989             }
1990
1991             // We can't use the vm-memory crate to perform the memory mapping
1992             // here as it would try to ensure the size of the backing file is
1993             // matching the size of the expected mapping. The /dev/sgx_vepc
1994             // device does not work that way, it provides a file descriptor
1995             // which is not matching the mapping size, as it's just a way to
1996             // let KVM know that an EPC section is being created for the guest.
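            //
            // Schematically (using the /dev/sgx_vepc fd opened above), the call
            // below amounts to:
            //
            //     mmap(NULL, epc_section.size, PROT_READ | PROT_WRITE,
            //          MAP_SHARED | MAP_NORESERVE [| MAP_POPULATE], vepc_fd, 0)
            //
            // with MAP_POPULATE only OR-ed in when the section requests prefaulting.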
1997 // SAFETY: FFI call with correct arguments 1998 let host_addr = unsafe { 1999 libc::mmap( 2000 std::ptr::null_mut(), 2001 epc_section.size as usize, 2002 prot, 2003 flags, 2004 file.as_raw_fd(), 2005 0, 2006 ) 2007 } as u64; 2008 2009 info!( 2010 "Adding SGX EPC section: 0x{:x} (0x{:x})", 2011 epc_section_start, epc_section.size 2012 ); 2013 2014 let _mem_slot = self.create_userspace_mapping( 2015 epc_section_start, 2016 epc_section.size, 2017 host_addr, 2018 false, 2019 false, 2020 false, 2021 )?; 2022 2023 sgx_epc_region.insert( 2024 epc_section.id.clone(), 2025 SgxEpcSection::new( 2026 GuestAddress(epc_section_start), 2027 epc_section.size as GuestUsize, 2028 ), 2029 ); 2030 2031 epc_section_start += epc_section.size; 2032 } 2033 2034 self.sgx_epc_region = Some(sgx_epc_region); 2035 2036 Ok(()) 2037 } 2038 2039 #[cfg(target_arch = "x86_64")] 2040 pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> { 2041 &self.sgx_epc_region 2042 } 2043 2044 pub fn is_hardlink(f: &File) -> bool { 2045 let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit(); 2046 // SAFETY: FFI call with correct arguments 2047 let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) }; 2048 if ret != 0 { 2049 error!("Couldn't fstat the backing file"); 2050 return false; 2051 } 2052 2053 // SAFETY: stat is valid 2054 unsafe { (*stat.as_ptr()).st_nlink as usize > 0 } 2055 } 2056 2057 pub fn memory_zones(&self) -> &MemoryZones { 2058 &self.memory_zones 2059 } 2060 2061 pub fn memory_zones_mut(&mut self) -> &mut MemoryZones { 2062 &mut self.memory_zones 2063 } 2064 2065 pub fn memory_range_table( 2066 &self, 2067 snapshot: bool, 2068 ) -> std::result::Result<MemoryRangeTable, MigratableError> { 2069 let mut table = MemoryRangeTable::default(); 2070 2071 for memory_zone in self.memory_zones.values() { 2072 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 2073 table.extend(virtio_mem_zone.plugged_ranges()); 2074 } 2075 2076 for region in memory_zone.regions() { 2077 if snapshot { 2078 if let Some(file_offset) = region.file_offset() { 2079 if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED) 2080 && Self::is_hardlink(file_offset.file()) 2081 { 2082 // In this very specific case, we know the memory 2083 // region is backed by a file on the host filesystem 2084 // that can be accessed by the user, and additionally 2085 // the mapping is shared, which means that modifications 2086 // to the content are written to the actual file. 2087 // When meeting these conditions, we can skip the 2088 // copy of the memory content for this specific region, 2089 // as we can assume the user will have it saved through 2090 // the backing file already. 
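                            // This is typically the case for a user-defined memory zone
                            // that is both file-backed and mapped shared (for instance a
                            // zone created with something along the lines of
                            // `--memory-zone id=fast,size=1G,file=/var/lib/ch/fast.img,shared=on`,
                            // exact CLI syntax aside): the backing file already holds the data.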
2091 continue; 2092 } 2093 } 2094 } 2095 2096 table.push(MemoryRange { 2097 gpa: region.start_addr().raw_value(), 2098 length: region.len(), 2099 }); 2100 } 2101 } 2102 2103 Ok(table) 2104 } 2105 2106 pub fn snapshot_data(&self) -> MemoryManagerSnapshotData { 2107 MemoryManagerSnapshotData { 2108 memory_ranges: self.snapshot_memory_ranges.clone(), 2109 guest_ram_mappings: self.guest_ram_mappings.clone(), 2110 start_of_device_area: self.start_of_device_area.0, 2111 boot_ram: self.boot_ram, 2112 current_ram: self.current_ram, 2113 arch_mem_regions: self.arch_mem_regions.clone(), 2114 hotplug_slots: self.hotplug_slots.clone(), 2115 next_memory_slot: self.next_memory_slot, 2116 selected_slot: self.selected_slot, 2117 next_hotplug_slot: self.next_hotplug_slot, 2118 } 2119 } 2120 2121 pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> { 2122 let mut memory_slot_fds = HashMap::new(); 2123 for guest_ram_mapping in &self.guest_ram_mappings { 2124 let slot = guest_ram_mapping.slot; 2125 let guest_memory = self.guest_memory.memory(); 2126 let file = guest_memory 2127 .find_region(GuestAddress(guest_ram_mapping.gpa)) 2128 .unwrap() 2129 .file_offset() 2130 .unwrap() 2131 .file(); 2132 memory_slot_fds.insert(slot, file.as_raw_fd()); 2133 } 2134 memory_slot_fds 2135 } 2136 2137 pub fn acpi_address(&self) -> Option<GuestAddress> { 2138 self.acpi_address 2139 } 2140 2141 pub fn num_guest_ram_mappings(&self) -> u32 { 2142 self.guest_ram_mappings.len() as u32 2143 } 2144 2145 #[cfg(target_arch = "aarch64")] 2146 pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 2147 self.uefi_flash.as_ref().unwrap().clone() 2148 } 2149 2150 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2151 pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions { 2152 let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone(); 2153 mapping_sorted_by_gpa.sort_by_key(|m| m.gpa); 2154 2155 let mut mem_offset_in_elf = mem_offset; 2156 let mut ram_maps = BTreeMap::new(); 2157 for mapping in mapping_sorted_by_gpa.iter() { 2158 ram_maps.insert( 2159 mapping.gpa, 2160 CoredumpMemoryRegion { 2161 mem_offset_in_elf, 2162 mem_size: mapping.size, 2163 }, 2164 ); 2165 mem_offset_in_elf += mapping.size; 2166 } 2167 2168 CoredumpMemoryRegions { ram_maps } 2169 } 2170 2171 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2172 pub fn coredump_iterate_save_mem( 2173 &mut self, 2174 dump_state: &DumpState, 2175 ) -> std::result::Result<(), GuestDebuggableError> { 2176 let snapshot_memory_ranges = self 2177 .memory_range_table(false) 2178 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2179 2180 if snapshot_memory_ranges.is_empty() { 2181 return Ok(()); 2182 } 2183 2184 let coredump_file = dump_state.file.as_ref().unwrap(); 2185 2186 let guest_memory = self.guest_memory.memory(); 2187 let mut total_bytes: u64 = 0; 2188 2189 for range in snapshot_memory_ranges.regions() { 2190 let mut offset: u64 = 0; 2191 loop { 2192 let bytes_written = guest_memory 2193 .write_volatile_to( 2194 GuestAddress(range.gpa + offset), 2195 &mut coredump_file.as_fd(), 2196 (range.length - offset) as usize, 2197 ) 2198 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2199 offset += bytes_written as u64; 2200 total_bytes += bytes_written as u64; 2201 2202 if offset == range.length { 2203 break; 2204 } 2205 } 2206 } 2207 2208 debug!("coredump total bytes {}", total_bytes); 2209 Ok(()) 2210 } 2211 2212 pub fn receive_memory_regions<F>( 2213 &mut self, 2214 ranges: &MemoryRangeTable, 2215 fd: &mut 
F,
2216     ) -> std::result::Result<(), MigratableError>
2217     where
2218         F: ReadVolatile,
2219     {
2220         let guest_memory = self.guest_memory();
2221         let mem = guest_memory.memory();
2222
2223         for range in ranges.regions() {
2224             let mut offset: u64 = 0;
2225             // Here we are manually handling the retry in case we can't read the
2226             // whole region at once because we can't use the implementation
2227             // from vm-memory::GuestMemory of read_exact_from() as it is not
2228             // following the correct behavior. For more info about this issue
2229             // see: https://github.com/rust-vmm/vm-memory/issues/174
2230             loop {
2231                 let bytes_read = mem
2232                     .read_volatile_from(
2233                         GuestAddress(range.gpa + offset),
2234                         fd,
2235                         (range.length - offset) as usize,
2236                     )
2237                     .map_err(|e| {
2238                         MigratableError::MigrateReceive(anyhow!(
2239                             "Error receiving memory from socket: {}",
2240                             e
2241                         ))
2242                     })?;
2243                 offset += bytes_read as u64;
2244
2245                 if offset == range.length {
2246                     break;
2247                 }
2248             }
2249         }
2250
2251         Ok(())
2252     }
2253 }
2254
2255 struct MemoryNotify {
2256     slot_id: usize,
2257 }
2258
2259 impl Aml for MemoryNotify {
2260     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2261         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2262         aml::If::new(
2263             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2264             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2265         )
2266         .to_aml_bytes(sink)
2267     }
2268 }
2269
2270 struct MemorySlot {
2271     slot_id: usize,
2272 }
2273
2274 impl Aml for MemorySlot {
2275     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2276         aml::Device::new(
2277             format!("M{:03}", self.slot_id).as_str().into(),
2278             vec![
2279                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
2280                 &aml::Name::new("_UID".into(), &self.slot_id),
2281                 /*
2282                 _STA return value:
2283                 Bit [0] – Set if the device is present.
2284                 Bit [1] – Set if the device is enabled and decoding its resources.
2285                 Bit [2] – Set if the device should be shown in the UI.
2286                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2287                 Bit [4] – Set if the battery is present.
2288                 Bits [31:5] – Reserved (must be cleared).
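                In this device model, the MSTA method below reports either 0x0 (slot
                not present) or 0xF (present | enabled | shown in UI | functioning);
                bit [4] is never set since a hotplugged memory slot has no battery.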
2289 */ 2290 &aml::Method::new( 2291 "_STA".into(), 2292 0, 2293 false, 2294 // Call into MSTA method which will interrogate device 2295 vec![&aml::Return::new(&aml::MethodCall::new( 2296 "MSTA".into(), 2297 vec![&self.slot_id], 2298 ))], 2299 ), 2300 // Get details of memory 2301 &aml::Method::new( 2302 "_CRS".into(), 2303 0, 2304 false, 2305 // Call into MCRS which provides actual memory details 2306 vec![&aml::Return::new(&aml::MethodCall::new( 2307 "MCRS".into(), 2308 vec![&self.slot_id], 2309 ))], 2310 ), 2311 ], 2312 ) 2313 .to_aml_bytes(sink) 2314 } 2315 } 2316 2317 struct MemorySlots { 2318 slots: usize, 2319 } 2320 2321 impl Aml for MemorySlots { 2322 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2323 for slot_id in 0..self.slots { 2324 MemorySlot { slot_id }.to_aml_bytes(sink); 2325 } 2326 } 2327 } 2328 2329 struct MemoryMethods { 2330 slots: usize, 2331 } 2332 2333 impl Aml for MemoryMethods { 2334 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2335 // Add "MTFY" notification method 2336 let mut memory_notifies = Vec::new(); 2337 for slot_id in 0..self.slots { 2338 memory_notifies.push(MemoryNotify { slot_id }); 2339 } 2340 2341 let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new(); 2342 for memory_notifier in memory_notifies.iter() { 2343 memory_notifies_refs.push(memory_notifier); 2344 } 2345 2346 aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink); 2347 2348 // MSCN method 2349 aml::Method::new( 2350 "MSCN".into(), 2351 0, 2352 true, 2353 vec![ 2354 // Take lock defined above 2355 &aml::Acquire::new("MLCK".into(), 0xffff), 2356 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2357 &aml::While::new( 2358 &aml::LessThan::new(&aml::Local(0), &self.slots), 2359 vec![ 2360 // Write slot number (in first argument) to I/O port via field 2361 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)), 2362 // Check if MINS bit is set (inserting) 2363 &aml::If::new( 2364 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE), 2365 // Notify device if it is 2366 vec![ 2367 &aml::MethodCall::new( 2368 "MTFY".into(), 2369 vec![&aml::Local(0), &aml::ONE], 2370 ), 2371 // Reset MINS bit 2372 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE), 2373 ], 2374 ), 2375 // Check if MRMV bit is set 2376 &aml::If::new( 2377 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE), 2378 // Notify device if it is (with the eject constant 0x3) 2379 vec![ 2380 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]), 2381 // Reset MRMV bit 2382 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE), 2383 ], 2384 ), 2385 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 2386 ], 2387 ), 2388 // Release lock 2389 &aml::Release::new("MLCK".into()), 2390 ], 2391 ) 2392 .to_aml_bytes(sink); 2393 2394 // Memory status method 2395 aml::Method::new( 2396 "MSTA".into(), 2397 1, 2398 true, 2399 vec![ 2400 // Take lock defined above 2401 &aml::Acquire::new("MLCK".into(), 0xffff), 2402 // Write slot number (in first argument) to I/O port via field 2403 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)), 2404 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2405 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning) 2406 &aml::If::new( 2407 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE), 2408 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 2409 ), 2410 // Release lock 2411 &aml::Release::new("MLCK".into()), 2412 // 
Return 0 or 0xf 2413 &aml::Return::new(&aml::Local(0)), 2414 ], 2415 ) 2416 .to_aml_bytes(sink); 2417 2418 // Memory range method 2419 aml::Method::new( 2420 "MCRS".into(), 2421 1, 2422 true, 2423 vec![ 2424 // Take lock defined above 2425 &aml::Acquire::new("MLCK".into(), 0xffff), 2426 // Write slot number (in first argument) to I/O port via field 2427 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)), 2428 &aml::Name::new( 2429 "MR64".into(), 2430 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2431 aml::AddressSpaceCacheable::Cacheable, 2432 true, 2433 0x0000_0000_0000_0000u64, 2434 0xFFFF_FFFF_FFFF_FFFEu64, 2435 None, 2436 )]), 2437 ), 2438 &aml::CreateQWordField::new( 2439 &aml::Path::new("MINL"), 2440 &aml::Path::new("MR64"), 2441 &14usize, 2442 ), 2443 &aml::CreateDWordField::new( 2444 &aml::Path::new("MINH"), 2445 &aml::Path::new("MR64"), 2446 &18usize, 2447 ), 2448 &aml::CreateQWordField::new( 2449 &aml::Path::new("MAXL"), 2450 &aml::Path::new("MR64"), 2451 &22usize, 2452 ), 2453 &aml::CreateDWordField::new( 2454 &aml::Path::new("MAXH"), 2455 &aml::Path::new("MR64"), 2456 &26usize, 2457 ), 2458 &aml::CreateQWordField::new( 2459 &aml::Path::new("LENL"), 2460 &aml::Path::new("MR64"), 2461 &38usize, 2462 ), 2463 &aml::CreateDWordField::new( 2464 &aml::Path::new("LENH"), 2465 &aml::Path::new("MR64"), 2466 &42usize, 2467 ), 2468 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")), 2469 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")), 2470 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")), 2471 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")), 2472 &aml::Add::new( 2473 &aml::Path::new("MAXL"), 2474 &aml::Path::new("MINL"), 2475 &aml::Path::new("LENL"), 2476 ), 2477 &aml::Add::new( 2478 &aml::Path::new("MAXH"), 2479 &aml::Path::new("MINH"), 2480 &aml::Path::new("LENH"), 2481 ), 2482 &aml::If::new( 2483 &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")), 2484 vec![&aml::Add::new( 2485 &aml::Path::new("MAXH"), 2486 &aml::ONE, 2487 &aml::Path::new("MAXH"), 2488 )], 2489 ), 2490 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE), 2491 // Release lock 2492 &aml::Release::new("MLCK".into()), 2493 &aml::Return::new(&aml::Path::new("MR64")), 2494 ], 2495 ) 2496 .to_aml_bytes(sink) 2497 } 2498 } 2499 2500 impl Aml for MemoryManager { 2501 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2502 if let Some(acpi_address) = self.acpi_address { 2503 // Memory Hotplug Controller 2504 aml::Device::new( 2505 "_SB_.MHPC".into(), 2506 vec![ 2507 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2508 &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"), 2509 // Mutex to protect concurrent access as we write to choose slot and then read back status 2510 &aml::Mutex::new("MLCK".into(), 0), 2511 &aml::Name::new( 2512 "_CRS".into(), 2513 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2514 aml::AddressSpaceCacheable::NotCacheable, 2515 true, 2516 acpi_address.0, 2517 acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1, 2518 None, 2519 )]), 2520 ), 2521 // OpRegion and Fields map MMIO range into individual field values 2522 &aml::OpRegion::new( 2523 "MHPR".into(), 2524 aml::OpRegionSpace::SystemMemory, 2525 &(acpi_address.0 as usize), 2526 &MEMORY_MANAGER_ACPI_SIZE, 2527 ), 2528 &aml::Field::new( 2529 "MHPR".into(), 2530 aml::FieldAccessType::DWord, 2531 
aml::FieldLockRule::NoLock, 2532 aml::FieldUpdateRule::Preserve, 2533 vec![ 2534 aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes) 2535 aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes) 2536 aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes) 2537 aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes) 2538 ], 2539 ), 2540 &aml::Field::new( 2541 "MHPR".into(), 2542 aml::FieldAccessType::DWord, 2543 aml::FieldLockRule::NoLock, 2544 aml::FieldUpdateRule::Preserve, 2545 vec![ 2546 aml::FieldEntry::Reserved(128), 2547 aml::FieldEntry::Named(*b"MHPX", 32), // PXM 2548 ], 2549 ), 2550 &aml::Field::new( 2551 "MHPR".into(), 2552 aml::FieldAccessType::Byte, 2553 aml::FieldLockRule::NoLock, 2554 aml::FieldUpdateRule::WriteAsZeroes, 2555 vec![ 2556 aml::FieldEntry::Reserved(160), 2557 aml::FieldEntry::Named(*b"MEN_", 1), // Enabled 2558 aml::FieldEntry::Named(*b"MINS", 1), // Inserting 2559 aml::FieldEntry::Named(*b"MRMV", 1), // Removing 2560 aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting 2561 ], 2562 ), 2563 &aml::Field::new( 2564 "MHPR".into(), 2565 aml::FieldAccessType::DWord, 2566 aml::FieldLockRule::NoLock, 2567 aml::FieldUpdateRule::Preserve, 2568 vec![ 2569 aml::FieldEntry::Named(*b"MSEL", 32), // Selector 2570 aml::FieldEntry::Named(*b"MOEV", 32), // Event 2571 aml::FieldEntry::Named(*b"MOSC", 32), // OSC 2572 ], 2573 ), 2574 &MemoryMethods { 2575 slots: self.hotplug_slots.len(), 2576 }, 2577 &MemorySlots { 2578 slots: self.hotplug_slots.len(), 2579 }, 2580 ], 2581 ) 2582 .to_aml_bytes(sink); 2583 } else { 2584 aml::Device::new( 2585 "_SB_.MHPC".into(), 2586 vec![ 2587 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2588 &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"), 2589 // Empty MSCN for GED 2590 &aml::Method::new("MSCN".into(), 0, true, vec![]), 2591 ], 2592 ) 2593 .to_aml_bytes(sink); 2594 } 2595 2596 #[cfg(target_arch = "x86_64")] 2597 { 2598 if let Some(sgx_epc_region) = &self.sgx_epc_region { 2599 let min = sgx_epc_region.start().raw_value(); 2600 let max = min + sgx_epc_region.size() - 1; 2601 // SGX EPC region 2602 aml::Device::new( 2603 "_SB_.EPC_".into(), 2604 vec![ 2605 &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")), 2606 // QWORD describing the EPC region start and size 2607 &aml::Name::new( 2608 "_CRS".into(), 2609 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2610 aml::AddressSpaceCacheable::NotCacheable, 2611 true, 2612 min, 2613 max, 2614 None, 2615 )]), 2616 ), 2617 &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]), 2618 ], 2619 ) 2620 .to_aml_bytes(sink); 2621 } 2622 } 2623 } 2624 } 2625 2626 impl Pausable for MemoryManager {} 2627 2628 #[derive(Clone, Serialize, Deserialize, Versionize)] 2629 pub struct MemoryManagerSnapshotData { 2630 memory_ranges: MemoryRangeTable, 2631 guest_ram_mappings: Vec<GuestRamMapping>, 2632 start_of_device_area: u64, 2633 boot_ram: u64, 2634 current_ram: u64, 2635 arch_mem_regions: Vec<ArchMemRegion>, 2636 hotplug_slots: Vec<HotPlugState>, 2637 next_memory_slot: u32, 2638 selected_slot: usize, 2639 next_hotplug_slot: usize, 2640 } 2641 2642 impl VersionMapped for MemoryManagerSnapshotData {} 2643 2644 impl Snapshottable for MemoryManager { 2645 fn id(&self) -> String { 2646 MEMORY_MANAGER_SNAPSHOT_ID.to_string() 2647 } 2648 2649 fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> { 2650 let memory_ranges = self.memory_range_table(true)?; 2651 2652 // Store locally this list of ranges as it will be 
used through the 2653 // Transportable::send() implementation. The point is to avoid the 2654 // duplication of code regarding the creation of the path for each 2655 // region. The 'snapshot' step creates the list of memory regions, 2656 // including information about the need to copy a memory region or 2657 // not. This saves the 'send' step having to go through the same 2658 // process, and instead it can directly proceed with storing the 2659 // memory range content for the ranges requiring it. 2660 self.snapshot_memory_ranges = memory_ranges; 2661 2662 Ok(Snapshot::from_data(SnapshotData::new_from_versioned_state( 2663 &self.snapshot_data(), 2664 )?)) 2665 } 2666 } 2667 2668 impl Transportable for MemoryManager { 2669 fn send( 2670 &self, 2671 _snapshot: &Snapshot, 2672 destination_url: &str, 2673 ) -> result::Result<(), MigratableError> { 2674 if self.snapshot_memory_ranges.is_empty() { 2675 return Ok(()); 2676 } 2677 2678 let mut memory_file_path = url_to_path(destination_url)?; 2679 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 2680 2681 // Create the snapshot file for the entire memory 2682 let mut memory_file = OpenOptions::new() 2683 .read(true) 2684 .write(true) 2685 .create_new(true) 2686 .open(memory_file_path) 2687 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2688 2689 let guest_memory = self.guest_memory.memory(); 2690 2691 for range in self.snapshot_memory_ranges.regions() { 2692 let mut offset: u64 = 0; 2693 // Here we are manually handling the retry in case we can't read 2694 // the whole region at once because we can't use the implementation 2695 // from vm-memory::GuestMemory of write_all_to() as it is not 2696 // following the correct behavior. For more info about this issue 2697 // see: https://github.com/rust-vmm/vm-memory/issues/174 2698 loop { 2699 let bytes_written = guest_memory 2700 .write_volatile_to( 2701 GuestAddress(range.gpa + offset), 2702 &mut memory_file, 2703 (range.length - offset) as usize, 2704 ) 2705 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2706 offset += bytes_written as u64; 2707 2708 if offset == range.length { 2709 break; 2710 } 2711 } 2712 } 2713 Ok(()) 2714 } 2715 } 2716 2717 impl Migratable for MemoryManager { 2718 // Start the dirty log in the hypervisor (kvm/mshv). 2719 // Also, reset the dirty bitmap logged by the vmm. 2720 // Just before we do a bulk copy we want to start/clear the dirty log so that 2721 // pages touched during our bulk copy are tracked. 2722 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2723 self.vm.start_dirty_log().map_err(|e| { 2724 MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e)) 2725 })?; 2726 2727 for r in self.guest_memory.memory().iter() { 2728 r.bitmap().reset(); 2729 } 2730 2731 Ok(()) 2732 } 2733 2734 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2735 self.vm.stop_dirty_log().map_err(|e| { 2736 MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e)) 2737 })?; 2738 2739 Ok(()) 2740 } 2741 2742 // Generate a table for the pages that are dirty. The dirty pages are collapsed 2743 // together in the table if they are contiguous. 
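    // For example, with the 4 KiB block size used below, a combined bitmap with
    // bits 0..=2 set for a mapping starting at GPA 0x1_0000_0000 is reported as a
    // single MemoryRange { gpa: 0x1_0000_0000, length: 0x3000 } rather than as
    // three separate 4 KiB ranges.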
2744 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2745 let mut table = MemoryRangeTable::default(); 2746 for r in &self.guest_ram_mappings { 2747 let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| { 2748 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e)) 2749 })?; 2750 let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa)) 2751 { 2752 Some(region) => { 2753 assert!(region.start_addr().raw_value() == r.gpa); 2754 assert!(region.len() == r.size); 2755 region.bitmap().get_and_reset() 2756 } 2757 None => { 2758 return Err(MigratableError::MigrateSend(anyhow!( 2759 "Error finding 'guest memory region' with address {:x}", 2760 r.gpa 2761 ))) 2762 } 2763 }; 2764 2765 let dirty_bitmap: Vec<u64> = vm_dirty_bitmap 2766 .iter() 2767 .zip(vmm_dirty_bitmap.iter()) 2768 .map(|(x, y)| x | y) 2769 .collect(); 2770 2771 let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096); 2772 2773 if sub_table.regions().is_empty() { 2774 info!("Dirty Memory Range Table is empty"); 2775 } else { 2776 info!("Dirty Memory Range Table:"); 2777 for range in sub_table.regions() { 2778 info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024); 2779 } 2780 } 2781 2782 table.extend(sub_table); 2783 } 2784 Ok(table) 2785 } 2786 } 2787
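
// For reference (not part of the original implementation): a migration sender
// is expected to drive the Migratable methods above roughly like this, where
// `memory_manager` is the usual Arc<Mutex<MemoryManager>> handle and
// `send_ranges()` stands in for whatever actually transfers the data. The real
// convergence policy lives in the migration code and may differ.
//
//     let mut mm = memory_manager.lock().unwrap();
//     mm.start_dirty_log()?;                        // start tracking guest writes
//     send_ranges(&mm.memory_range_table(false)?)?; // bulk copy of guest RAM
//     loop {
//         let dirty = mm.dirty_log()?;              // pages touched since last pass
//         if dirty.is_empty() {
//             break;                                // converged, VM can be paused
//         }
//         send_ranges(&dirty)?;
//     }
//     mm.stop_dirty_log()?;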