// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::collections::BTreeMap;
use std::collections::HashMap;
use std::fs::{File, OpenOptions};
use std::io::{self};
use std::ops::{BitAnd, Deref, Not, Sub};
#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use std::os::fd::AsFd;
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
use std::path::PathBuf;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, Barrier, Mutex};
use std::{ffi, result, thread};

use acpi_tables::{aml, Aml};
use anyhow::anyhow;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
use arch::RegionType;
#[cfg(target_arch = "x86_64")]
use devices::ioapic;
#[cfg(target_arch = "aarch64")]
use hypervisor::HypervisorVmError;
use libc::_SC_NPROCESSORS_ONLN;
#[cfg(target_arch = "x86_64")]
use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tracer::trace_scoped;
use virtio_devices::BlocksState;
#[cfg(target_arch = "x86_64")]
use vm_allocator::GsiApic;
use vm_allocator::{AddressAllocator, MemorySlotAllocator, SystemAllocator};
use vm_device::BusDevice;
use vm_memory::bitmap::AtomicBitmap;
use vm_memory::guest_memory::FileOffset;
use vm_memory::mmap::MmapRegionError;
use vm_memory::{
    Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
    GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile,
};
use vm_migration::protocol::{MemoryRange, MemoryRangeTable};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, Transportable,
};

#[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
use crate::coredump::{
    CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
};
use crate::migration::url_to_path;
#[cfg(target_arch = "x86_64")]
use crate::vm_config::SgxEpcConfig;
use crate::vm_config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID};

pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;

const DEFAULT_MEMORY_ZONE: &str = "mem0";

const SNAPSHOT_FILENAME: &str = "memory-ranges";

#[cfg(target_arch = "x86_64")]
const X86_64_IRQ_BASE: u32 = 5;

#[cfg(target_arch = "x86_64")]
const SGX_PAGE_SIZE: u64 = 1 << 12;

const HOTPLUG_COUNT: usize = 8;

// Memory policy constants
const MPOL_BIND: u32 = 2;
const MPOL_MF_STRICT: u32 = 1;
const MPOL_MF_MOVE: u32 = 1 << 1;
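// Illustrative sanity checks (not part of the original module): a minimal
// sketch exercising the small address-arithmetic helpers defined further
// down in this file. Values are chosen purely for demonstration.
#[cfg(test)]
mod address_helper_examples {
    use super::{align_down, is_aligned, mmio_address_space_size};

    #[test]
    fn alignment_helpers() {
        // 0x1234 rounded down to a 4 KiB boundary is 0x1000.
        assert_eq!(align_down(0x1234u64, 0x1000u64), 0x1000);
        assert!(is_aligned(0x2000u64, 0x1000u64));
        assert!(!is_aligned(0x2001u64, 0x1000u64));
    }

    #[test]
    fn mmio_space_is_64k_aligned() {
        // With 40 physical address bits the MMIO space is 1 TiB minus 64 KiB,
        // which keeps it 64 KiB aligned as the comment on the helper explains.
        let size = mmio_address_space_size(40);
        assert_eq!(size, (1u64 << 40) - (1 << 16));
        assert_eq!(size & 0xffff, 0);
    }
}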
// Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;

const MAX_PREFAULT_THREAD_COUNT: usize = 16;

#[derive(Clone, Default, Serialize, Deserialize)]
struct HotPlugState {
    base: u64,
    length: u64,
    active: bool,
    inserting: bool,
    removing: bool,
}

pub struct VirtioMemZone {
    region: Arc<GuestRegionMmap>,
    virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
    hotplugged_size: u64,
    hugepages: bool,
    blocks_state: Arc<Mutex<BlocksState>>,
}

impl VirtioMemZone {
    pub fn region(&self) -> &Arc<GuestRegionMmap> {
        &self.region
    }
    pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
        self.virtio_device = Some(virtio_device);
    }
    pub fn hotplugged_size(&self) -> u64 {
        self.hotplugged_size
    }
    pub fn hugepages(&self) -> bool {
        self.hugepages
    }
    pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
        &self.blocks_state
    }
    pub fn plugged_ranges(&self) -> MemoryRangeTable {
        self.blocks_state
            .lock()
            .unwrap()
            .memory_ranges(self.region.start_addr().raw_value(), true)
    }
}

#[derive(Default)]
pub struct MemoryZone {
    regions: Vec<Arc<GuestRegionMmap>>,
    virtio_mem_zone: Option<VirtioMemZone>,
}

impl MemoryZone {
    pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
        &self.regions
    }
    pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
        &self.virtio_mem_zone
    }
    pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
        self.virtio_mem_zone.as_mut()
    }
}

pub type MemoryZones = HashMap<String, MemoryZone>;

#[derive(Clone, Serialize, Deserialize)]
struct GuestRamMapping {
    slot: u32,
    gpa: u64,
    size: u64,
    zone_id: String,
    virtio_mem: bool,
    file_offset: u64,
}

#[derive(Clone, Serialize, Deserialize)]
struct ArchMemRegion {
    base: u64,
    size: usize,
    r_type: RegionType,
}

pub struct MemoryManager {
    boot_guest_memory: GuestMemoryMmap,
    guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
    next_memory_slot: Arc<AtomicU32>,
    memory_slot_free_list: Arc<Mutex<Vec<u32>>>,
    start_of_device_area: GuestAddress,
    end_of_device_area: GuestAddress,
    end_of_ram_area: GuestAddress,
    pub vm: Arc<dyn hypervisor::Vm>,
    hotplug_slots: Vec<HotPlugState>,
    selected_slot: usize,
    mergeable: bool,
    allocator: Arc<Mutex<SystemAllocator>>,
    hotplug_method: HotplugMethod,
    boot_ram: u64,
    current_ram: u64,
    next_hotplug_slot: usize,
    shared: bool,
    hugepages: bool,
    hugepage_size: Option<u64>,
    prefault: bool,
    thp: bool,
    #[cfg(target_arch = "x86_64")]
    sgx_epc_region: Option<SgxEpcRegion>,
    user_provided_zones: bool,
    snapshot_memory_ranges: MemoryRangeTable,
    memory_zones: MemoryZones,
    log_dirty: bool, // Enable dirty logging for created RAM regions
    arch_mem_regions: Vec<ArchMemRegion>,
    ram_allocator: AddressAllocator,
    dynamic: bool,

    // Keep track of calls to create_userspace_mapping() for guest RAM.
    // This is useful for getting the dirty pages, as we need to know the
    // slots that the mappings were created in.
    guest_ram_mappings: Vec<GuestRamMapping>,

    pub acpi_address: Option<GuestAddress>,
    #[cfg(target_arch = "aarch64")]
    uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
}

#[derive(Error, Debug)]
pub enum Error {
    /// Failed to create shared file.
    #[error("Failed to create shared file")]
    SharedFileCreate(#[source] io::Error),

    /// Failed to set shared file length.
    #[error("Failed to set shared file length")]
    SharedFileSetLen(#[source] io::Error),

    /// Mmap backed guest memory error
    #[error("Mmap backed guest memory error")]
    GuestMemory(#[source] MmapError),

    /// Failed to allocate a memory range.
    #[error("Failed to allocate a memory range")]
    MemoryRangeAllocation,

    /// Error from region creation
    #[error("Error from region creation")]
    GuestMemoryRegion(#[source] MmapRegionError),

    /// No ACPI slot available
    #[error("No ACPI slot available")]
    NoSlotAvailable,

    /// Not enough space in the hotplug RAM region
    #[error("Not enough space in the hotplug RAM region")]
    InsufficientHotplugRam,

    /// The requested hotplug memory addition is not a valid size
    #[error("The requested hotplug memory addition is not a valid size")]
    InvalidSize,

    /// Failed to create the user memory region.
    #[error("Failed to create the user memory region")]
    CreateUserMemoryRegion(#[source] hypervisor::HypervisorVmError),

    /// Failed to remove the user memory region.
    #[error("Failed to remove the user memory region")]
    RemoveUserMemoryRegion(#[source] hypervisor::HypervisorVmError),

    /// Failed to create EventFd.
    #[error("Failed to create EventFd")]
    EventFdFail(#[source] io::Error),

    /// Eventfd write error
    #[error("Eventfd write error")]
    EventfdError(#[source] io::Error),

    /// Failed to resize virtio-mem
    #[error("Failed to resize virtio-mem")]
    VirtioMemResizeFail(#[source] virtio_devices::mem::Error),

    /// Cannot restore VM
    #[error("Cannot restore VM")]
    Restore(#[source] MigratableError),

    /// Cannot restore VM because source URL is missing
    #[error("Cannot restore VM because source URL is missing")]
    RestoreMissingSourceUrl,

    /// Cannot create the system allocator
    #[error("Cannot create the system allocator")]
    CreateSystemAllocator,

    /// Invalid SGX EPC section size
    #[cfg(target_arch = "x86_64")]
    #[error("Invalid SGX EPC section size")]
    EpcSectionSizeInvalid,

    /// Failed allocating SGX EPC region
    #[cfg(target_arch = "x86_64")]
    #[error("Failed allocating SGX EPC region")]
    SgxEpcRangeAllocation,

    /// Failed opening SGX virtual EPC device
    #[cfg(target_arch = "x86_64")]
    #[error("Failed opening SGX virtual EPC device")]
    SgxVirtEpcOpen(#[source] io::Error),

    /// Failed setting the SGX virtual EPC section size
    #[cfg(target_arch = "x86_64")]
    #[error("Failed setting the SGX virtual EPC section size")]
    SgxVirtEpcFileSetLen(#[source] io::Error),

    /// Failed opening SGX provisioning device
    #[cfg(target_arch = "x86_64")]
    #[error("Failed opening SGX provisioning device")]
    SgxProvisionOpen(#[source] io::Error),

    /// Failed enabling SGX provisioning
    #[cfg(target_arch = "x86_64")]
    #[error("Failed enabling SGX provisioning")]
    SgxEnableProvisioning(#[source] hypervisor::HypervisorVmError),

    /// Failed creating a new MmapRegion instance.
    #[cfg(target_arch = "x86_64")]
    #[error("Failed creating a new MmapRegion instance")]
    NewMmapRegion(#[source] vm_memory::mmap::MmapRegionError),

    /// No memory zones found.
    #[error("No memory zones found")]
    MissingMemoryZones,

    /// Memory configuration is not valid.
    #[error("Memory configuration is not valid")]
    InvalidMemoryParameters,

    /// Forbidden operation. Impossible to resize guest memory if it is
    /// backed by user defined memory regions.
    #[error("Impossible to resize guest memory if it is backed by user defined memory regions")]
    InvalidResizeWithMemoryZones,

    /// It's invalid to try applying a NUMA policy to a memory zone that is
    /// memory mapped with MAP_SHARED.
    #[error("Invalid to try applying a NUMA policy to a memory zone that is memory mapped with MAP_SHARED")]
    InvalidSharedMemoryZoneWithHostNuma,

    /// Failed applying NUMA memory policy.
    #[error("Failed applying NUMA memory policy")]
    ApplyNumaPolicy(#[source] io::Error),

    /// Memory zone identifier is not unique.
    #[error("Memory zone identifier is not unique")]
    DuplicateZoneId,

    /// No virtio-mem resizing handler found.
    #[error("No virtio-mem resizing handler found")]
    MissingVirtioMemHandler,

    /// Unknown memory zone.
    #[error("Unknown memory zone")]
    UnknownMemoryZone,

    /// Invalid size for resizing. Can be anything except 0.
    #[error("Invalid size for resizing. Can be anything except 0")]
    InvalidHotplugSize,

    /// Invalid hotplug method associated with memory zones resizing capability.
    #[error("Invalid hotplug method associated with memory zones resizing capability")]
    InvalidHotplugMethodWithMemoryZones,

    /// Could not find specified memory zone identifier from hash map.
    #[error("Could not find specified memory zone identifier from hash map")]
    MissingZoneIdentifier,

    /// Resizing the memory zone failed.
    #[error("Resizing the memory zone failed")]
    ResizeZone,

    /// Guest address overflow
    #[error("Guest address overflow")]
    GuestAddressOverFlow,

    /// Error opening snapshot file
    #[error("Error opening snapshot file")]
    SnapshotOpen(#[source] io::Error),

    /// Error copying snapshot into region
    #[error("Error copying snapshot into region")]
    SnapshotCopy(#[source] GuestMemoryError),

    /// Failed to allocate MMIO address
    #[error("Failed to allocate MMIO address")]
    AllocateMmioAddress,

    #[cfg(target_arch = "aarch64")]
    /// Failed to create UEFI flash
    #[error("Failed to create UEFI flash")]
    CreateUefiFlash(#[source] HypervisorVmError),

    /// Using a directory as a backing file for memory is not supported
    #[error("Using a directory as a backing file for memory is not supported")]
    DirectoryAsBackingFileForMemory,

    /// Failed to stat filesystem
    #[error("Failed to stat filesystem")]
    GetFileSystemBlockSize(#[source] io::Error),

    /// Memory size is misaligned with default page size or its hugepage size
    #[error("Memory size is misaligned with default page size or its hugepage size")]
    MisalignedMemorySize,
}

const ENABLE_FLAG: usize = 0;
const INSERTING_FLAG: usize = 1;
const REMOVING_FLAG: usize = 2;
const EJECT_FLAG: usize = 3;

const BASE_OFFSET_LOW: u64 = 0;
const BASE_OFFSET_HIGH: u64 = 0x4;
const LENGTH_OFFSET_LOW: u64 = 0x8;
const LENGTH_OFFSET_HIGH: u64 = 0xC;
const STATUS_OFFSET: u64 = 0x14;
const SELECTION_OFFSET: u64 = 0;

// The MMIO address space size is reduced by 64 KiB. This is done for the
// following reasons:
// - Reduce the addressable space size by at least 4 KiB to work around a
//   Linux bug when the VMM allocates devices at the end of the addressable
//   space
// - Windows requires the addressable space size to be 64 KiB aligned
fn mmio_address_space_size(phys_bits: u8) -> u64 {
    (1 << phys_bits) - (1 << 16)
}

// The `statfs` function can get information about a hugetlbfs mount, where
// the hugepage size is reported in the `f_bsize` field.
//
// See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
    let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
    let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();

    // SAFETY: FFI call with a valid path and buffer
    let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
    if ret != 0 {
        return Err(Error::GetFileSystemBlockSize(
            std::io::Error::last_os_error(),
        ));
    }

    // SAFETY: `buf` is valid at this point
    // Because this value is always positive, just convert it directly.
    // Note that `f_bsize` is `i64` in glibc and `u64` in musl, so using `as u64`
    // would trigger a `clippy` warning on musl targets. To avoid the warning,
    // use `as _` instead of `as u64`.
    let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
    Ok(bsize)
}

fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
    // SAFETY: FFI call. Trivially safe.
    let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };

    // There is no backing file and `hugepages` is disabled, so just use the
    // system page size.
    if zone.file.is_none() && !zone.hugepages {
        return Ok(page_size);
    }

    // `hugepages` is enabled and `hugepage_size` is specified, so just use it
    // directly.
    if zone.hugepages && zone.hugepage_size.is_some() {
        return Ok(zone.hugepage_size.unwrap());
    }

    // There are two scenarios here:
    // - `hugepages` is enabled but `hugepage_size` is not specified:
    //   Call `statfs` on `/dev/hugepages` to get the default hugepage size
    // - The backing file is specified:
    //   Call `statfs` on the file and get its `f_bsize`. If the value is larger than the page
    //   size of a normal page, just use the `f_bsize` because the file is in a hugetlbfs. If
    //   the value is less than or equal to the page size, just use the page size.
    let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| {
        pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
    })?;

    let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);

    Ok(align_size)
}

#[inline]
fn align_down<T>(val: T, align: T) -> T
where
    T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>,
{
    val & !(align - 1u8.into())
}

#[inline]
fn is_aligned<T>(val: T, align: T) -> bool
where
    T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq,
{
    (val & (align - 1u8.into())) == 0u8.into()
}

impl BusDevice for MemoryManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        if self.selected_slot < self.hotplug_slots.len() {
            let state = &self.hotplug_slots[self.selected_slot];
            match offset {
                BASE_OFFSET_LOW => {
                    data.copy_from_slice(&state.base.to_le_bytes()[..4]);
                }
                BASE_OFFSET_HIGH => {
                    data.copy_from_slice(&state.base.to_le_bytes()[4..]);
                }
                LENGTH_OFFSET_LOW => {
                    data.copy_from_slice(&state.length.to_le_bytes()[..4]);
                }
                LENGTH_OFFSET_HIGH => {
                    data.copy_from_slice(&state.length.to_le_bytes()[4..]);
                }
                STATUS_OFFSET => {
                    // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
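                    // For example, a slot that is active with a pending insertion reads back
                    // as 0b0000_0011, i.e. (1 << ENABLE_FLAG) | (1 << INSERTING_FLAG).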
                    data.fill(0);
                    if state.active {
                        data[0] |= 1 << ENABLE_FLAG;
                    }
                    if state.inserting {
                        data[0] |= 1 << INSERTING_FLAG;
                    }
                    if state.removing {
                        data[0] |= 1 << REMOVING_FLAG;
                    }
                }
                _ => {
                    warn!(
                        "Unexpected offset for accessing memory manager device: {:#}",
                        offset
                    );
                }
            }
        } else {
            warn!("Out of range memory slot: {}", self.selected_slot);
        }
    }

    fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
        match offset {
            SELECTION_OFFSET => {
                self.selected_slot = usize::from(data[0]);
            }
            STATUS_OFFSET => {
                if self.selected_slot < self.hotplug_slots.len() {
                    let state = &mut self.hotplug_slots[self.selected_slot];
                    // The ACPI code writes back a 1 to acknowledge the insertion
                    if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
                        state.inserting = false;
                    }
                    // Ditto for removal
                    if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
                        state.removing = false;
                    }
                    // Trigger removal of "DIMM"
                    if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
                        warn!("Ejection of memory not currently supported");
                    }
                } else {
                    warn!("Out of range memory slot: {}", self.selected_slot);
                }
            }
            _ => {
                warn!(
                    "Unexpected offset for accessing memory manager device: {:#}",
                    offset
                );
            }
        };
        None
    }
}

impl MemoryManager {
    /// Creates all memory regions based on the available RAM ranges defined
    /// by `ram_regions`, and based on the description of the memory zones.
    /// In practice, this function can perform multiple memory mappings of the
    /// same backing file if there's a hole in the address space between two
    /// RAM ranges.
    ///
    /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G)
    /// and zones containing two zones (size 1G and size 4G).
    ///
    /// This function will create 3 resulting memory regions:
    /// - First one mapping entirely the first memory zone on 0-1G range
    /// - Second one mapping partially the second memory zone on 1G-3G range
    /// - Third one mapping partially the second memory zone on 4G-6G range
    ///
    /// Also, all memory regions are page-size aligned (e.g. their sizes must
    /// be multiples of the page size), which may leave an additional hole in
    /// the address space when hugepages are used.
    fn create_memory_regions_from_zones(
        ram_regions: &[(GuestAddress, usize)],
        zones: &[MemoryZoneConfig],
        prefault: Option<bool>,
        thp: bool,
    ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
        let mut zone_iter = zones.iter();
        let mut mem_regions = Vec::new();
        let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?;
        let mut zone_align_size = memory_zone_get_align_size(zone)?;
        let mut zone_offset = 0u64;
        let mut memory_zones = HashMap::new();

        if !is_aligned(zone.size, zone_align_size) {
            return Err(Error::MisalignedMemorySize);
        }
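        // For example (illustrative values only): a 1 GiB zone backed by
        // 2 MiB hugepages passes this check, while a 1025 MiB zone would be
        // rejected as `MisalignedMemorySize` since 1025 MiB is not a multiple
        // of 2 MiB.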
        // Add the zone id to the list of memory zones.
        memory_zones.insert(zone.id.clone(), MemoryZone::default());

        for ram_region in ram_regions.iter() {
            let mut ram_region_offset = 0;
            let mut exit = false;

            loop {
                let mut ram_region_consumed = false;
                let mut pull_next_zone = false;

                let ram_region_available_size =
                    align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size);
                if ram_region_available_size == 0 {
                    break;
                }
                let zone_sub_size = zone.size - zone_offset;

                let file_offset = zone_offset;
                let region_start = ram_region
                    .0
                    .checked_add(ram_region_offset)
                    .ok_or(Error::GuestAddressOverFlow)?;
                let region_size = if zone_sub_size <= ram_region_available_size {
                    if zone_sub_size == ram_region_available_size {
                        ram_region_consumed = true;
                    }

                    ram_region_offset += zone_sub_size;
                    pull_next_zone = true;

                    zone_sub_size
                } else {
                    zone_offset += ram_region_available_size;
                    ram_region_consumed = true;

                    ram_region_available_size
                };

                info!(
                    "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}",
                    zone.id,
                    region_start.raw_value(),
                    region_size
                );
                let region = MemoryManager::create_ram_region(
                    &zone.file,
                    file_offset,
                    region_start,
                    region_size as usize,
                    prefault.unwrap_or(zone.prefault),
                    zone.shared,
                    zone.hugepages,
                    zone.hugepage_size,
                    zone.host_numa_node,
                    None,
                    thp,
                )?;

                // Add the region to the list of regions associated with the
                // current memory zone.
                if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
                    memory_zone.regions.push(region.clone());
                }

                mem_regions.push(region);

                if pull_next_zone {
                    // Get the next zone and reset the offset.
                    zone_offset = 0;
                    if let Some(z) = zone_iter.next() {
                        zone = z;
                    } else {
                        exit = true;
                        break;
                    }
                    zone_align_size = memory_zone_get_align_size(zone)?;
                    if !is_aligned(zone.size, zone_align_size) {
                        return Err(Error::MisalignedMemorySize);
                    }

                    // Check if the zone id already exists. In case it does,
                    // throw an error as we need unique identifiers. Otherwise,
                    // add the new zone id to the list of memory zones.
                    if memory_zones.contains_key(&zone.id) {
                        error!(
                            "Memory zone identifier '{}' found more than once. \
                            It must be unique",
                            zone.id,
                        );
                        return Err(Error::DuplicateZoneId);
                    }
                    memory_zones.insert(zone.id.clone(), MemoryZone::default());
                }

                if ram_region_consumed {
                    break;
                }
            }

            if exit {
                break;
            }
        }

        Ok((mem_regions, memory_zones))
    }
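    // Worked example for create_memory_regions_from_zones() (illustrative
    // numbers, not taken from a real configuration): with ram_regions of
    // (0, 3 GiB) and (4 GiB, 2 GiB) and zones of 1 GiB and 4 GiB, the 4 GiB
    // zone is split across the hole. Its first piece is mapped at 1 GiB with
    // file_offset 0 and size 2 GiB, and its second piece at 4 GiB with
    // file_offset 2 GiB and size 2 GiB, so a single backing file ends up
    // mapped twice.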
    // Restore both GuestMemory regions along with MemoryZone zones.
    fn restore_memory_regions_and_zones(
        guest_ram_mappings: &[GuestRamMapping],
        zones_config: &[MemoryZoneConfig],
        prefault: Option<bool>,
        mut existing_memory_files: HashMap<u32, File>,
        thp: bool,
    ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
        let mut memory_regions = Vec::new();
        let mut memory_zones = HashMap::new();

        for zone_config in zones_config {
            memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
        }

        for guest_ram_mapping in guest_ram_mappings {
            for zone_config in zones_config {
                if guest_ram_mapping.zone_id == zone_config.id {
                    let region = MemoryManager::create_ram_region(
                        if guest_ram_mapping.virtio_mem {
                            &None
                        } else {
                            &zone_config.file
                        },
                        guest_ram_mapping.file_offset,
                        GuestAddress(guest_ram_mapping.gpa),
                        guest_ram_mapping.size as usize,
                        prefault.unwrap_or(zone_config.prefault),
                        zone_config.shared,
                        zone_config.hugepages,
                        zone_config.hugepage_size,
                        zone_config.host_numa_node,
                        existing_memory_files.remove(&guest_ram_mapping.slot),
                        thp,
                    )?;
                    memory_regions.push(Arc::clone(&region));
                    if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
                        if guest_ram_mapping.virtio_mem {
                            let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
                            let region_size = region.len();
                            memory_zone.virtio_mem_zone = Some(VirtioMemZone {
                                region,
                                virtio_device: None,
                                hotplugged_size,
                                hugepages: zone_config.hugepages,
                                blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
                            });
                        } else {
                            memory_zone.regions.push(region);
                        }
                    }
                }
            }
        }

        memory_regions.sort_by_key(|x| x.start_addr());

        Ok((memory_regions, memory_zones))
    }

    fn fill_saved_regions(
        &mut self,
        file_path: PathBuf,
        saved_regions: MemoryRangeTable,
    ) -> Result<(), Error> {
        if saved_regions.is_empty() {
            return Ok(());
        }

        // Open (read only) the snapshot file.
        let mut memory_file = OpenOptions::new()
            .read(true)
            .open(file_path)
            .map_err(Error::SnapshotOpen)?;

        let guest_memory = self.guest_memory.memory();
        for range in saved_regions.regions() {
            let mut offset: u64 = 0;
            // Manually handle the retry in case the whole region can't be
            // written at once: the read_exact_from() implementation from
            // vm_memory::GuestMemory can't be used here as it doesn't follow
            // the correct behavior.
            // For more info about this issue see:
            // https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = guest_memory
                    .read_volatile_from(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(Error::SnapshotCopy)?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }

    fn validate_memory_config(
        config: &MemoryConfig,
        user_provided_zones: bool,
    ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
        let mut allow_mem_hotplug = false;

        if !user_provided_zones {
            if config.zones.is_some() {
                error!(
                    "User defined memory regions can't be provided if the \
                    memory size is not 0"
                );
                return Err(Error::InvalidMemoryParameters);
            }

            if config.hotplug_size.is_some() {
                allow_mem_hotplug = true;
            }

            if let Some(hotplugged_size) = config.hotplugged_size {
                if let Some(hotplug_size) = config.hotplug_size {
                    if hotplugged_size > hotplug_size {
                        error!(
                            "'hotplugged_size' {} can't be bigger than \
                            'hotplug_size' {}",
                            hotplugged_size, hotplug_size,
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                } else {
                    error!(
                        "Invalid to define 'hotplugged_size' when there is \
                        no 'hotplug_size'"
                    );
                    return Err(Error::InvalidMemoryParameters);
                }
                if config.hotplug_method == HotplugMethod::Acpi {
                    error!(
                        "Invalid to define 'hotplugged_size' with hotplug \
                        method 'acpi'"
                    );
                    return Err(Error::InvalidMemoryParameters);
                }
            }

            // Create a single zone from the global memory config. This lets
            // us reuse the codepath for user defined memory zones.
            let zones = vec![MemoryZoneConfig {
                id: String::from(DEFAULT_MEMORY_ZONE),
                size: config.size,
                file: None,
                shared: config.shared,
                hugepages: config.hugepages,
                hugepage_size: config.hugepage_size,
                host_numa_node: None,
                hotplug_size: config.hotplug_size,
                hotplugged_size: config.hotplugged_size,
                prefault: config.prefault,
            }];

            Ok((config.size, zones, allow_mem_hotplug))
        } else {
            if config.zones.is_none() {
                error!(
                    "User defined memory regions must be provided if the \
                    memory size is 0"
                );
                return Err(Error::MissingMemoryZones);
            }

            // Safe to unwrap as we checked right above that 'zones' is
            // present.
            let zones = config.zones.clone().unwrap();
            if zones.is_empty() {
                return Err(Error::MissingMemoryZones);
            }

            let mut total_ram_size: u64 = 0;
            for zone in zones.iter() {
                total_ram_size += zone.size;

                if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
                    error!(
                        "Invalid to set host NUMA policy for a memory zone \
                        backed by a regular file and mapped as 'shared'"
                    );
                    return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
                }

                if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
                    error!("Invalid to set ACPI hotplug method for memory zones");
                    return Err(Error::InvalidHotplugMethodWithMemoryZones);
                }

                if let Some(hotplugged_size) = zone.hotplugged_size {
                    if let Some(hotplug_size) = zone.hotplug_size {
                        if hotplugged_size > hotplug_size {
                            error!(
                                "'hotplugged_size' {} can't be bigger than \
                                'hotplug_size' {}",
                                hotplugged_size, hotplug_size,
                            );
                            return Err(Error::InvalidMemoryParameters);
                        }
                    } else {
                        error!(
                            "Invalid to define 'hotplugged_size' when there is \
                            no 'hotplug_size' for a memory zone"
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                    if config.hotplug_method == HotplugMethod::Acpi {
                        error!(
                            "Invalid to define 'hotplugged_size' with hotplug \
                            method 'acpi'"
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                }
            }

            Ok((total_ram_size, zones, allow_mem_hotplug))
        }
    }

    pub fn allocate_address_space(&mut self) -> Result<(), Error> {
        let mut list = Vec::new();

        for (zone_id, memory_zone) in self.memory_zones.iter() {
            let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
                memory_zone
                    .regions()
                    .iter()
                    .map(|r| (r.clone(), false))
                    .collect();

            if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
                regions.push((virtio_mem_zone.region().clone(), true));
            }

            list.push((zone_id.clone(), regions));
        }

        for (zone_id, regions) in list {
            for (region, virtio_mem) in regions {
                let slot = self.create_userspace_mapping(
                    region.start_addr().raw_value(),
                    region.len(),
                    region.as_ptr() as u64,
                    self.mergeable,
                    false,
                    self.log_dirty,
                )?;

                let file_offset = if let Some(file_offset) = region.file_offset() {
                    file_offset.start()
                } else {
                    0
                };

                self.guest_ram_mappings.push(GuestRamMapping {
                    gpa: region.start_addr().raw_value(),
                    size: region.len(),
                    slot,
                    zone_id: zone_id.clone(),
                    virtio_mem,
                    file_offset,
                });
                self.ram_allocator
                    .allocate(Some(region.start_addr()), region.len(), None)
                    .ok_or(Error::MemoryRangeAllocation)?;
            }
        }

        // Allocate SubRegion and Reserved address ranges.
        for region in self.arch_mem_regions.iter() {
            if region.r_type == RegionType::Ram {
                // Ignore the RAM type since ranges have already been allocated
                // based on the GuestMemory regions.
                continue;
            }
            self.ram_allocator
                .allocate(
                    Some(GuestAddress(region.base)),
                    region.size as GuestUsize,
                    None,
                )
                .ok_or(Error::MemoryRangeAllocation)?;
        }

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    pub fn add_uefi_flash(&mut self) -> Result<(), Error> {
        // On AArch64, the UEFI binary requires a flash device at address 0.
        // A 4 MiB memory region is mapped to simulate the flash.
        let uefi_mem_slot = self.allocate_memory_slot();
        let uefi_region = GuestRegionMmap::new(
            MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
            arch::layout::UEFI_START,
        )
        .unwrap();
        let uefi_mem_region = self.vm.make_user_memory_region(
            uefi_mem_slot,
            uefi_region.start_addr().raw_value(),
            uefi_region.len(),
            uefi_region.as_ptr() as u64,
            false,
            false,
        );
        self.vm
            .create_user_memory_region(uefi_mem_region)
            .map_err(Error::CreateUefiFlash)?;

        let uefi_flash =
            GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());

        self.uefi_flash = Some(uefi_flash);

        Ok(())
    }

    #[allow(clippy::too_many_arguments)]
    pub fn new(
        vm: Arc<dyn hypervisor::Vm>,
        config: &MemoryConfig,
        prefault: Option<bool>,
        phys_bits: u8,
        #[cfg(feature = "tdx")] tdx_enabled: bool,
        restore_data: Option<&MemoryManagerSnapshotData>,
        existing_memory_files: Option<HashMap<u32, File>>,
        #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
    ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
        trace_scoped!("MemoryManager::new");

        let user_provided_zones = config.size == 0;

        let mmio_address_space_size = mmio_address_space_size(phys_bits);
        debug_assert_eq!(
            (((mmio_address_space_size) >> 16) << 16),
            mmio_address_space_size
        );
        let start_of_platform_device_area =
            GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
        let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);

        let (ram_size, zones, allow_mem_hotplug) =
            Self::validate_memory_config(config, user_provided_zones)?;

        let (
            start_of_device_area,
            boot_ram,
            current_ram,
            arch_mem_regions,
            memory_zones,
            guest_memory,
            boot_guest_memory,
            hotplug_slots,
            next_memory_slot,
            selected_slot,
            next_hotplug_slot,
        ) = if let Some(data) = restore_data {
            let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
                &data.guest_ram_mappings,
                &zones,
                prefault,
                existing_memory_files.unwrap_or_default(),
                config.thp,
            )?;
            let guest_memory =
                GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
            let boot_guest_memory = guest_memory.clone();
            (
                GuestAddress(data.start_of_device_area),
                data.boot_ram,
                data.current_ram,
                data.arch_mem_regions.clone(),
                memory_zones,
                guest_memory,
                boot_guest_memory,
                data.hotplug_slots.clone(),
                data.next_memory_slot,
                data.selected_slot,
                data.next_hotplug_slot,
            )
        } else {
            // Init guest memory
            let arch_mem_regions = arch::arch_memory_regions();

            let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
                .iter()
                .filter(|r| r.2 == RegionType::Ram)
                .map(|r| (r.0, r.1))
                .collect();

            let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
                .iter()
                .map(|(a, b, c)| ArchMemRegion {
                    base: a.0,
                    size: *b,
                    r_type: *c,
                })
                .collect();

            let (mem_regions, mut memory_zones) =
                Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;

            let mut guest_memory =
                GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;

            let boot_guest_memory = guest_memory.clone();

            let mut start_of_device_area =
                MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;

            // Update the list of memory zones for resize.
            for zone in zones.iter() {
                if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
                    if let Some(hotplug_size) = zone.hotplug_size {
                        if hotplug_size == 0 {
                            error!("'hotplug_size' can't be 0");
                            return Err(Error::InvalidHotplugSize);
                        }

                        if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
                            start_of_device_area = start_of_device_area
                                .checked_add(hotplug_size)
                                .ok_or(Error::GuestAddressOverFlow)?;
                        } else {
                            // Alignment must be "natural" i.e. same as size of block
                            let start_addr = GuestAddress(
                                start_of_device_area
                                    .0
                                    .div_ceil(virtio_devices::VIRTIO_MEM_ALIGN_SIZE)
                                    * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
                            );

                            // When `prefault` is set by vm_restore, the memory
                            // manager will create the RAM region with the
                            // `prefault` option from the restore config rather
                            // than the one from the zone.
                            let region = MemoryManager::create_ram_region(
                                &None,
                                0,
                                start_addr,
                                hotplug_size as usize,
                                prefault.unwrap_or(zone.prefault),
                                zone.shared,
                                zone.hugepages,
                                zone.hugepage_size,
                                zone.host_numa_node,
                                None,
                                config.thp,
                            )?;

                            guest_memory = guest_memory
                                .insert_region(Arc::clone(&region))
                                .map_err(Error::GuestMemory)?;

                            let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
                            let region_size = region.len();
                            memory_zone.virtio_mem_zone = Some(VirtioMemZone {
                                region,
                                virtio_device: None,
                                hotplugged_size,
                                hugepages: zone.hugepages,
                                blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
                            });

                            start_of_device_area = start_addr
                                .checked_add(hotplug_size)
                                .ok_or(Error::GuestAddressOverFlow)?;
                        }
                    }
                } else {
                    return Err(Error::MissingZoneIdentifier);
                }
            }

            let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
            hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);

            (
                start_of_device_area,
                ram_size,
                ram_size,
                arch_mem_regions,
                memory_zones,
                guest_memory,
                boot_guest_memory,
                hotplug_slots,
                0,
                0,
                0,
            )
        };

        let guest_memory = GuestMemoryAtomic::new(guest_memory);

        let allocator = Arc::new(Mutex::new(
            SystemAllocator::new(
                GuestAddress(0),
                1 << 16,
                start_of_platform_device_area,
                PLATFORM_DEVICE_AREA_SIZE,
                #[cfg(target_arch = "x86_64")]
                vec![GsiApic::new(
                    X86_64_IRQ_BASE,
                    ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
                )],
            )
            .ok_or(Error::CreateSystemAllocator)?,
        ));

        #[cfg(not(feature = "tdx"))]
        let dynamic = true;
        #[cfg(feature = "tdx")]
        let dynamic = !tdx_enabled;

        let acpi_address = if dynamic
            && config.hotplug_method == HotplugMethod::Acpi
            && (config.hotplug_size.unwrap_or_default() > 0)
        {
            Some(
                allocator
                    .lock()
                    .unwrap()
                    .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
                    .ok_or(Error::AllocateMmioAddress)?,
            )
        } else {
            None
        };

        // If running with SGX, the start of the device area and the RAM area
        // may diverge, but at this point they are next to each other.
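        // For instance (illustrative values only): with start_of_device_area
        // at 0x1_0000_0000 (4 GiB), end_of_ram_area below is 0xFFFF_FFFF and
        // the RAM allocator covers the range [0, 4 GiB).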
        let end_of_ram_area = start_of_device_area.unchecked_sub(1);
        let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();

        #[allow(unused_mut)]
        let mut memory_manager = MemoryManager {
            boot_guest_memory,
            guest_memory,
            next_memory_slot: Arc::new(AtomicU32::new(next_memory_slot)),
            memory_slot_free_list: Arc::new(Mutex::new(Vec::new())),
            start_of_device_area,
            end_of_device_area,
            end_of_ram_area,
            vm,
            hotplug_slots,
            selected_slot,
            mergeable: config.mergeable,
            allocator,
            hotplug_method: config.hotplug_method,
            boot_ram,
            current_ram,
            next_hotplug_slot,
            shared: config.shared,
            hugepages: config.hugepages,
            hugepage_size: config.hugepage_size,
            prefault: config.prefault,
            #[cfg(target_arch = "x86_64")]
            sgx_epc_region: None,
            user_provided_zones,
            snapshot_memory_ranges: MemoryRangeTable::default(),
            memory_zones,
            guest_ram_mappings: Vec::new(),
            acpi_address,
            log_dirty: dynamic, // Cannot log dirty pages on a TD
            arch_mem_regions,
            ram_allocator,
            dynamic,
            #[cfg(target_arch = "aarch64")]
            uefi_flash: None,
            thp: config.thp,
        };

        #[cfg(target_arch = "x86_64")]
        if let Some(sgx_epc_config) = sgx_epc_config {
            memory_manager.setup_sgx(sgx_epc_config)?;
        }

        Ok(Arc::new(Mutex::new(memory_manager)))
    }

    pub fn new_from_snapshot(
        snapshot: &Snapshot,
        vm: Arc<dyn hypervisor::Vm>,
        config: &MemoryConfig,
        source_url: Option<&str>,
        prefault: bool,
        phys_bits: u8,
    ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
        if let Some(source_url) = source_url {
            let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
            memory_file_path.push(String::from(SNAPSHOT_FILENAME));

            let mem_snapshot: MemoryManagerSnapshotData =
                snapshot.to_state().map_err(Error::Restore)?;

            let mm = MemoryManager::new(
                vm,
                config,
                Some(prefault),
                phys_bits,
                #[cfg(feature = "tdx")]
                false,
                Some(&mem_snapshot),
                None,
                #[cfg(target_arch = "x86_64")]
                None,
            )?;

            mm.lock()
                .unwrap()
                .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;

            Ok(mm)
        } else {
            Err(Error::RestoreMissingSourceUrl)
        }
    }

    fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
        // SAFETY: FFI call with correct arguments
        let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };

        if res < 0 {
            Err(io::Error::last_os_error())
        } else {
            Ok(res as RawFd)
        }
    }

    fn mbind(
        addr: *mut u8,
        len: u64,
        mode: u32,
        nodemask: Vec<u64>,
        maxnode: u64,
        flags: u32,
    ) -> Result<(), io::Error> {
        // SAFETY: FFI call with correct arguments
        let res = unsafe {
            libc::syscall(
                libc::SYS_mbind,
                addr as *mut libc::c_void,
                len,
                mode,
                nodemask.as_ptr(),
                maxnode,
                flags,
            )
        };

        if res < 0 {
            Err(io::Error::last_os_error())
        } else {
            Ok(())
        }
    }

    fn create_anonymous_file(
        size: usize,
        hugepages: bool,
        hugepage_size: Option<u64>,
    ) -> Result<FileOffset, Error> {
        let fd = Self::memfd_create(
            &ffi::CString::new("ch_ram").unwrap(),
            libc::MFD_CLOEXEC
                | if hugepages {
                    libc::MFD_HUGETLB
                        | if let Some(hugepage_size) = hugepage_size {
                            /*
                             * From the Linux kernel:
                             * Several system calls take a flag to request "hugetlb" huge pages.
                             * Without further specification, these system calls will use the
                             * system's default huge page size. If a system supports multiple
                             * huge page sizes, the desired huge page size can be specified in
                             * bits [26:31] of the flag arguments. The value in these 6 bits
                             * will encode the log2 of the huge page size.
                             */

                            hugepage_size.trailing_zeros() << 26
                        } else {
                            // Use the system default huge page size
                            0
                        }
                } else {
                    0
                },
        )
        .map_err(Error::SharedFileCreate)?;

        // SAFETY: fd is valid
        let f = unsafe { File::from_raw_fd(fd) };
        f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;

        Ok(FileOffset::new(f, 0))
    }

    fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> {
        if backing_file.is_dir() {
            Err(Error::DirectoryAsBackingFileForMemory)
        } else {
            let f = OpenOptions::new()
                .read(true)
                .write(true)
                .open(backing_file)
                .map_err(Error::SharedFileCreate)?;

            Ok(FileOffset::new(f, file_offset))
        }
    }

    #[allow(clippy::too_many_arguments)]
    pub fn create_ram_region(
        backing_file: &Option<PathBuf>,
        file_offset: u64,
        start_addr: GuestAddress,
        size: usize,
        prefault: bool,
        shared: bool,
        hugepages: bool,
        hugepage_size: Option<u64>,
        host_numa_node: Option<u32>,
        existing_memory_file: Option<File>,
        thp: bool,
    ) -> Result<Arc<GuestRegionMmap>, Error> {
        let mut mmap_flags = libc::MAP_NORESERVE;

        // The duplication of mmap_flags ORing here is unfortunate but it also makes
        // the complexity of the handling clear.
        let fo = if let Some(f) = existing_memory_file {
            // It must be MAP_SHARED as we wouldn't already have an FD
            mmap_flags |= libc::MAP_SHARED;
            Some(FileOffset::new(f, file_offset))
        } else if let Some(backing_file) = backing_file {
            if shared {
                mmap_flags |= libc::MAP_SHARED;
            } else {
                mmap_flags |= libc::MAP_PRIVATE;
            }
            Some(Self::open_backing_file(backing_file, file_offset)?)
        } else if shared || hugepages {
            // For hugepages we must also use MAP_SHARED, otherwise we trigger #4805:
            // MAP_PRIVATE would cause CoW against the backing file when combined
            // with VFIO pinning.
            mmap_flags |= libc::MAP_SHARED;
            Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
        } else {
            mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
            None
        };

        let region = GuestRegionMmap::new(
            MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
                .map_err(Error::GuestMemoryRegion)?,
            start_addr,
        )
        .map_err(Error::GuestMemory)?;

        // Apply the NUMA policy if needed.
        if let Some(node) = host_numa_node {
            let addr = region.deref().as_ptr();
            let len = region.deref().size() as u64;
            let mode = MPOL_BIND;
            let mut nodemask: Vec<u64> = Vec::new();
            let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;

            // Linux is kind of buggy in the way it interprets maxnode as it
            // will cut off the last node. That's why we have to add 1 to what
            // we would consider as the proper maxnode value.
            let maxnode = node as u64 + 1 + 1;

            // Allocate the right size for the vector.
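            // Illustrative example (values not from the code): for
            // host_numa_node = 65 the vector below gets two u64 words,
            // bit 1 of word 1 is set (65 = 64 + 1), and maxnode is passed
            // as 67 because of the off-by-one quirk described above.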
            nodemask.resize((node as usize / 64) + 1, 0);

            // Fill the global bitmask through the nodemask vector.
            let idx = (node / 64) as usize;
            let shift = node % 64;
            nodemask[idx] |= 1u64 << shift;

            // Policies are enforced by using MPOL_MF_MOVE flag as it will
            // force the kernel to move all pages that might have been already
            // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
            // used to throw an error if MPOL_MF_MOVE didn't succeed.
            // MPOL_BIND is the selected mode as it specifies a strict policy
            // that restricts memory allocation to the nodes specified in the
            // nodemask.
            Self::mbind(addr, len, mode, nodemask, maxnode, flags)
                .map_err(Error::ApplyNumaPolicy)?;
        }

        // Prefault the region if needed, in parallel.
        if prefault {
            let page_size =
                Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize;

            if !is_aligned(size, page_size) {
                warn!(
                    "Prefaulting memory size {} misaligned with page size {}",
                    size, page_size
                );
            }

            let num_pages = size / page_size;

            let num_threads = Self::get_prefault_num_threads(page_size, num_pages);

            let pages_per_thread = num_pages / num_threads;
            let remainder = num_pages % num_threads;

            let barrier = Arc::new(Barrier::new(num_threads));
            thread::scope(|s| {
                let r = &region;
                for i in 0..num_threads {
                    let barrier = Arc::clone(&barrier);
                    s.spawn(move || {
                        // Wait until all threads have been spawned to avoid contention
                        // over mmap_sem between thread stack allocation and page faulting.
                        barrier.wait();
                        let pages = pages_per_thread + if i < remainder { 1 } else { 0 };
                        let offset =
                            page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
                        // SAFETY: FFI call with correct arguments
                        let ret = unsafe {
                            let addr = r.as_ptr().add(offset);
                            libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE)
                        };
                        if ret != 0 {
                            let e = io::Error::last_os_error();
                            warn!("Failed to prefault pages: {}", e);
                        }
                    });
                }
            });
        }

        if region.file_offset().is_none() && thp {
            info!(
                "Anonymous mapping at 0x{:x} (size = 0x{:x})",
                region.as_ptr() as u64,
                size
            );
            // SAFETY: FFI call with correct arguments
            let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
            if ret != 0 {
                let e = io::Error::last_os_error();
                warn!("Failed to mark pages as THP eligible: {}", e);
            }
        }

        Ok(Arc::new(region))
    }

    // Duplicate of `memory_zone_get_align_size` that does not require a `zone`
    fn get_prefault_align_size(
        backing_file: &Option<PathBuf>,
        hugepages: bool,
        hugepage_size: Option<u64>,
    ) -> Result<u64, Error> {
        // SAFETY: FFI call. Trivially safe.
        let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
        match (hugepages, hugepage_size, backing_file) {
            (false, _, _) => Ok(page_size),
            (true, Some(hugepage_size), _) => Ok(hugepage_size),
            (true, None, _) => {
                // There are two scenarios here:
                // - `hugepages` is enabled but `hugepage_size` is not specified:
                //   Call `statfs` on `/dev/hugepages` to get the default hugepage size
                // - The backing file is specified:
                //   Call `statfs` on the file and get its `f_bsize`.
                //   If the value is larger than the page size of a normal page, just use the
                //   `f_bsize` because the file is in a hugetlbfs. If the value is less than or
                //   equal to the page size, just use the page size.
                let path = backing_file
                    .as_ref()
                    .map_or(Ok("/dev/hugepages"), |pathbuf| {
                        pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
                    })?;
                let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
                Ok(align_size)
            }
        }
    }

    fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize {
        let mut n: usize = 1;

        // Do not create more threads than processors available.
        // SAFETY: FFI call. Trivially safe.
        let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) };
        if procs > 0 {
            n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT);
        }

        // Do not create more threads than pages being allocated.
        n = std::cmp::min(n, num_pages);

        // Do not create threads to allocate less than 64 MiB of memory.
        n = std::cmp::min(
            n,
            std::cmp::max(1, page_size * num_pages / (64 * (1 << 20))),
        );

        n
    }
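    // Illustrative example (numbers chosen for demonstration only): with a
    // 4 KiB page size, an 8 GiB region and 32 online CPUs, the limits above
    // give min(16, 2_097_152, max(1, 8 GiB / 64 MiB = 128)) = 16 threads,
    // while a 16 MiB region collapses to a single thread.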
    // Update the GuestMemoryMmap with the new range
    fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
        let guest_memory = self
            .guest_memory
            .memory()
            .insert_region(region)
            .map_err(Error::GuestMemory)?;
        self.guest_memory.lock().unwrap().replace(guest_memory);

        Ok(())
    }

    //
    // Calculate the start address of an area next to RAM.
    //
    // If memory hotplug is allowed, the start address needs to be aligned
    // (rounded-up) to a 128 MiB boundary.
    // If memory hotplug is not allowed, there is no alignment required.
    // In addition, it must start at the 64-bit RAM start when the end of RAM
    // is below the 32-bit reserved area.
    fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
        let mut start_addr = if allow_mem_hotplug {
            GuestAddress(mem_end.0 | ((128 << 20) - 1))
        } else {
            mem_end
        };

        start_addr = start_addr
            .checked_add(1)
            .ok_or(Error::GuestAddressOverFlow)?;

        #[cfg(not(target_arch = "riscv64"))]
        if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
            return Ok(arch::layout::RAM_64BIT_START);
        }

        Ok(start_addr)
    }

    pub fn add_ram_region(
        &mut self,
        start_addr: GuestAddress,
        size: usize,
    ) -> Result<Arc<GuestRegionMmap>, Error> {
        // Allocate memory for the region
        let region = MemoryManager::create_ram_region(
            &None,
            0,
            start_addr,
            size,
            self.prefault,
            self.shared,
            self.hugepages,
            self.hugepage_size,
            None,
            None,
            self.thp,
        )?;

        // Map it into the guest
        let slot = self.create_userspace_mapping(
            region.start_addr().0,
            region.len(),
            region.as_ptr() as u64,
            self.mergeable,
            false,
            self.log_dirty,
        )?;
        self.guest_ram_mappings.push(GuestRamMapping {
            gpa: region.start_addr().raw_value(),
            size: region.len(),
            slot,
            zone_id: DEFAULT_MEMORY_ZONE.to_string(),
            virtio_mem: false,
            file_offset: 0,
        });

        self.add_region(Arc::clone(&region))?;

        Ok(region)
    }

    fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
        info!("Hotplugging new RAM: {}", size);

        // Check that there is a free slot
        if self.next_hotplug_slot >= HOTPLUG_COUNT {
            return Err(Error::NoSlotAvailable);
        }

        // An "inserted" DIMM must have a size that is a multiple of 128 MiB
        if size % (128 << 20) != 0 {
            return Err(Error::InvalidSize);
        }

        let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;

        if start_addr
            .checked_add((size - 1).try_into().unwrap())
            .unwrap()
            > self.end_of_ram_area
        {
            return Err(Error::InsufficientHotplugRam);
        }

        let region = self.add_ram_region(start_addr, size)?;

        // Add the region to the list of regions associated with the default
        // memory zone.
        if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
            memory_zone.regions.push(Arc::clone(&region));
        }

        // Tell the allocator
        self.ram_allocator
            .allocate(Some(start_addr), size as GuestUsize, None)
            .ok_or(Error::MemoryRangeAllocation)?;

        // Update the slot so that it can be queried via the I/O port
        let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
        slot.active = true;
        slot.inserting = true;
        slot.base = region.start_addr().0;
        slot.length = region.len();

        self.next_hotplug_slot += 1;

        Ok(region)
    }

    pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
        self.guest_memory.clone()
    }

    pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
        self.boot_guest_memory.clone()
    }

    pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
        self.allocator.clone()
    }

    pub fn start_of_device_area(&self) -> GuestAddress {
        self.start_of_device_area
    }

    pub fn end_of_device_area(&self) -> GuestAddress {
        self.end_of_device_area
    }

    pub fn memory_slot_allocator(&mut self) -> MemorySlotAllocator {
        let memory_slot_free_list = Arc::clone(&self.memory_slot_free_list);
        let next_memory_slot = Arc::clone(&self.next_memory_slot);
        MemorySlotAllocator::new(next_memory_slot, memory_slot_free_list)
    }

    pub fn allocate_memory_slot(&mut self) -> u32 {
        self.memory_slot_allocator().next_memory_slot()
    }

    pub fn create_userspace_mapping(
        &mut self,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        mergeable: bool,
        readonly: bool,
        log_dirty: bool,
    ) -> Result<u32, Error> {
        let slot = self.allocate_memory_slot();
        let mem_region = self.vm.make_user_memory_region(
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            readonly,
            log_dirty,
        );

        info!(
            "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
            guest_phys_addr, userspace_addr, memory_size, slot
        );

        self.vm
            .create_user_memory_region(mem_region)
            .map_err(Error::CreateUserMemoryRegion)?;

        // SAFETY: the address and size are valid since the
        // mmap succeeded.
        let ret = unsafe {
            libc::madvise(
                userspace_addr as *mut libc::c_void,
                memory_size as libc::size_t,
                libc::MADV_DONTDUMP,
            )
        };
        if ret != 0 {
            let e = io::Error::last_os_error();
            warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e);
        }

        // Mark the pages as mergeable if explicitly asked for.
        if mergeable {
            // SAFETY: the address and size are valid since the
            // mmap succeeded.
            let ret = unsafe {
                libc::madvise(
                    userspace_addr as *mut libc::c_void,
                    memory_size as libc::size_t,
                    libc::MADV_MERGEABLE,
                )
            };
            if ret != 0 {
                let err = io::Error::last_os_error();
                // Safe to unwrap because the error is constructed with
                // last_os_error(), which ensures the output will be Some().
                let errno = err.raw_os_error().unwrap();
                if errno == libc::EINVAL {
                    warn!("kernel not configured with CONFIG_KSM");
                } else {
                    warn!("madvise error: {}", err);
                }
                warn!("failed to mark pages as mergeable");
            }
        }

        info!(
            "Created userspace mapping: {:x} -> {:x} {:x}",
            guest_phys_addr, userspace_addr, memory_size
        );

        Ok(slot)
    }

    pub fn remove_userspace_mapping(
        &mut self,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        mergeable: bool,
        slot: u32,
    ) -> Result<(), Error> {
        let mem_region = self.vm.make_user_memory_region(
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            false, /* readonly -- don't care */
            false, /* log dirty */
        );

        self.vm
            .remove_user_memory_region(mem_region)
            .map_err(Error::RemoveUserMemoryRegion)?;

        // Mark the pages as unmergeable if they were previously marked as
        // mergeable.
        if mergeable {
            // SAFETY: the address and size are valid as the region was
            // previously advised.
            let ret = unsafe {
                libc::madvise(
                    userspace_addr as *mut libc::c_void,
                    memory_size as libc::size_t,
                    libc::MADV_UNMERGEABLE,
                )
            };
            if ret != 0 {
                let err = io::Error::last_os_error();
                // Safe to unwrap because the error is constructed with
                // last_os_error(), which ensures the output will be Some().
                let errno = err.raw_os_error().unwrap();
                if errno == libc::EINVAL {
                    warn!("kernel not configured with CONFIG_KSM");
                } else {
                    warn!("madvise error: {}", err);
                }
                warn!("failed to mark pages as unmergeable");
            }
        }

        info!(
            "Removed userspace mapping: {:x} -> {:x} {:x}",
            guest_phys_addr, userspace_addr, memory_size
        );

        Ok(())
    }

    pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
        if let Some(memory_zone) = self.memory_zones.get_mut(id) {
            if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
                if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
                    virtio_mem_device
                        .lock()
                        .unwrap()
                        .resize(size)
                        .map_err(Error::VirtioMemResizeFail)?;
                }

                // Keep the hotplugged_size up to date.
                virtio_mem_zone.hotplugged_size = size;
            } else {
                error!("Failed resizing virtio-mem region: No virtio-mem handler");
                return Err(Error::MissingVirtioMemHandler);
            }

            return Ok(());
        }

        error!("Failed resizing virtio-mem region: Unknown memory zone");
        Err(Error::UnknownMemoryZone)
    }

    /// In case this function resulted in adding a new memory region to the
    /// guest memory, the new region is returned to the caller. The virtio-mem
    /// use case never adds a new region as the whole hotpluggable memory has
    /// already been allocated at boot time.
    pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
        if self.user_provided_zones {
            error!(
                "Not allowed to resize guest memory when backed with user \
                defined memory zones."
1936 ); 1937 return Err(Error::InvalidResizeWithMemoryZones); 1938 } 1939 1940 let mut region: Option<Arc<GuestRegionMmap>> = None; 1941 match self.hotplug_method { 1942 HotplugMethod::VirtioMem => { 1943 if desired_ram >= self.boot_ram { 1944 if !self.dynamic { 1945 return Ok(region); 1946 } 1947 1948 self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?; 1949 self.current_ram = desired_ram; 1950 } 1951 } 1952 HotplugMethod::Acpi => { 1953 if desired_ram > self.current_ram { 1954 if !self.dynamic { 1955 return Ok(region); 1956 } 1957 1958 region = 1959 Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?); 1960 self.current_ram = desired_ram; 1961 } 1962 } 1963 } 1964 Ok(region) 1965 } 1966 1967 pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> { 1968 if !self.user_provided_zones { 1969 error!( 1970 "Not allowed to resize guest memory zone when no zone is \ 1971 defined." 1972 ); 1973 return Err(Error::ResizeZone); 1974 } 1975 1976 self.virtio_mem_resize(id, virtio_mem_size) 1977 } 1978 1979 #[cfg(target_arch = "x86_64")] 1980 pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> { 1981 let file = OpenOptions::new() 1982 .read(true) 1983 .open("/dev/sgx_provision") 1984 .map_err(Error::SgxProvisionOpen)?; 1985 self.vm 1986 .enable_sgx_attribute(file) 1987 .map_err(Error::SgxEnableProvisioning)?; 1988 1989 // Go over each EPC section and verify its size is a 4k multiple. At 1990 // the same time, calculate the total size needed for the contiguous 1991 // EPC region. 1992 let mut epc_region_size = 0; 1993 for epc_section in sgx_epc_config.iter() { 1994 if epc_section.size == 0 { 1995 return Err(Error::EpcSectionSizeInvalid); 1996 } 1997 if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 { 1998 return Err(Error::EpcSectionSizeInvalid); 1999 } 2000 2001 epc_region_size += epc_section.size; 2002 } 2003 2004 // Place the SGX EPC region on a 4k boundary between the RAM and the device area 2005 let epc_region_start = 2006 GuestAddress(self.start_of_device_area.0.div_ceil(SGX_PAGE_SIZE) * SGX_PAGE_SIZE); 2007 2008 self.start_of_device_area = epc_region_start 2009 .checked_add(epc_region_size) 2010 .ok_or(Error::GuestAddressOverFlow)?; 2011 2012 let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize); 2013 info!( 2014 "SGX EPC region: 0x{:x} (0x{:x})", 2015 epc_region_start.0, epc_region_size 2016 ); 2017 2018 // Each section can be memory mapped into the allocated region. 2019 let mut epc_section_start = epc_region_start.raw_value(); 2020 for epc_section in sgx_epc_config.iter() { 2021 let file = OpenOptions::new() 2022 .read(true) 2023 .write(true) 2024 .open("/dev/sgx_vepc") 2025 .map_err(Error::SgxVirtEpcOpen)?; 2026 2027 let prot = PROT_READ | PROT_WRITE; 2028 let mut flags = MAP_NORESERVE | MAP_SHARED; 2029 if epc_section.prefault { 2030 flags |= MAP_POPULATE; 2031 } 2032 2033 // We can't use the vm-memory crate to perform the memory mapping 2034 // here as it would try to ensure the size of the backing file 2035 // matches the size of the expected mapping. The /dev/sgx_vepc 2036 // device does not work that way: it provides a file descriptor 2037 // whose size does not match the mapping size, as it's just a way to 2038 // let KVM know that an EPC section is being created for the guest.
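// MAP_NORESERVE keeps the (potentially large) EPC size from being charged
// against swap/overcommit accounting, and MAP_POPULATE (only set when the
// section requests prefault) populates the pages at mmap() time instead of
// on first guest access.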
2039 // SAFETY: FFI call with correct arguments 2040 let host_addr = unsafe { 2041 libc::mmap( 2042 std::ptr::null_mut(), 2043 epc_section.size as usize, 2044 prot, 2045 flags, 2046 file.as_raw_fd(), 2047 0, 2048 ) 2049 } as u64; 2050 2051 info!( 2052 "Adding SGX EPC section: 0x{:x} (0x{:x})", 2053 epc_section_start, epc_section.size 2054 ); 2055 2056 let _mem_slot = self.create_userspace_mapping( 2057 epc_section_start, 2058 epc_section.size, 2059 host_addr, 2060 false, 2061 false, 2062 false, 2063 )?; 2064 2065 sgx_epc_region.insert( 2066 epc_section.id.clone(), 2067 SgxEpcSection::new( 2068 GuestAddress(epc_section_start), 2069 epc_section.size as GuestUsize, 2070 ), 2071 ); 2072 2073 epc_section_start += epc_section.size; 2074 } 2075 2076 self.sgx_epc_region = Some(sgx_epc_region); 2077 2078 Ok(()) 2079 } 2080 2081 #[cfg(target_arch = "x86_64")] 2082 pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> { 2083 &self.sgx_epc_region 2084 } 2085 2086 pub fn is_hardlink(f: &File) -> bool { 2087 let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit(); 2088 // SAFETY: FFI call with correct arguments 2089 let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) }; 2090 if ret != 0 { 2091 error!("Couldn't fstat the backing file"); 2092 return false; 2093 } 2094 2095 // SAFETY: stat is valid 2096 unsafe { (*stat.as_ptr()).st_nlink as usize > 0 } 2097 } 2098 2099 pub fn memory_zones(&self) -> &MemoryZones { 2100 &self.memory_zones 2101 } 2102 2103 pub fn memory_zones_mut(&mut self) -> &mut MemoryZones { 2104 &mut self.memory_zones 2105 } 2106 2107 pub fn memory_range_table( 2108 &self, 2109 snapshot: bool, 2110 ) -> std::result::Result<MemoryRangeTable, MigratableError> { 2111 let mut table = MemoryRangeTable::default(); 2112 2113 for memory_zone in self.memory_zones.values() { 2114 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 2115 table.extend(virtio_mem_zone.plugged_ranges()); 2116 } 2117 2118 for region in memory_zone.regions() { 2119 if snapshot { 2120 if let Some(file_offset) = region.file_offset() { 2121 if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED) 2122 && Self::is_hardlink(file_offset.file()) 2123 { 2124 // In this very specific case, we know the memory 2125 // region is backed by a file on the host filesystem 2126 // that can be accessed by the user, and additionally 2127 // the mapping is shared, which means that modifications 2128 // to the content are written to the actual file. 2129 // When meeting these conditions, we can skip the 2130 // copy of the memory content for this specific region, 2131 // as we can assume the user will have it saved through 2132 // the backing file already. 
2133 continue; 2134 } 2135 } 2136 } 2137 2138 table.push(MemoryRange { 2139 gpa: region.start_addr().raw_value(), 2140 length: region.len(), 2141 }); 2142 } 2143 } 2144 2145 Ok(table) 2146 } 2147 2148 pub fn snapshot_data(&self) -> MemoryManagerSnapshotData { 2149 MemoryManagerSnapshotData { 2150 memory_ranges: self.snapshot_memory_ranges.clone(), 2151 guest_ram_mappings: self.guest_ram_mappings.clone(), 2152 start_of_device_area: self.start_of_device_area.0, 2153 boot_ram: self.boot_ram, 2154 current_ram: self.current_ram, 2155 arch_mem_regions: self.arch_mem_regions.clone(), 2156 hotplug_slots: self.hotplug_slots.clone(), 2157 next_memory_slot: self.next_memory_slot.load(Ordering::SeqCst), 2158 selected_slot: self.selected_slot, 2159 next_hotplug_slot: self.next_hotplug_slot, 2160 } 2161 } 2162 2163 pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> { 2164 let mut memory_slot_fds = HashMap::new(); 2165 for guest_ram_mapping in &self.guest_ram_mappings { 2166 let slot = guest_ram_mapping.slot; 2167 let guest_memory = self.guest_memory.memory(); 2168 let file = guest_memory 2169 .find_region(GuestAddress(guest_ram_mapping.gpa)) 2170 .unwrap() 2171 .file_offset() 2172 .unwrap() 2173 .file(); 2174 memory_slot_fds.insert(slot, file.as_raw_fd()); 2175 } 2176 memory_slot_fds 2177 } 2178 2179 pub fn acpi_address(&self) -> Option<GuestAddress> { 2180 self.acpi_address 2181 } 2182 2183 pub fn num_guest_ram_mappings(&self) -> u32 { 2184 self.guest_ram_mappings.len() as u32 2185 } 2186 2187 #[cfg(target_arch = "aarch64")] 2188 pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 2189 self.uefi_flash.as_ref().unwrap().clone() 2190 } 2191 2192 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2193 pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions { 2194 let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone(); 2195 mapping_sorted_by_gpa.sort_by_key(|m| m.gpa); 2196 2197 let mut mem_offset_in_elf = mem_offset; 2198 let mut ram_maps = BTreeMap::new(); 2199 for mapping in mapping_sorted_by_gpa.iter() { 2200 ram_maps.insert( 2201 mapping.gpa, 2202 CoredumpMemoryRegion { 2203 mem_offset_in_elf, 2204 mem_size: mapping.size, 2205 }, 2206 ); 2207 mem_offset_in_elf += mapping.size; 2208 } 2209 2210 CoredumpMemoryRegions { ram_maps } 2211 } 2212 2213 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 2214 pub fn coredump_iterate_save_mem( 2215 &mut self, 2216 dump_state: &DumpState, 2217 ) -> std::result::Result<(), GuestDebuggableError> { 2218 let snapshot_memory_ranges = self 2219 .memory_range_table(false) 2220 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2221 2222 if snapshot_memory_ranges.is_empty() { 2223 return Ok(()); 2224 } 2225 2226 let coredump_file = dump_state.file.as_ref().unwrap(); 2227 2228 let guest_memory = self.guest_memory.memory(); 2229 let mut total_bytes: u64 = 0; 2230 2231 for range in snapshot_memory_ranges.regions() { 2232 let mut offset: u64 = 0; 2233 loop { 2234 let bytes_written = guest_memory 2235 .write_volatile_to( 2236 GuestAddress(range.gpa + offset), 2237 &mut coredump_file.as_fd(), 2238 (range.length - offset) as usize, 2239 ) 2240 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2241 offset += bytes_written as u64; 2242 total_bytes += bytes_written as u64; 2243 2244 if offset == range.length { 2245 break; 2246 } 2247 } 2248 } 2249 2250 debug!("coredump total bytes {}", total_bytes); 2251 Ok(()) 2252 } 2253 2254 pub fn receive_memory_regions<F>( 2255 &mut self, 2256 ranges: 
&MemoryRangeTable, 2257 fd: &mut F, 2258 ) -> std::result::Result<(), MigratableError> 2259 where 2260 F: ReadVolatile, 2261 { 2262 let guest_memory = self.guest_memory(); 2263 let mem = guest_memory.memory(); 2264 2265 for range in ranges.regions() { 2266 let mut offset: u64 = 0; 2267 // Here we are manually handling the retry in case we can't read the 2268 // whole region at once because we can't use the implementation 2269 // from vm-memory::GuestMemory of read_exact_from() as it is not 2270 // following the correct behavior. For more info about this issue 2271 // see: https://github.com/rust-vmm/vm-memory/issues/174 2272 loop { 2273 let bytes_read = mem 2274 .read_volatile_from( 2275 GuestAddress(range.gpa + offset), 2276 fd, 2277 (range.length - offset) as usize, 2278 ) 2279 .map_err(|e| { 2280 MigratableError::MigrateReceive(anyhow!( 2281 "Error receiving memory from socket: {}", 2282 e 2283 )) 2284 })?; 2285 offset += bytes_read as u64; 2286 2287 if offset == range.length { 2288 break; 2289 } 2290 } 2291 } 2292 2293 Ok(()) 2294 } 2295 } 2296 2297 struct MemoryNotify { 2298 slot_id: usize, 2299 } 2300 2301 impl Aml for MemoryNotify { 2302 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2303 let object = aml::Path::new(&format!("M{:03}", self.slot_id)); 2304 aml::If::new( 2305 &aml::Equal::new(&aml::Arg(0), &self.slot_id), 2306 vec![&aml::Notify::new(&object, &aml::Arg(1))], 2307 ) 2308 .to_aml_bytes(sink) 2309 } 2310 } 2311 2312 struct MemorySlot { 2313 slot_id: usize, 2314 } 2315 2316 impl Aml for MemorySlot { 2317 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2318 aml::Device::new( 2319 format!("M{:03}", self.slot_id).as_str().into(), 2320 vec![ 2321 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")), 2322 &aml::Name::new("_UID".into(), &self.slot_id), 2323 /* 2324 _STA return value: 2325 Bit [0] – Set if the device is present. 2326 Bit [1] – Set if the device is enabled and decoding its resources. 2327 Bit [2] – Set if the device should be shown in the UI. 2328 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 2329 Bit [4] – Set if the battery is present. 2330 Bits [31:5] – Reserved (must be cleared).
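A slot that is present, enabled, visible in the UI and functioning therefore reports 0xF (bits [3:0] set); this is the value the MSTA method below returns when the MEN_ field indicates the selected slot is enabled, and 0 otherwise.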
2331 */ 2332 &aml::Method::new( 2333 "_STA".into(), 2334 0, 2335 false, 2336 // Call into MSTA method which will interrogate device 2337 vec![&aml::Return::new(&aml::MethodCall::new( 2338 "MSTA".into(), 2339 vec![&self.slot_id], 2340 ))], 2341 ), 2342 // Get details of memory 2343 &aml::Method::new( 2344 "_CRS".into(), 2345 0, 2346 false, 2347 // Call into MCRS which provides actual memory details 2348 vec![&aml::Return::new(&aml::MethodCall::new( 2349 "MCRS".into(), 2350 vec![&self.slot_id], 2351 ))], 2352 ), 2353 ], 2354 ) 2355 .to_aml_bytes(sink) 2356 } 2357 } 2358 2359 struct MemorySlots { 2360 slots: usize, 2361 } 2362 2363 impl Aml for MemorySlots { 2364 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2365 for slot_id in 0..self.slots { 2366 MemorySlot { slot_id }.to_aml_bytes(sink); 2367 } 2368 } 2369 } 2370 2371 struct MemoryMethods { 2372 slots: usize, 2373 } 2374 2375 impl Aml for MemoryMethods { 2376 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2377 // Add "MTFY" notification method 2378 let mut memory_notifies = Vec::new(); 2379 for slot_id in 0..self.slots { 2380 memory_notifies.push(MemoryNotify { slot_id }); 2381 } 2382 2383 let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new(); 2384 for memory_notifier in memory_notifies.iter() { 2385 memory_notifies_refs.push(memory_notifier); 2386 } 2387 2388 aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink); 2389 2390 // MSCN method 2391 aml::Method::new( 2392 "MSCN".into(), 2393 0, 2394 true, 2395 vec![ 2396 // Take lock defined above 2397 &aml::Acquire::new("MLCK".into(), 0xffff), 2398 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2399 &aml::While::new( 2400 &aml::LessThan::new(&aml::Local(0), &self.slots), 2401 vec![ 2402 // Write slot number (in first argument) to I/O port via field 2403 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)), 2404 // Check if MINS bit is set (inserting) 2405 &aml::If::new( 2406 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE), 2407 // Notify device if it is 2408 vec![ 2409 &aml::MethodCall::new( 2410 "MTFY".into(), 2411 vec![&aml::Local(0), &aml::ONE], 2412 ), 2413 // Reset MINS bit 2414 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE), 2415 ], 2416 ), 2417 // Check if MRMV bit is set 2418 &aml::If::new( 2419 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE), 2420 // Notify device if it is (with the eject constant 0x3) 2421 vec![ 2422 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]), 2423 // Reset MRMV bit 2424 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE), 2425 ], 2426 ), 2427 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 2428 ], 2429 ), 2430 // Release lock 2431 &aml::Release::new("MLCK".into()), 2432 ], 2433 ) 2434 .to_aml_bytes(sink); 2435 2436 // Memory status method 2437 aml::Method::new( 2438 "MSTA".into(), 2439 1, 2440 true, 2441 vec![ 2442 // Take lock defined above 2443 &aml::Acquire::new("MLCK".into(), 0xffff), 2444 // Write slot number (in first argument) to I/O port via field 2445 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)), 2446 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2447 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning) 2448 &aml::If::new( 2449 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE), 2450 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 2451 ), 2452 // Release lock 2453 &aml::Release::new("MLCK".into()), 2454 // 
Return 0 or 0xf 2455 &aml::Return::new(&aml::Local(0)), 2456 ], 2457 ) 2458 .to_aml_bytes(sink); 2459 2460 // Memory range method 2461 aml::Method::new( 2462 "MCRS".into(), 2463 1, 2464 true, 2465 vec![ 2466 // Take lock defined above 2467 &aml::Acquire::new("MLCK".into(), 0xffff), 2468 // Write slot number (in first argument) to I/O port via field 2469 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)), 2470 &aml::Name::new( 2471 "MR64".into(), 2472 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2473 aml::AddressSpaceCacheable::Cacheable, 2474 true, 2475 0x0000_0000_0000_0000u64, 2476 0xFFFF_FFFF_FFFF_FFFEu64, 2477 None, 2478 )]), 2479 ), 2480 &aml::CreateQWordField::new( 2481 &aml::Path::new("MINL"), 2482 &aml::Path::new("MR64"), 2483 &14usize, 2484 ), 2485 &aml::CreateDWordField::new( 2486 &aml::Path::new("MINH"), 2487 &aml::Path::new("MR64"), 2488 &18usize, 2489 ), 2490 &aml::CreateQWordField::new( 2491 &aml::Path::new("MAXL"), 2492 &aml::Path::new("MR64"), 2493 &22usize, 2494 ), 2495 &aml::CreateDWordField::new( 2496 &aml::Path::new("MAXH"), 2497 &aml::Path::new("MR64"), 2498 &26usize, 2499 ), 2500 &aml::CreateQWordField::new( 2501 &aml::Path::new("LENL"), 2502 &aml::Path::new("MR64"), 2503 &38usize, 2504 ), 2505 &aml::CreateDWordField::new( 2506 &aml::Path::new("LENH"), 2507 &aml::Path::new("MR64"), 2508 &42usize, 2509 ), 2510 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")), 2511 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")), 2512 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")), 2513 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")), 2514 &aml::Add::new( 2515 &aml::Path::new("MAXL"), 2516 &aml::Path::new("MINL"), 2517 &aml::Path::new("LENL"), 2518 ), 2519 &aml::Add::new( 2520 &aml::Path::new("MAXH"), 2521 &aml::Path::new("MINH"), 2522 &aml::Path::new("LENH"), 2523 ), 2524 &aml::If::new( 2525 &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")), 2526 vec![&aml::Add::new( 2527 &aml::Path::new("MAXH"), 2528 &aml::ONE, 2529 &aml::Path::new("MAXH"), 2530 )], 2531 ), 2532 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE), 2533 // Release lock 2534 &aml::Release::new("MLCK".into()), 2535 &aml::Return::new(&aml::Path::new("MR64")), 2536 ], 2537 ) 2538 .to_aml_bytes(sink) 2539 } 2540 } 2541 2542 impl Aml for MemoryManager { 2543 fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) { 2544 if let Some(acpi_address) = self.acpi_address { 2545 // Memory Hotplug Controller 2546 aml::Device::new( 2547 "_SB_.MHPC".into(), 2548 vec![ 2549 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2550 &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"), 2551 // Mutex to protect concurrent access as we write to choose slot and then read back status 2552 &aml::Mutex::new("MLCK".into(), 0), 2553 &aml::Name::new( 2554 "_CRS".into(), 2555 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2556 aml::AddressSpaceCacheable::NotCacheable, 2557 true, 2558 acpi_address.0, 2559 acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1, 2560 None, 2561 )]), 2562 ), 2563 // OpRegion and Fields map MMIO range into individual field values 2564 &aml::OpRegion::new( 2565 "MHPR".into(), 2566 aml::OpRegionSpace::SystemMemory, 2567 &(acpi_address.0 as usize), 2568 &MEMORY_MANAGER_ACPI_SIZE, 2569 ), 2570 &aml::Field::new( 2571 "MHPR".into(), 2572 aml::FieldAccessType::DWord, 2573 
aml::FieldLockRule::NoLock, 2574 aml::FieldUpdateRule::Preserve, 2575 vec![ 2576 aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes) 2577 aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes) 2578 aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes) 2579 aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes) 2580 ], 2581 ), 2582 &aml::Field::new( 2583 "MHPR".into(), 2584 aml::FieldAccessType::DWord, 2585 aml::FieldLockRule::NoLock, 2586 aml::FieldUpdateRule::Preserve, 2587 vec![ 2588 aml::FieldEntry::Reserved(128), 2589 aml::FieldEntry::Named(*b"MHPX", 32), // PXM 2590 ], 2591 ), 2592 &aml::Field::new( 2593 "MHPR".into(), 2594 aml::FieldAccessType::Byte, 2595 aml::FieldLockRule::NoLock, 2596 aml::FieldUpdateRule::WriteAsZeroes, 2597 vec![ 2598 aml::FieldEntry::Reserved(160), 2599 aml::FieldEntry::Named(*b"MEN_", 1), // Enabled 2600 aml::FieldEntry::Named(*b"MINS", 1), // Inserting 2601 aml::FieldEntry::Named(*b"MRMV", 1), // Removing 2602 aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting 2603 ], 2604 ), 2605 &aml::Field::new( 2606 "MHPR".into(), 2607 aml::FieldAccessType::DWord, 2608 aml::FieldLockRule::NoLock, 2609 aml::FieldUpdateRule::Preserve, 2610 vec![ 2611 aml::FieldEntry::Named(*b"MSEL", 32), // Selector 2612 aml::FieldEntry::Named(*b"MOEV", 32), // Event 2613 aml::FieldEntry::Named(*b"MOSC", 32), // OSC 2614 ], 2615 ), 2616 &MemoryMethods { 2617 slots: self.hotplug_slots.len(), 2618 }, 2619 &MemorySlots { 2620 slots: self.hotplug_slots.len(), 2621 }, 2622 ], 2623 ) 2624 .to_aml_bytes(sink); 2625 } else { 2626 aml::Device::new( 2627 "_SB_.MHPC".into(), 2628 vec![ 2629 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")), 2630 &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"), 2631 // Empty MSCN for GED 2632 &aml::Method::new("MSCN".into(), 0, true, vec![]), 2633 ], 2634 ) 2635 .to_aml_bytes(sink); 2636 } 2637 2638 #[cfg(target_arch = "x86_64")] 2639 { 2640 if let Some(sgx_epc_region) = &self.sgx_epc_region { 2641 let min = sgx_epc_region.start().raw_value(); 2642 let max = min + sgx_epc_region.size() - 1; 2643 // SGX EPC region 2644 aml::Device::new( 2645 "_SB_.EPC_".into(), 2646 vec![ 2647 &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")), 2648 // QWORD describing the EPC region start and size 2649 &aml::Name::new( 2650 "_CRS".into(), 2651 &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory( 2652 aml::AddressSpaceCacheable::NotCacheable, 2653 true, 2654 min, 2655 max, 2656 None, 2657 )]), 2658 ), 2659 &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]), 2660 ], 2661 ) 2662 .to_aml_bytes(sink); 2663 } 2664 } 2665 } 2666 } 2667 2668 impl Pausable for MemoryManager {} 2669 2670 #[derive(Clone, Serialize, Deserialize)] 2671 pub struct MemoryManagerSnapshotData { 2672 memory_ranges: MemoryRangeTable, 2673 guest_ram_mappings: Vec<GuestRamMapping>, 2674 start_of_device_area: u64, 2675 boot_ram: u64, 2676 current_ram: u64, 2677 arch_mem_regions: Vec<ArchMemRegion>, 2678 hotplug_slots: Vec<HotPlugState>, 2679 next_memory_slot: u32, 2680 selected_slot: usize, 2681 next_hotplug_slot: usize, 2682 } 2683 2684 impl Snapshottable for MemoryManager { 2685 fn id(&self) -> String { 2686 MEMORY_MANAGER_SNAPSHOT_ID.to_string() 2687 } 2688 2689 fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> { 2690 let memory_ranges = self.memory_range_table(true)?; 2691 2692 // Store locally this list of ranges as it will be used through the 2693 // Transportable::send() implementation. 
The point is to avoid the 2694 // duplication of code regarding the creation of the path for each 2695 // region. The 'snapshot' step creates the list of memory regions, 2696 // including information about the need to copy a memory region or 2697 // not. This saves the 'send' step having to go through the same 2698 // process, and instead it can directly proceed with storing the 2699 // memory range content for the ranges requiring it. 2700 self.snapshot_memory_ranges = memory_ranges; 2701 2702 Ok(Snapshot::from_data(SnapshotData::new_from_state( 2703 &self.snapshot_data(), 2704 )?)) 2705 } 2706 } 2707 2708 impl Transportable for MemoryManager { 2709 fn send( 2710 &self, 2711 _snapshot: &Snapshot, 2712 destination_url: &str, 2713 ) -> result::Result<(), MigratableError> { 2714 if self.snapshot_memory_ranges.is_empty() { 2715 return Ok(()); 2716 } 2717 2718 let mut memory_file_path = url_to_path(destination_url)?; 2719 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 2720 2721 // Create the snapshot file for the entire memory 2722 let mut memory_file = OpenOptions::new() 2723 .read(true) 2724 .write(true) 2725 .create_new(true) 2726 .open(memory_file_path) 2727 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2728 2729 let guest_memory = self.guest_memory.memory(); 2730 2731 for range in self.snapshot_memory_ranges.regions() { 2732 let mut offset: u64 = 0; 2733 // Here we are manually handling the retry in case we can't read 2734 // the whole region at once because we can't use the implementation 2735 // from vm-memory::GuestMemory of write_all_to() as it is not 2736 // following the correct behavior. For more info about this issue 2737 // see: https://github.com/rust-vmm/vm-memory/issues/174 2738 loop { 2739 let bytes_written = guest_memory 2740 .write_volatile_to( 2741 GuestAddress(range.gpa + offset), 2742 &mut memory_file, 2743 (range.length - offset) as usize, 2744 ) 2745 .map_err(|e| MigratableError::MigrateSend(e.into()))?; 2746 offset += bytes_written as u64; 2747 2748 if offset == range.length { 2749 break; 2750 } 2751 } 2752 } 2753 Ok(()) 2754 } 2755 } 2756 2757 impl Migratable for MemoryManager { 2758 // Start the dirty log in the hypervisor (kvm/mshv). 2759 // Also, reset the dirty bitmap logged by the vmm. 2760 // Just before we do a bulk copy we want to start/clear the dirty log so that 2761 // pages touched during our bulk copy are tracked. 2762 fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2763 self.vm.start_dirty_log().map_err(|e| { 2764 MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e)) 2765 })?; 2766 2767 for r in self.guest_memory.memory().iter() { 2768 (**r).bitmap().reset(); 2769 } 2770 2771 Ok(()) 2772 } 2773 2774 fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> { 2775 self.vm.stop_dirty_log().map_err(|e| { 2776 MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e)) 2777 })?; 2778 2779 Ok(()) 2780 } 2781 2782 // Generate a table for the pages that are dirty. The dirty pages are collapsed 2783 // together in the table if they are contiguous. 
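// For each guest RAM mapping, the per-slot bitmap returned by the hypervisor
// is OR'ed with the VMM's own dirty bitmap for the matching guest memory
// region, so that pages written by the vCPUs and pages written by the VMM
// itself are both reported.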
2784 fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> { 2785 let mut table = MemoryRangeTable::default(); 2786 for r in &self.guest_ram_mappings { 2787 let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| { 2788 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e)) 2789 })?; 2790 let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa)) 2791 { 2792 Some(region) => { 2793 assert!(region.start_addr().raw_value() == r.gpa); 2794 assert!(region.len() == r.size); 2795 (**region).bitmap().get_and_reset() 2796 } 2797 None => { 2798 return Err(MigratableError::MigrateSend(anyhow!( 2799 "Error finding 'guest memory region' with address {:x}", 2800 r.gpa 2801 ))) 2802 } 2803 }; 2804 2805 let dirty_bitmap: Vec<u64> = vm_dirty_bitmap 2806 .iter() 2807 .zip(vmm_dirty_bitmap.iter()) 2808 .map(|(x, y)| x | y) 2809 .collect(); 2810 2811 let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096); 2812 2813 if sub_table.regions().is_empty() { 2814 info!("Dirty Memory Range Table is empty"); 2815 } else { 2816 info!("Dirty Memory Range Table:"); 2817 for range in sub_table.regions() { 2818 info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024); 2819 } 2820 } 2821 2822 table.extend(sub_table); 2823 } 2824 Ok(table) 2825 } 2826 } 2827
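// The sketch below (added for illustration; this test module and its values
// are not part of the original file) shows how a dirty-page bitmap of the kind
// collected by dirty_log() is turned into a MemoryRangeTable: each set bit
// stands for one dirty page of the given page size, and contiguous dirty
// pages are collapsed into a single range.
#[cfg(test)]
mod dirty_range_table_sketch {
    use super::*;

    #[test]
    fn contiguous_dirty_pages_are_collapsed() {
        // 64 contiguous dirty 4 KiB pages collapse into one 256 KiB range.
        let table = MemoryRangeTable::from_bitmap(vec![u64::MAX], 0x10_0000, 4096);
        assert_eq!(table.regions().len(), 1);
        assert_eq!(table.regions()[0].gpa, 0x10_0000);
        assert_eq!(table.regions()[0].length, 64 * 4096);

        // A bitmap with no bits set produces an empty table.
        let empty = MemoryRangeTable::from_bitmap(vec![0u64], 0x10_0000, 4096);
        assert!(empty.regions().is_empty());
    }
}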