xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision 19d36c765fdf00be749d95b3e61028bc302d6d73)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 
6 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
7 use std::collections::BTreeMap;
8 use std::collections::HashMap;
9 use std::fs::{File, OpenOptions};
10 use std::io::{self};
11 use std::ops::{BitAnd, Deref, Not, Sub};
12 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
13 use std::os::fd::AsFd;
14 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
15 use std::path::PathBuf;
16 use std::sync::{Arc, Barrier, Mutex};
17 use std::{ffi, result, thread};
18 
19 use acpi_tables::{aml, Aml};
20 use anyhow::anyhow;
21 #[cfg(target_arch = "x86_64")]
22 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
23 use arch::RegionType;
24 #[cfg(target_arch = "x86_64")]
25 use devices::ioapic;
26 #[cfg(target_arch = "aarch64")]
27 use hypervisor::HypervisorVmError;
28 use libc::_SC_NPROCESSORS_ONLN;
29 #[cfg(target_arch = "x86_64")]
30 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
31 use serde::{Deserialize, Serialize};
32 use tracer::trace_scoped;
33 use virtio_devices::BlocksState;
34 #[cfg(target_arch = "x86_64")]
35 use vm_allocator::GsiApic;
36 use vm_allocator::{AddressAllocator, SystemAllocator};
37 use vm_device::BusDevice;
38 use vm_memory::bitmap::AtomicBitmap;
39 use vm_memory::guest_memory::FileOffset;
40 use vm_memory::mmap::MmapRegionError;
41 use vm_memory::{
42     Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
43     GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile,
44 };
45 use vm_migration::protocol::{MemoryRange, MemoryRangeTable};
46 use vm_migration::{
47     Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, Transportable,
48 };
49 
50 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
51 use crate::coredump::{
52     CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
53 };
54 use crate::migration::url_to_path;
55 #[cfg(target_arch = "x86_64")]
56 use crate::vm_config::SgxEpcConfig;
57 use crate::vm_config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
58 use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID};
59 
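// Size (in bytes) of the MMIO region occupied by the memory manager ACPI device.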
60 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
61 
62 const DEFAULT_MEMORY_ZONE: &str = "mem0";
63 
64 const SNAPSHOT_FILENAME: &str = "memory-ranges";
65 
66 #[cfg(target_arch = "x86_64")]
67 const X86_64_IRQ_BASE: u32 = 5;
68 
69 #[cfg(target_arch = "x86_64")]
70 const SGX_PAGE_SIZE: u64 = 1 << 12;
71 
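// Number of memory hotplug slots exposed through the ACPI memory hotplug device.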
72 const HOTPLUG_COUNT: usize = 8;
73 
74 // Memory policy constants (from the Linux mbind(2) interface)
75 const MPOL_BIND: u32 = 2;
76 const MPOL_MF_STRICT: u32 = 1;
77 const MPOL_MF_MOVE: u32 = 1 << 1;
78 
79 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
80 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
81 
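// Maximum number of threads used to prefault a memory region in parallel.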
82 const MAX_PREFAULT_THREAD_COUNT: usize = 16;
83 
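// State of a single memory hotplug slot, as exposed through the device registers below.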
84 #[derive(Clone, Default, Serialize, Deserialize)]
85 struct HotPlugState {
86     base: u64,
87     length: u64,
88     active: bool,
89     inserting: bool,
90     removing: bool,
91 }
92 
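// Virtio-mem state attached to a memory zone: the hotpluggable region, the device handle
// once it has been created, and the bookkeeping of plugged blocks.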
93 pub struct VirtioMemZone {
94     region: Arc<GuestRegionMmap>,
95     virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
96     hotplugged_size: u64,
97     hugepages: bool,
98     blocks_state: Arc<Mutex<BlocksState>>,
99 }
100 
101 impl VirtioMemZone {
102     pub fn region(&self) -> &Arc<GuestRegionMmap> {
103         &self.region
104     }
105     pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
106         self.virtio_device = Some(virtio_device);
107     }
108     pub fn hotplugged_size(&self) -> u64 {
109         self.hotplugged_size
110     }
111     pub fn hugepages(&self) -> bool {
112         self.hugepages
113     }
114     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
115         &self.blocks_state
116     }
117     pub fn plugged_ranges(&self) -> MemoryRangeTable {
118         self.blocks_state
119             .lock()
120             .unwrap()
121             .memory_ranges(self.region.start_addr().raw_value(), true)
122     }
123 }
124 
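// Set of RAM regions created for a single memory zone, plus its optional virtio-mem zone.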
125 #[derive(Default)]
126 pub struct MemoryZone {
127     regions: Vec<Arc<GuestRegionMmap>>,
128     virtio_mem_zone: Option<VirtioMemZone>,
129 }
130 
131 impl MemoryZone {
132     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
133         &self.regions
134     }
135     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
136         &self.virtio_mem_zone
137     }
138     pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
139         self.virtio_mem_zone.as_mut()
140     }
141 }
142 
143 pub type MemoryZones = HashMap<String, MemoryZone>;
144 
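// Records one guest RAM mapping (hypervisor slot, GPA, size, owning zone) so that it can be
// serialized and recreated when restoring from a snapshot.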
145 #[derive(Clone, Serialize, Deserialize)]
146 struct GuestRamMapping {
147     slot: u32,
148     gpa: u64,
149     size: u64,
150     zone_id: String,
151     virtio_mem: bool,
152     file_offset: u64,
153 }
154 
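// Serializable view of an architectural memory region and its type.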
155 #[derive(Clone, Serialize, Deserialize)]
156 struct ArchMemRegion {
157     base: u64,
158     size: usize,
159     r_type: RegionType,
160 }
161 
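// Central manager for guest RAM: owns the memory regions and zones, the hotplug state, the
// address allocators and the hypervisor memory slot mappings.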
162 pub struct MemoryManager {
163     boot_guest_memory: GuestMemoryMmap,
164     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
165     next_memory_slot: u32,
166     start_of_device_area: GuestAddress,
167     end_of_device_area: GuestAddress,
168     end_of_ram_area: GuestAddress,
169     pub vm: Arc<dyn hypervisor::Vm>,
170     hotplug_slots: Vec<HotPlugState>,
171     selected_slot: usize,
172     mergeable: bool,
173     allocator: Arc<Mutex<SystemAllocator>>,
174     hotplug_method: HotplugMethod,
175     boot_ram: u64,
176     current_ram: u64,
177     next_hotplug_slot: usize,
178     shared: bool,
179     hugepages: bool,
180     hugepage_size: Option<u64>,
181     prefault: bool,
182     thp: bool,
183     #[cfg(target_arch = "x86_64")]
184     sgx_epc_region: Option<SgxEpcRegion>,
185     user_provided_zones: bool,
186     snapshot_memory_ranges: MemoryRangeTable,
187     memory_zones: MemoryZones,
188     log_dirty: bool, // Enable dirty logging for created RAM regions
189     arch_mem_regions: Vec<ArchMemRegion>,
190     ram_allocator: AddressAllocator,
191     dynamic: bool,
192 
193     // Keep track of calls to create_userspace_mapping() for guest RAM.
194     // This is useful for getting the dirty pages as we need to know the
195     // slots that the mapping is created in.
196     guest_ram_mappings: Vec<GuestRamMapping>,
197 
198     pub acpi_address: Option<GuestAddress>,
199     #[cfg(target_arch = "aarch64")]
200     uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
201 }
202 
203 #[derive(Debug)]
204 pub enum Error {
205     /// Failed to create shared file.
206     SharedFileCreate(io::Error),
207 
208     /// Failed to set shared file length.
209     SharedFileSetLen(io::Error),
210 
211     /// Mmap backed guest memory error
212     GuestMemory(MmapError),
213 
214     /// Failed to allocate a memory range.
215     MemoryRangeAllocation,
216 
217     /// Error from region creation
218     GuestMemoryRegion(MmapRegionError),
219 
220     /// No ACPI slot available
221     NoSlotAvailable,
222 
223     /// Not enough space in the hotplug RAM region
224     InsufficientHotplugRam,
225 
226     /// The requested hotplug memory addition is not a valid size
227     InvalidSize,
228 
229     /// Failed to create the user memory region.
230     CreateUserMemoryRegion(hypervisor::HypervisorVmError),
231 
232     /// Failed to remove the user memory region.
233     RemoveUserMemoryRegion(hypervisor::HypervisorVmError),
234 
235     /// Failed to create EventFd.
236     EventFdFail(io::Error),
237 
238     /// Eventfd write error
239     EventfdError(io::Error),
240 
241     /// Failed to virtio-mem resize
242     VirtioMemResizeFail(virtio_devices::mem::Error),
243 
244     /// Cannot restore VM
245     Restore(MigratableError),
246 
247     /// Cannot restore VM because source URL is missing
248     RestoreMissingSourceUrl,
249 
250     /// Cannot create the system allocator
251     CreateSystemAllocator,
252 
253     /// Invalid SGX EPC section size
254     #[cfg(target_arch = "x86_64")]
255     EpcSectionSizeInvalid,
256 
257     /// Failed allocating SGX EPC region
258     #[cfg(target_arch = "x86_64")]
259     SgxEpcRangeAllocation,
260 
261     /// Failed opening SGX virtual EPC device
262     #[cfg(target_arch = "x86_64")]
263     SgxVirtEpcOpen(io::Error),
264 
265     /// Failed setting the SGX virtual EPC section size
266     #[cfg(target_arch = "x86_64")]
267     SgxVirtEpcFileSetLen(io::Error),
268 
269     /// Failed opening SGX provisioning device
270     #[cfg(target_arch = "x86_64")]
271     SgxProvisionOpen(io::Error),
272 
273     /// Failed enabling SGX provisioning
274     #[cfg(target_arch = "x86_64")]
275     SgxEnableProvisioning(hypervisor::HypervisorVmError),
276 
277     /// Failed creating a new MmapRegion instance.
278     #[cfg(target_arch = "x86_64")]
279     NewMmapRegion(vm_memory::mmap::MmapRegionError),
280 
281     /// No memory zones found.
282     MissingMemoryZones,
283 
284     /// Memory configuration is not valid.
285     InvalidMemoryParameters,
286 
287     /// Forbidden operation. Impossible to resize guest memory if it is
288     /// backed by user defined memory regions.
289     InvalidResizeWithMemoryZones,
290 
291     /// It's invalid to try applying a NUMA policy to a memory zone that is
292     /// memory mapped with MAP_SHARED.
293     InvalidSharedMemoryZoneWithHostNuma,
294 
295     /// Failed applying NUMA memory policy.
296     ApplyNumaPolicy(io::Error),
297 
298     /// Memory zone identifier is not unique.
299     DuplicateZoneId,
300 
301     /// No virtio-mem resizing handler found.
302     MissingVirtioMemHandler,
303 
304     /// Unknown memory zone.
305     UnknownMemoryZone,
306 
307     /// Invalid size for resizing. The size can be anything except 0.
308     InvalidHotplugSize,
309 
310     /// Invalid hotplug method associated with memory zones resizing capability.
311     InvalidHotplugMethodWithMemoryZones,
312 
313     /// Could not find specified memory zone identifier from hash map.
314     MissingZoneIdentifier,
315 
316     /// Resizing the memory zone failed.
317     ResizeZone,
318 
319     /// Guest address overflow
320     GuestAddressOverFlow,
321 
322     /// Error opening snapshot file
323     SnapshotOpen(io::Error),
324 
325     /// Error copying snapshot into region
326     SnapshotCopy(GuestMemoryError),
327 
328     /// Failed to allocate MMIO address
329     AllocateMmioAddress,
330 
331     #[cfg(target_arch = "aarch64")]
332     /// Failed to create UEFI flash
333     CreateUefiFlash(HypervisorVmError),
334 
335     /// Using a directory as a backing file for memory is not supported
336     DirectoryAsBackingFileForMemory,
337 
338     /// Failed to stat filesystem
339     GetFileSystemBlockSize(io::Error),
340 
341     /// Memory size is not aligned to the default page size or the configured hugepage size
342     MisalignedMemorySize,
343 }
344 
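// Bit positions within the status register of the memory hotplug device.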
345 const ENABLE_FLAG: usize = 0;
346 const INSERTING_FLAG: usize = 1;
347 const REMOVING_FLAG: usize = 2;
348 const EJECT_FLAG: usize = 3;
349 
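// Register offsets of the memory hotplug device (see the BusDevice implementation below).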
350 const BASE_OFFSET_LOW: u64 = 0;
351 const BASE_OFFSET_HIGH: u64 = 0x4;
352 const LENGTH_OFFSET_LOW: u64 = 0x8;
353 const LENGTH_OFFSET_HIGH: u64 = 0xC;
354 const STATUS_OFFSET: u64 = 0x14;
355 const SELECTION_OFFSET: u64 = 0;
356 
357 // The MMIO address space size is reduced by 64 KiB. This is done for the
358 // following reasons:
359 //  - Reduce the addressable space size by at least 4k to workaround a Linux
360 //    bug when the VMM allocates devices at the end of the addressable space
361 //  - Windows requires the addressable space size to be 64k aligned
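// For example, phys_bits = 40 gives (1 << 40) - (1 << 16) = 0xff_ffff_0000, i.e. 1 TiB minus 64 KiB.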
362 fn mmio_address_space_size(phys_bits: u8) -> u64 {
363     (1 << phys_bits) - (1 << 16)
364 }
365 
366 // The `statfs` function can retrieve information about a hugetlbfs mount, and the hugepage
367 // size is reported in the `f_bsize` field.
368 //
369 // See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
370 fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
371     let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
372     let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();
373 
374     // SAFETY: FFI call with a valid path and buffer
375     let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
376     if ret != 0 {
377         return Err(Error::GetFileSystemBlockSize(
378             std::io::Error::last_os_error(),
379         ));
380     }
381 
382     // SAFETY: `buf` is valid at this point
383     // Because this value is always positive, just convert it directly.
384     // Note that `f_bsize` is `i64` in glibc but `u64` in musl, so using `as u64` would
385     // trigger a `clippy` warning on musl targets. To avoid the warning, `as _` is used
386     // instead of `as u64`.
387     let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
388     Ok(bsize)
389 }
390 
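// Determine the alignment a memory zone's size must respect: the system page size by default,
// or the hugepage size when hugepages or a hugetlbfs-backed file are used.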
391 fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
392     // SAFETY: FFI call. Trivially safe.
393     let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
394 
395     // If there is no backing file and `hugepages` is disabled, just use the system page size.
396     if zone.file.is_none() && !zone.hugepages {
397         return Ok(page_size);
398     }
399 
400     // If `hugepages` is enabled and a `hugepage_size` is specified, just use it directly.
401     if zone.hugepages && zone.hugepage_size.is_some() {
402         return Ok(zone.hugepage_size.unwrap());
403     }
404 
405     // There are two scenarios here:
406     //  - `hugepages` is enabled but `hugepage_size` is not specified:
407     //     Call `statfs` on `/dev/hugepages` to get the default hugepage size
408     //  - A backing file is specified:
409     //     Call `statfs` on the file and get its `f_bsize`.  If the value is larger than the
410     //     normal page size, use `f_bsize` because the file lives on a hugetlbfs.  If the
411     //     value is less than or equal to the normal page size, just use the page size.
412     let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| {
413         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
414     })?;
415 
416     let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
417 
418     Ok(align_size)
419 }
420 
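// Round `val` down to a multiple of `align` (a power of two),
// e.g. align_down(0x2345, 0x1000) == 0x2000.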
421 #[inline]
422 fn align_down<T>(val: T, align: T) -> T
423 where
424     T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>,
425 {
426     val & !(align - 1u8.into())
427 }
428 
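// Check whether `val` is a multiple of `align` (a power of two),
// e.g. is_aligned(0x2000, 0x1000) is true while is_aligned(0x2345, 0x1000) is not.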
429 #[inline]
430 fn is_aligned<T>(val: T, align: T) -> bool
431 where
432     T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq,
433 {
434     (val & (align - 1u8.into())) == 0u8.into()
435 }
436 
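// Register interface backing the ACPI memory hotplug device: the guest selects a slot, reads
// its base, length and status, and acknowledges insertion/removal events through the status
// register.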
437 impl BusDevice for MemoryManager {
438     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
439         if self.selected_slot < self.hotplug_slots.len() {
440             let state = &self.hotplug_slots[self.selected_slot];
441             match offset {
442                 BASE_OFFSET_LOW => {
443                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
444                 }
445                 BASE_OFFSET_HIGH => {
446                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
447                 }
448                 LENGTH_OFFSET_LOW => {
449                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
450                 }
451                 LENGTH_OFFSET_HIGH => {
452                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
453                 }
454                 STATUS_OFFSET => {
455                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
456                     data.fill(0);
457                     if state.active {
458                         data[0] |= 1 << ENABLE_FLAG;
459                     }
460                     if state.inserting {
461                         data[0] |= 1 << INSERTING_FLAG;
462                     }
463                     if state.removing {
464                         data[0] |= 1 << REMOVING_FLAG;
465                     }
466                 }
467                 _ => {
468                     warn!(
469                         "Unexpected offset for accessing memory manager device: {:#}",
470                         offset
471                     );
472                 }
473             }
474         } else {
475             warn!("Out of range memory slot: {}", self.selected_slot);
476         }
477     }
478 
479     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
480         match offset {
481             SELECTION_OFFSET => {
482                 self.selected_slot = usize::from(data[0]);
483             }
484             STATUS_OFFSET => {
485                 if self.selected_slot < self.hotplug_slots.len() {
486                     let state = &mut self.hotplug_slots[self.selected_slot];
487                     // The ACPI code writes back a 1 to acknowledge the insertion
488                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
489                         state.inserting = false;
490                     }
491                     // Ditto for removal
492                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
493                         state.removing = false;
494                     }
495                     // Trigger removal of "DIMM"
496                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
497                         warn!("Ejection of memory not currently supported");
498                     }
499                 } else {
500                     warn!("Out of range memory slot: {}", self.selected_slot);
501                 }
502             }
503             _ => {
504                 warn!(
505                     "Unexpected offset for accessing memory manager device: {:#}",
506                     offset
507                 );
508             }
509         };
510         None
511     }
512 }
513 
514 impl MemoryManager {
515     /// Creates all memory regions based on the available RAM ranges defined
516     /// by `ram_regions`, and based on the description of the memory zones.
517     /// In practice, this function can perform multiple memory mappings of the
518     /// same backing file if there's a hole in the address space between two
519     /// RAM ranges.
520     ///
521     /// One example might be `ram_regions` containing two regions (0-3G and 4G-6G)
522     /// and `zones` containing two zones (of size 1G and size 4G).
523     ///
524     /// This function will create 3 resulting memory regions:
525     /// - First one mapping entirely the first memory zone on 0-1G range
526     /// - Second one mapping partially the second memory zone on 1G-3G range
527     /// - Third one mapping partially the second memory zone on 4G-6G range
528     ///
529     /// Also, all memory regions are page-size aligned (i.e. their sizes must
530     /// be a multiple of the page size), which may leave an additional hole in
531     /// the address space when hugepages are used.
532     fn create_memory_regions_from_zones(
533         ram_regions: &[(GuestAddress, usize)],
534         zones: &[MemoryZoneConfig],
535         prefault: Option<bool>,
536         thp: bool,
537     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
538         let mut zone_iter = zones.iter();
539         let mut mem_regions = Vec::new();
540         let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?;
541         let mut zone_align_size = memory_zone_get_align_size(zone)?;
542         let mut zone_offset = 0u64;
543         let mut memory_zones = HashMap::new();
544 
545         if !is_aligned(zone.size, zone_align_size) {
546             return Err(Error::MisalignedMemorySize);
547         }
548 
549         // Add zone id to the list of memory zones.
550         memory_zones.insert(zone.id.clone(), MemoryZone::default());
551 
552         for ram_region in ram_regions.iter() {
553             let mut ram_region_offset = 0;
554             let mut exit = false;
555 
556             loop {
557                 let mut ram_region_consumed = false;
558                 let mut pull_next_zone = false;
559 
560                 let ram_region_available_size =
561                     align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size);
562                 if ram_region_available_size == 0 {
563                     break;
564                 }
565                 let zone_sub_size = zone.size - zone_offset;
566 
567                 let file_offset = zone_offset;
568                 let region_start = ram_region
569                     .0
570                     .checked_add(ram_region_offset)
571                     .ok_or(Error::GuestAddressOverFlow)?;
572                 let region_size = if zone_sub_size <= ram_region_available_size {
573                     if zone_sub_size == ram_region_available_size {
574                         ram_region_consumed = true;
575                     }
576 
577                     ram_region_offset += zone_sub_size;
578                     pull_next_zone = true;
579 
580                     zone_sub_size
581                 } else {
582                     zone_offset += ram_region_available_size;
583                     ram_region_consumed = true;
584 
585                     ram_region_available_size
586                 };
587 
588                 info!(
589                     "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}",
590                     zone.id,
591                     region_start.raw_value(),
592                     region_size
593                 );
594                 let region = MemoryManager::create_ram_region(
595                     &zone.file,
596                     file_offset,
597                     region_start,
598                     region_size as usize,
599                     prefault.unwrap_or(zone.prefault),
600                     zone.shared,
601                     zone.hugepages,
602                     zone.hugepage_size,
603                     zone.host_numa_node,
604                     None,
605                     thp,
606                 )?;
607 
608                 // Add region to the list of regions associated with the
609                 // current memory zone.
610                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
611                     memory_zone.regions.push(region.clone());
612                 }
613 
614                 mem_regions.push(region);
615 
616                 if pull_next_zone {
617                     // Get the next zone and reset the offset.
618                     zone_offset = 0;
619                     if let Some(z) = zone_iter.next() {
620                         zone = z;
621                     } else {
622                         exit = true;
623                         break;
624                     }
625                     zone_align_size = memory_zone_get_align_size(zone)?;
626                     if !is_aligned(zone.size, zone_align_size) {
627                         return Err(Error::MisalignedMemorySize);
628                     }
629 
630                     // Check if zone id already exist. In case it does, throw
631                     // an error as we need unique identifiers. Otherwise, add
632                     // the new zone id to the list of memory zones.
633                     if memory_zones.contains_key(&zone.id) {
634                         error!(
635                             "Memory zone identifier '{}' found more than once. \
636                             It must be unique",
637                             zone.id,
638                         );
639                         return Err(Error::DuplicateZoneId);
640                     }
641                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
642                 }
643 
644                 if ram_region_consumed {
645                     break;
646                 }
647             }
648 
649             if exit {
650                 break;
651             }
652         }
653 
654         Ok((mem_regions, memory_zones))
655     }
656 
657     // Restore both GuestMemory regions along with MemoryZone zones.
658     fn restore_memory_regions_and_zones(
659         guest_ram_mappings: &[GuestRamMapping],
660         zones_config: &[MemoryZoneConfig],
661         prefault: Option<bool>,
662         mut existing_memory_files: HashMap<u32, File>,
663         thp: bool,
664     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
665         let mut memory_regions = Vec::new();
666         let mut memory_zones = HashMap::new();
667 
668         for zone_config in zones_config {
669             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
670         }
671 
672         for guest_ram_mapping in guest_ram_mappings {
673             for zone_config in zones_config {
674                 if guest_ram_mapping.zone_id == zone_config.id {
675                     let region = MemoryManager::create_ram_region(
676                         if guest_ram_mapping.virtio_mem {
677                             &None
678                         } else {
679                             &zone_config.file
680                         },
681                         guest_ram_mapping.file_offset,
682                         GuestAddress(guest_ram_mapping.gpa),
683                         guest_ram_mapping.size as usize,
684                         prefault.unwrap_or(zone_config.prefault),
685                         zone_config.shared,
686                         zone_config.hugepages,
687                         zone_config.hugepage_size,
688                         zone_config.host_numa_node,
689                         existing_memory_files.remove(&guest_ram_mapping.slot),
690                         thp,
691                     )?;
692                     memory_regions.push(Arc::clone(&region));
693                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
694                         if guest_ram_mapping.virtio_mem {
695                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
696                             let region_size = region.len();
697                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
698                                 region,
699                                 virtio_device: None,
700                                 hotplugged_size,
701                                 hugepages: zone_config.hugepages,
702                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
703                             });
704                         } else {
705                             memory_zone.regions.push(region);
706                         }
707                     }
708                 }
709             }
710         }
711 
712         memory_regions.sort_by_key(|x| x.start_addr());
713 
714         Ok((memory_regions, memory_zones))
715     }
716 
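    // Copy each saved memory range from the snapshot file back into guest RAM.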
717     fn fill_saved_regions(
718         &mut self,
719         file_path: PathBuf,
720         saved_regions: MemoryRangeTable,
721     ) -> Result<(), Error> {
722         if saved_regions.is_empty() {
723             return Ok(());
724         }
725 
726         // Open (read only) the snapshot file.
727         let mut memory_file = OpenOptions::new()
728             .read(true)
729             .open(file_path)
730             .map_err(Error::SnapshotOpen)?;
731 
732         let guest_memory = self.guest_memory.memory();
733         for range in saved_regions.regions() {
734             let mut offset: u64 = 0;
735             // Here we are manually handling the retry in case we can't write
736             // the whole region at once, because we can't use the `read_exact_from()`
737             // implementation from `vm_memory::GuestMemory` as it does not follow
738             // the correct behavior. For more info about this issue see:
739             // https://github.com/rust-vmm/vm-memory/issues/174
740             loop {
741                 let bytes_read = guest_memory
742                     .read_volatile_from(
743                         GuestAddress(range.gpa + offset),
744                         &mut memory_file,
745                         (range.length - offset) as usize,
746                     )
747                     .map_err(Error::SnapshotCopy)?;
748                 offset += bytes_read as u64;
749 
750                 if offset == range.length {
751                     break;
752                 }
753             }
754         }
755 
756         Ok(())
757     }
758 
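    // Validate the memory configuration and normalize it into a total RAM size, a list of
    // memory zones (a single default zone is synthesized when none are user provided), and
    // whether ACPI memory hotplug is allowed.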
759     fn validate_memory_config(
760         config: &MemoryConfig,
761         user_provided_zones: bool,
762     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
763         let mut allow_mem_hotplug = false;
764 
765         if !user_provided_zones {
766             if config.zones.is_some() {
767                 error!(
768                     "User defined memory regions can't be provided if the \
769                     memory size is not 0"
770                 );
771                 return Err(Error::InvalidMemoryParameters);
772             }
773 
774             if config.hotplug_size.is_some() {
775                 allow_mem_hotplug = true;
776             }
777 
778             if let Some(hotplugged_size) = config.hotplugged_size {
779                 if let Some(hotplug_size) = config.hotplug_size {
780                     if hotplugged_size > hotplug_size {
781                         error!(
782                             "'hotplugged_size' {} can't be bigger than \
783                             'hotplug_size' {}",
784                             hotplugged_size, hotplug_size,
785                         );
786                         return Err(Error::InvalidMemoryParameters);
787                     }
788                 } else {
789                     error!(
790                         "Invalid to define 'hotplugged_size' when there is \
791                         no 'hotplug_size'"
792                     );
793                     return Err(Error::InvalidMemoryParameters);
794                 }
795                 if config.hotplug_method == HotplugMethod::Acpi {
796                     error!(
797                         "Invalid to define 'hotplugged_size' with hotplug \
798                         method 'acpi'"
799                     );
800                     return Err(Error::InvalidMemoryParameters);
801                 }
802             }
803 
804             // Create a single zone from the global memory config. This lets
805             // us reuse the codepath for user defined memory zones.
806             let zones = vec![MemoryZoneConfig {
807                 id: String::from(DEFAULT_MEMORY_ZONE),
808                 size: config.size,
809                 file: None,
810                 shared: config.shared,
811                 hugepages: config.hugepages,
812                 hugepage_size: config.hugepage_size,
813                 host_numa_node: None,
814                 hotplug_size: config.hotplug_size,
815                 hotplugged_size: config.hotplugged_size,
816                 prefault: config.prefault,
817             }];
818 
819             Ok((config.size, zones, allow_mem_hotplug))
820         } else {
821             if config.zones.is_none() {
822                 error!(
823                     "User defined memory regions must be provided if the \
824                     memory size is 0"
825                 );
826                 return Err(Error::MissingMemoryZones);
827             }
828 
829             // Safe to unwrap as we checked right above there were some
830             // regions.
831             let zones = config.zones.clone().unwrap();
832             if zones.is_empty() {
833                 return Err(Error::MissingMemoryZones);
834             }
835 
836             let mut total_ram_size: u64 = 0;
837             for zone in zones.iter() {
838                 total_ram_size += zone.size;
839 
840                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
841                     error!(
842                         "Invalid to set host NUMA policy for a memory zone \
843                         backed by a regular file and mapped as 'shared'"
844                     );
845                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
846                 }
847 
848                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
849                     error!("Invalid to set ACPI hotplug method for memory zones");
850                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
851                 }
852 
853                 if let Some(hotplugged_size) = zone.hotplugged_size {
854                     if let Some(hotplug_size) = zone.hotplug_size {
855                         if hotplugged_size > hotplug_size {
856                             error!(
857                                 "'hotplugged_size' {} can't be bigger than \
858                                 'hotplug_size' {}",
859                                 hotplugged_size, hotplug_size,
860                             );
861                             return Err(Error::InvalidMemoryParameters);
862                         }
863                     } else {
864                         error!(
865                             "Invalid to define 'hotplugged_size' when there is \
866                             no 'hotplug_size' for a memory zone"
867                         );
868                         return Err(Error::InvalidMemoryParameters);
869                     }
870                     if config.hotplug_method == HotplugMethod::Acpi {
871                         error!(
872                             "Invalid to define 'hotplugged_size' with hotplug \
873                             method 'acpi'"
874                         );
875                         return Err(Error::InvalidMemoryParameters);
876                     }
877                 }
878             }
879 
880             Ok((total_ram_size, zones, allow_mem_hotplug))
881         }
882     }
883 
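    // Create the hypervisor mappings for every RAM region (including virtio-mem regions) and
    // reserve their ranges, along with the non-RAM arch regions, in the RAM allocator.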
884     pub fn allocate_address_space(&mut self) -> Result<(), Error> {
885         let mut list = Vec::new();
886 
887         for (zone_id, memory_zone) in self.memory_zones.iter() {
888             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
889                 memory_zone
890                     .regions()
891                     .iter()
892                     .map(|r| (r.clone(), false))
893                     .collect();
894 
895             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
896                 regions.push((virtio_mem_zone.region().clone(), true));
897             }
898 
899             list.push((zone_id.clone(), regions));
900         }
901 
902         for (zone_id, regions) in list {
903             for (region, virtio_mem) in regions {
904                 let slot = self.create_userspace_mapping(
905                     region.start_addr().raw_value(),
906                     region.len(),
907                     region.as_ptr() as u64,
908                     self.mergeable,
909                     false,
910                     self.log_dirty,
911                 )?;
912 
913                 let file_offset = if let Some(file_offset) = region.file_offset() {
914                     file_offset.start()
915                 } else {
916                     0
917                 };
918 
919                 self.guest_ram_mappings.push(GuestRamMapping {
920                     gpa: region.start_addr().raw_value(),
921                     size: region.len(),
922                     slot,
923                     zone_id: zone_id.clone(),
924                     virtio_mem,
925                     file_offset,
926                 });
927                 self.ram_allocator
928                     .allocate(Some(region.start_addr()), region.len(), None)
929                     .ok_or(Error::MemoryRangeAllocation)?;
930             }
931         }
932 
933         // Allocate SubRegion and Reserved address ranges.
934         for region in self.arch_mem_regions.iter() {
935             if region.r_type == RegionType::Ram {
936                 // Ignore the RAM type since ranges have already been allocated
937                 // based on the GuestMemory regions.
938                 continue;
939             }
940             self.ram_allocator
941                 .allocate(
942                     Some(GuestAddress(region.base)),
943                     region.size as GuestUsize,
944                     None,
945                 )
946                 .ok_or(Error::MemoryRangeAllocation)?;
947         }
948 
949         Ok(())
950     }
951 
952     #[cfg(target_arch = "aarch64")]
953     fn add_uefi_flash(&mut self) -> Result<(), Error> {
954         // On AArch64, the UEFI binary requires a flash device at address 0.
955         // 4 MiB memory is mapped to simulate the flash.
956         let uefi_mem_slot = self.allocate_memory_slot();
957         let uefi_region = GuestRegionMmap::new(
958             MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
959             arch::layout::UEFI_START,
960         )
961         .unwrap();
962         let uefi_mem_region = self.vm.make_user_memory_region(
963             uefi_mem_slot,
964             uefi_region.start_addr().raw_value(),
965             uefi_region.len(),
966             uefi_region.as_ptr() as u64,
967             false,
968             false,
969         );
970         self.vm
971             .create_user_memory_region(uefi_mem_region)
972             .map_err(Error::CreateUefiFlash)?;
973 
974         let uefi_flash =
975             GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());
976 
977         self.uefi_flash = Some(uefi_flash);
978 
979         Ok(())
980     }
981 
982     #[allow(clippy::too_many_arguments)]
983     pub fn new(
984         vm: Arc<dyn hypervisor::Vm>,
985         config: &MemoryConfig,
986         prefault: Option<bool>,
987         phys_bits: u8,
988         #[cfg(feature = "tdx")] tdx_enabled: bool,
989         restore_data: Option<&MemoryManagerSnapshotData>,
990         existing_memory_files: Option<HashMap<u32, File>>,
991         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
992     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
993         trace_scoped!("MemoryManager::new");
994 
995         let user_provided_zones = config.size == 0;
996 
997         let mmio_address_space_size = mmio_address_space_size(phys_bits);
998         debug_assert_eq!(
999             (((mmio_address_space_size) >> 16) << 16),
1000             mmio_address_space_size
1001         );
1002         let start_of_platform_device_area =
1003             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
1004         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
1005 
1006         let (ram_size, zones, allow_mem_hotplug) =
1007             Self::validate_memory_config(config, user_provided_zones)?;
1008 
1009         let (
1010             start_of_device_area,
1011             boot_ram,
1012             current_ram,
1013             arch_mem_regions,
1014             memory_zones,
1015             guest_memory,
1016             boot_guest_memory,
1017             hotplug_slots,
1018             next_memory_slot,
1019             selected_slot,
1020             next_hotplug_slot,
1021         ) = if let Some(data) = restore_data {
1022             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
1023                 &data.guest_ram_mappings,
1024                 &zones,
1025                 prefault,
1026                 existing_memory_files.unwrap_or_default(),
1027                 config.thp,
1028             )?;
1029             let guest_memory =
1030                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
1031             let boot_guest_memory = guest_memory.clone();
1032             (
1033                 GuestAddress(data.start_of_device_area),
1034                 data.boot_ram,
1035                 data.current_ram,
1036                 data.arch_mem_regions.clone(),
1037                 memory_zones,
1038                 guest_memory,
1039                 boot_guest_memory,
1040                 data.hotplug_slots.clone(),
1041                 data.next_memory_slot,
1042                 data.selected_slot,
1043                 data.next_hotplug_slot,
1044             )
1045         } else {
1046             // Init guest memory
1047             let arch_mem_regions = arch::arch_memory_regions();
1048 
1049             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
1050                 .iter()
1051                 .filter(|r| r.2 == RegionType::Ram)
1052                 .map(|r| (r.0, r.1))
1053                 .collect();
1054 
1055             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
1056                 .iter()
1057                 .map(|(a, b, c)| ArchMemRegion {
1058                     base: a.0,
1059                     size: *b,
1060                     r_type: *c,
1061                 })
1062                 .collect();
1063 
1064             let (mem_regions, mut memory_zones) =
1065                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;
1066 
1067             let mut guest_memory =
1068                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
1069 
1070             let boot_guest_memory = guest_memory.clone();
1071 
1072             let mut start_of_device_area =
1073                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
1074 
1075             // Update list of memory zones for resize.
1076             for zone in zones.iter() {
1077                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
1078                     if let Some(hotplug_size) = zone.hotplug_size {
1079                         if hotplug_size == 0 {
1080                             error!("'hotplug_size' can't be 0");
1081                             return Err(Error::InvalidHotplugSize);
1082                         }
1083 
1084                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
1085                             start_of_device_area = start_of_device_area
1086                                 .checked_add(hotplug_size)
1087                                 .ok_or(Error::GuestAddressOverFlow)?;
1088                         } else {
1089                             // Alignment must be "natural", i.e. the same as the block size
1090                             let start_addr = GuestAddress(
1091                                 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
1092                                     - 1)
1093                                     / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
1094                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
1095                             );
1096 
1097                             // When `prefault` is set by vm_restore, the memory manager
1098                             // creates the RAM region with the `prefault` option from the
1099                             // restore config rather than the one from the zone config
1100                             let region = MemoryManager::create_ram_region(
1101                                 &None,
1102                                 0,
1103                                 start_addr,
1104                                 hotplug_size as usize,
1105                                 prefault.unwrap_or(zone.prefault),
1106                                 zone.shared,
1107                                 zone.hugepages,
1108                                 zone.hugepage_size,
1109                                 zone.host_numa_node,
1110                                 None,
1111                                 config.thp,
1112                             )?;
1113 
1114                             guest_memory = guest_memory
1115                                 .insert_region(Arc::clone(&region))
1116                                 .map_err(Error::GuestMemory)?;
1117 
1118                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
1119                             let region_size = region.len();
1120                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
1121                                 region,
1122                                 virtio_device: None,
1123                                 hotplugged_size,
1124                                 hugepages: zone.hugepages,
1125                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
1126                             });
1127 
1128                             start_of_device_area = start_addr
1129                                 .checked_add(hotplug_size)
1130                                 .ok_or(Error::GuestAddressOverFlow)?;
1131                         }
1132                     }
1133                 } else {
1134                     return Err(Error::MissingZoneIdentifier);
1135                 }
1136             }
1137 
1138             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
1139             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
1140 
1141             (
1142                 start_of_device_area,
1143                 ram_size,
1144                 ram_size,
1145                 arch_mem_regions,
1146                 memory_zones,
1147                 guest_memory,
1148                 boot_guest_memory,
1149                 hotplug_slots,
1150                 0,
1151                 0,
1152                 0,
1153             )
1154         };
1155 
1156         let guest_memory = GuestMemoryAtomic::new(guest_memory);
1157 
1158         // Both MMIO and PIO address spaces start at address 0.
1159         let allocator = Arc::new(Mutex::new(
1160             SystemAllocator::new(
1161                 #[cfg(target_arch = "x86_64")]
1162                 {
1163                     GuestAddress(0)
1164                 },
1165                 #[cfg(target_arch = "x86_64")]
1166                 {
1167                     1 << 16
1168                 },
1169                 start_of_platform_device_area,
1170                 PLATFORM_DEVICE_AREA_SIZE,
1171                 #[cfg(target_arch = "x86_64")]
1172                 vec![GsiApic::new(
1173                     X86_64_IRQ_BASE,
1174                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1175                 )],
1176             )
1177             .ok_or(Error::CreateSystemAllocator)?,
1178         ));
1179 
1180         #[cfg(not(feature = "tdx"))]
1181         let dynamic = true;
1182         #[cfg(feature = "tdx")]
1183         let dynamic = !tdx_enabled;
1184 
1185         let acpi_address = if dynamic
1186             && config.hotplug_method == HotplugMethod::Acpi
1187             && (config.hotplug_size.unwrap_or_default() > 0)
1188         {
1189             Some(
1190                 allocator
1191                     .lock()
1192                     .unwrap()
1193                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1194                     .ok_or(Error::AllocateMmioAddress)?,
1195             )
1196         } else {
1197             None
1198         };
1199 
1200         // If running on SGX the start of device area and RAM area may diverge but
1201         // at this point they are next to each other.
1202         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1203         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1204 
1205         let mut memory_manager = MemoryManager {
1206             boot_guest_memory,
1207             guest_memory,
1208             next_memory_slot,
1209             start_of_device_area,
1210             end_of_device_area,
1211             end_of_ram_area,
1212             vm,
1213             hotplug_slots,
1214             selected_slot,
1215             mergeable: config.mergeable,
1216             allocator,
1217             hotplug_method: config.hotplug_method,
1218             boot_ram,
1219             current_ram,
1220             next_hotplug_slot,
1221             shared: config.shared,
1222             hugepages: config.hugepages,
1223             hugepage_size: config.hugepage_size,
1224             prefault: config.prefault,
1225             #[cfg(target_arch = "x86_64")]
1226             sgx_epc_region: None,
1227             user_provided_zones,
1228             snapshot_memory_ranges: MemoryRangeTable::default(),
1229             memory_zones,
1230             guest_ram_mappings: Vec::new(),
1231             acpi_address,
1232             log_dirty: dynamic, // Cannot log dirty pages on a TD
1233             arch_mem_regions,
1234             ram_allocator,
1235             dynamic,
1236             #[cfg(target_arch = "aarch64")]
1237             uefi_flash: None,
1238             thp: config.thp,
1239         };
1240 
1241         #[cfg(target_arch = "aarch64")]
1242         {
1243             // For Aarch64 we cannot lazily allocate the address space like we
1244             // do for x86, because while restoring a VM from snapshot we would
1245             // need the address space to be allocated to properly restore VGIC.
1246             // And the restore of VGIC happens before we attempt to run the vCPUs
1247             // for the first time, thus we need to allocate the address space
1248             // beforehand.
1249             memory_manager.allocate_address_space()?;
1250             memory_manager.add_uefi_flash()?;
1251         }
1252 
1253         #[cfg(target_arch = "x86_64")]
1254         if let Some(sgx_epc_config) = sgx_epc_config {
1255             memory_manager.setup_sgx(sgx_epc_config)?;
1256         }
1257 
1258         Ok(Arc::new(Mutex::new(memory_manager)))
1259     }
1260 
1261     pub fn new_from_snapshot(
1262         snapshot: &Snapshot,
1263         vm: Arc<dyn hypervisor::Vm>,
1264         config: &MemoryConfig,
1265         source_url: Option<&str>,
1266         prefault: bool,
1267         phys_bits: u8,
1268     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1269         if let Some(source_url) = source_url {
1270             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1271             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1272 
1273             let mem_snapshot: MemoryManagerSnapshotData =
1274                 snapshot.to_state().map_err(Error::Restore)?;
1275 
1276             let mm = MemoryManager::new(
1277                 vm,
1278                 config,
1279                 Some(prefault),
1280                 phys_bits,
1281                 #[cfg(feature = "tdx")]
1282                 false,
1283                 Some(&mem_snapshot),
1284                 None,
1285                 #[cfg(target_arch = "x86_64")]
1286                 None,
1287             )?;
1288 
1289             mm.lock()
1290                 .unwrap()
1291                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1292 
1293             Ok(mm)
1294         } else {
1295             Err(Error::RestoreMissingSourceUrl)
1296         }
1297     }
1298 
1299     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1300         // SAFETY: FFI call with correct arguments
1301         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1302 
1303         if res < 0 {
1304             Err(io::Error::last_os_error())
1305         } else {
1306             Ok(res as RawFd)
1307         }
1308     }
1309 
1310     fn mbind(
1311         addr: *mut u8,
1312         len: u64,
1313         mode: u32,
1314         nodemask: Vec<u64>,
1315         maxnode: u64,
1316         flags: u32,
1317     ) -> Result<(), io::Error> {
1318         // SAFETY: FFI call with correct arguments
1319         let res = unsafe {
1320             libc::syscall(
1321                 libc::SYS_mbind,
1322                 addr as *mut libc::c_void,
1323                 len,
1324                 mode,
1325                 nodemask.as_ptr(),
1326                 maxnode,
1327                 flags,
1328             )
1329         };
1330 
1331         if res < 0 {
1332             Err(io::Error::last_os_error())
1333         } else {
1334             Ok(())
1335         }
1336     }
1337 
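    // Create an anonymous memfd of `size` bytes to back guest RAM, optionally backed by
    // hugepages of the requested size.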
1338     fn create_anonymous_file(
1339         size: usize,
1340         hugepages: bool,
1341         hugepage_size: Option<u64>,
1342     ) -> Result<FileOffset, Error> {
1343         let fd = Self::memfd_create(
1344             &ffi::CString::new("ch_ram").unwrap(),
1345             libc::MFD_CLOEXEC
1346                 | if hugepages {
1347                     libc::MFD_HUGETLB
1348                         | if let Some(hugepage_size) = hugepage_size {
1349                             /*
1350                              * From the Linux kernel:
1351                              * Several system calls take a flag to request "hugetlb" huge pages.
1352                              * Without further specification, these system calls will use the
1353                              * system's default huge page size.  If a system supports multiple
1354                              * huge page sizes, the desired huge page size can be specified in
1355                              * bits [26:31] of the flag arguments.  The value in these 6 bits
1356                              * will encode the log2 of the huge page size.
1357                              */
1358 
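                            // For example, with 2 MiB hugepages trailing_zeros() is 21,
                            // so 21 is encoded into bits [26:31] below.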
1359                             hugepage_size.trailing_zeros() << 26
1360                         } else {
1361                             // Use the system default huge page size
1362                             0
1363                         }
1364                 } else {
1365                     0
1366                 },
1367         )
1368         .map_err(Error::SharedFileCreate)?;
1369 
1370         // SAFETY: fd is valid
1371         let f = unsafe { File::from_raw_fd(fd) };
1372         f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1373 
1374         Ok(FileOffset::new(f, 0))
1375     }
1376 
1377     fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> {
1378         if backing_file.is_dir() {
1379             Err(Error::DirectoryAsBackingFileForMemory)
1380         } else {
1381             let f = OpenOptions::new()
1382                 .read(true)
1383                 .write(true)
1384                 .open(backing_file)
1385                 .map_err(Error::SharedFileCreate)?;
1386 
1387             Ok(FileOffset::new(f, file_offset))
1388         }
1389     }
1390 
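    // Create a single RAM region mapped at `start_addr`, backed by an existing memory file
    // (restore path), a user-provided backing file, or an anonymous mapping, then apply the
    // host NUMA policy and optionally prefault it.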
1391     #[allow(clippy::too_many_arguments)]
1392     pub fn create_ram_region(
1393         backing_file: &Option<PathBuf>,
1394         file_offset: u64,
1395         start_addr: GuestAddress,
1396         size: usize,
1397         prefault: bool,
1398         shared: bool,
1399         hugepages: bool,
1400         hugepage_size: Option<u64>,
1401         host_numa_node: Option<u32>,
1402         existing_memory_file: Option<File>,
1403         thp: bool,
1404     ) -> Result<Arc<GuestRegionMmap>, Error> {
1405         let mut mmap_flags = libc::MAP_NORESERVE;
1406 
1407         // The duplication of mmap_flags ORing here is unfortunate but it also makes
1408         // the complexity of the handling clear.
1409         let fo = if let Some(f) = existing_memory_file {
1410             // It must be MAP_SHARED, as otherwise we wouldn't already have an FD
1411             mmap_flags |= libc::MAP_SHARED;
1412             Some(FileOffset::new(f, file_offset))
1413         } else if let Some(backing_file) = backing_file {
1414             if shared {
1415                 mmap_flags |= libc::MAP_SHARED;
1416             } else {
1417                 mmap_flags |= libc::MAP_PRIVATE;
1418             }
1419             Some(Self::open_backing_file(backing_file, file_offset)?)
1420         } else if shared || hugepages {
1421             // For hugepages we must also MAP_SHARED otherwise we will trigger #4805
1422             // because the MAP_PRIVATE will trigger CoW against the backing file with
1423             // the VFIO pinning
1424             mmap_flags |= libc::MAP_SHARED;
1425             Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
1426         } else {
1427             mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
1428             None
1429         };
1430 
1431         let region = GuestRegionMmap::new(
1432             MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
1433                 .map_err(Error::GuestMemoryRegion)?,
1434             start_addr,
1435         )
1436         .map_err(Error::GuestMemory)?;
1437 
1438         // Apply NUMA policy if needed.
1439         if let Some(node) = host_numa_node {
1440             let addr = region.deref().as_ptr();
1441             let len = region.deref().size() as u64;
1442             let mode = MPOL_BIND;
1443             let mut nodemask: Vec<u64> = Vec::new();
1444             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1445 
1446             // Linux is kind of buggy in the way it interprets maxnode as it
1447             // will cut off the last node. That's why we have to add 1 to what
1448             // we would consider as the proper maxnode value.
1449             let maxnode = node as u64 + 1 + 1;
1450 
1451             // Allocate the right size for the vector.
1452             nodemask.resize((node as usize / 64) + 1, 0);
1453 
1454             // Fill the global bitmask through the nodemask vector.
1455             let idx = (node / 64) as usize;
1456             let shift = node % 64;
1457             nodemask[idx] |= 1u64 << shift;
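            // For example, binding to host node 3 gives nodemask = [0b1000] and
            // maxnode = 5 (node index + 1 for the node count, plus the extra 1
            // for the kernel quirk described above).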
1458 
1459             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1460             // force the kernel to move all pages that might have been already
1461             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1462             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1463             // MPOL_BIND is the selected mode as it specifies a strict policy
1464             // that restricts memory allocation to the nodes specified in the
1465             // nodemask.
1466             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1467                 .map_err(Error::ApplyNumaPolicy)?;
1468         }
1469 
1470         // Prefault the region if needed, in parallel.
1471         if prefault {
1472             let page_size =
1473                 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize;
1474 
1475             if !is_aligned(size, page_size) {
1476                 warn!(
1477                     "Prefaulting memory size {} misaligned with page size {}",
1478                     size, page_size
1479                 );
1480             }
1481 
1482             let num_pages = size / page_size;
1483 
1484             let num_threads = Self::get_prefault_num_threads(page_size, num_pages);
1485 
1486             let pages_per_thread = num_pages / num_threads;
1487             let remainder = num_pages % num_threads;
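            // For example, prefaulting 10 pages with 4 threads gives
            // pages_per_thread = 2 and remainder = 2: threads 0 and 1 handle
            // 3 pages each, threads 2 and 3 handle 2 pages each.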
1488 
1489             let barrier = Arc::new(Barrier::new(num_threads));
1490             thread::scope(|s| {
1491                 let r = &region;
1492                 for i in 0..num_threads {
1493                     let barrier = Arc::clone(&barrier);
1494                     s.spawn(move || {
1495                         // Wait until all threads have been spawned to avoid contention
1496                         // over mmap_sem between thread stack allocation and page faulting.
1497                         barrier.wait();
1498                         let pages = pages_per_thread + if i < remainder { 1 } else { 0 };
1499                         let offset =
1500                             page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
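                        // MADV_POPULATE_WRITE (Linux 5.14+) prefaults the range
                        // as if it had been written to, without changing its
                        // contents.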
1501                         // SAFETY: FFI call with correct arguments
1502                         let ret = unsafe {
1503                             let addr = r.as_ptr().add(offset);
1504                             libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE)
1505                         };
1506                         if ret != 0 {
1507                             let e = io::Error::last_os_error();
1508                             warn!("Failed to prefault pages: {}", e);
1509                         }
1510                     });
1511                 }
1512             });
1513         }
1514 
1515         if region.file_offset().is_none() && thp {
1516             info!(
1517                 "Anonymous mapping at 0x{:x} (size = 0x{:x})",
1518                 region.as_ptr() as u64,
1519                 size
1520             );
1521             // SAFETY: FFI call with correct arguments
1522             let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
1523             if ret != 0 {
1524                 let e = io::Error::last_os_error();
1525                 warn!("Failed to mark pages as THP eligible: {}", e);
1526             }
1527         }
1528 
1529         Ok(Arc::new(region))
1530     }
1531 
1532     // Duplicate of `memory_zone_get_align_size` that does not require a `zone`
1533     fn get_prefault_align_size(
1534         backing_file: &Option<PathBuf>,
1535         hugepages: bool,
1536         hugepage_size: Option<u64>,
1537     ) -> Result<u64, Error> {
1538         // SAFETY: FFI call. Trivially safe.
1539         let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
1540         match (hugepages, hugepage_size, backing_file) {
1541             (false, _, _) => Ok(page_size),
1542             (true, Some(hugepage_size), _) => Ok(hugepage_size),
1543             (true, None, _) => {
1544                 // There are two scenarios here:
1545                 //  - `hugepages` is enabled but `hugepage_size` is not specified:
1546                 //     Call `statfs` on `/dev/hugepages` to get the default hugepage size.
1547                 //  - The backing file is specified:
1548                 //     Call `statfs` on the file and get its `f_bsize`. If the value is larger than the
1549                 //     normal page size, use `f_bsize` because the file lives on a hugetlbfs. If the
1550                 //     value is less than or equal to the page size, just use the page size.
1551                 let path = backing_file
1552                     .as_ref()
1553                     .map_or(Ok("/dev/hugepages"), |pathbuf| {
1554                         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
1555                     })?;
1556                 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
1557                 Ok(align_size)
1558             }
1559         }
1560     }
1561 
1562     fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize {
1563         let mut n: usize = 1;
1564 
1565         // Do not create more threads than processors available.
1566         // SAFETY: FFI call. Trivially safe.
1567         let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) };
1568         if procs > 0 {
1569             n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT);
1570         }
1571 
1572         // Do not create more threads than pages being allocated.
1573         n = std::cmp::min(n, num_pages);
1574 
1575         // Do not create threads to allocate less than 64 MiB of memory.
1576         n = std::cmp::min(
1577             n,
1578                 std::cmp::max(1, page_size * num_pages / (64 * (1 << 20))),
1579         );
1580 
1581         n
1582     }
1583 
1584     // Update the GuestMemoryMmap with the new range
1585     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1586         let guest_memory = self
1587             .guest_memory
1588             .memory()
1589             .insert_region(region)
1590             .map_err(Error::GuestMemory)?;
1591         self.guest_memory.lock().unwrap().replace(guest_memory);
1592 
1593         Ok(())
1594     }
1595 
1596     //
1597     // Calculate the start address of an area next to RAM.
1598     //
1599     // If memory hotplug is allowed, the start address needs to be aligned
1600     // (rounded-up) to 128MiB boundary.
1601     // If memory hotplug is not allowed, there is no alignment required.
1602     // If RAM ends below the 32-bit reserved area, the area starts at the 64-bit RAM start instead.
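    // For example, with hotplug allowed and RAM ending at 0x1_1234_5678 (above
    // the 32-bit reserved area), ORing with 0x7FF_FFFF and adding 1 yields
    // 0x1_1800_0000, the next 128 MiB boundary.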
1603     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
1604         let mut start_addr = if allow_mem_hotplug {
1605             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1606         } else {
1607             mem_end
1608         };
1609 
1610         start_addr = start_addr
1611             .checked_add(1)
1612             .ok_or(Error::GuestAddressOverFlow)?;
1613 
1614         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1615             return Ok(arch::layout::RAM_64BIT_START);
1616         }
1617 
1618         Ok(start_addr)
1619     }
1620 
1621     pub fn add_ram_region(
1622         &mut self,
1623         start_addr: GuestAddress,
1624         size: usize,
1625     ) -> Result<Arc<GuestRegionMmap>, Error> {
1626         // Allocate memory for the region
1627         let region = MemoryManager::create_ram_region(
1628             &None,
1629             0,
1630             start_addr,
1631             size,
1632             self.prefault,
1633             self.shared,
1634             self.hugepages,
1635             self.hugepage_size,
1636             None,
1637             None,
1638             self.thp,
1639         )?;
1640 
1641         // Map it into the guest
1642         let slot = self.create_userspace_mapping(
1643             region.start_addr().0,
1644             region.len(),
1645             region.as_ptr() as u64,
1646             self.mergeable,
1647             false,
1648             self.log_dirty,
1649         )?;
1650         self.guest_ram_mappings.push(GuestRamMapping {
1651             gpa: region.start_addr().raw_value(),
1652             size: region.len(),
1653             slot,
1654             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1655             virtio_mem: false,
1656             file_offset: 0,
1657         });
1658 
1659         self.add_region(Arc::clone(&region))?;
1660 
1661         Ok(region)
1662     }
1663 
1664     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1665         info!("Hotplugging new RAM: {}", size);
1666 
1667         // Check that there is a free slot
1668         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1669             return Err(Error::NoSlotAvailable);
1670         }
1671 
1672         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1673         if size % (128 << 20) != 0 {
1674             return Err(Error::InvalidSize);
1675         }
1676 
1677         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1678 
1679         if start_addr
1680             .checked_add((size - 1).try_into().unwrap())
1681             .unwrap()
1682             > self.end_of_ram_area
1683         {
1684             return Err(Error::InsufficientHotplugRam);
1685         }
1686 
1687         let region = self.add_ram_region(start_addr, size)?;
1688 
1689         // Add region to the list of regions associated with the default
1690         // memory zone.
1691         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1692             memory_zone.regions.push(Arc::clone(&region));
1693         }
1694 
1695         // Tell the allocator
1696         self.ram_allocator
1697             .allocate(Some(start_addr), size as GuestUsize, None)
1698             .ok_or(Error::MemoryRangeAllocation)?;
1699 
1700         // Update the slot so that it can be queried via the I/O port
1701         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1702         slot.active = true;
1703         slot.inserting = true;
1704         slot.base = region.start_addr().0;
1705         slot.length = region.len();
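        // With `inserting` set, the guest's MSCN method (defined in the AML
        // below) will observe the MINS bit for this slot and send a
        // device-check Notify to the matching memory device.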
1706 
1707         self.next_hotplug_slot += 1;
1708 
1709         Ok(region)
1710     }
1711 
1712     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1713         self.guest_memory.clone()
1714     }
1715 
1716     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1717         self.boot_guest_memory.clone()
1718     }
1719 
1720     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1721         self.allocator.clone()
1722     }
1723 
1724     pub fn start_of_device_area(&self) -> GuestAddress {
1725         self.start_of_device_area
1726     }
1727 
1728     pub fn end_of_device_area(&self) -> GuestAddress {
1729         self.end_of_device_area
1730     }
1731 
1732     pub fn allocate_memory_slot(&mut self) -> u32 {
1733         let slot_id = self.next_memory_slot;
1734         self.next_memory_slot += 1;
1735         slot_id
1736     }
1737 
1738     pub fn create_userspace_mapping(
1739         &mut self,
1740         guest_phys_addr: u64,
1741         memory_size: u64,
1742         userspace_addr: u64,
1743         mergeable: bool,
1744         readonly: bool,
1745         log_dirty: bool,
1746     ) -> Result<u32, Error> {
1747         let slot = self.allocate_memory_slot();
1748         let mem_region = self.vm.make_user_memory_region(
1749             slot,
1750             guest_phys_addr,
1751             memory_size,
1752             userspace_addr,
1753             readonly,
1754             log_dirty,
1755         );
1756 
1757         info!(
1758             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1759             guest_phys_addr, userspace_addr, memory_size, slot
1760         );
1761 
1762         self.vm
1763             .create_user_memory_region(mem_region)
1764             .map_err(Error::CreateUserMemoryRegion)?;
1765 
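        // Exclude the guest RAM mapping from core dumps of the VMM process so
        // that host core dumps stay a manageable size.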
1766         // SAFETY: the address and size are valid since the
1767         // mmap succeeded.
1768         let ret = unsafe {
1769             libc::madvise(
1770                 userspace_addr as *mut libc::c_void,
1771                 memory_size as libc::size_t,
1772                 libc::MADV_DONTDUMP,
1773             )
1774         };
1775         if ret != 0 {
1776             let e = io::Error::last_os_error();
1777             warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e);
1778         }
1779 
1780         // Mark the pages as mergeable if explicitly asked for.
1781         if mergeable {
1782             // SAFETY: the address and size are valid since the
1783             // mmap succeeded.
1784             let ret = unsafe {
1785                 libc::madvise(
1786                     userspace_addr as *mut libc::c_void,
1787                     memory_size as libc::size_t,
1788                     libc::MADV_MERGEABLE,
1789                 )
1790             };
1791             if ret != 0 {
1792                 let err = io::Error::last_os_error();
1793                 // Safe to unwrap because the error is constructed with
1794                 // last_os_error(), which ensures the output will be Some().
1795                 let errno = err.raw_os_error().unwrap();
1796                 if errno == libc::EINVAL {
1797                     warn!("kernel not configured with CONFIG_KSM");
1798                 } else {
1799                     warn!("madvise error: {}", err);
1800                 }
1801                 warn!("failed to mark pages as mergeable");
1802             }
1803         }
1804 
1805         info!(
1806             "Created userspace mapping: {:x} -> {:x} {:x}",
1807             guest_phys_addr, userspace_addr, memory_size
1808         );
1809 
1810         Ok(slot)
1811     }
1812 
1813     pub fn remove_userspace_mapping(
1814         &mut self,
1815         guest_phys_addr: u64,
1816         memory_size: u64,
1817         userspace_addr: u64,
1818         mergeable: bool,
1819         slot: u32,
1820     ) -> Result<(), Error> {
1821         let mem_region = self.vm.make_user_memory_region(
1822             slot,
1823             guest_phys_addr,
1824             memory_size,
1825             userspace_addr,
1826             false, /* readonly -- don't care */
1827             false, /* log dirty */
1828         );
1829 
1830         self.vm
1831             .remove_user_memory_region(mem_region)
1832             .map_err(Error::RemoveUserMemoryRegion)?;
1833 
1834         // Mark the pages as unmergeable if they were previously marked as
1835         // mergeable.
1836         if mergeable {
1837             // SAFETY: the address and size are valid as the region was
1838             // previously advised.
1839             let ret = unsafe {
1840                 libc::madvise(
1841                     userspace_addr as *mut libc::c_void,
1842                     memory_size as libc::size_t,
1843                     libc::MADV_UNMERGEABLE,
1844                 )
1845             };
1846             if ret != 0 {
1847                 let err = io::Error::last_os_error();
1848                 // Safe to unwrap because the error is constructed with
1849                 // last_os_error(), which ensures the output will be Some().
1850                 let errno = err.raw_os_error().unwrap();
1851                 if errno == libc::EINVAL {
1852                     warn!("kernel not configured with CONFIG_KSM");
1853                 } else {
1854                     warn!("madvise error: {}", err);
1855                 }
1856                 warn!("failed to mark pages as unmergeable");
1857             }
1858         }
1859 
1860         info!(
1861             "Removed userspace mapping: {:x} -> {:x} {:x}",
1862             guest_phys_addr, userspace_addr, memory_size
1863         );
1864 
1865         Ok(())
1866     }
1867 
1868     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1869         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1870             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1871                 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1872                     virtio_mem_device
1873                         .lock()
1874                         .unwrap()
1875                         .resize(size)
1876                         .map_err(Error::VirtioMemResizeFail)?;
1877                 }
1878 
1879                 // Keep the hotplugged_size up to date.
1880                 virtio_mem_zone.hotplugged_size = size;
1881             } else {
1882                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1883                 return Err(Error::MissingVirtioMemHandler);
1884             }
1885 
1886             return Ok(());
1887         }
1888 
1889         error!("Failed resizing virtio-mem region: Unknown memory zone");
1890         Err(Error::UnknownMemoryZone)
1891     }
1892 
1893     /// In case this function resulted in adding a new memory region to the
1894     /// guest memory, the new region is returned to the caller. The virtio-mem
1895     /// use case never adds a new region as the whole hotpluggable memory has
1896     /// already been allocated at boot time.
1897     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1898         if self.user_provided_zones {
1899             error!(
1900                 "Not allowed to resize guest memory when backed with user \
1901                 defined memory zones."
1902             );
1903             return Err(Error::InvalidResizeWithMemoryZones);
1904         }
1905 
1906         let mut region: Option<Arc<GuestRegionMmap>> = None;
1907         match self.hotplug_method {
1908             HotplugMethod::VirtioMem => {
1909                 if desired_ram >= self.boot_ram {
1910                     if !self.dynamic {
1911                         return Ok(region);
1912                     }
1913 
1914                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1915                     self.current_ram = desired_ram;
1916                 }
1917             }
1918             HotplugMethod::Acpi => {
1919                 if desired_ram > self.current_ram {
1920                     if !self.dynamic {
1921                         return Ok(region);
1922                     }
1923 
1924                     region =
1925                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1926                     self.current_ram = desired_ram;
1927                 }
1928             }
1929         }
1930         Ok(region)
1931     }
1932 
1933     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1934         if !self.user_provided_zones {
1935             error!(
1936                 "Not allowed to resize guest memory zone when no zone is \
1937                 defined."
1938             );
1939             return Err(Error::ResizeZone);
1940         }
1941 
1942         self.virtio_mem_resize(id, virtio_mem_size)
1943     }
1944 
1945     #[cfg(target_arch = "x86_64")]
1946     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1947         let file = OpenOptions::new()
1948             .read(true)
1949             .open("/dev/sgx_provision")
1950             .map_err(Error::SgxProvisionOpen)?;
1951         self.vm
1952             .enable_sgx_attribute(file)
1953             .map_err(Error::SgxEnableProvisioning)?;
1954 
1955         // Go over each EPC section and verify its size is a 4k multiple. At
1956         // the same time, calculate the total size needed for the contiguous
1957         // EPC region.
1958         let mut epc_region_size = 0;
1959         for epc_section in sgx_epc_config.iter() {
1960             if epc_section.size == 0 {
1961                 return Err(Error::EpcSectionSizeInvalid);
1962             }
1963             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1964                 return Err(Error::EpcSectionSizeInvalid);
1965             }
1966 
1967             epc_region_size += epc_section.size;
1968         }
1969 
1970         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1971         let epc_region_start = GuestAddress(
1972             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1973         );
1974 
1975         self.start_of_device_area = epc_region_start
1976             .checked_add(epc_region_size)
1977             .ok_or(Error::GuestAddressOverFlow)?;
1978 
1979         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1980         info!(
1981             "SGX EPC region: 0x{:x} (0x{:x})",
1982             epc_region_start.0, epc_region_size
1983         );
1984 
1985         // Each section can be memory mapped into the allocated region.
1986         let mut epc_section_start = epc_region_start.raw_value();
1987         for epc_section in sgx_epc_config.iter() {
1988             let file = OpenOptions::new()
1989                 .read(true)
1990                 .write(true)
1991                 .open("/dev/sgx_vepc")
1992                 .map_err(Error::SgxVirtEpcOpen)?;
1993 
1994             let prot = PROT_READ | PROT_WRITE;
1995             let mut flags = MAP_NORESERVE | MAP_SHARED;
1996             if epc_section.prefault {
1997                 flags |= MAP_POPULATE;
1998             }
1999 
2000             // We can't use the vm-memory crate to perform the memory mapping
2001             // here as it would try to ensure the size of the backing file is
2002             // matching the size of the expected mapping. The /dev/sgx_vepc
2003             // device does not work that way, it provides a file descriptor
2004             // which does not match the mapping size, as it's just a way to
2005             // let KVM know that an EPC section is being created for the guest.
2006             // SAFETY: FFI call with correct arguments
2007             let host_addr = unsafe {
2008                 libc::mmap(
2009                     std::ptr::null_mut(),
2010                     epc_section.size as usize,
2011                     prot,
2012                     flags,
2013                     file.as_raw_fd(),
2014                     0,
2015                 )
2016             } as u64;
2017 
2018             info!(
2019                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
2020                 epc_section_start, epc_section.size
2021             );
2022 
2023             let _mem_slot = self.create_userspace_mapping(
2024                 epc_section_start,
2025                 epc_section.size,
2026                 host_addr,
2027                 false,
2028                 false,
2029                 false,
2030             )?;
2031 
2032             sgx_epc_region.insert(
2033                 epc_section.id.clone(),
2034                 SgxEpcSection::new(
2035                     GuestAddress(epc_section_start),
2036                     epc_section.size as GuestUsize,
2037                 ),
2038             );
2039 
2040             epc_section_start += epc_section.size;
2041         }
2042 
2043         self.sgx_epc_region = Some(sgx_epc_region);
2044 
2045         Ok(())
2046     }
2047 
2048     #[cfg(target_arch = "x86_64")]
2049     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
2050         &self.sgx_epc_region
2051     }
2052 
2053     pub fn is_hardlink(f: &File) -> bool {
2054         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
2055         // SAFETY: FFI call with correct arguments
2056         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
2057         if ret != 0 {
2058             error!("Couldn't fstat the backing file");
2059             return false;
2060         }
2061 
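        // A positive link count means the file is reachable through a path on
        // the host filesystem; anonymous memfd-backed files report
        // st_nlink == 0 and therefore always have their content copied on
        // snapshot.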
2062         // SAFETY: stat is valid
2063         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
2064     }
2065 
2066     pub fn memory_zones(&self) -> &MemoryZones {
2067         &self.memory_zones
2068     }
2069 
2070     pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
2071         &mut self.memory_zones
2072     }
2073 
2074     pub fn memory_range_table(
2075         &self,
2076         snapshot: bool,
2077     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
2078         let mut table = MemoryRangeTable::default();
2079 
2080         for memory_zone in self.memory_zones.values() {
2081             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
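                // Only the blocks the guest has actually plugged through
                // virtio-mem need to be transferred.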
2082                 table.extend(virtio_mem_zone.plugged_ranges());
2083             }
2084 
2085             for region in memory_zone.regions() {
2086                 if snapshot {
2087                     if let Some(file_offset) = region.file_offset() {
2088                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
2089                             && Self::is_hardlink(file_offset.file())
2090                         {
2091                             // In this very specific case, we know the memory
2092                             // region is backed by a file on the host filesystem
2093                             // that can be accessed by the user, and additionally
2094                             // the mapping is shared, which means that modifications
2095                             // to the content are written to the actual file.
2096                             // When meeting these conditions, we can skip the
2097                             // copy of the memory content for this specific region,
2098                             // as we can assume the user will have it saved through
2099                             // the backing file already.
2100                             continue;
2101                         }
2102                     }
2103                 }
2104 
2105                 table.push(MemoryRange {
2106                     gpa: region.start_addr().raw_value(),
2107                     length: region.len(),
2108                 });
2109             }
2110         }
2111 
2112         Ok(table)
2113     }
2114 
2115     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
2116         MemoryManagerSnapshotData {
2117             memory_ranges: self.snapshot_memory_ranges.clone(),
2118             guest_ram_mappings: self.guest_ram_mappings.clone(),
2119             start_of_device_area: self.start_of_device_area.0,
2120             boot_ram: self.boot_ram,
2121             current_ram: self.current_ram,
2122             arch_mem_regions: self.arch_mem_regions.clone(),
2123             hotplug_slots: self.hotplug_slots.clone(),
2124             next_memory_slot: self.next_memory_slot,
2125             selected_slot: self.selected_slot,
2126             next_hotplug_slot: self.next_hotplug_slot,
2127         }
2128     }
2129 
2130     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
2131         let mut memory_slot_fds = HashMap::new();
2132         for guest_ram_mapping in &self.guest_ram_mappings {
2133             let slot = guest_ram_mapping.slot;
2134             let guest_memory = self.guest_memory.memory();
2135             let file = guest_memory
2136                 .find_region(GuestAddress(guest_ram_mapping.gpa))
2137                 .unwrap()
2138                 .file_offset()
2139                 .unwrap()
2140                 .file();
2141             memory_slot_fds.insert(slot, file.as_raw_fd());
2142         }
2143         memory_slot_fds
2144     }
2145 
2146     pub fn acpi_address(&self) -> Option<GuestAddress> {
2147         self.acpi_address
2148     }
2149 
2150     pub fn num_guest_ram_mappings(&self) -> u32 {
2151         self.guest_ram_mappings.len() as u32
2152     }
2153 
2154     #[cfg(target_arch = "aarch64")]
2155     pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
2156         self.uefi_flash.as_ref().unwrap().clone()
2157     }
2158 
2159     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2160     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
2161         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
2162         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
2163 
2164         let mut mem_offset_in_elf = mem_offset;
2165         let mut ram_maps = BTreeMap::new();
2166         for mapping in mapping_sorted_by_gpa.iter() {
2167             ram_maps.insert(
2168                 mapping.gpa,
2169                 CoredumpMemoryRegion {
2170                     mem_offset_in_elf,
2171                     mem_size: mapping.size,
2172                 },
2173             );
2174             mem_offset_in_elf += mapping.size;
2175         }
2176 
2177         CoredumpMemoryRegions { ram_maps }
2178     }
2179 
2180     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2181     pub fn coredump_iterate_save_mem(
2182         &mut self,
2183         dump_state: &DumpState,
2184     ) -> std::result::Result<(), GuestDebuggableError> {
2185         let snapshot_memory_ranges = self
2186             .memory_range_table(false)
2187             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2188 
2189         if snapshot_memory_ranges.is_empty() {
2190             return Ok(());
2191         }
2192 
2193         let coredump_file = dump_state.file.as_ref().unwrap();
2194 
2195         let guest_memory = self.guest_memory.memory();
2196         let mut total_bytes: u64 = 0;
2197 
2198         for range in snapshot_memory_ranges.regions() {
2199             let mut offset: u64 = 0;
2200             loop {
2201                 let bytes_written = guest_memory
2202                     .write_volatile_to(
2203                         GuestAddress(range.gpa + offset),
2204                         &mut coredump_file.as_fd(),
2205                         (range.length - offset) as usize,
2206                     )
2207                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2208                 offset += bytes_written as u64;
2209                 total_bytes += bytes_written as u64;
2210 
2211                 if offset == range.length {
2212                     break;
2213                 }
2214             }
2215         }
2216 
2217         debug!("coredump total bytes {}", total_bytes);
2218         Ok(())
2219     }
2220 
2221     pub fn receive_memory_regions<F>(
2222         &mut self,
2223         ranges: &MemoryRangeTable,
2224         fd: &mut F,
2225     ) -> std::result::Result<(), MigratableError>
2226     where
2227         F: ReadVolatile,
2228     {
2229         let guest_memory = self.guest_memory();
2230         let mem = guest_memory.memory();
2231 
2232         for range in ranges.regions() {
2233             let mut offset: u64 = 0;
2234             // Here we are manually handling the retry in case we can't read the
2235             // whole region at once because we can't use the implementation
2236             // from vm-memory::GuestMemory of read_exact_from() as it is not
2237             // following the correct behavior. For more info about this issue
2238             // see: https://github.com/rust-vmm/vm-memory/issues/174
2239             loop {
2240                 let bytes_read = mem
2241                     .read_volatile_from(
2242                         GuestAddress(range.gpa + offset),
2243                         fd,
2244                         (range.length - offset) as usize,
2245                     )
2246                     .map_err(|e| {
2247                         MigratableError::MigrateReceive(anyhow!(
2248                             "Error receiving memory from socket: {}",
2249                             e
2250                         ))
2251                     })?;
2252                 offset += bytes_read as u64;
2253 
2254                 if offset == range.length {
2255                     break;
2256                 }
2257             }
2258         }
2259 
2260         Ok(())
2261     }
2262 }
2263 
2264 struct MemoryNotify {
2265     slot_id: usize,
2266 }
2267 
2268 impl Aml for MemoryNotify {
2269     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2270         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2271         aml::If::new(
2272             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2273             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2274         )
2275         .to_aml_bytes(sink)
2276     }
2277 }
2278 
2279 struct MemorySlot {
2280     slot_id: usize,
2281 }
2282 
2283 impl Aml for MemorySlot {
2284     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2285         aml::Device::new(
2286             format!("M{:03}", self.slot_id).as_str().into(),
2287             vec![
2288                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
2289                 &aml::Name::new("_UID".into(), &self.slot_id),
2290                 /*
2291                 _STA return value:
2292                 Bit [0] – Set if the device is present.
2293                 Bit [1] – Set if the device is enabled and decoding its resources.
2294                 Bit [2] – Set if the device should be shown in the UI.
2295                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2296                 Bit [4] – Set if the battery is present.
2297                 Bits [31:5] – Reserved (must be cleared).
2298                 */
2299                 &aml::Method::new(
2300                     "_STA".into(),
2301                     0,
2302                     false,
2303                     // Call into MSTA method which will interrogate device
2304                     vec![&aml::Return::new(&aml::MethodCall::new(
2305                         "MSTA".into(),
2306                         vec![&self.slot_id],
2307                     ))],
2308                 ),
2309                 // Get details of memory
2310                 &aml::Method::new(
2311                     "_CRS".into(),
2312                     0,
2313                     false,
2314                     // Call into MCRS which provides actual memory details
2315                     vec![&aml::Return::new(&aml::MethodCall::new(
2316                         "MCRS".into(),
2317                         vec![&self.slot_id],
2318                     ))],
2319                 ),
2320             ],
2321         )
2322         .to_aml_bytes(sink)
2323     }
2324 }
2325 
2326 struct MemorySlots {
2327     slots: usize,
2328 }
2329 
2330 impl Aml for MemorySlots {
2331     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2332         for slot_id in 0..self.slots {
2333             MemorySlot { slot_id }.to_aml_bytes(sink);
2334         }
2335     }
2336 }
2337 
2338 struct MemoryMethods {
2339     slots: usize,
2340 }
2341 
2342 impl Aml for MemoryMethods {
2343     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2344         // Add "MTFY" notification method
2345         let mut memory_notifies = Vec::new();
2346         for slot_id in 0..self.slots {
2347             memory_notifies.push(MemoryNotify { slot_id });
2348         }
2349 
2350         let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
2351         for memory_notifier in memory_notifies.iter() {
2352             memory_notifies_refs.push(memory_notifier);
2353         }
2354 
2355         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);
2356 
2357         // MSCN method
2358         aml::Method::new(
2359             "MSCN".into(),
2360             0,
2361             true,
2362             vec![
2363                 // Take lock defined above
2364                 &aml::Acquire::new("MLCK".into(), 0xffff),
2365                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2366                 &aml::While::new(
2367                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2368                     vec![
2369                         // Write slot number (in first argument) to I/O port via field
2370                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2371                         // Check if MINS bit is set (inserting)
2372                         &aml::If::new(
2373                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2374                             // Notify device if it is
2375                             vec![
2376                                 &aml::MethodCall::new(
2377                                     "MTFY".into(),
2378                                     vec![&aml::Local(0), &aml::ONE],
2379                                 ),
2380                                 // Reset MINS bit
2381                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2382                             ],
2383                         ),
2384                         // Check if MRMV bit is set
2385                         &aml::If::new(
2386                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2387                             // Notify device if it is (with the eject constant 0x3)
2388                             vec![
2389                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2390                                 // Reset MRMV bit
2391                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2392                             ],
2393                         ),
2394                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2395                     ],
2396                 ),
2397                 // Release lock
2398                 &aml::Release::new("MLCK".into()),
2399             ],
2400         )
2401         .to_aml_bytes(sink);
2402 
2403         // Memory status method
2404         aml::Method::new(
2405             "MSTA".into(),
2406             1,
2407             true,
2408             vec![
2409                 // Take lock defined above
2410                 &aml::Acquire::new("MLCK".into(), 0xffff),
2411                 // Write slot number (in first argument) to I/O port via field
2412                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2413                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2414                 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2415                 &aml::If::new(
2416                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2417                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2418                 ),
2419                 // Release lock
2420                 &aml::Release::new("MLCK".into()),
2421                 // Return 0 or 0xf
2422                 &aml::Return::new(&aml::Local(0)),
2423             ],
2424         )
2425         .to_aml_bytes(sink);
2426 
2427         // Memory range method
2428         aml::Method::new(
2429             "MCRS".into(),
2430             1,
2431             true,
2432             vec![
2433                 // Take lock defined above
2434                 &aml::Acquire::new("MLCK".into(), 0xffff),
2435                 // Write slot number (in first argument) to I/O port via field
2436                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2437                 &aml::Name::new(
2438                     "MR64".into(),
2439                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2440                         aml::AddressSpaceCacheable::Cacheable,
2441                         true,
2442                         0x0000_0000_0000_0000u64,
2443                         0xFFFF_FFFF_FFFF_FFFEu64,
2444                         None,
2445                     )]),
2446                 ),
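                // The byte offsets below index the QWORD Address Space
                // Descriptor held in MR64: _MIN occupies bytes 14-21, _MAX
                // bytes 22-29 and _LEN bytes 38-45. Each gets a QWORD field
                // plus an overlapping high DWORD field so the 32-bit MHPC
                // registers can be stored into both halves.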
2447                 &aml::CreateQWordField::new(
2448                     &aml::Path::new("MINL"),
2449                     &aml::Path::new("MR64"),
2450                     &14usize,
2451                 ),
2452                 &aml::CreateDWordField::new(
2453                     &aml::Path::new("MINH"),
2454                     &aml::Path::new("MR64"),
2455                     &18usize,
2456                 ),
2457                 &aml::CreateQWordField::new(
2458                     &aml::Path::new("MAXL"),
2459                     &aml::Path::new("MR64"),
2460                     &22usize,
2461                 ),
2462                 &aml::CreateDWordField::new(
2463                     &aml::Path::new("MAXH"),
2464                     &aml::Path::new("MR64"),
2465                     &26usize,
2466                 ),
2467                 &aml::CreateQWordField::new(
2468                     &aml::Path::new("LENL"),
2469                     &aml::Path::new("MR64"),
2470                     &38usize,
2471                 ),
2472                 &aml::CreateDWordField::new(
2473                     &aml::Path::new("LENH"),
2474                     &aml::Path::new("MR64"),
2475                     &42usize,
2476                 ),
2477                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2478                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2479                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2480                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
2481                 &aml::Add::new(
2482                     &aml::Path::new("MAXL"),
2483                     &aml::Path::new("MINL"),
2484                     &aml::Path::new("LENL"),
2485                 ),
2486                 &aml::Add::new(
2487                     &aml::Path::new("MAXH"),
2488                     &aml::Path::new("MINH"),
2489                     &aml::Path::new("LENH"),
2490                 ),
2491                 &aml::If::new(
2492                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2493                     vec![&aml::Add::new(
2494                         &aml::Path::new("MAXH"),
2495                         &aml::ONE,
2496                         &aml::Path::new("MAXH"),
2497                     )],
2498                 ),
2499                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2500                 // Release lock
2501                 &aml::Release::new("MLCK".into()),
2502                 &aml::Return::new(&aml::Path::new("MR64")),
2503             ],
2504         )
2505         .to_aml_bytes(sink)
2506     }
2507 }
2508 
2509 impl Aml for MemoryManager {
2510     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2511         if let Some(acpi_address) = self.acpi_address {
2512             // Memory Hotplug Controller
2513             aml::Device::new(
2514                 "_SB_.MHPC".into(),
2515                 vec![
2516                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2517                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2518                     // Mutex to protect concurrent access as we write to choose slot and then read back status
2519                     &aml::Mutex::new("MLCK".into(), 0),
2520                     &aml::Name::new(
2521                         "_CRS".into(),
2522                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2523                             aml::AddressSpaceCacheable::NotCacheable,
2524                             true,
2525                             acpi_address.0,
2526                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2527                             None,
2528                         )]),
2529                     ),
2530                     // OpRegion and Fields map MMIO range into individual field values
2531                     &aml::OpRegion::new(
2532                         "MHPR".into(),
2533                         aml::OpRegionSpace::SystemMemory,
2534                         &(acpi_address.0 as usize),
2535                         &MEMORY_MANAGER_ACPI_SIZE,
2536                     ),
2537                     &aml::Field::new(
2538                         "MHPR".into(),
2539                         aml::FieldAccessType::DWord,
2540                         aml::FieldLockRule::NoLock,
2541                         aml::FieldUpdateRule::Preserve,
2542                         vec![
2543                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2544                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2545                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2546                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2547                         ],
2548                     ),
2549                     &aml::Field::new(
2550                         "MHPR".into(),
2551                         aml::FieldAccessType::DWord,
2552                         aml::FieldLockRule::NoLock,
2553                         aml::FieldUpdateRule::Preserve,
2554                         vec![
2555                             aml::FieldEntry::Reserved(128),
2556                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2557                         ],
2558                     ),
2559                     &aml::Field::new(
2560                         "MHPR".into(),
2561                         aml::FieldAccessType::Byte,
2562                         aml::FieldLockRule::NoLock,
2563                         aml::FieldUpdateRule::WriteAsZeroes,
2564                         vec![
2565                             aml::FieldEntry::Reserved(160),
2566                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2567                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2568                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2569                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2570                         ],
2571                     ),
2572                     &aml::Field::new(
2573                         "MHPR".into(),
2574                         aml::FieldAccessType::DWord,
2575                         aml::FieldLockRule::NoLock,
2576                         aml::FieldUpdateRule::Preserve,
2577                         vec![
2578                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2579                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2580                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2581                         ],
2582                     ),
2583                     &MemoryMethods {
2584                         slots: self.hotplug_slots.len(),
2585                     },
2586                     &MemorySlots {
2587                         slots: self.hotplug_slots.len(),
2588                     },
2589                 ],
2590             )
2591             .to_aml_bytes(sink);
2592         } else {
2593             aml::Device::new(
2594                 "_SB_.MHPC".into(),
2595                 vec![
2596                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2597                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2598                     // Empty MSCN for GED
2599                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2600                 ],
2601             )
2602             .to_aml_bytes(sink);
2603         }
2604 
2605         #[cfg(target_arch = "x86_64")]
2606         {
2607             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2608                 let min = sgx_epc_region.start().raw_value();
2609                 let max = min + sgx_epc_region.size() - 1;
2610                 // SGX EPC region
2611                 aml::Device::new(
2612                     "_SB_.EPC_".into(),
2613                     vec![
2614                         &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
2615                         // QWORD describing the EPC region start and size
2616                         &aml::Name::new(
2617                             "_CRS".into(),
2618                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2619                                 aml::AddressSpaceCacheable::NotCacheable,
2620                                 true,
2621                                 min,
2622                                 max,
2623                                 None,
2624                             )]),
2625                         ),
2626                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2627                     ],
2628                 )
2629                 .to_aml_bytes(sink);
2630             }
2631         }
2632     }
2633 }
2634 
2635 impl Pausable for MemoryManager {}
2636 
2637 #[derive(Clone, Serialize, Deserialize)]
2638 pub struct MemoryManagerSnapshotData {
2639     memory_ranges: MemoryRangeTable,
2640     guest_ram_mappings: Vec<GuestRamMapping>,
2641     start_of_device_area: u64,
2642     boot_ram: u64,
2643     current_ram: u64,
2644     arch_mem_regions: Vec<ArchMemRegion>,
2645     hotplug_slots: Vec<HotPlugState>,
2646     next_memory_slot: u32,
2647     selected_slot: usize,
2648     next_hotplug_slot: usize,
2649 }
2650 
2651 impl Snapshottable for MemoryManager {
2652     fn id(&self) -> String {
2653         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2654     }
2655 
2656     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2657         let memory_ranges = self.memory_range_table(true)?;
2658 
2659         // Store locally this list of ranges as it will be used through the
2660         // Transportable::send() implementation. The point is to avoid the
2661         // duplication of code regarding the creation of the path for each
2662         // region. The 'snapshot' step creates the list of memory regions,
2663         // including information about the need to copy a memory region or
2664         // not. This saves the 'send' step having to go through the same
2665         // process, and instead it can directly proceed with storing the
2666         // memory range content for the ranges requiring it.
2667         self.snapshot_memory_ranges = memory_ranges;
2668 
2669         Ok(Snapshot::from_data(SnapshotData::new_from_state(
2670             &self.snapshot_data(),
2671         )?))
2672     }
2673 }
2674 
2675 impl Transportable for MemoryManager {
2676     fn send(
2677         &self,
2678         _snapshot: &Snapshot,
2679         destination_url: &str,
2680     ) -> result::Result<(), MigratableError> {
2681         if self.snapshot_memory_ranges.is_empty() {
2682             return Ok(());
2683         }
2684 
2685         let mut memory_file_path = url_to_path(destination_url)?;
2686         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2687 
2688         // Create the snapshot file for the entire memory
2689         let mut memory_file = OpenOptions::new()
2690             .read(true)
2691             .write(true)
2692             .create_new(true)
2693             .open(memory_file_path)
2694             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2695 
2696         let guest_memory = self.guest_memory.memory();
2697 
2698         for range in self.snapshot_memory_ranges.regions() {
2699             let mut offset: u64 = 0;
2700             // Here we are manually handling the retry in case we can't write
2701             // the whole region at once because we can't use the implementation
2702             // from vm-memory::GuestMemory of write_all_to() as it is not
2703             // following the correct behavior. For more info about this issue
2704             // see: https://github.com/rust-vmm/vm-memory/issues/174
2705             loop {
2706                 let bytes_written = guest_memory
2707                     .write_volatile_to(
2708                         GuestAddress(range.gpa + offset),
2709                         &mut memory_file,
2710                         (range.length - offset) as usize,
2711                     )
2712                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2713                 offset += bytes_written as u64;
2714 
2715                 if offset == range.length {
2716                     break;
2717                 }
2718             }
2719         }
2720         Ok(())
2721     }
2722 }
2723 
2724 impl Migratable for MemoryManager {
2725     // Start the dirty log in the hypervisor (kvm/mshv).
2726     // Also, reset the dirty bitmap logged by the vmm.
2727     // Just before we do a bulk copy we want to start/clear the dirty log so that
2728     // pages touched during our bulk copy are tracked.
2729     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2730         self.vm.start_dirty_log().map_err(|e| {
2731             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2732         })?;
2733 
2734         for r in self.guest_memory.memory().iter() {
2735             r.bitmap().reset();
2736         }
2737 
2738         Ok(())
2739     }
2740 
2741     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2742         self.vm.stop_dirty_log().map_err(|e| {
2743             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2744         })?;
2745 
2746         Ok(())
2747     }
2748 
2749     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2750     // together in the table if they are contiguous.
2751     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2752         let mut table = MemoryRangeTable::default();
2753         for r in &self.guest_ram_mappings {
2754             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2755                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2756             })?;
2757             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2758             {
2759                 Some(region) => {
2760                     assert!(region.start_addr().raw_value() == r.gpa);
2761                     assert!(region.len() == r.size);
2762                     region.bitmap().get_and_reset()
2763                 }
2764                 None => {
2765                     return Err(MigratableError::MigrateSend(anyhow!(
2766                         "Error finding 'guest memory region' with address {:x}",
2767                         r.gpa
2768                     )))
2769                 }
2770             };
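            // Merge the two views: a page is considered dirty if either the
            // hypervisor log or the VMM's own bitmap flagged it.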
2771 
2772             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2773                 .iter()
2774                 .zip(vmm_dirty_bitmap.iter())
2775                 .map(|(x, y)| x | y)
2776                 .collect();
2777 
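            // Collapse the combined bitmap into contiguous dirty ranges at a
            // 4 KiB page granularity.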
2778             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2779 
2780             if sub_table.regions().is_empty() {
2781                 info!("Dirty Memory Range Table is empty");
2782             } else {
2783                 info!("Dirty Memory Range Table:");
2784                 for range in sub_table.regions() {
2785                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2786                 }
2787             }
2788 
2789             table.extend(sub_table);
2790         }
2791         Ok(table)
2792     }
2793 }
2794