xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision 61e57e1cb149de03ae1e0b799b9e5ba9a4a63ace)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
6 use std::collections::BTreeMap;
7 use std::collections::HashMap;
8 use std::fs::{File, OpenOptions};
9 use std::io::{self};
10 use std::ops::{BitAnd, Deref, Not, Sub};
11 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
12 use std::os::fd::AsFd;
13 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
14 use std::path::PathBuf;
15 use std::sync::{Arc, Barrier, Mutex};
16 use std::{ffi, result, thread};
17 
18 use acpi_tables::{aml, Aml};
19 use anyhow::anyhow;
20 #[cfg(target_arch = "x86_64")]
21 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
22 use arch::RegionType;
23 #[cfg(target_arch = "x86_64")]
24 use devices::ioapic;
25 #[cfg(target_arch = "aarch64")]
26 use hypervisor::HypervisorVmError;
27 use libc::_SC_NPROCESSORS_ONLN;
28 #[cfg(target_arch = "x86_64")]
29 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
30 use serde::{Deserialize, Serialize};
31 use tracer::trace_scoped;
32 use virtio_devices::BlocksState;
33 #[cfg(target_arch = "x86_64")]
34 use vm_allocator::GsiApic;
35 use vm_allocator::{AddressAllocator, SystemAllocator};
36 use vm_device::BusDevice;
37 use vm_memory::bitmap::AtomicBitmap;
38 use vm_memory::guest_memory::FileOffset;
39 use vm_memory::mmap::MmapRegionError;
40 use vm_memory::{
41     Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
42     GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile,
43 };
44 use vm_migration::protocol::{MemoryRange, MemoryRangeTable};
45 use vm_migration::{
46     Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, Transportable,
47 };
48 
49 #[cfg(target_arch = "x86_64")]
50 use crate::config::SgxEpcConfig;
51 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
52 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
53 use crate::coredump::{
54     CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
55 };
56 use crate::migration::url_to_path;
57 use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID};
58 
59 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
60 
61 const DEFAULT_MEMORY_ZONE: &str = "mem0";
62 
63 const SNAPSHOT_FILENAME: &str = "memory-ranges";
64 
65 #[cfg(target_arch = "x86_64")]
66 const X86_64_IRQ_BASE: u32 = 5;
67 
68 #[cfg(target_arch = "x86_64")]
69 const SGX_PAGE_SIZE: u64 = 1 << 12;
70 
71 const HOTPLUG_COUNT: usize = 8;
72 
73 // Memory policy constants
74 const MPOL_BIND: u32 = 2;
75 const MPOL_MF_STRICT: u32 = 1;
76 const MPOL_MF_MOVE: u32 = 1 << 1;
77 
78 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
79 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
80 
81 const MAX_PREFAULT_THREAD_COUNT: usize = 16;
82 
83 #[derive(Clone, Default, Serialize, Deserialize)]
84 struct HotPlugState {
85     base: u64,
86     length: u64,
87     active: bool,
88     inserting: bool,
89     removing: bool,
90 }
91 
92 pub struct VirtioMemZone {
93     region: Arc<GuestRegionMmap>,
94     virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
95     hotplugged_size: u64,
96     hugepages: bool,
97     blocks_state: Arc<Mutex<BlocksState>>,
98 }
99 
100 impl VirtioMemZone {
101     pub fn region(&self) -> &Arc<GuestRegionMmap> {
102         &self.region
103     }
104     pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
105         self.virtio_device = Some(virtio_device);
106     }
107     pub fn hotplugged_size(&self) -> u64 {
108         self.hotplugged_size
109     }
110     pub fn hugepages(&self) -> bool {
111         self.hugepages
112     }
113     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
114         &self.blocks_state
115     }
116     pub fn plugged_ranges(&self) -> MemoryRangeTable {
117         self.blocks_state
118             .lock()
119             .unwrap()
120             .memory_ranges(self.region.start_addr().raw_value(), true)
121     }
122 }
123 
124 #[derive(Default)]
125 pub struct MemoryZone {
126     regions: Vec<Arc<GuestRegionMmap>>,
127     virtio_mem_zone: Option<VirtioMemZone>,
128 }
129 
130 impl MemoryZone {
131     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
132         &self.regions
133     }
134     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
135         &self.virtio_mem_zone
136     }
137     pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
138         self.virtio_mem_zone.as_mut()
139     }
140 }
141 
142 pub type MemoryZones = HashMap<String, MemoryZone>;
143 
144 #[derive(Clone, Serialize, Deserialize)]
145 struct GuestRamMapping {
146     slot: u32,
147     gpa: u64,
148     size: u64,
149     zone_id: String,
150     virtio_mem: bool,
151     file_offset: u64,
152 }
153 
154 #[derive(Clone, Serialize, Deserialize)]
155 struct ArchMemRegion {
156     base: u64,
157     size: usize,
158     r_type: RegionType,
159 }
160 
161 pub struct MemoryManager {
162     boot_guest_memory: GuestMemoryMmap,
163     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
164     next_memory_slot: u32,
165     start_of_device_area: GuestAddress,
166     end_of_device_area: GuestAddress,
167     end_of_ram_area: GuestAddress,
168     pub vm: Arc<dyn hypervisor::Vm>,
169     hotplug_slots: Vec<HotPlugState>,
170     selected_slot: usize,
171     mergeable: bool,
172     allocator: Arc<Mutex<SystemAllocator>>,
173     hotplug_method: HotplugMethod,
174     boot_ram: u64,
175     current_ram: u64,
176     next_hotplug_slot: usize,
177     shared: bool,
178     hugepages: bool,
179     hugepage_size: Option<u64>,
180     prefault: bool,
181     thp: bool,
182     #[cfg(target_arch = "x86_64")]
183     sgx_epc_region: Option<SgxEpcRegion>,
184     user_provided_zones: bool,
185     snapshot_memory_ranges: MemoryRangeTable,
186     memory_zones: MemoryZones,
187     log_dirty: bool, // Enable dirty logging for created RAM regions
188     arch_mem_regions: Vec<ArchMemRegion>,
189     ram_allocator: AddressAllocator,
190     dynamic: bool,
191 
192     // Keep track of calls to create_userspace_mapping() for guest RAM.
193     // This is useful for getting the dirty pages as we need to know the
194     // slots that the mapping is created in.
195     guest_ram_mappings: Vec<GuestRamMapping>,
196 
197     pub acpi_address: Option<GuestAddress>,
198     #[cfg(target_arch = "aarch64")]
199     uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
200 }
201 
202 #[derive(Debug)]
203 pub enum Error {
204     /// Failed to create shared file.
205     SharedFileCreate(io::Error),
206 
207     /// Failed to set shared file length.
208     SharedFileSetLen(io::Error),
209 
210     /// Mmap backed guest memory error
211     GuestMemory(MmapError),
212 
213     /// Failed to allocate a memory range.
214     MemoryRangeAllocation,
215 
216     /// Error from region creation
217     GuestMemoryRegion(MmapRegionError),
218 
219     /// No ACPI slot available
220     NoSlotAvailable,
221 
222     /// Not enough space in the hotplug RAM region
223     InsufficientHotplugRam,
224 
225     /// The requested hotplug memory addition is not a valid size
226     InvalidSize,
227 
228     /// Failed to create the user memory region.
229     CreateUserMemoryRegion(hypervisor::HypervisorVmError),
230 
231     /// Failed to remove the user memory region.
232     RemoveUserMemoryRegion(hypervisor::HypervisorVmError),
233 
234     /// Failed to create the EventFd.
235     EventFdFail(io::Error),
236 
237     /// Eventfd write error
238     EventfdError(io::Error),
239 
240     /// Failed to resize virtio-mem.
241     VirtioMemResizeFail(virtio_devices::mem::Error),
242 
243     /// Cannot restore VM
244     Restore(MigratableError),
245 
246     /// Cannot restore VM because source URL is missing
247     RestoreMissingSourceUrl,
248 
249     /// Cannot create the system allocator
250     CreateSystemAllocator,
251 
252     /// Invalid SGX EPC section size
253     #[cfg(target_arch = "x86_64")]
254     EpcSectionSizeInvalid,
255 
256     /// Failed allocating SGX EPC region
257     #[cfg(target_arch = "x86_64")]
258     SgxEpcRangeAllocation,
259 
260     /// Failed opening SGX virtual EPC device
261     #[cfg(target_arch = "x86_64")]
262     SgxVirtEpcOpen(io::Error),
263 
264     /// Failed setting the SGX virtual EPC section size
265     #[cfg(target_arch = "x86_64")]
266     SgxVirtEpcFileSetLen(io::Error),
267 
268     /// Failed opening SGX provisioning device
269     #[cfg(target_arch = "x86_64")]
270     SgxProvisionOpen(io::Error),
271 
272     /// Failed enabling SGX provisioning
273     #[cfg(target_arch = "x86_64")]
274     SgxEnableProvisioning(hypervisor::HypervisorVmError),
275 
276     /// Failed creating a new MmapRegion instance.
277     #[cfg(target_arch = "x86_64")]
278     NewMmapRegion(vm_memory::mmap::MmapRegionError),
279 
280     /// No memory zones found.
281     MissingMemoryZones,
282 
283     /// Memory configuration is not valid.
284     InvalidMemoryParameters,
285 
286     /// Forbidden operation. Impossible to resize guest memory if it is
287     /// backed by user defined memory regions.
288     InvalidResizeWithMemoryZones,
289 
290     /// It's invalid to try applying a NUMA policy to a memory zone that is
291     /// memory mapped with MAP_SHARED.
292     InvalidSharedMemoryZoneWithHostNuma,
293 
294     /// Failed applying NUMA memory policy.
295     ApplyNumaPolicy(io::Error),
296 
297     /// Memory zone identifier is not unique.
298     DuplicateZoneId,
299 
300     /// No virtio-mem resizing handler found.
301     MissingVirtioMemHandler,
302 
303     /// Unknown memory zone.
304     UnknownMemoryZone,
305 
306     /// Invalid size for resizing. The size must be non-zero.
307     InvalidHotplugSize,
308 
309     /// Invalid hotplug method associated with memory zones resizing capability.
310     InvalidHotplugMethodWithMemoryZones,
311 
312     /// Could not find specified memory zone identifier from hash map.
313     MissingZoneIdentifier,
314 
315     /// Resizing the memory zone failed.
316     ResizeZone,
317 
318     /// Guest address overflow
319     GuestAddressOverFlow,
320 
321     /// Error opening snapshot file
322     SnapshotOpen(io::Error),
323 
324     /// Error copying snapshot into region
325     SnapshotCopy(GuestMemoryError),
326 
327     /// Failed to allocate MMIO address
328     AllocateMmioAddress,
329 
330     #[cfg(target_arch = "aarch64")]
331     /// Failed to create UEFI flash
332     CreateUefiFlash(HypervisorVmError),
333 
334     /// Using a directory as a backing file for memory is not supported
335     DirectoryAsBackingFileForMemory,
336 
337     /// Failed to stat filesystem
338     GetFileSystemBlockSize(io::Error),
339 
340     /// Memory size is not aligned to the default page size or the hugepage size
341     MisalignedMemorySize,
342 }
343 
344 const ENABLE_FLAG: usize = 0;
345 const INSERTING_FLAG: usize = 1;
346 const REMOVING_FLAG: usize = 2;
347 const EJECT_FLAG: usize = 3;
348 
349 const BASE_OFFSET_LOW: u64 = 0;
350 const BASE_OFFSET_HIGH: u64 = 0x4;
351 const LENGTH_OFFSET_LOW: u64 = 0x8;
352 const LENGTH_OFFSET_HIGH: u64 = 0xC;
353 const STATUS_OFFSET: u64 = 0x14;
354 const SELECTION_OFFSET: u64 = 0;
355 
356 // The MMIO address space size is reduced by 64k. This is done for the
357 // following reasons:
358 //  - Reduce the addressable space size by at least 4k to work around a Linux
359 //    bug when the VMM allocates devices at the end of the addressable space
360 //  - Windows requires the addressable space size to be 64k aligned
361 fn mmio_address_space_size(phys_bits: u8) -> u64 {
362     (1 << phys_bits) - (1 << 16)
363 }
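// Illustrative sketch: a minimal unit test of the helper above, assuming a
// 40-bit physical address space (the module name and the value 40 are example
// choices, not taken from the original file).
#[cfg(test)]
mod mmio_address_space_size_example {
    use super::*;

    #[test]
    fn mmio_space_is_64k_below_the_top_of_the_address_space() {
        let size = mmio_address_space_size(40);
        assert_eq!(size, (1u64 << 40) - (1u64 << 16));
        // The result stays 64k aligned, as Windows requires.
        assert!(is_aligned(size, 1u64 << 16));
    }
}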
364 
365 // The `statfs` call retrieves information about a hugetlbfs mount, and the hugepage size is
366 // reported in the `f_bsize` field.
367 //
368 // See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
369 fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
370     let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
371     let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();
372 
373     // SAFETY: FFI call with a valid path and buffer
374     let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
375     if ret != 0 {
376         return Err(Error::GetFileSystemBlockSize(
377             std::io::Error::last_os_error(),
378         ));
379     }
380 
381     // SAFETY: `buf` is valid at this point
382     // Because this value is always positive, just convert it directly.
383     // Note that `f_bsize` is `i64` in glibc and `u64` in musl, so using `as u64` would be
384     // flagged by `clippy` on musl targets.  To avoid the warning, `as _` is used instead of
385     // `as u64`.
386     let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
387     Ok(bsize)
388 }
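// Illustrative sketch: `statfs_get_bsize` works for any mounted path; the root
// filesystem is used here purely as an example target, and only the "always
// positive" property noted above is asserted.
#[cfg(test)]
mod statfs_get_bsize_example {
    use super::*;

    #[test]
    fn root_filesystem_reports_a_positive_block_size() {
        let bsize = statfs_get_bsize("/").unwrap();
        assert!(bsize > 0);
    }
}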
389 
390 fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
391     // SAFETY: FFI call. Trivially safe.
392     let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
393 
394     // If there is no backing file and `hugepages` is disabled, just use the system page size.
395     if zone.file.is_none() && !zone.hugepages {
396         return Ok(page_size);
397     }
398 
399     // If `hugepages` is enabled and `hugepage_size` is specified, just use it directly.
400     if zone.hugepages && zone.hugepage_size.is_some() {
401         return Ok(zone.hugepage_size.unwrap());
402     }
403 
404     // There are two scenarios here:
405     //  - `hugepages` is enabled but `hugepage_size` is not specified:
406     //     Call `statfs` on `/dev/hugepages` to get the default hugepage size
407     //  - The backing file is specified:
408     //     Call `statfs` on the file and get its `f_bsize`.  If the value is larger than the
409     //     normal page size, use the `f_bsize` because the file is on a hugetlbfs.  If the
410     //     value is less than or equal to the page size, just use the page size.
411     let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| {
412         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
413     })?;
414 
415     let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
416 
417     Ok(align_size)
418 }
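// Illustrative sketch: when `hugepages` is enabled and `hugepage_size` is given
// explicitly, the alignment is that size and no filesystem is queried. The zone
// values below (1 GiB zone, 2 MiB hugepages) are example assumptions only.
#[cfg(test)]
mod memory_zone_align_size_example {
    use super::*;

    #[test]
    fn explicit_hugepage_size_is_used_as_alignment() {
        let zone = MemoryZoneConfig {
            id: String::from("mem0"),
            size: 1 << 30,
            file: None,
            shared: false,
            hugepages: true,
            hugepage_size: Some(2 << 20),
            host_numa_node: None,
            hotplug_size: None,
            hotplugged_size: None,
            prefault: false,
        };
        assert_eq!(memory_zone_get_align_size(&zone).unwrap(), 2 << 20);
    }
}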
419 
420 #[inline]
421 fn align_down<T>(val: T, align: T) -> T
422 where
423     T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>,
424 {
425     val & !(align - 1u8.into())
426 }
427 
428 #[inline]
429 fn is_aligned<T>(val: T, align: T) -> bool
430 where
431     T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq,
432 {
433     (val & (align - 1u8.into())) == 0u8.into()
434 }
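// Illustrative sketch: worked values for the two alignment helpers above, using
// a 4 KiB alignment (the constants are example assumptions only).
#[cfg(test)]
mod alignment_helpers_example {
    use super::*;

    #[test]
    fn align_down_and_is_aligned_agree() {
        assert_eq!(align_down(0x1234u64, 0x1000u64), 0x1000);
        assert!(is_aligned(0x2000u64, 0x1000u64));
        assert!(!is_aligned(0x2100u64, 0x1000u64));
        // Rounding down always yields an aligned value.
        assert!(is_aligned(align_down(0x2345u64, 0x1000u64), 0x1000u64));
    }
}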
435 
436 impl BusDevice for MemoryManager {
437     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
438         if self.selected_slot < self.hotplug_slots.len() {
439             let state = &self.hotplug_slots[self.selected_slot];
440             match offset {
441                 BASE_OFFSET_LOW => {
442                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
443                 }
444                 BASE_OFFSET_HIGH => {
445                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
446                 }
447                 LENGTH_OFFSET_LOW => {
448                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
449                 }
450                 LENGTH_OFFSET_HIGH => {
451                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
452                 }
453                 STATUS_OFFSET => {
454                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
455                     data.fill(0);
456                     if state.active {
457                         data[0] |= 1 << ENABLE_FLAG;
458                     }
459                     if state.inserting {
460                         data[0] |= 1 << INSERTING_FLAG;
461                     }
462                     if state.removing {
463                         data[0] |= 1 << REMOVING_FLAG;
464                     }
465                 }
466                 _ => {
467                     warn!(
468                         "Unexpected offset for accessing memory manager device: {:#}",
469                         offset
470                     );
471                 }
472             }
473         } else {
474             warn!("Out of range memory slot: {}", self.selected_slot);
475         }
476     }
477 
478     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
479         match offset {
480             SELECTION_OFFSET => {
481                 self.selected_slot = usize::from(data[0]);
482             }
483             STATUS_OFFSET => {
484                 if self.selected_slot < self.hotplug_slots.len() {
485                     let state = &mut self.hotplug_slots[self.selected_slot];
486                     // The ACPI code writes back a 1 to acknowledge the insertion
487                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
488                         state.inserting = false;
489                     }
490                     // Ditto for removal
491                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
492                         state.removing = false;
493                     }
494                     // Trigger removal of "DIMM"
495                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
496                         warn!("Ejection of memory not currently supported");
497                     }
498                 } else {
499                     warn!("Out of range memory slot: {}", self.selected_slot);
500                 }
501             }
502             _ => {
503                 warn!(
504                     "Unexpected offset for accessing memory manager device: {:#}",
505                     offset
506                 );
507             }
508         };
509         None
510     }
511 }
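// Illustrative sketch: how the STATUS byte returned by `read()` above encodes the
// per-slot flags through the ENABLE/INSERTING/REMOVING bit positions (the flag
// combination chosen below is an example assumption only).
#[cfg(test)]
mod hotplug_status_byte_example {
    use super::*;

    #[test]
    fn status_byte_sets_one_bit_per_flag() {
        let (active, inserting, removing) = (true, true, false);
        let mut status = 0u8;
        if active {
            status |= 1 << ENABLE_FLAG;
        }
        if inserting {
            status |= 1 << INSERTING_FLAG;
        }
        if removing {
            status |= 1 << REMOVING_FLAG;
        }
        // active + inserting -> bits 0 and 1 set.
        assert_eq!(status, 0b0000_0011);
    }
}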
512 
513 impl MemoryManager {
514     /// Creates all memory regions based on the available RAM ranges defined
515     /// by `ram_regions`, and based on the description of the memory zones.
516     /// In practice, this function can perform multiple memory mappings of the
517     /// same backing file if there's a hole in the address space between two
518     /// RAM ranges.
519     ///
520     /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G)
521     /// and zones containing two zones (size 1G and size 4G).
522     ///
523     /// This function will create 3 resulting memory regions:
524     /// - First one mapping entirely the first memory zone on 0-1G range
525     /// - Second one mapping partially the second memory zone on 1G-3G range
526     /// - Third one mapping partially the second memory zone on 4G-6G range
527     ///
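    ///
    /// A rough sketch of the resulting layout for that example (the guest
    /// address ranges and per-zone offsets below are inferred from the
    /// description above, for illustration only):
    ///
    /// ```text
    /// ram_regions: [0G, 3G) and [4G, 6G)
    /// zones:       zone0 (1G), zone1 (4G)
    ///
    /// region 1: zone0, GPA [0G, 1G), zone offset 0
    /// region 2: zone1, GPA [1G, 3G), zone offset 0
    /// region 3: zone1, GPA [4G, 6G), zone offset 2G
    /// ```
    ///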
528     /// Also, all memory regions are page-size aligned (i.e. their sizes must
529     /// be a multiple of the page size), which may leave an additional hole in
530     /// the address space when hugepages are used.
531     fn create_memory_regions_from_zones(
532         ram_regions: &[(GuestAddress, usize)],
533         zones: &[MemoryZoneConfig],
534         prefault: Option<bool>,
535         thp: bool,
536     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
537         let mut zone_iter = zones.iter();
538         let mut mem_regions = Vec::new();
539         let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?;
540         let mut zone_align_size = memory_zone_get_align_size(zone)?;
541         let mut zone_offset = 0u64;
542         let mut memory_zones = HashMap::new();
543 
544         if !is_aligned(zone.size, zone_align_size) {
545             return Err(Error::MisalignedMemorySize);
546         }
547 
548         // Add zone id to the list of memory zones.
549         memory_zones.insert(zone.id.clone(), MemoryZone::default());
550 
551         for ram_region in ram_regions.iter() {
552             let mut ram_region_offset = 0;
553             let mut exit = false;
554 
555             loop {
556                 let mut ram_region_consumed = false;
557                 let mut pull_next_zone = false;
558 
559                 let ram_region_available_size =
560                     align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size);
561                 if ram_region_available_size == 0 {
562                     break;
563                 }
564                 let zone_sub_size = zone.size - zone_offset;
565 
566                 let file_offset = zone_offset;
567                 let region_start = ram_region
568                     .0
569                     .checked_add(ram_region_offset)
570                     .ok_or(Error::GuestAddressOverFlow)?;
571                 let region_size = if zone_sub_size <= ram_region_available_size {
572                     if zone_sub_size == ram_region_available_size {
573                         ram_region_consumed = true;
574                     }
575 
576                     ram_region_offset += zone_sub_size;
577                     pull_next_zone = true;
578 
579                     zone_sub_size
580                 } else {
581                     zone_offset += ram_region_available_size;
582                     ram_region_consumed = true;
583 
584                     ram_region_available_size
585                 };
586 
587                 info!(
588                     "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}",
589                     zone.id,
590                     region_start.raw_value(),
591                     region_size
592                 );
593                 let region = MemoryManager::create_ram_region(
594                     &zone.file,
595                     file_offset,
596                     region_start,
597                     region_size as usize,
598                     prefault.unwrap_or(zone.prefault),
599                     zone.shared,
600                     zone.hugepages,
601                     zone.hugepage_size,
602                     zone.host_numa_node,
603                     None,
604                     thp,
605                 )?;
606 
607                 // Add region to the list of regions associated with the
608                 // current memory zone.
609                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
610                     memory_zone.regions.push(region.clone());
611                 }
612 
613                 mem_regions.push(region);
614 
615                 if pull_next_zone {
616                     // Get the next zone and reset the offset.
617                     zone_offset = 0;
618                     if let Some(z) = zone_iter.next() {
619                         zone = z;
620                     } else {
621                         exit = true;
622                         break;
623                     }
624                     zone_align_size = memory_zone_get_align_size(zone)?;
625                     if !is_aligned(zone.size, zone_align_size) {
626                         return Err(Error::MisalignedMemorySize);
627                     }
628 
629                     // Check if zone id already exist. In case it does, throw
630                     // an error as we need unique identifiers. Otherwise, add
631                     // the new zone id to the list of memory zones.
632                     if memory_zones.contains_key(&zone.id) {
633                         error!(
634                             "Memory zone identifier '{}' found more than once. \
635                             It must be unique",
636                             zone.id,
637                         );
638                         return Err(Error::DuplicateZoneId);
639                     }
640                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
641                 }
642 
643                 if ram_region_consumed {
644                     break;
645                 }
646             }
647 
648             if exit {
649                 break;
650             }
651         }
652 
653         Ok((mem_regions, memory_zones))
654     }
655 
656     // Restore both the GuestMemory regions and the MemoryZone zones.
657     fn restore_memory_regions_and_zones(
658         guest_ram_mappings: &[GuestRamMapping],
659         zones_config: &[MemoryZoneConfig],
660         prefault: Option<bool>,
661         mut existing_memory_files: HashMap<u32, File>,
662         thp: bool,
663     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
664         let mut memory_regions = Vec::new();
665         let mut memory_zones = HashMap::new();
666 
667         for zone_config in zones_config {
668             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
669         }
670 
671         for guest_ram_mapping in guest_ram_mappings {
672             for zone_config in zones_config {
673                 if guest_ram_mapping.zone_id == zone_config.id {
674                     let region = MemoryManager::create_ram_region(
675                         if guest_ram_mapping.virtio_mem {
676                             &None
677                         } else {
678                             &zone_config.file
679                         },
680                         guest_ram_mapping.file_offset,
681                         GuestAddress(guest_ram_mapping.gpa),
682                         guest_ram_mapping.size as usize,
683                         prefault.unwrap_or(zone_config.prefault),
684                         zone_config.shared,
685                         zone_config.hugepages,
686                         zone_config.hugepage_size,
687                         zone_config.host_numa_node,
688                         existing_memory_files.remove(&guest_ram_mapping.slot),
689                         thp,
690                     )?;
691                     memory_regions.push(Arc::clone(&region));
692                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
693                         if guest_ram_mapping.virtio_mem {
694                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
695                             let region_size = region.len();
696                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
697                                 region,
698                                 virtio_device: None,
699                                 hotplugged_size,
700                                 hugepages: zone_config.hugepages,
701                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
702                             });
703                         } else {
704                             memory_zone.regions.push(region);
705                         }
706                     }
707                 }
708             }
709         }
710 
711         memory_regions.sort_by_key(|x| x.start_addr());
712 
713         Ok((memory_regions, memory_zones))
714     }
715 
716     fn fill_saved_regions(
717         &mut self,
718         file_path: PathBuf,
719         saved_regions: MemoryRangeTable,
720     ) -> Result<(), Error> {
721         if saved_regions.is_empty() {
722             return Ok(());
723         }
724 
725         // Open (read only) the snapshot file.
726         let mut memory_file = OpenOptions::new()
727             .read(true)
728             .open(file_path)
729             .map_err(Error::SnapshotOpen)?;
730 
731         let guest_memory = self.guest_memory.memory();
732         for range in saved_regions.regions() {
733             let mut offset: u64 = 0;
734             // Here we handle the retry manually, in case we can't fill the
735             // whole region at once, because we can't use the read_exact_from()
736             // implementation from vm-memory's GuestMemory as it does not
737             // follow the correct behavior. For more info about this issue
738             // see: https://github.com/rust-vmm/vm-memory/issues/174
739             loop {
740                 let bytes_read = guest_memory
741                     .read_volatile_from(
742                         GuestAddress(range.gpa + offset),
743                         &mut memory_file,
744                         (range.length - offset) as usize,
745                     )
746                     .map_err(Error::SnapshotCopy)?;
747                 offset += bytes_read as u64;
748 
749                 if offset == range.length {
750                     break;
751                 }
752             }
753         }
754 
755         Ok(())
756     }
757 
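    /// Checks the memory configuration and normalizes it into a list of memory
    /// zone configurations. Returns the total RAM size, the zones to create,
    /// and whether memory hotplug (driven by the global `hotplug_size`) is
    /// allowed.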
758     fn validate_memory_config(
759         config: &MemoryConfig,
760         user_provided_zones: bool,
761     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
762         let mut allow_mem_hotplug = false;
763 
764         if !user_provided_zones {
765             if config.zones.is_some() {
766                 error!(
767                     "User defined memory regions can't be provided if the \
768                     memory size is not 0"
769                 );
770                 return Err(Error::InvalidMemoryParameters);
771             }
772 
773             if config.hotplug_size.is_some() {
774                 allow_mem_hotplug = true;
775             }
776 
777             if let Some(hotplugged_size) = config.hotplugged_size {
778                 if let Some(hotplug_size) = config.hotplug_size {
779                     if hotplugged_size > hotplug_size {
780                         error!(
781                             "'hotplugged_size' {} can't be bigger than \
782                             'hotplug_size' {}",
783                             hotplugged_size, hotplug_size,
784                         );
785                         return Err(Error::InvalidMemoryParameters);
786                     }
787                 } else {
788                     error!(
789                         "Invalid to define 'hotplugged_size' when there is \
790                         no 'hotplug_size'"
791                     );
792                     return Err(Error::InvalidMemoryParameters);
793                 }
794                 if config.hotplug_method == HotplugMethod::Acpi {
795                     error!(
796                         "Invalid to define 'hotplugged_size' with hotplug \
797                         method 'acpi'"
798                     );
799                     return Err(Error::InvalidMemoryParameters);
800                 }
801             }
802 
803             // Create a single zone from the global memory config. This lets
804             // us reuse the codepath for user defined memory zones.
805             let zones = vec![MemoryZoneConfig {
806                 id: String::from(DEFAULT_MEMORY_ZONE),
807                 size: config.size,
808                 file: None,
809                 shared: config.shared,
810                 hugepages: config.hugepages,
811                 hugepage_size: config.hugepage_size,
812                 host_numa_node: None,
813                 hotplug_size: config.hotplug_size,
814                 hotplugged_size: config.hotplugged_size,
815                 prefault: config.prefault,
816             }];
817 
818             Ok((config.size, zones, allow_mem_hotplug))
819         } else {
820             if config.zones.is_none() {
821                 error!(
822                     "User defined memory regions must be provided if the \
823                     memory size is 0"
824                 );
825                 return Err(Error::MissingMemoryZones);
826             }
827 
828             // Safe to unwrap as we checked right above there were some
829             // regions.
830             let zones = config.zones.clone().unwrap();
831             if zones.is_empty() {
832                 return Err(Error::MissingMemoryZones);
833             }
834 
835             let mut total_ram_size: u64 = 0;
836             for zone in zones.iter() {
837                 total_ram_size += zone.size;
838 
839                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
840                     error!(
841                         "Invalid to set host NUMA policy for a memory zone \
842                         backed by a regular file and mapped as 'shared'"
843                     );
844                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
845                 }
846 
847                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
848                     error!("Invalid to set ACPI hotplug method for memory zones");
849                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
850                 }
851 
852                 if let Some(hotplugged_size) = zone.hotplugged_size {
853                     if let Some(hotplug_size) = zone.hotplug_size {
854                         if hotplugged_size > hotplug_size {
855                             error!(
856                                 "'hotplugged_size' {} can't be bigger than \
857                                 'hotplug_size' {}",
858                                 hotplugged_size, hotplug_size,
859                             );
860                             return Err(Error::InvalidMemoryParameters);
861                         }
862                     } else {
863                         error!(
864                             "Invalid to define 'hotplugged_size' when there is \
865                             no 'hotplug_size' for a memory zone"
866                         );
867                         return Err(Error::InvalidMemoryParameters);
868                     }
869                     if config.hotplug_method == HotplugMethod::Acpi {
870                         error!(
871                             "Invalid to define 'hotplugged_size' with hotplug \
872                             method 'acpi'"
873                         );
874                         return Err(Error::InvalidMemoryParameters);
875                     }
876                 }
877             }
878 
879             Ok((total_ram_size, zones, allow_mem_hotplug))
880         }
881     }
882 
883     pub fn allocate_address_space(&mut self) -> Result<(), Error> {
884         let mut list = Vec::new();
885 
886         for (zone_id, memory_zone) in self.memory_zones.iter() {
887             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
888                 memory_zone
889                     .regions()
890                     .iter()
891                     .map(|r| (r.clone(), false))
892                     .collect();
893 
894             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
895                 regions.push((virtio_mem_zone.region().clone(), true));
896             }
897 
898             list.push((zone_id.clone(), regions));
899         }
900 
901         for (zone_id, regions) in list {
902             for (region, virtio_mem) in regions {
903                 let slot = self.create_userspace_mapping(
904                     region.start_addr().raw_value(),
905                     region.len(),
906                     region.as_ptr() as u64,
907                     self.mergeable,
908                     false,
909                     self.log_dirty,
910                 )?;
911 
912                 let file_offset = if let Some(file_offset) = region.file_offset() {
913                     file_offset.start()
914                 } else {
915                     0
916                 };
917 
918                 self.guest_ram_mappings.push(GuestRamMapping {
919                     gpa: region.start_addr().raw_value(),
920                     size: region.len(),
921                     slot,
922                     zone_id: zone_id.clone(),
923                     virtio_mem,
924                     file_offset,
925                 });
926                 self.ram_allocator
927                     .allocate(Some(region.start_addr()), region.len(), None)
928                     .ok_or(Error::MemoryRangeAllocation)?;
929             }
930         }
931 
932         // Allocate SubRegion and Reserved address ranges.
933         for region in self.arch_mem_regions.iter() {
934             if region.r_type == RegionType::Ram {
935                 // Ignore the RAM type since ranges have already been allocated
936                 // based on the GuestMemory regions.
937                 continue;
938             }
939             self.ram_allocator
940                 .allocate(
941                     Some(GuestAddress(region.base)),
942                     region.size as GuestUsize,
943                     None,
944                 )
945                 .ok_or(Error::MemoryRangeAllocation)?;
946         }
947 
948         Ok(())
949     }
950 
951     #[cfg(target_arch = "aarch64")]
952     fn add_uefi_flash(&mut self) -> Result<(), Error> {
953         // On AArch64, the UEFI binary requires a flash device at address 0.
954         // 4 MiB memory is mapped to simulate the flash.
955         let uefi_mem_slot = self.allocate_memory_slot();
956         let uefi_region = GuestRegionMmap::new(
957             MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
958             arch::layout::UEFI_START,
959         )
960         .unwrap();
961         let uefi_mem_region = self.vm.make_user_memory_region(
962             uefi_mem_slot,
963             uefi_region.start_addr().raw_value(),
964             uefi_region.len(),
965             uefi_region.as_ptr() as u64,
966             false,
967             false,
968         );
969         self.vm
970             .create_user_memory_region(uefi_mem_region)
971             .map_err(Error::CreateUefiFlash)?;
972 
973         let uefi_flash =
974             GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());
975 
976         self.uefi_flash = Some(uefi_flash);
977 
978         Ok(())
979     }
980 
981     #[allow(clippy::too_many_arguments)]
982     pub fn new(
983         vm: Arc<dyn hypervisor::Vm>,
984         config: &MemoryConfig,
985         prefault: Option<bool>,
986         phys_bits: u8,
987         #[cfg(feature = "tdx")] tdx_enabled: bool,
988         restore_data: Option<&MemoryManagerSnapshotData>,
989         existing_memory_files: Option<HashMap<u32, File>>,
990         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
991     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
992         trace_scoped!("MemoryManager::new");
993 
994         let user_provided_zones = config.size == 0;
995 
996         let mmio_address_space_size = mmio_address_space_size(phys_bits);
997         debug_assert_eq!(
998             (((mmio_address_space_size) >> 16) << 16),
999             mmio_address_space_size
1000         );
1001         let start_of_platform_device_area =
1002             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
1003         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
1004 
1005         let (ram_size, zones, allow_mem_hotplug) =
1006             Self::validate_memory_config(config, user_provided_zones)?;
1007 
1008         let (
1009             start_of_device_area,
1010             boot_ram,
1011             current_ram,
1012             arch_mem_regions,
1013             memory_zones,
1014             guest_memory,
1015             boot_guest_memory,
1016             hotplug_slots,
1017             next_memory_slot,
1018             selected_slot,
1019             next_hotplug_slot,
1020         ) = if let Some(data) = restore_data {
1021             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
1022                 &data.guest_ram_mappings,
1023                 &zones,
1024                 prefault,
1025                 existing_memory_files.unwrap_or_default(),
1026                 config.thp,
1027             )?;
1028             let guest_memory =
1029                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
1030             let boot_guest_memory = guest_memory.clone();
1031             (
1032                 GuestAddress(data.start_of_device_area),
1033                 data.boot_ram,
1034                 data.current_ram,
1035                 data.arch_mem_regions.clone(),
1036                 memory_zones,
1037                 guest_memory,
1038                 boot_guest_memory,
1039                 data.hotplug_slots.clone(),
1040                 data.next_memory_slot,
1041                 data.selected_slot,
1042                 data.next_hotplug_slot,
1043             )
1044         } else {
1045             // Init guest memory
1046             let arch_mem_regions = arch::arch_memory_regions();
1047 
1048             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
1049                 .iter()
1050                 .filter(|r| r.2 == RegionType::Ram)
1051                 .map(|r| (r.0, r.1))
1052                 .collect();
1053 
1054             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
1055                 .iter()
1056                 .map(|(a, b, c)| ArchMemRegion {
1057                     base: a.0,
1058                     size: *b,
1059                     r_type: *c,
1060                 })
1061                 .collect();
1062 
1063             let (mem_regions, mut memory_zones) =
1064                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;
1065 
1066             let mut guest_memory =
1067                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
1068 
1069             let boot_guest_memory = guest_memory.clone();
1070 
1071             let mut start_of_device_area =
1072                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
1073 
1074             // Update list of memory zones for resize.
1075             for zone in zones.iter() {
1076                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
1077                     if let Some(hotplug_size) = zone.hotplug_size {
1078                         if hotplug_size == 0 {
1079                             error!("'hotplug_size' can't be 0");
1080                             return Err(Error::InvalidHotplugSize);
1081                         }
1082 
1083                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
1084                             start_of_device_area = start_of_device_area
1085                                 .checked_add(hotplug_size)
1086                                 .ok_or(Error::GuestAddressOverFlow)?;
1087                         } else {
1088                             // Alignment must be "natural" i.e. same as size of block
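                            // (i.e. round start_of_device_area up to the next
                            // multiple of VIRTIO_MEM_ALIGN_SIZE with the usual
                            // (addr + align - 1) / align * align computation)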
1089                             let start_addr = GuestAddress(
1090                                 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
1091                                     - 1)
1092                                     / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
1093                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
1094                             );
1095 
1096                             // When `prefault` is set by vm_restore, the memory manager
1097                             // will create the RAM region with the `prefault` option from
1098                             // the restore config rather than the one from the zone.
1099                             let region = MemoryManager::create_ram_region(
1100                                 &None,
1101                                 0,
1102                                 start_addr,
1103                                 hotplug_size as usize,
1104                                 prefault.unwrap_or(zone.prefault),
1105                                 zone.shared,
1106                                 zone.hugepages,
1107                                 zone.hugepage_size,
1108                                 zone.host_numa_node,
1109                                 None,
1110                                 config.thp,
1111                             )?;
1112 
1113                             guest_memory = guest_memory
1114                                 .insert_region(Arc::clone(&region))
1115                                 .map_err(Error::GuestMemory)?;
1116 
1117                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
1118                             let region_size = region.len();
1119                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
1120                                 region,
1121                                 virtio_device: None,
1122                                 hotplugged_size,
1123                                 hugepages: zone.hugepages,
1124                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
1125                             });
1126 
1127                             start_of_device_area = start_addr
1128                                 .checked_add(hotplug_size)
1129                                 .ok_or(Error::GuestAddressOverFlow)?;
1130                         }
1131                     }
1132                 } else {
1133                     return Err(Error::MissingZoneIdentifier);
1134                 }
1135             }
1136 
1137             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
1138             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
1139 
1140             (
1141                 start_of_device_area,
1142                 ram_size,
1143                 ram_size,
1144                 arch_mem_regions,
1145                 memory_zones,
1146                 guest_memory,
1147                 boot_guest_memory,
1148                 hotplug_slots,
1149                 0,
1150                 0,
1151                 0,
1152             )
1153         };
1154 
1155         let guest_memory = GuestMemoryAtomic::new(guest_memory);
1156 
1157         // Both MMIO and PIO address spaces start at address 0.
1158         let allocator = Arc::new(Mutex::new(
1159             SystemAllocator::new(
1160                 #[cfg(target_arch = "x86_64")]
1161                 {
1162                     GuestAddress(0)
1163                 },
1164                 #[cfg(target_arch = "x86_64")]
1165                 {
1166                     1 << 16
1167                 },
1168                 start_of_platform_device_area,
1169                 PLATFORM_DEVICE_AREA_SIZE,
1170                 #[cfg(target_arch = "x86_64")]
1171                 vec![GsiApic::new(
1172                     X86_64_IRQ_BASE,
1173                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1174                 )],
1175             )
1176             .ok_or(Error::CreateSystemAllocator)?,
1177         ));
1178 
1179         #[cfg(not(feature = "tdx"))]
1180         let dynamic = true;
1181         #[cfg(feature = "tdx")]
1182         let dynamic = !tdx_enabled;
1183 
1184         let acpi_address = if dynamic
1185             && config.hotplug_method == HotplugMethod::Acpi
1186             && (config.hotplug_size.unwrap_or_default() > 0)
1187         {
1188             Some(
1189                 allocator
1190                     .lock()
1191                     .unwrap()
1192                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1193                     .ok_or(Error::AllocateMmioAddress)?,
1194             )
1195         } else {
1196             None
1197         };
1198 
1199         // If running on SGX the start of device area and RAM area may diverge but
1200         // at this point they are next to each other.
1201         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1202         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1203 
1204         let mut memory_manager = MemoryManager {
1205             boot_guest_memory,
1206             guest_memory,
1207             next_memory_slot,
1208             start_of_device_area,
1209             end_of_device_area,
1210             end_of_ram_area,
1211             vm,
1212             hotplug_slots,
1213             selected_slot,
1214             mergeable: config.mergeable,
1215             allocator,
1216             hotplug_method: config.hotplug_method,
1217             boot_ram,
1218             current_ram,
1219             next_hotplug_slot,
1220             shared: config.shared,
1221             hugepages: config.hugepages,
1222             hugepage_size: config.hugepage_size,
1223             prefault: config.prefault,
1224             #[cfg(target_arch = "x86_64")]
1225             sgx_epc_region: None,
1226             user_provided_zones,
1227             snapshot_memory_ranges: MemoryRangeTable::default(),
1228             memory_zones,
1229             guest_ram_mappings: Vec::new(),
1230             acpi_address,
1231             log_dirty: dynamic, // Cannot log dirty pages on a TD
1232             arch_mem_regions,
1233             ram_allocator,
1234             dynamic,
1235             #[cfg(target_arch = "aarch64")]
1236             uefi_flash: None,
1237             thp: config.thp,
1238         };
1239 
1240         #[cfg(target_arch = "aarch64")]
1241         {
1242             // For Aarch64 we cannot lazily allocate the address space like we
1243             // do for x86, because while restoring a VM from snapshot we would
1244             // need the address space to be allocated to properly restore VGIC.
1245             // And the restore of VGIC happens before we attempt to run the vCPUs
1246             // for the first time, thus we need to allocate the address space
1247             // beforehand.
1248             memory_manager.allocate_address_space()?;
1249             memory_manager.add_uefi_flash()?;
1250         }
1251 
1252         #[cfg(target_arch = "x86_64")]
1253         if let Some(sgx_epc_config) = sgx_epc_config {
1254             memory_manager.setup_sgx(sgx_epc_config)?;
1255         }
1256 
1257         Ok(Arc::new(Mutex::new(memory_manager)))
1258     }
1259 
1260     pub fn new_from_snapshot(
1261         snapshot: &Snapshot,
1262         vm: Arc<dyn hypervisor::Vm>,
1263         config: &MemoryConfig,
1264         source_url: Option<&str>,
1265         prefault: bool,
1266         phys_bits: u8,
1267     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1268         if let Some(source_url) = source_url {
1269             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1270             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1271 
1272             let mem_snapshot: MemoryManagerSnapshotData =
1273                 snapshot.to_state().map_err(Error::Restore)?;
1274 
1275             let mm = MemoryManager::new(
1276                 vm,
1277                 config,
1278                 Some(prefault),
1279                 phys_bits,
1280                 #[cfg(feature = "tdx")]
1281                 false,
1282                 Some(&mem_snapshot),
1283                 None,
1284                 #[cfg(target_arch = "x86_64")]
1285                 None,
1286             )?;
1287 
1288             mm.lock()
1289                 .unwrap()
1290                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1291 
1292             Ok(mm)
1293         } else {
1294             Err(Error::RestoreMissingSourceUrl)
1295         }
1296     }
1297 
1298     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1299         // SAFETY: FFI call with correct arguments
1300         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1301 
1302         if res < 0 {
1303             Err(io::Error::last_os_error())
1304         } else {
1305             Ok(res as RawFd)
1306         }
1307     }
1308 
1309     fn mbind(
1310         addr: *mut u8,
1311         len: u64,
1312         mode: u32,
1313         nodemask: Vec<u64>,
1314         maxnode: u64,
1315         flags: u32,
1316     ) -> Result<(), io::Error> {
1317         // SAFETY: FFI call with correct arguments
1318         let res = unsafe {
1319             libc::syscall(
1320                 libc::SYS_mbind,
1321                 addr as *mut libc::c_void,
1322                 len,
1323                 mode,
1324                 nodemask.as_ptr(),
1325                 maxnode,
1326                 flags,
1327             )
1328         };
1329 
1330         if res < 0 {
1331             Err(io::Error::last_os_error())
1332         } else {
1333             Ok(())
1334         }
1335     }
1336 
1337     fn create_anonymous_file(
1338         size: usize,
1339         hugepages: bool,
1340         hugepage_size: Option<u64>,
1341     ) -> Result<FileOffset, Error> {
1342         let fd = Self::memfd_create(
1343             &ffi::CString::new("ch_ram").unwrap(),
1344             libc::MFD_CLOEXEC
1345                 | if hugepages {
1346                     libc::MFD_HUGETLB
1347                         | if let Some(hugepage_size) = hugepage_size {
1348                             /*
1349                              * From the Linux kernel:
1350                              * Several system calls take a flag to request "hugetlb" huge pages.
1351                              * Without further specification, these system calls will use the
1352                              * system's default huge page size.  If a system supports multiple
1353                              * huge page sizes, the desired huge page size can be specified in
1354                              * bits [26:31] of the flag arguments.  The value in these 6 bits
1355                              * will encode the log2 of the huge page size.
1356                              */
1357 
1358                             hugepage_size.trailing_zeros() << 26
1359                         } else {
1360                             // Use the system default huge page size
1361                             0
1362                         }
1363                 } else {
1364                     0
1365                 },
1366         )
1367         .map_err(Error::SharedFileCreate)?;
1368 
1369         // SAFETY: fd is valid
1370         let f = unsafe { File::from_raw_fd(fd) };
1371         f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1372 
1373         Ok(FileOffset::new(f, 0))
1374     }
1375 
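    // Worked example for the hugepage size encoding above (illustrative values,
    // not taken from the original source): a 2 MiB hugepage has
    // trailing_zeros() == 21, so the flag contribution is 21 << 26, which
    // matches the kernel's MFD_HUGE_2MB definition.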
1376     fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> {
1377         if backing_file.is_dir() {
1378             Err(Error::DirectoryAsBackingFileForMemory)
1379         } else {
1380             let f = OpenOptions::new()
1381                 .read(true)
1382                 .write(true)
1383                 .open(backing_file)
1384                 .map_err(Error::SharedFileCreate)?;
1385 
1386             Ok(FileOffset::new(f, file_offset))
1387         }
1388     }
1389 
1390     #[allow(clippy::too_many_arguments)]
1391     pub fn create_ram_region(
1392         backing_file: &Option<PathBuf>,
1393         file_offset: u64,
1394         start_addr: GuestAddress,
1395         size: usize,
1396         prefault: bool,
1397         shared: bool,
1398         hugepages: bool,
1399         hugepage_size: Option<u64>,
1400         host_numa_node: Option<u32>,
1401         existing_memory_file: Option<File>,
1402         thp: bool,
1403     ) -> Result<Arc<GuestRegionMmap>, Error> {
1404         let mut mmap_flags = libc::MAP_NORESERVE;
1405 
1406         // The duplication of mmap_flags ORing here is unfortunate but it also makes
1407         // the complexity of the handling clear.
1408         let fo = if let Some(f) = existing_memory_file {
1409             // It must be MAP_SHARED, otherwise we wouldn't already have an FD
1410             mmap_flags |= libc::MAP_SHARED;
1411             Some(FileOffset::new(f, file_offset))
1412         } else if let Some(backing_file) = backing_file {
1413             if shared {
1414                 mmap_flags |= libc::MAP_SHARED;
1415             } else {
1416                 mmap_flags |= libc::MAP_PRIVATE;
1417             }
1418             Some(Self::open_backing_file(backing_file, file_offset)?)
1419         } else if shared || hugepages {
1420             // For hugepages we must also use MAP_SHARED, otherwise we will trigger #4805:
1421             // a MAP_PRIVATE mapping would cause CoW against the backing file when the
1422             // pages are pinned for VFIO
1423             mmap_flags |= libc::MAP_SHARED;
1424             Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
1425         } else {
1426             mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
1427             None
1428         };
1429 
1430         let region = GuestRegionMmap::new(
1431             MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
1432                 .map_err(Error::GuestMemoryRegion)?,
1433             start_addr,
1434         )
1435         .map_err(Error::GuestMemory)?;
1436 
1437         // Apply NUMA policy if needed.
1438         if let Some(node) = host_numa_node {
1439             let addr = region.deref().as_ptr();
1440             let len = region.deref().size() as u64;
1441             let mode = MPOL_BIND;
1442             let mut nodemask: Vec<u64> = Vec::new();
1443             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1444 
1445             // Linux is kind of buggy in the way it interprets maxnode: it
1446             // cuts off the last node. That's why we have to add 1 to what we
1447             // would otherwise consider the proper maxnode value.
1448             let maxnode = node as u64 + 1 + 1;
1449 
1450             // Allocate the right size for the vector.
1451             nodemask.resize((node as usize / 64) + 1, 0);
1452 
1453             // Fill the global bitmask through the nodemask vector.
1454             let idx = (node / 64) as usize;
1455             let shift = node % 64;
1456             nodemask[idx] |= 1u64 << shift;
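            // Illustrative example: for host node 2, idx = 0 and shift = 2, so
            // nodemask = [0b100]; maxnode is 2 + 1 + 1 = 4 to account for the
            // kernel's off-by-one handling described above.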
1457 
1458             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1459             // force the kernel to move all pages that might have been already
1460             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1461             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1462             // MPOL_BIND is the selected mode as it specifies a strict policy
1463             // that restricts memory allocation to the nodes specified in the
1464             // nodemask.
1465             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1466                 .map_err(Error::ApplyNumaPolicy)?;
1467         }
1468 
1469         // Prefault the region if needed, in parallel.
1470         if prefault {
1471             let page_size =
1472                 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize;
1473 
1474             if !is_aligned(size, page_size) {
1475                 warn!(
1476                     "Prefaulting memory size {} misaligned with page size {}",
1477                     size, page_size
1478                 );
1479             }
1480 
1481             let num_pages = size / page_size;
1482 
1483             let num_threads = Self::get_prefault_num_threads(page_size, num_pages);
1484 
1485             let pages_per_thread = num_pages / num_threads;
1486             let remainder = num_pages % num_threads;
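            // Illustrative example: with 10 pages and 4 threads, pages_per_thread
            // is 2 and remainder is 2, so threads 0 and 1 fault in 3 pages each
            // while threads 2 and 3 fault in 2 pages each.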
1487 
1488             let barrier = Arc::new(Barrier::new(num_threads));
1489             thread::scope(|s| {
1490                 let r = &region;
1491                 for i in 0..num_threads {
1492                     let barrier = Arc::clone(&barrier);
1493                     s.spawn(move || {
1494                         // Wait until all threads have been spawned to avoid contention
1495                         // over mmap_sem between thread stack allocation and page faulting.
1496                         barrier.wait();
1497                         let pages = pages_per_thread + if i < remainder { 1 } else { 0 };
1498                         let offset =
1499                             page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
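                        // MADV_POPULATE_WRITE (Linux 5.14+) write-faults the pages
                        // in this thread's chunk so they are allocated up front.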
1500                         // SAFETY: FFI call with correct arguments
1501                         let ret = unsafe {
1502                             let addr = r.as_ptr().add(offset);
1503                             libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE)
1504                         };
1505                         if ret != 0 {
1506                             let e = io::Error::last_os_error();
1507                             warn!("Failed to prefault pages: {}", e);
1508                         }
1509                     });
1510                 }
1511             });
1512         }
1513 
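        // MADV_HUGEPAGE marks an anonymous mapping as eligible for transparent
        // hugepages (THP), allowing the kernel to back it with huge pages.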
1514         if region.file_offset().is_none() && thp {
1515             info!(
1516                 "Anonymous mapping at 0x{:x} (size = 0x{:x})",
1517                 region.as_ptr() as u64,
1518                 size
1519             );
1520             // SAFETY: FFI call with correct arguments
1521             let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
1522             if ret != 0 {
1523                 let e = io::Error::last_os_error();
1524                 warn!("Failed to mark pages as THP eligible: {}", e);
1525             }
1526         }
1527 
1528         Ok(Arc::new(region))
1529     }
1530 
1531     // Duplicate of `memory_zone_get_align_size` that does not require a `zone`
1532     fn get_prefault_align_size(
1533         backing_file: &Option<PathBuf>,
1534         hugepages: bool,
1535         hugepage_size: Option<u64>,
1536     ) -> Result<u64, Error> {
1537         // SAFETY: FFI call. Trivially safe.
1538         let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
1539         match (hugepages, hugepage_size, backing_file) {
1540             (false, _, _) => Ok(page_size),
1541             (true, Some(hugepage_size), _) => Ok(hugepage_size),
1542             (true, None, _) => {
1543                 // There are two scenarios here:
1544                 //  - `hugepages` is enabled but `hugepage_size` is not specified:
1545                 //     Call `statfs` on `/dev/hugepages` to get the default hugepage size.
1546                 //  - A backing file is specified:
1547                 //     Call `statfs` on the file and get its `f_bsize`. If the value is larger than the
1548                 //     normal page size, use `f_bsize` because the file lives on a hugetlbfs. If the
1549                 //     value is less than or equal to the page size, just use the page size.
1550                 let path = backing_file
1551                     .as_ref()
1552                     .map_or(Ok("/dev/hugepages"), |pathbuf| {
1553                         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
1554                     })?;
1555                 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
1556                 Ok(align_size)
1557             }
1558         }
1559     }
1560 
1561     fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize {
1562         let mut n: usize = 1;
1563 
1564         // Do not create more threads than processors available.
1565         // SAFETY: FFI call. Trivially safe.
1566         let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) };
1567         if procs > 0 {
1568             n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT);
1569         }
1570 
1571         // Do not create more threads than pages being allocated.
1572         n = std::cmp::min(n, num_pages);
1573 
1574         // Do not create threads to allocate less than 64 MiB of memory.
1575         n = std::cmp::min(
1576             n,
1577             std::cmp::max(1, page_size * num_pages / (64 * (1 << 26))),
1578         );
1579 
1580         n
1581     }
1582 
1583     // Update the GuestMemoryMmap with the new range
1584     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1585         let guest_memory = self
1586             .guest_memory
1587             .memory()
1588             .insert_region(region)
1589             .map_err(Error::GuestMemory)?;
1590         self.guest_memory.lock().unwrap().replace(guest_memory);
1591 
1592         Ok(())
1593     }
1594 
1595     //
1596     // Calculate the start address of an area next to RAM.
1597     //
1598     // If memory hotplug is allowed, the start address needs to be aligned
1599     // (rounded up) to a 128 MiB boundary.
1600     // If memory hotplug is not allowed, no alignment is required. If RAM
1601     // ends below the 32-bit reserved area, the area starts at the 64-bit RAM start instead.
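    // Illustrative example: with hotplug allowed and mem_end = 0x2345_6789, the
    // OR with 0x07FF_FFFF gives 0x27FF_FFFF and adding 1 yields 0x2800_0000,
    // i.e. the next 128 MiB boundary above mem_end.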
1602     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
1603         let mut start_addr = if allow_mem_hotplug {
1604             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1605         } else {
1606             mem_end
1607         };
1608 
1609         start_addr = start_addr
1610             .checked_add(1)
1611             .ok_or(Error::GuestAddressOverFlow)?;
1612 
1613         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1614             return Ok(arch::layout::RAM_64BIT_START);
1615         }
1616 
1617         Ok(start_addr)
1618     }
1619 
1620     pub fn add_ram_region(
1621         &mut self,
1622         start_addr: GuestAddress,
1623         size: usize,
1624     ) -> Result<Arc<GuestRegionMmap>, Error> {
1625         // Allocate memory for the region
1626         let region = MemoryManager::create_ram_region(
1627             &None,
1628             0,
1629             start_addr,
1630             size,
1631             self.prefault,
1632             self.shared,
1633             self.hugepages,
1634             self.hugepage_size,
1635             None,
1636             None,
1637             self.thp,
1638         )?;
1639 
1640         // Map it into the guest
1641         let slot = self.create_userspace_mapping(
1642             region.start_addr().0,
1643             region.len(),
1644             region.as_ptr() as u64,
1645             self.mergeable,
1646             false,
1647             self.log_dirty,
1648         )?;
1649         self.guest_ram_mappings.push(GuestRamMapping {
1650             gpa: region.start_addr().raw_value(),
1651             size: region.len(),
1652             slot,
1653             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1654             virtio_mem: false,
1655             file_offset: 0,
1656         });
1657 
1658         self.add_region(Arc::clone(&region))?;
1659 
1660         Ok(region)
1661     }
1662 
1663     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1664         info!("Hotplugging new RAM: {}", size);
1665 
1666         // Check that there is a free slot
1667         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1668             return Err(Error::NoSlotAvailable);
1669         }
1670 
1671         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1672         if size % (128 << 20) != 0 {
1673             return Err(Error::InvalidSize);
1674         }
1675 
1676         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1677 
1678         if start_addr
1679             .checked_add((size - 1).try_into().unwrap())
1680             .unwrap()
1681             > self.end_of_ram_area
1682         {
1683             return Err(Error::InsufficientHotplugRam);
1684         }
1685 
1686         let region = self.add_ram_region(start_addr, size)?;
1687 
1688         // Add region to the list of regions associated with the default
1689         // memory zone.
1690         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1691             memory_zone.regions.push(Arc::clone(&region));
1692         }
1693 
1694         // Tell the allocator
1695         self.ram_allocator
1696             .allocate(Some(start_addr), size as GuestUsize, None)
1697             .ok_or(Error::MemoryRangeAllocation)?;
1698 
1699         // Update the slot so that it can be queried via the I/O port
1700         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1701         slot.active = true;
1702         slot.inserting = true;
1703         slot.base = region.start_addr().0;
1704         slot.length = region.len();
1705 
1706         self.next_hotplug_slot += 1;
1707 
1708         Ok(region)
1709     }
1710 
1711     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1712         self.guest_memory.clone()
1713     }
1714 
1715     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1716         self.boot_guest_memory.clone()
1717     }
1718 
1719     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1720         self.allocator.clone()
1721     }
1722 
1723     pub fn start_of_device_area(&self) -> GuestAddress {
1724         self.start_of_device_area
1725     }
1726 
1727     pub fn end_of_device_area(&self) -> GuestAddress {
1728         self.end_of_device_area
1729     }
1730 
1731     pub fn allocate_memory_slot(&mut self) -> u32 {
1732         let slot_id = self.next_memory_slot;
1733         self.next_memory_slot += 1;
1734         slot_id
1735     }
1736 
1737     pub fn create_userspace_mapping(
1738         &mut self,
1739         guest_phys_addr: u64,
1740         memory_size: u64,
1741         userspace_addr: u64,
1742         mergeable: bool,
1743         readonly: bool,
1744         log_dirty: bool,
1745     ) -> Result<u32, Error> {
1746         let slot = self.allocate_memory_slot();
1747         let mem_region = self.vm.make_user_memory_region(
1748             slot,
1749             guest_phys_addr,
1750             memory_size,
1751             userspace_addr,
1752             readonly,
1753             log_dirty,
1754         );
1755 
1756         info!(
1757             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1758             guest_phys_addr, userspace_addr, memory_size, slot
1759         );
1760 
1761         self.vm
1762             .create_user_memory_region(mem_region)
1763             .map_err(Error::CreateUserMemoryRegion)?;
1764 
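        // Exclude the freshly mapped guest RAM from VMM core dumps with
        // MADV_DONTDUMP; a failure here is only logged as a warning.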
1765         // SAFETY: the address and size are valid since the
1766         // mmap succeeded.
1767         let ret = unsafe {
1768             libc::madvise(
1769                 userspace_addr as *mut libc::c_void,
1770                 memory_size as libc::size_t,
1771                 libc::MADV_DONTDUMP,
1772             )
1773         };
1774         if ret != 0 {
1775             let e = io::Error::last_os_error();
1776             warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e);
1777         }
1778 
1779         // Mark the pages as mergeable if explicitly asked for.
1780         if mergeable {
1781             // SAFETY: the address and size are valid since the
1782             // mmap succeeded.
1783             let ret = unsafe {
1784                 libc::madvise(
1785                     userspace_addr as *mut libc::c_void,
1786                     memory_size as libc::size_t,
1787                     libc::MADV_MERGEABLE,
1788                 )
1789             };
1790             if ret != 0 {
1791                 let err = io::Error::last_os_error();
1792                 // Safe to unwrap because the error is constructed with
1793                 // last_os_error(), which ensures the output will be Some().
1794                 let errno = err.raw_os_error().unwrap();
1795                 if errno == libc::EINVAL {
1796                     warn!("kernel not configured with CONFIG_KSM");
1797                 } else {
1798                     warn!("madvise error: {}", err);
1799                 }
1800                 warn!("failed to mark pages as mergeable");
1801             }
1802         }
1803 
1804         info!(
1805             "Created userspace mapping: {:x} -> {:x} {:x}",
1806             guest_phys_addr, userspace_addr, memory_size
1807         );
1808 
1809         Ok(slot)
1810     }
1811 
1812     pub fn remove_userspace_mapping(
1813         &mut self,
1814         guest_phys_addr: u64,
1815         memory_size: u64,
1816         userspace_addr: u64,
1817         mergeable: bool,
1818         slot: u32,
1819     ) -> Result<(), Error> {
1820         let mem_region = self.vm.make_user_memory_region(
1821             slot,
1822             guest_phys_addr,
1823             memory_size,
1824             userspace_addr,
1825             false, /* readonly -- don't care */
1826             false, /* log dirty */
1827         );
1828 
1829         self.vm
1830             .remove_user_memory_region(mem_region)
1831             .map_err(Error::RemoveUserMemoryRegion)?;
1832 
1833         // Mark the pages as unmergeable if they were previously marked as
1834         // mergeable.
1835         if mergeable {
1836             // SAFETY: the address and size are valid as the region was
1837             // previously advised.
1838             let ret = unsafe {
1839                 libc::madvise(
1840                     userspace_addr as *mut libc::c_void,
1841                     memory_size as libc::size_t,
1842                     libc::MADV_UNMERGEABLE,
1843                 )
1844             };
1845             if ret != 0 {
1846                 let err = io::Error::last_os_error();
1847                 // Safe to unwrap because the error is constructed with
1848                 // last_os_error(), which ensures the output will be Some().
1849                 let errno = err.raw_os_error().unwrap();
1850                 if errno == libc::EINVAL {
1851                     warn!("kernel not configured with CONFIG_KSM");
1852                 } else {
1853                     warn!("madvise error: {}", err);
1854                 }
1855                 warn!("failed to mark pages as unmergeable");
1856             }
1857         }
1858 
1859         info!(
1860             "Removed userspace mapping: {:x} -> {:x} {:x}",
1861             guest_phys_addr, userspace_addr, memory_size
1862         );
1863 
1864         Ok(())
1865     }
1866 
1867     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1868         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1869             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1870                 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1871                     virtio_mem_device
1872                         .lock()
1873                         .unwrap()
1874                         .resize(size)
1875                         .map_err(Error::VirtioMemResizeFail)?;
1876                 }
1877 
1878                 // Keep the hotplugged_size up to date.
1879                 virtio_mem_zone.hotplugged_size = size;
1880             } else {
1881                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1882                 return Err(Error::MissingVirtioMemHandler);
1883             }
1884 
1885             return Ok(());
1886         }
1887 
1888         error!("Failed resizing virtio-mem region: Unknown memory zone");
1889         Err(Error::UnknownMemoryZone)
1890     }
1891 
1892     /// If this function results in adding a new memory region to the guest
1893     /// memory, the new region is returned to the caller. The virtio-mem use
1894     /// case never adds a new region, as the whole hotpluggable memory has
1895     /// already been allocated at boot time.
1896     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1897         if self.user_provided_zones {
1898             error!(
1899                 "Not allowed to resize guest memory when backed with user \
1900                 defined memory zones."
1901             );
1902             return Err(Error::InvalidResizeWithMemoryZones);
1903         }
1904 
1905         let mut region: Option<Arc<GuestRegionMmap>> = None;
1906         match self.hotplug_method {
1907             HotplugMethod::VirtioMem => {
1908                 if desired_ram >= self.boot_ram {
1909                     if !self.dynamic {
1910                         return Ok(region);
1911                     }
1912 
1913                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1914                     self.current_ram = desired_ram;
1915                 }
1916             }
1917             HotplugMethod::Acpi => {
1918                 if desired_ram > self.current_ram {
1919                     if !self.dynamic {
1920                         return Ok(region);
1921                     }
1922 
1923                     region =
1924                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1925                     self.current_ram = desired_ram;
1926                 }
1927             }
1928         }
1929         Ok(region)
1930     }
1931 
1932     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1933         if !self.user_provided_zones {
1934             error!(
1935                 "Not allowed to resize guest memory zone when no zone is \
1936                 defined."
1937             );
1938             return Err(Error::ResizeZone);
1939         }
1940 
1941         self.virtio_mem_resize(id, virtio_mem_size)
1942     }
1943 
1944     #[cfg(target_arch = "x86_64")]
1945     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1946         let file = OpenOptions::new()
1947             .read(true)
1948             .open("/dev/sgx_provision")
1949             .map_err(Error::SgxProvisionOpen)?;
1950         self.vm
1951             .enable_sgx_attribute(file)
1952             .map_err(Error::SgxEnableProvisioning)?;
1953 
1954         // Go over each EPC section and verify its size is a 4k multiple. At
1955         // the same time, calculate the total size needed for the contiguous
1956         // EPC region.
1957         let mut epc_region_size = 0;
1958         for epc_section in sgx_epc_config.iter() {
1959             if epc_section.size == 0 {
1960                 return Err(Error::EpcSectionSizeInvalid);
1961             }
1962             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1963                 return Err(Error::EpcSectionSizeInvalid);
1964             }
1965 
1966             epc_region_size += epc_section.size;
1967         }
1968 
1969         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1970         let epc_region_start = GuestAddress(
1971             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1972         );
1973 
1974         self.start_of_device_area = epc_region_start
1975             .checked_add(epc_region_size)
1976             .ok_or(Error::GuestAddressOverFlow)?;
1977 
1978         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1979         info!(
1980             "SGX EPC region: 0x{:x} (0x{:x})",
1981             epc_region_start.0, epc_region_size
1982         );
1983 
1984         // Each section can be memory mapped into the allocated region.
1985         let mut epc_section_start = epc_region_start.raw_value();
1986         for epc_section in sgx_epc_config.iter() {
1987             let file = OpenOptions::new()
1988                 .read(true)
1989                 .write(true)
1990                 .open("/dev/sgx_vepc")
1991                 .map_err(Error::SgxVirtEpcOpen)?;
1992 
1993             let prot = PROT_READ | PROT_WRITE;
1994             let mut flags = MAP_NORESERVE | MAP_SHARED;
1995             if epc_section.prefault {
1996                 flags |= MAP_POPULATE;
1997             }
1998 
1999             // We can't use the vm-memory crate to perform the memory mapping
2000             // here, as it would try to ensure that the size of the backing file
2001             // matches the size of the expected mapping. The /dev/sgx_vepc
2002             // device does not work that way: it provides a file descriptor
2003             // whose size does not match the mapping size, as it's just a way to
2004             // let KVM know that an EPC section is being created for the guest.
2005             // SAFETY: FFI call with correct arguments
2006             let host_addr = unsafe {
2007                 libc::mmap(
2008                     std::ptr::null_mut(),
2009                     epc_section.size as usize,
2010                     prot,
2011                     flags,
2012                     file.as_raw_fd(),
2013                     0,
2014                 )
2015             } as u64;
2016 
2017             info!(
2018                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
2019                 epc_section_start, epc_section.size
2020             );
2021 
2022             let _mem_slot = self.create_userspace_mapping(
2023                 epc_section_start,
2024                 epc_section.size,
2025                 host_addr,
2026                 false,
2027                 false,
2028                 false,
2029             )?;
2030 
2031             sgx_epc_region.insert(
2032                 epc_section.id.clone(),
2033                 SgxEpcSection::new(
2034                     GuestAddress(epc_section_start),
2035                     epc_section.size as GuestUsize,
2036                 ),
2037             );
2038 
2039             epc_section_start += epc_section.size;
2040         }
2041 
2042         self.sgx_epc_region = Some(sgx_epc_region);
2043 
2044         Ok(())
2045     }
2046 
2047     #[cfg(target_arch = "x86_64")]
2048     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
2049         &self.sgx_epc_region
2050     }
2051 
2052     pub fn is_hardlink(f: &File) -> bool {
2053         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
2054         // SAFETY: FFI call with correct arguments
2055         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
2056         if ret != 0 {
2057             error!("Couldn't fstat the backing file");
2058             return false;
2059         }
2060 
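        // A file created by memfd_create() (or one that has been unlinked) has
        // st_nlink == 0, so a positive link count indicates the backing file is
        // still reachable through the host filesystem.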
2061         // SAFETY: stat is valid
2062         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
2063     }
2064 
2065     pub fn memory_zones(&self) -> &MemoryZones {
2066         &self.memory_zones
2067     }
2068 
2069     pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
2070         &mut self.memory_zones
2071     }
2072 
2073     pub fn memory_range_table(
2074         &self,
2075         snapshot: bool,
2076     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
2077         let mut table = MemoryRangeTable::default();
2078 
2079         for memory_zone in self.memory_zones.values() {
2080             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
2081                 table.extend(virtio_mem_zone.plugged_ranges());
2082             }
2083 
2084             for region in memory_zone.regions() {
2085                 if snapshot {
2086                     if let Some(file_offset) = region.file_offset() {
2087                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
2088                             && Self::is_hardlink(file_offset.file())
2089                         {
2090                             // In this very specific case, we know the memory
2091                             // region is backed by a file on the host filesystem
2092                             // that can be accessed by the user, and additionally
2093                             // the mapping is shared, which means that modifications
2094                             // to the content are written to the actual file.
2095                             // When meeting these conditions, we can skip the
2096                             // copy of the memory content for this specific region,
2097                             // as we can assume the user will have it saved through
2098                             // the backing file already.
2099                             continue;
2100                         }
2101                     }
2102                 }
2103 
2104                 table.push(MemoryRange {
2105                     gpa: region.start_addr().raw_value(),
2106                     length: region.len(),
2107                 });
2108             }
2109         }
2110 
2111         Ok(table)
2112     }
2113 
2114     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
2115         MemoryManagerSnapshotData {
2116             memory_ranges: self.snapshot_memory_ranges.clone(),
2117             guest_ram_mappings: self.guest_ram_mappings.clone(),
2118             start_of_device_area: self.start_of_device_area.0,
2119             boot_ram: self.boot_ram,
2120             current_ram: self.current_ram,
2121             arch_mem_regions: self.arch_mem_regions.clone(),
2122             hotplug_slots: self.hotplug_slots.clone(),
2123             next_memory_slot: self.next_memory_slot,
2124             selected_slot: self.selected_slot,
2125             next_hotplug_slot: self.next_hotplug_slot,
2126         }
2127     }
2128 
2129     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
2130         let mut memory_slot_fds = HashMap::new();
2131         for guest_ram_mapping in &self.guest_ram_mappings {
2132             let slot = guest_ram_mapping.slot;
2133             let guest_memory = self.guest_memory.memory();
2134             let file = guest_memory
2135                 .find_region(GuestAddress(guest_ram_mapping.gpa))
2136                 .unwrap()
2137                 .file_offset()
2138                 .unwrap()
2139                 .file();
2140             memory_slot_fds.insert(slot, file.as_raw_fd());
2141         }
2142         memory_slot_fds
2143     }
2144 
2145     pub fn acpi_address(&self) -> Option<GuestAddress> {
2146         self.acpi_address
2147     }
2148 
2149     pub fn num_guest_ram_mappings(&self) -> u32 {
2150         self.guest_ram_mappings.len() as u32
2151     }
2152 
2153     #[cfg(target_arch = "aarch64")]
2154     pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
2155         self.uefi_flash.as_ref().unwrap().clone()
2156     }
2157 
2158     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2159     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
2160         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
2161         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
2162 
2163         let mut mem_offset_in_elf = mem_offset;
2164         let mut ram_maps = BTreeMap::new();
2165         for mapping in mapping_sorted_by_gpa.iter() {
2166             ram_maps.insert(
2167                 mapping.gpa,
2168                 CoredumpMemoryRegion {
2169                     mem_offset_in_elf,
2170                     mem_size: mapping.size,
2171                 },
2172             );
2173             mem_offset_in_elf += mapping.size;
2174         }
2175 
2176         CoredumpMemoryRegions { ram_maps }
2177     }
2178 
2179     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2180     pub fn coredump_iterate_save_mem(
2181         &mut self,
2182         dump_state: &DumpState,
2183     ) -> std::result::Result<(), GuestDebuggableError> {
2184         let snapshot_memory_ranges = self
2185             .memory_range_table(false)
2186             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2187 
2188         if snapshot_memory_ranges.is_empty() {
2189             return Ok(());
2190         }
2191 
2192         let coredump_file = dump_state.file.as_ref().unwrap();
2193 
2194         let guest_memory = self.guest_memory.memory();
2195         let mut total_bytes: u64 = 0;
2196 
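        // write_volatile_to() may write fewer bytes than requested, so loop per
        // range until the whole range has been written to the coredump file.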
2197         for range in snapshot_memory_ranges.regions() {
2198             let mut offset: u64 = 0;
2199             loop {
2200                 let bytes_written = guest_memory
2201                     .write_volatile_to(
2202                         GuestAddress(range.gpa + offset),
2203                         &mut coredump_file.as_fd(),
2204                         (range.length - offset) as usize,
2205                     )
2206                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2207                 offset += bytes_written as u64;
2208                 total_bytes += bytes_written as u64;
2209 
2210                 if offset == range.length {
2211                     break;
2212                 }
2213             }
2214         }
2215 
2216         debug!("coredump total bytes {}", total_bytes);
2217         Ok(())
2218     }
2219 
2220     pub fn receive_memory_regions<F>(
2221         &mut self,
2222         ranges: &MemoryRangeTable,
2223         fd: &mut F,
2224     ) -> std::result::Result<(), MigratableError>
2225     where
2226         F: ReadVolatile,
2227     {
2228         let guest_memory = self.guest_memory();
2229         let mem = guest_memory.memory();
2230 
2231         for range in ranges.regions() {
2232             let mut offset: u64 = 0;
2233             // Here we manually handle the retry in case we can't read the
2234             // whole region at once, because we can't use the read_exact_from()
2235             // implementation from vm-memory::GuestMemory as it does not follow
2236             // the correct behavior. For more info about this issue
2237             // see: https://github.com/rust-vmm/vm-memory/issues/174
2238             loop {
2239                 let bytes_read = mem
2240                     .read_volatile_from(
2241                         GuestAddress(range.gpa + offset),
2242                         fd,
2243                         (range.length - offset) as usize,
2244                     )
2245                     .map_err(|e| {
2246                         MigratableError::MigrateReceive(anyhow!(
2247                             "Error receiving memory from socket: {}",
2248                             e
2249                         ))
2250                     })?;
2251                 offset += bytes_read as u64;
2252 
2253                 if offset == range.length {
2254                     break;
2255                 }
2256             }
2257         }
2258 
2259         Ok(())
2260     }
2261 }
2262 
2263 struct MemoryNotify {
2264     slot_id: usize,
2265 }
2266 
2267 impl Aml for MemoryNotify {
2268     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2269         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2270         aml::If::new(
2271             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2272             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2273         )
2274         .to_aml_bytes(sink)
2275     }
2276 }
2277 
2278 struct MemorySlot {
2279     slot_id: usize,
2280 }
2281 
2282 impl Aml for MemorySlot {
2283     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2284         aml::Device::new(
2285             format!("M{:03}", self.slot_id).as_str().into(),
2286             vec![
2287                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
2288                 &aml::Name::new("_UID".into(), &self.slot_id),
2289                 /*
2290                 _STA return value:
2291                 Bit [0] – Set if the device is present.
2292                 Bit [1] – Set if the device is enabled and decoding its resources.
2293                 Bit [2] – Set if the device should be shown in the UI.
2294                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2295                 Bit [4] – Set if the battery is present.
2296                 Bits [31:5] – Reserved (must be cleared).
2297                 */
2298                 &aml::Method::new(
2299                     "_STA".into(),
2300                     0,
2301                     false,
2302                     // Call into MSTA method which will interrogate device
2303                     vec![&aml::Return::new(&aml::MethodCall::new(
2304                         "MSTA".into(),
2305                         vec![&self.slot_id],
2306                     ))],
2307                 ),
2308                 // Get details of memory
2309                 &aml::Method::new(
2310                     "_CRS".into(),
2311                     0,
2312                     false,
2313                     // Call into MCRS which provides actual memory details
2314                     vec![&aml::Return::new(&aml::MethodCall::new(
2315                         "MCRS".into(),
2316                         vec![&self.slot_id],
2317                     ))],
2318                 ),
2319             ],
2320         )
2321         .to_aml_bytes(sink)
2322     }
2323 }
2324 
2325 struct MemorySlots {
2326     slots: usize,
2327 }
2328 
2329 impl Aml for MemorySlots {
2330     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2331         for slot_id in 0..self.slots {
2332             MemorySlot { slot_id }.to_aml_bytes(sink);
2333         }
2334     }
2335 }
2336 
2337 struct MemoryMethods {
2338     slots: usize,
2339 }
2340 
2341 impl Aml for MemoryMethods {
2342     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2343         // Add "MTFY" notification method
2344         let mut memory_notifies = Vec::new();
2345         for slot_id in 0..self.slots {
2346             memory_notifies.push(MemoryNotify { slot_id });
2347         }
2348 
2349         let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
2350         for memory_notifier in memory_notifies.iter() {
2351             memory_notifies_refs.push(memory_notifier);
2352         }
2353 
2354         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);
2355 
2356         // MSCN method
2357         aml::Method::new(
2358             "MSCN".into(),
2359             0,
2360             true,
2361             vec![
2362                 // Take lock defined above
2363                 &aml::Acquire::new("MLCK".into(), 0xffff),
2364                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2365                 &aml::While::new(
2366                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2367                     vec![
2368                         // Write slot number (in first argument) to I/O port via field
2369                         // Write the slot number (loop counter in Local0) to I/O port via field
2370                         // Check if MINS bit is set (inserting)
2371                         &aml::If::new(
2372                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2373                             // Notify device if it is
2374                             vec![
2375                                 &aml::MethodCall::new(
2376                                     "MTFY".into(),
2377                                     vec![&aml::Local(0), &aml::ONE],
2378                                 ),
2379                                 // Reset MINS bit
2380                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2381                             ],
2382                         ),
2383                         // Check if MRMV bit is set
2384                         &aml::If::new(
2385                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2386                             // Notify device if it is (with the eject constant 0x3)
2387                             vec![
2388                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2389                                 // Reset MRMV bit
2390                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2391                             ],
2392                         ),
2393                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2394                     ],
2395                 ),
2396                 // Release lock
2397                 &aml::Release::new("MLCK".into()),
2398             ],
2399         )
2400         .to_aml_bytes(sink);
2401 
2402         // Memory status method
2403         aml::Method::new(
2404             "MSTA".into(),
2405             1,
2406             true,
2407             vec![
2408                 // Take lock defined above
2409                 &aml::Acquire::new("MLCK".into(), 0xffff),
2410                 // Write slot number (in first argument) to I/O port via field
2411                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2412                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2413                 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2414                 &aml::If::new(
2415                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2416                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2417                 ),
2418                 // Release lock
2419                 &aml::Release::new("MLCK".into()),
2420                 // Return 0 or 0xf
2421                 &aml::Return::new(&aml::Local(0)),
2422             ],
2423         )
2424         .to_aml_bytes(sink);
2425 
2426         // Memory range method
2427         aml::Method::new(
2428             "MCRS".into(),
2429             1,
2430             true,
2431             vec![
2432                 // Take lock defined above
2433                 &aml::Acquire::new("MLCK".into(), 0xffff),
2434                 // Write slot number (in first argument) to I/O port via field
2435                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2436                 &aml::Name::new(
2437                     "MR64".into(),
2438                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2439                         aml::AddressSpaceCacheable::Cacheable,
2440                         true,
2441                         0x0000_0000_0000_0000u64,
2442                         0xFFFF_FFFF_FFFF_FFFEu64,
2443                         None,
2444                     )]),
2445                 ),
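                // The named fields below overlay the _MIN, _MAX and _LEN entries
                // of the QWORD address-space descriptor inside MR64 so they can
                // be filled in from the MHPC MMIO registers at runtime.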
2446                 &aml::CreateQWordField::new(
2447                     &aml::Path::new("MINL"),
2448                     &aml::Path::new("MR64"),
2449                     &14usize,
2450                 ),
2451                 &aml::CreateDWordField::new(
2452                     &aml::Path::new("MINH"),
2453                     &aml::Path::new("MR64"),
2454                     &18usize,
2455                 ),
2456                 &aml::CreateQWordField::new(
2457                     &aml::Path::new("MAXL"),
2458                     &aml::Path::new("MR64"),
2459                     &22usize,
2460                 ),
2461                 &aml::CreateDWordField::new(
2462                     &aml::Path::new("MAXH"),
2463                     &aml::Path::new("MR64"),
2464                     &26usize,
2465                 ),
2466                 &aml::CreateQWordField::new(
2467                     &aml::Path::new("LENL"),
2468                     &aml::Path::new("MR64"),
2469                     &38usize,
2470                 ),
2471                 &aml::CreateDWordField::new(
2472                     &aml::Path::new("LENH"),
2473                     &aml::Path::new("MR64"),
2474                     &42usize,
2475                 ),
2476                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2477                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2478                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2479                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
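                // Compute _MAX = _MIN + _LEN - 1 on the split fields: carry into
                // MAXH if the low-part addition wrapped, then subtract one from MAXL.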
2480                 &aml::Add::new(
2481                     &aml::Path::new("MAXL"),
2482                     &aml::Path::new("MINL"),
2483                     &aml::Path::new("LENL"),
2484                 ),
2485                 &aml::Add::new(
2486                     &aml::Path::new("MAXH"),
2487                     &aml::Path::new("MINH"),
2488                     &aml::Path::new("LENH"),
2489                 ),
2490                 &aml::If::new(
2491                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2492                     vec![&aml::Add::new(
2493                         &aml::Path::new("MAXH"),
2494                         &aml::ONE,
2495                         &aml::Path::new("MAXH"),
2496                     )],
2497                 ),
2498                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2499                 // Release lock
2500                 &aml::Release::new("MLCK".into()),
2501                 &aml::Return::new(&aml::Path::new("MR64")),
2502             ],
2503         )
2504         .to_aml_bytes(sink)
2505     }
2506 }
2507 
2508 impl Aml for MemoryManager {
2509     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2510         if let Some(acpi_address) = self.acpi_address {
2511             // Memory Hotplug Controller
2512             aml::Device::new(
2513                 "_SB_.MHPC".into(),
2514                 vec![
2515                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2516                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2517                     // Mutex to protect concurrent access as we write to choose slot and then read back status
2518                     // Mutex to protect concurrent access, as we write to choose a slot and then read back its status
2519                     &aml::Name::new(
2520                         "_CRS".into(),
2521                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2522                             aml::AddressSpaceCacheable::NotCacheable,
2523                             true,
2524                             acpi_address.0,
2525                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2526                             None,
2527                         )]),
2528                     ),
2529                     // OpRegion and Fields map MMIO range into individual field values
2530                     &aml::OpRegion::new(
2531                         "MHPR".into(),
2532                         aml::OpRegionSpace::SystemMemory,
2533                         &(acpi_address.0 as usize),
2534                         &MEMORY_MANAGER_ACPI_SIZE,
2535                     ),
2536                     &aml::Field::new(
2537                         "MHPR".into(),
2538                         aml::FieldAccessType::DWord,
2539                         aml::FieldLockRule::NoLock,
2540                         aml::FieldUpdateRule::Preserve,
2541                         vec![
2542                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2543                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2544                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2545                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2546                         ],
2547                     ),
2548                     &aml::Field::new(
2549                         "MHPR".into(),
2550                         aml::FieldAccessType::DWord,
2551                         aml::FieldLockRule::NoLock,
2552                         aml::FieldUpdateRule::Preserve,
2553                         vec![
2554                             aml::FieldEntry::Reserved(128),
2555                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2556                         ],
2557                     ),
2558                     &aml::Field::new(
2559                         "MHPR".into(),
2560                         aml::FieldAccessType::Byte,
2561                         aml::FieldLockRule::NoLock,
2562                         aml::FieldUpdateRule::WriteAsZeroes,
2563                         vec![
2564                             aml::FieldEntry::Reserved(160),
2565                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2566                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2567                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2568                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2569                         ],
2570                     ),
2571                     &aml::Field::new(
2572                         "MHPR".into(),
2573                         aml::FieldAccessType::DWord,
2574                         aml::FieldLockRule::NoLock,
2575                         aml::FieldUpdateRule::Preserve,
2576                         vec![
2577                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2578                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2579                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2580                         ],
2581                     ),
2582                     &MemoryMethods {
2583                         slots: self.hotplug_slots.len(),
2584                     },
2585                     &MemorySlots {
2586                         slots: self.hotplug_slots.len(),
2587                     },
2588                 ],
2589             )
2590             .to_aml_bytes(sink);
2591         } else {
2592             aml::Device::new(
2593                 "_SB_.MHPC".into(),
2594                 vec![
2595                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2596                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2597                     // Empty MSCN for GED
2598                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2599                 ],
2600             )
2601             .to_aml_bytes(sink);
2602         }
2603 
2604         #[cfg(target_arch = "x86_64")]
2605         {
2606             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2607                 let min = sgx_epc_region.start().raw_value();
2608                 let max = min + sgx_epc_region.size() - 1;
2609                 // SGX EPC region
2610                 aml::Device::new(
2611                     "_SB_.EPC_".into(),
2612                     vec![
2613                         &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
2614                         // QWORD describing the EPC region start and size
2615                         &aml::Name::new(
2616                             "_CRS".into(),
2617                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2618                                 aml::AddressSpaceCacheable::NotCacheable,
2619                                 true,
2620                                 min,
2621                                 max,
2622                                 None,
2623                             )]),
2624                         ),
2625                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2626                     ],
2627                 )
2628                 .to_aml_bytes(sink);
2629             }
2630         }
2631     }
2632 }
2633 
2634 impl Pausable for MemoryManager {}
2635 
2636 #[derive(Clone, Serialize, Deserialize)]
2637 pub struct MemoryManagerSnapshotData {
2638     memory_ranges: MemoryRangeTable,
2639     guest_ram_mappings: Vec<GuestRamMapping>,
2640     start_of_device_area: u64,
2641     boot_ram: u64,
2642     current_ram: u64,
2643     arch_mem_regions: Vec<ArchMemRegion>,
2644     hotplug_slots: Vec<HotPlugState>,
2645     next_memory_slot: u32,
2646     selected_slot: usize,
2647     next_hotplug_slot: usize,
2648 }
2649 
2650 impl Snapshottable for MemoryManager {
2651     fn id(&self) -> String {
2652         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2653     }
2654 
2655     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2656         let memory_ranges = self.memory_range_table(true)?;
2657 
2658         // Store locally this list of ranges as it will be used through the
2659         // Transportable::send() implementation. The point is to avoid the
2660         // duplication of code regarding the creation of the path for each
2661         // region. The 'snapshot' step creates the list of memory regions,
2662         // including information about the need to copy a memory region or
2663         // not. This saves the 'send' step having to go through the same
2664         // process, and instead it can directly proceed with storing the
2665         // memory range content for the ranges requiring it.
2666         self.snapshot_memory_ranges = memory_ranges;
2667 
2668         Ok(Snapshot::from_data(SnapshotData::new_from_state(
2669             &self.snapshot_data(),
2670         )?))
2671     }
2672 }
2673 
2674 impl Transportable for MemoryManager {
2675     fn send(
2676         &self,
2677         _snapshot: &Snapshot,
2678         destination_url: &str,
2679     ) -> result::Result<(), MigratableError> {
2680         if self.snapshot_memory_ranges.is_empty() {
2681             return Ok(());
2682         }
2683 
2684         let mut memory_file_path = url_to_path(destination_url)?;
2685         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2686 
2687         // Create the snapshot file for the entire memory
2688         let mut memory_file = OpenOptions::new()
2689             .read(true)
2690             .write(true)
2691             .create_new(true)
2692             .open(memory_file_path)
2693             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2694 
2695         let guest_memory = self.guest_memory.memory();
2696 
2697         for range in self.snapshot_memory_ranges.regions() {
2698             let mut offset: u64 = 0;
2699             // Here we manually handle the retry in case we can't write the
2700             // whole region at once, because we can't use the write_all_to()
2701             // implementation from vm-memory::GuestMemory as it does not follow
2702             // the correct behavior. For more info about this issue
2703             // see: https://github.com/rust-vmm/vm-memory/issues/174
2704             loop {
2705                 let bytes_written = guest_memory
2706                     .write_volatile_to(
2707                         GuestAddress(range.gpa + offset),
2708                         &mut memory_file,
2709                         (range.length - offset) as usize,
2710                     )
2711                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2712                 offset += bytes_written as u64;
2713 
2714                 if offset == range.length {
2715                     break;
2716                 }
2717             }
2718         }
2719         Ok(())
2720     }
2721 }
2722 
2723 impl Migratable for MemoryManager {
2724     // Start the dirty log in the hypervisor (kvm/mshv).
2725     // Also, reset the dirty bitmap logged by the vmm.
2726     // Just before we do a bulk copy we want to start/clear the dirty log so that
2727     // pages touched during our bulk copy are tracked.
2728     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2729         self.vm.start_dirty_log().map_err(|e| {
2730             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2731         })?;
2732 
2733         for r in self.guest_memory.memory().iter() {
2734             r.bitmap().reset();
2735         }
2736 
2737         Ok(())
2738     }
2739 
2740     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2741         self.vm.stop_dirty_log().map_err(|e| {
2742             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2743         })?;
2744 
2745         Ok(())
2746     }
2747 
2748     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2749     // together in the table if they are contiguous.
2750     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2751         let mut table = MemoryRangeTable::default();
2752         for r in &self.guest_ram_mappings {
2753             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2754                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2755             })?;
2756             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2757             {
2758                 Some(region) => {
2759                     assert!(region.start_addr().raw_value() == r.gpa);
2760                     assert!(region.len() == r.size);
2761                     region.bitmap().get_and_reset()
2762                 }
2763                 None => {
2764                     return Err(MigratableError::MigrateSend(anyhow!(
2765                         "Error finding 'guest memory region' with address {:x}",
2766                         r.gpa
2767                     )))
2768                 }
2769             };
2770 
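            // Merge the two views: a page is considered dirty if either the
            // hypervisor's dirty log or the VMM's own write-access bitmap flagged it.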
2771             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2772                 .iter()
2773                 .zip(vmm_dirty_bitmap.iter())
2774                 .map(|(x, y)| x | y)
2775                 .collect();
2776 
2777             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2778 
2779             if sub_table.regions().is_empty() {
2780                 info!("Dirty Memory Range Table is empty");
2781             } else {
2782                 info!("Dirty Memory Range Table:");
2783                 for range in sub_table.regions() {
2784                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2785                 }
2786             }
2787 
2788             table.extend(sub_table);
2789         }
2790         Ok(table)
2791     }
2792 }
2793