xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision 30cf1eed5e63499f3101ed320fc384b59c60fc6b)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 
6 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
7 use std::collections::BTreeMap;
8 use std::collections::HashMap;
9 use std::fs::{File, OpenOptions};
10 use std::io::{self};
11 use std::ops::{BitAnd, Deref, Not, Sub};
12 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
13 use std::os::fd::AsFd;
14 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
15 use std::path::PathBuf;
16 use std::sync::atomic::{AtomicU32, Ordering};
17 use std::sync::{Arc, Barrier, Mutex};
18 use std::{ffi, result, thread};
19 
20 use acpi_tables::{aml, Aml};
21 use anyhow::anyhow;
22 #[cfg(target_arch = "x86_64")]
23 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
24 use arch::RegionType;
25 #[cfg(target_arch = "x86_64")]
26 use devices::ioapic;
27 #[cfg(target_arch = "aarch64")]
28 use hypervisor::HypervisorVmError;
29 use libc::_SC_NPROCESSORS_ONLN;
30 #[cfg(target_arch = "x86_64")]
31 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
32 use serde::{Deserialize, Serialize};
33 use tracer::trace_scoped;
34 use virtio_devices::BlocksState;
35 #[cfg(target_arch = "x86_64")]
36 use vm_allocator::GsiApic;
37 use vm_allocator::{AddressAllocator, MemorySlotAllocator, SystemAllocator};
38 use vm_device::BusDevice;
39 use vm_memory::bitmap::AtomicBitmap;
40 use vm_memory::guest_memory::FileOffset;
41 use vm_memory::mmap::MmapRegionError;
42 use vm_memory::{
43     Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
44     GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile,
45 };
46 use vm_migration::protocol::{MemoryRange, MemoryRangeTable};
47 use vm_migration::{
48     Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, Transportable,
49 };
50 
51 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
52 use crate::coredump::{
53     CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
54 };
55 use crate::migration::url_to_path;
56 #[cfg(target_arch = "x86_64")]
57 use crate::vm_config::SgxEpcConfig;
58 use crate::vm_config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
59 use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID};
60 
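// Note: 0x18 (24) bytes corresponds to the hotplug register window defined
// further below, i.e. the registers from SELECTION_OFFSET up to and including
// the 4-byte register at STATUS_OFFSET (0x14 + 4 = 0x18).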
61 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
62 
63 const DEFAULT_MEMORY_ZONE: &str = "mem0";
64 
65 const SNAPSHOT_FILENAME: &str = "memory-ranges";
66 
67 #[cfg(target_arch = "x86_64")]
68 const X86_64_IRQ_BASE: u32 = 5;
69 
70 #[cfg(target_arch = "x86_64")]
71 const SGX_PAGE_SIZE: u64 = 1 << 12;
72 
73 const HOTPLUG_COUNT: usize = 8;
74 
75 // Memory policy constants
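// These mirror MPOL_BIND, MPOL_MF_STRICT and MPOL_MF_MOVE from the Linux uapi
// header <linux/mempolicy.h>; they are passed straight through to the
// mbind(2) syscall wrapper implemented further down in this file.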
76 const MPOL_BIND: u32 = 2;
77 const MPOL_MF_STRICT: u32 = 1;
78 const MPOL_MF_MOVE: u32 = 1 << 1;
79 
80 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
81 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
82 
83 const MAX_PREFAULT_THREAD_COUNT: usize = 16;
84 
85 #[derive(Clone, Default, Serialize, Deserialize)]
86 struct HotPlugState {
87     base: u64,
88     length: u64,
89     active: bool,
90     inserting: bool,
91     removing: bool,
92 }
93 
94 pub struct VirtioMemZone {
95     region: Arc<GuestRegionMmap>,
96     virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
97     hotplugged_size: u64,
98     hugepages: bool,
99     blocks_state: Arc<Mutex<BlocksState>>,
100 }
101 
102 impl VirtioMemZone {
103     pub fn region(&self) -> &Arc<GuestRegionMmap> {
104         &self.region
105     }
106     pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
107         self.virtio_device = Some(virtio_device);
108     }
109     pub fn hotplugged_size(&self) -> u64 {
110         self.hotplugged_size
111     }
112     pub fn hugepages(&self) -> bool {
113         self.hugepages
114     }
115     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
116         &self.blocks_state
117     }
118     pub fn plugged_ranges(&self) -> MemoryRangeTable {
119         self.blocks_state
120             .lock()
121             .unwrap()
122             .memory_ranges(self.region.start_addr().raw_value(), true)
123     }
124 }
125 
126 #[derive(Default)]
127 pub struct MemoryZone {
128     regions: Vec<Arc<GuestRegionMmap>>,
129     virtio_mem_zone: Option<VirtioMemZone>,
130 }
131 
132 impl MemoryZone {
133     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
134         &self.regions
135     }
136     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
137         &self.virtio_mem_zone
138     }
139     pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
140         self.virtio_mem_zone.as_mut()
141     }
142 }
143 
144 pub type MemoryZones = HashMap<String, MemoryZone>;
145 
146 #[derive(Clone, Serialize, Deserialize)]
147 struct GuestRamMapping {
148     slot: u32,
149     gpa: u64,
150     size: u64,
151     zone_id: String,
152     virtio_mem: bool,
153     file_offset: u64,
154 }
155 
156 #[derive(Clone, Serialize, Deserialize)]
157 struct ArchMemRegion {
158     base: u64,
159     size: usize,
160     r_type: RegionType,
161 }
162 
163 pub struct MemoryManager {
164     boot_guest_memory: GuestMemoryMmap,
165     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
166     next_memory_slot: Arc<AtomicU32>,
167     memory_slot_free_list: Arc<Mutex<Vec<u32>>>,
168     start_of_device_area: GuestAddress,
169     end_of_device_area: GuestAddress,
170     end_of_ram_area: GuestAddress,
171     pub vm: Arc<dyn hypervisor::Vm>,
172     hotplug_slots: Vec<HotPlugState>,
173     selected_slot: usize,
174     mergeable: bool,
175     allocator: Arc<Mutex<SystemAllocator>>,
176     hotplug_method: HotplugMethod,
177     boot_ram: u64,
178     current_ram: u64,
179     next_hotplug_slot: usize,
180     shared: bool,
181     hugepages: bool,
182     hugepage_size: Option<u64>,
183     prefault: bool,
184     thp: bool,
185     #[cfg(target_arch = "x86_64")]
186     sgx_epc_region: Option<SgxEpcRegion>,
187     user_provided_zones: bool,
188     snapshot_memory_ranges: MemoryRangeTable,
189     memory_zones: MemoryZones,
190     log_dirty: bool, // Enable dirty logging for created RAM regions
191     arch_mem_regions: Vec<ArchMemRegion>,
192     ram_allocator: AddressAllocator,
193     dynamic: bool,
194 
195     // Keep track of calls to create_userspace_mapping() for guest RAM.
196     // This is useful for getting the dirty pages as we need to know the
197     // slots that the mapping is created in.
198     guest_ram_mappings: Vec<GuestRamMapping>,
199 
200     pub acpi_address: Option<GuestAddress>,
201     #[cfg(target_arch = "aarch64")]
202     uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
203 }
204 
205 #[derive(Debug)]
206 pub enum Error {
207     /// Failed to create shared file.
208     SharedFileCreate(io::Error),
209 
210     /// Failed to set shared file length.
211     SharedFileSetLen(io::Error),
212 
213     /// Mmap backed guest memory error
214     GuestMemory(MmapError),
215 
216     /// Failed to allocate a memory range.
217     MemoryRangeAllocation,
218 
219     /// Error from region creation
220     GuestMemoryRegion(MmapRegionError),
221 
222     /// No ACPI slot available
223     NoSlotAvailable,
224 
225     /// Not enough space in the hotplug RAM region
226     InsufficientHotplugRam,
227 
228     /// The requested hotplug memory addition is not a valid size
229     InvalidSize,
230 
231     /// Failed to create the user memory region.
232     CreateUserMemoryRegion(hypervisor::HypervisorVmError),
233 
234     /// Failed to remove the user memory region.
235     RemoveUserMemoryRegion(hypervisor::HypervisorVmError),
236 
237     /// Failed to create EventFd.
238     EventFdFail(io::Error),
239 
240     /// Eventfd write error
241     EventfdError(io::Error),
242 
243     /// Failed to resize virtio-mem
244     VirtioMemResizeFail(virtio_devices::mem::Error),
245 
246     /// Cannot restore VM
247     Restore(MigratableError),
248 
249     /// Cannot restore VM because source URL is missing
250     RestoreMissingSourceUrl,
251 
252     /// Cannot create the system allocator
253     CreateSystemAllocator,
254 
255     /// Invalid SGX EPC section size
256     #[cfg(target_arch = "x86_64")]
257     EpcSectionSizeInvalid,
258 
259     /// Failed allocating SGX EPC region
260     #[cfg(target_arch = "x86_64")]
261     SgxEpcRangeAllocation,
262 
263     /// Failed opening SGX virtual EPC device
264     #[cfg(target_arch = "x86_64")]
265     SgxVirtEpcOpen(io::Error),
266 
267     /// Failed setting the SGX virtual EPC section size
268     #[cfg(target_arch = "x86_64")]
269     SgxVirtEpcFileSetLen(io::Error),
270 
271     /// Failed opening SGX provisioning device
272     #[cfg(target_arch = "x86_64")]
273     SgxProvisionOpen(io::Error),
274 
275     /// Failed enabling SGX provisioning
276     #[cfg(target_arch = "x86_64")]
277     SgxEnableProvisioning(hypervisor::HypervisorVmError),
278 
279     /// Failed creating a new MmapRegion instance.
280     #[cfg(target_arch = "x86_64")]
281     NewMmapRegion(vm_memory::mmap::MmapRegionError),
282 
283     /// No memory zones found.
284     MissingMemoryZones,
285 
286     /// Memory configuration is not valid.
287     InvalidMemoryParameters,
288 
289     /// Forbidden operation. Impossible to resize guest memory if it is
290     /// backed by user defined memory regions.
291     InvalidResizeWithMemoryZones,
292 
293     /// It's invalid to try applying a NUMA policy to a memory zone that is
294     /// memory mapped with MAP_SHARED.
295     InvalidSharedMemoryZoneWithHostNuma,
296 
297     /// Failed applying NUMA memory policy.
298     ApplyNumaPolicy(io::Error),
299 
300     /// Memory zone identifier is not unique.
301     DuplicateZoneId,
302 
303     /// No virtio-mem resizing handler found.
304     MissingVirtioMemHandler,
305 
306     /// Unknown memory zone.
307     UnknownMemoryZone,
308 
309     /// Invalid size for resizing. Any size other than 0 is valid.
310     InvalidHotplugSize,
311 
312     /// Invalid hotplug method associated with memory zones resizing capability.
313     InvalidHotplugMethodWithMemoryZones,
314 
315     /// Could not find specified memory zone identifier from hash map.
316     MissingZoneIdentifier,
317 
318     /// Resizing the memory zone failed.
319     ResizeZone,
320 
321     /// Guest address overflow
322     GuestAddressOverFlow,
323 
324     /// Error opening snapshot file
325     SnapshotOpen(io::Error),
326 
327     /// Error copying snapshot into region
328     SnapshotCopy(GuestMemoryError),
329 
330     /// Failed to allocate MMIO address
331     AllocateMmioAddress,
332 
333     #[cfg(target_arch = "aarch64")]
334     /// Failed to create UEFI flash
335     CreateUefiFlash(HypervisorVmError),
336 
337     /// Using a directory as a backing file for memory is not supported
338     DirectoryAsBackingFileForMemory,
339 
340     /// Failed to stat filesystem
341     GetFileSystemBlockSize(io::Error),
342 
343     /// Memory size is misaligned with default page size or its hugepage size
344     /// Memory size is not aligned to the default page size or to the hugepage size
345 }
346 
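// Register layout of the memory hotplug (DIMM) device exposed to the guest.
// The *_FLAG values are bit positions within the status register, while the
// *_OFFSET values are byte offsets of the registers implemented by the
// BusDevice read()/write() handlers further below.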
347 const ENABLE_FLAG: usize = 0;
348 const INSERTING_FLAG: usize = 1;
349 const REMOVING_FLAG: usize = 2;
350 const EJECT_FLAG: usize = 3;
351 
352 const BASE_OFFSET_LOW: u64 = 0;
353 const BASE_OFFSET_HIGH: u64 = 0x4;
354 const LENGTH_OFFSET_LOW: u64 = 0x8;
355 const LENGTH_OFFSET_HIGH: u64 = 0xC;
356 const STATUS_OFFSET: u64 = 0x14;
357 const SELECTION_OFFSET: u64 = 0;
358 
359 // The MMIO address space size is reduced by 64k. This is done for the
360 // following reasons:
361 //  - Reduce the addressable space size by at least 4k to work around a Linux
362 //    bug when the VMM allocates devices at the end of the addressable space
363 //  - Windows requires the addressable space size to be 64k aligned
364 fn mmio_address_space_size(phys_bits: u8) -> u64 {
365     (1 << phys_bits) - (1 << 16)
366 }
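// Worked example (illustrative value): for phys_bits = 40 this returns
// (1 << 40) - (1 << 16) = 0xFF_FFFF_0000, i.e. 64 KiB short of the full 1 TiB
// addressable range and still 64 KiB aligned, as required above.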
367 
368 // The `statfs` function can be used to query a hugetlbfs mount, and the hugepage size is
369 // reported in the `f_bsize` field.
370 //
371 // See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
372 fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
373     let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
374     let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();
375 
376     // SAFETY: FFI call with a valid path and buffer
377     let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
378     if ret != 0 {
379         return Err(Error::GetFileSystemBlockSize(
380             std::io::Error::last_os_error(),
381         ));
382     }
383 
384     // SAFETY: `buf` is valid at this point
385     // Because this value is always positive, just convert it directly.
386     // Note that `f_bsize` is `i64` in glibc and `u64` in musl, so `as u64` would trigger a
387     // clippy warning on musl targets.  To avoid the warning, `as _` is used instead of
388     // `as u64`.
389     let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
390     Ok(bsize)
391 }
392 
393 fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
394     // SAFETY: FFI call. Trivially safe.
395     let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
396 
397     // If there is no backing file and `hugepages` is disabled, just use the system page size.
398     if zone.file.is_none() && !zone.hugepages {
399         return Ok(page_size);
400     }
401 
402     // If `hugepages` is enabled and `hugepage_size` is specified, just use it directly.
403     if zone.hugepages && zone.hugepage_size.is_some() {
404         return Ok(zone.hugepage_size.unwrap());
405     }
406 
407     // There are two scenarios here:
408     //  - `hugepages` is enabled but `hugepage_size` is not specified:
409     //     Call `statfs` on `/dev/hugepages` to get the default hugepage size
410     //  - The backing file is specified:
411     //     Call `statfs` on the file and get its `f_bsize`.  If the value is larger than the
412     //     normal page size, use the `f_bsize` because the file is in a hugetlbfs.  If the
413     //     value is less than or equal to the page size, just use the page size.
414     let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| {
415         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
416     })?;
417 
418     let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
419 
420     Ok(align_size)
421 }
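// Illustration of the resulting alignment (typical values, not exhaustive): a
// zone with hugepages = true and hugepage_size = Some(1 << 30) aligns to
// 1 GiB; a zone with hugepages = true and no explicit size falls back to the
// default size reported by /dev/hugepages (commonly 2 MiB); a plain anonymous
// zone aligns to the 4 KiB system page size.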
422 
423 #[inline]
424 fn align_down<T>(val: T, align: T) -> T
425 where
426     T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>,
427 {
428     val & !(align - 1u8.into())
429 }
430 
431 #[inline]
432 fn is_aligned<T>(val: T, align: T) -> bool
433 where
434     T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq,
435 {
436     (val & (align - 1u8.into())) == 0u8.into()
437 }
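// Quick illustration (hypothetical values): with a 2 MiB alignment,
// align_down(0x2012_3456u64, 0x20_0000) == 0x2000_0000 and
// is_aligned(0x40_0000u64, 0x20_0000) == true.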
438 
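// Guest interaction sketch (based on the handlers below): the ACPI AML code
// first writes a slot index to SELECTION_OFFSET, then reads the BASE, LENGTH
// and STATUS registers to discover the INSERTING/REMOVING flags, and finally
// writes the same flag bit back to STATUS_OFFSET to acknowledge, which clears
// the corresponding `inserting`/`removing` field of the selected HotPlugState.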
439 impl BusDevice for MemoryManager {
440     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
441         if self.selected_slot < self.hotplug_slots.len() {
442             let state = &self.hotplug_slots[self.selected_slot];
443             match offset {
444                 BASE_OFFSET_LOW => {
445                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
446                 }
447                 BASE_OFFSET_HIGH => {
448                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
449                 }
450                 LENGTH_OFFSET_LOW => {
451                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
452                 }
453                 LENGTH_OFFSET_HIGH => {
454                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
455                 }
456                 STATUS_OFFSET => {
457                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
458                     data.fill(0);
459                     if state.active {
460                         data[0] |= 1 << ENABLE_FLAG;
461                     }
462                     if state.inserting {
463                         data[0] |= 1 << INSERTING_FLAG;
464                     }
465                     if state.removing {
466                         data[0] |= 1 << REMOVING_FLAG;
467                     }
468                 }
469                 _ => {
470                     warn!(
471                         "Unexpected offset for accessing memory manager device: {:#}",
472                         offset
473                     );
474                 }
475             }
476         } else {
477             warn!("Out of range memory slot: {}", self.selected_slot);
478         }
479     }
480 
481     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
482         match offset {
483             SELECTION_OFFSET => {
484                 self.selected_slot = usize::from(data[0]);
485             }
486             STATUS_OFFSET => {
487                 if self.selected_slot < self.hotplug_slots.len() {
488                     let state = &mut self.hotplug_slots[self.selected_slot];
489                     // The ACPI code writes back a 1 to acknowledge the insertion
490                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
491                         state.inserting = false;
492                     }
493                     // Ditto for removal
494                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
495                         state.removing = false;
496                     }
497                     // Trigger removal of "DIMM"
498                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
499                         warn!("Ejection of memory not currently supported");
500                     }
501                 } else {
502                     warn!("Out of range memory slot: {}", self.selected_slot);
503                 }
504             }
505             _ => {
506                 warn!(
507                     "Unexpected offset for accessing memory manager device: {:#}",
508                     offset
509                 );
510             }
511         };
512         None
513     }
514 }
515 
516 impl MemoryManager {
517     /// Creates all memory regions based on the available RAM ranges defined
518     /// by `ram_regions`, and based on the description of the memory zones.
519     /// In practice, this function can perform multiple memory mappings of the
520     /// same backing file if there's a hole in the address space between two
521     /// RAM ranges.
522     ///
523     /// One example might be `ram_regions` containing two regions (0-3G and 4G-6G)
524     /// and `zones` containing two zones (of size 1G and size 4G).
525     ///
526     /// This function will create 3 resulting memory regions:
527     /// - First one mapping entirely the first memory zone on 0-1G range
528     /// - Second one mapping partially the second memory zone on 1G-3G range
529     /// - Third one mapping partially the second memory zone on 4G-6G range
530     ///
531     /// Also, all memory regions are page-size aligned (i.e. their sizes must
532     /// be a multiple of the page size), which may leave an additional hole in the
533     /// address space when hugepages are used.
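    ///
    /// For instance (illustration of the example above):
    ///
    /// ```text
    /// ram_regions: [0G..3G), [4G..6G)
    /// zones:       zone0 (1G), zone1 (4G)
    /// result:      [0G..1G) -> zone0
    ///              [1G..3G) -> zone1 (first 2G)
    ///              [4G..6G) -> zone1 (remaining 2G)
    /// ```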
534     fn create_memory_regions_from_zones(
535         ram_regions: &[(GuestAddress, usize)],
536         zones: &[MemoryZoneConfig],
537         prefault: Option<bool>,
538         thp: bool,
539     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
540         let mut zone_iter = zones.iter();
541         let mut mem_regions = Vec::new();
542         let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?;
543         let mut zone_align_size = memory_zone_get_align_size(zone)?;
544         let mut zone_offset = 0u64;
545         let mut memory_zones = HashMap::new();
546 
547         if !is_aligned(zone.size, zone_align_size) {
548             return Err(Error::MisalignedMemorySize);
549         }
550 
551         // Add zone id to the list of memory zones.
552         memory_zones.insert(zone.id.clone(), MemoryZone::default());
553 
554         for ram_region in ram_regions.iter() {
555             let mut ram_region_offset = 0;
556             let mut exit = false;
557 
558             loop {
559                 let mut ram_region_consumed = false;
560                 let mut pull_next_zone = false;
561 
562                 let ram_region_available_size =
563                     align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size);
564                 if ram_region_available_size == 0 {
565                     break;
566                 }
567                 let zone_sub_size = zone.size - zone_offset;
568 
569                 let file_offset = zone_offset;
570                 let region_start = ram_region
571                     .0
572                     .checked_add(ram_region_offset)
573                     .ok_or(Error::GuestAddressOverFlow)?;
574                 let region_size = if zone_sub_size <= ram_region_available_size {
575                     if zone_sub_size == ram_region_available_size {
576                         ram_region_consumed = true;
577                     }
578 
579                     ram_region_offset += zone_sub_size;
580                     pull_next_zone = true;
581 
582                     zone_sub_size
583                 } else {
584                     zone_offset += ram_region_available_size;
585                     ram_region_consumed = true;
586 
587                     ram_region_available_size
588                 };
589 
590                 info!(
591                     "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}",
592                     zone.id,
593                     region_start.raw_value(),
594                     region_size
595                 );
596                 let region = MemoryManager::create_ram_region(
597                     &zone.file,
598                     file_offset,
599                     region_start,
600                     region_size as usize,
601                     prefault.unwrap_or(zone.prefault),
602                     zone.shared,
603                     zone.hugepages,
604                     zone.hugepage_size,
605                     zone.host_numa_node,
606                     None,
607                     thp,
608                 )?;
609 
610                 // Add region to the list of regions associated with the
611                 // current memory zone.
612                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
613                     memory_zone.regions.push(region.clone());
614                 }
615 
616                 mem_regions.push(region);
617 
618                 if pull_next_zone {
619                     // Get the next zone and reset the offset.
620                     zone_offset = 0;
621                     if let Some(z) = zone_iter.next() {
622                         zone = z;
623                     } else {
624                         exit = true;
625                         break;
626                     }
627                     zone_align_size = memory_zone_get_align_size(zone)?;
628                     if !is_aligned(zone.size, zone_align_size) {
629                         return Err(Error::MisalignedMemorySize);
630                     }
631 
632                     // Check if the zone id already exists. In case it does, throw
633                     // an error as we need unique identifiers. Otherwise, add
634                     // the new zone id to the list of memory zones.
635                     if memory_zones.contains_key(&zone.id) {
636                         error!(
637                             "Memory zone identifier '{}' found more than once. \
638                             It must be unique",
639                             zone.id,
640                         );
641                         return Err(Error::DuplicateZoneId);
642                     }
643                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
644                 }
645 
646                 if ram_region_consumed {
647                     break;
648                 }
649             }
650 
651             if exit {
652                 break;
653             }
654         }
655 
656         Ok((mem_regions, memory_zones))
657     }
658 
659     // Restore both GuestMemory regions along with MemoryZone zones.
660     fn restore_memory_regions_and_zones(
661         guest_ram_mappings: &[GuestRamMapping],
662         zones_config: &[MemoryZoneConfig],
663         prefault: Option<bool>,
664         mut existing_memory_files: HashMap<u32, File>,
665         thp: bool,
666     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
667         let mut memory_regions = Vec::new();
668         let mut memory_zones = HashMap::new();
669 
670         for zone_config in zones_config {
671             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
672         }
673 
674         for guest_ram_mapping in guest_ram_mappings {
675             for zone_config in zones_config {
676                 if guest_ram_mapping.zone_id == zone_config.id {
677                     let region = MemoryManager::create_ram_region(
678                         if guest_ram_mapping.virtio_mem {
679                             &None
680                         } else {
681                             &zone_config.file
682                         },
683                         guest_ram_mapping.file_offset,
684                         GuestAddress(guest_ram_mapping.gpa),
685                         guest_ram_mapping.size as usize,
686                         prefault.unwrap_or(zone_config.prefault),
687                         zone_config.shared,
688                         zone_config.hugepages,
689                         zone_config.hugepage_size,
690                         zone_config.host_numa_node,
691                         existing_memory_files.remove(&guest_ram_mapping.slot),
692                         thp,
693                     )?;
694                     memory_regions.push(Arc::clone(&region));
695                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
696                         if guest_ram_mapping.virtio_mem {
697                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
698                             let region_size = region.len();
699                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
700                                 region,
701                                 virtio_device: None,
702                                 hotplugged_size,
703                                 hugepages: zone_config.hugepages,
704                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
705                             });
706                         } else {
707                             memory_zone.regions.push(region);
708                         }
709                     }
710                 }
711             }
712         }
713 
714         memory_regions.sort_by_key(|x| x.start_addr());
715 
716         Ok((memory_regions, memory_zones))
717     }
718 
719     fn fill_saved_regions(
720         &mut self,
721         file_path: PathBuf,
722         saved_regions: MemoryRangeTable,
723     ) -> Result<(), Error> {
724         if saved_regions.is_empty() {
725             return Ok(());
726         }
727 
728         // Open (read only) the snapshot file.
729         let mut memory_file = OpenOptions::new()
730             .read(true)
731             .open(file_path)
732             .map_err(Error::SnapshotOpen)?;
733 
734         let guest_memory = self.guest_memory.memory();
735         for range in saved_regions.regions() {
736             let mut offset: u64 = 0;
737             // Here we are manually handling the retry in case we can't write
738             // the whole region at once because we can't use the implementation
739             // from vm-memory::GuestMemory of read_exact_from() as it does not
740             // follow the correct behavior. For more info about this issue
741             // see: https://github.com/rust-vmm/vm-memory/issues/174
742             loop {
743                 let bytes_read = guest_memory
744                     .read_volatile_from(
745                         GuestAddress(range.gpa + offset),
746                         &mut memory_file,
747                         (range.length - offset) as usize,
748                     )
749                     .map_err(Error::SnapshotCopy)?;
750                 offset += bytes_read as u64;
751 
752                 if offset == range.length {
753                     break;
754                 }
755             }
756         }
757 
758         Ok(())
759     }
760 
761     fn validate_memory_config(
762         config: &MemoryConfig,
763         user_provided_zones: bool,
764     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
765         let mut allow_mem_hotplug = false;
766 
767         if !user_provided_zones {
768             if config.zones.is_some() {
769                 error!(
770                     "User defined memory regions can't be provided if the \
771                     memory size is not 0"
772                 );
773                 return Err(Error::InvalidMemoryParameters);
774             }
775 
776             if config.hotplug_size.is_some() {
777                 allow_mem_hotplug = true;
778             }
779 
780             if let Some(hotplugged_size) = config.hotplugged_size {
781                 if let Some(hotplug_size) = config.hotplug_size {
782                     if hotplugged_size > hotplug_size {
783                         error!(
784                             "'hotplugged_size' {} can't be bigger than \
785                             'hotplug_size' {}",
786                             hotplugged_size, hotplug_size,
787                         );
788                         return Err(Error::InvalidMemoryParameters);
789                     }
790                 } else {
791                     error!(
792                         "Invalid to define 'hotplugged_size' when there is\
793                         "Invalid to define 'hotplugged_size' when there is \
794                     );
795                     return Err(Error::InvalidMemoryParameters);
796                 }
797                 if config.hotplug_method == HotplugMethod::Acpi {
798                     error!(
799                         "Invalid to define 'hotplugged_size' with hotplug \
800                         method 'acpi'"
801                     );
802                     return Err(Error::InvalidMemoryParameters);
803                 }
804             }
805 
806             // Create a single zone from the global memory config. This lets
807             // us reuse the codepath for user defined memory zones.
808             let zones = vec![MemoryZoneConfig {
809                 id: String::from(DEFAULT_MEMORY_ZONE),
810                 size: config.size,
811                 file: None,
812                 shared: config.shared,
813                 hugepages: config.hugepages,
814                 hugepage_size: config.hugepage_size,
815                 host_numa_node: None,
816                 hotplug_size: config.hotplug_size,
817                 hotplugged_size: config.hotplugged_size,
818                 prefault: config.prefault,
819             }];
820 
821             Ok((config.size, zones, allow_mem_hotplug))
822         } else {
823             if config.zones.is_none() {
824                 error!(
825                     "User defined memory regions must be provided if the \
826                     memory size is 0"
827                 );
828                 return Err(Error::MissingMemoryZones);
829             }
830 
831             // Safe to unwrap as we checked right above that there are some
832             // regions.
833             let zones = config.zones.clone().unwrap();
834             if zones.is_empty() {
835                 return Err(Error::MissingMemoryZones);
836             }
837 
838             let mut total_ram_size: u64 = 0;
839             for zone in zones.iter() {
840                 total_ram_size += zone.size;
841 
842                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
843                     error!(
844                         "Invalid to set host NUMA policy for a memory zone \
845                         backed by a regular file and mapped as 'shared'"
846                     );
847                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
848                 }
849 
850                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
851                     error!("Invalid to set ACPI hotplug method for memory zones");
852                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
853                 }
854 
855                 if let Some(hotplugged_size) = zone.hotplugged_size {
856                     if let Some(hotplug_size) = zone.hotplug_size {
857                         if hotplugged_size > hotplug_size {
858                             error!(
859                                 "'hotplugged_size' {} can't be bigger than \
860                                 'hotplug_size' {}",
861                                 hotplugged_size, hotplug_size,
862                             );
863                             return Err(Error::InvalidMemoryParameters);
864                         }
865                     } else {
866                         error!(
867                             "Invalid to define 'hotplugged_size' when there is \
868                             no 'hotplug_size' for a memory zone"
869                         );
870                         return Err(Error::InvalidMemoryParameters);
871                     }
872                     if config.hotplug_method == HotplugMethod::Acpi {
873                         error!(
874                             "Invalid to define 'hotplugged_size' with hotplug \
875                             method 'acpi'"
876                         );
877                         return Err(Error::InvalidMemoryParameters);
878                     }
879                 }
880             }
881 
882             Ok((total_ram_size, zones, allow_mem_hotplug))
883         }
884     }
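    // Illustration of the two branches above (hypothetical values): a
    // MemoryConfig with size = 1 GiB and no zones takes the first branch and
    // is converted into a single implicit zone named DEFAULT_MEMORY_ZONE
    // ("mem0"), whereas a config with size = 0 must provide explicit zones,
    // whose sizes are summed into the returned total RAM size.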
885 
886     pub fn allocate_address_space(&mut self) -> Result<(), Error> {
887         let mut list = Vec::new();
888 
889         for (zone_id, memory_zone) in self.memory_zones.iter() {
890             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
891                 memory_zone
892                     .regions()
893                     .iter()
894                     .map(|r| (r.clone(), false))
895                     .collect();
896 
897             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
898                 regions.push((virtio_mem_zone.region().clone(), true));
899             }
900 
901             list.push((zone_id.clone(), regions));
902         }
903 
904         for (zone_id, regions) in list {
905             for (region, virtio_mem) in regions {
906                 let slot = self.create_userspace_mapping(
907                     region.start_addr().raw_value(),
908                     region.len(),
909                     region.as_ptr() as u64,
910                     self.mergeable,
911                     false,
912                     self.log_dirty,
913                 )?;
914 
915                 let file_offset = if let Some(file_offset) = region.file_offset() {
916                     file_offset.start()
917                 } else {
918                     0
919                 };
920 
921                 self.guest_ram_mappings.push(GuestRamMapping {
922                     gpa: region.start_addr().raw_value(),
923                     size: region.len(),
924                     slot,
925                     zone_id: zone_id.clone(),
926                     virtio_mem,
927                     file_offset,
928                 });
929                 self.ram_allocator
930                     .allocate(Some(region.start_addr()), region.len(), None)
931                     .ok_or(Error::MemoryRangeAllocation)?;
932             }
933         }
934 
935         // Allocate SubRegion and Reserved address ranges.
936         for region in self.arch_mem_regions.iter() {
937             if region.r_type == RegionType::Ram {
938                 // Ignore the RAM type since ranges have already been allocated
939                 // based on the GuestMemory regions.
940                 continue;
941             }
942             self.ram_allocator
943                 .allocate(
944                     Some(GuestAddress(region.base)),
945                     region.size as GuestUsize,
946                     None,
947                 )
948                 .ok_or(Error::MemoryRangeAllocation)?;
949         }
950 
951         Ok(())
952     }
953 
954     #[cfg(target_arch = "aarch64")]
955     fn add_uefi_flash(&mut self) -> Result<(), Error> {
956         // On AArch64, the UEFI binary requires a flash device at address 0.
957         // 4 MiB memory is mapped to simulate the flash.
958         let uefi_mem_slot = self.allocate_memory_slot();
959         let uefi_region = GuestRegionMmap::new(
960             MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
961             arch::layout::UEFI_START,
962         )
963         .unwrap();
964         let uefi_mem_region = self.vm.make_user_memory_region(
965             uefi_mem_slot,
966             uefi_region.start_addr().raw_value(),
967             uefi_region.len(),
968             uefi_region.as_ptr() as u64,
969             false,
970             false,
971         );
972         self.vm
973             .create_user_memory_region(uefi_mem_region)
974             .map_err(Error::CreateUefiFlash)?;
975 
976         let uefi_flash =
977             GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());
978 
979         self.uefi_flash = Some(uefi_flash);
980 
981         Ok(())
982     }
983 
984     #[allow(clippy::too_many_arguments)]
985     pub fn new(
986         vm: Arc<dyn hypervisor::Vm>,
987         config: &MemoryConfig,
988         prefault: Option<bool>,
989         phys_bits: u8,
990         #[cfg(feature = "tdx")] tdx_enabled: bool,
991         restore_data: Option<&MemoryManagerSnapshotData>,
992         existing_memory_files: Option<HashMap<u32, File>>,
993         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
994     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
995         trace_scoped!("MemoryManager::new");
996 
997         let user_provided_zones = config.size == 0;
998 
999         let mmio_address_space_size = mmio_address_space_size(phys_bits);
1000         debug_assert_eq!(
1001             (((mmio_address_space_size) >> 16) << 16),
1002             mmio_address_space_size
1003         );
1004         let start_of_platform_device_area =
1005             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
1006         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
1007 
1008         let (ram_size, zones, allow_mem_hotplug) =
1009             Self::validate_memory_config(config, user_provided_zones)?;
1010 
1011         let (
1012             start_of_device_area,
1013             boot_ram,
1014             current_ram,
1015             arch_mem_regions,
1016             memory_zones,
1017             guest_memory,
1018             boot_guest_memory,
1019             hotplug_slots,
1020             next_memory_slot,
1021             selected_slot,
1022             next_hotplug_slot,
1023         ) = if let Some(data) = restore_data {
1024             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
1025                 &data.guest_ram_mappings,
1026                 &zones,
1027                 prefault,
1028                 existing_memory_files.unwrap_or_default(),
1029                 config.thp,
1030             )?;
1031             let guest_memory =
1032                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
1033             let boot_guest_memory = guest_memory.clone();
1034             (
1035                 GuestAddress(data.start_of_device_area),
1036                 data.boot_ram,
1037                 data.current_ram,
1038                 data.arch_mem_regions.clone(),
1039                 memory_zones,
1040                 guest_memory,
1041                 boot_guest_memory,
1042                 data.hotplug_slots.clone(),
1043                 data.next_memory_slot,
1044                 data.selected_slot,
1045                 data.next_hotplug_slot,
1046             )
1047         } else {
1048             // Init guest memory
1049             let arch_mem_regions = arch::arch_memory_regions();
1050 
1051             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
1052                 .iter()
1053                 .filter(|r| r.2 == RegionType::Ram)
1054                 .map(|r| (r.0, r.1))
1055                 .collect();
1056 
1057             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
1058                 .iter()
1059                 .map(|(a, b, c)| ArchMemRegion {
1060                     base: a.0,
1061                     size: *b,
1062                     r_type: *c,
1063                 })
1064                 .collect();
1065 
1066             let (mem_regions, mut memory_zones) =
1067                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;
1068 
1069             let mut guest_memory =
1070                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
1071 
1072             let boot_guest_memory = guest_memory.clone();
1073 
1074             let mut start_of_device_area =
1075                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
1076 
1077             // Update list of memory zones for resize.
1078             for zone in zones.iter() {
1079                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
1080                     if let Some(hotplug_size) = zone.hotplug_size {
1081                         if hotplug_size == 0 {
1082                             error!("'hotplug_size' can't be 0");
1083                             return Err(Error::InvalidHotplugSize);
1084                         }
1085 
1086                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
1087                             start_of_device_area = start_of_device_area
1088                                 .checked_add(hotplug_size)
1089                                 .ok_or(Error::GuestAddressOverFlow)?;
1090                         } else {
1091                             // Alignment must be "natural" i.e. same as size of block
1092                             let start_addr = GuestAddress(
1093                                 start_of_device_area
1094                                     .0
1095                                     .div_ceil(virtio_devices::VIRTIO_MEM_ALIGN_SIZE)
1096                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
1097                             );
1098 
1099                             // When `prefault` is set by vm_restore, the memory manager
1100                             // will create the RAM region with the `prefault` option from the
1101                             // restore config rather than the one from the zone config
1102                             let region = MemoryManager::create_ram_region(
1103                                 &None,
1104                                 0,
1105                                 start_addr,
1106                                 hotplug_size as usize,
1107                                 prefault.unwrap_or(zone.prefault),
1108                                 zone.shared,
1109                                 zone.hugepages,
1110                                 zone.hugepage_size,
1111                                 zone.host_numa_node,
1112                                 None,
1113                                 config.thp,
1114                             )?;
1115 
1116                             guest_memory = guest_memory
1117                                 .insert_region(Arc::clone(&region))
1118                                 .map_err(Error::GuestMemory)?;
1119 
1120                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
1121                             let region_size = region.len();
1122                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
1123                                 region,
1124                                 virtio_device: None,
1125                                 hotplugged_size,
1126                                 hugepages: zone.hugepages,
1127                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
1128                             });
1129 
1130                             start_of_device_area = start_addr
1131                                 .checked_add(hotplug_size)
1132                                 .ok_or(Error::GuestAddressOverFlow)?;
1133                         }
1134                     }
1135                 } else {
1136                     return Err(Error::MissingZoneIdentifier);
1137                 }
1138             }
1139 
1140             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
1141             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
1142 
1143             (
1144                 start_of_device_area,
1145                 ram_size,
1146                 ram_size,
1147                 arch_mem_regions,
1148                 memory_zones,
1149                 guest_memory,
1150                 boot_guest_memory,
1151                 hotplug_slots,
1152                 0,
1153                 0,
1154                 0,
1155             )
1156         };
1157 
1158         let guest_memory = GuestMemoryAtomic::new(guest_memory);
1159 
1160         // Both MMIO and PIO address spaces start at address 0.
1161         let allocator = Arc::new(Mutex::new(
1162             SystemAllocator::new(
1163                 #[cfg(target_arch = "x86_64")]
1164                 {
1165                     GuestAddress(0)
1166                 },
1167                 #[cfg(target_arch = "x86_64")]
1168                 {
1169                     1 << 16
1170                 },
1171                 start_of_platform_device_area,
1172                 PLATFORM_DEVICE_AREA_SIZE,
1173                 #[cfg(target_arch = "x86_64")]
1174                 vec![GsiApic::new(
1175                     X86_64_IRQ_BASE,
1176                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1177                 )],
1178             )
1179             .ok_or(Error::CreateSystemAllocator)?,
1180         ));
1181 
1182         #[cfg(not(feature = "tdx"))]
1183         let dynamic = true;
1184         #[cfg(feature = "tdx")]
1185         let dynamic = !tdx_enabled;
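        // With TDX the guest memory layout cannot be changed after launch, so
        // the manager is treated as non-dynamic: no ACPI hotplug region is
        // allocated below and dirty-page logging is left disabled.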
1186 
1187         let acpi_address = if dynamic
1188             && config.hotplug_method == HotplugMethod::Acpi
1189             && (config.hotplug_size.unwrap_or_default() > 0)
1190         {
1191             Some(
1192                 allocator
1193                     .lock()
1194                     .unwrap()
1195                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1196                     .ok_or(Error::AllocateMmioAddress)?,
1197             )
1198         } else {
1199             None
1200         };
1201 
1202         // If running with SGX, the start of the device area and the RAM area may diverge,
1203         // but at this point they are next to each other.
1204         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1205         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1206 
1207         let mut memory_manager = MemoryManager {
1208             boot_guest_memory,
1209             guest_memory,
1210             next_memory_slot: Arc::new(AtomicU32::new(next_memory_slot)),
1211             memory_slot_free_list: Arc::new(Mutex::new(Vec::new())),
1212             start_of_device_area,
1213             end_of_device_area,
1214             end_of_ram_area,
1215             vm,
1216             hotplug_slots,
1217             selected_slot,
1218             mergeable: config.mergeable,
1219             allocator,
1220             hotplug_method: config.hotplug_method,
1221             boot_ram,
1222             current_ram,
1223             next_hotplug_slot,
1224             shared: config.shared,
1225             hugepages: config.hugepages,
1226             hugepage_size: config.hugepage_size,
1227             prefault: config.prefault,
1228             #[cfg(target_arch = "x86_64")]
1229             sgx_epc_region: None,
1230             user_provided_zones,
1231             snapshot_memory_ranges: MemoryRangeTable::default(),
1232             memory_zones,
1233             guest_ram_mappings: Vec::new(),
1234             acpi_address,
1235             log_dirty: dynamic, // Cannot log dirty pages on a TD
1236             arch_mem_regions,
1237             ram_allocator,
1238             dynamic,
1239             #[cfg(target_arch = "aarch64")]
1240             uefi_flash: None,
1241             thp: config.thp,
1242         };
1243 
1244         #[cfg(target_arch = "aarch64")]
1245         {
1246             // For Aarch64 we cannot lazily allocate the address space like we
1247             // do for x86, because while restoring a VM from snapshot we would
1248             // need the address space to be allocated to properly restore VGIC.
1249             // And the restore of VGIC happens before we attempt to run the vCPUs
1250             // for the first time, thus we need to allocate the address space
1251             // beforehand.
1252             memory_manager.allocate_address_space()?;
1253             memory_manager.add_uefi_flash()?;
1254         }
1255 
1256         #[cfg(target_arch = "x86_64")]
1257         if let Some(sgx_epc_config) = sgx_epc_config {
1258             memory_manager.setup_sgx(sgx_epc_config)?;
1259         }
1260 
1261         Ok(Arc::new(Mutex::new(memory_manager)))
1262     }
1263 
1264     pub fn new_from_snapshot(
1265         snapshot: &Snapshot,
1266         vm: Arc<dyn hypervisor::Vm>,
1267         config: &MemoryConfig,
1268         source_url: Option<&str>,
1269         prefault: bool,
1270         phys_bits: u8,
1271     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1272         if let Some(source_url) = source_url {
1273             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1274             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1275 
1276             let mem_snapshot: MemoryManagerSnapshotData =
1277                 snapshot.to_state().map_err(Error::Restore)?;
1278 
1279             let mm = MemoryManager::new(
1280                 vm,
1281                 config,
1282                 Some(prefault),
1283                 phys_bits,
1284                 #[cfg(feature = "tdx")]
1285                 false,
1286                 Some(&mem_snapshot),
1287                 None,
1288                 #[cfg(target_arch = "x86_64")]
1289                 None,
1290             )?;
1291 
1292             mm.lock()
1293                 .unwrap()
1294                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1295 
1296             Ok(mm)
1297         } else {
1298             Err(Error::RestoreMissingSourceUrl)
1299         }
1300     }
1301 
1302     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1303         // SAFETY: FFI call with correct arguments
1304         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1305 
1306         if res < 0 {
1307             Err(io::Error::last_os_error())
1308         } else {
1309             Ok(res as RawFd)
1310         }
1311     }
1312 
1313     fn mbind(
1314         addr: *mut u8,
1315         len: u64,
1316         mode: u32,
1317         nodemask: Vec<u64>,
1318         maxnode: u64,
1319         flags: u32,
1320     ) -> Result<(), io::Error> {
1321         // SAFETY: FFI call with correct arguments
1322         let res = unsafe {
1323             libc::syscall(
1324                 libc::SYS_mbind,
1325                 addr as *mut libc::c_void,
1326                 len,
1327                 mode,
1328                 nodemask.as_ptr(),
1329                 maxnode,
1330                 flags,
1331             )
1332         };
1333 
1334         if res < 0 {
1335             Err(io::Error::last_os_error())
1336         } else {
1337             Ok(())
1338         }
1339     }
1340 
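    // Worked example (hypothetical node): binding a region to host NUMA node 3
    // ends up calling mbind() with nodemask = [0b1000] and maxnode = 5
    // (node + 1, plus the extra 1 required by the kernel quirk described in
    // create_ram_region() below).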
1341     fn create_anonymous_file(
1342         size: usize,
1343         hugepages: bool,
1344         hugepage_size: Option<u64>,
1345     ) -> Result<FileOffset, Error> {
1346         let fd = Self::memfd_create(
1347             &ffi::CString::new("ch_ram").unwrap(),
1348             libc::MFD_CLOEXEC
1349                 | if hugepages {
1350                     libc::MFD_HUGETLB
1351                         | if let Some(hugepage_size) = hugepage_size {
1352                             /*
1353                              * From the Linux kernel:
1354                              * Several system calls take a flag to request "hugetlb" huge pages.
1355                              * Without further specification, these system calls will use the
1356                              * system's default huge page size.  If a system supports multiple
1357                              * huge page sizes, the desired huge page size can be specified in
1358                              * bits [26:31] of the flag arguments.  The value in these 6 bits
1359                              * will encode the log2 of the huge page size.
1360                              */
1361 
1362                             hugepage_size.trailing_zeros() << 26
1363                         } else {
1364                             // Use the system default huge page size
1365                             0
1366                         }
1367                 } else {
1368                     0
1369                 },
1370         )
1371         .map_err(Error::SharedFileCreate)?;
1372 
1373         // SAFETY: fd is valid
1374         let f = unsafe { File::from_raw_fd(fd) };
1375         f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1376 
1377         Ok(FileOffset::new(f, 0))
1378     }
1379 
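    // Example of the size encoding above (assuming a 2 MiB hugepage):
    // (1u64 << 21).trailing_zeros() == 21, so the flags carry 21 << 26, which
    // matches the kernel's MFD_HUGE_2MB encoding.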
1380     fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> {
1381         if backing_file.is_dir() {
1382             Err(Error::DirectoryAsBackingFileForMemory)
1383         } else {
1384             let f = OpenOptions::new()
1385                 .read(true)
1386                 .write(true)
1387                 .open(backing_file)
1388                 .map_err(Error::SharedFileCreate)?;
1389 
1390             Ok(FileOffset::new(f, file_offset))
1391         }
1392     }
1393 
1394     #[allow(clippy::too_many_arguments)]
1395     pub fn create_ram_region(
1396         backing_file: &Option<PathBuf>,
1397         file_offset: u64,
1398         start_addr: GuestAddress,
1399         size: usize,
1400         prefault: bool,
1401         shared: bool,
1402         hugepages: bool,
1403         hugepage_size: Option<u64>,
1404         host_numa_node: Option<u32>,
1405         existing_memory_file: Option<File>,
1406         thp: bool,
1407     ) -> Result<Arc<GuestRegionMmap>, Error> {
1408         let mut mmap_flags = libc::MAP_NORESERVE;
1409 
1410         // The duplication of mmap_flags ORing here is unfortunate but it also makes
1411         // the complexity of the handling clear.
1412         let fo = if let Some(f) = existing_memory_file {
1413             // It must be MAP_SHARED as we wouldn't already have an FD
1414             mmap_flags |= libc::MAP_SHARED;
1415             Some(FileOffset::new(f, file_offset))
1416         } else if let Some(backing_file) = backing_file {
1417             if shared {
1418                 mmap_flags |= libc::MAP_SHARED;
1419             } else {
1420                 mmap_flags |= libc::MAP_PRIVATE;
1421             }
1422             Some(Self::open_backing_file(backing_file, file_offset)?)
1423         } else if shared || hugepages {
1424             // For hugepages we must also use MAP_SHARED, otherwise we will trigger #4805:
1425             // MAP_PRIVATE would trigger CoW against the backing file when combined with
1426             // VFIO pinning
1427             mmap_flags |= libc::MAP_SHARED;
1428             Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
1429         } else {
1430             mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
1431             None
1432         };
1433 
1434         let region = GuestRegionMmap::new(
1435             MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
1436                 .map_err(Error::GuestMemoryRegion)?,
1437             start_addr,
1438         )
1439         .map_err(Error::GuestMemory)?;
1440 
1441         // Apply NUMA policy if needed.
1442         if let Some(node) = host_numa_node {
1443             let addr = region.deref().as_ptr();
1444             let len = region.deref().size() as u64;
1445             let mode = MPOL_BIND;
1446             let mut nodemask: Vec<u64> = Vec::new();
1447             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1448 
1449             // Linux is kind of buggy in the way it interprets maxnode: it
1450             // cuts off the last node. That's why we add 1 to what we would
1451             // otherwise consider the proper maxnode value.
1452             let maxnode = node as u64 + 1 + 1;
1453 
1454             // Allocate the right size for the vector.
1455             nodemask.resize((node as usize / 64) + 1, 0);
1456 
1457             // Fill the global bitmask through the nodemask vector.
1458             let idx = (node / 64) as usize;
1459             let shift = node % 64;
1460             nodemask[idx] |= 1u64 << shift;
1461 
1462             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1463             // force the kernel to move all pages that might have been already
1464             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1465             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1466             // MPOL_BIND is the selected mode as it specifies a strict policy
1467             // that restricts memory allocation to the nodes specified in the
1468             // nodemask.
1469             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1470                 .map_err(Error::ApplyNumaPolicy)?;
1471         }
1472 
1473         // Prefault the region if needed, in parallel.
1474         if prefault {
1475             let page_size =
1476                 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize;
1477 
1478             if !is_aligned(size, page_size) {
1479                 warn!(
1480                     "Prefaulting memory size {} misaligned with page size {}",
1481                     size, page_size
1482                 );
1483             }
1484 
1485             let num_pages = size / page_size;
1486 
1487             let num_threads = Self::get_prefault_num_threads(page_size, num_pages);
1488 
1489             let pages_per_thread = num_pages / num_threads;
1490             let remainder = num_pages % num_threads;
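                 // The first `remainder` threads each fault in one extra page so that
                 // every page is covered (see the per-thread `pages` computation below).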
1491 
1492             let barrier = Arc::new(Barrier::new(num_threads));
1493             thread::scope(|s| {
1494                 let r = &region;
1495                 for i in 0..num_threads {
1496                     let barrier = Arc::clone(&barrier);
1497                     s.spawn(move || {
1498                         // Wait until all threads have been spawned to avoid contention
1499                         // over mmap_sem between thread stack allocation and page faulting.
1500                         barrier.wait();
1501                         let pages = pages_per_thread + if i < remainder { 1 } else { 0 };
1502                         let offset =
1503                             page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
1504                         // SAFETY: FFI call with correct arguments
1505                         let ret = unsafe {
1506                             let addr = r.as_ptr().add(offset);
1507                             libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE)
1508                         };
1509                         if ret != 0 {
1510                             let e = io::Error::last_os_error();
1511                             warn!("Failed to prefault pages: {}", e);
1512                         }
1513                     });
1514                 }
1515             });
1516         }
1517 
1518         if region.file_offset().is_none() && thp {
1519             info!(
1520                 "Anonymous mapping at 0x{:x} (size = 0x{:x})",
1521                 region.as_ptr() as u64,
1522                 size
1523             );
1524             // SAFETY: FFI call with correct arguments
1525             let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
1526             if ret != 0 {
1527                 let e = io::Error::last_os_error();
1528                 warn!("Failed to mark pages as THP eligible: {}", e);
1529             }
1530         }
1531 
1532         Ok(Arc::new(region))
1533     }
1534 
1535     // Duplicate of `memory_zone_get_align_size` that does not require a `zone`
1536     fn get_prefault_align_size(
1537         backing_file: &Option<PathBuf>,
1538         hugepages: bool,
1539         hugepage_size: Option<u64>,
1540     ) -> Result<u64, Error> {
1541         // SAFETY: FFI call. Trivially safe.
1542         let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
1543         match (hugepages, hugepage_size, backing_file) {
1544             (false, _, _) => Ok(page_size),
1545             (true, Some(hugepage_size), _) => Ok(hugepage_size),
1546             (true, None, _) => {
1547                 // There are two scenarios here:
1548                 //  - `hugepages` is enabled but `hugepage_size` is not specified:
1549                 //     Call `statfs` on `/dev/hugepages` to get the default hugepage size.
1550                 //  - The backing file is specified:
1551                 //     Call `statfs` on the file and get its `f_bsize`.  If the value is larger than the
1552                 //     normal page size, use `f_bsize` because the file is on a hugetlbfs.  If the
1553                 //     value is less than or equal to the page size, just use the page size.
1554                 let path = backing_file
1555                     .as_ref()
1556                     .map_or(Ok("/dev/hugepages"), |pathbuf| {
1557                         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
1558                     })?;
1559                 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
1560                 Ok(align_size)
1561             }
1562         }
1563     }
1564 
1565     fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize {
1566         let mut n: usize = 1;
1567 
1568         // Do not create more threads than processors available.
1569         // SAFETY: FFI call. Trivially safe.
1570         let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) };
1571         if procs > 0 {
1572             n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT);
1573         }
1574 
1575         // Do not create more threads than pages being allocated.
1576         n = std::cmp::min(n, num_pages);
1577 
1578         // Do not create threads to allocate less than 64 MiB of memory.
1579         n = std::cmp::min(
1580             n,
1581             std::cmp::max(1, page_size * num_pages / (64 * (1 << 26))),
1582         );
1583 
1584         n
1585     }
1586 
1587     // Update the GuestMemoryMmap with the new range
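         // insert_region() does not mutate in place: it returns a new GuestMemoryMmap
         // containing the extra region, which replace() then publishes atomically to
         // every reader of the GuestMemoryAtomic.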
1588     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1589         let guest_memory = self
1590             .guest_memory
1591             .memory()
1592             .insert_region(region)
1593             .map_err(Error::GuestMemory)?;
1594         self.guest_memory.lock().unwrap().replace(guest_memory);
1595 
1596         Ok(())
1597     }
1598 
1599     //
1600     // Calculate the start address of an area next to RAM.
1601     //
1602     // If memory hotplug is allowed, the start address needs to be aligned
1603     // (rounded up) to a 128MiB boundary.
1604     // If memory hotplug is not allowed, there is no alignment required.
1605     // If RAM ends below the 32-bit reserved area, the area starts at RAM_64BIT_START.
1606     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
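             // mem_end is the last (inclusive) RAM address: ORing with (128 MiB - 1)
             // and adding 1 below rounds it up to the next 128 MiB boundary.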
1607         let mut start_addr = if allow_mem_hotplug {
1608             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1609         } else {
1610             mem_end
1611         };
1612 
1613         start_addr = start_addr
1614             .checked_add(1)
1615             .ok_or(Error::GuestAddressOverFlow)?;
1616 
1617         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1618             return Ok(arch::layout::RAM_64BIT_START);
1619         }
1620 
1621         Ok(start_addr)
1622     }
1623 
1624     pub fn add_ram_region(
1625         &mut self,
1626         start_addr: GuestAddress,
1627         size: usize,
1628     ) -> Result<Arc<GuestRegionMmap>, Error> {
1629         // Allocate memory for the region
1630         let region = MemoryManager::create_ram_region(
1631             &None,
1632             0,
1633             start_addr,
1634             size,
1635             self.prefault,
1636             self.shared,
1637             self.hugepages,
1638             self.hugepage_size,
1639             None,
1640             None,
1641             self.thp,
1642         )?;
1643 
1644         // Map it into the guest
1645         let slot = self.create_userspace_mapping(
1646             region.start_addr().0,
1647             region.len(),
1648             region.as_ptr() as u64,
1649             self.mergeable,
1650             false,
1651             self.log_dirty,
1652         )?;
1653         self.guest_ram_mappings.push(GuestRamMapping {
1654             gpa: region.start_addr().raw_value(),
1655             size: region.len(),
1656             slot,
1657             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1658             virtio_mem: false,
1659             file_offset: 0,
1660         });
1661 
1662         self.add_region(Arc::clone(&region))?;
1663 
1664         Ok(region)
1665     }
1666 
1667     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1668         info!("Hotplugging new RAM: {}", size);
1669 
1670         // Check that there is a free slot
1671         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1672             return Err(Error::NoSlotAvailable);
1673         }
1674 
1675         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1676         if size % (128 << 20) != 0 {
1677             return Err(Error::InvalidSize);
1678         }
1679 
1680         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1681 
1682         if start_addr
1683             .checked_add((size - 1).try_into().unwrap())
1684             .unwrap()
1685             > self.end_of_ram_area
1686         {
1687             return Err(Error::InsufficientHotplugRam);
1688         }
1689 
1690         let region = self.add_ram_region(start_addr, size)?;
1691 
1692         // Add region to the list of regions associated with the default
1693         // memory zone.
1694         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1695             memory_zone.regions.push(Arc::clone(&region));
1696         }
1697 
1698         // Tell the allocator
1699         self.ram_allocator
1700             .allocate(Some(start_addr), size as GuestUsize, None)
1701             .ok_or(Error::MemoryRangeAllocation)?;
1702 
1703         // Update the slot so that it can be queried via the I/O port
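             // The `inserting` flag is surfaced to the guest as the MINS bit that the
             // ACPI MSCN method (generated further down in this file) checks in order
             // to notify the guest of the new DIMM.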
1704         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1705         slot.active = true;
1706         slot.inserting = true;
1707         slot.base = region.start_addr().0;
1708         slot.length = region.len();
1709 
1710         self.next_hotplug_slot += 1;
1711 
1712         Ok(region)
1713     }
1714 
1715     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1716         self.guest_memory.clone()
1717     }
1718 
1719     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1720         self.boot_guest_memory.clone()
1721     }
1722 
1723     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1724         self.allocator.clone()
1725     }
1726 
1727     pub fn start_of_device_area(&self) -> GuestAddress {
1728         self.start_of_device_area
1729     }
1730 
1731     pub fn end_of_device_area(&self) -> GuestAddress {
1732         self.end_of_device_area
1733     }
1734 
1735     pub fn memory_slot_allocator(&mut self) -> MemorySlotAllocator {
1736         let memory_slot_free_list = Arc::clone(&self.memory_slot_free_list);
1737         let next_memory_slot = Arc::clone(&self.next_memory_slot);
1738         MemorySlotAllocator::new(next_memory_slot, memory_slot_free_list)
1739     }
1740 
1741     pub fn allocate_memory_slot(&mut self) -> u32 {
1742         self.memory_slot_allocator().next_memory_slot()
1743     }
1744 
1745     pub fn create_userspace_mapping(
1746         &mut self,
1747         guest_phys_addr: u64,
1748         memory_size: u64,
1749         userspace_addr: u64,
1750         mergeable: bool,
1751         readonly: bool,
1752         log_dirty: bool,
1753     ) -> Result<u32, Error> {
1754         let slot = self.allocate_memory_slot();
1755         let mem_region = self.vm.make_user_memory_region(
1756             slot,
1757             guest_phys_addr,
1758             memory_size,
1759             userspace_addr,
1760             readonly,
1761             log_dirty,
1762         );
1763 
1764         info!(
1765             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1766             guest_phys_addr, userspace_addr, memory_size, slot
1767         );
1768 
1769         self.vm
1770             .create_user_memory_region(mem_region)
1771             .map_err(Error::CreateUserMemoryRegion)?;
1772 
1773         // SAFETY: the address and size are valid since the
1774         // mmap succeeded.
1775         let ret = unsafe {
1776             libc::madvise(
1777                 userspace_addr as *mut libc::c_void,
1778                 memory_size as libc::size_t,
1779                 libc::MADV_DONTDUMP,
1780             )
1781         };
1782         if ret != 0 {
1783             let e = io::Error::last_os_error();
1784             warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e);
1785         }
1786 
1787         // Mark the pages as mergeable if explicitly asked for.
1788         if mergeable {
1789             // SAFETY: the address and size are valid since the
1790             // mmap succeeded.
1791             let ret = unsafe {
1792                 libc::madvise(
1793                     userspace_addr as *mut libc::c_void,
1794                     memory_size as libc::size_t,
1795                     libc::MADV_MERGEABLE,
1796                 )
1797             };
1798             if ret != 0 {
1799                 let err = io::Error::last_os_error();
1800                 // Safe to unwrap because the error is constructed with
1801                 // last_os_error(), which guarantees raw_os_error() returns Some.
1802                 let errno = err.raw_os_error().unwrap();
1803                 if errno == libc::EINVAL {
1804                     warn!("kernel not configured with CONFIG_KSM");
1805                 } else {
1806                     warn!("madvise error: {}", err);
1807                 }
1808                 warn!("failed to mark pages as mergeable");
1809             }
1810         }
1811 
1812         info!(
1813             "Created userspace mapping: {:x} -> {:x} {:x}",
1814             guest_phys_addr, userspace_addr, memory_size
1815         );
1816 
1817         Ok(slot)
1818     }
1819 
1820     pub fn remove_userspace_mapping(
1821         &mut self,
1822         guest_phys_addr: u64,
1823         memory_size: u64,
1824         userspace_addr: u64,
1825         mergeable: bool,
1826         slot: u32,
1827     ) -> Result<(), Error> {
1828         let mem_region = self.vm.make_user_memory_region(
1829             slot,
1830             guest_phys_addr,
1831             memory_size,
1832             userspace_addr,
1833             false, /* readonly -- don't care */
1834             false, /* log dirty */
1835         );
1836 
1837         self.vm
1838             .remove_user_memory_region(mem_region)
1839             .map_err(Error::RemoveUserMemoryRegion)?;
1840 
1841         // Mark the pages as unmergeable if they were previously marked as
1842         // mergeable.
1843         if mergeable {
1844             // SAFETY: the address and size are valid as the region was
1845             // previously advised.
1846             let ret = unsafe {
1847                 libc::madvise(
1848                     userspace_addr as *mut libc::c_void,
1849                     memory_size as libc::size_t,
1850                     libc::MADV_UNMERGEABLE,
1851                 )
1852             };
1853             if ret != 0 {
1854                 let err = io::Error::last_os_error();
1855                 // Safe to unwrap because the error is constructed with
1856                 // last_os_error(), which guarantees raw_os_error() returns Some.
1857                 let errno = err.raw_os_error().unwrap();
1858                 if errno == libc::EINVAL {
1859                     warn!("kernel not configured with CONFIG_KSM");
1860                 } else {
1861                     warn!("madvise error: {}", err);
1862                 }
1863                 warn!("failed to mark pages as unmergeable");
1864             }
1865         }
1866 
1867         info!(
1868             "Removed userspace mapping: {:x} -> {:x} {:x}",
1869             guest_phys_addr, userspace_addr, memory_size
1870         );
1871 
1872         Ok(())
1873     }
1874 
1875     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1876         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1877             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1878                 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1879                     virtio_mem_device
1880                         .lock()
1881                         .unwrap()
1882                         .resize(size)
1883                         .map_err(Error::VirtioMemResizeFail)?;
1884                 }
1885 
1886                 // Keep the hotplugged_size up to date.
1887                 virtio_mem_zone.hotplugged_size = size;
1888             } else {
1889                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1890                 return Err(Error::MissingVirtioMemHandler);
1891             }
1892 
1893             return Ok(());
1894         }
1895 
1896         error!("Failed resizing virtio-mem region: Unknown memory zone");
1897         Err(Error::UnknownMemoryZone)
1898     }
1899 
1900     /// If this function results in adding a new memory region to the
1901     /// guest memory, the new region is returned to the caller. The virtio-mem
1902     /// use case never adds a new region as the whole hotpluggable memory has
1903     /// already been allocated at boot time.
1904     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1905         if self.user_provided_zones {
1906             error!(
1907                 "Not allowed to resize guest memory when backed with user \
1908                 defined memory zones."
1909             );
1910             return Err(Error::InvalidResizeWithMemoryZones);
1911         }
1912 
1913         let mut region: Option<Arc<GuestRegionMmap>> = None;
1914         match self.hotplug_method {
1915             HotplugMethod::VirtioMem => {
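                     // virtio-mem never creates a new region (see the doc comment above);
                     // we only ask the device to plug or unplug blocks within the
                     // already-mapped hotplug area.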
1916                 if desired_ram >= self.boot_ram {
1917                     if !self.dynamic {
1918                         return Ok(region);
1919                     }
1920 
1921                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1922                     self.current_ram = desired_ram;
1923                 }
1924             }
1925             HotplugMethod::Acpi => {
1926                 if desired_ram > self.current_ram {
1927                     if !self.dynamic {
1928                         return Ok(region);
1929                     }
1930 
1931                     region =
1932                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1933                     self.current_ram = desired_ram;
1934                 }
1935             }
1936         }
1937         Ok(region)
1938     }
1939 
1940     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1941         if !self.user_provided_zones {
1942             error!(
1943                 "Not allowed to resize guest memory zone when no zone is \
1944                 defined."
1945             );
1946             return Err(Error::ResizeZone);
1947         }
1948 
1949         self.virtio_mem_resize(id, virtio_mem_size)
1950     }
1951 
1952     #[cfg(target_arch = "x86_64")]
1953     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1954         let file = OpenOptions::new()
1955             .read(true)
1956             .open("/dev/sgx_provision")
1957             .map_err(Error::SgxProvisionOpen)?;
1958         self.vm
1959             .enable_sgx_attribute(file)
1960             .map_err(Error::SgxEnableProvisioning)?;
1961 
1962         // Go over each EPC section and verify its size is a 4k multiple. At
1963         // the same time, calculate the total size needed for the contiguous
1964         // EPC region.
1965         let mut epc_region_size = 0;
1966         for epc_section in sgx_epc_config.iter() {
1967             if epc_section.size == 0 {
1968                 return Err(Error::EpcSectionSizeInvalid);
1969             }
1970             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1971                 return Err(Error::EpcSectionSizeInvalid);
1972             }
1973 
1974             epc_region_size += epc_section.size;
1975         }
1976 
1977         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1978         let epc_region_start =
1979             GuestAddress(self.start_of_device_area.0.div_ceil(SGX_PAGE_SIZE) * SGX_PAGE_SIZE);
1980 
1981         self.start_of_device_area = epc_region_start
1982             .checked_add(epc_region_size)
1983             .ok_or(Error::GuestAddressOverFlow)?;
1984 
1985         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1986         info!(
1987             "SGX EPC region: 0x{:x} (0x{:x})",
1988             epc_region_start.0, epc_region_size
1989         );
1990 
1991         // Each section can be memory mapped into the allocated region.
1992         let mut epc_section_start = epc_region_start.raw_value();
1993         for epc_section in sgx_epc_config.iter() {
1994             let file = OpenOptions::new()
1995                 .read(true)
1996                 .write(true)
1997                 .open("/dev/sgx_vepc")
1998                 .map_err(Error::SgxVirtEpcOpen)?;
1999 
2000             let prot = PROT_READ | PROT_WRITE;
2001             let mut flags = MAP_NORESERVE | MAP_SHARED;
2002             if epc_section.prefault {
2003                 flags |= MAP_POPULATE;
2004             }
2005 
2006             // We can't use the vm-memory crate to perform the memory mapping
2007             // here as it would try to ensure the size of the backing file
2008             // matches the size of the expected mapping. The /dev/sgx_vepc
2009             // device does not work that way: it provides a file descriptor
2010             // whose size does not match the mapping size, as it's just a way
2011             // to let KVM know that an EPC section is being created for the guest.
2012             // SAFETY: FFI call with correct arguments
2013             let host_addr = unsafe {
2014                 libc::mmap(
2015                     std::ptr::null_mut(),
2016                     epc_section.size as usize,
2017                     prot,
2018                     flags,
2019                     file.as_raw_fd(),
2020                     0,
2021                 )
2022             } as u64;
2023 
2024             info!(
2025                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
2026                 epc_section_start, epc_section.size
2027             );
2028 
2029             let _mem_slot = self.create_userspace_mapping(
2030                 epc_section_start,
2031                 epc_section.size,
2032                 host_addr,
2033                 false,
2034                 false,
2035                 false,
2036             )?;
2037 
2038             sgx_epc_region.insert(
2039                 epc_section.id.clone(),
2040                 SgxEpcSection::new(
2041                     GuestAddress(epc_section_start),
2042                     epc_section.size as GuestUsize,
2043                 ),
2044             );
2045 
2046             epc_section_start += epc_section.size;
2047         }
2048 
2049         self.sgx_epc_region = Some(sgx_epc_region);
2050 
2051         Ok(())
2052     }
2053 
2054     #[cfg(target_arch = "x86_64")]
2055     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
2056         &self.sgx_epc_region
2057     }
2058 
2059     pub fn is_hardlink(f: &File) -> bool {
2060         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
2061         // SAFETY: FFI call with correct arguments
2062         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
2063         if ret != 0 {
2064             error!("Couldn't fstat the backing file");
2065             return false;
2066         }
2067 
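             // A non-zero link count means the file still has a directory entry on the
             // host filesystem (an unlinked file reports zero links), so the user can
             // reach its content outside the VMM.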
2068         // SAFETY: stat is valid
2069         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
2070     }
2071 
2072     pub fn memory_zones(&self) -> &MemoryZones {
2073         &self.memory_zones
2074     }
2075 
2076     pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
2077         &mut self.memory_zones
2078     }
2079 
2080     pub fn memory_range_table(
2081         &self,
2082         snapshot: bool,
2083     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
2084         let mut table = MemoryRangeTable::default();
2085 
2086         for memory_zone in self.memory_zones.values() {
2087             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
2088                 table.extend(virtio_mem_zone.plugged_ranges());
2089             }
2090 
2091             for region in memory_zone.regions() {
2092                 if snapshot {
2093                     if let Some(file_offset) = region.file_offset() {
2094                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
2095                             && Self::is_hardlink(file_offset.file())
2096                         {
2097                             // In this very specific case, we know the memory
2098                             // region is backed by a file on the host filesystem
2099                             // that can be accessed by the user, and additionally
2100                             // the mapping is shared, which means that modifications
2101                             // to the content are written to the actual file.
2102                             // When meeting these conditions, we can skip the
2103                             // copy of the memory content for this specific region,
2104                             // as we can assume the user will have it saved through
2105                             // the backing file already.
2106                             continue;
2107                         }
2108                     }
2109                 }
2110 
2111                 table.push(MemoryRange {
2112                     gpa: region.start_addr().raw_value(),
2113                     length: region.len(),
2114                 });
2115             }
2116         }
2117 
2118         Ok(table)
2119     }
2120 
2121     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
2122         MemoryManagerSnapshotData {
2123             memory_ranges: self.snapshot_memory_ranges.clone(),
2124             guest_ram_mappings: self.guest_ram_mappings.clone(),
2125             start_of_device_area: self.start_of_device_area.0,
2126             boot_ram: self.boot_ram,
2127             current_ram: self.current_ram,
2128             arch_mem_regions: self.arch_mem_regions.clone(),
2129             hotplug_slots: self.hotplug_slots.clone(),
2130             next_memory_slot: self.next_memory_slot.load(Ordering::SeqCst),
2131             selected_slot: self.selected_slot,
2132             next_hotplug_slot: self.next_hotplug_slot,
2133         }
2134     }
2135 
2136     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
2137         let mut memory_slot_fds = HashMap::new();
2138         for guest_ram_mapping in &self.guest_ram_mappings {
2139             let slot = guest_ram_mapping.slot;
2140             let guest_memory = self.guest_memory.memory();
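                 // Every guest RAM mapping is expected to be file-backed here (an
                 // anonymous mapping has no FD to hand out), hence the unwrap() calls.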
2141             let file = guest_memory
2142                 .find_region(GuestAddress(guest_ram_mapping.gpa))
2143                 .unwrap()
2144                 .file_offset()
2145                 .unwrap()
2146                 .file();
2147             memory_slot_fds.insert(slot, file.as_raw_fd());
2148         }
2149         memory_slot_fds
2150     }
2151 
2152     pub fn acpi_address(&self) -> Option<GuestAddress> {
2153         self.acpi_address
2154     }
2155 
2156     pub fn num_guest_ram_mappings(&self) -> u32 {
2157         self.guest_ram_mappings.len() as u32
2158     }
2159 
2160     #[cfg(target_arch = "aarch64")]
2161     pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
2162         self.uefi_flash.as_ref().unwrap().clone()
2163     }
2164 
2165     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2166     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
2167         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
2168         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
2169 
2170         let mut mem_offset_in_elf = mem_offset;
2171         let mut ram_maps = BTreeMap::new();
2172         for mapping in mapping_sorted_by_gpa.iter() {
2173             ram_maps.insert(
2174                 mapping.gpa,
2175                 CoredumpMemoryRegion {
2176                     mem_offset_in_elf,
2177                     mem_size: mapping.size,
2178                 },
2179             );
2180             mem_offset_in_elf += mapping.size;
2181         }
2182 
2183         CoredumpMemoryRegions { ram_maps }
2184     }
2185 
2186     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2187     pub fn coredump_iterate_save_mem(
2188         &mut self,
2189         dump_state: &DumpState,
2190     ) -> std::result::Result<(), GuestDebuggableError> {
2191         let snapshot_memory_ranges = self
2192             .memory_range_table(false)
2193             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2194 
2195         if snapshot_memory_ranges.is_empty() {
2196             return Ok(());
2197         }
2198 
2199         let coredump_file = dump_state.file.as_ref().unwrap();
2200 
2201         let guest_memory = self.guest_memory.memory();
2202         let mut total_bytes: u64 = 0;
2203 
2204         for range in snapshot_memory_ranges.regions() {
2205             let mut offset: u64 = 0;
2206             loop {
2207                 let bytes_written = guest_memory
2208                     .write_volatile_to(
2209                         GuestAddress(range.gpa + offset),
2210                         &mut coredump_file.as_fd(),
2211                         (range.length - offset) as usize,
2212                     )
2213                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2214                 offset += bytes_written as u64;
2215                 total_bytes += bytes_written as u64;
2216 
2217                 if offset == range.length {
2218                     break;
2219                 }
2220             }
2221         }
2222 
2223         debug!("coredump total bytes {}", total_bytes);
2224         Ok(())
2225     }
2226 
2227     pub fn receive_memory_regions<F>(
2228         &mut self,
2229         ranges: &MemoryRangeTable,
2230         fd: &mut F,
2231     ) -> std::result::Result<(), MigratableError>
2232     where
2233         F: ReadVolatile,
2234     {
2235         let guest_memory = self.guest_memory();
2236         let mem = guest_memory.memory();
2237 
2238         for range in ranges.regions() {
2239             let mut offset: u64 = 0;
2240             // Here we are manually handling the retry in case we can't read
2241             // the whole region at once, because we can't use the
2242             // read_exact_from() implementation from vm-memory::GuestMemory as
2243             // it does not follow the correct behavior. For more info about
2244             // this issue see: https://github.com/rust-vmm/vm-memory/issues/174
2245             loop {
2246                 let bytes_read = mem
2247                     .read_volatile_from(
2248                         GuestAddress(range.gpa + offset),
2249                         fd,
2250                         (range.length - offset) as usize,
2251                     )
2252                     .map_err(|e| {
2253                         MigratableError::MigrateReceive(anyhow!(
2254                             "Error receiving memory from socket: {}",
2255                             e
2256                         ))
2257                     })?;
2258                 offset += bytes_read as u64;
2259 
2260                 if offset == range.length {
2261                     break;
2262                 }
2263             }
2264         }
2265 
2266         Ok(())
2267     }
2268 }
2269 
2270 struct MemoryNotify {
2271     slot_id: usize,
2272 }
2273 
2274 impl Aml for MemoryNotify {
2275     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2276         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2277         aml::If::new(
2278             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2279             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2280         )
2281         .to_aml_bytes(sink)
2282     }
2283 }
2284 
2285 struct MemorySlot {
2286     slot_id: usize,
2287 }
2288 
2289 impl Aml for MemorySlot {
2290     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2291         aml::Device::new(
2292             format!("M{:03}", self.slot_id).as_str().into(),
2293             vec![
2294                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
2295                 &aml::Name::new("_UID".into(), &self.slot_id),
2296                 /*
2297                 _STA return value:
2298                 Bit [0] – Set if the device is present.
2299                 Bit [1] – Set if the device is enabled and decoding its resources.
2300                 Bit [2] – Set if the device should be shown in the UI.
2301                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2302                 Bit [4] – Set if the battery is present.
2303                 Bits [31:5] – Reserved (must be cleared).
2304                 */
2305                 &aml::Method::new(
2306                     "_STA".into(),
2307                     0,
2308                     false,
2309                     // Call into MSTA method which will interrogate device
2310                     vec![&aml::Return::new(&aml::MethodCall::new(
2311                         "MSTA".into(),
2312                         vec![&self.slot_id],
2313                     ))],
2314                 ),
2315                 // Get details of memory
2316                 &aml::Method::new(
2317                     "_CRS".into(),
2318                     0,
2319                     false,
2320                     // Call into MCRS which provides actual memory details
2321                     vec![&aml::Return::new(&aml::MethodCall::new(
2322                         "MCRS".into(),
2323                         vec![&self.slot_id],
2324                     ))],
2325                 ),
2326             ],
2327         )
2328         .to_aml_bytes(sink)
2329     }
2330 }
2331 
2332 struct MemorySlots {
2333     slots: usize,
2334 }
2335 
2336 impl Aml for MemorySlots {
2337     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2338         for slot_id in 0..self.slots {
2339             MemorySlot { slot_id }.to_aml_bytes(sink);
2340         }
2341     }
2342 }
2343 
2344 struct MemoryMethods {
2345     slots: usize,
2346 }
2347 
2348 impl Aml for MemoryMethods {
2349     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2350         // Add "MTFY" notification method
2351         let mut memory_notifies = Vec::new();
2352         for slot_id in 0..self.slots {
2353             memory_notifies.push(MemoryNotify { slot_id });
2354         }
2355 
2356         let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
2357         for memory_notifier in memory_notifies.iter() {
2358             memory_notifies_refs.push(memory_notifier);
2359         }
2360 
2361         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);
2362 
2363         // MSCN method
2364         aml::Method::new(
2365             "MSCN".into(),
2366             0,
2367             true,
2368             vec![
2369                 // Take lock defined above
2370                 &aml::Acquire::new("MLCK".into(), 0xffff),
2371                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2372                 &aml::While::new(
2373                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2374                     vec![
2375                         // Write slot number (in first argument) to I/O port via field
2376                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2377                         // Check if MINS bit is set (inserting)
2378                         &aml::If::new(
2379                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2380                             // Notify device if it is
2381                             vec![
2382                                 &aml::MethodCall::new(
2383                                     "MTFY".into(),
2384                                     vec![&aml::Local(0), &aml::ONE],
2385                                 ),
2386                                 // Reset MINS bit (the MMIO device clears it when written)
2387                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2388                             ],
2389                         ),
2390                         // Check if MRMV bit is set
2391                         &aml::If::new(
2392                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2393                             // Notify device if it is (with the eject constant 0x3)
2394                             vec![
2395                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2396                                 // Reset MRMV bit (the MMIO device clears it when written)
2397                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2398                             ],
2399                         ),
2400                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2401                     ],
2402                 ),
2403                 // Release lock
2404                 &aml::Release::new("MLCK".into()),
2405             ],
2406         )
2407         .to_aml_bytes(sink);
2408 
2409         // Memory status method
2410         aml::Method::new(
2411             "MSTA".into(),
2412             1,
2413             true,
2414             vec![
2415                 // Take lock defined above
2416                 &aml::Acquire::new("MLCK".into(), 0xffff),
2417                 // Write slot number (in first argument) to I/O port via field
2418                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2419                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2420                 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2421                 &aml::If::new(
2422                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2423                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2424                 ),
2425                 // Release lock
2426                 &aml::Release::new("MLCK".into()),
2427                 // Return 0 or 0xf
2428                 &aml::Return::new(&aml::Local(0)),
2429             ],
2430         )
2431         .to_aml_bytes(sink);
2432 
2433         // Memory range method
2434         aml::Method::new(
2435             "MCRS".into(),
2436             1,
2437             true,
2438             vec![
2439                 // Take lock defined above
2440                 &aml::Acquire::new("MLCK".into(), 0xffff),
2441                 // Write slot number (in first argument) to I/O port via field
2442                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2443                 &aml::Name::new(
2444                     "MR64".into(),
2445                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2446                         aml::AddressSpaceCacheable::Cacheable,
2447                         true,
2448                         0x0000_0000_0000_0000u64,
2449                         0xFFFF_FFFF_FFFF_FFFEu64,
2450                         None,
2451                     )]),
2452                 ),
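                     // The fields below alias the Minimum, Maximum and Length entries of the
                     // QWORD address space descriptor above (byte offsets 14, 22 and 38 in the
                     // resource template); the *H fields cover the upper 32 bits of each.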
2453                 &aml::CreateQWordField::new(
2454                     &aml::Path::new("MINL"),
2455                     &aml::Path::new("MR64"),
2456                     &14usize,
2457                 ),
2458                 &aml::CreateDWordField::new(
2459                     &aml::Path::new("MINH"),
2460                     &aml::Path::new("MR64"),
2461                     &18usize,
2462                 ),
2463                 &aml::CreateQWordField::new(
2464                     &aml::Path::new("MAXL"),
2465                     &aml::Path::new("MR64"),
2466                     &22usize,
2467                 ),
2468                 &aml::CreateDWordField::new(
2469                     &aml::Path::new("MAXH"),
2470                     &aml::Path::new("MR64"),
2471                     &26usize,
2472                 ),
2473                 &aml::CreateQWordField::new(
2474                     &aml::Path::new("LENL"),
2475                     &aml::Path::new("MR64"),
2476                     &38usize,
2477                 ),
2478                 &aml::CreateDWordField::new(
2479                     &aml::Path::new("LENH"),
2480                     &aml::Path::new("MR64"),
2481                     &42usize,
2482                 ),
2483                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2484                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2485                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2486                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
2487                 &aml::Add::new(
2488                     &aml::Path::new("MAXL"),
2489                     &aml::Path::new("MINL"),
2490                     &aml::Path::new("LENL"),
2491                 ),
2492                 &aml::Add::new(
2493                     &aml::Path::new("MAXH"),
2494                     &aml::Path::new("MINH"),
2495                     &aml::Path::new("LENH"),
2496                 ),
2497                 &aml::If::new(
2498                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2499                     vec![&aml::Add::new(
2500                         &aml::Path::new("MAXH"),
2501                         &aml::ONE,
2502                         &aml::Path::new("MAXH"),
2503                     )],
2504                 ),
2505                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2506                 // Release lock
2507                 &aml::Release::new("MLCK".into()),
2508                 &aml::Return::new(&aml::Path::new("MR64")),
2509             ],
2510         )
2511         .to_aml_bytes(sink)
2512     }
2513 }
2514 
2515 impl Aml for MemoryManager {
2516     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2517         if let Some(acpi_address) = self.acpi_address {
2518             // Memory Hotplug Controller
2519             aml::Device::new(
2520                 "_SB_.MHPC".into(),
2521                 vec![
2522                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2523                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2524                     // Mutex to protect concurrent access as we write to choose slot and then read back status
2525                     &aml::Mutex::new("MLCK".into(), 0),
2526                     &aml::Name::new(
2527                         "_CRS".into(),
2528                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2529                             aml::AddressSpaceCacheable::NotCacheable,
2530                             true,
2531                             acpi_address.0,
2532                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2533                             None,
2534                         )]),
2535                     ),
2536                     // OpRegion and Fields map MMIO range into individual field values
2537                     &aml::OpRegion::new(
2538                         "MHPR".into(),
2539                         aml::OpRegionSpace::SystemMemory,
2540                         &(acpi_address.0 as usize),
2541                         &MEMORY_MANAGER_ACPI_SIZE,
2542                     ),
2543                     &aml::Field::new(
2544                         "MHPR".into(),
2545                         aml::FieldAccessType::DWord,
2546                         aml::FieldLockRule::NoLock,
2547                         aml::FieldUpdateRule::Preserve,
2548                         vec![
2549                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2550                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2551                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2552                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2553                         ],
2554                     ),
2555                     &aml::Field::new(
2556                         "MHPR".into(),
2557                         aml::FieldAccessType::DWord,
2558                         aml::FieldLockRule::NoLock,
2559                         aml::FieldUpdateRule::Preserve,
2560                         vec![
2561                             aml::FieldEntry::Reserved(128),
2562                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2563                         ],
2564                     ),
2565                     &aml::Field::new(
2566                         "MHPR".into(),
2567                         aml::FieldAccessType::Byte,
2568                         aml::FieldLockRule::NoLock,
2569                         aml::FieldUpdateRule::WriteAsZeroes,
2570                         vec![
2571                             aml::FieldEntry::Reserved(160),
2572                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2573                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2574                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2575                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2576                         ],
2577                     ),
2578                     &aml::Field::new(
2579                         "MHPR".into(),
2580                         aml::FieldAccessType::DWord,
2581                         aml::FieldLockRule::NoLock,
2582                         aml::FieldUpdateRule::Preserve,
2583                         vec![
2584                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2585                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2586                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2587                         ],
2588                     ),
2589                     &MemoryMethods {
2590                         slots: self.hotplug_slots.len(),
2591                     },
2592                     &MemorySlots {
2593                         slots: self.hotplug_slots.len(),
2594                     },
2595                 ],
2596             )
2597             .to_aml_bytes(sink);
2598         } else {
2599             aml::Device::new(
2600                 "_SB_.MHPC".into(),
2601                 vec![
2602                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2603                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2604                     // Empty MSCN for GED
2605                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2606                 ],
2607             )
2608             .to_aml_bytes(sink);
2609         }
2610 
2611         #[cfg(target_arch = "x86_64")]
2612         {
2613             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2614                 let min = sgx_epc_region.start().raw_value();
2615                 let max = min + sgx_epc_region.size() - 1;
2616                 // SGX EPC region
2617                 aml::Device::new(
2618                     "_SB_.EPC_".into(),
2619                     vec![
2620                         &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
2621                         // QWORD describing the EPC region start and size
2622                         &aml::Name::new(
2623                             "_CRS".into(),
2624                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2625                                 aml::AddressSpaceCacheable::NotCacheable,
2626                                 true,
2627                                 min,
2628                                 max,
2629                                 None,
2630                             )]),
2631                         ),
2632                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2633                     ],
2634                 )
2635                 .to_aml_bytes(sink);
2636             }
2637         }
2638     }
2639 }
2640 
2641 impl Pausable for MemoryManager {}
2642 
2643 #[derive(Clone, Serialize, Deserialize)]
2644 pub struct MemoryManagerSnapshotData {
2645     memory_ranges: MemoryRangeTable,
2646     guest_ram_mappings: Vec<GuestRamMapping>,
2647     start_of_device_area: u64,
2648     boot_ram: u64,
2649     current_ram: u64,
2650     arch_mem_regions: Vec<ArchMemRegion>,
2651     hotplug_slots: Vec<HotPlugState>,
2652     next_memory_slot: u32,
2653     selected_slot: usize,
2654     next_hotplug_slot: usize,
2655 }
2656 
2657 impl Snapshottable for MemoryManager {
2658     fn id(&self) -> String {
2659         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2660     }
2661 
2662     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2663         let memory_ranges = self.memory_range_table(true)?;
2664 
2665         // Store this list of ranges locally as it will be used by the
2666         // Transportable::send() implementation. The point is to avoid the
2667         // duplication of code regarding the creation of the path for each
2668         // region. The 'snapshot' step creates the list of memory regions,
2669         // including information about the need to copy a memory region or
2670         // not. This saves the 'send' step having to go through the same
2671         // process, and instead it can directly proceed with storing the
2672         // memory range content for the ranges requiring it.
2673         self.snapshot_memory_ranges = memory_ranges;
2674 
2675         Ok(Snapshot::from_data(SnapshotData::new_from_state(
2676             &self.snapshot_data(),
2677         )?))
2678     }
2679 }
2680 
2681 impl Transportable for MemoryManager {
2682     fn send(
2683         &self,
2684         _snapshot: &Snapshot,
2685         destination_url: &str,
2686     ) -> result::Result<(), MigratableError> {
2687         if self.snapshot_memory_ranges.is_empty() {
2688             return Ok(());
2689         }
2690 
2691         let mut memory_file_path = url_to_path(destination_url)?;
2692         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2693 
2694         // Create the snapshot file for the entire memory
2695         let mut memory_file = OpenOptions::new()
2696             .read(true)
2697             .write(true)
2698             .create_new(true)
2699             .open(memory_file_path)
2700             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2701 
2702         let guest_memory = self.guest_memory.memory();
2703 
2704         for range in self.snapshot_memory_ranges.regions() {
2705             let mut offset: u64 = 0;
2706             // Here we are manually handling the retry in case we can't write
2707             // the whole region at once, because we can't use the
2708             // write_all_to() implementation from vm-memory::GuestMemory as it
2709             // does not follow the correct behavior. For more info about this
2710             // issue see: https://github.com/rust-vmm/vm-memory/issues/174
2711             loop {
2712                 let bytes_written = guest_memory
2713                     .write_volatile_to(
2714                         GuestAddress(range.gpa + offset),
2715                         &mut memory_file,
2716                         (range.length - offset) as usize,
2717                     )
2718                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2719                 offset += bytes_written as u64;
2720 
2721                 if offset == range.length {
2722                     break;
2723                 }
2724             }
2725         }
2726         Ok(())
2727     }
2728 }
2729 
2730 impl Migratable for MemoryManager {
2731     // Start the dirty log in the hypervisor (kvm/mshv).
2732     // Also, reset the dirty bitmap logged by the vmm.
2733     // Just before we do a bulk copy we want to start/clear the dirty log so that
2734     // pages touched during our bulk copy are tracked.
2735     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2736         self.vm.start_dirty_log().map_err(|e| {
2737             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2738         })?;
2739 
2740         for r in self.guest_memory.memory().iter() {
2741             r.bitmap().reset();
2742         }
2743 
2744         Ok(())
2745     }
2746 
2747     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2748         self.vm.stop_dirty_log().map_err(|e| {
2749             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2750         })?;
2751 
2752         Ok(())
2753     }
2754 
2755     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2756     // together in the table if they are contiguous.
2757     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2758         let mut table = MemoryRangeTable::default();
2759         for r in &self.guest_ram_mappings {
2760             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2761                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2762             })?;
2763             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2764             {
2765                 Some(region) => {
2766                     assert!(region.start_addr().raw_value() == r.gpa);
2767                     assert!(region.len() == r.size);
2768                     region.bitmap().get_and_reset()
2769                 }
2770                 None => {
2771                     return Err(MigratableError::MigrateSend(anyhow!(
2772                         "Error finding 'guest memory region' with address {:x}",
2773                         r.gpa
2774                     )))
2775                 }
2776             };
2777 
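                 // A page is dirty if either the hypervisor log or the VMM's own
                 // per-region bitmap flagged it, hence the bitwise OR of the two.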
2778             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2779                 .iter()
2780                 .zip(vmm_dirty_bitmap.iter())
2781                 .map(|(x, y)| x | y)
2782                 .collect();
2783 
2784             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2785 
2786             if sub_table.regions().is_empty() {
2787                 info!("Dirty Memory Range Table is empty");
2788             } else {
2789                 info!("Dirty Memory Range Table:");
2790                 for range in sub_table.regions() {
2791                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2792                 }
2793             }
2794 
2795             table.extend(sub_table);
2796         }
2797         Ok(table)
2798     }
2799 }
2800