xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision 07d1208dd53a207a65b649b8952780dfd0ca59d9)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 #[cfg(target_arch = "x86_64")]
6 use crate::config::SgxEpcConfig;
7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
8 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
9 use crate::coredump::{
10     CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
11 };
12 use crate::migration::url_to_path;
13 use crate::MEMORY_MANAGER_SNAPSHOT_ID;
14 use crate::{GuestMemoryMmap, GuestRegionMmap};
15 use acpi_tables::{aml, Aml};
16 use anyhow::anyhow;
17 #[cfg(target_arch = "x86_64")]
18 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
19 use arch::{layout, RegionType};
20 #[cfg(target_arch = "x86_64")]
21 use devices::ioapic;
22 #[cfg(target_arch = "aarch64")]
23 use hypervisor::HypervisorVmError;
24 #[cfg(target_arch = "x86_64")]
25 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
26 use serde::{Deserialize, Serialize};
27 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
28 use std::collections::BTreeMap;
29 use std::collections::HashMap;
30 use std::convert::TryInto;
31 use std::ffi;
32 use std::fs::{File, OpenOptions};
33 use std::io::{self, Read};
34 use std::ops::{BitAnd, Deref, Not, Sub};
35 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
36 use std::path::PathBuf;
37 use std::result;
38 use std::sync::{Arc, Barrier, Mutex};
39 use tracer::trace_scoped;
40 use versionize::{VersionMap, Versionize, VersionizeResult};
41 use versionize_derive::Versionize;
42 use virtio_devices::BlocksState;
43 #[cfg(target_arch = "x86_64")]
44 use vm_allocator::GsiApic;
45 use vm_allocator::{AddressAllocator, SystemAllocator};
46 use vm_device::BusDevice;
47 use vm_memory::bitmap::AtomicBitmap;
48 use vm_memory::guest_memory::FileOffset;
49 use vm_memory::{
50     mmap::MmapRegionError, Address, Bytes, Error as MmapError, GuestAddress, GuestAddressSpace,
51     GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
52 };
53 use vm_migration::{
54     protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
55     Snapshot, SnapshotData, Snapshottable, Transportable, VersionMapped,
56 };
57 
58 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
59 
60 const DEFAULT_MEMORY_ZONE: &str = "mem0";
61 
62 const SNAPSHOT_FILENAME: &str = "memory-ranges";
63 
64 #[cfg(target_arch = "x86_64")]
65 const X86_64_IRQ_BASE: u32 = 5;
66 
67 #[cfg(target_arch = "x86_64")]
68 const SGX_PAGE_SIZE: u64 = 1 << 12;
69 
70 const HOTPLUG_COUNT: usize = 8;
71 
72 // Memory policy constants
73 const MPOL_BIND: u32 = 2;
74 const MPOL_MF_STRICT: u32 = 1;
75 const MPOL_MF_MOVE: u32 = 1 << 1;
76 
77 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
78 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
79 
80 #[derive(Clone, Default, Serialize, Deserialize, Versionize)]
81 struct HotPlugState {
82     base: u64,
83     length: u64,
84     active: bool,
85     inserting: bool,
86     removing: bool,
87 }
88 
89 pub struct VirtioMemZone {
90     region: Arc<GuestRegionMmap>,
91     virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
92     hotplugged_size: u64,
93     hugepages: bool,
94     blocks_state: Arc<Mutex<BlocksState>>,
95 }
96 
97 impl VirtioMemZone {
98     pub fn region(&self) -> &Arc<GuestRegionMmap> {
99         &self.region
100     }
101     pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
102         self.virtio_device = Some(virtio_device);
103     }
104     pub fn hotplugged_size(&self) -> u64 {
105         self.hotplugged_size
106     }
107     pub fn hugepages(&self) -> bool {
108         self.hugepages
109     }
110     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
111         &self.blocks_state
112     }
113     pub fn plugged_ranges(&self) -> MemoryRangeTable {
114         self.blocks_state
115             .lock()
116             .unwrap()
117             .memory_ranges(self.region.start_addr().raw_value(), true)
118     }
119 }
120 
121 #[derive(Default)]
122 pub struct MemoryZone {
123     regions: Vec<Arc<GuestRegionMmap>>,
124     virtio_mem_zone: Option<VirtioMemZone>,
125 }
126 
127 impl MemoryZone {
128     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
129         &self.regions
130     }
131     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
132         &self.virtio_mem_zone
133     }
134     pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
135         self.virtio_mem_zone.as_mut()
136     }
137 }
138 
139 pub type MemoryZones = HashMap<String, MemoryZone>;
140 
141 #[derive(Clone, Serialize, Deserialize, Versionize)]
142 struct GuestRamMapping {
143     slot: u32,
144     gpa: u64,
145     size: u64,
146     zone_id: String,
147     virtio_mem: bool,
148     file_offset: u64,
149 }
150 
151 #[derive(Clone, Serialize, Deserialize, Versionize)]
152 struct ArchMemRegion {
153     base: u64,
154     size: usize,
155     r_type: RegionType,
156 }
157 
158 pub struct MemoryManager {
159     boot_guest_memory: GuestMemoryMmap,
160     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
161     next_memory_slot: u32,
162     start_of_device_area: GuestAddress,
163     end_of_device_area: GuestAddress,
164     end_of_ram_area: GuestAddress,
165     pub vm: Arc<dyn hypervisor::Vm>,
166     hotplug_slots: Vec<HotPlugState>,
167     selected_slot: usize,
168     mergeable: bool,
169     allocator: Arc<Mutex<SystemAllocator>>,
170     hotplug_method: HotplugMethod,
171     boot_ram: u64,
172     current_ram: u64,
173     next_hotplug_slot: usize,
174     shared: bool,
175     hugepages: bool,
176     hugepage_size: Option<u64>,
177     prefault: bool,
178     thp: bool,
179     #[cfg(target_arch = "x86_64")]
180     sgx_epc_region: Option<SgxEpcRegion>,
181     user_provided_zones: bool,
182     snapshot_memory_ranges: MemoryRangeTable,
183     memory_zones: MemoryZones,
184     log_dirty: bool, // Enable dirty logging for created RAM regions
185     arch_mem_regions: Vec<ArchMemRegion>,
186     ram_allocator: AddressAllocator,
187     dynamic: bool,
188 
189     // Keep track of calls to create_userspace_mapping() for guest RAM.
190     // This is useful for getting the dirty pages as we need to know the
191     // slots that the mapping is created in.
192     guest_ram_mappings: Vec<GuestRamMapping>,
193 
194     pub acpi_address: Option<GuestAddress>,
195     #[cfg(target_arch = "aarch64")]
196     uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
197 }
198 
199 #[derive(Debug)]
200 pub enum Error {
201     /// Failed to create shared file.
202     SharedFileCreate(io::Error),
203 
204     /// Failed to set shared file length.
205     SharedFileSetLen(io::Error),
206 
207     /// Mmap backed guest memory error
208     GuestMemory(MmapError),
209 
210     /// Failed to allocate a memory range.
211     MemoryRangeAllocation,
212 
213     /// Error from region creation
214     GuestMemoryRegion(MmapRegionError),
215 
216     /// No ACPI slot available
217     NoSlotAvailable,
218 
219     /// Not enough space in the hotplug RAM region
220     InsufficientHotplugRam,
221 
222     /// The requested hotplug memory addition is not a valid size
223     InvalidSize,
224 
225     /// Failed to create the user memory region.
226     CreateUserMemoryRegion(hypervisor::HypervisorVmError),
227 
228     /// Failed to remove the user memory region.
229     RemoveUserMemoryRegion(hypervisor::HypervisorVmError),
230 
231     /// Failed to EventFd.
232     EventFdFail(io::Error),
233 
234     /// Eventfd write error
235     EventfdError(io::Error),
236 
237     /// Failed to virtio-mem resize
238     VirtioMemResizeFail(virtio_devices::mem::Error),
239 
240     /// Cannot restore VM
241     Restore(MigratableError),
242 
243     /// Cannot restore VM because source URL is missing
244     RestoreMissingSourceUrl,
245 
246     /// Cannot create the system allocator
247     CreateSystemAllocator,
248 
249     /// Invalid SGX EPC section size
250     #[cfg(target_arch = "x86_64")]
251     EpcSectionSizeInvalid,
252 
253     /// Failed allocating SGX EPC region
254     #[cfg(target_arch = "x86_64")]
255     SgxEpcRangeAllocation,
256 
257     /// Failed opening SGX virtual EPC device
258     #[cfg(target_arch = "x86_64")]
259     SgxVirtEpcOpen(io::Error),
260 
261     /// Failed setting the SGX virtual EPC section size
262     #[cfg(target_arch = "x86_64")]
263     SgxVirtEpcFileSetLen(io::Error),
264 
265     /// Failed opening SGX provisioning device
266     #[cfg(target_arch = "x86_64")]
267     SgxProvisionOpen(io::Error),
268 
269     /// Failed enabling SGX provisioning
270     #[cfg(target_arch = "x86_64")]
271     SgxEnableProvisioning(hypervisor::HypervisorVmError),
272 
273     /// Failed creating a new MmapRegion instance.
274     #[cfg(target_arch = "x86_64")]
275     NewMmapRegion(vm_memory::mmap::MmapRegionError),
276 
277     /// No memory zones found.
278     MissingMemoryZones,
279 
280     /// Memory configuration is not valid.
281     InvalidMemoryParameters,
282 
283     /// Forbidden operation. Impossible to resize guest memory if it is
284     /// backed by user defined memory regions.
285     InvalidResizeWithMemoryZones,
286 
287     /// It's invalid to try applying a NUMA policy to a memory zone that is
288     /// memory mapped with MAP_SHARED.
289     InvalidSharedMemoryZoneWithHostNuma,
290 
291     /// Failed applying NUMA memory policy.
292     ApplyNumaPolicy(io::Error),
293 
294     /// Memory zone identifier is not unique.
295     DuplicateZoneId,
296 
297     /// No virtio-mem resizing handler found.
298     MissingVirtioMemHandler,
299 
300     /// Unknown memory zone.
301     UnknownMemoryZone,
302 
303     /// Invalid size for resizing. Can be anything except 0.
304     InvalidHotplugSize,
305 
306     /// Invalid hotplug method associated with memory zones resizing capability.
307     InvalidHotplugMethodWithMemoryZones,
308 
309     /// Could not find specified memory zone identifier from hash map.
310     MissingZoneIdentifier,
311 
312     /// Resizing the memory zone failed.
313     ResizeZone,
314 
315     /// Guest address overflow
316     GuestAddressOverFlow,
317 
318     /// Error opening snapshot file
319     SnapshotOpen(io::Error),
320 
321     // Error copying snapshot into region
322     SnapshotCopy(GuestMemoryError),
323 
324     /// Failed to allocate MMIO address
325     AllocateMmioAddress,
326 
327     #[cfg(target_arch = "aarch64")]
328     /// Failed to create UEFI flash
329     CreateUefiFlash(HypervisorVmError),
330 
331     /// Using a directory as a backing file for memory is not supported
332     DirectoryAsBackingFileForMemory,
333 
334     /// Failed to stat filesystem
335     GetFileSystemBlockSize(io::Error),
336 
337     /// Memory size is misaligned with default page size or its hugepage size
338     MisalignedMemorySize,
339 }
340 
341 const ENABLE_FLAG: usize = 0;
342 const INSERTING_FLAG: usize = 1;
343 const REMOVING_FLAG: usize = 2;
344 const EJECT_FLAG: usize = 3;
345 
346 const BASE_OFFSET_LOW: u64 = 0;
347 const BASE_OFFSET_HIGH: u64 = 0x4;
348 const LENGTH_OFFSET_LOW: u64 = 0x8;
349 const LENGTH_OFFSET_HIGH: u64 = 0xC;
350 const STATUS_OFFSET: u64 = 0x14;
351 const SELECTION_OFFSET: u64 = 0;
352 
353 // The MMIO address space size is subtracted with 64k. This is done for the
354 // following reasons:
355 //  - Reduce the addressable space size by at least 4k to workaround a Linux
356 //    bug when the VMM allocates devices at the end of the addressable space
357 //  - Windows requires the addressable space size to be 64k aligned
358 fn mmio_address_space_size(phys_bits: u8) -> u64 {
359     (1 << phys_bits) - (1 << 16)
360 }
361 
362 // The `statfs` function can get information of hugetlbfs, and the hugepage size is in the
363 // `f_bsize` field.
364 //
365 // See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
366 fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
367     let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
368     let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();
369 
370     // SAFETY: FFI call with a valid path and buffer
371     let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
372     if ret != 0 {
373         return Err(Error::GetFileSystemBlockSize(
374             std::io::Error::last_os_error(),
375         ));
376     }
377 
378     // SAFETY: `buf` is valid at this point
379     // Because this value is always positive, just convert it directly.
380     // Note that the `f_bsize` is `i64` in glibc and `u64` in musl, using `as u64` will be warned
381     // by `clippy` on musl target.  To avoid the warning, there should be `as _` instead of
382     // `as u64`.
383     let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
384     Ok(bsize)
385 }
386 
387 fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
388     // SAFETY: FFI call. Trivially safe.
389     let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
390 
391     // There is no backend file and the `hugepages` is disabled, just use system page size.
392     if zone.file.is_none() && !zone.hugepages {
393         return Ok(page_size);
394     }
395 
396     // The `hugepages` is enabled and the `hugepage_size` is specified, just use it directly.
397     if zone.hugepages && zone.hugepage_size.is_some() {
398         return Ok(zone.hugepage_size.unwrap());
399     }
400 
401     // There are two scenarios here:
402     //  - `hugepages` is enabled but `hugepage_size` is not specified:
403     //     Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
404     //  - The backing file is specified:
405     //     Call `statfs` for the file and get its `f_bsize`.  If the value is larger than the page
406     //     size of normal page, just use the `f_bsize` because the file is in a hugetlbfs.  If the
407     //     value is less than or equal to the page size, just use the page size.
408     let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| {
409         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
410     })?;
411 
412     let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
413 
414     Ok(align_size)
415 }
416 
417 #[inline]
418 fn align_down<T>(val: T, align: T) -> T
419 where
420     T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>,
421 {
422     val & !(align - 1u8.into())
423 }
424 
425 #[inline]
426 fn is_aligned<T>(val: T, align: T) -> bool
427 where
428     T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq,
429 {
430     (val & (align - 1u8.into())) == 0u8.into()
431 }
432 
433 impl BusDevice for MemoryManager {
434     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
435         if self.selected_slot < self.hotplug_slots.len() {
436             let state = &self.hotplug_slots[self.selected_slot];
437             match offset {
438                 BASE_OFFSET_LOW => {
439                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
440                 }
441                 BASE_OFFSET_HIGH => {
442                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
443                 }
444                 LENGTH_OFFSET_LOW => {
445                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
446                 }
447                 LENGTH_OFFSET_HIGH => {
448                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
449                 }
450                 STATUS_OFFSET => {
451                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
452                     data.fill(0);
453                     if state.active {
454                         data[0] |= 1 << ENABLE_FLAG;
455                     }
456                     if state.inserting {
457                         data[0] |= 1 << INSERTING_FLAG;
458                     }
459                     if state.removing {
460                         data[0] |= 1 << REMOVING_FLAG;
461                     }
462                 }
463                 _ => {
464                     warn!(
465                         "Unexpected offset for accessing memory manager device: {:#}",
466                         offset
467                     );
468                 }
469             }
470         } else {
471             warn!("Out of range memory slot: {}", self.selected_slot);
472         }
473     }
474 
475     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
476         match offset {
477             SELECTION_OFFSET => {
478                 self.selected_slot = usize::from(data[0]);
479             }
480             STATUS_OFFSET => {
481                 if self.selected_slot < self.hotplug_slots.len() {
482                     let state = &mut self.hotplug_slots[self.selected_slot];
483                     // The ACPI code writes back a 1 to acknowledge the insertion
484                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
485                         state.inserting = false;
486                     }
487                     // Ditto for removal
488                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
489                         state.removing = false;
490                     }
491                     // Trigger removal of "DIMM"
492                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
493                         warn!("Ejection of memory not currently supported");
494                     }
495                 } else {
496                     warn!("Out of range memory slot: {}", self.selected_slot);
497                 }
498             }
499             _ => {
500                 warn!(
501                     "Unexpected offset for accessing memory manager device: {:#}",
502                     offset
503                 );
504             }
505         };
506         None
507     }
508 }
509 
510 impl MemoryManager {
511     /// Creates all memory regions based on the available RAM ranges defined
512     /// by `ram_regions`, and based on the description of the memory zones.
513     /// In practice, this function can perform multiple memory mappings of the
514     /// same backing file if there's a hole in the address space between two
515     /// RAM ranges.
516     /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G)
517     /// and zones containing two zones (size 1G and size 4G).
518     /// This function will create 3 resulting memory regions:
519     /// - First one mapping entirely the first memory zone on 0-1G range
520     /// - Second one mapping partially the second memory zone on 1G-3G range
521     /// - Third one mapping partially the second memory zone on 4G-6G range
522     /// Also, all memory regions are page-size aligned (e.g. their sizes must
523     /// be multiple of page-size), which may leave an additional hole in the
524     /// address space when hugepage is used.
525     fn create_memory_regions_from_zones(
526         ram_regions: &[(GuestAddress, usize)],
527         zones: &[MemoryZoneConfig],
528         prefault: Option<bool>,
529         thp: bool,
530     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
531         let mut zone_iter = zones.iter();
532         let mut mem_regions = Vec::new();
533         let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?;
534         let mut zone_align_size = memory_zone_get_align_size(zone)?;
535         let mut zone_offset = 0u64;
536         let mut memory_zones = HashMap::new();
537 
538         if !is_aligned(zone.size, zone_align_size) {
539             return Err(Error::MisalignedMemorySize);
540         }
541 
542         // Add zone id to the list of memory zones.
543         memory_zones.insert(zone.id.clone(), MemoryZone::default());
544 
545         for ram_region in ram_regions.iter() {
546             let mut ram_region_offset = 0;
547             let mut exit = false;
548 
549             loop {
550                 let mut ram_region_consumed = false;
551                 let mut pull_next_zone = false;
552 
553                 let ram_region_available_size =
554                     align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size);
555                 if ram_region_available_size == 0 {
556                     break;
557                 }
558                 let zone_sub_size = zone.size - zone_offset;
559 
560                 let file_offset = zone_offset;
561                 let region_start = ram_region
562                     .0
563                     .checked_add(ram_region_offset)
564                     .ok_or(Error::GuestAddressOverFlow)?;
565                 let region_size = if zone_sub_size <= ram_region_available_size {
566                     if zone_sub_size == ram_region_available_size {
567                         ram_region_consumed = true;
568                     }
569 
570                     ram_region_offset += zone_sub_size;
571                     pull_next_zone = true;
572 
573                     zone_sub_size
574                 } else {
575                     zone_offset += ram_region_available_size;
576                     ram_region_consumed = true;
577 
578                     ram_region_available_size
579                 };
580 
581                 info!(
582                     "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}",
583                     zone.id,
584                     region_start.raw_value(),
585                     region_size
586                 );
587                 let region = MemoryManager::create_ram_region(
588                     &zone.file,
589                     file_offset,
590                     region_start,
591                     region_size as usize,
592                     prefault.unwrap_or(zone.prefault),
593                     zone.shared,
594                     zone.hugepages,
595                     zone.hugepage_size,
596                     zone.host_numa_node,
597                     None,
598                     thp,
599                 )?;
600 
601                 // Add region to the list of regions associated with the
602                 // current memory zone.
603                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
604                     memory_zone.regions.push(region.clone());
605                 }
606 
607                 mem_regions.push(region);
608 
609                 if pull_next_zone {
610                     // Get the next zone and reset the offset.
611                     zone_offset = 0;
612                     if let Some(z) = zone_iter.next() {
613                         zone = z;
614                     } else {
615                         exit = true;
616                         break;
617                     }
618                     zone_align_size = memory_zone_get_align_size(zone)?;
619                     if !is_aligned(zone.size, zone_align_size) {
620                         return Err(Error::MisalignedMemorySize);
621                     }
622 
623                     // Check if zone id already exist. In case it does, throw
624                     // an error as we need unique identifiers. Otherwise, add
625                     // the new zone id to the list of memory zones.
626                     if memory_zones.contains_key(&zone.id) {
627                         error!(
628                             "Memory zone identifier '{}' found more than once. \
629                             It must be unique",
630                             zone.id,
631                         );
632                         return Err(Error::DuplicateZoneId);
633                     }
634                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
635                 }
636 
637                 if ram_region_consumed {
638                     break;
639                 }
640             }
641 
642             if exit {
643                 break;
644             }
645         }
646 
647         Ok((mem_regions, memory_zones))
648     }
649 
650     // Restore both GuestMemory regions along with MemoryZone zones.
651     fn restore_memory_regions_and_zones(
652         guest_ram_mappings: &[GuestRamMapping],
653         zones_config: &[MemoryZoneConfig],
654         prefault: Option<bool>,
655         mut existing_memory_files: HashMap<u32, File>,
656         thp: bool,
657     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
658         let mut memory_regions = Vec::new();
659         let mut memory_zones = HashMap::new();
660 
661         for zone_config in zones_config {
662             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
663         }
664 
665         for guest_ram_mapping in guest_ram_mappings {
666             for zone_config in zones_config {
667                 if guest_ram_mapping.zone_id == zone_config.id {
668                     let region = MemoryManager::create_ram_region(
669                         &zone_config.file,
670                         guest_ram_mapping.file_offset,
671                         GuestAddress(guest_ram_mapping.gpa),
672                         guest_ram_mapping.size as usize,
673                         prefault.unwrap_or(zone_config.prefault),
674                         zone_config.shared,
675                         zone_config.hugepages,
676                         zone_config.hugepage_size,
677                         zone_config.host_numa_node,
678                         existing_memory_files.remove(&guest_ram_mapping.slot),
679                         thp,
680                     )?;
681                     memory_regions.push(Arc::clone(&region));
682                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
683                         if guest_ram_mapping.virtio_mem {
684                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
685                             let region_size = region.len();
686                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
687                                 region,
688                                 virtio_device: None,
689                                 hotplugged_size,
690                                 hugepages: zone_config.hugepages,
691                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
692                             });
693                         } else {
694                             memory_zone.regions.push(region);
695                         }
696                     }
697                 }
698             }
699         }
700 
701         memory_regions.sort_by_key(|x| x.start_addr());
702 
703         Ok((memory_regions, memory_zones))
704     }
705 
706     fn fill_saved_regions(
707         &mut self,
708         file_path: PathBuf,
709         saved_regions: MemoryRangeTable,
710     ) -> Result<(), Error> {
711         if saved_regions.is_empty() {
712             return Ok(());
713         }
714 
715         // Open (read only) the snapshot file.
716         let mut memory_file = OpenOptions::new()
717             .read(true)
718             .open(file_path)
719             .map_err(Error::SnapshotOpen)?;
720 
721         let guest_memory = self.guest_memory.memory();
722         for range in saved_regions.regions() {
723             let mut offset: u64 = 0;
724             // Here we are manually handling the retry in case we can't write
725             // the whole region at once because we can't use the implementation
726             // from vm-memory::GuestMemory of read_exact_from() as it is not
727             // following the correct behavior. For more info about this issue
728             // see: https://github.com/rust-vmm/vm-memory/issues/174
729             loop {
730                 let bytes_read = guest_memory
731                     .read_from(
732                         GuestAddress(range.gpa + offset),
733                         &mut memory_file,
734                         (range.length - offset) as usize,
735                     )
736                     .map_err(Error::SnapshotCopy)?;
737                 offset += bytes_read as u64;
738 
739                 if offset == range.length {
740                     break;
741                 }
742             }
743         }
744 
745         Ok(())
746     }
747 
748     fn validate_memory_config(
749         config: &MemoryConfig,
750         user_provided_zones: bool,
751     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
752         let mut allow_mem_hotplug = false;
753 
754         if !user_provided_zones {
755             if config.zones.is_some() {
756                 error!(
757                     "User defined memory regions can't be provided if the \
758                     memory size is not 0"
759                 );
760                 return Err(Error::InvalidMemoryParameters);
761             }
762 
763             if config.hotplug_size.is_some() {
764                 allow_mem_hotplug = true;
765             }
766 
767             if let Some(hotplugged_size) = config.hotplugged_size {
768                 if let Some(hotplug_size) = config.hotplug_size {
769                     if hotplugged_size > hotplug_size {
770                         error!(
771                             "'hotplugged_size' {} can't be bigger than \
772                             'hotplug_size' {}",
773                             hotplugged_size, hotplug_size,
774                         );
775                         return Err(Error::InvalidMemoryParameters);
776                     }
777                 } else {
778                     error!(
779                         "Invalid to define 'hotplugged_size' when there is\
780                         no 'hotplug_size'"
781                     );
782                     return Err(Error::InvalidMemoryParameters);
783                 }
784                 if config.hotplug_method == HotplugMethod::Acpi {
785                     error!(
786                         "Invalid to define 'hotplugged_size' with hotplug \
787                         method 'acpi'"
788                     );
789                     return Err(Error::InvalidMemoryParameters);
790                 }
791             }
792 
793             // Create a single zone from the global memory config. This lets
794             // us reuse the codepath for user defined memory zones.
795             let zones = vec![MemoryZoneConfig {
796                 id: String::from(DEFAULT_MEMORY_ZONE),
797                 size: config.size,
798                 file: None,
799                 shared: config.shared,
800                 hugepages: config.hugepages,
801                 hugepage_size: config.hugepage_size,
802                 host_numa_node: None,
803                 hotplug_size: config.hotplug_size,
804                 hotplugged_size: config.hotplugged_size,
805                 prefault: config.prefault,
806             }];
807 
808             Ok((config.size, zones, allow_mem_hotplug))
809         } else {
810             if config.zones.is_none() {
811                 error!(
812                     "User defined memory regions must be provided if the \
813                     memory size is 0"
814                 );
815                 return Err(Error::MissingMemoryZones);
816             }
817 
818             // Safe to unwrap as we checked right above there were some
819             // regions.
820             let zones = config.zones.clone().unwrap();
821             if zones.is_empty() {
822                 return Err(Error::MissingMemoryZones);
823             }
824 
825             let mut total_ram_size: u64 = 0;
826             for zone in zones.iter() {
827                 total_ram_size += zone.size;
828 
829                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
830                     error!(
831                         "Invalid to set host NUMA policy for a memory zone \
832                         backed by a regular file and mapped as 'shared'"
833                     );
834                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
835                 }
836 
837                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
838                     error!("Invalid to set ACPI hotplug method for memory zones");
839                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
840                 }
841 
842                 if let Some(hotplugged_size) = zone.hotplugged_size {
843                     if let Some(hotplug_size) = zone.hotplug_size {
844                         if hotplugged_size > hotplug_size {
845                             error!(
846                                 "'hotplugged_size' {} can't be bigger than \
847                                 'hotplug_size' {}",
848                                 hotplugged_size, hotplug_size,
849                             );
850                             return Err(Error::InvalidMemoryParameters);
851                         }
852                     } else {
853                         error!(
854                             "Invalid to define 'hotplugged_size' when there is\
855                             no 'hotplug_size' for a memory zone"
856                         );
857                         return Err(Error::InvalidMemoryParameters);
858                     }
859                     if config.hotplug_method == HotplugMethod::Acpi {
860                         error!(
861                             "Invalid to define 'hotplugged_size' with hotplug \
862                             method 'acpi'"
863                         );
864                         return Err(Error::InvalidMemoryParameters);
865                     }
866                 }
867             }
868 
869             Ok((total_ram_size, zones, allow_mem_hotplug))
870         }
871     }
872 
873     pub fn allocate_address_space(&mut self) -> Result<(), Error> {
874         let mut list = Vec::new();
875 
876         for (zone_id, memory_zone) in self.memory_zones.iter() {
877             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
878                 memory_zone
879                     .regions()
880                     .iter()
881                     .map(|r| (r.clone(), false))
882                     .collect();
883 
884             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
885                 regions.push((virtio_mem_zone.region().clone(), true));
886             }
887 
888             list.push((zone_id.clone(), regions));
889         }
890 
891         for (zone_id, regions) in list {
892             for (region, virtio_mem) in regions {
893                 let slot = self.create_userspace_mapping(
894                     region.start_addr().raw_value(),
895                     region.len(),
896                     region.as_ptr() as u64,
897                     self.mergeable,
898                     false,
899                     self.log_dirty,
900                 )?;
901 
902                 let file_offset = if let Some(file_offset) = region.file_offset() {
903                     file_offset.start()
904                 } else {
905                     0
906                 };
907 
908                 self.guest_ram_mappings.push(GuestRamMapping {
909                     gpa: region.start_addr().raw_value(),
910                     size: region.len(),
911                     slot,
912                     zone_id: zone_id.clone(),
913                     virtio_mem,
914                     file_offset,
915                 });
916                 self.ram_allocator
917                     .allocate(Some(region.start_addr()), region.len(), None)
918                     .ok_or(Error::MemoryRangeAllocation)?;
919             }
920         }
921 
922         // Allocate SubRegion and Reserved address ranges.
923         for region in self.arch_mem_regions.iter() {
924             if region.r_type == RegionType::Ram {
925                 // Ignore the RAM type since ranges have already been allocated
926                 // based on the GuestMemory regions.
927                 continue;
928             }
929             self.ram_allocator
930                 .allocate(
931                     Some(GuestAddress(region.base)),
932                     region.size as GuestUsize,
933                     None,
934                 )
935                 .ok_or(Error::MemoryRangeAllocation)?;
936         }
937 
938         Ok(())
939     }
940 
941     #[cfg(target_arch = "aarch64")]
942     fn add_uefi_flash(&mut self) -> Result<(), Error> {
943         // On AArch64, the UEFI binary requires a flash device at address 0.
944         // 4 MiB memory is mapped to simulate the flash.
945         let uefi_mem_slot = self.allocate_memory_slot();
946         let uefi_region = GuestRegionMmap::new(
947             MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
948             arch::layout::UEFI_START,
949         )
950         .unwrap();
951         let uefi_mem_region = self.vm.make_user_memory_region(
952             uefi_mem_slot,
953             uefi_region.start_addr().raw_value(),
954             uefi_region.len(),
955             uefi_region.as_ptr() as u64,
956             false,
957             false,
958         );
959         self.vm
960             .create_user_memory_region(uefi_mem_region)
961             .map_err(Error::CreateUefiFlash)?;
962 
963         let uefi_flash =
964             GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());
965 
966         self.uefi_flash = Some(uefi_flash);
967 
968         Ok(())
969     }
970 
971     #[allow(clippy::too_many_arguments)]
972     pub fn new(
973         vm: Arc<dyn hypervisor::Vm>,
974         config: &MemoryConfig,
975         prefault: Option<bool>,
976         phys_bits: u8,
977         #[cfg(feature = "tdx")] tdx_enabled: bool,
978         restore_data: Option<&MemoryManagerSnapshotData>,
979         existing_memory_files: Option<HashMap<u32, File>>,
980         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
981     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
982         trace_scoped!("MemoryManager::new");
983 
984         let user_provided_zones = config.size == 0;
985 
986         let mmio_address_space_size = mmio_address_space_size(phys_bits);
987         debug_assert_eq!(
988             (((mmio_address_space_size) >> 16) << 16),
989             mmio_address_space_size
990         );
991         let start_of_platform_device_area =
992             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
993         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
994 
995         let (ram_size, zones, allow_mem_hotplug) =
996             Self::validate_memory_config(config, user_provided_zones)?;
997 
998         let (
999             start_of_device_area,
1000             boot_ram,
1001             current_ram,
1002             arch_mem_regions,
1003             memory_zones,
1004             guest_memory,
1005             boot_guest_memory,
1006             hotplug_slots,
1007             next_memory_slot,
1008             selected_slot,
1009             next_hotplug_slot,
1010         ) = if let Some(data) = restore_data {
1011             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
1012                 &data.guest_ram_mappings,
1013                 &zones,
1014                 prefault,
1015                 existing_memory_files.unwrap_or_default(),
1016                 config.thp,
1017             )?;
1018             let guest_memory =
1019                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
1020             let boot_guest_memory = guest_memory.clone();
1021             (
1022                 GuestAddress(data.start_of_device_area),
1023                 data.boot_ram,
1024                 data.current_ram,
1025                 data.arch_mem_regions.clone(),
1026                 memory_zones,
1027                 guest_memory,
1028                 boot_guest_memory,
1029                 data.hotplug_slots.clone(),
1030                 data.next_memory_slot,
1031                 data.selected_slot,
1032                 data.next_hotplug_slot,
1033             )
1034         } else {
1035             // Init guest memory
1036             let arch_mem_regions = arch::arch_memory_regions();
1037 
1038             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
1039                 .iter()
1040                 .filter(|r| r.2 == RegionType::Ram)
1041                 .map(|r| (r.0, r.1))
1042                 .collect();
1043 
1044             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
1045                 .iter()
1046                 .map(|(a, b, c)| ArchMemRegion {
1047                     base: a.0,
1048                     size: *b,
1049                     r_type: *c,
1050                 })
1051                 .collect();
1052 
1053             let (mem_regions, mut memory_zones) =
1054                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;
1055 
1056             let mut guest_memory =
1057                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
1058 
1059             let boot_guest_memory = guest_memory.clone();
1060 
1061             let mut start_of_device_area =
1062                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
1063 
1064             // Update list of memory zones for resize.
1065             for zone in zones.iter() {
1066                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
1067                     if let Some(hotplug_size) = zone.hotplug_size {
1068                         if hotplug_size == 0 {
1069                             error!("'hotplug_size' can't be 0");
1070                             return Err(Error::InvalidHotplugSize);
1071                         }
1072 
1073                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
1074                             start_of_device_area = start_of_device_area
1075                                 .checked_add(hotplug_size)
1076                                 .ok_or(Error::GuestAddressOverFlow)?;
1077                         } else {
1078                             // Alignment must be "natural" i.e. same as size of block
1079                             let start_addr = GuestAddress(
1080                                 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
1081                                     - 1)
1082                                     / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
1083                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
1084                             );
1085 
1086                             // When `prefault` is set by vm_restore, memory manager
1087                             // will create ram region with `prefault` option in
1088                             // restore config rather than same option in zone
1089                             let region = MemoryManager::create_ram_region(
1090                                 &None,
1091                                 0,
1092                                 start_addr,
1093                                 hotplug_size as usize,
1094                                 prefault.unwrap_or(zone.prefault),
1095                                 zone.shared,
1096                                 zone.hugepages,
1097                                 zone.hugepage_size,
1098                                 zone.host_numa_node,
1099                                 None,
1100                                 config.thp,
1101                             )?;
1102 
1103                             guest_memory = guest_memory
1104                                 .insert_region(Arc::clone(&region))
1105                                 .map_err(Error::GuestMemory)?;
1106 
1107                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
1108                             let region_size = region.len();
1109                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
1110                                 region,
1111                                 virtio_device: None,
1112                                 hotplugged_size,
1113                                 hugepages: zone.hugepages,
1114                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
1115                             });
1116 
1117                             start_of_device_area = start_addr
1118                                 .checked_add(hotplug_size)
1119                                 .ok_or(Error::GuestAddressOverFlow)?;
1120                         }
1121                     }
1122                 } else {
1123                     return Err(Error::MissingZoneIdentifier);
1124                 }
1125             }
1126 
1127             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
1128             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
1129 
1130             (
1131                 start_of_device_area,
1132                 ram_size,
1133                 ram_size,
1134                 arch_mem_regions,
1135                 memory_zones,
1136                 guest_memory,
1137                 boot_guest_memory,
1138                 hotplug_slots,
1139                 0,
1140                 0,
1141                 0,
1142             )
1143         };
1144 
1145         let guest_memory = GuestMemoryAtomic::new(guest_memory);
1146 
1147         // Both MMIO and PIO address spaces start at address 0.
1148         let allocator = Arc::new(Mutex::new(
1149             SystemAllocator::new(
1150                 #[cfg(target_arch = "x86_64")]
1151                 {
1152                     GuestAddress(0)
1153                 },
1154                 #[cfg(target_arch = "x86_64")]
1155                 {
1156                     1 << 16
1157                 },
1158                 start_of_platform_device_area,
1159                 PLATFORM_DEVICE_AREA_SIZE,
1160                 layout::MEM_32BIT_DEVICES_START,
1161                 layout::MEM_32BIT_DEVICES_SIZE,
1162                 #[cfg(target_arch = "x86_64")]
1163                 vec![GsiApic::new(
1164                     X86_64_IRQ_BASE,
1165                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1166                 )],
1167             )
1168             .ok_or(Error::CreateSystemAllocator)?,
1169         ));
1170 
1171         #[cfg(not(feature = "tdx"))]
1172         let dynamic = true;
1173         #[cfg(feature = "tdx")]
1174         let dynamic = !tdx_enabled;
1175 
1176         let acpi_address = if dynamic
1177             && config.hotplug_method == HotplugMethod::Acpi
1178             && (config.hotplug_size.unwrap_or_default() > 0)
1179         {
1180             Some(
1181                 allocator
1182                     .lock()
1183                     .unwrap()
1184                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1185                     .ok_or(Error::AllocateMmioAddress)?,
1186             )
1187         } else {
1188             None
1189         };
1190 
1191         // If running on SGX the start of device area and RAM area may diverge but
1192         // at this point they are next to each other.
1193         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1194         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1195 
1196         let mut memory_manager = MemoryManager {
1197             boot_guest_memory,
1198             guest_memory,
1199             next_memory_slot,
1200             start_of_device_area,
1201             end_of_device_area,
1202             end_of_ram_area,
1203             vm,
1204             hotplug_slots,
1205             selected_slot,
1206             mergeable: config.mergeable,
1207             allocator,
1208             hotplug_method: config.hotplug_method,
1209             boot_ram,
1210             current_ram,
1211             next_hotplug_slot,
1212             shared: config.shared,
1213             hugepages: config.hugepages,
1214             hugepage_size: config.hugepage_size,
1215             prefault: config.prefault,
1216             #[cfg(target_arch = "x86_64")]
1217             sgx_epc_region: None,
1218             user_provided_zones,
1219             snapshot_memory_ranges: MemoryRangeTable::default(),
1220             memory_zones,
1221             guest_ram_mappings: Vec::new(),
1222             acpi_address,
1223             log_dirty: dynamic, // Cannot log dirty pages on a TD
1224             arch_mem_regions,
1225             ram_allocator,
1226             dynamic,
1227             #[cfg(target_arch = "aarch64")]
1228             uefi_flash: None,
1229             thp: config.thp,
1230         };
1231 
1232         #[cfg(target_arch = "aarch64")]
1233         {
1234             // For Aarch64 we cannot lazily allocate the address space like we
1235             // do for x86, because while restoring a VM from snapshot we would
1236             // need the address space to be allocated to properly restore VGIC.
1237             // And the restore of VGIC happens before we attempt to run the vCPUs
1238             // for the first time, thus we need to allocate the address space
1239             // beforehand.
1240             memory_manager.allocate_address_space()?;
1241             memory_manager.add_uefi_flash()?;
1242         }
1243 
1244         #[cfg(target_arch = "x86_64")]
1245         if let Some(sgx_epc_config) = sgx_epc_config {
1246             memory_manager.setup_sgx(sgx_epc_config)?;
1247         }
1248 
1249         Ok(Arc::new(Mutex::new(memory_manager)))
1250     }
1251 
1252     pub fn new_from_snapshot(
1253         snapshot: &Snapshot,
1254         vm: Arc<dyn hypervisor::Vm>,
1255         config: &MemoryConfig,
1256         source_url: Option<&str>,
1257         prefault: bool,
1258         phys_bits: u8,
1259     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1260         if let Some(source_url) = source_url {
1261             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1262             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1263 
1264             let mem_snapshot: MemoryManagerSnapshotData =
1265                 snapshot.to_versioned_state().map_err(Error::Restore)?;
1266 
1267             let mm = MemoryManager::new(
1268                 vm,
1269                 config,
1270                 Some(prefault),
1271                 phys_bits,
1272                 #[cfg(feature = "tdx")]
1273                 false,
1274                 Some(&mem_snapshot),
1275                 None,
1276                 #[cfg(target_arch = "x86_64")]
1277                 None,
1278             )?;
1279 
1280             mm.lock()
1281                 .unwrap()
1282                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1283 
1284             Ok(mm)
1285         } else {
1286             Err(Error::RestoreMissingSourceUrl)
1287         }
1288     }
1289 
1290     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1291         // SAFETY: FFI call with correct arguments
1292         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1293 
1294         if res < 0 {
1295             Err(io::Error::last_os_error())
1296         } else {
1297             Ok(res as RawFd)
1298         }
1299     }
1300 
1301     fn mbind(
1302         addr: *mut u8,
1303         len: u64,
1304         mode: u32,
1305         nodemask: Vec<u64>,
1306         maxnode: u64,
1307         flags: u32,
1308     ) -> Result<(), io::Error> {
1309         // SAFETY: FFI call with correct arguments
1310         let res = unsafe {
1311             libc::syscall(
1312                 libc::SYS_mbind,
1313                 addr as *mut libc::c_void,
1314                 len,
1315                 mode,
1316                 nodemask.as_ptr(),
1317                 maxnode,
1318                 flags,
1319             )
1320         };
1321 
1322         if res < 0 {
1323             Err(io::Error::last_os_error())
1324         } else {
1325             Ok(())
1326         }
1327     }
1328 
1329     fn create_anonymous_file(
1330         size: usize,
1331         hugepages: bool,
1332         hugepage_size: Option<u64>,
1333     ) -> Result<FileOffset, Error> {
1334         let fd = Self::memfd_create(
1335             &ffi::CString::new("ch_ram").unwrap(),
1336             libc::MFD_CLOEXEC
1337                 | if hugepages {
1338                     libc::MFD_HUGETLB
1339                         | if let Some(hugepage_size) = hugepage_size {
1340                             /*
1341                              * From the Linux kernel:
1342                              * Several system calls take a flag to request "hugetlb" huge pages.
1343                              * Without further specification, these system calls will use the
1344                              * system's default huge page size.  If a system supports multiple
1345                              * huge page sizes, the desired huge page size can be specified in
1346                              * bits [26:31] of the flag arguments.  The value in these 6 bits
1347                              * will encode the log2 of the huge page size.
1348                              */
1349 
1350                             hugepage_size.trailing_zeros() << 26
1351                         } else {
1352                             // Use the system default huge page size
1353                             0
1354                         }
1355                 } else {
1356                     0
1357                 },
1358         )
1359         .map_err(Error::SharedFileCreate)?;
1360 
1361         // SAFETY: fd is valid
1362         let f = unsafe { File::from_raw_fd(fd) };
1363         f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1364 
1365         Ok(FileOffset::new(f, 0))
1366     }
1367 
1368     fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> {
1369         if backing_file.is_dir() {
1370             Err(Error::DirectoryAsBackingFileForMemory)
1371         } else {
1372             let f = OpenOptions::new()
1373                 .read(true)
1374                 .write(true)
1375                 .open(backing_file)
1376                 .map_err(Error::SharedFileCreate)?;
1377 
1378             Ok(FileOffset::new(f, file_offset))
1379         }
1380     }
1381 
1382     #[allow(clippy::too_many_arguments)]
1383     pub fn create_ram_region(
1384         backing_file: &Option<PathBuf>,
1385         file_offset: u64,
1386         start_addr: GuestAddress,
1387         size: usize,
1388         prefault: bool,
1389         shared: bool,
1390         hugepages: bool,
1391         hugepage_size: Option<u64>,
1392         host_numa_node: Option<u32>,
1393         existing_memory_file: Option<File>,
1394         thp: bool,
1395     ) -> Result<Arc<GuestRegionMmap>, Error> {
1396         let mut mmap_flags = libc::MAP_NORESERVE;
1397 
1398         // The duplication of mmap_flags ORing here is unfortunate but it also makes
1399         // the complexity of the handling clear.
1400         let fo = if let Some(f) = existing_memory_file {
1401             // It must be MAP_SHARED as we wouldn't already have an FD
1402             mmap_flags |= libc::MAP_SHARED;
1403             Some(FileOffset::new(f, file_offset))
1404         } else if let Some(backing_file) = backing_file {
1405             if shared {
1406                 mmap_flags |= libc::MAP_SHARED;
1407             } else {
1408                 mmap_flags |= libc::MAP_PRIVATE;
1409             }
1410             Some(Self::open_backing_file(backing_file, file_offset)?)
1411         } else if shared || hugepages {
1412             // For hugepages we must also MAP_SHARED otherwise we will trigger #4805
1413             // because the MAP_PRIVATE will trigger CoW against the backing file with
1414             // the VFIO pinning
1415             mmap_flags |= libc::MAP_SHARED;
1416             Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
1417         } else {
1418             mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
1419             None
1420         };
1421 
1422         if prefault {
1423             mmap_flags |= libc::MAP_POPULATE;
1424         }
1425 
1426         let region = GuestRegionMmap::new(
1427             MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
1428                 .map_err(Error::GuestMemoryRegion)?,
1429             start_addr,
1430         )
1431         .map_err(Error::GuestMemory)?;
1432 
1433         if region.file_offset().is_none() && thp {
1434             info!(
1435                 "Anonymous mapping at 0x{:x} (size = 0x{:x})",
1436                 region.as_ptr() as u64,
1437                 size
1438             );
1439             // SAFETY: FFI call with corect arguments
1440             let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
1441             if ret != 0 {
1442                 let e = io::Error::last_os_error();
1443                 warn!("Failed to mark pages as THP eligible: {}", e);
1444             }
1445         }
1446 
1447         // Apply NUMA policy if needed.
1448         if let Some(node) = host_numa_node {
1449             let addr = region.deref().as_ptr();
1450             let len = region.deref().size() as u64;
1451             let mode = MPOL_BIND;
1452             let mut nodemask: Vec<u64> = Vec::new();
1453             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1454 
1455             // Linux is kind of buggy in the way it interprets maxnode as it
1456             // will cut off the last node. That's why we have to add 1 to what
1457             // we would consider as the proper maxnode value.
1458             let maxnode = node as u64 + 1 + 1;
1459 
1460             // Allocate the right size for the vector.
1461             nodemask.resize((node as usize / 64) + 1, 0);
1462 
1463             // Fill the global bitmask through the nodemask vector.
1464             let idx = (node / 64) as usize;
1465             let shift = node % 64;
1466             nodemask[idx] |= 1u64 << shift;
1467 
1468             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1469             // force the kernel to move all pages that might have been already
1470             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1471             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1472             // MPOL_BIND is the selected mode as it specifies a strict policy
1473             // that restricts memory allocation to the nodes specified in the
1474             // nodemask.
1475             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1476                 .map_err(Error::ApplyNumaPolicy)?;
1477         }
1478 
1479         Ok(Arc::new(region))
1480     }
1481 
1482     // Update the GuestMemoryMmap with the new range
1483     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1484         let guest_memory = self
1485             .guest_memory
1486             .memory()
1487             .insert_region(region)
1488             .map_err(Error::GuestMemory)?;
1489         self.guest_memory.lock().unwrap().replace(guest_memory);
1490 
1491         Ok(())
1492     }
1493 
1494     //
1495     // Calculate the start address of an area next to RAM.
1496     //
1497     // If memory hotplug is allowed, the start address needs to be aligned
1498     // (rounded-up) to 128MiB boundary.
1499     // If memory hotplug is not allowed, there is no alignment required.
1500     // And it must also start at the 64bit start.
1501     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
1502         let mut start_addr = if allow_mem_hotplug {
1503             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1504         } else {
1505             mem_end
1506         };
1507 
1508         start_addr = start_addr
1509             .checked_add(1)
1510             .ok_or(Error::GuestAddressOverFlow)?;
1511 
1512         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1513             return Ok(arch::layout::RAM_64BIT_START);
1514         }
1515 
1516         Ok(start_addr)
1517     }
1518 
1519     pub fn add_ram_region(
1520         &mut self,
1521         start_addr: GuestAddress,
1522         size: usize,
1523     ) -> Result<Arc<GuestRegionMmap>, Error> {
1524         // Allocate memory for the region
1525         let region = MemoryManager::create_ram_region(
1526             &None,
1527             0,
1528             start_addr,
1529             size,
1530             self.prefault,
1531             self.shared,
1532             self.hugepages,
1533             self.hugepage_size,
1534             None,
1535             None,
1536             self.thp,
1537         )?;
1538 
1539         // Map it into the guest
1540         let slot = self.create_userspace_mapping(
1541             region.start_addr().0,
1542             region.len(),
1543             region.as_ptr() as u64,
1544             self.mergeable,
1545             false,
1546             self.log_dirty,
1547         )?;
1548         self.guest_ram_mappings.push(GuestRamMapping {
1549             gpa: region.start_addr().raw_value(),
1550             size: region.len(),
1551             slot,
1552             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1553             virtio_mem: false,
1554             file_offset: 0,
1555         });
1556 
1557         self.add_region(Arc::clone(&region))?;
1558 
1559         Ok(region)
1560     }
1561 
1562     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1563         info!("Hotplugging new RAM: {}", size);
1564 
1565         // Check that there is a free slot
1566         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1567             return Err(Error::NoSlotAvailable);
1568         }
1569 
1570         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1571         if size % (128 << 20) != 0 {
1572             return Err(Error::InvalidSize);
1573         }
1574 
1575         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1576 
1577         if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area {
1578             return Err(Error::InsufficientHotplugRam);
1579         }
1580 
1581         let region = self.add_ram_region(start_addr, size)?;
1582 
1583         // Add region to the list of regions associated with the default
1584         // memory zone.
1585         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1586             memory_zone.regions.push(Arc::clone(&region));
1587         }
1588 
1589         // Tell the allocator
1590         self.ram_allocator
1591             .allocate(Some(start_addr), size as GuestUsize, None)
1592             .ok_or(Error::MemoryRangeAllocation)?;
1593 
1594         // Update the slot so that it can be queried via the I/O port
1595         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1596         slot.active = true;
1597         slot.inserting = true;
1598         slot.base = region.start_addr().0;
1599         slot.length = region.len();
1600 
1601         self.next_hotplug_slot += 1;
1602 
1603         Ok(region)
1604     }
1605 
1606     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1607         self.guest_memory.clone()
1608     }
1609 
1610     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1611         self.boot_guest_memory.clone()
1612     }
1613 
1614     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1615         self.allocator.clone()
1616     }
1617 
1618     pub fn start_of_device_area(&self) -> GuestAddress {
1619         self.start_of_device_area
1620     }
1621 
1622     pub fn end_of_device_area(&self) -> GuestAddress {
1623         self.end_of_device_area
1624     }
1625 
1626     pub fn allocate_memory_slot(&mut self) -> u32 {
1627         let slot_id = self.next_memory_slot;
1628         self.next_memory_slot += 1;
1629         slot_id
1630     }
1631 
1632     pub fn create_userspace_mapping(
1633         &mut self,
1634         guest_phys_addr: u64,
1635         memory_size: u64,
1636         userspace_addr: u64,
1637         mergeable: bool,
1638         readonly: bool,
1639         log_dirty: bool,
1640     ) -> Result<u32, Error> {
1641         let slot = self.allocate_memory_slot();
1642         let mem_region = self.vm.make_user_memory_region(
1643             slot,
1644             guest_phys_addr,
1645             memory_size,
1646             userspace_addr,
1647             readonly,
1648             log_dirty,
1649         );
1650 
1651         info!(
1652             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1653             guest_phys_addr, userspace_addr, memory_size, slot
1654         );
1655 
1656         self.vm
1657             .create_user_memory_region(mem_region)
1658             .map_err(Error::CreateUserMemoryRegion)?;
1659 
1660         // SAFETY: the address and size are valid since the
1661         // mmap succeeded.
1662         let ret = unsafe {
1663             libc::madvise(
1664                 userspace_addr as *mut libc::c_void,
1665                 memory_size as libc::size_t,
1666                 libc::MADV_DONTDUMP,
1667             )
1668         };
1669         if ret != 0 {
1670             let e = io::Error::last_os_error();
1671             warn!("Failed to mark mappin as MADV_DONTDUMP: {}", e);
1672         }
1673 
1674         // Mark the pages as mergeable if explicitly asked for.
1675         if mergeable {
1676             // SAFETY: the address and size are valid since the
1677             // mmap succeeded.
1678             let ret = unsafe {
1679                 libc::madvise(
1680                     userspace_addr as *mut libc::c_void,
1681                     memory_size as libc::size_t,
1682                     libc::MADV_MERGEABLE,
1683                 )
1684             };
1685             if ret != 0 {
1686                 let err = io::Error::last_os_error();
1687                 // Safe to unwrap because the error is constructed with
1688                 // last_os_error(), which ensures the output will be Some().
1689                 let errno = err.raw_os_error().unwrap();
1690                 if errno == libc::EINVAL {
1691                     warn!("kernel not configured with CONFIG_KSM");
1692                 } else {
1693                     warn!("madvise error: {}", err);
1694                 }
1695                 warn!("failed to mark pages as mergeable");
1696             }
1697         }
1698 
1699         info!(
1700             "Created userspace mapping: {:x} -> {:x} {:x}",
1701             guest_phys_addr, userspace_addr, memory_size
1702         );
1703 
1704         Ok(slot)
1705     }
1706 
1707     pub fn remove_userspace_mapping(
1708         &mut self,
1709         guest_phys_addr: u64,
1710         memory_size: u64,
1711         userspace_addr: u64,
1712         mergeable: bool,
1713         slot: u32,
1714     ) -> Result<(), Error> {
1715         let mem_region = self.vm.make_user_memory_region(
1716             slot,
1717             guest_phys_addr,
1718             memory_size,
1719             userspace_addr,
1720             false, /* readonly -- don't care */
1721             false, /* log dirty */
1722         );
1723 
1724         self.vm
1725             .remove_user_memory_region(mem_region)
1726             .map_err(Error::RemoveUserMemoryRegion)?;
1727 
1728         // Mark the pages as unmergeable if there were previously marked as
1729         // mergeable.
1730         if mergeable {
1731             // SAFETY: the address and size are valid as the region was
1732             // previously advised.
1733             let ret = unsafe {
1734                 libc::madvise(
1735                     userspace_addr as *mut libc::c_void,
1736                     memory_size as libc::size_t,
1737                     libc::MADV_UNMERGEABLE,
1738                 )
1739             };
1740             if ret != 0 {
1741                 let err = io::Error::last_os_error();
1742                 // Safe to unwrap because the error is constructed with
1743                 // last_os_error(), which ensures the output will be Some().
1744                 let errno = err.raw_os_error().unwrap();
1745                 if errno == libc::EINVAL {
1746                     warn!("kernel not configured with CONFIG_KSM");
1747                 } else {
1748                     warn!("madvise error: {}", err);
1749                 }
1750                 warn!("failed to mark pages as unmergeable");
1751             }
1752         }
1753 
1754         info!(
1755             "Removed userspace mapping: {:x} -> {:x} {:x}",
1756             guest_phys_addr, userspace_addr, memory_size
1757         );
1758 
1759         Ok(())
1760     }
1761 
1762     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1763         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1764             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1765                 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1766                     virtio_mem_device
1767                         .lock()
1768                         .unwrap()
1769                         .resize(size)
1770                         .map_err(Error::VirtioMemResizeFail)?;
1771                 }
1772 
1773                 // Keep the hotplugged_size up to date.
1774                 virtio_mem_zone.hotplugged_size = size;
1775             } else {
1776                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1777                 return Err(Error::MissingVirtioMemHandler);
1778             }
1779 
1780             return Ok(());
1781         }
1782 
1783         error!("Failed resizing virtio-mem region: Unknown memory zone");
1784         Err(Error::UnknownMemoryZone)
1785     }
1786 
1787     /// In case this function resulted in adding a new memory region to the
1788     /// guest memory, the new region is returned to the caller. The virtio-mem
1789     /// use case never adds a new region as the whole hotpluggable memory has
1790     /// already been allocated at boot time.
1791     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1792         if self.user_provided_zones {
1793             error!(
1794                 "Not allowed to resize guest memory when backed with user \
1795                 defined memory zones."
1796             );
1797             return Err(Error::InvalidResizeWithMemoryZones);
1798         }
1799 
1800         let mut region: Option<Arc<GuestRegionMmap>> = None;
1801         match self.hotplug_method {
1802             HotplugMethod::VirtioMem => {
1803                 if desired_ram >= self.boot_ram {
1804                     if !self.dynamic {
1805                         return Ok(region);
1806                     }
1807 
1808                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1809                     self.current_ram = desired_ram;
1810                 }
1811             }
1812             HotplugMethod::Acpi => {
1813                 if desired_ram > self.current_ram {
1814                     if !self.dynamic {
1815                         return Ok(region);
1816                     }
1817 
1818                     region =
1819                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1820                     self.current_ram = desired_ram;
1821                 }
1822             }
1823         }
1824         Ok(region)
1825     }
1826 
1827     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1828         if !self.user_provided_zones {
1829             error!(
1830                 "Not allowed to resize guest memory zone when no zone is \
1831                 defined."
1832             );
1833             return Err(Error::ResizeZone);
1834         }
1835 
1836         self.virtio_mem_resize(id, virtio_mem_size)
1837     }
1838 
1839     #[cfg(target_arch = "x86_64")]
1840     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1841         let file = OpenOptions::new()
1842             .read(true)
1843             .open("/dev/sgx_provision")
1844             .map_err(Error::SgxProvisionOpen)?;
1845         self.vm
1846             .enable_sgx_attribute(file)
1847             .map_err(Error::SgxEnableProvisioning)?;
1848 
1849         // Go over each EPC section and verify its size is a 4k multiple. At
1850         // the same time, calculate the total size needed for the contiguous
1851         // EPC region.
1852         let mut epc_region_size = 0;
1853         for epc_section in sgx_epc_config.iter() {
1854             if epc_section.size == 0 {
1855                 return Err(Error::EpcSectionSizeInvalid);
1856             }
1857             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1858                 return Err(Error::EpcSectionSizeInvalid);
1859             }
1860 
1861             epc_region_size += epc_section.size;
1862         }
1863 
1864         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1865         let epc_region_start = GuestAddress(
1866             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1867         );
1868 
1869         self.start_of_device_area = epc_region_start
1870             .checked_add(epc_region_size)
1871             .ok_or(Error::GuestAddressOverFlow)?;
1872 
1873         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1874         info!(
1875             "SGX EPC region: 0x{:x} (0x{:x})",
1876             epc_region_start.0, epc_region_size
1877         );
1878 
1879         // Each section can be memory mapped into the allocated region.
1880         let mut epc_section_start = epc_region_start.raw_value();
1881         for epc_section in sgx_epc_config.iter() {
1882             let file = OpenOptions::new()
1883                 .read(true)
1884                 .write(true)
1885                 .open("/dev/sgx_vepc")
1886                 .map_err(Error::SgxVirtEpcOpen)?;
1887 
1888             let prot = PROT_READ | PROT_WRITE;
1889             let mut flags = MAP_NORESERVE | MAP_SHARED;
1890             if epc_section.prefault {
1891                 flags |= MAP_POPULATE;
1892             }
1893 
1894             // We can't use the vm-memory crate to perform the memory mapping
1895             // here as it would try to ensure the size of the backing file is
1896             // matching the size of the expected mapping. The /dev/sgx_vepc
1897             // device does not work that way, it provides a file descriptor
1898             // which is not matching the mapping size, as it's a just a way to
1899             // let KVM know that an EPC section is being created for the guest.
1900             // SAFETY: FFI call with correct arguments
1901             let host_addr = unsafe {
1902                 libc::mmap(
1903                     std::ptr::null_mut(),
1904                     epc_section.size as usize,
1905                     prot,
1906                     flags,
1907                     file.as_raw_fd(),
1908                     0,
1909                 )
1910             } as u64;
1911 
1912             info!(
1913                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
1914                 epc_section_start, epc_section.size
1915             );
1916 
1917             let _mem_slot = self.create_userspace_mapping(
1918                 epc_section_start,
1919                 epc_section.size,
1920                 host_addr,
1921                 false,
1922                 false,
1923                 false,
1924             )?;
1925 
1926             sgx_epc_region.insert(
1927                 epc_section.id.clone(),
1928                 SgxEpcSection::new(
1929                     GuestAddress(epc_section_start),
1930                     epc_section.size as GuestUsize,
1931                 ),
1932             );
1933 
1934             epc_section_start += epc_section.size;
1935         }
1936 
1937         self.sgx_epc_region = Some(sgx_epc_region);
1938 
1939         Ok(())
1940     }
1941 
1942     #[cfg(target_arch = "x86_64")]
1943     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
1944         &self.sgx_epc_region
1945     }
1946 
1947     pub fn is_hardlink(f: &File) -> bool {
1948         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
1949         // SAFETY: FFI call with correct arguments
1950         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
1951         if ret != 0 {
1952             error!("Couldn't fstat the backing file");
1953             return false;
1954         }
1955 
1956         // SAFETY: stat is valid
1957         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
1958     }
1959 
1960     pub fn memory_zones(&self) -> &MemoryZones {
1961         &self.memory_zones
1962     }
1963 
1964     pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
1965         &mut self.memory_zones
1966     }
1967 
1968     pub fn memory_range_table(
1969         &self,
1970         snapshot: bool,
1971     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
1972         let mut table = MemoryRangeTable::default();
1973 
1974         for memory_zone in self.memory_zones.values() {
1975             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
1976                 table.extend(virtio_mem_zone.plugged_ranges());
1977             }
1978 
1979             for region in memory_zone.regions() {
1980                 if snapshot {
1981                     if let Some(file_offset) = region.file_offset() {
1982                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
1983                             && Self::is_hardlink(file_offset.file())
1984                         {
1985                             // In this very specific case, we know the memory
1986                             // region is backed by a file on the host filesystem
1987                             // that can be accessed by the user, and additionally
1988                             // the mapping is shared, which means that modifications
1989                             // to the content are written to the actual file.
1990                             // When meeting these conditions, we can skip the
1991                             // copy of the memory content for this specific region,
1992                             // as we can assume the user will have it saved through
1993                             // the backing file already.
1994                             continue;
1995                         }
1996                     }
1997                 }
1998 
1999                 table.push(MemoryRange {
2000                     gpa: region.start_addr().raw_value(),
2001                     length: region.len(),
2002                 });
2003             }
2004         }
2005 
2006         Ok(table)
2007     }
2008 
2009     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
2010         MemoryManagerSnapshotData {
2011             memory_ranges: self.snapshot_memory_ranges.clone(),
2012             guest_ram_mappings: self.guest_ram_mappings.clone(),
2013             start_of_device_area: self.start_of_device_area.0,
2014             boot_ram: self.boot_ram,
2015             current_ram: self.current_ram,
2016             arch_mem_regions: self.arch_mem_regions.clone(),
2017             hotplug_slots: self.hotplug_slots.clone(),
2018             next_memory_slot: self.next_memory_slot,
2019             selected_slot: self.selected_slot,
2020             next_hotplug_slot: self.next_hotplug_slot,
2021         }
2022     }
2023 
2024     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
2025         let mut memory_slot_fds = HashMap::new();
2026         for guest_ram_mapping in &self.guest_ram_mappings {
2027             let slot = guest_ram_mapping.slot;
2028             let guest_memory = self.guest_memory.memory();
2029             let file = guest_memory
2030                 .find_region(GuestAddress(guest_ram_mapping.gpa))
2031                 .unwrap()
2032                 .file_offset()
2033                 .unwrap()
2034                 .file();
2035             memory_slot_fds.insert(slot, file.as_raw_fd());
2036         }
2037         memory_slot_fds
2038     }
2039 
2040     pub fn acpi_address(&self) -> Option<GuestAddress> {
2041         self.acpi_address
2042     }
2043 
2044     pub fn num_guest_ram_mappings(&self) -> u32 {
2045         self.guest_ram_mappings.len() as u32
2046     }
2047 
2048     #[cfg(target_arch = "aarch64")]
2049     pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
2050         self.uefi_flash.as_ref().unwrap().clone()
2051     }
2052 
2053     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2054     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
2055         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
2056         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
2057 
2058         let mut mem_offset_in_elf = mem_offset;
2059         let mut ram_maps = BTreeMap::new();
2060         for mapping in mapping_sorted_by_gpa.iter() {
2061             ram_maps.insert(
2062                 mapping.gpa,
2063                 CoredumpMemoryRegion {
2064                     mem_offset_in_elf,
2065                     mem_size: mapping.size,
2066                 },
2067             );
2068             mem_offset_in_elf += mapping.size;
2069         }
2070 
2071         CoredumpMemoryRegions { ram_maps }
2072     }
2073 
2074     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2075     pub fn coredump_iterate_save_mem(
2076         &mut self,
2077         dump_state: &DumpState,
2078     ) -> std::result::Result<(), GuestDebuggableError> {
2079         let snapshot_memory_ranges = self
2080             .memory_range_table(false)
2081             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2082 
2083         if snapshot_memory_ranges.is_empty() {
2084             return Ok(());
2085         }
2086 
2087         let mut coredump_file = dump_state.file.as_ref().unwrap();
2088 
2089         let guest_memory = self.guest_memory.memory();
2090         let mut total_bytes: u64 = 0;
2091 
2092         for range in snapshot_memory_ranges.regions() {
2093             let mut offset: u64 = 0;
2094             loop {
2095                 let bytes_written = guest_memory
2096                     .write_to(
2097                         GuestAddress(range.gpa + offset),
2098                         &mut coredump_file,
2099                         (range.length - offset) as usize,
2100                     )
2101                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2102                 offset += bytes_written as u64;
2103                 total_bytes += bytes_written as u64;
2104 
2105                 if offset == range.length {
2106                     break;
2107                 }
2108             }
2109         }
2110 
2111         debug!("coredump total bytes {}", total_bytes);
2112         Ok(())
2113     }
2114 
2115     pub fn receive_memory_regions<F>(
2116         &mut self,
2117         ranges: &MemoryRangeTable,
2118         fd: &mut F,
2119     ) -> std::result::Result<(), MigratableError>
2120     where
2121         F: Read,
2122     {
2123         let guest_memory = self.guest_memory();
2124         let mem = guest_memory.memory();
2125 
2126         for range in ranges.regions() {
2127             let mut offset: u64 = 0;
2128             // Here we are manually handling the retry in case we can't the
2129             // whole region at once because we can't use the implementation
2130             // from vm-memory::GuestMemory of read_exact_from() as it is not
2131             // following the correct behavior. For more info about this issue
2132             // see: https://github.com/rust-vmm/vm-memory/issues/174
2133             loop {
2134                 let bytes_read = mem
2135                     .read_from(
2136                         GuestAddress(range.gpa + offset),
2137                         fd,
2138                         (range.length - offset) as usize,
2139                     )
2140                     .map_err(|e| {
2141                         MigratableError::MigrateReceive(anyhow!(
2142                             "Error receiving memory from socket: {}",
2143                             e
2144                         ))
2145                     })?;
2146                 offset += bytes_read as u64;
2147 
2148                 if offset == range.length {
2149                     break;
2150                 }
2151             }
2152         }
2153 
2154         Ok(())
2155     }
2156 }
2157 
2158 struct MemoryNotify {
2159     slot_id: usize,
2160 }
2161 
2162 impl Aml for MemoryNotify {
2163     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2164         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2165         aml::If::new(
2166             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2167             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2168         )
2169         .to_aml_bytes(sink)
2170     }
2171 }
2172 
2173 struct MemorySlot {
2174     slot_id: usize,
2175 }
2176 
2177 impl Aml for MemorySlot {
2178     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2179         aml::Device::new(
2180             format!("M{:03}", self.slot_id).as_str().into(),
2181             vec![
2182                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
2183                 &aml::Name::new("_UID".into(), &self.slot_id),
2184                 /*
2185                 _STA return value:
2186                 Bit [0] – Set if the device is present.
2187                 Bit [1] – Set if the device is enabled and decoding its resources.
2188                 Bit [2] – Set if the device should be shown in the UI.
2189                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2190                 Bit [4] – Set if the battery is present.
2191                 Bits [31:5] – Reserved (must be cleared).
2192                 */
2193                 &aml::Method::new(
2194                     "_STA".into(),
2195                     0,
2196                     false,
2197                     // Call into MSTA method which will interrogate device
2198                     vec![&aml::Return::new(&aml::MethodCall::new(
2199                         "MSTA".into(),
2200                         vec![&self.slot_id],
2201                     ))],
2202                 ),
2203                 // Get details of memory
2204                 &aml::Method::new(
2205                     "_CRS".into(),
2206                     0,
2207                     false,
2208                     // Call into MCRS which provides actual memory details
2209                     vec![&aml::Return::new(&aml::MethodCall::new(
2210                         "MCRS".into(),
2211                         vec![&self.slot_id],
2212                     ))],
2213                 ),
2214             ],
2215         )
2216         .to_aml_bytes(sink)
2217     }
2218 }
2219 
2220 struct MemorySlots {
2221     slots: usize,
2222 }
2223 
2224 impl Aml for MemorySlots {
2225     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2226         for slot_id in 0..self.slots {
2227             MemorySlot { slot_id }.to_aml_bytes(sink);
2228         }
2229     }
2230 }
2231 
2232 struct MemoryMethods {
2233     slots: usize,
2234 }
2235 
2236 impl Aml for MemoryMethods {
2237     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2238         // Add "MTFY" notification method
2239         let mut memory_notifies = Vec::new();
2240         for slot_id in 0..self.slots {
2241             memory_notifies.push(MemoryNotify { slot_id });
2242         }
2243 
2244         let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
2245         for memory_notifier in memory_notifies.iter() {
2246             memory_notifies_refs.push(memory_notifier);
2247         }
2248 
2249         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);
2250 
2251         // MSCN method
2252         aml::Method::new(
2253             "MSCN".into(),
2254             0,
2255             true,
2256             vec![
2257                 // Take lock defined above
2258                 &aml::Acquire::new("MLCK".into(), 0xffff),
2259                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2260                 &aml::While::new(
2261                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2262                     vec![
2263                         // Write slot number (in first argument) to I/O port via field
2264                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2265                         // Check if MINS bit is set (inserting)
2266                         &aml::If::new(
2267                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2268                             // Notify device if it is
2269                             vec![
2270                                 &aml::MethodCall::new(
2271                                     "MTFY".into(),
2272                                     vec![&aml::Local(0), &aml::ONE],
2273                                 ),
2274                                 // Reset MINS bit
2275                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2276                             ],
2277                         ),
2278                         // Check if MRMV bit is set
2279                         &aml::If::new(
2280                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2281                             // Notify device if it is (with the eject constant 0x3)
2282                             vec![
2283                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2284                                 // Reset MRMV bit
2285                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2286                             ],
2287                         ),
2288                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2289                     ],
2290                 ),
2291                 // Release lock
2292                 &aml::Release::new("MLCK".into()),
2293             ],
2294         )
2295         .to_aml_bytes(sink);
2296 
2297         // Memory status method
2298         aml::Method::new(
2299             "MSTA".into(),
2300             1,
2301             true,
2302             vec![
2303                 // Take lock defined above
2304                 &aml::Acquire::new("MLCK".into(), 0xffff),
2305                 // Write slot number (in first argument) to I/O port via field
2306                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2307                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2308                 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2309                 &aml::If::new(
2310                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2311                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2312                 ),
2313                 // Release lock
2314                 &aml::Release::new("MLCK".into()),
2315                 // Return 0 or 0xf
2316                 &aml::Return::new(&aml::Local(0)),
2317             ],
2318         )
2319         .to_aml_bytes(sink);
2320 
2321         // Memory range method
2322         aml::Method::new(
2323             "MCRS".into(),
2324             1,
2325             true,
2326             vec![
2327                 // Take lock defined above
2328                 &aml::Acquire::new("MLCK".into(), 0xffff),
2329                 // Write slot number (in first argument) to I/O port via field
2330                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2331                 &aml::Name::new(
2332                     "MR64".into(),
2333                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2334                         aml::AddressSpaceCacheable::Cacheable,
2335                         true,
2336                         0x0000_0000_0000_0000u64,
2337                         0xFFFF_FFFF_FFFF_FFFEu64,
2338                         None,
2339                     )]),
2340                 ),
2341                 &aml::CreateQWordField::new(
2342                     &aml::Path::new("MINL"),
2343                     &aml::Path::new("MR64"),
2344                     &14usize,
2345                 ),
2346                 &aml::CreateDWordField::new(
2347                     &aml::Path::new("MINH"),
2348                     &aml::Path::new("MR64"),
2349                     &18usize,
2350                 ),
2351                 &aml::CreateQWordField::new(
2352                     &aml::Path::new("MAXL"),
2353                     &aml::Path::new("MR64"),
2354                     &22usize,
2355                 ),
2356                 &aml::CreateDWordField::new(
2357                     &aml::Path::new("MAXH"),
2358                     &aml::Path::new("MR64"),
2359                     &26usize,
2360                 ),
2361                 &aml::CreateQWordField::new(
2362                     &aml::Path::new("LENL"),
2363                     &aml::Path::new("MR64"),
2364                     &38usize,
2365                 ),
2366                 &aml::CreateDWordField::new(
2367                     &aml::Path::new("LENH"),
2368                     &aml::Path::new("MR64"),
2369                     &42usize,
2370                 ),
2371                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2372                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2373                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2374                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
2375                 &aml::Add::new(
2376                     &aml::Path::new("MAXL"),
2377                     &aml::Path::new("MINL"),
2378                     &aml::Path::new("LENL"),
2379                 ),
2380                 &aml::Add::new(
2381                     &aml::Path::new("MAXH"),
2382                     &aml::Path::new("MINH"),
2383                     &aml::Path::new("LENH"),
2384                 ),
2385                 &aml::If::new(
2386                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2387                     vec![&aml::Add::new(
2388                         &aml::Path::new("MAXH"),
2389                         &aml::ONE,
2390                         &aml::Path::new("MAXH"),
2391                     )],
2392                 ),
2393                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2394                 // Release lock
2395                 &aml::Release::new("MLCK".into()),
2396                 &aml::Return::new(&aml::Path::new("MR64")),
2397             ],
2398         )
2399         .to_aml_bytes(sink)
2400     }
2401 }
2402 
2403 impl Aml for MemoryManager {
2404     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2405         if let Some(acpi_address) = self.acpi_address {
2406             // Memory Hotplug Controller
2407             aml::Device::new(
2408                 "_SB_.MHPC".into(),
2409                 vec![
2410                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2411                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2412                     // Mutex to protect concurrent access as we write to choose slot and then read back status
2413                     &aml::Mutex::new("MLCK".into(), 0),
2414                     &aml::Name::new(
2415                         "_CRS".into(),
2416                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2417                             aml::AddressSpaceCacheable::NotCacheable,
2418                             true,
2419                             acpi_address.0,
2420                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2421                             None,
2422                         )]),
2423                     ),
2424                     // OpRegion and Fields map MMIO range into individual field values
2425                     &aml::OpRegion::new(
2426                         "MHPR".into(),
2427                         aml::OpRegionSpace::SystemMemory,
2428                         &(acpi_address.0 as usize),
2429                         &MEMORY_MANAGER_ACPI_SIZE,
2430                     ),
2431                     &aml::Field::new(
2432                         "MHPR".into(),
2433                         aml::FieldAccessType::DWord,
2434                         aml::FieldLockRule::NoLock,
2435                         aml::FieldUpdateRule::Preserve,
2436                         vec![
2437                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2438                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2439                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2440                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2441                         ],
2442                     ),
2443                     &aml::Field::new(
2444                         "MHPR".into(),
2445                         aml::FieldAccessType::DWord,
2446                         aml::FieldLockRule::NoLock,
2447                         aml::FieldUpdateRule::Preserve,
2448                         vec![
2449                             aml::FieldEntry::Reserved(128),
2450                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2451                         ],
2452                     ),
2453                     &aml::Field::new(
2454                         "MHPR".into(),
2455                         aml::FieldAccessType::Byte,
2456                         aml::FieldLockRule::NoLock,
2457                         aml::FieldUpdateRule::WriteAsZeroes,
2458                         vec![
2459                             aml::FieldEntry::Reserved(160),
2460                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2461                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2462                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2463                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2464                         ],
2465                     ),
2466                     &aml::Field::new(
2467                         "MHPR".into(),
2468                         aml::FieldAccessType::DWord,
2469                         aml::FieldLockRule::NoLock,
2470                         aml::FieldUpdateRule::Preserve,
2471                         vec![
2472                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2473                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2474                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2475                         ],
2476                     ),
2477                     &MemoryMethods {
2478                         slots: self.hotplug_slots.len(),
2479                     },
2480                     &MemorySlots {
2481                         slots: self.hotplug_slots.len(),
2482                     },
2483                 ],
2484             )
2485             .to_aml_bytes(sink);
2486         } else {
2487             aml::Device::new(
2488                 "_SB_.MHPC".into(),
2489                 vec![
2490                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2491                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2492                     // Empty MSCN for GED
2493                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2494                 ],
2495             )
2496             .to_aml_bytes(sink);
2497         }
2498 
2499         #[cfg(target_arch = "x86_64")]
2500         {
2501             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2502                 let min = sgx_epc_region.start().raw_value();
2503                 let max = min + sgx_epc_region.size() - 1;
2504                 // SGX EPC region
2505                 aml::Device::new(
2506                     "_SB_.EPC_".into(),
2507                     vec![
2508                         &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
2509                         // QWORD describing the EPC region start and size
2510                         &aml::Name::new(
2511                             "_CRS".into(),
2512                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2513                                 aml::AddressSpaceCacheable::NotCacheable,
2514                                 true,
2515                                 min,
2516                                 max,
2517                                 None,
2518                             )]),
2519                         ),
2520                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2521                     ],
2522                 )
2523                 .to_aml_bytes(sink);
2524             }
2525         }
2526     }
2527 }
2528 
2529 impl Pausable for MemoryManager {}
2530 
2531 #[derive(Clone, Serialize, Deserialize, Versionize)]
2532 pub struct MemoryManagerSnapshotData {
2533     memory_ranges: MemoryRangeTable,
2534     guest_ram_mappings: Vec<GuestRamMapping>,
2535     start_of_device_area: u64,
2536     boot_ram: u64,
2537     current_ram: u64,
2538     arch_mem_regions: Vec<ArchMemRegion>,
2539     hotplug_slots: Vec<HotPlugState>,
2540     next_memory_slot: u32,
2541     selected_slot: usize,
2542     next_hotplug_slot: usize,
2543 }
2544 
2545 impl VersionMapped for MemoryManagerSnapshotData {}
2546 
2547 impl Snapshottable for MemoryManager {
2548     fn id(&self) -> String {
2549         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2550     }
2551 
2552     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2553         let memory_ranges = self.memory_range_table(true)?;
2554 
2555         // Store locally this list of ranges as it will be used through the
2556         // Transportable::send() implementation. The point is to avoid the
2557         // duplication of code regarding the creation of the path for each
2558         // region. The 'snapshot' step creates the list of memory regions,
2559         // including information about the need to copy a memory region or
2560         // not. This saves the 'send' step having to go through the same
2561         // process, and instead it can directly proceed with storing the
2562         // memory range content for the ranges requiring it.
2563         self.snapshot_memory_ranges = memory_ranges;
2564 
2565         Ok(Snapshot::from_data(SnapshotData::new_from_versioned_state(
2566             &self.snapshot_data(),
2567         )?))
2568     }
2569 }
2570 
2571 impl Transportable for MemoryManager {
2572     fn send(
2573         &self,
2574         _snapshot: &Snapshot,
2575         destination_url: &str,
2576     ) -> result::Result<(), MigratableError> {
2577         if self.snapshot_memory_ranges.is_empty() {
2578             return Ok(());
2579         }
2580 
2581         let mut memory_file_path = url_to_path(destination_url)?;
2582         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2583 
2584         // Create the snapshot file for the entire memory
2585         let mut memory_file = OpenOptions::new()
2586             .read(true)
2587             .write(true)
2588             .create_new(true)
2589             .open(memory_file_path)
2590             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2591 
2592         let guest_memory = self.guest_memory.memory();
2593 
2594         for range in self.snapshot_memory_ranges.regions() {
2595             let mut offset: u64 = 0;
2596             // Here we are manually handling the retry in case we can't read
2597             // the whole region at once because we can't use the implementation
2598             // from vm-memory::GuestMemory of write_all_to() as it is not
2599             // following the correct behavior. For more info about this issue
2600             // see: https://github.com/rust-vmm/vm-memory/issues/174
2601             loop {
2602                 let bytes_written = guest_memory
2603                     .write_to(
2604                         GuestAddress(range.gpa + offset),
2605                         &mut memory_file,
2606                         (range.length - offset) as usize,
2607                     )
2608                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2609                 offset += bytes_written as u64;
2610 
2611                 if offset == range.length {
2612                     break;
2613                 }
2614             }
2615         }
2616         Ok(())
2617     }
2618 }
2619 
2620 impl Migratable for MemoryManager {
2621     // Start the dirty log in the hypervisor (kvm/mshv).
2622     // Also, reset the dirty bitmap logged by the vmm.
2623     // Just before we do a bulk copy we want to start/clear the dirty log so that
2624     // pages touched during our bulk copy are tracked.
2625     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2626         self.vm.start_dirty_log().map_err(|e| {
2627             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2628         })?;
2629 
2630         for r in self.guest_memory.memory().iter() {
2631             r.bitmap().reset();
2632         }
2633 
2634         Ok(())
2635     }
2636 
2637     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2638         self.vm.stop_dirty_log().map_err(|e| {
2639             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2640         })?;
2641 
2642         Ok(())
2643     }
2644 
2645     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2646     // together in the table if they are contiguous.
2647     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2648         let mut table = MemoryRangeTable::default();
2649         for r in &self.guest_ram_mappings {
2650             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2651                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2652             })?;
2653             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2654             {
2655                 Some(region) => {
2656                     assert!(region.start_addr().raw_value() == r.gpa);
2657                     assert!(region.len() == r.size);
2658                     region.bitmap().get_and_reset()
2659                 }
2660                 None => {
2661                     return Err(MigratableError::MigrateSend(anyhow!(
2662                         "Error finding 'guest memory region' with address {:x}",
2663                         r.gpa
2664                     )))
2665                 }
2666             };
2667 
2668             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2669                 .iter()
2670                 .zip(vmm_dirty_bitmap.iter())
2671                 .map(|(x, y)| x | y)
2672                 .collect();
2673 
2674             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2675 
2676             if sub_table.regions().is_empty() {
2677                 info!("Dirty Memory Range Table is empty");
2678             } else {
2679                 info!("Dirty Memory Range Table:");
2680                 for range in sub_table.regions() {
2681                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2682                 }
2683             }
2684 
2685             table.extend(sub_table);
2686         }
2687         Ok(table)
2688     }
2689 }
2690