xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision 6f8bd27cf7629733582d930519e98d19e90afb16)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 #[cfg(target_arch = "x86_64")]
6 use crate::config::SgxEpcConfig;
7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
8 #[cfg(feature = "guest_debug")]
9 use crate::coredump::{CoredumpMemoryRegion, CoredumpMemoryRegions};
10 #[cfg(feature = "guest_debug")]
11 use crate::coredump::{DumpState, GuestDebuggableError};
12 use crate::migration::url_to_path;
13 use crate::MEMORY_MANAGER_SNAPSHOT_ID;
14 use crate::{GuestMemoryMmap, GuestRegionMmap};
15 use acpi_tables::{aml, aml::Aml};
16 use anyhow::anyhow;
17 #[cfg(target_arch = "x86_64")]
18 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
19 use arch::{layout, RegionType};
20 #[cfg(target_arch = "x86_64")]
21 use devices::ioapic;
22 #[cfg(target_arch = "aarch64")]
23 use hypervisor::HypervisorVmError;
24 #[cfg(target_arch = "x86_64")]
25 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
26 use serde::{Deserialize, Serialize};
27 #[cfg(feature = "guest_debug")]
28 use std::collections::BTreeMap;
29 use std::collections::HashMap;
30 use std::convert::TryInto;
31 use std::ffi;
32 use std::fs::{File, OpenOptions};
33 use std::io::{self, Read};
34 use std::ops::Deref;
35 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
36 use std::path::PathBuf;
37 use std::result;
38 use std::sync::{Arc, Barrier, Mutex};
39 use tracer::trace_scoped;
40 use versionize::{VersionMap, Versionize, VersionizeResult};
41 use versionize_derive::Versionize;
42 use virtio_devices::BlocksState;
43 #[cfg(target_arch = "x86_64")]
44 use vm_allocator::GsiApic;
45 use vm_allocator::{AddressAllocator, SystemAllocator};
46 use vm_device::BusDevice;
47 use vm_memory::bitmap::AtomicBitmap;
48 use vm_memory::guest_memory::FileOffset;
49 use vm_memory::{
50     mmap::MmapRegionError, Address, Bytes, Error as MmapError, GuestAddress, GuestAddressSpace,
51     GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
52 };
53 use vm_migration::{
54     protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
55     Snapshot, SnapshotDataSection, Snapshottable, Transportable, VersionMapped,
56 };
57 
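// 0x18 bytes of MMIO space is enough to cover the hotplug registers defined
// further below (SELECTION, BASE, LENGTH and STATUS, the last at offset 0x14).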
58 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
59 
60 const DEFAULT_MEMORY_ZONE: &str = "mem0";
61 
62 const SNAPSHOT_FILENAME: &str = "memory-ranges";
63 
64 #[cfg(target_arch = "x86_64")]
65 const X86_64_IRQ_BASE: u32 = 5;
66 
67 #[cfg(target_arch = "x86_64")]
68 const SGX_PAGE_SIZE: u64 = 1 << 12;
69 
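// Maximum number of ACPI memory hotplug slots ("DIMMs") exposed to the guest.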
70 const HOTPLUG_COUNT: usize = 8;
71 
72 // Memory policy constants
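// These mirror the Linux mbind(2) ABI values and are passed straight to the
// raw syscall in Self::mbind().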
73 const MPOL_BIND: u32 = 2;
74 const MPOL_MF_STRICT: u32 = 1;
75 const MPOL_MF_MOVE: u32 = 1 << 1;
76 
77 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
78 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
79 
80 #[derive(Clone, Default, Serialize, Deserialize, Versionize)]
81 struct HotPlugState {
82     base: u64,
83     length: u64,
84     active: bool,
85     inserting: bool,
86     removing: bool,
87 }
88 
89 pub struct VirtioMemZone {
90     region: Arc<GuestRegionMmap>,
91     virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
92     hotplugged_size: u64,
93     hugepages: bool,
94     blocks_state: Arc<Mutex<BlocksState>>,
95 }
96 
97 impl VirtioMemZone {
98     pub fn region(&self) -> &Arc<GuestRegionMmap> {
99         &self.region
100     }
101     pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
102         self.virtio_device = Some(virtio_device);
103     }
104     pub fn hotplugged_size(&self) -> u64 {
105         self.hotplugged_size
106     }
107     pub fn hugepages(&self) -> bool {
108         self.hugepages
109     }
110     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
111         &self.blocks_state
112     }
113     pub fn plugged_ranges(&self) -> MemoryRangeTable {
114         self.blocks_state
115             .lock()
116             .unwrap()
117             .memory_ranges(self.region.start_addr().raw_value(), true)
118     }
119 }
120 
121 #[derive(Default)]
122 pub struct MemoryZone {
123     regions: Vec<Arc<GuestRegionMmap>>,
124     virtio_mem_zone: Option<VirtioMemZone>,
125 }
126 
127 impl MemoryZone {
128     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
129         &self.regions
130     }
131     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
132         &self.virtio_mem_zone
133     }
134     pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
135         self.virtio_mem_zone.as_mut()
136     }
137 }
138 
139 pub type MemoryZones = HashMap<String, MemoryZone>;
140 
141 #[derive(Clone, Serialize, Deserialize, Versionize)]
142 struct GuestRamMapping {
143     slot: u32,
144     gpa: u64,
145     size: u64,
146     zone_id: String,
147     virtio_mem: bool,
148     file_offset: u64,
149 }
150 
151 #[derive(Clone, Serialize, Deserialize, Versionize)]
152 struct ArchMemRegion {
153     base: u64,
154     size: usize,
155     r_type: RegionType,
156 }
157 
158 pub struct MemoryManager {
159     boot_guest_memory: GuestMemoryMmap,
160     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
161     next_memory_slot: u32,
162     start_of_device_area: GuestAddress,
163     end_of_device_area: GuestAddress,
164     end_of_ram_area: GuestAddress,
165     pub vm: Arc<dyn hypervisor::Vm>,
166     hotplug_slots: Vec<HotPlugState>,
167     selected_slot: usize,
168     mergeable: bool,
169     allocator: Arc<Mutex<SystemAllocator>>,
170     hotplug_method: HotplugMethod,
171     boot_ram: u64,
172     current_ram: u64,
173     next_hotplug_slot: usize,
174     shared: bool,
175     hugepages: bool,
176     hugepage_size: Option<u64>,
177     prefault: bool,
178     thp: bool,
179     #[cfg(target_arch = "x86_64")]
180     sgx_epc_region: Option<SgxEpcRegion>,
181     user_provided_zones: bool,
182     snapshot_memory_ranges: MemoryRangeTable,
183     memory_zones: MemoryZones,
184     log_dirty: bool, // Enable dirty logging for created RAM regions
185     arch_mem_regions: Vec<ArchMemRegion>,
186     ram_allocator: AddressAllocator,
187     dynamic: bool,
188 
189     // Keep track of calls to create_userspace_mapping() for guest RAM.
190     // This is useful for getting the dirty pages as we need to know the
191     // slots that the mappings were created in.
192     guest_ram_mappings: Vec<GuestRamMapping>,
193 
194     pub acpi_address: Option<GuestAddress>,
195     #[cfg(target_arch = "aarch64")]
196     uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
197 }
198 
199 #[derive(Debug)]
200 pub enum Error {
201     /// Failed to create shared file.
202     SharedFileCreate(io::Error),
203 
204     /// Failed to set shared file length.
205     SharedFileSetLen(io::Error),
206 
207     /// Mmap backed guest memory error
208     GuestMemory(MmapError),
209 
210     /// Failed to allocate a memory range.
211     MemoryRangeAllocation,
212 
213     /// Error from region creation
214     GuestMemoryRegion(MmapRegionError),
215 
216     /// No ACPI slot available
217     NoSlotAvailable,
218 
219     /// Not enough space in the hotplug RAM region
220     InsufficientHotplugRam,
221 
222     /// The requested hotplug memory addition is not a valid size
223     InvalidSize,
224 
225     /// Failed to create the user memory region.
226     CreateUserMemoryRegion(hypervisor::HypervisorVmError),
227 
228     /// Failed to remove the user memory region.
229     RemoveUserMemoryRegion(hypervisor::HypervisorVmError),
230 
231     /// Failed to create EventFd.
232     EventFdFail(io::Error),
233 
234     /// Eventfd write error
235     EventfdError(io::Error),
236 
237     /// Failed to resize virtio-mem
238     VirtioMemResizeFail(virtio_devices::mem::Error),
239 
240     /// Cannot restore VM
241     Restore(MigratableError),
242 
243     /// Cannot restore VM because source URL is missing
244     RestoreMissingSourceUrl,
245 
246     /// Cannot create the system allocator
247     CreateSystemAllocator,
248 
249     /// Invalid SGX EPC section size
250     #[cfg(target_arch = "x86_64")]
251     EpcSectionSizeInvalid,
252 
253     /// Failed allocating SGX EPC region
254     #[cfg(target_arch = "x86_64")]
255     SgxEpcRangeAllocation,
256 
257     /// Failed opening SGX virtual EPC device
258     #[cfg(target_arch = "x86_64")]
259     SgxVirtEpcOpen(io::Error),
260 
261     /// Failed setting the SGX virtual EPC section size
262     #[cfg(target_arch = "x86_64")]
263     SgxVirtEpcFileSetLen(io::Error),
264 
265     /// Failed opening SGX provisioning device
266     #[cfg(target_arch = "x86_64")]
267     SgxProvisionOpen(io::Error),
268 
269     /// Failed enabling SGX provisioning
270     #[cfg(target_arch = "x86_64")]
271     SgxEnableProvisioning(hypervisor::HypervisorVmError),
272 
273     /// Failed creating a new MmapRegion instance.
274     #[cfg(target_arch = "x86_64")]
275     NewMmapRegion(vm_memory::mmap::MmapRegionError),
276 
277     /// No memory zones found.
278     MissingMemoryZones,
279 
280     /// Memory configuration is not valid.
281     InvalidMemoryParameters,
282 
283     /// Forbidden operation. Impossible to resize guest memory if it is
284     /// backed by user defined memory regions.
285     InvalidResizeWithMemoryZones,
286 
287     /// It's invalid to try applying a NUMA policy to a memory zone that is
288     /// memory mapped with MAP_SHARED.
289     InvalidSharedMemoryZoneWithHostNuma,
290 
291     /// Failed applying NUMA memory policy.
292     ApplyNumaPolicy(io::Error),
293 
294     /// Memory zone identifier is not unique.
295     DuplicateZoneId,
296 
297     /// No virtio-mem resizing handler found.
298     MissingVirtioMemHandler,
299 
300     /// Unknown memory zone.
301     UnknownMemoryZone,
302 
303     /// Invalid size for resizing. Can be anything except 0.
304     InvalidHotplugSize,
305 
306     /// Invalid hotplug method associated with memory zones resizing capability.
307     InvalidHotplugMethodWithMemoryZones,
308 
309     /// Could not find specified memory zone identifier from hash map.
310     MissingZoneIdentifier,
311 
312     /// Resizing the memory zone failed.
313     ResizeZone,
314 
315     /// Guest address overflow
316     GuestAddressOverFlow,
317 
318     /// Error opening snapshot file
319     SnapshotOpen(io::Error),
320 
321     /// Error copying snapshot into region
322     SnapshotCopy(GuestMemoryError),
323 
324     /// Failed to allocate MMIO address
325     AllocateMmioAddress,
326 
327     #[cfg(target_arch = "aarch64")]
328     /// Failed to create UEFI flash
329     CreateUefiFlash(HypervisorVmError),
330 }
331 
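// Register interface of the memory hotplug device implemented by the
// BusDevice impl below: a slot selector, the selected slot's 64-bit base and
// length (as low/high 32-bit halves), and a status byte carrying the
// enable/inserting/removing/eject flags.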
332 const ENABLE_FLAG: usize = 0;
333 const INSERTING_FLAG: usize = 1;
334 const REMOVING_FLAG: usize = 2;
335 const EJECT_FLAG: usize = 3;
336 
337 const BASE_OFFSET_LOW: u64 = 0;
338 const BASE_OFFSET_HIGH: u64 = 0x4;
339 const LENGTH_OFFSET_LOW: u64 = 0x8;
340 const LENGTH_OFFSET_HIGH: u64 = 0xC;
341 const STATUS_OFFSET: u64 = 0x14;
342 const SELECTION_OFFSET: u64 = 0;
343 
344 // The MMIO address space size is reduced by 64 KiB for the following reasons:
345 //  - Reduce the addressable space size by at least 4 KiB to work around a
346 //    Linux bug when the VMM allocates devices at the end of the addressable
347 //    space
348 //  - Windows requires the addressable space size to be 64 KiB aligned
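// For example, with phys_bits = 40 this yields 1 TiB - 64 KiB of addressable space.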
349 fn mmio_address_space_size(phys_bits: u8) -> u64 {
350     (1 << phys_bits) - (1 << 16)
351 }
352 
353 impl BusDevice for MemoryManager {
354     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
355         if self.selected_slot < self.hotplug_slots.len() {
356             let state = &self.hotplug_slots[self.selected_slot];
357             match offset {
358                 BASE_OFFSET_LOW => {
359                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
360                 }
361                 BASE_OFFSET_HIGH => {
362                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
363                 }
364                 LENGTH_OFFSET_LOW => {
365                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
366                 }
367                 LENGTH_OFFSET_HIGH => {
368                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
369                 }
370                 STATUS_OFFSET => {
371                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
372                     data.fill(0);
373                     if state.active {
374                         data[0] |= 1 << ENABLE_FLAG;
375                     }
376                     if state.inserting {
377                         data[0] |= 1 << INSERTING_FLAG;
378                     }
379                     if state.removing {
380                         data[0] |= 1 << REMOVING_FLAG;
381                     }
382                 }
383                 _ => {
384                     warn!(
385                         "Unexpected offset for accessing memory manager device: {:#}",
386                         offset
387                     );
388                 }
389             }
390         } else {
391             warn!("Out of range memory slot: {}", self.selected_slot);
392         }
393     }
394 
395     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
396         match offset {
397             SELECTION_OFFSET => {
398                 self.selected_slot = usize::from(data[0]);
399             }
400             STATUS_OFFSET => {
401                 if self.selected_slot < self.hotplug_slots.len() {
402                     let state = &mut self.hotplug_slots[self.selected_slot];
403                     // The ACPI code writes back a 1 to acknowledge the insertion
404                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
405                         state.inserting = false;
406                     }
407                     // Ditto for removal
408                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
409                         state.removing = false;
410                     }
411                     // Trigger removal of "DIMM"
412                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
413                         warn!("Ejection of memory not currently supported");
414                     }
415                 } else {
416                     warn!("Out of range memory slot: {}", self.selected_slot);
417                 }
418             }
419             _ => {
420                 warn!(
421                     "Unexpected offset for accessing memory manager device: {:#}",
422                     offset
423                 );
424             }
425         };
426         None
427     }
428 }
429 
430 impl MemoryManager {
431     /// Creates all memory regions based on the available RAM ranges defined
432     /// by `ram_regions`, and based on the description of the memory zones.
433     /// In practice, this function can perform multiple memory mappings of the
434     /// same backing file if there's a hole in the address space between two
435     /// RAM ranges.
436     /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G)
437     /// and zones containing two zones (size 1G and size 4G).
438     /// This function will create 3 resulting memory regions:
439     /// - The first mapping the first memory zone entirely over the 0-1G range
440     /// - The second mapping the second memory zone partially over the 1G-3G range
441     /// - The third mapping the second memory zone partially over the 4G-6G range
442     fn create_memory_regions_from_zones(
443         ram_regions: &[(GuestAddress, usize)],
444         zones: &[MemoryZoneConfig],
445         prefault: Option<bool>,
446         thp: bool,
447     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
448         let mut zones = zones.to_owned();
449         let mut mem_regions = Vec::new();
450         let mut zone = zones.remove(0);
451         let mut zone_offset = 0;
452         let mut memory_zones = HashMap::new();
453 
454         // Add zone id to the list of memory zones.
455         memory_zones.insert(zone.id.clone(), MemoryZone::default());
456 
457         for ram_region in ram_regions.iter() {
458             let mut ram_region_offset = 0;
459             let mut exit = false;
460 
461             loop {
462                 let mut ram_region_consumed = false;
463                 let mut pull_next_zone = false;
464 
465                 let ram_region_sub_size = ram_region.1 - ram_region_offset;
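                // The size of the next mapping is the smaller of what remains
                // in the current RAM region and what remains in the current zone.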
466                 let zone_sub_size = zone.size as usize - zone_offset;
467 
468                 let file_offset = zone_offset as u64;
469                 let region_start = ram_region
470                     .0
471                     .checked_add(ram_region_offset as u64)
472                     .ok_or(Error::GuestAddressOverFlow)?;
473                 let region_size = if zone_sub_size <= ram_region_sub_size {
474                     if zone_sub_size == ram_region_sub_size {
475                         ram_region_consumed = true;
476                     }
477 
478                     ram_region_offset += zone_sub_size;
479                     pull_next_zone = true;
480 
481                     zone_sub_size
482                 } else {
483                     zone_offset += ram_region_sub_size;
484                     ram_region_consumed = true;
485 
486                     ram_region_sub_size
487                 };
488 
489                 let region = MemoryManager::create_ram_region(
490                     &zone.file,
491                     file_offset,
492                     region_start,
493                     region_size,
494                     match prefault {
495                         Some(pf) => pf,
496                         None => zone.prefault,
497                     },
498                     zone.shared,
499                     zone.hugepages,
500                     zone.hugepage_size,
501                     zone.host_numa_node,
502                     None,
503                     thp,
504                 )?;
505 
506                 // Add region to the list of regions associated with the
507                 // current memory zone.
508                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
509                     memory_zone.regions.push(region.clone());
510                 }
511 
512                 mem_regions.push(region);
513 
514                 if pull_next_zone {
515                     // Get the next zone and reset the offset.
516                     zone_offset = 0;
517                     if zones.is_empty() {
518                         exit = true;
519                         break;
520                     }
521                     zone = zones.remove(0);
522 
523                     // Check if zone id already exist. In case it does, throw
524                     // an error as we need unique identifiers. Otherwise, add
525                     // the new zone id to the list of memory zones.
526                     if memory_zones.contains_key(&zone.id) {
527                         error!(
528                             "Memory zone identifier '{}' found more than once. \
529                             It must be unique",
530                             zone.id,
531                         );
532                         return Err(Error::DuplicateZoneId);
533                     }
534                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
535                 }
536 
537                 if ram_region_consumed {
538                     break;
539                 }
540             }
541 
542             if exit {
543                 break;
544             }
545         }
546 
547         Ok((mem_regions, memory_zones))
548     }
549 
550     // Restore the GuestMemory regions along with the MemoryZone zones.
551     fn restore_memory_regions_and_zones(
552         guest_ram_mappings: &[GuestRamMapping],
553         zones_config: &[MemoryZoneConfig],
554         prefault: Option<bool>,
555         mut existing_memory_files: HashMap<u32, File>,
556         thp: bool,
557     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
558         let mut memory_regions = Vec::new();
559         let mut memory_zones = HashMap::new();
560 
561         for zone_config in zones_config {
562             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
563         }
564 
565         for guest_ram_mapping in guest_ram_mappings {
566             for zone_config in zones_config {
567                 if guest_ram_mapping.zone_id == zone_config.id {
568                     let region = MemoryManager::create_ram_region(
569                         &zone_config.file,
570                         guest_ram_mapping.file_offset,
571                         GuestAddress(guest_ram_mapping.gpa),
572                         guest_ram_mapping.size as usize,
573                         match prefault {
574                             Some(pf) => pf,
575                             None => zone_config.prefault,
576                         },
577                         zone_config.shared,
578                         zone_config.hugepages,
579                         zone_config.hugepage_size,
580                         zone_config.host_numa_node,
581                         existing_memory_files.remove(&guest_ram_mapping.slot),
582                         thp,
583                     )?;
584                     memory_regions.push(Arc::clone(&region));
585                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
586                         if guest_ram_mapping.virtio_mem {
587                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
588                             let region_size = region.len();
589                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
590                                 region,
591                                 virtio_device: None,
592                                 hotplugged_size,
593                                 hugepages: zone_config.hugepages,
594                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
595                             });
596                         } else {
597                             memory_zone.regions.push(region);
598                         }
599                     }
600                 }
601             }
602         }
603 
604         memory_regions.sort_by_key(|x| x.start_addr());
605 
606         Ok((memory_regions, memory_zones))
607     }
608 
609     fn fill_saved_regions(
610         &mut self,
611         file_path: PathBuf,
612         saved_regions: MemoryRangeTable,
613     ) -> Result<(), Error> {
614         if saved_regions.is_empty() {
615             return Ok(());
616         }
617 
618         // Open (read only) the snapshot file.
619         let mut memory_file = OpenOptions::new()
620             .read(true)
621             .open(file_path)
622             .map_err(Error::SnapshotOpen)?;
623 
624         let guest_memory = self.guest_memory.memory();
625         for range in saved_regions.regions() {
626             let mut offset: u64 = 0;
627             // Here we manually handle the retry in case we can't write the
628             // whole region at once, because we can't use the read_exact_from()
629             // implementation from vm-memory::GuestMemory as it does not follow
630             // the correct behavior. For more info about this issue see:
631             // https://github.com/rust-vmm/vm-memory/issues/174
632             loop {
633                 let bytes_read = guest_memory
634                     .read_from(
635                         GuestAddress(range.gpa + offset),
636                         &mut memory_file,
637                         (range.length - offset) as usize,
638                     )
639                     .map_err(Error::SnapshotCopy)?;
640                 offset += bytes_read as u64;
641 
642                 if offset == range.length {
643                     break;
644                 }
645             }
646         }
647 
648         Ok(())
649     }
650 
651     fn validate_memory_config(
652         config: &MemoryConfig,
653         user_provided_zones: bool,
654     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
655         let mut allow_mem_hotplug = false;
656 
657         if !user_provided_zones {
658             if config.zones.is_some() {
659                 error!(
660                     "User defined memory regions can't be provided if the \
661                     memory size is not 0"
662                 );
663                 return Err(Error::InvalidMemoryParameters);
664             }
665 
666             if config.hotplug_size.is_some() {
667                 allow_mem_hotplug = true;
668             }
669 
670             if let Some(hotplugged_size) = config.hotplugged_size {
671                 if let Some(hotplug_size) = config.hotplug_size {
672                     if hotplugged_size > hotplug_size {
673                         error!(
674                             "'hotplugged_size' {} can't be bigger than \
675                             'hotplug_size' {}",
676                             hotplugged_size, hotplug_size,
677                         );
678                         return Err(Error::InvalidMemoryParameters);
679                     }
680                 } else {
681                     error!(
682                     "Invalid to define 'hotplugged_size' when there is \
683                     no 'hotplug_size'"
684                     );
685                     return Err(Error::InvalidMemoryParameters);
686                 }
687                 if config.hotplug_method == HotplugMethod::Acpi {
688                     error!(
689                         "Invalid to define 'hotplugged_size' with hotplug \
690                         method 'acpi'"
691                     );
692                     return Err(Error::InvalidMemoryParameters);
693                 }
694             }
695 
696             // Create a single zone from the global memory config. This lets
697             // us reuse the codepath for user defined memory zones.
698             let zones = vec![MemoryZoneConfig {
699                 id: String::from(DEFAULT_MEMORY_ZONE),
700                 size: config.size,
701                 file: None,
702                 shared: config.shared,
703                 hugepages: config.hugepages,
704                 hugepage_size: config.hugepage_size,
705                 host_numa_node: None,
706                 hotplug_size: config.hotplug_size,
707                 hotplugged_size: config.hotplugged_size,
708                 prefault: config.prefault,
709             }];
710 
711             Ok((config.size, zones, allow_mem_hotplug))
712         } else {
713             if config.zones.is_none() {
714                 error!(
715                     "User defined memory regions must be provided if the \
716                     memory size is 0"
717                 );
718                 return Err(Error::MissingMemoryZones);
719             }
720 
721             // Safe to unwrap as we checked right above there were some
722             // regions.
723             let zones = config.zones.clone().unwrap();
724             if zones.is_empty() {
725                 return Err(Error::MissingMemoryZones);
726             }
727 
728             let mut total_ram_size: u64 = 0;
729             for zone in zones.iter() {
730                 total_ram_size += zone.size;
731 
732                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
733                     error!(
734                         "Invalid to set host NUMA policy for a memory zone \
735                         backed by a regular file and mapped as 'shared'"
736                     );
737                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
738                 }
739 
740                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
741                     error!("Invalid to set ACPI hotplug method for memory zones");
742                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
743                 }
744 
745                 if let Some(hotplugged_size) = zone.hotplugged_size {
746                     if let Some(hotplug_size) = zone.hotplug_size {
747                         if hotplugged_size > hotplug_size {
748                             error!(
749                                 "'hotplugged_size' {} can't be bigger than \
750                                 'hotplug_size' {}",
751                                 hotplugged_size, hotplug_size,
752                             );
753                             return Err(Error::InvalidMemoryParameters);
754                         }
755                     } else {
756                         error!(
757                             "Invalid to define 'hotplugged_size' when there is \
758                             no 'hotplug_size' for a memory zone"
759                         );
760                         return Err(Error::InvalidMemoryParameters);
761                     }
762                     if config.hotplug_method == HotplugMethod::Acpi {
763                         error!(
764                             "Invalid to define 'hotplugged_size' with hotplug \
765                             method 'acpi'"
766                         );
767                         return Err(Error::InvalidMemoryParameters);
768                     }
769                 }
770             }
771 
772             Ok((total_ram_size, zones, allow_mem_hotplug))
773         }
774     }
775 
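    /// Creates the hypervisor userspace mappings for every guest RAM region
    /// (including virtio-mem regions) and reserves their ranges, plus the
    /// non-RAM arch regions, in the RAM address allocator.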
776     fn allocate_address_space(&mut self) -> Result<(), Error> {
777         let mut list = Vec::new();
778 
779         for (zone_id, memory_zone) in self.memory_zones.iter() {
780             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
781                 memory_zone
782                     .regions()
783                     .iter()
784                     .map(|r| (r.clone(), false))
785                     .collect();
786 
787             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
788                 regions.push((virtio_mem_zone.region().clone(), true));
789             }
790 
791             list.push((zone_id.clone(), regions));
792         }
793 
794         for (zone_id, regions) in list {
795             for (region, virtio_mem) in regions {
796                 let slot = self.create_userspace_mapping(
797                     region.start_addr().raw_value(),
798                     region.len(),
799                     region.as_ptr() as u64,
800                     self.mergeable,
801                     false,
802                     self.log_dirty,
803                 )?;
804 
805                 let file_offset = if let Some(file_offset) = region.file_offset() {
806                     file_offset.start()
807                 } else {
808                     0
809                 };
810 
811                 self.guest_ram_mappings.push(GuestRamMapping {
812                     gpa: region.start_addr().raw_value(),
813                     size: region.len(),
814                     slot,
815                     zone_id: zone_id.clone(),
816                     virtio_mem,
817                     file_offset,
818                 });
819                 self.ram_allocator
820                     .allocate(Some(region.start_addr()), region.len(), None)
821                     .ok_or(Error::MemoryRangeAllocation)?;
822             }
823         }
824 
825         // Allocate SubRegion and Reserved address ranges.
826         for region in self.arch_mem_regions.iter() {
827             if region.r_type == RegionType::Ram {
828                 // Ignore the RAM type since ranges have already been allocated
829                 // based on the GuestMemory regions.
830                 continue;
831             }
832             self.ram_allocator
833                 .allocate(
834                     Some(GuestAddress(region.base)),
835                     region.size as GuestUsize,
836                     None,
837                 )
838                 .ok_or(Error::MemoryRangeAllocation)?;
839         }
840 
841         Ok(())
842     }
843 
844     #[cfg(target_arch = "aarch64")]
845     fn add_uefi_flash(&mut self) -> Result<(), Error> {
846         // On AArch64, the UEFI binary requires a flash device at address 0.
847         // 4 MiB memory is mapped to simulate the flash.
848         let uefi_mem_slot = self.allocate_memory_slot();
849         let uefi_region = GuestRegionMmap::new(
850             MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
851             arch::layout::UEFI_START,
852         )
853         .unwrap();
854         let uefi_mem_region = self.vm.make_user_memory_region(
855             uefi_mem_slot,
856             uefi_region.start_addr().raw_value(),
857             uefi_region.len() as u64,
858             uefi_region.as_ptr() as u64,
859             false,
860             false,
861         );
862         self.vm
863             .create_user_memory_region(uefi_mem_region)
864             .map_err(Error::CreateUefiFlash)?;
865 
866         let uefi_flash =
867             GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());
868 
869         self.uefi_flash = Some(uefi_flash);
870 
871         Ok(())
872     }
873 
874     #[allow(clippy::too_many_arguments)]
875     pub fn new(
876         vm: Arc<dyn hypervisor::Vm>,
877         config: &MemoryConfig,
878         prefault: Option<bool>,
879         phys_bits: u8,
880         #[cfg(feature = "tdx")] tdx_enabled: bool,
881         restore_data: Option<&MemoryManagerSnapshotData>,
882         existing_memory_files: Option<HashMap<u32, File>>,
883         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
884     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
885         trace_scoped!("MemoryManager::new");
886 
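        // A global memory size of 0 means the guest RAM is entirely described
        // through user-defined memory zones.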
887         let user_provided_zones = config.size == 0;
888 
889         let mmio_address_space_size = mmio_address_space_size(phys_bits);
890         debug_assert_eq!(
891             (((mmio_address_space_size) >> 16) << 16),
892             mmio_address_space_size
893         );
894         let start_of_platform_device_area =
895             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
896         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
897 
898         let (ram_size, zones, allow_mem_hotplug) =
899             Self::validate_memory_config(config, user_provided_zones)?;
900 
901         let (
902             start_of_device_area,
903             boot_ram,
904             current_ram,
905             arch_mem_regions,
906             memory_zones,
907             guest_memory,
908             boot_guest_memory,
909             hotplug_slots,
910             next_memory_slot,
911             selected_slot,
912             next_hotplug_slot,
913         ) = if let Some(data) = restore_data {
914             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
915                 &data.guest_ram_mappings,
916                 &zones,
917                 prefault,
918                 existing_memory_files.unwrap_or_default(),
919                 config.thp,
920             )?;
921             let guest_memory =
922                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
923             let boot_guest_memory = guest_memory.clone();
924             (
925                 GuestAddress(data.start_of_device_area),
926                 data.boot_ram,
927                 data.current_ram,
928                 data.arch_mem_regions.clone(),
929                 memory_zones,
930                 guest_memory,
931                 boot_guest_memory,
932                 data.hotplug_slots.clone(),
933                 data.next_memory_slot,
934                 data.selected_slot,
935                 data.next_hotplug_slot,
936             )
937         } else {
938             // Init guest memory
939             let arch_mem_regions = arch::arch_memory_regions(ram_size);
940 
941             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
942                 .iter()
943                 .filter(|r| r.2 == RegionType::Ram)
944                 .map(|r| (r.0, r.1))
945                 .collect();
946 
947             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
948                 .iter()
949                 .map(|(a, b, c)| ArchMemRegion {
950                     base: a.0,
951                     size: *b,
952                     r_type: *c,
953                 })
954                 .collect();
955 
956             let (mem_regions, mut memory_zones) =
957                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;
958 
959             let mut guest_memory =
960                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
961 
962             let boot_guest_memory = guest_memory.clone();
963 
964             let mut start_of_device_area =
965                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
966 
967             // Update list of memory zones for resize.
968             for zone in zones.iter() {
969                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
970                     if let Some(hotplug_size) = zone.hotplug_size {
971                         if hotplug_size == 0 {
972                             error!("'hotplug_size' can't be 0");
973                             return Err(Error::InvalidHotplugSize);
974                         }
975 
976                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
977                             start_of_device_area = start_of_device_area
978                                 .checked_add(hotplug_size)
979                                 .ok_or(Error::GuestAddressOverFlow)?;
980                         } else {
981                             // Alignment must be "natural" i.e. same as size of block
982                             let start_addr = GuestAddress(
983                                 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
984                                     - 1)
985                                     / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
986                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
987                             );
988 
989                             // When `prefault` is set by vm_restore, the memory manager
990                             // creates the RAM region with the `prefault` option from the
991                             // restore config rather than the one from the zone config.
992                             let region = MemoryManager::create_ram_region(
993                                 &None,
994                                 0,
995                                 start_addr,
996                                 hotplug_size as usize,
997                                 match prefault {
998                                     Some(pf) => pf,
999                                     None => zone.prefault,
1000                                 },
1001                                 zone.shared,
1002                                 zone.hugepages,
1003                                 zone.hugepage_size,
1004                                 zone.host_numa_node,
1005                                 None,
1006                                 config.thp,
1007                             )?;
1008 
1009                             guest_memory = guest_memory
1010                                 .insert_region(Arc::clone(&region))
1011                                 .map_err(Error::GuestMemory)?;
1012 
1013                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
1014                             let region_size = region.len();
1015                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
1016                                 region,
1017                                 virtio_device: None,
1018                                 hotplugged_size,
1019                                 hugepages: zone.hugepages,
1020                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
1021                             });
1022 
1023                             start_of_device_area = start_addr
1024                                 .checked_add(hotplug_size)
1025                                 .ok_or(Error::GuestAddressOverFlow)?;
1026                         }
1027                     }
1028                 } else {
1029                     return Err(Error::MissingZoneIdentifier);
1030                 }
1031             }
1032 
1033             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
1034             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
1035 
1036             (
1037                 start_of_device_area,
1038                 ram_size,
1039                 ram_size,
1040                 arch_mem_regions,
1041                 memory_zones,
1042                 guest_memory,
1043                 boot_guest_memory,
1044                 hotplug_slots,
1045                 0,
1046                 0,
1047                 0,
1048             )
1049         };
1050 
1051         let guest_memory = GuestMemoryAtomic::new(guest_memory);
1052 
1053         // Both MMIO and PIO address spaces start at address 0.
1054         let allocator = Arc::new(Mutex::new(
1055             SystemAllocator::new(
1056                 #[cfg(target_arch = "x86_64")]
1057                 {
1058                     GuestAddress(0)
1059                 },
1060                 #[cfg(target_arch = "x86_64")]
1061                 {
1062                     1 << 16
1063                 },
1064                 start_of_platform_device_area,
1065                 PLATFORM_DEVICE_AREA_SIZE,
1066                 layout::MEM_32BIT_DEVICES_START,
1067                 layout::MEM_32BIT_DEVICES_SIZE,
1068                 #[cfg(target_arch = "x86_64")]
1069                 vec![GsiApic::new(
1070                     X86_64_IRQ_BASE,
1071                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1072                 )],
1073             )
1074             .ok_or(Error::CreateSystemAllocator)?,
1075         ));
1076 
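        // Memory hotplug (and dirty logging of RAM regions) is only possible
        // when the guest memory layout can change at runtime, which is not
        // the case for a TD.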
1077         #[cfg(not(feature = "tdx"))]
1078         let dynamic = true;
1079         #[cfg(feature = "tdx")]
1080         let dynamic = !tdx_enabled;
1081 
1082         let acpi_address = if dynamic
1083             && config.hotplug_method == HotplugMethod::Acpi
1084             && (config.hotplug_size.unwrap_or_default() > 0)
1085         {
1086             Some(
1087                 allocator
1088                     .lock()
1089                     .unwrap()
1090                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1091                     .ok_or(Error::AllocateMmioAddress)?,
1092             )
1093         } else {
1094             None
1095         };
1096 
1097         // When SGX is used, the start of the device area and the RAM area may
1098         // diverge, but at this point they are next to each other.
1099         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1100         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1101 
1102         let mut memory_manager = MemoryManager {
1103             boot_guest_memory,
1104             guest_memory,
1105             next_memory_slot,
1106             start_of_device_area,
1107             end_of_device_area,
1108             end_of_ram_area,
1109             vm,
1110             hotplug_slots,
1111             selected_slot,
1112             mergeable: config.mergeable,
1113             allocator,
1114             hotplug_method: config.hotplug_method,
1115             boot_ram,
1116             current_ram,
1117             next_hotplug_slot,
1118             shared: config.shared,
1119             hugepages: config.hugepages,
1120             hugepage_size: config.hugepage_size,
1121             prefault: config.prefault,
1122             #[cfg(target_arch = "x86_64")]
1123             sgx_epc_region: None,
1124             user_provided_zones,
1125             snapshot_memory_ranges: MemoryRangeTable::default(),
1126             memory_zones,
1127             guest_ram_mappings: Vec::new(),
1128             acpi_address,
1129             log_dirty: dynamic, // Cannot log dirty pages on a TD
1130             arch_mem_regions,
1131             ram_allocator,
1132             dynamic,
1133             #[cfg(target_arch = "aarch64")]
1134             uefi_flash: None,
1135             thp: config.thp,
1136         };
1137 
1138         memory_manager.allocate_address_space()?;
1139 
1140         #[cfg(target_arch = "aarch64")]
1141         memory_manager.add_uefi_flash()?;
1142 
1143         #[cfg(target_arch = "x86_64")]
1144         if let Some(sgx_epc_config) = sgx_epc_config {
1145             memory_manager.setup_sgx(sgx_epc_config)?;
1146         }
1147 
1148         Ok(Arc::new(Mutex::new(memory_manager)))
1149     }
1150 
1151     pub fn new_from_snapshot(
1152         snapshot: &Snapshot,
1153         vm: Arc<dyn hypervisor::Vm>,
1154         config: &MemoryConfig,
1155         source_url: Option<&str>,
1156         prefault: bool,
1157         phys_bits: u8,
1158     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1159         if let Some(source_url) = source_url {
1160             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1161             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1162 
1163             let mem_snapshot: MemoryManagerSnapshotData = snapshot
1164                 .to_versioned_state(MEMORY_MANAGER_SNAPSHOT_ID)
1165                 .map_err(Error::Restore)?;
1166 
1167             let mm = MemoryManager::new(
1168                 vm,
1169                 config,
1170                 Some(prefault),
1171                 phys_bits,
1172                 #[cfg(feature = "tdx")]
1173                 false,
1174                 Some(&mem_snapshot),
1175                 None,
1176                 #[cfg(target_arch = "x86_64")]
1177                 None,
1178             )?;
1179 
1180             mm.lock()
1181                 .unwrap()
1182                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1183 
1184             Ok(mm)
1185         } else {
1186             Err(Error::RestoreMissingSourceUrl)
1187         }
1188     }
1189 
1190     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1191         // SAFETY: FFI call with correct arguments
1192         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1193 
1194         if res < 0 {
1195             Err(io::Error::last_os_error())
1196         } else {
1197             Ok(res as RawFd)
1198         }
1199     }
1200 
1201     fn mbind(
1202         addr: *mut u8,
1203         len: u64,
1204         mode: u32,
1205         nodemask: Vec<u64>,
1206         maxnode: u64,
1207         flags: u32,
1208     ) -> Result<(), io::Error> {
1209         // SAFETY: FFI call with correct arguments
1210         let res = unsafe {
1211             libc::syscall(
1212                 libc::SYS_mbind,
1213                 addr as *mut libc::c_void,
1214                 len,
1215                 mode,
1216                 nodemask.as_ptr(),
1217                 maxnode,
1218                 flags,
1219             )
1220         };
1221 
1222         if res < 0 {
1223             Err(io::Error::last_os_error())
1224         } else {
1225             Ok(())
1226         }
1227     }
1228 
1229     fn create_anonymous_file(
1230         size: usize,
1231         hugepages: bool,
1232         hugepage_size: Option<u64>,
1233     ) -> Result<FileOffset, Error> {
1234         let fd = Self::memfd_create(
1235             &ffi::CString::new("ch_ram").unwrap(),
1236             libc::MFD_CLOEXEC
1237                 | if hugepages {
1238                     libc::MFD_HUGETLB
1239                         | if let Some(hugepage_size) = hugepage_size {
1240                             /*
1241                              * From the Linux kernel:
1242                              * Several system calls take a flag to request "hugetlb" huge pages.
1243                              * Without further specification, these system calls will use the
1244                              * system's default huge page size.  If a system supports multiple
1245                              * huge page sizes, the desired huge page size can be specified in
1246                              * bits [26:31] of the flag arguments.  The value in these 6 bits
1247                              * will encode the log2 of the huge page size.
1248                              */
1249 
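                            // For example, a 2 MiB huge page (1 << 21) encodes
                            // as 21 << 26, i.e. the kernel's MFD_HUGE_2MB flag.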
1250                             hugepage_size.trailing_zeros() << 26
1251                         } else {
1252                             // Use the system default huge page size
1253                             0
1254                         }
1255                 } else {
1256                     0
1257                 },
1258         )
1259         .map_err(Error::SharedFileCreate)?;
1260 
1261         // SAFETY: fd is valid
1262         let f = unsafe { File::from_raw_fd(fd) };
1263         f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1264 
1265         Ok(FileOffset::new(f, 0))
1266     }
1267 
1268     fn open_backing_file(
1269         backing_file: &PathBuf,
1270         file_offset: u64,
1271         size: usize,
1272     ) -> Result<FileOffset, Error> {
1273         if backing_file.is_dir() {
1274             // Override file offset as it does not apply in this case.
1275             info!(
1276                 "Ignoring file offset since the backing file is a \
1277                         temporary file created from the specified directory."
1278             );
1279             let fs_str = format!("{}{}", backing_file.display(), "/tmpfile_XXXXXX");
1280             let fs = ffi::CString::new(fs_str).unwrap();
1281             let mut path = fs.as_bytes_with_nul().to_owned();
1282             let path_ptr = path.as_mut_ptr() as *mut _;
1283             // SAFETY: FFI call
1284             let fd = unsafe { libc::mkstemp(path_ptr) };
1285             if fd == -1 {
1286                 return Err(Error::SharedFileCreate(std::io::Error::last_os_error()));
1287             }
1288             // SAFETY: FFI call
1289             unsafe { libc::unlink(path_ptr) };
1290             // SAFETY: fd is valid
1291             let f = unsafe { File::from_raw_fd(fd) };
1292             f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1293 
1294             Ok(FileOffset::new(f, 0))
1295         } else {
1296             let f = OpenOptions::new()
1297                 .read(true)
1298                 .write(true)
1299                 .open(backing_file)
1300                 .map_err(Error::SharedFileCreate)?;
1301 
1302             Ok(FileOffset::new(f, file_offset))
1303         }
1304     }
1305 
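    /// Creates a single RAM region backed either by an existing file
    /// descriptor, an explicit backing file, an anonymous memfd (when shared
    /// or hugepages is requested), or a private anonymous mapping, and
    /// applies THP advice and the host NUMA policy when applicable.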
1306     #[allow(clippy::too_many_arguments)]
1307     pub fn create_ram_region(
1308         backing_file: &Option<PathBuf>,
1309         file_offset: u64,
1310         start_addr: GuestAddress,
1311         size: usize,
1312         prefault: bool,
1313         shared: bool,
1314         hugepages: bool,
1315         hugepage_size: Option<u64>,
1316         host_numa_node: Option<u32>,
1317         existing_memory_file: Option<File>,
1318         thp: bool,
1319     ) -> Result<Arc<GuestRegionMmap>, Error> {
1320         let mut mmap_flags = libc::MAP_NORESERVE;
1321 
1322         // The duplication of mmap_flags ORing here is unfortunate but it also makes
1323         // the complexity of the handling clear.
1324         let fo = if let Some(f) = existing_memory_file {
1325             // It must be MAP_SHARED as we wouldn't already have an FD
1326             mmap_flags |= libc::MAP_SHARED;
1327             Some(FileOffset::new(f, file_offset))
1328         } else if let Some(backing_file) = backing_file {
1329             if shared {
1330                 mmap_flags |= libc::MAP_SHARED;
1331             } else {
1332                 mmap_flags |= libc::MAP_PRIVATE;
1333             }
1334             Some(Self::open_backing_file(backing_file, file_offset, size)?)
1335         } else if shared || hugepages {
1336             // For hugepages we must also MAP_SHARED otherwise we will trigger #4805
1337             // because the MAP_PRIVATE will trigger CoW against the backing file with
1338             // the VFIO pinning
1339             mmap_flags |= libc::MAP_SHARED;
1340             Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
1341         } else {
1342             mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
1343             None
1344         };
1345 
1346         if prefault {
1347             mmap_flags |= libc::MAP_POPULATE;
1348         }
1349 
1350         let region = GuestRegionMmap::new(
1351             MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
1352                 .map_err(Error::GuestMemoryRegion)?,
1353             start_addr,
1354         )
1355         .map_err(Error::GuestMemory)?;
1356 
1357         if region.file_offset().is_none() && thp {
1358             info!(
1359                 "Anonymous mapping at 0x{:x} (size = 0x{:x})",
1360                 region.as_ptr() as u64,
1361                 size
1362             );
1363             // SAFETY: FFI call with correct arguments
1364             let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
1365             if ret != 0 {
1366                 let e = io::Error::last_os_error();
1367                 warn!("Failed to mark pages as THP eligible: {}", e);
1368             }
1369         }
1370 
1371         // Apply NUMA policy if needed.
1372         if let Some(node) = host_numa_node {
1373             let addr = region.deref().as_ptr();
1374             let len = region.deref().size() as u64;
1375             let mode = MPOL_BIND;
1376             let mut nodemask: Vec<u64> = Vec::new();
1377             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1378 
1379             // Linux is kind of buggy in the way it interprets maxnode as it
1380             // will cut off the last node. That's why we have to add 1 to what
1381             // we would consider as the proper maxnode value.
1382             let maxnode = node as u64 + 1 + 1;
1383 
1384             // Allocate the right size for the vector.
1385             nodemask.resize((node as usize / 64) + 1, 0);
1386 
1387             // Fill the global bitmask through the nodemask vector.
1388             let idx = (node / 64) as usize;
1389             let shift = node % 64;
1390             nodemask[idx] |= 1u64 << shift;
1391 
1392             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1393             // force the kernel to move all pages that might have been already
1394             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1395             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1396             // MPOL_BIND is the selected mode as it specifies a strict policy
1397             // that restricts memory allocation to the nodes specified in the
1398             // nodemask.
1399             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1400                 .map_err(Error::ApplyNumaPolicy)?;
1401         }
1402 
1403         Ok(Arc::new(region))
1404     }
1405 
1406     // Update the GuestMemoryMmap with the new range
1407     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1408         let guest_memory = self
1409             .guest_memory
1410             .memory()
1411             .insert_region(region)
1412             .map_err(Error::GuestMemory)?;
1413         self.guest_memory.lock().unwrap().replace(guest_memory);
1414 
1415         Ok(())
1416     }
1417 
1418     //
1419     // Calculate the start address of an area next to RAM.
1420     //
1421     // If memory hotplug is allowed, the start address needs to be aligned
1422     // (rounded up) to a 128MiB boundary.
1423     // If memory hotplug is not allowed, there is no alignment required.
1424     // If RAM ends below the 32-bit reserved area, the area starts at the 64-bit RAM start instead.
1425     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
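             // ORing mem_end with (128 MiB - 1) and then adding 1 below yields
             // the first 128 MiB boundary above mem_end.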
1426         let mut start_addr = if allow_mem_hotplug {
1427             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1428         } else {
1429             mem_end
1430         };
1431 
1432         start_addr = start_addr
1433             .checked_add(1)
1434             .ok_or(Error::GuestAddressOverFlow)?;
1435 
1436         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1437             return Ok(arch::layout::RAM_64BIT_START);
1438         }
1439 
1440         Ok(start_addr)
1441     }
1442 
1443     pub fn add_ram_region(
1444         &mut self,
1445         start_addr: GuestAddress,
1446         size: usize,
1447     ) -> Result<Arc<GuestRegionMmap>, Error> {
1448         // Allocate memory for the region
1449         let region = MemoryManager::create_ram_region(
1450             &None,
1451             0,
1452             start_addr,
1453             size,
1454             self.prefault,
1455             self.shared,
1456             self.hugepages,
1457             self.hugepage_size,
1458             None,
1459             None,
1460             self.thp,
1461         )?;
1462 
1463         // Map it into the guest
1464         let slot = self.create_userspace_mapping(
1465             region.start_addr().0,
1466             region.len(),
1467             region.as_ptr() as u64,
1468             self.mergeable,
1469             false,
1470             self.log_dirty,
1471         )?;
1472         self.guest_ram_mappings.push(GuestRamMapping {
1473             gpa: region.start_addr().raw_value(),
1474             size: region.len(),
1475             slot,
1476             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1477             virtio_mem: false,
1478             file_offset: 0,
1479         });
1480 
1481         self.add_region(Arc::clone(&region))?;
1482 
1483         Ok(region)
1484     }
1485 
1486     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1487         info!("Hotplugging new RAM: {}", size);
1488 
1489         // Check that there is a free slot
1490         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1491             return Err(Error::NoSlotAvailable);
1492         }
1493 
1494         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1495         if size % (128 << 20) != 0 {
1496             return Err(Error::InvalidSize);
1497         }
1498 
1499         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1500 
1501         if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area {
1502             return Err(Error::InsufficientHotplugRam);
1503         }
1504 
1505         let region = self.add_ram_region(start_addr, size)?;
1506 
1507         // Add region to the list of regions associated with the default
1508         // memory zone.
1509         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1510             memory_zone.regions.push(Arc::clone(&region));
1511         }
1512 
1513         // Tell the allocator
1514         self.ram_allocator
1515             .allocate(Some(start_addr), size as GuestUsize, None)
1516             .ok_or(Error::MemoryRangeAllocation)?;
1517 
1518         // Update the slot so that it can be queried via the I/O port
1519         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1520         slot.active = true;
1521         slot.inserting = true;
1522         slot.base = region.start_addr().0;
1523         slot.length = region.len();
1524 
1525         self.next_hotplug_slot += 1;
1526 
1527         Ok(region)
1528     }
1529 
1530     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1531         self.guest_memory.clone()
1532     }
1533 
1534     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1535         self.boot_guest_memory.clone()
1536     }
1537 
1538     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1539         self.allocator.clone()
1540     }
1541 
1542     pub fn start_of_device_area(&self) -> GuestAddress {
1543         self.start_of_device_area
1544     }
1545 
1546     pub fn end_of_device_area(&self) -> GuestAddress {
1547         self.end_of_device_area
1548     }
1549 
1550     pub fn allocate_memory_slot(&mut self) -> u32 {
1551         let slot_id = self.next_memory_slot;
1552         self.next_memory_slot += 1;
1553         slot_id
1554     }
1555 
1556     pub fn create_userspace_mapping(
1557         &mut self,
1558         guest_phys_addr: u64,
1559         memory_size: u64,
1560         userspace_addr: u64,
1561         mergeable: bool,
1562         readonly: bool,
1563         log_dirty: bool,
1564     ) -> Result<u32, Error> {
1565         let slot = self.allocate_memory_slot();
1566         let mem_region = self.vm.make_user_memory_region(
1567             slot,
1568             guest_phys_addr,
1569             memory_size,
1570             userspace_addr,
1571             readonly,
1572             log_dirty,
1573         );
1574 
1575         info!(
1576             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1577             guest_phys_addr, userspace_addr, memory_size, slot
1578         );
1579 
1580         self.vm
1581             .create_user_memory_region(mem_region)
1582             .map_err(Error::CreateUserMemoryRegion)?;
1583 
1584         // Mark the pages as mergeable if explicitly asked for.
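             // MADV_MERGEABLE registers the range with KSM (kernel samepage
             // merging) so identical pages can be deduplicated by the host kernel.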
1585         if mergeable {
1586             // SAFETY: the address and size are valid since the
1587             // mmap succeeded.
1588             let ret = unsafe {
1589                 libc::madvise(
1590                     userspace_addr as *mut libc::c_void,
1591                     memory_size as libc::size_t,
1592                     libc::MADV_MERGEABLE,
1593                 )
1594             };
1595             if ret != 0 {
1596                 let err = io::Error::last_os_error();
1597                 // Safe to unwrap because the error is constructed with
1598                 // last_os_error(), which ensures the output will be Some().
1599                 let errno = err.raw_os_error().unwrap();
1600                 if errno == libc::EINVAL {
1601                     warn!("kernel not configured with CONFIG_KSM");
1602                 } else {
1603                     warn!("madvise error: {}", err);
1604                 }
1605                 warn!("failed to mark pages as mergeable");
1606             }
1607         }
1608 
1609         info!(
1610             "Created userspace mapping: {:x} -> {:x} {:x}",
1611             guest_phys_addr, userspace_addr, memory_size
1612         );
1613 
1614         Ok(slot)
1615     }
1616 
1617     pub fn remove_userspace_mapping(
1618         &mut self,
1619         guest_phys_addr: u64,
1620         memory_size: u64,
1621         userspace_addr: u64,
1622         mergeable: bool,
1623         slot: u32,
1624     ) -> Result<(), Error> {
1625         let mem_region = self.vm.make_user_memory_region(
1626             slot,
1627             guest_phys_addr,
1628             memory_size,
1629             userspace_addr,
1630             false, /* readonly -- don't care */
1631             false, /* log dirty */
1632         );
1633 
1634         self.vm
1635             .remove_user_memory_region(mem_region)
1636             .map_err(Error::RemoveUserMemoryRegion)?;
1637 
1638         // Mark the pages as unmergeable if they were previously marked as
1639         // mergeable.
1640         if mergeable {
1641             // SAFETY: the address and size are valid as the region was
1642             // previously advised.
1643             let ret = unsafe {
1644                 libc::madvise(
1645                     userspace_addr as *mut libc::c_void,
1646                     memory_size as libc::size_t,
1647                     libc::MADV_UNMERGEABLE,
1648                 )
1649             };
1650             if ret != 0 {
1651                 let err = io::Error::last_os_error();
1652                 // Safe to unwrap because the error is constructed with
1653                 // last_os_error(), which ensures the output will be Some().
1654                 let errno = err.raw_os_error().unwrap();
1655                 if errno == libc::EINVAL {
1656                     warn!("kernel not configured with CONFIG_KSM");
1657                 } else {
1658                     warn!("madvise error: {}", err);
1659                 }
1660                 warn!("failed to mark pages as unmergeable");
1661             }
1662         }
1663 
1664         info!(
1665             "Removed userspace mapping: {:x} -> {:x} {:x}",
1666             guest_phys_addr, userspace_addr, memory_size
1667         );
1668 
1669         Ok(())
1670     }
1671 
1672     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1673         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1674             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1675                 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1676                     virtio_mem_device
1677                         .lock()
1678                         .unwrap()
1679                         .resize(size)
1680                         .map_err(Error::VirtioMemResizeFail)?;
1681                 }
1682 
1683                 // Keep the hotplugged_size up to date.
1684                 virtio_mem_zone.hotplugged_size = size;
1685             } else {
1686                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1687                 return Err(Error::MissingVirtioMemHandler);
1688             }
1689 
1690             return Ok(());
1691         }
1692 
1693         error!("Failed resizing virtio-mem region: Unknown memory zone");
1694         Err(Error::UnknownMemoryZone)
1695     }
1696 
1697     /// In case this function resulted in adding a new memory region to the
1698     /// guest memory, the new region is returned to the caller. The virtio-mem
1699     /// use case never adds a new region as the whole hotpluggable memory has
1700     /// already been allocated at boot time.
1701     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1702         if self.user_provided_zones {
1703             error!(
1704                 "Not allowed to resize guest memory when backed with user \
1705                 defined memory zones."
1706             );
1707             return Err(Error::InvalidResizeWithMemoryZones);
1708         }
1709 
1710         let mut region: Option<Arc<GuestRegionMmap>> = None;
1711         match self.hotplug_method {
1712             HotplugMethod::VirtioMem => {
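                     // The virtio-mem device already covers the whole hotpluggable
                     // area, so resizing only changes how much of it is plugged:
                     // the delta between the desired RAM and the boot RAM.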
1713                 if desired_ram >= self.boot_ram {
1714                     if !self.dynamic {
1715                         return Ok(region);
1716                     }
1717 
1718                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1719                     self.current_ram = desired_ram;
1720                 }
1721             }
1722             HotplugMethod::Acpi => {
1723                 if desired_ram > self.current_ram {
1724                     if !self.dynamic {
1725                         return Ok(region);
1726                     }
1727 
1728                     region =
1729                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1730                     self.current_ram = desired_ram;
1731                 }
1732             }
1733         }
1734         Ok(region)
1735     }
1736 
1737     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1738         if !self.user_provided_zones {
1739             error!(
1740                 "Not allowed to resize guest memory zone when no zone is \
1741                 defined."
1742             );
1743             return Err(Error::ResizeZone);
1744         }
1745 
1746         self.virtio_mem_resize(id, virtio_mem_size)
1747     }
1748 
1749     #[cfg(target_arch = "x86_64")]
1750     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1751         let file = OpenOptions::new()
1752             .read(true)
1753             .open("/dev/sgx_provision")
1754             .map_err(Error::SgxProvisionOpen)?;
1755         self.vm
1756             .enable_sgx_attribute(file)
1757             .map_err(Error::SgxEnableProvisioning)?;
1758 
1759         // Go over each EPC section and verify its size is a 4k multiple. At
1760         // the same time, calculate the total size needed for the contiguous
1761         // EPC region.
1762         let mut epc_region_size = 0;
1763         for epc_section in sgx_epc_config.iter() {
1764             if epc_section.size == 0 {
1765                 return Err(Error::EpcSectionSizeInvalid);
1766             }
1767             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1768                 return Err(Error::EpcSectionSizeInvalid);
1769             }
1770 
1771             epc_region_size += epc_section.size;
1772         }
1773 
1774         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1775         let epc_region_start = GuestAddress(
1776             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1777         );
1778 
1779         self.start_of_device_area = epc_region_start
1780             .checked_add(epc_region_size)
1781             .ok_or(Error::GuestAddressOverFlow)?;
1782 
1783         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1784         info!(
1785             "SGX EPC region: 0x{:x} (0x{:x})",
1786             epc_region_start.0, epc_region_size
1787         );
1788 
1789         // Each section can be memory mapped into the allocated region.
1790         let mut epc_section_start = epc_region_start.raw_value();
1791         for epc_section in sgx_epc_config.iter() {
1792             let file = OpenOptions::new()
1793                 .read(true)
1794                 .write(true)
1795                 .open("/dev/sgx_vepc")
1796                 .map_err(Error::SgxVirtEpcOpen)?;
1797 
1798             let prot = PROT_READ | PROT_WRITE;
1799             let mut flags = MAP_NORESERVE | MAP_SHARED;
1800             if epc_section.prefault {
1801                 flags |= MAP_POPULATE;
1802             }
1803 
1804             // We can't use the vm-memory crate to perform the memory mapping
1805             // here as it would try to ensure the size of the backing file
1806             // matches the size of the expected mapping. The /dev/sgx_vepc
1807             // device does not work that way: it provides a file descriptor
1808             // which does not match the mapping size, as it's just a way to
1809             // let KVM know that an EPC section is being created for the guest.
1810             // SAFETY: FFI call with correct arguments
1811             let host_addr = unsafe {
1812                 libc::mmap(
1813                     std::ptr::null_mut(),
1814                     epc_section.size as usize,
1815                     prot,
1816                     flags,
1817                     file.as_raw_fd(),
1818                     0,
1819                 )
1820             } as u64;
1821 
1822             info!(
1823                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
1824                 epc_section_start, epc_section.size
1825             );
1826 
1827             let _mem_slot = self.create_userspace_mapping(
1828                 epc_section_start,
1829                 epc_section.size,
1830                 host_addr,
1831                 false,
1832                 false,
1833                 false,
1834             )?;
1835 
1836             sgx_epc_region.insert(
1837                 epc_section.id.clone(),
1838                 SgxEpcSection::new(
1839                     GuestAddress(epc_section_start),
1840                     epc_section.size as GuestUsize,
1841                 ),
1842             );
1843 
1844             epc_section_start += epc_section.size;
1845         }
1846 
1847         self.sgx_epc_region = Some(sgx_epc_region);
1848 
1849         Ok(())
1850     }
1851 
1852     #[cfg(target_arch = "x86_64")]
1853     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
1854         &self.sgx_epc_region
1855     }
1856 
1857     pub fn is_hardlink(f: &File) -> bool {
1858         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
1859         // SAFETY: FFI call with correct arguments
1860         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
1861         if ret != 0 {
1862             error!("Couldn't fstat the backing file");
1863             return false;
1864         }
1865 
1866         // SAFETY: stat is valid
1867         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
1868     }
1869 
1870     pub fn memory_zones(&self) -> &MemoryZones {
1871         &self.memory_zones
1872     }
1873 
1874     pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
1875         &mut self.memory_zones
1876     }
1877 
1878     pub fn memory_range_table(
1879         &self,
1880         snapshot: bool,
1881     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
1882         let mut table = MemoryRangeTable::default();
1883 
1884         for memory_zone in self.memory_zones.values() {
1885             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
1886                 table.extend(virtio_mem_zone.plugged_ranges());
1887             }
1888 
1889             for region in memory_zone.regions() {
1890                 if snapshot {
1891                     if let Some(file_offset) = region.file_offset() {
1892                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
1893                             && Self::is_hardlink(file_offset.file())
1894                         {
1895                             // In this very specific case, we know the memory
1896                             // region is backed by a file on the host filesystem
1897                             // that can be accessed by the user, and additionally
1898                             // the mapping is shared, which means that modifications
1899                             // to the content are written to the actual file.
1900                             // When meeting these conditions, we can skip the
1901                             // copy of the memory content for this specific region,
1902                             // as we can assume the user will have it saved through
1903                             // the backing file already.
1904                             continue;
1905                         }
1906                     }
1907                 }
1908 
1909                 table.push(MemoryRange {
1910                     gpa: region.start_addr().raw_value(),
1911                     length: region.len(),
1912                 });
1913             }
1914         }
1915 
1916         Ok(table)
1917     }
1918 
1919     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
1920         MemoryManagerSnapshotData {
1921             memory_ranges: self.snapshot_memory_ranges.clone(),
1922             guest_ram_mappings: self.guest_ram_mappings.clone(),
1923             start_of_device_area: self.start_of_device_area.0,
1924             boot_ram: self.boot_ram,
1925             current_ram: self.current_ram,
1926             arch_mem_regions: self.arch_mem_regions.clone(),
1927             hotplug_slots: self.hotplug_slots.clone(),
1928             next_memory_slot: self.next_memory_slot,
1929             selected_slot: self.selected_slot,
1930             next_hotplug_slot: self.next_hotplug_slot,
1931         }
1932     }
1933 
1934     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
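             // The unwraps below assume every guest RAM mapping is backed by a
             // file; an anonymous (non file-backed) region would make this panic.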
1935         let mut memory_slot_fds = HashMap::new();
1936         for guest_ram_mapping in &self.guest_ram_mappings {
1937             let slot = guest_ram_mapping.slot;
1938             let guest_memory = self.guest_memory.memory();
1939             let file = guest_memory
1940                 .find_region(GuestAddress(guest_ram_mapping.gpa))
1941                 .unwrap()
1942                 .file_offset()
1943                 .unwrap()
1944                 .file();
1945             memory_slot_fds.insert(slot, file.as_raw_fd());
1946         }
1947         memory_slot_fds
1948     }
1949 
1950     pub fn acpi_address(&self) -> Option<GuestAddress> {
1951         self.acpi_address
1952     }
1953 
1954     pub fn num_guest_ram_mappings(&self) -> u32 {
1955         self.guest_ram_mappings.len() as u32
1956     }
1957 
1958     #[cfg(target_arch = "aarch64")]
1959     pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1960         self.uefi_flash.as_ref().unwrap().clone()
1961     }
1962 
1963     #[cfg(feature = "guest_debug")]
1964     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
1965         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
1966         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
1967 
1968         let mut mem_offset_in_elf = mem_offset;
1969         let mut ram_maps = BTreeMap::new();
1970         for mapping in mapping_sorted_by_gpa.iter() {
1971             ram_maps.insert(
1972                 mapping.gpa,
1973                 CoredumpMemoryRegion {
1974                     mem_offset_in_elf,
1975                     mem_size: mapping.size,
1976                 },
1977             );
1978             mem_offset_in_elf += mapping.size;
1979         }
1980 
1981         CoredumpMemoryRegions { ram_maps }
1982     }
1983 
1984     #[cfg(feature = "guest_debug")]
1985     pub fn coredump_iterate_save_mem(
1986         &mut self,
1987         dump_state: &DumpState,
1988     ) -> std::result::Result<(), GuestDebuggableError> {
1989         let snapshot_memory_ranges = self
1990             .memory_range_table(false)
1991             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
1992 
1993         if snapshot_memory_ranges.is_empty() {
1994             return Ok(());
1995         }
1996 
1997         let mut coredump_file = dump_state.file.as_ref().unwrap();
1998 
1999         let guest_memory = self.guest_memory.memory();
2000         let mut total_bytes: u64 = 0;
2001 
2002         for range in snapshot_memory_ranges.regions() {
2003             let mut offset: u64 = 0;
2004             loop {
2005                 let bytes_written = guest_memory
2006                     .write_to(
2007                         GuestAddress(range.gpa + offset),
2008                         &mut coredump_file,
2009                         (range.length - offset) as usize,
2010                     )
2011                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2012                 offset += bytes_written as u64;
2013                 total_bytes += bytes_written as u64;
2014 
2015                 if offset == range.length {
2016                     break;
2017                 }
2018             }
2019         }
2020 
2021         debug!("coredump total bytes {}", total_bytes);
2022         Ok(())
2023     }
2024 
2025     pub fn receive_memory_regions<F>(
2026         &mut self,
2027         ranges: &MemoryRangeTable,
2028         fd: &mut F,
2029     ) -> std::result::Result<(), MigratableError>
2030     where
2031         F: Read,
2032     {
2033         let guest_memory = self.guest_memory();
2034         let mem = guest_memory.memory();
2035 
2036         for range in ranges.regions() {
2037             let mut offset: u64 = 0;
2038             // Here we are manually handling the retry in case we can't read
2039             // the whole region at once, because we can't use the
2040             // read_exact_from() implementation from vm-memory::GuestMemory as
2041             // it does not follow the correct behavior. For more info about
2042             // this issue see: https://github.com/rust-vmm/vm-memory/issues/174
2043             loop {
2044                 let bytes_read = mem
2045                     .read_from(
2046                         GuestAddress(range.gpa + offset),
2047                         fd,
2048                         (range.length - offset) as usize,
2049                     )
2050                     .map_err(|e| {
2051                         MigratableError::MigrateReceive(anyhow!(
2052                             "Error receiving memory from socket: {}",
2053                             e
2054                         ))
2055                     })?;
2056                 offset += bytes_read as u64;
2057 
2058                 if offset == range.length {
2059                     break;
2060                 }
2061             }
2062         }
2063 
2064         Ok(())
2065     }
2066 }
2067 
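     // AML helper emitting "If (Arg0 == slot_id) { Notify(M<slot>, Arg1) }",
     // one instance per slot, used to build the MTFY dispatch method below.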
2068 struct MemoryNotify {
2069     slot_id: usize,
2070 }
2071 
2072 impl Aml for MemoryNotify {
2073     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2074         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2075         aml::If::new(
2076             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2077             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2078         )
2079         .append_aml_bytes(bytes)
2080     }
2081 }
2082 
2083 struct MemorySlot {
2084     slot_id: usize,
2085 }
2086 
2087 impl Aml for MemorySlot {
2088     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2089         aml::Device::new(
2090             format!("M{:03}", self.slot_id).as_str().into(),
2091             vec![
2092                 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0C80")),
2093                 &aml::Name::new("_UID".into(), &self.slot_id),
2094                 /*
2095                 _STA return value:
2096                 Bit [0] – Set if the device is present.
2097                 Bit [1] – Set if the device is enabled and decoding its resources.
2098                 Bit [2] – Set if the device should be shown in the UI.
2099                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2100                 Bit [4] – Set if the battery is present.
2101                 Bits [31:5] – Reserved (must be cleared).
2102                 */
2103                 &aml::Method::new(
2104                     "_STA".into(),
2105                     0,
2106                     false,
2107                     // Call into MSTA method which will interrogate device
2108                     vec![&aml::Return::new(&aml::MethodCall::new(
2109                         "MSTA".into(),
2110                         vec![&self.slot_id],
2111                     ))],
2112                 ),
2113                 // Get details of memory
2114                 &aml::Method::new(
2115                     "_CRS".into(),
2116                     0,
2117                     false,
2118                     // Call into MCRS which provides actual memory details
2119                     vec![&aml::Return::new(&aml::MethodCall::new(
2120                         "MCRS".into(),
2121                         vec![&self.slot_id],
2122                     ))],
2123                 ),
2124             ],
2125         )
2126         .append_aml_bytes(bytes)
2127     }
2128 }
2129 
2130 struct MemorySlots {
2131     slots: usize,
2132 }
2133 
2134 impl Aml for MemorySlots {
2135     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2136         for slot_id in 0..self.slots {
2137             MemorySlot { slot_id }.append_aml_bytes(bytes);
2138         }
2139     }
2140 }
2141 
2142 struct MemoryMethods {
2143     slots: usize,
2144 }
2145 
2146 impl Aml for MemoryMethods {
2147     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2148         // Add "MTFY" notification method
2149         let mut memory_notifies = Vec::new();
2150         for slot_id in 0..self.slots {
2151             memory_notifies.push(MemoryNotify { slot_id });
2152         }
2153 
2154         let mut memory_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
2155         for memory_notifier in memory_notifies.iter() {
2156             memory_notifies_refs.push(memory_notifier);
2157         }
2158 
2159         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).append_aml_bytes(bytes);
2160 
2161         // MSCN method
2162         aml::Method::new(
2163             "MSCN".into(),
2164             0,
2165             true,
2166             vec![
2167                 // Take lock defined above
2168                 &aml::Acquire::new("MLCK".into(), 0xffff),
2169                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2170                 &aml::While::new(
2171                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2172                     vec![
2173                         // Write the slot number currently in Local(0) (the loop counter) to the I/O port via the MSEL field
2174                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2175                         // Check if MINS bit is set (inserting)
2176                         &aml::If::new(
2177                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2178                             // Notify device if it is
2179                             vec![
2180                                 &aml::MethodCall::new(
2181                                     "MTFY".into(),
2182                                     vec![&aml::Local(0), &aml::ONE],
2183                                 ),
2184                                 // Reset MINS bit
2185                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2186                             ],
2187                         ),
2188                         // Check if MRMV bit is set
2189                         &aml::If::new(
2190                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2191                             // Notify device if it is (with the eject constant 0x3)
2192                             vec![
2193                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2194                                 // Reset MRMV bit
2195                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2196                             ],
2197                         ),
2198                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2199                     ],
2200                 ),
2201                 // Release lock
2202                 &aml::Release::new("MLCK".into()),
2203             ],
2204         )
2205         .append_aml_bytes(bytes);
2206 
2207         // Memory status method
2208         aml::Method::new(
2209             "MSTA".into(),
2210             1,
2211             true,
2212             vec![
2213                 // Take lock defined above
2214                 &aml::Acquire::new("MLCK".into(), 0xffff),
2215                 // Write slot number (in first argument) to I/O port via field
2216                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2217                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2218                 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2219                 &aml::If::new(
2220                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2221                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2222                 ),
2223                 // Release lock
2224                 &aml::Release::new("MLCK".into()),
2225                 // Return 0 or 0xf
2226                 &aml::Return::new(&aml::Local(0)),
2227             ],
2228         )
2229         .append_aml_bytes(bytes);
2230 
2231         // Memory range method
2232         aml::Method::new(
2233             "MCRS".into(),
2234             1,
2235             true,
2236             vec![
2237                 // Take lock defined above
2238                 &aml::Acquire::new("MLCK".into(), 0xffff),
2239                 // Write slot number (in first argument) to I/O port via field
2240                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2241                 &aml::Name::new(
2242                     "MR64".into(),
2243                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2244                         aml::AddressSpaceCachable::Cacheable,
2245                         true,
2246                         0x0000_0000_0000_0000u64,
2247                         0xFFFF_FFFF_FFFF_FFFEu64,
2248                     )]),
2249                 ),
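                     // The offsets below index into the QWORD address space
                     // descriptor inside MR64: bytes 14/18 hold the low/high
                     // dwords of the range minimum, 22/26 the range maximum
                     // and 38/42 the length.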
2250                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &14usize, "MINL".into()),
2251                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &18usize, "MINH".into()),
2252                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &22usize, "MAXL".into()),
2253                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &26usize, "MAXH".into()),
2254                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &38usize, "LENL".into()),
2255                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &42usize, "LENH".into()),
2256                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2257                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2258                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2259                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
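                     // Compute the inclusive end address (MAX = MIN + LEN - 1)
                     // in 32-bit halves, carrying into the high dword when the
                     // low dword addition wraps around.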
2260                 &aml::Add::new(
2261                     &aml::Path::new("MAXL"),
2262                     &aml::Path::new("MINL"),
2263                     &aml::Path::new("LENL"),
2264                 ),
2265                 &aml::Add::new(
2266                     &aml::Path::new("MAXH"),
2267                     &aml::Path::new("MINH"),
2268                     &aml::Path::new("LENH"),
2269                 ),
2270                 &aml::If::new(
2271                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2272                     vec![&aml::Add::new(
2273                         &aml::Path::new("MAXH"),
2274                         &aml::ONE,
2275                         &aml::Path::new("MAXH"),
2276                     )],
2277                 ),
2278                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2279                 // Release lock
2280                 &aml::Release::new("MLCK".into()),
2281                 &aml::Return::new(&aml::Path::new("MR64")),
2282             ],
2283         )
2284         .append_aml_bytes(bytes)
2285     }
2286 }
2287 
2288 impl Aml for MemoryManager {
2289     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2290         if let Some(acpi_address) = self.acpi_address {
2291             // Memory Hotplug Controller
2292             aml::Device::new(
2293                 "_SB_.MHPC".into(),
2294                 vec![
2295                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
2296                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2297                     // Mutex to protect concurrent access, as we write to choose the slot and then read back its status
2298                     &aml::Mutex::new("MLCK".into(), 0),
2299                     &aml::Name::new(
2300                         "_CRS".into(),
2301                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2302                             aml::AddressSpaceCachable::NotCacheable,
2303                             true,
2304                             acpi_address.0,
2305                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2306                         )]),
2307                     ),
2308                     // OpRegion and Fields map MMIO range into individual field values
2309                     &aml::OpRegion::new(
2310                         "MHPR".into(),
2311                         aml::OpRegionSpace::SystemMemory,
2312                         acpi_address.0 as usize,
2313                         MEMORY_MANAGER_ACPI_SIZE,
2314                     ),
2315                     &aml::Field::new(
2316                         "MHPR".into(),
2317                         aml::FieldAccessType::DWord,
2318                         aml::FieldUpdateRule::Preserve,
2319                         vec![
2320                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2321                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2322                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2323                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2324                         ],
2325                     ),
2326                     &aml::Field::new(
2327                         "MHPR".into(),
2328                         aml::FieldAccessType::DWord,
2329                         aml::FieldUpdateRule::Preserve,
2330                         vec![
2331                             aml::FieldEntry::Reserved(128),
2332                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2333                         ],
2334                     ),
2335                     &aml::Field::new(
2336                         "MHPR".into(),
2337                         aml::FieldAccessType::Byte,
2338                         aml::FieldUpdateRule::WriteAsZeroes,
2339                         vec![
2340                             aml::FieldEntry::Reserved(160),
2341                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2342                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2343                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2344                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2345                         ],
2346                     ),
2347                     &aml::Field::new(
2348                         "MHPR".into(),
2349                         aml::FieldAccessType::DWord,
2350                         aml::FieldUpdateRule::Preserve,
2351                         vec![
2352                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2353                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2354                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2355                         ],
2356                     ),
2357                     &MemoryMethods {
2358                         slots: self.hotplug_slots.len(),
2359                     },
2360                     &MemorySlots {
2361                         slots: self.hotplug_slots.len(),
2362                     },
2363                 ],
2364             )
2365             .append_aml_bytes(bytes);
2366         } else {
2367             aml::Device::new(
2368                 "_SB_.MHPC".into(),
2369                 vec![
2370                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
2371                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2372                     // Empty MSCN for GED
2373                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2374                 ],
2375             )
2376             .append_aml_bytes(bytes);
2377         }
2378 
2379         #[cfg(target_arch = "x86_64")]
2380         {
2381             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2382                 let min = sgx_epc_region.start().raw_value();
2383                 let max = min + sgx_epc_region.size() - 1;
2384                 // SGX EPC region
2385                 aml::Device::new(
2386                     "_SB_.EPC_".into(),
2387                     vec![
2388                         &aml::Name::new("_HID".into(), &aml::EisaName::new("INT0E0C")),
2389                         // QWORD describing the EPC region start and size
2390                         &aml::Name::new(
2391                             "_CRS".into(),
2392                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2393                                 aml::AddressSpaceCachable::NotCacheable,
2394                                 true,
2395                                 min,
2396                                 max,
2397                             )]),
2398                         ),
2399                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2400                     ],
2401                 )
2402                 .append_aml_bytes(bytes);
2403             }
2404         }
2405     }
2406 }
2407 
2408 impl Pausable for MemoryManager {}
2409 
2410 #[derive(Clone, Serialize, Deserialize, Versionize)]
2411 pub struct MemoryManagerSnapshotData {
2412     memory_ranges: MemoryRangeTable,
2413     guest_ram_mappings: Vec<GuestRamMapping>,
2414     start_of_device_area: u64,
2415     boot_ram: u64,
2416     current_ram: u64,
2417     arch_mem_regions: Vec<ArchMemRegion>,
2418     hotplug_slots: Vec<HotPlugState>,
2419     next_memory_slot: u32,
2420     selected_slot: usize,
2421     next_hotplug_slot: usize,
2422 }
2423 
2424 impl VersionMapped for MemoryManagerSnapshotData {}
2425 
2426 impl Snapshottable for MemoryManager {
2427     fn id(&self) -> String {
2428         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2429     }
2430 
2431     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2432         let mut memory_manager_snapshot = Snapshot::new(MEMORY_MANAGER_SNAPSHOT_ID);
2433 
2434         let memory_ranges = self.memory_range_table(true)?;
2435 
2436         // Store locally this list of ranges as it will be used through the
2437         // Transportable::send() implementation. The point is to avoid the
2438         // duplication of code regarding the creation of the path for each
2439         // region. The 'snapshot' step creates the list of memory regions,
2440         // including information about the need to copy a memory region or
2441         // not. This saves the 'send' step having to go through the same
2442         // process, and instead it can directly proceed with storing the
2443         // memory range content for the ranges requiring it.
2444         self.snapshot_memory_ranges = memory_ranges;
2445 
2446         memory_manager_snapshot.add_data_section(SnapshotDataSection::new_from_versioned_state(
2447             MEMORY_MANAGER_SNAPSHOT_ID,
2448             &self.snapshot_data(),
2449         )?);
2450 
2451         Ok(memory_manager_snapshot)
2452     }
2453 }
2454 
2455 impl Transportable for MemoryManager {
2456     fn send(
2457         &self,
2458         _snapshot: &Snapshot,
2459         destination_url: &str,
2460     ) -> result::Result<(), MigratableError> {
2461         if self.snapshot_memory_ranges.is_empty() {
2462             return Ok(());
2463         }
2464 
2465         let mut memory_file_path = url_to_path(destination_url)?;
2466         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2467 
2468         // Create the snapshot file for the entire memory
2469         let mut memory_file = OpenOptions::new()
2470             .read(true)
2471             .write(true)
2472             .create_new(true)
2473             .open(memory_file_path)
2474             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2475 
2476         let guest_memory = self.guest_memory.memory();
2477 
2478         for range in self.snapshot_memory_ranges.regions() {
2479             let mut offset: u64 = 0;
2480             // Here we are manually handling the retry in case we can't write
2481             // the whole region at once, because we can't use the
2482             // write_all_to() implementation from vm-memory::GuestMemory as
2483             // it does not follow the correct behavior. For more info about
2484             // this issue see: https://github.com/rust-vmm/vm-memory/issues/174
2485             loop {
2486                 let bytes_written = guest_memory
2487                     .write_to(
2488                         GuestAddress(range.gpa + offset),
2489                         &mut memory_file,
2490                         (range.length - offset) as usize,
2491                     )
2492                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2493                 offset += bytes_written as u64;
2494 
2495                 if offset == range.length {
2496                     break;
2497                 }
2498             }
2499         }
2500         Ok(())
2501     }
2502 }
2503 
2504 impl Migratable for MemoryManager {
2505     // Start the dirty log in the hypervisor (kvm/mshv).
2506     // Also, reset the dirty bitmap logged by the vmm.
2507     // Just before we do a bulk copy we want to start/clear the dirty log so that
2508     // pages touched during our bulk copy are tracked.
2509     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2510         self.vm.start_dirty_log().map_err(|e| {
2511             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2512         })?;
2513 
2514         for r in self.guest_memory.memory().iter() {
2515             r.bitmap().reset();
2516         }
2517 
2518         Ok(())
2519     }
2520 
2521     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2522         self.vm.stop_dirty_log().map_err(|e| {
2523             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2524         })?;
2525 
2526         Ok(())
2527     }
2528 
2529     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2530     // together in the table if they are contiguous.
2531     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2532         let mut table = MemoryRangeTable::default();
2533         for r in &self.guest_ram_mappings {
2534             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2535                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2536             })?;
2537             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2538             {
2539                 Some(region) => {
2540                     assert!(region.start_addr().raw_value() == r.gpa);
2541                     assert!(region.len() == r.size);
2542                     region.bitmap().get_and_reset()
2543                 }
2544                 None => {
2545                     return Err(MigratableError::MigrateSend(anyhow!(
2546                         "Error finding 'guest memory region' with address {:x}",
2547                         r.gpa
2548                     )))
2549                 }
2550             };
2551 
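                 // A page is considered dirty if either the hypervisor's log or
                 // the VMM's own write-access bitmap flagged it.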
2552             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2553                 .iter()
2554                 .zip(vmm_dirty_bitmap.iter())
2555                 .map(|(x, y)| x | y)
2556                 .collect();
2557 
2558             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2559 
2560             if sub_table.regions().is_empty() {
2561                 info!("Dirty Memory Range Table is empty");
2562             } else {
2563                 info!("Dirty Memory Range Table:");
2564                 for range in sub_table.regions() {
2565                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2566                 }
2567             }
2568 
2569             table.extend(sub_table);
2570         }
2571         Ok(table)
2572     }
2573 }
2574