xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision 5e52729453cb62edbe4fb3a4aa24f8cca31e667e)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 #[cfg(target_arch = "x86_64")]
6 use crate::config::SgxEpcConfig;
7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
8 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
9 use crate::coredump::{
10     CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
11 };
12 use crate::migration::url_to_path;
13 use crate::MEMORY_MANAGER_SNAPSHOT_ID;
14 use crate::{GuestMemoryMmap, GuestRegionMmap};
15 use acpi_tables::{aml, aml::Aml};
16 use anyhow::anyhow;
17 #[cfg(target_arch = "x86_64")]
18 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
19 use arch::{layout, RegionType};
20 #[cfg(target_arch = "x86_64")]
21 use devices::ioapic;
22 #[cfg(target_arch = "aarch64")]
23 use hypervisor::HypervisorVmError;
24 #[cfg(target_arch = "x86_64")]
25 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
26 use serde::{Deserialize, Serialize};
27 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
28 use std::collections::BTreeMap;
29 use std::collections::HashMap;
30 use std::convert::TryInto;
31 use std::ffi;
32 use std::fs::{File, OpenOptions};
33 use std::io::{self, Read};
34 use std::ops::Deref;
35 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
36 use std::path::PathBuf;
37 use std::result;
38 use std::sync::{Arc, Barrier, Mutex};
39 use tracer::trace_scoped;
40 use versionize::{VersionMap, Versionize, VersionizeResult};
41 use versionize_derive::Versionize;
42 use virtio_devices::BlocksState;
43 #[cfg(target_arch = "x86_64")]
44 use vm_allocator::GsiApic;
45 use vm_allocator::{AddressAllocator, SystemAllocator};
46 use vm_device::BusDevice;
47 use vm_memory::bitmap::AtomicBitmap;
48 use vm_memory::guest_memory::FileOffset;
49 use vm_memory::{
50     mmap::MmapRegionError, Address, Bytes, Error as MmapError, GuestAddress, GuestAddressSpace,
51     GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
52 };
53 use vm_migration::{
54     protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
55     Snapshot, SnapshotData, Snapshottable, Transportable, VersionMapped,
56 };
57 
58 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
59 
60 const DEFAULT_MEMORY_ZONE: &str = "mem0";
61 
62 const SNAPSHOT_FILENAME: &str = "memory-ranges";
63 
64 #[cfg(target_arch = "x86_64")]
65 const X86_64_IRQ_BASE: u32 = 5;
66 
67 #[cfg(target_arch = "x86_64")]
68 const SGX_PAGE_SIZE: u64 = 1 << 12;
69 
70 const HOTPLUG_COUNT: usize = 8;
71 
72 // Memory policy constants
73 const MPOL_BIND: u32 = 2;
74 const MPOL_MF_STRICT: u32 = 1;
75 const MPOL_MF_MOVE: u32 = 1 << 1;
76 
77 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
78 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
79 
80 #[derive(Clone, Default, Serialize, Deserialize, Versionize)]
81 struct HotPlugState {
82     base: u64,
83     length: u64,
84     active: bool,
85     inserting: bool,
86     removing: bool,
87 }
88 
89 pub struct VirtioMemZone {
90     region: Arc<GuestRegionMmap>,
91     virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
92     hotplugged_size: u64,
93     hugepages: bool,
94     blocks_state: Arc<Mutex<BlocksState>>,
95 }
96 
97 impl VirtioMemZone {
98     pub fn region(&self) -> &Arc<GuestRegionMmap> {
99         &self.region
100     }
101     pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
102         self.virtio_device = Some(virtio_device);
103     }
104     pub fn hotplugged_size(&self) -> u64 {
105         self.hotplugged_size
106     }
107     pub fn hugepages(&self) -> bool {
108         self.hugepages
109     }
110     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
111         &self.blocks_state
112     }
113     pub fn plugged_ranges(&self) -> MemoryRangeTable {
114         self.blocks_state
115             .lock()
116             .unwrap()
117             .memory_ranges(self.region.start_addr().raw_value(), true)
118     }
119 }
120 
121 #[derive(Default)]
122 pub struct MemoryZone {
123     regions: Vec<Arc<GuestRegionMmap>>,
124     virtio_mem_zone: Option<VirtioMemZone>,
125 }
126 
127 impl MemoryZone {
128     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
129         &self.regions
130     }
131     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
132         &self.virtio_mem_zone
133     }
134     pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
135         self.virtio_mem_zone.as_mut()
136     }
137 }
138 
139 pub type MemoryZones = HashMap<String, MemoryZone>;
140 
141 #[derive(Clone, Serialize, Deserialize, Versionize)]
142 struct GuestRamMapping {
143     slot: u32,
144     gpa: u64,
145     size: u64,
146     zone_id: String,
147     virtio_mem: bool,
148     file_offset: u64,
149 }
150 
151 #[derive(Clone, Serialize, Deserialize, Versionize)]
152 struct ArchMemRegion {
153     base: u64,
154     size: usize,
155     r_type: RegionType,
156 }
157 
158 pub struct MemoryManager {
159     boot_guest_memory: GuestMemoryMmap,
160     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
161     next_memory_slot: u32,
162     start_of_device_area: GuestAddress,
163     end_of_device_area: GuestAddress,
164     end_of_ram_area: GuestAddress,
165     pub vm: Arc<dyn hypervisor::Vm>,
166     hotplug_slots: Vec<HotPlugState>,
167     selected_slot: usize,
168     mergeable: bool,
169     allocator: Arc<Mutex<SystemAllocator>>,
170     hotplug_method: HotplugMethod,
171     boot_ram: u64,
172     current_ram: u64,
173     next_hotplug_slot: usize,
174     shared: bool,
175     hugepages: bool,
176     hugepage_size: Option<u64>,
177     prefault: bool,
178     thp: bool,
179     #[cfg(target_arch = "x86_64")]
180     sgx_epc_region: Option<SgxEpcRegion>,
181     user_provided_zones: bool,
182     snapshot_memory_ranges: MemoryRangeTable,
183     memory_zones: MemoryZones,
184     log_dirty: bool, // Enable dirty logging for created RAM regions
185     arch_mem_regions: Vec<ArchMemRegion>,
186     ram_allocator: AddressAllocator,
187     dynamic: bool,
188 
189     // Keep track of calls to create_userspace_mapping() for guest RAM.
190     // This is useful for getting the dirty pages as we need to know the
191     // slots that the mapping is created in.
192     guest_ram_mappings: Vec<GuestRamMapping>,
193 
194     pub acpi_address: Option<GuestAddress>,
195     #[cfg(target_arch = "aarch64")]
196     uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
197 }
198 
199 #[derive(Debug)]
200 pub enum Error {
201     /// Failed to create shared file.
202     SharedFileCreate(io::Error),
203 
204     /// Failed to set shared file length.
205     SharedFileSetLen(io::Error),
206 
207     /// Mmap backed guest memory error
208     GuestMemory(MmapError),
209 
210     /// Failed to allocate a memory range.
211     MemoryRangeAllocation,
212 
213     /// Error from region creation
214     GuestMemoryRegion(MmapRegionError),
215 
216     /// No ACPI slot available
217     NoSlotAvailable,
218 
219     /// Not enough space in the hotplug RAM region
220     InsufficientHotplugRam,
221 
222     /// The requested hotplug memory addition is not a valid size
223     InvalidSize,
224 
225     /// Failed to create the user memory region.
226     CreateUserMemoryRegion(hypervisor::HypervisorVmError),
227 
228     /// Failed to remove the user memory region.
229     RemoveUserMemoryRegion(hypervisor::HypervisorVmError),
230 
231     /// Failed to create EventFd.
232     EventFdFail(io::Error),
233 
234     /// Eventfd write error
235     EventfdError(io::Error),
236 
237     /// Failed to resize the virtio-mem device
238     VirtioMemResizeFail(virtio_devices::mem::Error),
239 
240     /// Cannot restore VM
241     Restore(MigratableError),
242 
243     /// Cannot restore VM because source URL is missing
244     RestoreMissingSourceUrl,
245 
246     /// Cannot create the system allocator
247     CreateSystemAllocator,
248 
249     /// Invalid SGX EPC section size
250     #[cfg(target_arch = "x86_64")]
251     EpcSectionSizeInvalid,
252 
253     /// Failed allocating SGX EPC region
254     #[cfg(target_arch = "x86_64")]
255     SgxEpcRangeAllocation,
256 
257     /// Failed opening SGX virtual EPC device
258     #[cfg(target_arch = "x86_64")]
259     SgxVirtEpcOpen(io::Error),
260 
261     /// Failed setting the SGX virtual EPC section size
262     #[cfg(target_arch = "x86_64")]
263     SgxVirtEpcFileSetLen(io::Error),
264 
265     /// Failed opening SGX provisioning device
266     #[cfg(target_arch = "x86_64")]
267     SgxProvisionOpen(io::Error),
268 
269     /// Failed enabling SGX provisioning
270     #[cfg(target_arch = "x86_64")]
271     SgxEnableProvisioning(hypervisor::HypervisorVmError),
272 
273     /// Failed creating a new MmapRegion instance.
274     #[cfg(target_arch = "x86_64")]
275     NewMmapRegion(vm_memory::mmap::MmapRegionError),
276 
277     /// No memory zones found.
278     MissingMemoryZones,
279 
280     /// Memory configuration is not valid.
281     InvalidMemoryParameters,
282 
283     /// Forbidden operation. Impossible to resize guest memory if it is
284     /// backed by user defined memory regions.
285     InvalidResizeWithMemoryZones,
286 
287     /// It's invalid to try applying a NUMA policy to a memory zone that is
288     /// memory mapped with MAP_SHARED.
289     InvalidSharedMemoryZoneWithHostNuma,
290 
291     /// Failed applying NUMA memory policy.
292     ApplyNumaPolicy(io::Error),
293 
294     /// Memory zone identifier is not unique.
295     DuplicateZoneId,
296 
297     /// No virtio-mem resizing handler found.
298     MissingVirtioMemHandler,
299 
300     /// Unknown memory zone.
301     UnknownMemoryZone,
302 
303     /// Invalid size for resizing. Can be anything except 0.
304     /// Invalid size for resizing. The size can be anything except 0.
305 
306     /// Invalid hotplug method associated with memory zones resizing capability.
307     InvalidHotplugMethodWithMemoryZones,
308 
309     /// Could not find specified memory zone identifier from hash map.
310     MissingZoneIdentifier,
311 
312     /// Resizing the memory zone failed.
313     ResizeZone,
314 
315     /// Guest address overflow
316     GuestAddressOverFlow,
317 
318     /// Error opening snapshot file
319     SnapshotOpen(io::Error),
320 
321     /// Error copying snapshot into region
322     SnapshotCopy(GuestMemoryError),
323 
324     /// Failed to allocate MMIO address
325     AllocateMmioAddress,
326 
327     #[cfg(target_arch = "aarch64")]
328     /// Failed to create UEFI flash
329     CreateUefiFlash(HypervisorVmError),
330 }
331 
332 const ENABLE_FLAG: usize = 0;
333 const INSERTING_FLAG: usize = 1;
334 const REMOVING_FLAG: usize = 2;
335 const EJECT_FLAG: usize = 3;
336 
337 const BASE_OFFSET_LOW: u64 = 0;
338 const BASE_OFFSET_HIGH: u64 = 0x4;
339 const LENGTH_OFFSET_LOW: u64 = 0x8;
340 const LENGTH_OFFSET_HIGH: u64 = 0xC;
341 const STATUS_OFFSET: u64 = 0x14;
342 const SELECTION_OFFSET: u64 = 0;
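
// A sketch of the register layout implemented by the BusDevice impl below
// (derived from the constants above; accesses operate on the selected slot):
//   write 0x00      : select the hotplug slot (DIMM) to operate on
//   read  0x00/0x04 : base address of the selected slot (low/high 32 bits)
//   read  0x08/0x0C : length of the selected slot (low/high 32 bits)
//   read  0x14      : status bits (enabled / inserting / removing)
//   write 0x14      : acknowledge insertion/removal, or request ejection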
343 
344 // 64k is subtracted from the MMIO address space size. This is done for the
345 // following reasons:
346 //  - Reduce the addressable space size by at least 4k to work around a Linux
347 //    bug when the VMM allocates devices at the end of the addressable space
348 //  - Windows requires the addressable space size to be 64k aligned
349 fn mmio_address_space_size(phys_bits: u8) -> u64 {
350     (1 << phys_bits) - (1 << 16)
351 }
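
// Worked example (illustrative values, not from the source): with
// phys_bits = 40, the MMIO address space size is
// (1 << 40) - (1 << 16) = 0xFF_FFFF_0000, which is 64k aligned and leaves
// the top 64k of the addressable range unused.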
352 
353 impl BusDevice for MemoryManager {
354     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
355         if self.selected_slot < self.hotplug_slots.len() {
356             let state = &self.hotplug_slots[self.selected_slot];
357             match offset {
358                 BASE_OFFSET_LOW => {
359                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
360                 }
361                 BASE_OFFSET_HIGH => {
362                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
363                 }
364                 LENGTH_OFFSET_LOW => {
365                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
366                 }
367                 LENGTH_OFFSET_HIGH => {
368                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
369                 }
370                 STATUS_OFFSET => {
371                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
372                     data.fill(0);
373                     if state.active {
374                         data[0] |= 1 << ENABLE_FLAG;
375                     }
376                     if state.inserting {
377                         data[0] |= 1 << INSERTING_FLAG;
378                     }
379                     if state.removing {
380                         data[0] |= 1 << REMOVING_FLAG;
381                     }
382                 }
383                 _ => {
384                     warn!(
385                         "Unexpected offset for accessing memory manager device: {:#}",
386                         offset
387                     );
388                 }
389             }
390         } else {
391             warn!("Out of range memory slot: {}", self.selected_slot);
392         }
393     }
394 
395     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
396         match offset {
397             SELECTION_OFFSET => {
398                 self.selected_slot = usize::from(data[0]);
399             }
400             STATUS_OFFSET => {
401                 if self.selected_slot < self.hotplug_slots.len() {
402                     let state = &mut self.hotplug_slots[self.selected_slot];
403                     // The ACPI code writes back a 1 to acknowledge the insertion
404                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
405                         state.inserting = false;
406                     }
407                     // Ditto for removal
408                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
409                         state.removing = false;
410                     }
411                     // Trigger removal of "DIMM"
412                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
413                         warn!("Ejection of memory not currently supported");
414                     }
415                 } else {
416                     warn!("Out of range memory slot: {}", self.selected_slot);
417                 }
418             }
419             _ => {
420                 warn!(
421                     "Unexpected offset for accessing memory manager device: {:#}",
422                     offset
423                 );
424             }
425         };
426         None
427     }
428 }
429 
430 impl MemoryManager {
431     /// Creates all memory regions based on the available RAM ranges defined
432     /// by `ram_regions`, and based on the description of the memory zones.
433     /// In practice, this function can perform multiple memory mappings of the
434     /// same backing file if there's a hole in the address space between two
435     /// RAM ranges.
436     /// One example might be `ram_regions` containing 2 regions (0-3G and 4G-6G)
437     /// and `zones` containing two zones (1G and 4G in size).
438     /// This function will create 3 resulting memory regions:
439     /// - The first one entirely mapping the first memory zone onto the 0-1G range
440     /// - The second one partially mapping the second memory zone onto the 1G-3G range
441     /// - The third one partially mapping the second memory zone onto the 4G-6G range
442     fn create_memory_regions_from_zones(
443         ram_regions: &[(GuestAddress, usize)],
444         zones: &[MemoryZoneConfig],
445         prefault: Option<bool>,
446         thp: bool,
447     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
448         let mut zones = zones.to_owned();
449         let mut mem_regions = Vec::new();
450         let mut zone = zones.remove(0);
451         let mut zone_offset = 0;
452         let mut memory_zones = HashMap::new();
453 
454         // Add zone id to the list of memory zones.
455         memory_zones.insert(zone.id.clone(), MemoryZone::default());
456 
457         for ram_region in ram_regions.iter() {
458             let mut ram_region_offset = 0;
459             let mut exit = false;
460 
461             loop {
462                 let mut ram_region_consumed = false;
463                 let mut pull_next_zone = false;
464 
465                 let ram_region_sub_size = ram_region.1 - ram_region_offset;
466                 let zone_sub_size = zone.size as usize - zone_offset;
467 
468                 let file_offset = zone_offset as u64;
469                 let region_start = ram_region
470                     .0
471                     .checked_add(ram_region_offset as u64)
472                     .ok_or(Error::GuestAddressOverFlow)?;
473                 let region_size = if zone_sub_size <= ram_region_sub_size {
474                     if zone_sub_size == ram_region_sub_size {
475                         ram_region_consumed = true;
476                     }
477 
478                     ram_region_offset += zone_sub_size;
479                     pull_next_zone = true;
480 
481                     zone_sub_size
482                 } else {
483                     zone_offset += ram_region_sub_size;
484                     ram_region_consumed = true;
485 
486                     ram_region_sub_size
487                 };
488 
489                 let region = MemoryManager::create_ram_region(
490                     &zone.file,
491                     file_offset,
492                     region_start,
493                     region_size,
494                     match prefault {
495                         Some(pf) => pf,
496                         None => zone.prefault,
497                     },
498                     zone.shared,
499                     zone.hugepages,
500                     zone.hugepage_size,
501                     zone.host_numa_node,
502                     None,
503                     thp,
504                 )?;
505 
506                 // Add region to the list of regions associated with the
507                 // current memory zone.
508                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
509                     memory_zone.regions.push(region.clone());
510                 }
511 
512                 mem_regions.push(region);
513 
514                 if pull_next_zone {
515                     // Get the next zone and reset the offset.
516                     zone_offset = 0;
517                     if zones.is_empty() {
518                         exit = true;
519                         break;
520                     }
521                     zone = zones.remove(0);
522 
523                     // Check if the zone id already exists. In case it does, return
524                     // an error as we need unique identifiers. Otherwise, add
525                     // the new zone id to the list of memory zones.
526                     if memory_zones.contains_key(&zone.id) {
527                         error!(
528                             "Memory zone identifier '{}' found more than once. \
529                             It must be unique",
530                             zone.id,
531                         );
532                         return Err(Error::DuplicateZoneId);
533                     }
534                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
535                 }
536 
537                 if ram_region_consumed {
538                     break;
539                 }
540             }
541 
542             if exit {
543                 break;
544             }
545         }
546 
547         Ok((mem_regions, memory_zones))
548     }
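
    // A worked sketch of the doc comment above (hypothetical values):
    // two RAM ranges of 3 GiB and 2 GiB combined with zones of 1 GiB and
    // 4 GiB yield three regions.
    //
    //   let ram_regions = [(GuestAddress(0), 3 << 30), (GuestAddress(4 << 30), 2 << 30)];
    //   // zones[0].size == 1 << 30, zones[1].size == 4 << 30
    //   let (regions, zones_map) =
    //       MemoryManager::create_memory_regions_from_zones(&ram_regions, &zones, None, false)?;
    //   // regions[0]: zone 0 fully mapped on [0, 1G), file offset 0
    //   // regions[1]: zone 1 partially mapped on [1G, 3G), file offset 0
    //   // regions[2]: zone 1 remainder mapped on [4G, 6G), file offset 2G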
549 
550     // Restore both the GuestMemory regions and the MemoryZone zones.
551     fn restore_memory_regions_and_zones(
552         guest_ram_mappings: &[GuestRamMapping],
553         zones_config: &[MemoryZoneConfig],
554         prefault: Option<bool>,
555         mut existing_memory_files: HashMap<u32, File>,
556         thp: bool,
557     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
558         let mut memory_regions = Vec::new();
559         let mut memory_zones = HashMap::new();
560 
561         for zone_config in zones_config {
562             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
563         }
564 
565         for guest_ram_mapping in guest_ram_mappings {
566             for zone_config in zones_config {
567                 if guest_ram_mapping.zone_id == zone_config.id {
568                     let region = MemoryManager::create_ram_region(
569                         &zone_config.file,
570                         guest_ram_mapping.file_offset,
571                         GuestAddress(guest_ram_mapping.gpa),
572                         guest_ram_mapping.size as usize,
573                         match prefault {
574                             Some(pf) => pf,
575                             None => zone_config.prefault,
576                         },
577                         zone_config.shared,
578                         zone_config.hugepages,
579                         zone_config.hugepage_size,
580                         zone_config.host_numa_node,
581                         existing_memory_files.remove(&guest_ram_mapping.slot),
582                         thp,
583                     )?;
584                     memory_regions.push(Arc::clone(&region));
585                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
586                         if guest_ram_mapping.virtio_mem {
587                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
588                             let region_size = region.len();
589                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
590                                 region,
591                                 virtio_device: None,
592                                 hotplugged_size,
593                                 hugepages: zone_config.hugepages,
594                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
595                             });
596                         } else {
597                             memory_zone.regions.push(region);
598                         }
599                     }
600                 }
601             }
602         }
603 
604         memory_regions.sort_by_key(|x| x.start_addr());
605 
606         Ok((memory_regions, memory_zones))
607     }
608 
609     fn fill_saved_regions(
610         &mut self,
611         file_path: PathBuf,
612         saved_regions: MemoryRangeTable,
613     ) -> Result<(), Error> {
614         if saved_regions.is_empty() {
615             return Ok(());
616         }
617 
618         // Open (read only) the snapshot file.
619         let mut memory_file = OpenOptions::new()
620             .read(true)
621             .open(file_path)
622             .map_err(Error::SnapshotOpen)?;
623 
624         let guest_memory = self.guest_memory.memory();
625         for range in saved_regions.regions() {
626             let mut offset: u64 = 0;
627             // Here we handle the retry manually, in case the whole region
628             // can't be filled in one call, because the read_exact_from()
629             // implementation from vm-memory's GuestMemory does not follow
630             // the correct behavior. For more info about this issue
631             // see: https://github.com/rust-vmm/vm-memory/issues/174
632             loop {
633                 let bytes_read = guest_memory
634                     .read_from(
635                         GuestAddress(range.gpa + offset),
636                         &mut memory_file,
637                         (range.length - offset) as usize,
638                     )
639                     .map_err(Error::SnapshotCopy)?;
640                 offset += bytes_read as u64;
641 
642                 if offset == range.length {
643                     break;
644                 }
645             }
646         }
647 
648         Ok(())
649     }
650 
651     fn validate_memory_config(
652         config: &MemoryConfig,
653         user_provided_zones: bool,
654     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
655         let mut allow_mem_hotplug = false;
656 
657         if !user_provided_zones {
658             if config.zones.is_some() {
659                 error!(
660                     "User defined memory regions can't be provided if the \
661                     memory size is not 0"
662                 );
663                 return Err(Error::InvalidMemoryParameters);
664             }
665 
666             if config.hotplug_size.is_some() {
667                 allow_mem_hotplug = true;
668             }
669 
670             if let Some(hotplugged_size) = config.hotplugged_size {
671                 if let Some(hotplug_size) = config.hotplug_size {
672                     if hotplugged_size > hotplug_size {
673                         error!(
674                             "'hotplugged_size' {} can't be bigger than \
675                             'hotplug_size' {}",
676                             hotplugged_size, hotplug_size,
677                         );
678                         return Err(Error::InvalidMemoryParameters);
679                     }
680                 } else {
681                     error!(
682                         "Invalid to define 'hotplugged_size' when there is \
683                         no 'hotplug_size'"
684                     );
685                     return Err(Error::InvalidMemoryParameters);
686                 }
687                 if config.hotplug_method == HotplugMethod::Acpi {
688                     error!(
689                         "Invalid to define 'hotplugged_size' with hotplug \
690                         method 'acpi'"
691                     );
692                     return Err(Error::InvalidMemoryParameters);
693                 }
694             }
695 
696             // Create a single zone from the global memory config. This lets
697             // us reuse the codepath for user defined memory zones.
698             let zones = vec![MemoryZoneConfig {
699                 id: String::from(DEFAULT_MEMORY_ZONE),
700                 size: config.size,
701                 file: None,
702                 shared: config.shared,
703                 hugepages: config.hugepages,
704                 hugepage_size: config.hugepage_size,
705                 host_numa_node: None,
706                 hotplug_size: config.hotplug_size,
707                 hotplugged_size: config.hotplugged_size,
708                 prefault: config.prefault,
709             }];
710 
711             Ok((config.size, zones, allow_mem_hotplug))
712         } else {
713             if config.zones.is_none() {
714                 error!(
715                     "User defined memory regions must be provided if the \
716                     memory size is 0"
717                 );
718                 return Err(Error::MissingMemoryZones);
719             }
720 
721             // Safe to unwrap as we checked right above that some zones
722             // were provided.
723             let zones = config.zones.clone().unwrap();
724             if zones.is_empty() {
725                 return Err(Error::MissingMemoryZones);
726             }
727 
728             let mut total_ram_size: u64 = 0;
729             for zone in zones.iter() {
730                 total_ram_size += zone.size;
731 
732                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
733                     error!(
734                         "Invalid to set host NUMA policy for a memory zone \
735                         backed by a regular file and mapped as 'shared'"
736                     );
737                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
738                 }
739 
740                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
741                     error!("Invalid to set ACPI hotplug method for memory zones");
742                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
743                 }
744 
745                 if let Some(hotplugged_size) = zone.hotplugged_size {
746                     if let Some(hotplug_size) = zone.hotplug_size {
747                         if hotplugged_size > hotplug_size {
748                             error!(
749                                 "'hotplugged_size' {} can't be bigger than \
750                                 'hotplug_size' {}",
751                                 hotplugged_size, hotplug_size,
752                             );
753                             return Err(Error::InvalidMemoryParameters);
754                         }
755                     } else {
756                         error!(
757                             "Invalid to define 'hotplugged_size' when there is \
758                             no 'hotplug_size' for a memory zone"
759                         );
760                         return Err(Error::InvalidMemoryParameters);
761                     }
762                     if config.hotplug_method == HotplugMethod::Acpi {
763                         error!(
764                             "Invalid to define 'hotplugged_size' with hotplug \
765                             method 'acpi'"
766                         );
767                         return Err(Error::InvalidMemoryParameters);
768                     }
769                 }
770             }
771 
772             Ok((total_ram_size, zones, allow_mem_hotplug))
773         }
774     }
775 
776     fn allocate_address_space(&mut self) -> Result<(), Error> {
777         let mut list = Vec::new();
778 
779         for (zone_id, memory_zone) in self.memory_zones.iter() {
780             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
781                 memory_zone
782                     .regions()
783                     .iter()
784                     .map(|r| (r.clone(), false))
785                     .collect();
786 
787             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
788                 regions.push((virtio_mem_zone.region().clone(), true));
789             }
790 
791             list.push((zone_id.clone(), regions));
792         }
793 
794         for (zone_id, regions) in list {
795             for (region, virtio_mem) in regions {
796                 let slot = self.create_userspace_mapping(
797                     region.start_addr().raw_value(),
798                     region.len(),
799                     region.as_ptr() as u64,
800                     self.mergeable,
801                     false,
802                     self.log_dirty,
803                 )?;
804 
805                 let file_offset = if let Some(file_offset) = region.file_offset() {
806                     file_offset.start()
807                 } else {
808                     0
809                 };
810 
811                 self.guest_ram_mappings.push(GuestRamMapping {
812                     gpa: region.start_addr().raw_value(),
813                     size: region.len(),
814                     slot,
815                     zone_id: zone_id.clone(),
816                     virtio_mem,
817                     file_offset,
818                 });
819                 self.ram_allocator
820                     .allocate(Some(region.start_addr()), region.len(), None)
821                     .ok_or(Error::MemoryRangeAllocation)?;
822             }
823         }
824 
825         // Allocate SubRegion and Reserved address ranges.
826         for region in self.arch_mem_regions.iter() {
827             if region.r_type == RegionType::Ram {
828                 // Ignore the RAM type since ranges have already been allocated
829                 // based on the GuestMemory regions.
830                 continue;
831             }
832             self.ram_allocator
833                 .allocate(
834                     Some(GuestAddress(region.base)),
835                     region.size as GuestUsize,
836                     None,
837                 )
838                 .ok_or(Error::MemoryRangeAllocation)?;
839         }
840 
841         Ok(())
842     }
843 
844     #[cfg(target_arch = "aarch64")]
845     fn add_uefi_flash(&mut self) -> Result<(), Error> {
846         // On AArch64, the UEFI binary requires a flash device at address 0.
847         // 4 MiB memory is mapped to simulate the flash.
848         let uefi_mem_slot = self.allocate_memory_slot();
849         let uefi_region = GuestRegionMmap::new(
850             MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
851             arch::layout::UEFI_START,
852         )
853         .unwrap();
854         let uefi_mem_region = self.vm.make_user_memory_region(
855             uefi_mem_slot,
856             uefi_region.start_addr().raw_value(),
857             uefi_region.len(),
858             uefi_region.as_ptr() as u64,
859             false,
860             false,
861         );
862         self.vm
863             .create_user_memory_region(uefi_mem_region)
864             .map_err(Error::CreateUefiFlash)?;
865 
866         let uefi_flash =
867             GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());
868 
869         self.uefi_flash = Some(uefi_flash);
870 
871         Ok(())
872     }
873 
874     #[allow(clippy::too_many_arguments)]
875     pub fn new(
876         vm: Arc<dyn hypervisor::Vm>,
877         config: &MemoryConfig,
878         prefault: Option<bool>,
879         phys_bits: u8,
880         #[cfg(feature = "tdx")] tdx_enabled: bool,
881         restore_data: Option<&MemoryManagerSnapshotData>,
882         existing_memory_files: Option<HashMap<u32, File>>,
883         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
884     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
885         trace_scoped!("MemoryManager::new");
886 
887         let user_provided_zones = config.size == 0;
888 
889         let mmio_address_space_size = mmio_address_space_size(phys_bits);
890         debug_assert_eq!(
891             (((mmio_address_space_size) >> 16) << 16),
892             mmio_address_space_size
893         );
894         let start_of_platform_device_area =
895             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
896         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
897 
898         let (ram_size, zones, allow_mem_hotplug) =
899             Self::validate_memory_config(config, user_provided_zones)?;
900 
901         let (
902             start_of_device_area,
903             boot_ram,
904             current_ram,
905             arch_mem_regions,
906             memory_zones,
907             guest_memory,
908             boot_guest_memory,
909             hotplug_slots,
910             next_memory_slot,
911             selected_slot,
912             next_hotplug_slot,
913         ) = if let Some(data) = restore_data {
914             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
915                 &data.guest_ram_mappings,
916                 &zones,
917                 prefault,
918                 existing_memory_files.unwrap_or_default(),
919                 config.thp,
920             )?;
921             let guest_memory =
922                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
923             let boot_guest_memory = guest_memory.clone();
924             (
925                 GuestAddress(data.start_of_device_area),
926                 data.boot_ram,
927                 data.current_ram,
928                 data.arch_mem_regions.clone(),
929                 memory_zones,
930                 guest_memory,
931                 boot_guest_memory,
932                 data.hotplug_slots.clone(),
933                 data.next_memory_slot,
934                 data.selected_slot,
935                 data.next_hotplug_slot,
936             )
937         } else {
938             // Init guest memory
939             let arch_mem_regions = arch::arch_memory_regions(ram_size);
940 
941             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
942                 .iter()
943                 .filter(|r| r.2 == RegionType::Ram)
944                 .map(|r| (r.0, r.1))
945                 .collect();
946 
947             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
948                 .iter()
949                 .map(|(a, b, c)| ArchMemRegion {
950                     base: a.0,
951                     size: *b,
952                     r_type: *c,
953                 })
954                 .collect();
955 
956             let (mem_regions, mut memory_zones) =
957                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;
958 
959             let mut guest_memory =
960                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
961 
962             let boot_guest_memory = guest_memory.clone();
963 
964             let mut start_of_device_area =
965                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
966 
967             // Update list of memory zones for resize.
968             for zone in zones.iter() {
969                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
970                     if let Some(hotplug_size) = zone.hotplug_size {
971                         if hotplug_size == 0 {
972                             error!("'hotplug_size' can't be 0");
973                             return Err(Error::InvalidHotplugSize);
974                         }
975 
976                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
977                             start_of_device_area = start_of_device_area
978                                 .checked_add(hotplug_size)
979                                 .ok_or(Error::GuestAddressOverFlow)?;
980                         } else {
981                             // Alignment must be "natural", i.e. the same as the block size
982                             let start_addr = GuestAddress(
983                                 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE
984                                     - 1)
985                                     / virtio_devices::VIRTIO_MEM_ALIGN_SIZE
986                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
987                             );
988 
989                             // When `prefault` is set by vm_restore, the memory manager
990                             // creates the RAM region with the `prefault` option from the
991                             // restore config rather than the one from the zone.
992                             let region = MemoryManager::create_ram_region(
993                                 &None,
994                                 0,
995                                 start_addr,
996                                 hotplug_size as usize,
997                                 match prefault {
998                                     Some(pf) => pf,
999                                     None => zone.prefault,
1000                                 },
1001                                 zone.shared,
1002                                 zone.hugepages,
1003                                 zone.hugepage_size,
1004                                 zone.host_numa_node,
1005                                 None,
1006                                 config.thp,
1007                             )?;
1008 
1009                             guest_memory = guest_memory
1010                                 .insert_region(Arc::clone(&region))
1011                                 .map_err(Error::GuestMemory)?;
1012 
1013                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
1014                             let region_size = region.len();
1015                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
1016                                 region,
1017                                 virtio_device: None,
1018                                 hotplugged_size,
1019                                 hugepages: zone.hugepages,
1020                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
1021                             });
1022 
1023                             start_of_device_area = start_addr
1024                                 .checked_add(hotplug_size)
1025                                 .ok_or(Error::GuestAddressOverFlow)?;
1026                         }
1027                     }
1028                 } else {
1029                     return Err(Error::MissingZoneIdentifier);
1030                 }
1031             }
1032 
1033             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
1034             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
1035 
1036             (
1037                 start_of_device_area,
1038                 ram_size,
1039                 ram_size,
1040                 arch_mem_regions,
1041                 memory_zones,
1042                 guest_memory,
1043                 boot_guest_memory,
1044                 hotplug_slots,
1045                 0,
1046                 0,
1047                 0,
1048             )
1049         };
1050 
1051         let guest_memory = GuestMemoryAtomic::new(guest_memory);
1052 
1053         // Both MMIO and PIO address spaces start at address 0.
1054         let allocator = Arc::new(Mutex::new(
1055             SystemAllocator::new(
1056                 #[cfg(target_arch = "x86_64")]
1057                 {
1058                     GuestAddress(0)
1059                 },
1060                 #[cfg(target_arch = "x86_64")]
1061                 {
1062                     1 << 16
1063                 },
1064                 start_of_platform_device_area,
1065                 PLATFORM_DEVICE_AREA_SIZE,
1066                 layout::MEM_32BIT_DEVICES_START,
1067                 layout::MEM_32BIT_DEVICES_SIZE,
1068                 #[cfg(target_arch = "x86_64")]
1069                 vec![GsiApic::new(
1070                     X86_64_IRQ_BASE,
1071                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1072                 )],
1073             )
1074             .ok_or(Error::CreateSystemAllocator)?,
1075         ));
1076 
1077         #[cfg(not(feature = "tdx"))]
1078         let dynamic = true;
1079         #[cfg(feature = "tdx")]
1080         let dynamic = !tdx_enabled;
1081 
1082         let acpi_address = if dynamic
1083             && config.hotplug_method == HotplugMethod::Acpi
1084             && (config.hotplug_size.unwrap_or_default() > 0)
1085         {
1086             Some(
1087                 allocator
1088                     .lock()
1089                     .unwrap()
1090                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1091                     .ok_or(Error::AllocateMmioAddress)?,
1092             )
1093         } else {
1094             None
1095         };
1096 
1097         // When running with SGX, the start of the device area and the end of
1098         // the RAM area may diverge, but at this point they are next to each other.
1099         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1100         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1101 
1102         let mut memory_manager = MemoryManager {
1103             boot_guest_memory,
1104             guest_memory,
1105             next_memory_slot,
1106             start_of_device_area,
1107             end_of_device_area,
1108             end_of_ram_area,
1109             vm,
1110             hotplug_slots,
1111             selected_slot,
1112             mergeable: config.mergeable,
1113             allocator,
1114             hotplug_method: config.hotplug_method,
1115             boot_ram,
1116             current_ram,
1117             next_hotplug_slot,
1118             shared: config.shared,
1119             hugepages: config.hugepages,
1120             hugepage_size: config.hugepage_size,
1121             prefault: config.prefault,
1122             #[cfg(target_arch = "x86_64")]
1123             sgx_epc_region: None,
1124             user_provided_zones,
1125             snapshot_memory_ranges: MemoryRangeTable::default(),
1126             memory_zones,
1127             guest_ram_mappings: Vec::new(),
1128             acpi_address,
1129             log_dirty: dynamic, // Cannot log dirty pages on a TD
1130             arch_mem_regions,
1131             ram_allocator,
1132             dynamic,
1133             #[cfg(target_arch = "aarch64")]
1134             uefi_flash: None,
1135             thp: config.thp,
1136         };
1137 
1138         memory_manager.allocate_address_space()?;
1139 
1140         #[cfg(target_arch = "aarch64")]
1141         memory_manager.add_uefi_flash()?;
1142 
1143         #[cfg(target_arch = "x86_64")]
1144         if let Some(sgx_epc_config) = sgx_epc_config {
1145             memory_manager.setup_sgx(sgx_epc_config)?;
1146         }
1147 
1148         Ok(Arc::new(Mutex::new(memory_manager)))
1149     }
1150 
1151     pub fn new_from_snapshot(
1152         snapshot: &Snapshot,
1153         vm: Arc<dyn hypervisor::Vm>,
1154         config: &MemoryConfig,
1155         source_url: Option<&str>,
1156         prefault: bool,
1157         phys_bits: u8,
1158     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1159         if let Some(source_url) = source_url {
1160             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1161             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1162 
1163             let mem_snapshot: MemoryManagerSnapshotData =
1164                 snapshot.to_versioned_state().map_err(Error::Restore)?;
1165 
1166             let mm = MemoryManager::new(
1167                 vm,
1168                 config,
1169                 Some(prefault),
1170                 phys_bits,
1171                 #[cfg(feature = "tdx")]
1172                 false,
1173                 Some(&mem_snapshot),
1174                 None,
1175                 #[cfg(target_arch = "x86_64")]
1176                 None,
1177             )?;
1178 
1179             mm.lock()
1180                 .unwrap()
1181                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1182 
1183             Ok(mm)
1184         } else {
1185             Err(Error::RestoreMissingSourceUrl)
1186         }
1187     }
1188 
1189     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1190         // SAFETY: FFI call with correct arguments
1191         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1192 
1193         if res < 0 {
1194             Err(io::Error::last_os_error())
1195         } else {
1196             Ok(res as RawFd)
1197         }
1198     }
1199 
1200     fn mbind(
1201         addr: *mut u8,
1202         len: u64,
1203         mode: u32,
1204         nodemask: Vec<u64>,
1205         maxnode: u64,
1206         flags: u32,
1207     ) -> Result<(), io::Error> {
1208         // SAFETY: FFI call with correct arguments
1209         let res = unsafe {
1210             libc::syscall(
1211                 libc::SYS_mbind,
1212                 addr as *mut libc::c_void,
1213                 len,
1214                 mode,
1215                 nodemask.as_ptr(),
1216                 maxnode,
1217                 flags,
1218             )
1219         };
1220 
1221         if res < 0 {
1222             Err(io::Error::last_os_error())
1223         } else {
1224             Ok(())
1225         }
1226     }
1227 
1228     fn create_anonymous_file(
1229         size: usize,
1230         hugepages: bool,
1231         hugepage_size: Option<u64>,
1232     ) -> Result<FileOffset, Error> {
1233         let fd = Self::memfd_create(
1234             &ffi::CString::new("ch_ram").unwrap(),
1235             libc::MFD_CLOEXEC
1236                 | if hugepages {
1237                     libc::MFD_HUGETLB
1238                         | if let Some(hugepage_size) = hugepage_size {
1239                             /*
1240                              * From the Linux kernel:
1241                              * Several system calls take a flag to request "hugetlb" huge pages.
1242                              * Without further specification, these system calls will use the
1243                              * system's default huge page size.  If a system supports multiple
1244                              * huge page sizes, the desired huge page size can be specified in
1245                              * bits [26:31] of the flag arguments.  The value in these 6 bits
1246                              * will encode the log2 of the huge page size.
1247                              */
1248 
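                            // Worked example (illustrative): for 2 MiB huge
                            // pages, trailing_zeros() is 21, so the encoded
                            // flag is 21 << 26, matching the kernel's
                            // MFD_HUGE_2MB value.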
1249                             hugepage_size.trailing_zeros() << 26
1250                         } else {
1251                             // Use the system default huge page size
1252                             0
1253                         }
1254                 } else {
1255                     0
1256                 },
1257         )
1258         .map_err(Error::SharedFileCreate)?;
1259 
1260         // SAFETY: fd is valid
1261         let f = unsafe { File::from_raw_fd(fd) };
1262         f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1263 
1264         Ok(FileOffset::new(f, 0))
1265     }
1266 
1267     fn open_backing_file(
1268         backing_file: &PathBuf,
1269         file_offset: u64,
1270         size: usize,
1271     ) -> Result<FileOffset, Error> {
1272         if backing_file.is_dir() {
1273             // Override file offset as it does not apply in this case.
1274             info!(
1275                 "Ignoring file offset since the backing file is a \
1276                         temporary file created from the specified directory."
1277             );
1278             let fs_str = format!("{}{}", backing_file.display(), "/tmpfile_XXXXXX");
1279             let fs = ffi::CString::new(fs_str).unwrap();
1280             let mut path = fs.as_bytes_with_nul().to_owned();
1281             let path_ptr = path.as_mut_ptr() as *mut _;
1282             // SAFETY: FFI call
1283             let fd = unsafe { libc::mkstemp(path_ptr) };
1284             if fd == -1 {
1285                 return Err(Error::SharedFileCreate(std::io::Error::last_os_error()));
1286             }
1287             // SAFETY: FFI call
1288             unsafe { libc::unlink(path_ptr) };
1289             // SAFETY: fd is valid
1290             let f = unsafe { File::from_raw_fd(fd) };
1291             f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1292 
1293             Ok(FileOffset::new(f, 0))
1294         } else {
1295             let f = OpenOptions::new()
1296                 .read(true)
1297                 .write(true)
1298                 .open(backing_file)
1299                 .map_err(Error::SharedFileCreate)?;
1300 
1301             Ok(FileOffset::new(f, file_offset))
1302         }
1303     }
1304 
1305     #[allow(clippy::too_many_arguments)]
1306     pub fn create_ram_region(
1307         backing_file: &Option<PathBuf>,
1308         file_offset: u64,
1309         start_addr: GuestAddress,
1310         size: usize,
1311         prefault: bool,
1312         shared: bool,
1313         hugepages: bool,
1314         hugepage_size: Option<u64>,
1315         host_numa_node: Option<u32>,
1316         existing_memory_file: Option<File>,
1317         thp: bool,
1318     ) -> Result<Arc<GuestRegionMmap>, Error> {
1319         let mut mmap_flags = libc::MAP_NORESERVE;
1320 
1321         // The duplication of mmap_flags ORing here is unfortunate but it also makes
1322         // the complexity of the handling clear.
1323         let fo = if let Some(f) = existing_memory_file {
1324             // It must be MAP_SHARED as we wouldn't have been given an FD otherwise
1325             mmap_flags |= libc::MAP_SHARED;
1326             Some(FileOffset::new(f, file_offset))
1327         } else if let Some(backing_file) = backing_file {
1328             if shared {
1329                 mmap_flags |= libc::MAP_SHARED;
1330             } else {
1331                 mmap_flags |= libc::MAP_PRIVATE;
1332             }
1333             Some(Self::open_backing_file(backing_file, file_offset, size)?)
1334         } else if shared || hugepages {
1335             // For hugepages we must also use MAP_SHARED, otherwise we will trigger
1336             // #4805, because MAP_PRIVATE would trigger CoW against the backing file
1337             // when combined with VFIO pinning
1338             mmap_flags |= libc::MAP_SHARED;
1339             Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
1340         } else {
1341             mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
1342             None
1343         };
1344 
1345         if prefault {
1346             mmap_flags |= libc::MAP_POPULATE;
1347         }
1348 
1349         let region = GuestRegionMmap::new(
1350             MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
1351                 .map_err(Error::GuestMemoryRegion)?,
1352             start_addr,
1353         )
1354         .map_err(Error::GuestMemory)?;
1355 
1356         if region.file_offset().is_none() && thp {
1357             info!(
1358                 "Anonymous mapping at 0x{:x} (size = 0x{:x})",
1359                 region.as_ptr() as u64,
1360                 size
1361             );
1362             // SAFETY: FFI call with correct arguments
1363             let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
1364             if ret != 0 {
1365                 let e = io::Error::last_os_error();
1366                 warn!("Failed to mark pages as THP eligible: {}", e);
1367             }
1368         }
1369 
1370         // Apply NUMA policy if needed.
1371         if let Some(node) = host_numa_node {
1372             let addr = region.deref().as_ptr();
1373             let len = region.deref().size() as u64;
1374             let mode = MPOL_BIND;
1375             let mut nodemask: Vec<u64> = Vec::new();
1376             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1377 
1378             // Linux is kind of buggy in the way it interprets maxnode as it
1379             // will cut off the last node. That's why we have to add 1 to what
1380             // we would consider as the proper maxnode value.
1381             let maxnode = node as u64 + 1 + 1;
1382 
1383             // Allocate the right size for the vector.
1384             nodemask.resize((node as usize / 64) + 1, 0);
1385 
1386             // Fill the global bitmask through the nodemask vector.
1387             let idx = (node / 64) as usize;
1388             let shift = node % 64;
1389             nodemask[idx] |= 1u64 << shift;
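            // Worked example (illustrative): for node = 65 the vector holds
            // two u64 words ((65 / 64) + 1 = 2), bit 1 of word 1 is set, and
            // maxnode (computed above) is 65 + 1 + 1 = 67.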
1390 
1391             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1392             // force the kernel to move all pages that might have been already
1393             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1394             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1395             // MPOL_BIND is the selected mode as it specifies a strict policy
1396             // that restricts memory allocation to the nodes specified in the
1397             // nodemask.
1398             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1399                 .map_err(Error::ApplyNumaPolicy)?;
1400         }
1401 
1402         Ok(Arc::new(region))
1403     }
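
    // Usage sketch (hypothetical values): map a 1 GiB anonymous, private,
    // THP-eligible region at the start of guest RAM with no NUMA binding.
    //
    //   let region = MemoryManager::create_ram_region(
    //       &None,            // backing_file
    //       0,                // file_offset
    //       GuestAddress(0),  // start_addr
    //       1 << 30,          // size
    //       false,            // prefault
    //       false,            // shared
    //       false,            // hugepages
    //       None,             // hugepage_size
    //       None,             // host_numa_node
    //       None,             // existing_memory_file
    //       true,             // thp
    //   )?;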
1404 
1405     // Update the GuestMemoryMmap with the new range
1406     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1407         let guest_memory = self
1408             .guest_memory
1409             .memory()
1410             .insert_region(region)
1411             .map_err(Error::GuestMemory)?;
1412         self.guest_memory.lock().unwrap().replace(guest_memory);
1413 
1414         Ok(())
1415     }
1416 
1417     //
1418     // Calculate the start address of an area next to RAM.
1419     //
1420     // If memory hotplug is allowed, the start address needs to be aligned
1421     // (rounded up) to a 128MiB boundary.
1422     // If memory hotplug is not allowed, no alignment is required. Either way,
1423     // if RAM ends below the 32-bit reserved area, the 64-bit RAM start is returned.
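    //
    // Illustrative example (hypothetical addresses): with hotplug allowed and
    // mem_end = 0x4000_0000, ORing with 0x07FF_FFFF and adding 1 yields
    // 0x4800_0000, the next 128MiB boundary strictly above mem_end; with
    // mem_end = 0x3FFF_FFFF the result is exactly 0x4000_0000.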
1424     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
1425         let mut start_addr = if allow_mem_hotplug {
1426             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1427         } else {
1428             mem_end
1429         };
1430 
1431         start_addr = start_addr
1432             .checked_add(1)
1433             .ok_or(Error::GuestAddressOverFlow)?;
1434 
1435         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1436             return Ok(arch::layout::RAM_64BIT_START);
1437         }
1438 
1439         Ok(start_addr)
1440     }
1441 
1442     pub fn add_ram_region(
1443         &mut self,
1444         start_addr: GuestAddress,
1445         size: usize,
1446     ) -> Result<Arc<GuestRegionMmap>, Error> {
1447         // Allocate memory for the region
1448         let region = MemoryManager::create_ram_region(
1449             &None,
1450             0,
1451             start_addr,
1452             size,
1453             self.prefault,
1454             self.shared,
1455             self.hugepages,
1456             self.hugepage_size,
1457             None,
1458             None,
1459             self.thp,
1460         )?;
1461 
1462         // Map it into the guest
1463         let slot = self.create_userspace_mapping(
1464             region.start_addr().0,
1465             region.len(),
1466             region.as_ptr() as u64,
1467             self.mergeable,
1468             false,
1469             self.log_dirty,
1470         )?;
1471         self.guest_ram_mappings.push(GuestRamMapping {
1472             gpa: region.start_addr().raw_value(),
1473             size: region.len(),
1474             slot,
1475             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1476             virtio_mem: false,
1477             file_offset: 0,
1478         });
1479 
1480         self.add_region(Arc::clone(&region))?;
1481 
1482         Ok(region)
1483     }
1484 
1485     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1486         info!("Hotplugging new RAM: {}", size);
1487 
1488         // Check that there is a free slot
1489         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1490             return Err(Error::NoSlotAvailable);
1491         }
1492 
1493         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1494         if size % (128 << 20) != 0 {
1495             return Err(Error::InvalidSize);
1496         }
1497 
1498         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1499 
1500         if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area {
1501             return Err(Error::InsufficientHotplugRam);
1502         }
1503 
1504         let region = self.add_ram_region(start_addr, size)?;
1505 
1506         // Add region to the list of regions associated with the default
1507         // memory zone.
1508         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1509             memory_zone.regions.push(Arc::clone(&region));
1510         }
1511 
1512         // Tell the allocator
1513         self.ram_allocator
1514             .allocate(Some(start_addr), size as GuestUsize, None)
1515             .ok_or(Error::MemoryRangeAllocation)?;
1516 
1517         // Update the slot so that it can be queried via the I/O port
1518         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1519         slot.active = true;
1520         slot.inserting = true;
1521         slot.base = region.start_addr().0;
1522         slot.length = region.len();
1523 
1524         self.next_hotplug_slot += 1;
1525 
1526         Ok(region)
1527     }
1528 
1529     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1530         self.guest_memory.clone()
1531     }
1532 
1533     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1534         self.boot_guest_memory.clone()
1535     }
1536 
1537     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1538         self.allocator.clone()
1539     }
1540 
1541     pub fn start_of_device_area(&self) -> GuestAddress {
1542         self.start_of_device_area
1543     }
1544 
1545     pub fn end_of_device_area(&self) -> GuestAddress {
1546         self.end_of_device_area
1547     }
1548 
1549     pub fn allocate_memory_slot(&mut self) -> u32 {
1550         let slot_id = self.next_memory_slot;
1551         self.next_memory_slot += 1;
1552         slot_id
1553     }
1554 
1555     pub fn create_userspace_mapping(
1556         &mut self,
1557         guest_phys_addr: u64,
1558         memory_size: u64,
1559         userspace_addr: u64,
1560         mergeable: bool,
1561         readonly: bool,
1562         log_dirty: bool,
1563     ) -> Result<u32, Error> {
1564         let slot = self.allocate_memory_slot();
1565         let mem_region = self.vm.make_user_memory_region(
1566             slot,
1567             guest_phys_addr,
1568             memory_size,
1569             userspace_addr,
1570             readonly,
1571             log_dirty,
1572         );
1573 
1574         info!(
1575             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1576             guest_phys_addr, userspace_addr, memory_size, slot
1577         );
1578 
1579         self.vm
1580             .create_user_memory_region(mem_region)
1581             .map_err(Error::CreateUserMemoryRegion)?;
1582 
1583         // Mark the pages as mergeable if explicitly asked for.
1584         if mergeable {
1585             // SAFETY: the address and size are valid since the
1586             // mmap succeeded.
1587             let ret = unsafe {
1588                 libc::madvise(
1589                     userspace_addr as *mut libc::c_void,
1590                     memory_size as libc::size_t,
1591                     libc::MADV_MERGEABLE,
1592                 )
1593             };
1594             if ret != 0 {
1595                 let err = io::Error::last_os_error();
1596                 // Safe to unwrap because the error is constructed with
1597                 // last_os_error(), which ensures the output will be Some().
1598                 let errno = err.raw_os_error().unwrap();
1599                 if errno == libc::EINVAL {
1600                     warn!("kernel not configured with CONFIG_KSM");
1601                 } else {
1602                     warn!("madvise error: {}", err);
1603                 }
1604                 warn!("failed to mark pages as mergeable");
1605             }
1606         }
1607 
1608         info!(
1609             "Created userspace mapping: {:x} -> {:x} {:x}",
1610             guest_phys_addr, userspace_addr, memory_size
1611         );
1612 
1613         Ok(slot)
1614     }
1615 
1616     pub fn remove_userspace_mapping(
1617         &mut self,
1618         guest_phys_addr: u64,
1619         memory_size: u64,
1620         userspace_addr: u64,
1621         mergeable: bool,
1622         slot: u32,
1623     ) -> Result<(), Error> {
1624         let mem_region = self.vm.make_user_memory_region(
1625             slot,
1626             guest_phys_addr,
1627             memory_size,
1628             userspace_addr,
1629             false, /* readonly -- don't care */
1630             false, /* log dirty */
1631         );
1632 
1633         self.vm
1634             .remove_user_memory_region(mem_region)
1635             .map_err(Error::RemoveUserMemoryRegion)?;
1636         // Mark the pages as unmergeable if they were previously marked as
1637         // mergeable.
1638         // mergeable.
1639         if mergeable {
1640             // SAFETY: the address and size are valid as the region was
1641             // previously advised.
1642             let ret = unsafe {
1643                 libc::madvise(
1644                     userspace_addr as *mut libc::c_void,
1645                     memory_size as libc::size_t,
1646                     libc::MADV_UNMERGEABLE,
1647                 )
1648             };
1649             if ret != 0 {
1650                 let err = io::Error::last_os_error();
1651                 // Safe to unwrap because the error is constructed with
1652                 // last_os_error(), which ensures the output will be Some().
1653                 let errno = err.raw_os_error().unwrap();
1654                 if errno == libc::EINVAL {
1655                     warn!("kernel not configured with CONFIG_KSM");
1656                 } else {
1657                     warn!("madvise error: {}", err);
1658                 }
1659                 warn!("failed to mark pages as unmergeable");
1660             }
1661         }
1662 
1663         info!(
1664             "Removed userspace mapping: {:x} -> {:x} {:x}",
1665             guest_phys_addr, userspace_addr, memory_size
1666         );
1667 
1668         Ok(())
1669     }
1670 
1671     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1672         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1673             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1674                 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1675                     virtio_mem_device
1676                         .lock()
1677                         .unwrap()
1678                         .resize(size)
1679                         .map_err(Error::VirtioMemResizeFail)?;
1680                 }
1681 
1682                 // Keep the hotplugged_size up to date.
1683                 virtio_mem_zone.hotplugged_size = size;
1684             } else {
1685                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1686                 return Err(Error::MissingVirtioMemHandler);
1687             }
1688 
1689             return Ok(());
1690         }
1691 
1692         error!("Failed resizing virtio-mem region: Unknown memory zone");
1693         Err(Error::UnknownMemoryZone)
1694     }
1695 
1696     /// In case this function resulted in adding a new memory region to the
1697     /// guest memory, the new region is returned to the caller. The virtio-mem
1698     /// use case never adds a new region as the whole hotpluggable memory has
1699     /// already been allocated at boot time.
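    ///
    /// For illustration (hypothetical sizes): with 1 GiB of boot RAM, resize(2 GiB)
    /// either resizes the default virtio-mem zone to 1 GiB of hotplugged memory
    /// (HotplugMethod::VirtioMem, no new region returned) or hot-adds a 1 GiB
    /// region (HotplugMethod::Acpi, the new region is returned).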
1700     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1701         if self.user_provided_zones {
1702             error!(
1703                 "Not allowed to resize guest memory when backed with user \
1704                 defined memory zones."
1705             );
1706             return Err(Error::InvalidResizeWithMemoryZones);
1707         }
1708 
1709         let mut region: Option<Arc<GuestRegionMmap>> = None;
1710         match self.hotplug_method {
1711             HotplugMethod::VirtioMem => {
1712                 if desired_ram >= self.boot_ram {
1713                     if !self.dynamic {
1714                         return Ok(region);
1715                     }
1716 
1717                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1718                     self.current_ram = desired_ram;
1719                 }
1720             }
1721             HotplugMethod::Acpi => {
1722                 if desired_ram > self.current_ram {
1723                     if !self.dynamic {
1724                         return Ok(region);
1725                     }
1726 
1727                     region =
1728                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1729                     self.current_ram = desired_ram;
1730                 }
1731             }
1732         }
1733         Ok(region)
1734     }
1735 
1736     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1737         if !self.user_provided_zones {
1738             error!(
1739                 "Not allowed to resize guest memory zone when no zone is \
1740                 defined."
1741             );
1742             return Err(Error::ResizeZone);
1743         }
1744 
1745         self.virtio_mem_resize(id, virtio_mem_size)
1746     }
1747 
1748     #[cfg(target_arch = "x86_64")]
1749     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1750         let file = OpenOptions::new()
1751             .read(true)
1752             .open("/dev/sgx_provision")
1753             .map_err(Error::SgxProvisionOpen)?;
1754         self.vm
1755             .enable_sgx_attribute(file)
1756             .map_err(Error::SgxEnableProvisioning)?;
1757 
1758         // Go over each EPC section and verify its size is a 4k multiple. At
1759         // the same time, calculate the total size needed for the contiguous
1760         // EPC region.
1761         let mut epc_region_size = 0;
1762         for epc_section in sgx_epc_config.iter() {
1763             if epc_section.size == 0 {
1764                 return Err(Error::EpcSectionSizeInvalid);
1765             }
1766             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1767                 return Err(Error::EpcSectionSizeInvalid);
1768             }
1769 
1770             epc_region_size += epc_section.size;
1771         }
1772 
1773         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
1774         let epc_region_start = GuestAddress(
1775             ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE,
1776         );
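        // For example (hypothetical address): a device area starting at
        // 0x1_0000_0800 gives an EPC region start of 0x1_0000_1000, the next
        // 4k-aligned address.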
1777 
1778         self.start_of_device_area = epc_region_start
1779             .checked_add(epc_region_size)
1780             .ok_or(Error::GuestAddressOverFlow)?;
1781 
1782         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
1783         info!(
1784             "SGX EPC region: 0x{:x} (0x{:x})",
1785             epc_region_start.0, epc_region_size
1786         );
1787 
1788         // Each section can be memory mapped into the allocated region.
1789         let mut epc_section_start = epc_region_start.raw_value();
1790         for epc_section in sgx_epc_config.iter() {
1791             let file = OpenOptions::new()
1792                 .read(true)
1793                 .write(true)
1794                 .open("/dev/sgx_vepc")
1795                 .map_err(Error::SgxVirtEpcOpen)?;
1796 
1797             let prot = PROT_READ | PROT_WRITE;
1798             let mut flags = MAP_NORESERVE | MAP_SHARED;
1799             if epc_section.prefault {
1800                 flags |= MAP_POPULATE;
1801             }
1802 
1803             // We can't use the vm-memory crate to perform the memory mapping
1804             // here as it would try to ensure the size of the backing file
1805             // matches the size of the expected mapping. The /dev/sgx_vepc
1806             // device does not work that way: it provides a file descriptor
1807             // whose size does not match the mapping size, as it's just a way
1808             // to let KVM know that an EPC section is being created for the guest.
1809             // SAFETY: FFI call with correct arguments
1810             let host_addr = unsafe {
1811                 libc::mmap(
1812                     std::ptr::null_mut(),
1813                     epc_section.size as usize,
1814                     prot,
1815                     flags,
1816                     file.as_raw_fd(),
1817                     0,
1818                 )
1819             } as u64;
1820 
1821             info!(
1822                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
1823                 epc_section_start, epc_section.size
1824             );
1825 
1826             let _mem_slot = self.create_userspace_mapping(
1827                 epc_section_start,
1828                 epc_section.size,
1829                 host_addr,
1830                 false,
1831                 false,
1832                 false,
1833             )?;
1834 
1835             sgx_epc_region.insert(
1836                 epc_section.id.clone(),
1837                 SgxEpcSection::new(
1838                     GuestAddress(epc_section_start),
1839                     epc_section.size as GuestUsize,
1840                 ),
1841             );
1842 
1843             epc_section_start += epc_section.size;
1844         }
1845 
1846         self.sgx_epc_region = Some(sgx_epc_region);
1847 
1848         Ok(())
1849     }
1850 
1851     #[cfg(target_arch = "x86_64")]
1852     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
1853         &self.sgx_epc_region
1854     }
1855 
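    // Returns true if the backing file still has at least one link on the host
    // filesystem (st_nlink > 0), i.e. it has not been unlinked while the file
    // descriptor was kept open, and false if fstat() fails.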
1856     pub fn is_hardlink(f: &File) -> bool {
1857         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
1858         // SAFETY: FFI call with correct arguments
1859         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
1860         if ret != 0 {
1861             error!("Couldn't fstat the backing file");
1862             return false;
1863         }
1864 
1865         // SAFETY: stat is valid
1866         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
1867     }
1868 
1869     pub fn memory_zones(&self) -> &MemoryZones {
1870         &self.memory_zones
1871     }
1872 
1873     pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
1874         &mut self.memory_zones
1875     }
1876 
1877     pub fn memory_range_table(
1878         &self,
1879         snapshot: bool,
1880     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
1881         let mut table = MemoryRangeTable::default();
1882 
1883         for memory_zone in self.memory_zones.values() {
1884             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
1885                 table.extend(virtio_mem_zone.plugged_ranges());
1886             }
1887 
1888             for region in memory_zone.regions() {
1889                 if snapshot {
1890                     if let Some(file_offset) = region.file_offset() {
1891                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
1892                             && Self::is_hardlink(file_offset.file())
1893                         {
1894                             // In this very specific case, we know the memory
1895                             // region is backed by a file on the host filesystem
1896                             // that can be accessed by the user, and additionally
1897                             // the mapping is shared, which means that modifications
1898                             // to the content are written to the actual file.
1899                             // When meeting these conditions, we can skip the
1900                             // copy of the memory content for this specific region,
1901                             // as we can assume the user will have it saved through
1902                             // the backing file already.
1903                             continue;
1904                         }
1905                     }
1906                 }
1907 
1908                 table.push(MemoryRange {
1909                     gpa: region.start_addr().raw_value(),
1910                     length: region.len(),
1911                 });
1912             }
1913         }
1914 
1915         Ok(table)
1916     }
1917 
1918     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
1919         MemoryManagerSnapshotData {
1920             memory_ranges: self.snapshot_memory_ranges.clone(),
1921             guest_ram_mappings: self.guest_ram_mappings.clone(),
1922             start_of_device_area: self.start_of_device_area.0,
1923             boot_ram: self.boot_ram,
1924             current_ram: self.current_ram,
1925             arch_mem_regions: self.arch_mem_regions.clone(),
1926             hotplug_slots: self.hotplug_slots.clone(),
1927             next_memory_slot: self.next_memory_slot,
1928             selected_slot: self.selected_slot,
1929             next_hotplug_slot: self.next_hotplug_slot,
1930         }
1931     }
1932 
1933     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
1934         let mut memory_slot_fds = HashMap::new();
1935         for guest_ram_mapping in &self.guest_ram_mappings {
1936             let slot = guest_ram_mapping.slot;
1937             let guest_memory = self.guest_memory.memory();
1938             let file = guest_memory
1939                 .find_region(GuestAddress(guest_ram_mapping.gpa))
1940                 .unwrap()
1941                 .file_offset()
1942                 .unwrap()
1943                 .file();
1944             memory_slot_fds.insert(slot, file.as_raw_fd());
1945         }
1946         memory_slot_fds
1947     }
1948 
1949     pub fn acpi_address(&self) -> Option<GuestAddress> {
1950         self.acpi_address
1951     }
1952 
1953     pub fn num_guest_ram_mappings(&self) -> u32 {
1954         self.guest_ram_mappings.len() as u32
1955     }
1956 
1957     #[cfg(target_arch = "aarch64")]
1958     pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1959         self.uefi_flash.as_ref().unwrap().clone()
1960     }
1961 
1962     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1963     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
1964         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
1965         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
1966 
1967         let mut mem_offset_in_elf = mem_offset;
1968         let mut ram_maps = BTreeMap::new();
1969         for mapping in mapping_sorted_by_gpa.iter() {
1970             ram_maps.insert(
1971                 mapping.gpa,
1972                 CoredumpMemoryRegion {
1973                     mem_offset_in_elf,
1974                     mem_size: mapping.size,
1975                 },
1976             );
1977             mem_offset_in_elf += mapping.size;
1978         }
1979 
1980         CoredumpMemoryRegions { ram_maps }
1981     }
1982 
1983     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
1984     pub fn coredump_iterate_save_mem(
1985         &mut self,
1986         dump_state: &DumpState,
1987     ) -> std::result::Result<(), GuestDebuggableError> {
1988         let snapshot_memory_ranges = self
1989             .memory_range_table(false)
1990             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
1991 
1992         if snapshot_memory_ranges.is_empty() {
1993             return Ok(());
1994         }
1995 
1996         let mut coredump_file = dump_state.file.as_ref().unwrap();
1997 
1998         let guest_memory = self.guest_memory.memory();
1999         let mut total_bytes: u64 = 0;
2000 
2001         for range in snapshot_memory_ranges.regions() {
2002             let mut offset: u64 = 0;
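                // write_to() may perform a partial write, so retry until the
                // whole range has been copied to the coredump file.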
2003             loop {
2004                 let bytes_written = guest_memory
2005                     .write_to(
2006                         GuestAddress(range.gpa + offset),
2007                         &mut coredump_file,
2008                         (range.length - offset) as usize,
2009                     )
2010                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2011                 offset += bytes_written as u64;
2012                 total_bytes += bytes_written as u64;
2013 
2014                 if offset == range.length {
2015                     break;
2016                 }
2017             }
2018         }
2019 
2020         debug!("coredump total bytes {}", total_bytes);
2021         Ok(())
2022     }
2023 
2024     pub fn receive_memory_regions<F>(
2025         &mut self,
2026         ranges: &MemoryRangeTable,
2027         fd: &mut F,
2028     ) -> std::result::Result<(), MigratableError>
2029     where
2030         F: Read,
2031     {
2032         let guest_memory = self.guest_memory();
2033         let mem = guest_memory.memory();
2034 
2035         for range in ranges.regions() {
2036             let mut offset: u64 = 0;
2037             // Here we are manually handling the retry in case we can't read the
2038             // whole region at once because we can't use the implementation
2039             // from vm-memory::GuestMemory of read_exact_from() as it is not
2040             // following the correct behavior. For more info about this issue
2041             // see: https://github.com/rust-vmm/vm-memory/issues/174
2042             loop {
2043                 let bytes_read = mem
2044                     .read_from(
2045                         GuestAddress(range.gpa + offset),
2046                         fd,
2047                         (range.length - offset) as usize,
2048                     )
2049                     .map_err(|e| {
2050                         MigratableError::MigrateReceive(anyhow!(
2051                             "Error receiving memory from socket: {}",
2052                             e
2053                         ))
2054                     })?;
2055                 offset += bytes_read as u64;
2056 
2057                 if offset == range.length {
2058                     break;
2059                 }
2060             }
2061         }
2062 
2063         Ok(())
2064     }
2065 }
2066 
2067 struct MemoryNotify {
2068     slot_id: usize,
2069 }
2070 
2071 impl Aml for MemoryNotify {
2072     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2073         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2074         aml::If::new(
2075             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2076             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2077         )
2078         .append_aml_bytes(bytes)
2079     }
2080 }
2081 
2082 struct MemorySlot {
2083     slot_id: usize,
2084 }
2085 
2086 impl Aml for MemorySlot {
2087     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2088         aml::Device::new(
2089             format!("M{:03}", self.slot_id).as_str().into(),
2090             vec![
2091                 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0C80")),
2092                 &aml::Name::new("_UID".into(), &self.slot_id),
2093                 /*
2094                 _STA return value:
2095                 Bit [0] – Set if the device is present.
2096                 Bit [1] – Set if the device is enabled and decoding its resources.
2097                 Bit [2] – Set if the device should be shown in the UI.
2098                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2099                 Bit [4] – Set if the battery is present.
2100                 Bits [31:5] – Reserved (must be cleared).
2101                 */
2102                 &aml::Method::new(
2103                     "_STA".into(),
2104                     0,
2105                     false,
2106                     // Call into MSTA method which will interrogate device
2107                     vec![&aml::Return::new(&aml::MethodCall::new(
2108                         "MSTA".into(),
2109                         vec![&self.slot_id],
2110                     ))],
2111                 ),
2112                 // Get details of memory
2113                 &aml::Method::new(
2114                     "_CRS".into(),
2115                     0,
2116                     false,
2117                     // Call into MCRS which provides actual memory details
2118                     vec![&aml::Return::new(&aml::MethodCall::new(
2119                         "MCRS".into(),
2120                         vec![&self.slot_id],
2121                     ))],
2122                 ),
2123             ],
2124         )
2125         .append_aml_bytes(bytes)
2126     }
2127 }
2128 
2129 struct MemorySlots {
2130     slots: usize,
2131 }
2132 
2133 impl Aml for MemorySlots {
2134     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2135         for slot_id in 0..self.slots {
2136             MemorySlot { slot_id }.append_aml_bytes(bytes);
2137         }
2138     }
2139 }
2140 
2141 struct MemoryMethods {
2142     slots: usize,
2143 }
2144 
2145 impl Aml for MemoryMethods {
2146     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2147         // Add "MTFY" notification method
2148         let mut memory_notifies = Vec::new();
2149         for slot_id in 0..self.slots {
2150             memory_notifies.push(MemoryNotify { slot_id });
2151         }
2152 
2153         let mut memory_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
2154         for memory_notifier in memory_notifies.iter() {
2155             memory_notifies_refs.push(memory_notifier);
2156         }
2157 
2158         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).append_aml_bytes(bytes);
2159 
2160         // MSCN method
2161         aml::Method::new(
2162             "MSCN".into(),
2163             0,
2164             true,
2165             vec![
2166                 // Take lock defined above
2167                 &aml::Acquire::new("MLCK".into(), 0xffff),
2168                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2169                 &aml::While::new(
2170                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2171                     vec![
2172                         // Write slot number (loop counter in Local0) to I/O port via field
2173                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2174                         // Check if MINS bit is set (inserting)
2175                         &aml::If::new(
2176                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2177                             // Notify device if it is
2178                             vec![
2179                                 &aml::MethodCall::new(
2180                                     "MTFY".into(),
2181                                     vec![&aml::Local(0), &aml::ONE],
2182                                 ),
2183                                 // Reset MINS bit
2184                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2185                             ],
2186                         ),
2187                         // Check if MRMV bit is set
2188                         &aml::If::new(
2189                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2190                             // Notify device if it is (with the eject constant 0x3)
2191                             vec![
2192                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2193                                 // Reset MRMV bit
2194                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2195                             ],
2196                         ),
2197                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2198                     ],
2199                 ),
2200                 // Release lock
2201                 &aml::Release::new("MLCK".into()),
2202             ],
2203         )
2204         .append_aml_bytes(bytes);
2205 
2206         // Memory status method
2207         aml::Method::new(
2208             "MSTA".into(),
2209             1,
2210             true,
2211             vec![
2212                 // Take lock defined above
2213                 &aml::Acquire::new("MLCK".into(), 0xffff),
2214                 // Write slot number (in first argument) to I/O port via field
2215                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2216                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2217                 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2218                 &aml::If::new(
2219                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2220                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2221                 ),
2222                 // Release lock
2223                 &aml::Release::new("MLCK".into()),
2224                 // Return 0 or 0xf
2225                 &aml::Return::new(&aml::Local(0)),
2226             ],
2227         )
2228         .append_aml_bytes(bytes);
2229 
2230         // Memory range method
2231         aml::Method::new(
2232             "MCRS".into(),
2233             1,
2234             true,
2235             vec![
2236                 // Take lock defined above
2237                 &aml::Acquire::new("MLCK".into(), 0xffff),
2238                 // Write slot number (in first argument) to I/O port via field
2239                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2240                 &aml::Name::new(
2241                     "MR64".into(),
2242                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2243                         aml::AddressSpaceCachable::Cacheable,
2244                         true,
2245                         0x0000_0000_0000_0000u64,
2246                         0xFFFF_FFFF_FFFF_FFFEu64,
2247                     )]),
2248                 ),
2249                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &14usize, "MINL".into()),
2250                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &18usize, "MINH".into()),
2251                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &22usize, "MAXL".into()),
2252                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &26usize, "MAXH".into()),
2253                 &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &38usize, "LENL".into()),
2254                 &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &42usize, "LENH".into()),
2255                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2256                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2257                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2258                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
2259                 &aml::Add::new(
2260                     &aml::Path::new("MAXL"),
2261                     &aml::Path::new("MINL"),
2262                     &aml::Path::new("LENL"),
2263                 ),
2264                 &aml::Add::new(
2265                     &aml::Path::new("MAXH"),
2266                     &aml::Path::new("MINH"),
2267                     &aml::Path::new("LENH"),
2268                 ),
2269                 &aml::If::new(
2270                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2271                     vec![&aml::Add::new(
2272                         &aml::Path::new("MAXH"),
2273                         &aml::ONE,
2274                         &aml::Path::new("MAXH"),
2275                     )],
2276                 ),
2277                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2278                 // Release lock
2279                 &aml::Release::new("MLCK".into()),
2280                 &aml::Return::new(&aml::Path::new("MR64")),
2281             ],
2282         )
2283         .append_aml_bytes(bytes)
2284     }
2285 }
2286 
2287 impl Aml for MemoryManager {
2288     fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
2289         if let Some(acpi_address) = self.acpi_address {
2290             // Memory Hotplug Controller
2291             aml::Device::new(
2292                 "_SB_.MHPC".into(),
2293                 vec![
2294                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
2295                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2296                     // Mutex to protect concurrent access as we write to choose slot and then read back status
2297                     &aml::Mutex::new("MLCK".into(), 0),
2298                     &aml::Name::new(
2299                         "_CRS".into(),
2300                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2301                             aml::AddressSpaceCachable::NotCacheable,
2302                             true,
2303                             acpi_address.0,
2304                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2305                         )]),
2306                     ),
2307                     // OpRegion and Fields map MMIO range into individual field values
2308                     &aml::OpRegion::new(
2309                         "MHPR".into(),
2310                         aml::OpRegionSpace::SystemMemory,
2311                         acpi_address.0 as usize,
2312                         MEMORY_MANAGER_ACPI_SIZE,
2313                     ),
2314                     &aml::Field::new(
2315                         "MHPR".into(),
2316                         aml::FieldAccessType::DWord,
2317                         aml::FieldUpdateRule::Preserve,
2318                         vec![
2319                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2320                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2321                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2322                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2323                         ],
2324                     ),
2325                     &aml::Field::new(
2326                         "MHPR".into(),
2327                         aml::FieldAccessType::DWord,
2328                         aml::FieldUpdateRule::Preserve,
2329                         vec![
2330                             aml::FieldEntry::Reserved(128),
2331                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2332                         ],
2333                     ),
2334                     &aml::Field::new(
2335                         "MHPR".into(),
2336                         aml::FieldAccessType::Byte,
2337                         aml::FieldUpdateRule::WriteAsZeroes,
2338                         vec![
2339                             aml::FieldEntry::Reserved(160),
2340                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2341                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2342                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2343                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2344                         ],
2345                     ),
2346                     &aml::Field::new(
2347                         "MHPR".into(),
2348                         aml::FieldAccessType::DWord,
2349                         aml::FieldUpdateRule::Preserve,
2350                         vec![
2351                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2352                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2353                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2354                         ],
2355                     ),
2356                     &MemoryMethods {
2357                         slots: self.hotplug_slots.len(),
2358                     },
2359                     &MemorySlots {
2360                         slots: self.hotplug_slots.len(),
2361                     },
2362                 ],
2363             )
2364             .append_aml_bytes(bytes);
2365         } else {
2366             aml::Device::new(
2367                 "_SB_.MHPC".into(),
2368                 vec![
2369                     &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
2370                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2371                     // Empty MSCN for GED
2372                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2373                 ],
2374             )
2375             .append_aml_bytes(bytes);
2376         }
2377 
2378         #[cfg(target_arch = "x86_64")]
2379         {
2380             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2381                 let min = sgx_epc_region.start().raw_value();
2382                 let max = min + sgx_epc_region.size() - 1;
2383                 // SGX EPC region
2384                 aml::Device::new(
2385                     "_SB_.EPC_".into(),
2386                     vec![
2387                         &aml::Name::new("_HID".into(), &aml::EisaName::new("INT0E0C")),
2388                         // QWORD describing the EPC region start and size
2389                         &aml::Name::new(
2390                             "_CRS".into(),
2391                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2392                                 aml::AddressSpaceCachable::NotCacheable,
2393                                 true,
2394                                 min,
2395                                 max,
2396                             )]),
2397                         ),
2398                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2399                     ],
2400                 )
2401                 .append_aml_bytes(bytes);
2402             }
2403         }
2404     }
2405 }
2406 
2407 impl Pausable for MemoryManager {}
2408 
2409 #[derive(Clone, Serialize, Deserialize, Versionize)]
2410 pub struct MemoryManagerSnapshotData {
2411     memory_ranges: MemoryRangeTable,
2412     guest_ram_mappings: Vec<GuestRamMapping>,
2413     start_of_device_area: u64,
2414     boot_ram: u64,
2415     current_ram: u64,
2416     arch_mem_regions: Vec<ArchMemRegion>,
2417     hotplug_slots: Vec<HotPlugState>,
2418     next_memory_slot: u32,
2419     selected_slot: usize,
2420     next_hotplug_slot: usize,
2421 }
2422 
2423 impl VersionMapped for MemoryManagerSnapshotData {}
2424 
2425 impl Snapshottable for MemoryManager {
2426     fn id(&self) -> String {
2427         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2428     }
2429 
2430     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2431         let memory_ranges = self.memory_range_table(true)?;
2432 
2433         // Store locally this list of ranges as it will be used through the
2434         // Transportable::send() implementation. The point is to avoid the
2435         // duplication of code regarding the creation of the path for each
2436         // region. The 'snapshot' step creates the list of memory regions,
2437         // including information about the need to copy a memory region or
2438         // not. This saves the 'send' step having to go through the same
2439         // process, and instead it can directly proceed with storing the
2440         // memory range content for the ranges requiring it.
2441         self.snapshot_memory_ranges = memory_ranges;
2442 
2443         Ok(Snapshot::from_data(SnapshotData::new_from_versioned_state(
2444             &self.snapshot_data(),
2445         )?))
2446     }
2447 }
2448 
2449 impl Transportable for MemoryManager {
2450     fn send(
2451         &self,
2452         _snapshot: &Snapshot,
2453         destination_url: &str,
2454     ) -> result::Result<(), MigratableError> {
2455         if self.snapshot_memory_ranges.is_empty() {
2456             return Ok(());
2457         }
2458 
2459         let mut memory_file_path = url_to_path(destination_url)?;
2460         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2461 
2462         // Create the snapshot file for the entire memory
2463         let mut memory_file = OpenOptions::new()
2464             .read(true)
2465             .write(true)
2466             .create_new(true)
2467             .open(memory_file_path)
2468             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2469 
2470         let guest_memory = self.guest_memory.memory();
2471 
2472         for range in self.snapshot_memory_ranges.regions() {
2473             let mut offset: u64 = 0;
2474             // Here we are manually handling the retry in case we can't write
2475             // the whole region at once because we can't use the implementation
2476             // from vm-memory::GuestMemory of write_all_to() as it is not
2477             // following the correct behavior. For more info about this issue
2478             // see: https://github.com/rust-vmm/vm-memory/issues/174
2479             loop {
2480                 let bytes_written = guest_memory
2481                     .write_to(
2482                         GuestAddress(range.gpa + offset),
2483                         &mut memory_file,
2484                         (range.length - offset) as usize,
2485                     )
2486                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2487                 offset += bytes_written as u64;
2488 
2489                 if offset == range.length {
2490                     break;
2491                 }
2492             }
2493         }
2494         Ok(())
2495     }
2496 }
2497 
2498 impl Migratable for MemoryManager {
2499     // Start the dirty log in the hypervisor (kvm/mshv).
2500     // Also, reset the dirty bitmap logged by the vmm.
2501     // Just before we do a bulk copy we want to start/clear the dirty log so that
2502     // pages touched during our bulk copy are tracked.
2503     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2504         self.vm.start_dirty_log().map_err(|e| {
2505             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2506         })?;
2507 
2508         for r in self.guest_memory.memory().iter() {
2509             r.bitmap().reset();
2510         }
2511 
2512         Ok(())
2513     }
2514 
2515     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2516         self.vm.stop_dirty_log().map_err(|e| {
2517             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2518         })?;
2519 
2520         Ok(())
2521     }
2522 
2523     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2524     // together in the table if they are contiguous.
2525     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2526         let mut table = MemoryRangeTable::default();
2527         for r in &self.guest_ram_mappings {
2528             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2529                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2530             })?;
2531             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2532             {
2533                 Some(region) => {
2534                     assert!(region.start_addr().raw_value() == r.gpa);
2535                     assert!(region.len() == r.size);
2536                     region.bitmap().get_and_reset()
2537                 }
2538                 None => {
2539                     return Err(MigratableError::MigrateSend(anyhow!(
2540                         "Error finding 'guest memory region' with address {:x}",
2541                         r.gpa
2542                     )))
2543                 }
2544             };
2545 
2546             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2547                 .iter()
2548                 .zip(vmm_dirty_bitmap.iter())
2549                 .map(|(x, y)| x | y)
2550                 .collect();
2551 
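            // Illustrative example (hypothetical bitmap contents): if the first
            // merged u64 is 0b0111, pages 0..=2 of this mapping are dirty and
            // from_bitmap() below collapses them into a single 12 KiB range
            // starting at r.gpa.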
2552             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2553 
2554             if sub_table.regions().is_empty() {
2555                 info!("Dirty Memory Range Table is empty");
2556             } else {
2557                 info!("Dirty Memory Range Table:");
2558                 for range in sub_table.regions() {
2559                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2560                 }
2561             }
2562 
2563             table.extend(sub_table);
2564         }
2565         Ok(table)
2566     }
2567 }
2568