xref: /cloud-hypervisor/vmm/src/memory_manager.rs (revision 24998c1672001b479386825acf09319b89c650e3)
1 // Copyright © 2019 Intel Corporation
2 //
3 // SPDX-License-Identifier: Apache-2.0
4 //
5 
6 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
7 use std::collections::BTreeMap;
8 use std::collections::HashMap;
9 use std::fs::{File, OpenOptions};
10 use std::io::{self};
11 use std::ops::{BitAnd, Deref, Not, Sub};
12 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
13 use std::os::fd::AsFd;
14 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
15 use std::path::PathBuf;
16 use std::sync::atomic::{AtomicU32, Ordering};
17 use std::sync::{Arc, Barrier, Mutex};
18 use std::{ffi, result, thread};
19 
20 use acpi_tables::{aml, Aml};
21 use anyhow::anyhow;
22 #[cfg(target_arch = "x86_64")]
23 use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
24 use arch::RegionType;
25 #[cfg(target_arch = "x86_64")]
26 use devices::ioapic;
27 #[cfg(target_arch = "aarch64")]
28 use hypervisor::HypervisorVmError;
29 use libc::_SC_NPROCESSORS_ONLN;
30 #[cfg(target_arch = "x86_64")]
31 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
32 use serde::{Deserialize, Serialize};
33 use thiserror::Error;
34 use tracer::trace_scoped;
35 use virtio_devices::BlocksState;
36 #[cfg(target_arch = "x86_64")]
37 use vm_allocator::GsiApic;
38 use vm_allocator::{AddressAllocator, MemorySlotAllocator, SystemAllocator};
39 use vm_device::BusDevice;
40 use vm_memory::bitmap::AtomicBitmap;
41 use vm_memory::guest_memory::FileOffset;
42 use vm_memory::mmap::MmapRegionError;
43 use vm_memory::{
44     Address, Error as MmapError, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
45     GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, ReadVolatile,
46 };
47 use vm_migration::protocol::{MemoryRange, MemoryRangeTable};
48 use vm_migration::{
49     Migratable, MigratableError, Pausable, Snapshot, SnapshotData, Snapshottable, Transportable,
50 };
51 
52 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
53 use crate::coredump::{
54     CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError,
55 };
56 use crate::migration::url_to_path;
57 #[cfg(target_arch = "x86_64")]
58 use crate::vm_config::SgxEpcConfig;
59 use crate::vm_config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
60 use crate::{GuestMemoryMmap, GuestRegionMmap, MEMORY_MANAGER_SNAPSHOT_ID};
61 
62 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;
63 
64 const DEFAULT_MEMORY_ZONE: &str = "mem0";
65 
66 const SNAPSHOT_FILENAME: &str = "memory-ranges";
67 
68 #[cfg(target_arch = "x86_64")]
69 const X86_64_IRQ_BASE: u32 = 5;
70 
71 #[cfg(target_arch = "x86_64")]
72 const SGX_PAGE_SIZE: u64 = 1 << 12;
73 
74 const HOTPLUG_COUNT: usize = 8;
75 
76 // Memory policy constants
77 const MPOL_BIND: u32 = 2;
78 const MPOL_MF_STRICT: u32 = 1;
79 const MPOL_MF_MOVE: u32 = 1 << 1;
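// Note: these values mirror the Linux mempolicy constants used with the mbind(2)
// syscall (see MemoryManager::mbind below). MPOL_BIND restricts allocations to the
// given node mask, while MPOL_MF_STRICT and MPOL_MF_MOVE control how pages already
// placed outside the mask are handled.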
80 
81 // Reserve 1 MiB for platform MMIO devices (e.g. ACPI control devices)
82 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20;
83 
84 const MAX_PREFAULT_THREAD_COUNT: usize = 16;
85 
86 #[derive(Clone, Default, Serialize, Deserialize)]
87 struct HotPlugState {
88     base: u64,
89     length: u64,
90     active: bool,
91     inserting: bool,
92     removing: bool,
93 }
94 
95 pub struct VirtioMemZone {
96     region: Arc<GuestRegionMmap>,
97     virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>,
98     hotplugged_size: u64,
99     hugepages: bool,
100     blocks_state: Arc<Mutex<BlocksState>>,
101 }
102 
103 impl VirtioMemZone {
104     pub fn region(&self) -> &Arc<GuestRegionMmap> {
105         &self.region
106     }
107     pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) {
108         self.virtio_device = Some(virtio_device);
109     }
110     pub fn hotplugged_size(&self) -> u64 {
111         self.hotplugged_size
112     }
113     pub fn hugepages(&self) -> bool {
114         self.hugepages
115     }
116     pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> {
117         &self.blocks_state
118     }
119     pub fn plugged_ranges(&self) -> MemoryRangeTable {
120         self.blocks_state
121             .lock()
122             .unwrap()
123             .memory_ranges(self.region.start_addr().raw_value(), true)
124     }
125 }
126 
127 #[derive(Default)]
128 pub struct MemoryZone {
129     regions: Vec<Arc<GuestRegionMmap>>,
130     virtio_mem_zone: Option<VirtioMemZone>,
131 }
132 
133 impl MemoryZone {
134     pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> {
135         &self.regions
136     }
137     pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> {
138         &self.virtio_mem_zone
139     }
140     pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> {
141         self.virtio_mem_zone.as_mut()
142     }
143 }
144 
145 pub type MemoryZones = HashMap<String, MemoryZone>;
146 
147 #[derive(Clone, Serialize, Deserialize)]
148 struct GuestRamMapping {
149     slot: u32,
150     gpa: u64,
151     size: u64,
152     zone_id: String,
153     virtio_mem: bool,
154     file_offset: u64,
155 }
156 
157 #[derive(Clone, Serialize, Deserialize)]
158 struct ArchMemRegion {
159     base: u64,
160     size: usize,
161     r_type: RegionType,
162 }
163 
164 pub struct MemoryManager {
165     boot_guest_memory: GuestMemoryMmap,
166     guest_memory: GuestMemoryAtomic<GuestMemoryMmap>,
167     next_memory_slot: Arc<AtomicU32>,
168     memory_slot_free_list: Arc<Mutex<Vec<u32>>>,
169     start_of_device_area: GuestAddress,
170     end_of_device_area: GuestAddress,
171     end_of_ram_area: GuestAddress,
172     pub vm: Arc<dyn hypervisor::Vm>,
173     hotplug_slots: Vec<HotPlugState>,
174     selected_slot: usize,
175     mergeable: bool,
176     allocator: Arc<Mutex<SystemAllocator>>,
177     hotplug_method: HotplugMethod,
178     boot_ram: u64,
179     current_ram: u64,
180     next_hotplug_slot: usize,
181     shared: bool,
182     hugepages: bool,
183     hugepage_size: Option<u64>,
184     prefault: bool,
185     thp: bool,
186     #[cfg(target_arch = "x86_64")]
187     sgx_epc_region: Option<SgxEpcRegion>,
188     user_provided_zones: bool,
189     snapshot_memory_ranges: MemoryRangeTable,
190     memory_zones: MemoryZones,
191     log_dirty: bool, // Enable dirty logging for created RAM regions
192     arch_mem_regions: Vec<ArchMemRegion>,
193     ram_allocator: AddressAllocator,
194     dynamic: bool,
195 
196     // Keep track of calls to create_userspace_mapping() for guest RAM.
197     // This is useful for getting the dirty pages, as we need to know the
198     // slots in which the mappings were created.
199     guest_ram_mappings: Vec<GuestRamMapping>,
200 
201     pub acpi_address: Option<GuestAddress>,
202     #[cfg(target_arch = "aarch64")]
203     uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>,
204 }
205 
206 #[derive(Error, Debug)]
207 pub enum Error {
208     /// Failed to create shared file.
209     #[error("Failed to create shared file")]
210     SharedFileCreate(#[source] io::Error),
211 
212     /// Failed to set shared file length.
213     #[error("Failed to set shared file length")]
214     SharedFileSetLen(#[source] io::Error),
215 
216     /// Mmap backed guest memory error
217     #[error("Mmap backed guest memory error")]
218     GuestMemory(#[source] MmapError),
219 
220     /// Failed to allocate a memory range.
221     #[error("Failed to allocate a memory range")]
222     MemoryRangeAllocation,
223 
224     /// Error from region creation
225     #[error("Error from region creation")]
226     GuestMemoryRegion(#[source] MmapRegionError),
227 
228     /// No ACPI slot available
229     #[error("No ACPI slot available")]
230     NoSlotAvailable,
231 
232     /// Not enough space in the hotplug RAM region
233     #[error("Not enough space in the hotplug RAM region")]
234     InsufficientHotplugRam,
235 
236     /// The requested hotplug memory addition is not a valid size
237     #[error("The requested hotplug memory addition is not a valid size")]
238     InvalidSize,
239 
240     /// Failed to create the user memory region.
241     #[error("Failed to create the user memory region")]
242     CreateUserMemoryRegion(#[source] hypervisor::HypervisorVmError),
243 
244     /// Failed to remove the user memory region.
245     #[error("Failed to remove the user memory region")]
246     RemoveUserMemoryRegion(#[source] hypervisor::HypervisorVmError),
247 
248     /// Failed to create EventFd.
249     #[error("Failed to create EventFd")]
250     EventFdFail(#[source] io::Error),
251 
252     /// Eventfd write error
253     #[error("Eventfd write error")]
254     EventfdError(#[source] io::Error),
255 
256     /// Failed to virtio-mem resize
257     #[error("Failed to virtio-mem resize")]
258     VirtioMemResizeFail(#[source] virtio_devices::mem::Error),
259 
260     /// Cannot restore VM
261     #[error("Cannot restore VM")]
262     Restore(#[source] MigratableError),
263 
264     /// Cannot restore VM because source URL is missing
265     #[error("Cannot restore VM because source URL is missing")]
266     RestoreMissingSourceUrl,
267 
268     /// Cannot create the system allocator
269     #[error("Cannot create the system allocator")]
270     CreateSystemAllocator,
271 
272     /// Invalid SGX EPC section size
273     #[cfg(target_arch = "x86_64")]
274     #[error("Invalid SGX EPC section size")]
275     EpcSectionSizeInvalid,
276 
277     /// Failed allocating SGX EPC region
278     #[cfg(target_arch = "x86_64")]
279     #[error("Failed allocating SGX EPC region")]
280     SgxEpcRangeAllocation,
281 
282     /// Failed opening SGX virtual EPC device
283     #[cfg(target_arch = "x86_64")]
284     #[error("Failed opening SGX virtual EPC device")]
285     SgxVirtEpcOpen(#[source] io::Error),
286 
287     /// Failed setting the SGX virtual EPC section size
288     #[cfg(target_arch = "x86_64")]
289     #[error("Failed setting the SGX virtual EPC section size")]
290     SgxVirtEpcFileSetLen(#[source] io::Error),
291 
292     /// Failed opening SGX provisioning device
293     #[cfg(target_arch = "x86_64")]
294     #[error("Failed opening SGX provisioning device")]
295     SgxProvisionOpen(#[source] io::Error),
296 
297     /// Failed enabling SGX provisioning
298     #[cfg(target_arch = "x86_64")]
299     #[error("Failed enabling SGX provisioning")]
300     SgxEnableProvisioning(#[source] hypervisor::HypervisorVmError),
301 
302     /// Failed creating a new MmapRegion instance.
303     #[cfg(target_arch = "x86_64")]
304     #[error("Failed creating a new MmapRegion instance")]
305     NewMmapRegion(#[source] vm_memory::mmap::MmapRegionError),
306 
307     /// No memory zones found.
308     #[error("No memory zones found")]
309     MissingMemoryZones,
310 
311     /// Memory configuration is not valid.
312     #[error("Memory configuration is not valid")]
313     InvalidMemoryParameters,
314 
315     /// Forbidden operation. Impossible to resize guest memory if it is
316     /// backed by user defined memory regions.
317     #[error("Impossible to resize guest memory if it is backed by user defined memory regions")]
318     InvalidResizeWithMemoryZones,
319 
320     /// It's invalid to try applying a NUMA policy to a memory zone that is
321     /// memory mapped with MAP_SHARED.
322     #[error("Invalid to try applying a NUMA policy to a memory zone that is memory mapped with MAP_SHARED")]
323     InvalidSharedMemoryZoneWithHostNuma,
324 
325     /// Failed applying NUMA memory policy.
326     #[error("Failed applying NUMA memory policy")]
327     ApplyNumaPolicy(#[source] io::Error),
328 
329     /// Memory zone identifier is not unique.
330     #[error("Memory zone identifier is not unique")]
331     DuplicateZoneId,
332 
333     /// No virtio-mem resizing handler found.
334     #[error("No virtio-mem resizing handler found")]
335     MissingVirtioMemHandler,
336 
337     /// Unknown memory zone.
338     #[error("Unknown memory zone")]
339     UnknownMemoryZone,
340 
341     /// Invalid size for resizing. The size can be anything except 0.
342     #[error("Invalid size for resizing. The size can be anything except 0")]
343     InvalidHotplugSize,
344 
345     /// Invalid hotplug method associated with memory zones resizing capability.
346     #[error("Invalid hotplug method associated with memory zones resizing capability")]
347     InvalidHotplugMethodWithMemoryZones,
348 
349     /// Could not find specified memory zone identifier from hash map.
350     #[error("Could not find specified memory zone identifier from hash map")]
351     MissingZoneIdentifier,
352 
353     /// Resizing the memory zone failed.
354     #[error("Resizing the memory zone failed")]
355     ResizeZone,
356 
357     /// Guest address overflow
358     #[error("Guest address overflow")]
359     GuestAddressOverFlow,
360 
361     /// Error opening snapshot file
362     #[error("Error opening snapshot file")]
363     SnapshotOpen(#[source] io::Error),
364 
365     /// Error copying snapshot into region
366     #[error("Error copying snapshot into region")]
367     SnapshotCopy(#[source] GuestMemoryError),
368 
369     /// Failed to allocate MMIO address
370     #[error("Failed to allocate MMIO address")]
371     AllocateMmioAddress,
372 
373     #[cfg(target_arch = "aarch64")]
374     /// Failed to create UEFI flash
375     #[error("Failed to create UEFI flash")]
376     CreateUefiFlash(#[source] HypervisorVmError),
377 
378     /// Using a directory as a backing file for memory is not supported
379     #[error("Using a directory as a backing file for memory is not supported")]
380     DirectoryAsBackingFileForMemory,
381 
382     /// Failed to stat filesystem
383     #[error("Failed to stat filesystem")]
384     GetFileSystemBlockSize(#[source] io::Error),
385 
386     /// Memory size is misaligned with default page size or its hugepage size
387     #[error("Memory size is misaligned with default page size or its hugepage size")]
388     MisalignedMemorySize,
389 }
390 
391 const ENABLE_FLAG: usize = 0;
392 const INSERTING_FLAG: usize = 1;
393 const REMOVING_FLAG: usize = 2;
394 const EJECT_FLAG: usize = 3;
395 
396 const BASE_OFFSET_LOW: u64 = 0;
397 const BASE_OFFSET_HIGH: u64 = 0x4;
398 const LENGTH_OFFSET_LOW: u64 = 0x8;
399 const LENGTH_OFFSET_HIGH: u64 = 0xC;
400 const STATUS_OFFSET: u64 = 0x14;
401 const SELECTION_OFFSET: u64 = 0;
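// A rough sketch of the resulting register layout, as implemented by the BusDevice
// read()/write() handlers below: offset 0x0 selects a hotplug slot on writes and
// returns the base-address low word on reads; 0x4 is the base high word; 0x8/0xC are
// the length low/high words; and 0x14 is the status register carrying the
// enable/inserting/removing/eject flag bits defined above.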
402 
403 // 64k is subtracted from the MMIO address space size. This is done for the
404 // following reasons:
405 //  - Reduce the addressable space size by at least 4k to work around a Linux
406 //    bug when the VMM allocates devices at the end of the addressable space
407 //  - Windows requires the addressable space size to be 64k aligned
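//  As an illustration (assumed value, not from the config): with phys_bits = 40 the
//  function below returns (1 << 40) - (1 << 16) = 0xFF_FFFF_0000, i.e. 64 KiB short
//  of the 1 TiB boundary, which also satisfies the 64k alignment requirement.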
408 fn mmio_address_space_size(phys_bits: u8) -> u64 {
409     (1 << phys_bits) - (1 << 16)
410 }
411 
412 // The `statfs` function can get information about a hugetlbfs mount, and the hugepage size
413 // is reported in the `f_bsize` field.
414 //
415 // See: https://github.com/torvalds/linux/blob/v6.3/fs/hugetlbfs/inode.c#L1169
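// For instance, on a hugetlbfs mount backed by 2 MiB pages, `f_bsize` would be reported
// as 2 MiB (2097152), while a regular filesystem typically reports 4096.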
416 fn statfs_get_bsize(path: &str) -> Result<u64, Error> {
417     let path = std::ffi::CString::new(path).map_err(|_| Error::InvalidMemoryParameters)?;
418     let mut buf = std::mem::MaybeUninit::<libc::statfs>::uninit();
419 
420     // SAFETY: FFI call with a valid path and buffer
421     let ret = unsafe { libc::statfs(path.as_ptr(), buf.as_mut_ptr()) };
422     if ret != 0 {
423         return Err(Error::GetFileSystemBlockSize(
424             std::io::Error::last_os_error(),
425         ));
426     }
427 
428     // SAFETY: `buf` is valid at this point
429     // Because this value is always positive, just convert it directly.
430     // Note that `f_bsize` is `i64` in glibc and `u64` in musl, so using `as u64` would trigger
431     // a `clippy` warning on musl targets.  To avoid the warning, `as _` is used instead of
432     // `as u64`.
433     let bsize = unsafe { (*buf.as_ptr()).f_bsize } as _;
434     Ok(bsize)
435 }
436 
437 fn memory_zone_get_align_size(zone: &MemoryZoneConfig) -> Result<u64, Error> {
438     // SAFETY: FFI call. Trivially safe.
439     let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
440 
441     // If there is no backing file and `hugepages` is disabled, just use the system page size.
442     if zone.file.is_none() && !zone.hugepages {
443         return Ok(page_size);
444     }
445 
446     // If `hugepages` is enabled and `hugepage_size` is specified, just use it directly.
447     if zone.hugepages && zone.hugepage_size.is_some() {
448         return Ok(zone.hugepage_size.unwrap());
449     }
450 
451     // There are two scenarios here:
452     //  - `hugepages` is enabled but `hugepage_size` is not specified:
453     //     Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
454     //  - The backing file is specified:
455     //     Call `statfs` for the file and get its `f_bsize`.  If the value is larger than the
456     //     normal page size, use `f_bsize` because the file is in a hugetlbfs.  If the value is
457     //     less than or equal to the normal page size, just use the page size.
458     let path = zone.file.as_ref().map_or(Ok("/dev/hugepages"), |pathbuf| {
459         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
460     })?;
461 
462     let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
463 
464     Ok(align_size)
465 }
466 
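// Note: the two alignment helpers below rely on `align` being a power of two (page and
// huge page sizes in this file always are). Quick illustration with assumed values:
// align_down(0x1234u64, 0x1000) == 0x1000, is_aligned(0x2000u64, 0x1000) == true,
// and is_aligned(0x1234u64, 0x1000) == false.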
467 #[inline]
468 fn align_down<T>(val: T, align: T) -> T
469 where
470     T: BitAnd<Output = T> + Not<Output = T> + Sub<Output = T> + From<u8>,
471 {
472     val & !(align - 1u8.into())
473 }
474 
475 #[inline]
476 fn is_aligned<T>(val: T, align: T) -> bool
477 where
478     T: BitAnd<Output = T> + Sub<Output = T> + From<u8> + PartialEq,
479 {
480     (val & (align - 1u8.into())) == 0u8.into()
481 }
482 
483 impl BusDevice for MemoryManager {
484     fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
485         if self.selected_slot < self.hotplug_slots.len() {
486             let state = &self.hotplug_slots[self.selected_slot];
487             match offset {
488                 BASE_OFFSET_LOW => {
489                     data.copy_from_slice(&state.base.to_le_bytes()[..4]);
490                 }
491                 BASE_OFFSET_HIGH => {
492                     data.copy_from_slice(&state.base.to_le_bytes()[4..]);
493                 }
494                 LENGTH_OFFSET_LOW => {
495                     data.copy_from_slice(&state.length.to_le_bytes()[..4]);
496                 }
497                 LENGTH_OFFSET_HIGH => {
498                     data.copy_from_slice(&state.length.to_le_bytes()[4..]);
499                 }
500                 STATUS_OFFSET => {
501                     // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
502                     data.fill(0);
503                     if state.active {
504                         data[0] |= 1 << ENABLE_FLAG;
505                     }
506                     if state.inserting {
507                         data[0] |= 1 << INSERTING_FLAG;
508                     }
509                     if state.removing {
510                         data[0] |= 1 << REMOVING_FLAG;
511                     }
512                 }
513                 _ => {
514                     warn!(
515                         "Unexpected offset for accessing memory manager device: {:#}",
516                         offset
517                     );
518                 }
519             }
520         } else {
521             warn!("Out of range memory slot: {}", self.selected_slot);
522         }
523     }
524 
525     fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> {
526         match offset {
527             SELECTION_OFFSET => {
528                 self.selected_slot = usize::from(data[0]);
529             }
530             STATUS_OFFSET => {
531                 if self.selected_slot < self.hotplug_slots.len() {
532                     let state = &mut self.hotplug_slots[self.selected_slot];
533                     // The ACPI code writes back a 1 to acknowledge the insertion
534                     if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting {
535                         state.inserting = false;
536                     }
537                     // Ditto for removal
538                     if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing {
539                         state.removing = false;
540                     }
541                     // Trigger removal of "DIMM"
542                     if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG {
543                         warn!("Ejection of memory not currently supported");
544                     }
545                 } else {
546                     warn!("Out of range memory slot: {}", self.selected_slot);
547                 }
548             }
549             _ => {
550                 warn!(
551                     "Unexpected offset for accessing memory manager device: {:#}",
552                     offset
553                 );
554             }
555         };
556         None
557     }
558 }
559 
560 impl MemoryManager {
561     /// Creates all memory regions based on the available RAM ranges defined
562     /// by `ram_regions`, and based on the description of the memory zones.
563     /// In practice, this function can perform multiple memory mappings of the
564     /// same backing file if there's a hole in the address space between two
565     /// RAM ranges.
566     ///
567     /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G)
568     /// and zones containing two zones (size 1G and size 4G).
569     ///
570     /// This function will create 3 resulting memory regions:
571     /// - First one mapping entirely the first memory zone on 0-1G range
572     /// - Second one mapping partially the second memory zone on 1G-3G range
573     /// - Third one mapping partially the second memory zone on 4G-6G range
574     ///
575     /// Also, all memory regions are page-size aligned (i.e. their sizes must
576     /// be multiples of the page size), which may leave an additional hole in
577     /// the address space when hugepages are used.
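    ///
    /// A sketch of the mapping for the example above (units in GiB, boundaries
    /// assumed for illustration):
    ///   zone of size 1G -> region [0, 1) from RAM range [0, 3)
    ///   zone of size 4G -> region [1, 3) from RAM range [0, 3)
    ///                      region [4, 6) from RAM range [4, 6)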
578     fn create_memory_regions_from_zones(
579         ram_regions: &[(GuestAddress, usize)],
580         zones: &[MemoryZoneConfig],
581         prefault: Option<bool>,
582         thp: bool,
583     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
584         let mut zone_iter = zones.iter();
585         let mut mem_regions = Vec::new();
586         let mut zone = zone_iter.next().ok_or(Error::MissingMemoryZones)?;
587         let mut zone_align_size = memory_zone_get_align_size(zone)?;
588         let mut zone_offset = 0u64;
589         let mut memory_zones = HashMap::new();
590 
591         if !is_aligned(zone.size, zone_align_size) {
592             return Err(Error::MisalignedMemorySize);
593         }
594 
595         // Add zone id to the list of memory zones.
596         memory_zones.insert(zone.id.clone(), MemoryZone::default());
597 
598         for ram_region in ram_regions.iter() {
599             let mut ram_region_offset = 0;
600             let mut exit = false;
601 
602             loop {
603                 let mut ram_region_consumed = false;
604                 let mut pull_next_zone = false;
605 
606                 let ram_region_available_size =
607                     align_down(ram_region.1 as u64 - ram_region_offset, zone_align_size);
608                 if ram_region_available_size == 0 {
609                     break;
610                 }
611                 let zone_sub_size = zone.size - zone_offset;
612 
613                 let file_offset = zone_offset;
614                 let region_start = ram_region
615                     .0
616                     .checked_add(ram_region_offset)
617                     .ok_or(Error::GuestAddressOverFlow)?;
618                 let region_size = if zone_sub_size <= ram_region_available_size {
619                     if zone_sub_size == ram_region_available_size {
620                         ram_region_consumed = true;
621                     }
622 
623                     ram_region_offset += zone_sub_size;
624                     pull_next_zone = true;
625 
626                     zone_sub_size
627                 } else {
628                     zone_offset += ram_region_available_size;
629                     ram_region_consumed = true;
630 
631                     ram_region_available_size
632                 };
633 
634                 info!(
635                     "create ram region for zone {}, region_start: {:#x}, region_size: {:#x}",
636                     zone.id,
637                     region_start.raw_value(),
638                     region_size
639                 );
640                 let region = MemoryManager::create_ram_region(
641                     &zone.file,
642                     file_offset,
643                     region_start,
644                     region_size as usize,
645                     prefault.unwrap_or(zone.prefault),
646                     zone.shared,
647                     zone.hugepages,
648                     zone.hugepage_size,
649                     zone.host_numa_node,
650                     None,
651                     thp,
652                 )?;
653 
654                 // Add region to the list of regions associated with the
655                 // current memory zone.
656                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
657                     memory_zone.regions.push(region.clone());
658                 }
659 
660                 mem_regions.push(region);
661 
662                 if pull_next_zone {
663                     // Get the next zone and reset the offset.
664                     zone_offset = 0;
665                     if let Some(z) = zone_iter.next() {
666                         zone = z;
667                     } else {
668                         exit = true;
669                         break;
670                     }
671                     zone_align_size = memory_zone_get_align_size(zone)?;
672                     if !is_aligned(zone.size, zone_align_size) {
673                         return Err(Error::MisalignedMemorySize);
674                     }
675 
676                     // Check if the zone id already exists. In case it does, throw
677                     // an error as we need unique identifiers. Otherwise, add
678                     // the new zone id to the list of memory zones.
679                     if memory_zones.contains_key(&zone.id) {
680                         error!(
681                             "Memory zone identifier '{}' found more than once. \
682                             It must be unique",
683                             zone.id,
684                         );
685                         return Err(Error::DuplicateZoneId);
686                     }
687                     memory_zones.insert(zone.id.clone(), MemoryZone::default());
688                 }
689 
690                 if ram_region_consumed {
691                     break;
692                 }
693             }
694 
695             if exit {
696                 break;
697             }
698         }
699 
700         Ok((mem_regions, memory_zones))
701     }
702 
703     // Restore both the GuestMemory regions and the MemoryZone zones.
704     fn restore_memory_regions_and_zones(
705         guest_ram_mappings: &[GuestRamMapping],
706         zones_config: &[MemoryZoneConfig],
707         prefault: Option<bool>,
708         mut existing_memory_files: HashMap<u32, File>,
709         thp: bool,
710     ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> {
711         let mut memory_regions = Vec::new();
712         let mut memory_zones = HashMap::new();
713 
714         for zone_config in zones_config {
715             memory_zones.insert(zone_config.id.clone(), MemoryZone::default());
716         }
717 
718         for guest_ram_mapping in guest_ram_mappings {
719             for zone_config in zones_config {
720                 if guest_ram_mapping.zone_id == zone_config.id {
721                     let region = MemoryManager::create_ram_region(
722                         if guest_ram_mapping.virtio_mem {
723                             &None
724                         } else {
725                             &zone_config.file
726                         },
727                         guest_ram_mapping.file_offset,
728                         GuestAddress(guest_ram_mapping.gpa),
729                         guest_ram_mapping.size as usize,
730                         prefault.unwrap_or(zone_config.prefault),
731                         zone_config.shared,
732                         zone_config.hugepages,
733                         zone_config.hugepage_size,
734                         zone_config.host_numa_node,
735                         existing_memory_files.remove(&guest_ram_mapping.slot),
736                         thp,
737                     )?;
738                     memory_regions.push(Arc::clone(&region));
739                     if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) {
740                         if guest_ram_mapping.virtio_mem {
741                             let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0);
742                             let region_size = region.len();
743                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
744                                 region,
745                                 virtio_device: None,
746                                 hotplugged_size,
747                                 hugepages: zone_config.hugepages,
748                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
749                             });
750                         } else {
751                             memory_zone.regions.push(region);
752                         }
753                     }
754                 }
755             }
756         }
757 
758         memory_regions.sort_by_key(|x| x.start_addr());
759 
760         Ok((memory_regions, memory_zones))
761     }
762 
763     fn fill_saved_regions(
764         &mut self,
765         file_path: PathBuf,
766         saved_regions: MemoryRangeTable,
767     ) -> Result<(), Error> {
768         if saved_regions.is_empty() {
769             return Ok(());
770         }
771 
772         // Open (read only) the snapshot file.
773         let mut memory_file = OpenOptions::new()
774             .read(true)
775             .open(file_path)
776             .map_err(Error::SnapshotOpen)?;
777 
778         let guest_memory = self.guest_memory.memory();
779         for range in saved_regions.regions() {
780             let mut offset: u64 = 0;
781             // Here we manually handle the retry in case we can't fill the
782             // whole region at once, because we can't use the read_exact_from()
783             // implementation from vm-memory::GuestMemory as it does not
784             // follow the correct behavior. For more info about this issue
785             // see: https://github.com/rust-vmm/vm-memory/issues/174
786             loop {
787                 let bytes_read = guest_memory
788                     .read_volatile_from(
789                         GuestAddress(range.gpa + offset),
790                         &mut memory_file,
791                         (range.length - offset) as usize,
792                     )
793                     .map_err(Error::SnapshotCopy)?;
794                 offset += bytes_read as u64;
795 
796                 if offset == range.length {
797                     break;
798                 }
799             }
800         }
801 
802         Ok(())
803     }
804 
805     fn validate_memory_config(
806         config: &MemoryConfig,
807         user_provided_zones: bool,
808     ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
809         let mut allow_mem_hotplug = false;
810 
811         if !user_provided_zones {
812             if config.zones.is_some() {
813                 error!(
814                     "User defined memory regions can't be provided if the \
815                     memory size is not 0"
816                 );
817                 return Err(Error::InvalidMemoryParameters);
818             }
819 
820             if config.hotplug_size.is_some() {
821                 allow_mem_hotplug = true;
822             }
823 
824             if let Some(hotplugged_size) = config.hotplugged_size {
825                 if let Some(hotplug_size) = config.hotplug_size {
826                     if hotplugged_size > hotplug_size {
827                         error!(
828                             "'hotplugged_size' {} can't be bigger than \
829                             'hotplug_size' {}",
830                             hotplugged_size, hotplug_size,
831                         );
832                         return Err(Error::InvalidMemoryParameters);
833                     }
834                 } else {
835                     error!(
836                         "Invalid to define 'hotplugged_size' when there is \
837                         no 'hotplug_size'"
838                     );
839                     return Err(Error::InvalidMemoryParameters);
840                 }
841                 if config.hotplug_method == HotplugMethod::Acpi {
842                     error!(
843                         "Invalid to define 'hotplugged_size' with hotplug \
844                         method 'acpi'"
845                     );
846                     return Err(Error::InvalidMemoryParameters);
847                 }
848             }
849 
850             // Create a single zone from the global memory config. This lets
851             // us reuse the codepath for user defined memory zones.
852             let zones = vec![MemoryZoneConfig {
853                 id: String::from(DEFAULT_MEMORY_ZONE),
854                 size: config.size,
855                 file: None,
856                 shared: config.shared,
857                 hugepages: config.hugepages,
858                 hugepage_size: config.hugepage_size,
859                 host_numa_node: None,
860                 hotplug_size: config.hotplug_size,
861                 hotplugged_size: config.hotplugged_size,
862                 prefault: config.prefault,
863             }];
864 
865             Ok((config.size, zones, allow_mem_hotplug))
866         } else {
867             if config.zones.is_none() {
868                 error!(
869                     "User defined memory regions must be provided if the \
870                     memory size is 0"
871                 );
872                 return Err(Error::MissingMemoryZones);
873             }
874 
875             // Safe to unwrap as we checked right above that some zones
876             // were provided.
877             let zones = config.zones.clone().unwrap();
878             if zones.is_empty() {
879                 return Err(Error::MissingMemoryZones);
880             }
881 
882             let mut total_ram_size: u64 = 0;
883             for zone in zones.iter() {
884                 total_ram_size += zone.size;
885 
886                 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
887                     error!(
888                         "Invalid to set host NUMA policy for a memory zone \
889                         backed by a regular file and mapped as 'shared'"
890                     );
891                     return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
892                 }
893 
894                 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
895                     error!("Invalid to set ACPI hotplug method for memory zones");
896                     return Err(Error::InvalidHotplugMethodWithMemoryZones);
897                 }
898 
899                 if let Some(hotplugged_size) = zone.hotplugged_size {
900                     if let Some(hotplug_size) = zone.hotplug_size {
901                         if hotplugged_size > hotplug_size {
902                             error!(
903                                 "'hotplugged_size' {} can't be bigger than \
904                                 'hotplug_size' {}",
905                                 hotplugged_size, hotplug_size,
906                             );
907                             return Err(Error::InvalidMemoryParameters);
908                         }
909                     } else {
910                         error!(
911                             "Invalid to define 'hotplugged_size' when there is \
912                             no 'hotplug_size' for a memory zone"
913                         );
914                         return Err(Error::InvalidMemoryParameters);
915                     }
916                     if config.hotplug_method == HotplugMethod::Acpi {
917                         error!(
918                             "Invalid to define 'hotplugged_size' with hotplug \
919                             method 'acpi'"
920                         );
921                         return Err(Error::InvalidMemoryParameters);
922                     }
923                 }
924             }
925 
926             Ok((total_ram_size, zones, allow_mem_hotplug))
927         }
928     }
929 
930     pub fn allocate_address_space(&mut self) -> Result<(), Error> {
931         let mut list = Vec::new();
932 
933         for (zone_id, memory_zone) in self.memory_zones.iter() {
934             let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
935                 memory_zone
936                     .regions()
937                     .iter()
938                     .map(|r| (r.clone(), false))
939                     .collect();
940 
941             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
942                 regions.push((virtio_mem_zone.region().clone(), true));
943             }
944 
945             list.push((zone_id.clone(), regions));
946         }
947 
948         for (zone_id, regions) in list {
949             for (region, virtio_mem) in regions {
950                 let slot = self.create_userspace_mapping(
951                     region.start_addr().raw_value(),
952                     region.len(),
953                     region.as_ptr() as u64,
954                     self.mergeable,
955                     false,
956                     self.log_dirty,
957                 )?;
958 
959                 let file_offset = if let Some(file_offset) = region.file_offset() {
960                     file_offset.start()
961                 } else {
962                     0
963                 };
964 
965                 self.guest_ram_mappings.push(GuestRamMapping {
966                     gpa: region.start_addr().raw_value(),
967                     size: region.len(),
968                     slot,
969                     zone_id: zone_id.clone(),
970                     virtio_mem,
971                     file_offset,
972                 });
973                 self.ram_allocator
974                     .allocate(Some(region.start_addr()), region.len(), None)
975                     .ok_or(Error::MemoryRangeAllocation)?;
976             }
977         }
978 
979         // Allocate SubRegion and Reserved address ranges.
980         for region in self.arch_mem_regions.iter() {
981             if region.r_type == RegionType::Ram {
982                 // Ignore the RAM type since ranges have already been allocated
983                 // based on the GuestMemory regions.
984                 continue;
985             }
986             self.ram_allocator
987                 .allocate(
988                     Some(GuestAddress(region.base)),
989                     region.size as GuestUsize,
990                     None,
991                 )
992                 .ok_or(Error::MemoryRangeAllocation)?;
993         }
994 
995         Ok(())
996     }
997 
998     #[cfg(target_arch = "aarch64")]
999     pub fn add_uefi_flash(&mut self) -> Result<(), Error> {
1000         // On AArch64, the UEFI binary requires a flash device at address 0.
1001         // 4 MiB of memory is mapped to simulate the flash.
1002         let uefi_mem_slot = self.allocate_memory_slot();
1003         let uefi_region = GuestRegionMmap::new(
1004             MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(),
1005             arch::layout::UEFI_START,
1006         )
1007         .unwrap();
1008         let uefi_mem_region = self.vm.make_user_memory_region(
1009             uefi_mem_slot,
1010             uefi_region.start_addr().raw_value(),
1011             uefi_region.len(),
1012             uefi_region.as_ptr() as u64,
1013             false,
1014             false,
1015         );
1016         self.vm
1017             .create_user_memory_region(uefi_mem_region)
1018             .map_err(Error::CreateUefiFlash)?;
1019 
1020         let uefi_flash =
1021             GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap());
1022 
1023         self.uefi_flash = Some(uefi_flash);
1024 
1025         Ok(())
1026     }
1027 
1028     #[allow(clippy::too_many_arguments)]
1029     pub fn new(
1030         vm: Arc<dyn hypervisor::Vm>,
1031         config: &MemoryConfig,
1032         prefault: Option<bool>,
1033         phys_bits: u8,
1034         #[cfg(feature = "tdx")] tdx_enabled: bool,
1035         restore_data: Option<&MemoryManagerSnapshotData>,
1036         existing_memory_files: Option<HashMap<u32, File>>,
1037         #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>,
1038     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1039         trace_scoped!("MemoryManager::new");
1040 
1041         let user_provided_zones = config.size == 0;
1042 
1043         let mmio_address_space_size = mmio_address_space_size(phys_bits);
1044         debug_assert_eq!(
1045             (((mmio_address_space_size) >> 16) << 16),
1046             mmio_address_space_size
1047         );
1048         let start_of_platform_device_area =
1049             GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE);
1050         let end_of_device_area = start_of_platform_device_area.unchecked_sub(1);
1051 
1052         let (ram_size, zones, allow_mem_hotplug) =
1053             Self::validate_memory_config(config, user_provided_zones)?;
1054 
1055         let (
1056             start_of_device_area,
1057             boot_ram,
1058             current_ram,
1059             arch_mem_regions,
1060             memory_zones,
1061             guest_memory,
1062             boot_guest_memory,
1063             hotplug_slots,
1064             next_memory_slot,
1065             selected_slot,
1066             next_hotplug_slot,
1067         ) = if let Some(data) = restore_data {
1068             let (regions, memory_zones) = Self::restore_memory_regions_and_zones(
1069                 &data.guest_ram_mappings,
1070                 &zones,
1071                 prefault,
1072                 existing_memory_files.unwrap_or_default(),
1073                 config.thp,
1074             )?;
1075             let guest_memory =
1076                 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?;
1077             let boot_guest_memory = guest_memory.clone();
1078             (
1079                 GuestAddress(data.start_of_device_area),
1080                 data.boot_ram,
1081                 data.current_ram,
1082                 data.arch_mem_regions.clone(),
1083                 memory_zones,
1084                 guest_memory,
1085                 boot_guest_memory,
1086                 data.hotplug_slots.clone(),
1087                 data.next_memory_slot,
1088                 data.selected_slot,
1089                 data.next_hotplug_slot,
1090             )
1091         } else {
1092             // Init guest memory
1093             let arch_mem_regions = arch::arch_memory_regions();
1094 
1095             let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
1096                 .iter()
1097                 .filter(|r| r.2 == RegionType::Ram)
1098                 .map(|r| (r.0, r.1))
1099                 .collect();
1100 
1101             let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions
1102                 .iter()
1103                 .map(|(a, b, c)| ArchMemRegion {
1104                     base: a.0,
1105                     size: *b,
1106                     r_type: *c,
1107                 })
1108                 .collect();
1109 
1110             let (mem_regions, mut memory_zones) =
1111                 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?;
1112 
1113             let mut guest_memory =
1114                 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?;
1115 
1116             let boot_guest_memory = guest_memory.clone();
1117 
1118             let mut start_of_device_area =
1119                 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?;
1120 
1121             // Update list of memory zones for resize.
1122             for zone in zones.iter() {
1123                 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
1124                     if let Some(hotplug_size) = zone.hotplug_size {
1125                         if hotplug_size == 0 {
1126                             error!("'hotplug_size' can't be 0");
1127                             return Err(Error::InvalidHotplugSize);
1128                         }
1129 
1130                         if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi {
1131                             start_of_device_area = start_of_device_area
1132                                 .checked_add(hotplug_size)
1133                                 .ok_or(Error::GuestAddressOverFlow)?;
1134                         } else {
1135                             // Alignment must be "natural", i.e. the same as the block size
1136                             let start_addr = GuestAddress(
1137                                 start_of_device_area
1138                                     .0
1139                                     .div_ceil(virtio_devices::VIRTIO_MEM_ALIGN_SIZE)
1140                                     * virtio_devices::VIRTIO_MEM_ALIGN_SIZE,
1141                             );
1142 
1143                             // When `prefault` is set by vm_restore, the memory manager
1144                             // will create the RAM region with the `prefault` option from
1145                             // the restore config rather than the same option from the zone.
1146                             let region = MemoryManager::create_ram_region(
1147                                 &None,
1148                                 0,
1149                                 start_addr,
1150                                 hotplug_size as usize,
1151                                 prefault.unwrap_or(zone.prefault),
1152                                 zone.shared,
1153                                 zone.hugepages,
1154                                 zone.hugepage_size,
1155                                 zone.host_numa_node,
1156                                 None,
1157                                 config.thp,
1158                             )?;
1159 
1160                             guest_memory = guest_memory
1161                                 .insert_region(Arc::clone(&region))
1162                                 .map_err(Error::GuestMemory)?;
1163 
1164                             let hotplugged_size = zone.hotplugged_size.unwrap_or(0);
1165                             let region_size = region.len();
1166                             memory_zone.virtio_mem_zone = Some(VirtioMemZone {
1167                                 region,
1168                                 virtio_device: None,
1169                                 hotplugged_size,
1170                                 hugepages: zone.hugepages,
1171                                 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))),
1172                             });
1173 
1174                             start_of_device_area = start_addr
1175                                 .checked_add(hotplug_size)
1176                                 .ok_or(Error::GuestAddressOverFlow)?;
1177                         }
1178                     }
1179                 } else {
1180                     return Err(Error::MissingZoneIdentifier);
1181                 }
1182             }
1183 
1184             let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT);
1185             hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default);
1186 
1187             (
1188                 start_of_device_area,
1189                 ram_size,
1190                 ram_size,
1191                 arch_mem_regions,
1192                 memory_zones,
1193                 guest_memory,
1194                 boot_guest_memory,
1195                 hotplug_slots,
1196                 0,
1197                 0,
1198                 0,
1199             )
1200         };
1201 
1202         let guest_memory = GuestMemoryAtomic::new(guest_memory);
1203 
1204         let allocator = Arc::new(Mutex::new(
1205             SystemAllocator::new(
1206                 GuestAddress(0),
1207                 1 << 16,
1208                 start_of_platform_device_area,
1209                 PLATFORM_DEVICE_AREA_SIZE,
1210                 #[cfg(target_arch = "x86_64")]
1211                 vec![GsiApic::new(
1212                     X86_64_IRQ_BASE,
1213                     ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE,
1214                 )],
1215             )
1216             .ok_or(Error::CreateSystemAllocator)?,
1217         ));
1218 
1219         #[cfg(not(feature = "tdx"))]
1220         let dynamic = true;
1221         #[cfg(feature = "tdx")]
1222         let dynamic = !tdx_enabled;
1223 
1224         let acpi_address = if dynamic
1225             && config.hotplug_method == HotplugMethod::Acpi
1226             && (config.hotplug_size.unwrap_or_default() > 0)
1227         {
1228             Some(
1229                 allocator
1230                     .lock()
1231                     .unwrap()
1232                     .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None)
1233                     .ok_or(Error::AllocateMmioAddress)?,
1234             )
1235         } else {
1236             None
1237         };
1238 
1239         // If running with SGX, the start of the device area and the end of the RAM area
1240         // may diverge, but at this point they are next to each other.
1241         let end_of_ram_area = start_of_device_area.unchecked_sub(1);
1242         let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap();
1243 
1244         #[allow(unused_mut)]
1245         let mut memory_manager = MemoryManager {
1246             boot_guest_memory,
1247             guest_memory,
1248             next_memory_slot: Arc::new(AtomicU32::new(next_memory_slot)),
1249             memory_slot_free_list: Arc::new(Mutex::new(Vec::new())),
1250             start_of_device_area,
1251             end_of_device_area,
1252             end_of_ram_area,
1253             vm,
1254             hotplug_slots,
1255             selected_slot,
1256             mergeable: config.mergeable,
1257             allocator,
1258             hotplug_method: config.hotplug_method,
1259             boot_ram,
1260             current_ram,
1261             next_hotplug_slot,
1262             shared: config.shared,
1263             hugepages: config.hugepages,
1264             hugepage_size: config.hugepage_size,
1265             prefault: config.prefault,
1266             #[cfg(target_arch = "x86_64")]
1267             sgx_epc_region: None,
1268             user_provided_zones,
1269             snapshot_memory_ranges: MemoryRangeTable::default(),
1270             memory_zones,
1271             guest_ram_mappings: Vec::new(),
1272             acpi_address,
1273             log_dirty: dynamic, // Cannot log dirty pages on a TD
1274             arch_mem_regions,
1275             ram_allocator,
1276             dynamic,
1277             #[cfg(target_arch = "aarch64")]
1278             uefi_flash: None,
1279             thp: config.thp,
1280         };
1281 
1282         #[cfg(target_arch = "x86_64")]
1283         if let Some(sgx_epc_config) = sgx_epc_config {
1284             memory_manager.setup_sgx(sgx_epc_config)?;
1285         }
1286 
1287         Ok(Arc::new(Mutex::new(memory_manager)))
1288     }
1289 
1290     pub fn new_from_snapshot(
1291         snapshot: &Snapshot,
1292         vm: Arc<dyn hypervisor::Vm>,
1293         config: &MemoryConfig,
1294         source_url: Option<&str>,
1295         prefault: bool,
1296         phys_bits: u8,
1297     ) -> Result<Arc<Mutex<MemoryManager>>, Error> {
1298         if let Some(source_url) = source_url {
1299             let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?;
1300             memory_file_path.push(String::from(SNAPSHOT_FILENAME));
1301 
1302             let mem_snapshot: MemoryManagerSnapshotData =
1303                 snapshot.to_state().map_err(Error::Restore)?;
1304 
1305             let mm = MemoryManager::new(
1306                 vm,
1307                 config,
1308                 Some(prefault),
1309                 phys_bits,
1310                 #[cfg(feature = "tdx")]
1311                 false,
1312                 Some(&mem_snapshot),
1313                 None,
1314                 #[cfg(target_arch = "x86_64")]
1315                 None,
1316             )?;
1317 
1318             mm.lock()
1319                 .unwrap()
1320                 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?;
1321 
1322             Ok(mm)
1323         } else {
1324             Err(Error::RestoreMissingSourceUrl)
1325         }
1326     }
1327 
1328     fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> {
1329         // SAFETY: FFI call with correct arguments
1330         let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) };
1331 
1332         if res < 0 {
1333             Err(io::Error::last_os_error())
1334         } else {
1335             Ok(res as RawFd)
1336         }
1337     }
1338 
1339     fn mbind(
1340         addr: *mut u8,
1341         len: u64,
1342         mode: u32,
1343         nodemask: Vec<u64>,
1344         maxnode: u64,
1345         flags: u32,
1346     ) -> Result<(), io::Error> {
1347         // SAFETY: FFI call with correct arguments
1348         let res = unsafe {
1349             libc::syscall(
1350                 libc::SYS_mbind,
1351                 addr as *mut libc::c_void,
1352                 len,
1353                 mode,
1354                 nodemask.as_ptr(),
1355                 maxnode,
1356                 flags,
1357             )
1358         };
1359 
1360         if res < 0 {
1361             Err(io::Error::last_os_error())
1362         } else {
1363             Ok(())
1364         }
1365     }
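    // Editor's illustrative sketch (not part of the original source): how the
    // nodemask/maxnode arguments passed to mbind() are built for a single host
    // NUMA node. This mirrors the logic used in create_ram_region() below; the
    // helper name is hypothetical.
    #[allow(dead_code)]
    fn example_nodemask_for_node(node: u32) -> (Vec<u64>, u64) {
        // One u64 word per 64 nodes, with the bit for `node` set.
        let mut nodemask: Vec<u64> = vec![0; (node as usize / 64) + 1];
        nodemask[(node / 64) as usize] |= 1u64 << (node % 64);
        // maxnode is the number of bits in the mask (node + 1), plus one more
        // to compensate for the kernel cutting off the last node.
        let maxnode = node as u64 + 1 + 1;
        // For node 2: nodemask == [0b100], maxnode == 4.
        (nodemask, maxnode)
    }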
1366 
1367     fn create_anonymous_file(
1368         size: usize,
1369         hugepages: bool,
1370         hugepage_size: Option<u64>,
1371     ) -> Result<FileOffset, Error> {
1372         let fd = Self::memfd_create(
1373             &ffi::CString::new("ch_ram").unwrap(),
1374             libc::MFD_CLOEXEC
1375                 | if hugepages {
1376                     libc::MFD_HUGETLB
1377                         | if let Some(hugepage_size) = hugepage_size {
1378                             /*
1379                              * From the Linux kernel:
1380                              * Several system calls take a flag to request "hugetlb" huge pages.
1381                              * Without further specification, these system calls will use the
1382                              * system's default huge page size.  If a system supports multiple
1383                              * huge page sizes, the desired huge page size can be specified in
1384                              * bits [26:31] of the flag arguments.  The value in these 6 bits
1385                              * will encode the log2 of the huge page size.
1386                              */
1387 
1388                             hugepage_size.trailing_zeros() << 26
1389                         } else {
1390                             // Use the system default huge page size
1391                             0
1392                         }
1393                 } else {
1394                     0
1395                 },
1396         )
1397         .map_err(Error::SharedFileCreate)?;
1398 
1399         // SAFETY: fd is valid
1400         let f = unsafe { File::from_raw_fd(fd) };
1401         f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;
1402 
1403         Ok(FileOffset::new(f, 0))
1404     }
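    // Editor's illustrative sketch (not part of the original source): how the
    // hugetlb page size is encoded into the memfd_create() flags above. The
    // log2 of the (power-of-two) page size goes into bits [26:31], so 2 MiB
    // encodes as 21 << 26 and 1 GiB as 30 << 26, matching libc::MFD_HUGE_2MB
    // and libc::MFD_HUGE_1GB respectively. The function name is hypothetical.
    #[allow(dead_code)]
    fn example_hugetlb_flags(hugepage_size: u64) -> u32 {
        assert!(hugepage_size.is_power_of_two());
        libc::MFD_CLOEXEC | libc::MFD_HUGETLB | (hugepage_size.trailing_zeros() << 26)
    }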
1405 
1406     fn open_backing_file(backing_file: &PathBuf, file_offset: u64) -> Result<FileOffset, Error> {
1407         if backing_file.is_dir() {
1408             Err(Error::DirectoryAsBackingFileForMemory)
1409         } else {
1410             let f = OpenOptions::new()
1411                 .read(true)
1412                 .write(true)
1413                 .open(backing_file)
1414                 .map_err(Error::SharedFileCreate)?;
1415 
1416             Ok(FileOffset::new(f, file_offset))
1417         }
1418     }
1419 
1420     #[allow(clippy::too_many_arguments)]
1421     pub fn create_ram_region(
1422         backing_file: &Option<PathBuf>,
1423         file_offset: u64,
1424         start_addr: GuestAddress,
1425         size: usize,
1426         prefault: bool,
1427         shared: bool,
1428         hugepages: bool,
1429         hugepage_size: Option<u64>,
1430         host_numa_node: Option<u32>,
1431         existing_memory_file: Option<File>,
1432         thp: bool,
1433     ) -> Result<Arc<GuestRegionMmap>, Error> {
1434         let mut mmap_flags = libc::MAP_NORESERVE;
1435 
1436         // The duplication of mmap_flags ORing here is unfortunate but it also makes
1437         // the complexity of the handling clear.
1438         let fo = if let Some(f) = existing_memory_file {
1439             // It must be MAP_SHARED as otherwise we wouldn't already have an FD
1440             mmap_flags |= libc::MAP_SHARED;
1441             Some(FileOffset::new(f, file_offset))
1442         } else if let Some(backing_file) = backing_file {
1443             if shared {
1444                 mmap_flags |= libc::MAP_SHARED;
1445             } else {
1446                 mmap_flags |= libc::MAP_PRIVATE;
1447             }
1448             Some(Self::open_backing_file(backing_file, file_offset)?)
1449         } else if shared || hugepages {
1450             // For hugepages we must also use MAP_SHARED otherwise we will trigger #4805
1451             // because the MAP_PRIVATE will trigger CoW against the backing file with
1452             // the VFIO pinning
1453             mmap_flags |= libc::MAP_SHARED;
1454             Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
1455         } else {
1456             mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
1457             None
1458         };
1459 
1460         let region = GuestRegionMmap::new(
1461             MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
1462                 .map_err(Error::GuestMemoryRegion)?,
1463             start_addr,
1464         )
1465         .map_err(Error::GuestMemory)?;
1466 
1467         // Apply NUMA policy if needed.
1468         if let Some(node) = host_numa_node {
1469             let addr = region.deref().as_ptr();
1470             let len = region.deref().size() as u64;
1471             let mode = MPOL_BIND;
1472             let mut nodemask: Vec<u64> = Vec::new();
1473             let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
1474 
1475             // Linux is kind of buggy in the way it interprets maxnode as it
1476             // will cut off the last node. That's why we have to add 1 to what
1477             // we would consider as the proper maxnode value.
1478             let maxnode = node as u64 + 1 + 1;
1479 
1480             // Allocate the right size for the vector.
1481             nodemask.resize((node as usize / 64) + 1, 0);
1482 
1483             // Fill the global bitmask through the nodemask vector.
1484             let idx = (node / 64) as usize;
1485             let shift = node % 64;
1486             nodemask[idx] |= 1u64 << shift;
1487 
1488             // Policies are enforced by using MPOL_MF_MOVE flag as it will
1489             // force the kernel to move all pages that might have been already
1490             // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is
1491             // used to throw an error if MPOL_MF_MOVE didn't succeed.
1492             // MPOL_BIND is the selected mode as it specifies a strict policy
1493             // that restricts memory allocation to the nodes specified in the
1494             // nodemask.
1495             Self::mbind(addr, len, mode, nodemask, maxnode, flags)
1496                 .map_err(Error::ApplyNumaPolicy)?;
1497         }
1498 
1499         // Prefault the region if needed, in parallel.
1500         if prefault {
1501             let page_size =
1502                 Self::get_prefault_align_size(backing_file, hugepages, hugepage_size)? as usize;
1503 
1504             if !is_aligned(size, page_size) {
1505                 warn!(
1506                     "Prefaulting memory size {} misaligned with page size {}",
1507                     size, page_size
1508                 );
1509             }
1510 
1511             let num_pages = size / page_size;
1512 
1513             let num_threads = Self::get_prefault_num_threads(page_size, num_pages);
1514 
1515             let pages_per_thread = num_pages / num_threads;
1516             let remainder = num_pages % num_threads;
1517 
1518             let barrier = Arc::new(Barrier::new(num_threads));
1519             thread::scope(|s| {
1520                 let r = &region;
1521                 for i in 0..num_threads {
1522                     let barrier = Arc::clone(&barrier);
1523                     s.spawn(move || {
1524                         // Wait until all threads have been spawned to avoid contention
1525                         // over mmap_sem between thread stack allocation and page faulting.
1526                         barrier.wait();
1527                         let pages = pages_per_thread + if i < remainder { 1 } else { 0 };
1528                         let offset =
1529                             page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
1530                         // SAFETY: FFI call with correct arguments
1531                         let ret = unsafe {
1532                             let addr = r.as_ptr().add(offset);
1533                             libc::madvise(addr as _, pages * page_size, libc::MADV_POPULATE_WRITE)
1534                         };
1535                         if ret != 0 {
1536                             let e = io::Error::last_os_error();
1537                             warn!("Failed to prefault pages: {}", e);
1538                         }
1539                     });
1540                 }
1541             });
1542         }
1543 
1544         if region.file_offset().is_none() && thp {
1545             info!(
1546                 "Anonymous mapping at 0x{:x} (size = 0x{:x})",
1547                 region.as_ptr() as u64,
1548                 size
1549             );
1550             // SAFETY: FFI call with correct arguments
1551             let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
1552             if ret != 0 {
1553                 let e = io::Error::last_os_error();
1554                 warn!("Failed to mark pages as THP eligible: {}", e);
1555             }
1556         }
1557 
1558         Ok(Arc::new(region))
1559     }
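    // Editor's illustrative sketch (not part of the original source): how the
    // prefault work above is split between threads. The first
    // `num_pages % num_threads` threads each take one extra page, and the
    // per-thread offset accounts for those extra pages so the chunks tile the
    // region exactly. The function name is hypothetical.
    #[allow(dead_code)]
    fn example_prefault_split(num_pages: usize, num_threads: usize, page_size: usize) {
        let pages_per_thread = num_pages / num_threads;
        let remainder = num_pages % num_threads;
        let mut covered = 0;
        for i in 0..num_threads {
            let pages = pages_per_thread + usize::from(i < remainder);
            let offset = page_size * ((i * pages_per_thread) + std::cmp::min(i, remainder));
            assert_eq!(offset, covered * page_size);
            covered += pages;
        }
        assert_eq!(covered, num_pages);
    }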
1560 
1561     // Duplicate of `memory_zone_get_align_size` that does not require a `zone`
1562     fn get_prefault_align_size(
1563         backing_file: &Option<PathBuf>,
1564         hugepages: bool,
1565         hugepage_size: Option<u64>,
1566     ) -> Result<u64, Error> {
1567         // SAFETY: FFI call. Trivially safe.
1568         let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as u64 };
1569         match (hugepages, hugepage_size, backing_file) {
1570             (false, _, _) => Ok(page_size),
1571             (true, Some(hugepage_size), _) => Ok(hugepage_size),
1572             (true, None, _) => {
1573                 // There are two scenarios here:
1574                 //  - `hugepages` is enabled but `hugepage_size` is not specified:
1575                 //     Call `statfs` for `/dev/hugepages` for getting the default size of hugepage
1576                 //  - The backing file is specified:
1577                 //     Call `statfs` for the file and get its `f_bsize`.  If the value is larger than the page
1578                 //     size of normal page, just use the `f_bsize` because the file is in a hugetlbfs.  If the
1579                 //     value is less than or equal to the page size, just use the page size.
1580                 let path = backing_file
1581                     .as_ref()
1582                     .map_or(Ok("/dev/hugepages"), |pathbuf| {
1583                         pathbuf.to_str().ok_or(Error::InvalidMemoryParameters)
1584                     })?;
1585                 let align_size = std::cmp::max(page_size, statfs_get_bsize(path)?);
1586                 Ok(align_size)
1587             }
1588         }
1589     }
1590 
1591     fn get_prefault_num_threads(page_size: usize, num_pages: usize) -> usize {
1592         let mut n: usize = 1;
1593 
1594         // Do not create more threads than processors available.
1595         // SAFETY: FFI call. Trivially safe.
1596         let procs = unsafe { libc::sysconf(_SC_NPROCESSORS_ONLN) };
1597         if procs > 0 {
1598             n = std::cmp::min(procs as usize, MAX_PREFAULT_THREAD_COUNT);
1599         }
1600 
1601         // Do not create more threads than pages being allocated.
1602         n = std::cmp::min(n, num_pages);
1603 
1604         // Do not create threads to allocate less than 64 MiB of memory.
1605         n = std::cmp::min(
1606             n,
1607             std::cmp::max(1, page_size * num_pages / (64 * (1 << 20))),
1608         );
1609 
1610         n
1611     }
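    // Editor's worked example (not part of the original source): prefaulting
    // 1 GiB of 4 KiB pages on a host with 32 online CPUs gives
    //   n = min(32, MAX_PREFAULT_THREAD_COUNT) = 16
    //   n = min(16, 262144 pages)              = 16
    //   n = min(16, max(1, 1 GiB / 64 MiB))    = min(16, 16) = 16
    // whereas a 128 MiB region yields max(1, 128 MiB / 64 MiB) = 2, so only
    // two prefault threads are spawned.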
1612 
1613     // Update the GuestMemoryMmap with the new range
1614     fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> {
1615         let guest_memory = self
1616             .guest_memory
1617             .memory()
1618             .insert_region(region)
1619             .map_err(Error::GuestMemory)?;
1620         self.guest_memory.lock().unwrap().replace(guest_memory);
1621 
1622         Ok(())
1623     }
1624 
1625     //
1626     // Calculate the start address of an area next to RAM.
1627     //
1628     // If memory hotplug is allowed, the start address needs to be aligned
1629     // (rounded-up) to 128MiB boundary.
1630     // If memory hotplug is not allowed, there is no alignment required.
1631     // And it must also start at the 64bit start.
1632     fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> {
1633         let mut start_addr = if allow_mem_hotplug {
1634             GuestAddress(mem_end.0 | ((128 << 20) - 1))
1635         } else {
1636             mem_end
1637         };
1638 
1639         start_addr = start_addr
1640             .checked_add(1)
1641             .ok_or(Error::GuestAddressOverFlow)?;
1642 
1643         #[cfg(not(target_arch = "riscv64"))]
1644         if mem_end < arch::layout::MEM_32BIT_RESERVED_START {
1645             return Ok(arch::layout::RAM_64BIT_START);
1646         }
1647 
1648         Ok(start_addr)
1649     }
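    // Editor's worked example (not part of the original source): with memory
    // hotplug allowed and mem_end = 0x1_3000_0000 (above the 32-bit reserved
    // hole, so the RAM_64BIT_START early return does not apply):
    //   mem_end | (128 MiB - 1) = 0x1_37FF_FFFF
    //   + 1                     = 0x1_3800_0000
    // i.e. the first 128 MiB boundary strictly above mem_end.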
1650 
1651     pub fn add_ram_region(
1652         &mut self,
1653         start_addr: GuestAddress,
1654         size: usize,
1655     ) -> Result<Arc<GuestRegionMmap>, Error> {
1656         // Allocate memory for the region
1657         let region = MemoryManager::create_ram_region(
1658             &None,
1659             0,
1660             start_addr,
1661             size,
1662             self.prefault,
1663             self.shared,
1664             self.hugepages,
1665             self.hugepage_size,
1666             None,
1667             None,
1668             self.thp,
1669         )?;
1670 
1671         // Map it into the guest
1672         let slot = self.create_userspace_mapping(
1673             region.start_addr().0,
1674             region.len(),
1675             region.as_ptr() as u64,
1676             self.mergeable,
1677             false,
1678             self.log_dirty,
1679         )?;
1680         self.guest_ram_mappings.push(GuestRamMapping {
1681             gpa: region.start_addr().raw_value(),
1682             size: region.len(),
1683             slot,
1684             zone_id: DEFAULT_MEMORY_ZONE.to_string(),
1685             virtio_mem: false,
1686             file_offset: 0,
1687         });
1688 
1689         self.add_region(Arc::clone(&region))?;
1690 
1691         Ok(region)
1692     }
1693 
1694     fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> {
1695         info!("Hotplugging new RAM: {}", size);
1696 
1697         // Check that there is a free slot
1698         if self.next_hotplug_slot >= HOTPLUG_COUNT {
1699             return Err(Error::NoSlotAvailable);
1700         }
1701 
1702         // "Inserted" DIMM must have a size that is a multiple of 128MiB
1703         if size % (128 << 20) != 0 {
1704             return Err(Error::InvalidSize);
1705         }
1706 
1707         let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?;
1708 
1709         if start_addr
1710             .checked_add((size - 1).try_into().unwrap())
1711             .unwrap()
1712             > self.end_of_ram_area
1713         {
1714             return Err(Error::InsufficientHotplugRam);
1715         }
1716 
1717         let region = self.add_ram_region(start_addr, size)?;
1718 
1719         // Add region to the list of regions associated with the default
1720         // memory zone.
1721         if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) {
1722             memory_zone.regions.push(Arc::clone(&region));
1723         }
1724 
1725         // Tell the allocator
1726         self.ram_allocator
1727             .allocate(Some(start_addr), size as GuestUsize, None)
1728             .ok_or(Error::MemoryRangeAllocation)?;
1729 
1730         // Update the slot so that it can be queried via the I/O port
1731         let slot = &mut self.hotplug_slots[self.next_hotplug_slot];
1732         slot.active = true;
1733         slot.inserting = true;
1734         slot.base = region.start_addr().0;
1735         slot.length = region.len();
1736 
1737         self.next_hotplug_slot += 1;
1738 
1739         Ok(region)
1740     }
1741 
1742     pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
1743         self.guest_memory.clone()
1744     }
1745 
1746     pub fn boot_guest_memory(&self) -> GuestMemoryMmap {
1747         self.boot_guest_memory.clone()
1748     }
1749 
1750     pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> {
1751         self.allocator.clone()
1752     }
1753 
1754     pub fn start_of_device_area(&self) -> GuestAddress {
1755         self.start_of_device_area
1756     }
1757 
1758     pub fn end_of_device_area(&self) -> GuestAddress {
1759         self.end_of_device_area
1760     }
1761 
1762     pub fn memory_slot_allocator(&mut self) -> MemorySlotAllocator {
1763         let memory_slot_free_list = Arc::clone(&self.memory_slot_free_list);
1764         let next_memory_slot = Arc::clone(&self.next_memory_slot);
1765         MemorySlotAllocator::new(next_memory_slot, memory_slot_free_list)
1766     }
1767 
1768     pub fn allocate_memory_slot(&mut self) -> u32 {
1769         self.memory_slot_allocator().next_memory_slot()
1770     }
1771 
1772     pub fn create_userspace_mapping(
1773         &mut self,
1774         guest_phys_addr: u64,
1775         memory_size: u64,
1776         userspace_addr: u64,
1777         mergeable: bool,
1778         readonly: bool,
1779         log_dirty: bool,
1780     ) -> Result<u32, Error> {
1781         let slot = self.allocate_memory_slot();
1782         let mem_region = self.vm.make_user_memory_region(
1783             slot,
1784             guest_phys_addr,
1785             memory_size,
1786             userspace_addr,
1787             readonly,
1788             log_dirty,
1789         );
1790 
1791         info!(
1792             "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}",
1793             guest_phys_addr, userspace_addr, memory_size, slot
1794         );
1795 
1796         self.vm
1797             .create_user_memory_region(mem_region)
1798             .map_err(Error::CreateUserMemoryRegion)?;
1799 
1800         // SAFETY: the address and size are valid since the
1801         // mmap succeeded.
1802         let ret = unsafe {
1803             libc::madvise(
1804                 userspace_addr as *mut libc::c_void,
1805                 memory_size as libc::size_t,
1806                 libc::MADV_DONTDUMP,
1807             )
1808         };
1809         if ret != 0 {
1810             let e = io::Error::last_os_error();
1811             warn!("Failed to mark mapping as MADV_DONTDUMP: {}", e);
1812         }
1813 
1814         // Mark the pages as mergeable if explicitly asked for.
1815         if mergeable {
1816             // SAFETY: the address and size are valid since the
1817             // mmap succeeded.
1818             let ret = unsafe {
1819                 libc::madvise(
1820                     userspace_addr as *mut libc::c_void,
1821                     memory_size as libc::size_t,
1822                     libc::MADV_MERGEABLE,
1823                 )
1824             };
1825             if ret != 0 {
1826                 let err = io::Error::last_os_error();
1827                 // Safe to unwrap because the error is constructed with
1828                 // last_os_error(), which ensures the output will be Some().
1829                 let errno = err.raw_os_error().unwrap();
1830                 if errno == libc::EINVAL {
1831                     warn!("kernel not configured with CONFIG_KSM");
1832                 } else {
1833                     warn!("madvise error: {}", err);
1834                 }
1835                 warn!("failed to mark pages as mergeable");
1836             }
1837         }
1838 
1839         info!(
1840             "Created userspace mapping: {:x} -> {:x} {:x}",
1841             guest_phys_addr, userspace_addr, memory_size
1842         );
1843 
1844         Ok(slot)
1845     }
1846 
1847     pub fn remove_userspace_mapping(
1848         &mut self,
1849         guest_phys_addr: u64,
1850         memory_size: u64,
1851         userspace_addr: u64,
1852         mergeable: bool,
1853         slot: u32,
1854     ) -> Result<(), Error> {
1855         let mem_region = self.vm.make_user_memory_region(
1856             slot,
1857             guest_phys_addr,
1858             memory_size,
1859             userspace_addr,
1860             false, /* readonly -- don't care */
1861             false, /* log dirty */
1862         );
1863 
1864         self.vm
1865             .remove_user_memory_region(mem_region)
1866             .map_err(Error::RemoveUserMemoryRegion)?;
1867 
1868         // Mark the pages as unmergeable if they were previously marked as
1869         // mergeable.
1870         if mergeable {
1871             // SAFETY: the address and size are valid as the region was
1872             // previously advised.
1873             let ret = unsafe {
1874                 libc::madvise(
1875                     userspace_addr as *mut libc::c_void,
1876                     memory_size as libc::size_t,
1877                     libc::MADV_UNMERGEABLE,
1878                 )
1879             };
1880             if ret != 0 {
1881                 let err = io::Error::last_os_error();
1882                 // Safe to unwrap because the error is constructed with
1883                 // last_os_error(), which ensures the output will be Some().
1884                 let errno = err.raw_os_error().unwrap();
1885                 if errno == libc::EINVAL {
1886                     warn!("kernel not configured with CONFIG_KSM");
1887                 } else {
1888                     warn!("madvise error: {}", err);
1889                 }
1890                 warn!("failed to mark pages as unmergeable");
1891             }
1892         }
1893 
1894         info!(
1895             "Removed userspace mapping: {:x} -> {:x} {:x}",
1896             guest_phys_addr, userspace_addr, memory_size
1897         );
1898 
1899         Ok(())
1900     }
1901 
1902     pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
1903         if let Some(memory_zone) = self.memory_zones.get_mut(id) {
1904             if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
1905                 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
1906                     virtio_mem_device
1907                         .lock()
1908                         .unwrap()
1909                         .resize(size)
1910                         .map_err(Error::VirtioMemResizeFail)?;
1911                 }
1912 
1913                 // Keep the hotplugged_size up to date.
1914                 virtio_mem_zone.hotplugged_size = size;
1915             } else {
1916                 error!("Failed resizing virtio-mem region: No virtio-mem handler");
1917                 return Err(Error::MissingVirtioMemHandler);
1918             }
1919 
1920             return Ok(());
1921         }
1922 
1923         error!("Failed resizing virtio-mem region: Unknown memory zone");
1924         Err(Error::UnknownMemoryZone)
1925     }
1926 
1927     /// In case this function resulted in adding a new memory region to the
1928     /// guest memory, the new region is returned to the caller. The virtio-mem
1929     /// use case never adds a new region as the whole hotpluggable memory has
1930     /// already been allocated at boot time.
1931     pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
1932         if self.user_provided_zones {
1933             error!(
1934                 "Not allowed to resize guest memory when backed with user \
1935                 defined memory zones."
1936             );
1937             return Err(Error::InvalidResizeWithMemoryZones);
1938         }
1939 
1940         let mut region: Option<Arc<GuestRegionMmap>> = None;
1941         match self.hotplug_method {
1942             HotplugMethod::VirtioMem => {
1943                 if desired_ram >= self.boot_ram {
1944                     if !self.dynamic {
1945                         return Ok(region);
1946                     }
1947 
1948                     self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?;
1949                     self.current_ram = desired_ram;
1950                 }
1951             }
1952             HotplugMethod::Acpi => {
1953                 if desired_ram > self.current_ram {
1954                     if !self.dynamic {
1955                         return Ok(region);
1956                     }
1957 
1958                     region =
1959                         Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?);
1960                     self.current_ram = desired_ram;
1961                 }
1962             }
1963         }
1964         Ok(region)
1965     }
1966 
1967     pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> {
1968         if !self.user_provided_zones {
1969             error!(
1970                 "Not allowed to resize guest memory zone when no zone is \
1971                 defined."
1972             );
1973             return Err(Error::ResizeZone);
1974         }
1975 
1976         self.virtio_mem_resize(id, virtio_mem_size)
1977     }
1978 
1979     #[cfg(target_arch = "x86_64")]
1980     pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> {
1981         let file = OpenOptions::new()
1982             .read(true)
1983             .open("/dev/sgx_provision")
1984             .map_err(Error::SgxProvisionOpen)?;
1985         self.vm
1986             .enable_sgx_attribute(file)
1987             .map_err(Error::SgxEnableProvisioning)?;
1988 
1989         // Go over each EPC section and verify its size is a 4k multiple. At
1990         // the same time, calculate the total size needed for the contiguous
1991         // EPC region.
1992         let mut epc_region_size = 0;
1993         for epc_section in sgx_epc_config.iter() {
1994             if epc_section.size == 0 {
1995                 return Err(Error::EpcSectionSizeInvalid);
1996             }
1997             if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 {
1998                 return Err(Error::EpcSectionSizeInvalid);
1999             }
2000 
2001             epc_region_size += epc_section.size;
2002         }
2003 
2004         // Place the SGX EPC region on a 4k boundary between the RAM and the device area
2005         let epc_region_start =
2006             GuestAddress(self.start_of_device_area.0.div_ceil(SGX_PAGE_SIZE) * SGX_PAGE_SIZE);
2007 
2008         self.start_of_device_area = epc_region_start
2009             .checked_add(epc_region_size)
2010             .ok_or(Error::GuestAddressOverFlow)?;
2011 
2012         let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize);
2013         info!(
2014             "SGX EPC region: 0x{:x} (0x{:x})",
2015             epc_region_start.0, epc_region_size
2016         );
2017 
2018         // Each section can be memory mapped into the allocated region.
2019         let mut epc_section_start = epc_region_start.raw_value();
2020         for epc_section in sgx_epc_config.iter() {
2021             let file = OpenOptions::new()
2022                 .read(true)
2023                 .write(true)
2024                 .open("/dev/sgx_vepc")
2025                 .map_err(Error::SgxVirtEpcOpen)?;
2026 
2027             let prot = PROT_READ | PROT_WRITE;
2028             let mut flags = MAP_NORESERVE | MAP_SHARED;
2029             if epc_section.prefault {
2030                 flags |= MAP_POPULATE;
2031             }
2032 
2033             // We can't use the vm-memory crate to perform the memory mapping
2034             // here as it would try to ensure the size of the backing file is
2035             // matching the size of the expected mapping. The /dev/sgx_vepc
2036             // device does not work that way, it provides a file descriptor
2037             // which is not matching the mapping size, as it's just a way to
2038             // let KVM know that an EPC section is being created for the guest.
2039             // SAFETY: FFI call with correct arguments
2040             let host_addr = unsafe {
2041                 libc::mmap(
2042                     std::ptr::null_mut(),
2043                     epc_section.size as usize,
2044                     prot,
2045                     flags,
2046                     file.as_raw_fd(),
2047                     0,
2048                 )
2049             };
2050 
2051             if host_addr == libc::MAP_FAILED {
2052                 error!(
2053                     "Could not add SGX EPC section (size 0x{:x})",
2054                     epc_section.size
2055                 );
2056                 return Err(Error::SgxEpcRangeAllocation);
2057             }
2058 
2059             info!(
2060                 "Adding SGX EPC section: 0x{:x} (0x{:x})",
2061                 epc_section_start, epc_section.size
2062             );
2063 
2064             let _mem_slot = self.create_userspace_mapping(
2065                 epc_section_start,
2066                 epc_section.size,
2067                 host_addr as u64,
2068                 false,
2069                 false,
2070                 false,
2071             )?;
2072 
2073             sgx_epc_region.insert(
2074                 epc_section.id.clone(),
2075                 SgxEpcSection::new(
2076                     GuestAddress(epc_section_start),
2077                     epc_section.size as GuestUsize,
2078                 ),
2079             );
2080 
2081             epc_section_start += epc_section.size;
2082         }
2083 
2084         self.sgx_epc_region = Some(sgx_epc_region);
2085 
2086         Ok(())
2087     }
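    // Editor's worked example (not part of the original source): if
    // start_of_device_area is 0x1_0000_1234, the EPC region is placed at
    //   0x1_0000_1234.div_ceil(0x1000) * 0x1000 = 0x1_0000_2000
    // (the next 4 KiB boundary), and start_of_device_area is then advanced
    // past the end of the EPC region.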
2088 
2089     #[cfg(target_arch = "x86_64")]
2090     pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> {
2091         &self.sgx_epc_region
2092     }
2093 
2094     pub fn is_hardlink(f: &File) -> bool {
2095         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
2096         // SAFETY: FFI call with correct arguments
2097         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
2098         if ret != 0 {
2099             error!("Couldn't fstat the backing file");
2100             return false;
2101         }
2102 
2103         // SAFETY: stat is valid
2104         unsafe { (*stat.as_ptr()).st_nlink as usize > 0 }
2105     }
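    // Editor's illustrative sketch (not part of the original source): the same
    // st_nlink check expressed through std. An anonymous memfd, such as the
    // one returned by create_anonymous_file(), has no directory entry and
    // reports nlink == 0, so is_hardlink() returns false for it, while a
    // regular backing file reports nlink >= 1. The function name is
    // hypothetical.
    #[allow(dead_code)]
    fn example_is_hardlink_std(f: &File) -> bool {
        use std::os::unix::fs::MetadataExt;
        f.metadata().map(|m| m.nlink() > 0).unwrap_or(false)
    }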
2106 
2107     pub fn memory_zones(&self) -> &MemoryZones {
2108         &self.memory_zones
2109     }
2110 
2111     pub fn memory_zones_mut(&mut self) -> &mut MemoryZones {
2112         &mut self.memory_zones
2113     }
2114 
2115     pub fn memory_range_table(
2116         &self,
2117         snapshot: bool,
2118     ) -> std::result::Result<MemoryRangeTable, MigratableError> {
2119         let mut table = MemoryRangeTable::default();
2120 
2121         for memory_zone in self.memory_zones.values() {
2122             if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
2123                 table.extend(virtio_mem_zone.plugged_ranges());
2124             }
2125 
2126             for region in memory_zone.regions() {
2127                 if snapshot {
2128                     if let Some(file_offset) = region.file_offset() {
2129                         if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED)
2130                             && Self::is_hardlink(file_offset.file())
2131                         {
2132                             // In this very specific case, we know the memory
2133                             // region is backed by a file on the host filesystem
2134                             // that can be accessed by the user, and additionally
2135                             // the mapping is shared, which means that modifications
2136                             // to the content are written to the actual file.
2137                             // When meeting these conditions, we can skip the
2138                             // copy of the memory content for this specific region,
2139                             // as we can assume the user will have it saved through
2140                             // the backing file already.
2141                             continue;
2142                         }
2143                     }
2144                 }
2145 
2146                 table.push(MemoryRange {
2147                     gpa: region.start_addr().raw_value(),
2148                     length: region.len(),
2149                 });
2150             }
2151         }
2152 
2153         Ok(table)
2154     }
2155 
2156     pub fn snapshot_data(&self) -> MemoryManagerSnapshotData {
2157         MemoryManagerSnapshotData {
2158             memory_ranges: self.snapshot_memory_ranges.clone(),
2159             guest_ram_mappings: self.guest_ram_mappings.clone(),
2160             start_of_device_area: self.start_of_device_area.0,
2161             boot_ram: self.boot_ram,
2162             current_ram: self.current_ram,
2163             arch_mem_regions: self.arch_mem_regions.clone(),
2164             hotplug_slots: self.hotplug_slots.clone(),
2165             next_memory_slot: self.next_memory_slot.load(Ordering::SeqCst),
2166             selected_slot: self.selected_slot,
2167             next_hotplug_slot: self.next_hotplug_slot,
2168         }
2169     }
2170 
2171     pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> {
2172         let mut memory_slot_fds = HashMap::new();
2173         for guest_ram_mapping in &self.guest_ram_mappings {
2174             let slot = guest_ram_mapping.slot;
2175             let guest_memory = self.guest_memory.memory();
2176             let file = guest_memory
2177                 .find_region(GuestAddress(guest_ram_mapping.gpa))
2178                 .unwrap()
2179                 .file_offset()
2180                 .unwrap()
2181                 .file();
2182             memory_slot_fds.insert(slot, file.as_raw_fd());
2183         }
2184         memory_slot_fds
2185     }
2186 
2187     pub fn acpi_address(&self) -> Option<GuestAddress> {
2188         self.acpi_address
2189     }
2190 
2191     pub fn num_guest_ram_mappings(&self) -> u32 {
2192         self.guest_ram_mappings.len() as u32
2193     }
2194 
2195     #[cfg(target_arch = "aarch64")]
2196     pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> {
2197         self.uefi_flash.as_ref().unwrap().clone()
2198     }
2199 
2200     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2201     pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions {
2202         let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone();
2203         mapping_sorted_by_gpa.sort_by_key(|m| m.gpa);
2204 
2205         let mut mem_offset_in_elf = mem_offset;
2206         let mut ram_maps = BTreeMap::new();
2207         for mapping in mapping_sorted_by_gpa.iter() {
2208             ram_maps.insert(
2209                 mapping.gpa,
2210                 CoredumpMemoryRegion {
2211                     mem_offset_in_elf,
2212                     mem_size: mapping.size,
2213                 },
2214             );
2215             mem_offset_in_elf += mapping.size;
2216         }
2217 
2218         CoredumpMemoryRegions { ram_maps }
2219     }
2220 
2221     #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))]
2222     pub fn coredump_iterate_save_mem(
2223         &mut self,
2224         dump_state: &DumpState,
2225     ) -> std::result::Result<(), GuestDebuggableError> {
2226         let snapshot_memory_ranges = self
2227             .memory_range_table(false)
2228             .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2229 
2230         if snapshot_memory_ranges.is_empty() {
2231             return Ok(());
2232         }
2233 
2234         let coredump_file = dump_state.file.as_ref().unwrap();
2235 
2236         let guest_memory = self.guest_memory.memory();
2237         let mut total_bytes: u64 = 0;
2238 
2239         for range in snapshot_memory_ranges.regions() {
2240             let mut offset: u64 = 0;
2241             loop {
2242                 let bytes_written = guest_memory
2243                     .write_volatile_to(
2244                         GuestAddress(range.gpa + offset),
2245                         &mut coredump_file.as_fd(),
2246                         (range.length - offset) as usize,
2247                     )
2248                     .map_err(|e| GuestDebuggableError::Coredump(e.into()))?;
2249                 offset += bytes_written as u64;
2250                 total_bytes += bytes_written as u64;
2251 
2252                 if offset == range.length {
2253                     break;
2254                 }
2255             }
2256         }
2257 
2258         debug!("coredump total bytes {}", total_bytes);
2259         Ok(())
2260     }
2261 
2262     pub fn receive_memory_regions<F>(
2263         &mut self,
2264         ranges: &MemoryRangeTable,
2265         fd: &mut F,
2266     ) -> std::result::Result<(), MigratableError>
2267     where
2268         F: ReadVolatile,
2269     {
2270         let guest_memory = self.guest_memory();
2271         let mem = guest_memory.memory();
2272 
2273         for range in ranges.regions() {
2274             let mut offset: u64 = 0;
2275             // Here we are manually handling the retry in case we can't read the
2276             // whole region at once because we can't use the implementation
2277             // from vm-memory::GuestMemory of read_exact_from() as it is not
2278             // following the correct behavior. For more info about this issue
2279             // see: https://github.com/rust-vmm/vm-memory/issues/174
2280             loop {
2281                 let bytes_read = mem
2282                     .read_volatile_from(
2283                         GuestAddress(range.gpa + offset),
2284                         fd,
2285                         (range.length - offset) as usize,
2286                     )
2287                     .map_err(|e| {
2288                         MigratableError::MigrateReceive(anyhow!(
2289                             "Error receiving memory from socket: {}",
2290                             e
2291                         ))
2292                     })?;
2293                 offset += bytes_read as u64;
2294 
2295                 if offset == range.length {
2296                     break;
2297                 }
2298             }
2299         }
2300 
2301         Ok(())
2302     }
2303 }
2304 
2305 struct MemoryNotify {
2306     slot_id: usize,
2307 }
2308 
2309 impl Aml for MemoryNotify {
2310     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2311         let object = aml::Path::new(&format!("M{:03}", self.slot_id));
2312         aml::If::new(
2313             &aml::Equal::new(&aml::Arg(0), &self.slot_id),
2314             vec![&aml::Notify::new(&object, &aml::Arg(1))],
2315         )
2316         .to_aml_bytes(sink)
2317     }
2318 }
2319 
2320 struct MemorySlot {
2321     slot_id: usize,
2322 }
2323 
2324 impl Aml for MemorySlot {
2325     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2326         aml::Device::new(
2327             format!("M{:03}", self.slot_id).as_str().into(),
2328             vec![
2329                 &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0C80")),
2330                 &aml::Name::new("_UID".into(), &self.slot_id),
2331                 /*
2332                 _STA return value:
2333                 Bit [0] – Set if the device is present.
2334                 Bit [1] – Set if the device is enabled and decoding its resources.
2335                 Bit [2] – Set if the device should be shown in the UI.
2336                 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
2337                 Bit [4] – Set if the battery is present.
2338                 Bits [31:5] – Reserved (must be cleared).
2339                 */
2340                 &aml::Method::new(
2341                     "_STA".into(),
2342                     0,
2343                     false,
2344                     // Call into MSTA method which will interrogate device
2345                     vec![&aml::Return::new(&aml::MethodCall::new(
2346                         "MSTA".into(),
2347                         vec![&self.slot_id],
2348                     ))],
2349                 ),
2350                 // Get details of memory
2351                 &aml::Method::new(
2352                     "_CRS".into(),
2353                     0,
2354                     false,
2355                     // Call into MCRS which provides actual memory details
2356                     vec![&aml::Return::new(&aml::MethodCall::new(
2357                         "MCRS".into(),
2358                         vec![&self.slot_id],
2359                     ))],
2360                 ),
2361             ],
2362         )
2363         .to_aml_bytes(sink)
2364     }
2365 }
2366 
2367 struct MemorySlots {
2368     slots: usize,
2369 }
2370 
2371 impl Aml for MemorySlots {
2372     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2373         for slot_id in 0..self.slots {
2374             MemorySlot { slot_id }.to_aml_bytes(sink);
2375         }
2376     }
2377 }
2378 
2379 struct MemoryMethods {
2380     slots: usize,
2381 }
2382 
2383 impl Aml for MemoryMethods {
2384     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2385         // Add "MTFY" notification method
2386         let mut memory_notifies = Vec::new();
2387         for slot_id in 0..self.slots {
2388             memory_notifies.push(MemoryNotify { slot_id });
2389         }
2390 
2391         let mut memory_notifies_refs: Vec<&dyn Aml> = Vec::new();
2392         for memory_notifier in memory_notifies.iter() {
2393             memory_notifies_refs.push(memory_notifier);
2394         }
2395 
2396         aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).to_aml_bytes(sink);
2397 
2398         // MSCN method
2399         aml::Method::new(
2400             "MSCN".into(),
2401             0,
2402             true,
2403             vec![
2404                 // Take lock defined above
2405                 &aml::Acquire::new("MLCK".into(), 0xffff),
2406                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2407                 &aml::While::new(
2408                     &aml::LessThan::new(&aml::Local(0), &self.slots),
2409                     vec![
2410                         // Write slot number (in first argument) to I/O port via field
2411                         &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
2412                         // Check if MINS bit is set (inserting)
2413                         &aml::If::new(
2414                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2415                             // Notify device if it is
2416                             vec![
2417                                 &aml::MethodCall::new(
2418                                     "MTFY".into(),
2419                                     vec![&aml::Local(0), &aml::ONE],
2420                                 ),
2421                                 // Reset MINS bit
2422                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
2423                             ],
2424                         ),
2425                         // Check if MRMV bit is set
2426                         &aml::If::new(
2427                             &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2428                             // Notify device if it is (with the eject constant 0x3)
2429                             vec![
2430                                 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
2431                                 // Reset MRMV bit
2432                                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
2433                             ],
2434                         ),
2435                         &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
2436                     ],
2437                 ),
2438                 // Release lock
2439                 &aml::Release::new("MLCK".into()),
2440             ],
2441         )
2442         .to_aml_bytes(sink);
2443 
2444         // Memory status method
2445         aml::Method::new(
2446             "MSTA".into(),
2447             1,
2448             true,
2449             vec![
2450                 // Take lock defined above
2451                 &aml::Acquire::new("MLCK".into(), 0xffff),
2452                 // Write slot number (in first argument) to I/O port via field
2453                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2454                 &aml::Store::new(&aml::Local(0), &aml::ZERO),
2455                 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
2456                 &aml::If::new(
2457                     &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
2458                     vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
2459                 ),
2460                 // Release lock
2461                 &aml::Release::new("MLCK".into()),
2462                 // Return 0 or 0xf
2463                 &aml::Return::new(&aml::Local(0)),
2464             ],
2465         )
2466         .to_aml_bytes(sink);
2467 
2468         // Memory range method
2469         aml::Method::new(
2470             "MCRS".into(),
2471             1,
2472             true,
2473             vec![
2474                 // Take lock defined above
2475                 &aml::Acquire::new("MLCK".into(), 0xffff),
2476                 // Write slot number (in first argument) to I/O port via field
2477                 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
2478                 &aml::Name::new(
2479                     "MR64".into(),
2480                     &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2481                         aml::AddressSpaceCacheable::Cacheable,
2482                         true,
2483                         0x0000_0000_0000_0000u64,
2484                         0xFFFF_FFFF_FFFF_FFFEu64,
2485                         None,
2486                     )]),
2487                 ),
2488                 &aml::CreateQWordField::new(
2489                     &aml::Path::new("MINL"),
2490                     &aml::Path::new("MR64"),
2491                     &14usize,
2492                 ),
2493                 &aml::CreateDWordField::new(
2494                     &aml::Path::new("MINH"),
2495                     &aml::Path::new("MR64"),
2496                     &18usize,
2497                 ),
2498                 &aml::CreateQWordField::new(
2499                     &aml::Path::new("MAXL"),
2500                     &aml::Path::new("MR64"),
2501                     &22usize,
2502                 ),
2503                 &aml::CreateDWordField::new(
2504                     &aml::Path::new("MAXH"),
2505                     &aml::Path::new("MR64"),
2506                     &26usize,
2507                 ),
2508                 &aml::CreateQWordField::new(
2509                     &aml::Path::new("LENL"),
2510                     &aml::Path::new("MR64"),
2511                     &38usize,
2512                 ),
2513                 &aml::CreateDWordField::new(
2514                     &aml::Path::new("LENH"),
2515                     &aml::Path::new("MR64"),
2516                     &42usize,
2517                 ),
2518                 &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
2519                 &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
2520                 &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
2521                 &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
2522                 &aml::Add::new(
2523                     &aml::Path::new("MAXL"),
2524                     &aml::Path::new("MINL"),
2525                     &aml::Path::new("LENL"),
2526                 ),
2527                 &aml::Add::new(
2528                     &aml::Path::new("MAXH"),
2529                     &aml::Path::new("MINH"),
2530                     &aml::Path::new("LENH"),
2531                 ),
2532                 &aml::If::new(
2533                     &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
2534                     vec![&aml::Add::new(
2535                         &aml::Path::new("MAXH"),
2536                         &aml::ONE,
2537                         &aml::Path::new("MAXH"),
2538                     )],
2539                 ),
2540                 &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
2541                 // Release lock
2542                 &aml::Release::new("MLCK".into()),
2543                 &aml::Return::new(&aml::Path::new("MR64")),
2544             ],
2545         )
2546         .to_aml_bytes(sink)
2547     }
2548 }
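// Editor's illustrative sketch (not part of the original source): the 64-bit
// arithmetic that the MCRS AML method above performs on 32-bit fields. The
// base (MIN) and length (LEN) are split into low/high DWORDs, the carry from
// the low addition is propagated by hand, and the low word is decremented to
// turn the exclusive end into the last valid address, mirroring the Add,
// If/LessThan and Subtract operators in MCRS. The function name is
// hypothetical.
#[allow(dead_code)]
fn example_mcrs_max(base: u64, length: u64) -> u64 {
    let (minl, minh) = (base as u32, (base >> 32) as u32);
    let (lenl, lenh) = (length as u32, (length >> 32) as u32);
    let (maxl, carry) = minl.overflowing_add(lenl);
    let maxh = minh.wrapping_add(lenh).wrapping_add(u32::from(carry));
    ((maxh as u64) << 32) | (maxl.wrapping_sub(1) as u64)
}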
2549 
2550 impl Aml for MemoryManager {
2551     fn to_aml_bytes(&self, sink: &mut dyn acpi_tables::AmlSink) {
2552         if let Some(acpi_address) = self.acpi_address {
2553             // Memory Hotplug Controller
2554             aml::Device::new(
2555                 "_SB_.MHPC".into(),
2556                 vec![
2557                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2558                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2559                     // Mutex to protect concurrent access as we write to choose slot and then read back status
2560                     &aml::Mutex::new("MLCK".into(), 0),
2561                     &aml::Name::new(
2562                         "_CRS".into(),
2563                         &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2564                             aml::AddressSpaceCacheable::NotCacheable,
2565                             true,
2566                             acpi_address.0,
2567                             acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
2568                             None,
2569                         )]),
2570                     ),
2571                     // OpRegion and Fields map MMIO range into individual field values
2572                     &aml::OpRegion::new(
2573                         "MHPR".into(),
2574                         aml::OpRegionSpace::SystemMemory,
2575                         &(acpi_address.0 as usize),
2576                         &MEMORY_MANAGER_ACPI_SIZE,
2577                     ),
2578                     &aml::Field::new(
2579                         "MHPR".into(),
2580                         aml::FieldAccessType::DWord,
2581                         aml::FieldLockRule::NoLock,
2582                         aml::FieldUpdateRule::Preserve,
2583                         vec![
2584                             aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
2585                             aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
2586                             aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
2587                             aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
2588                         ],
2589                     ),
2590                     &aml::Field::new(
2591                         "MHPR".into(),
2592                         aml::FieldAccessType::DWord,
2593                         aml::FieldLockRule::NoLock,
2594                         aml::FieldUpdateRule::Preserve,
2595                         vec![
2596                             aml::FieldEntry::Reserved(128),
2597                             aml::FieldEntry::Named(*b"MHPX", 32), // PXM
2598                         ],
2599                     ),
2600                     &aml::Field::new(
2601                         "MHPR".into(),
2602                         aml::FieldAccessType::Byte,
2603                         aml::FieldLockRule::NoLock,
2604                         aml::FieldUpdateRule::WriteAsZeroes,
2605                         vec![
2606                             aml::FieldEntry::Reserved(160),
2607                             aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
2608                             aml::FieldEntry::Named(*b"MINS", 1), // Inserting
2609                             aml::FieldEntry::Named(*b"MRMV", 1), // Removing
2610                             aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
2611                         ],
2612                     ),
2613                     &aml::Field::new(
2614                         "MHPR".into(),
2615                         aml::FieldAccessType::DWord,
2616                         aml::FieldLockRule::NoLock,
2617                         aml::FieldUpdateRule::Preserve,
2618                         vec![
2619                             aml::FieldEntry::Named(*b"MSEL", 32), // Selector
2620                             aml::FieldEntry::Named(*b"MOEV", 32), // Event
2621                             aml::FieldEntry::Named(*b"MOSC", 32), // OSC
2622                         ],
2623                     ),
2624                     &MemoryMethods {
2625                         slots: self.hotplug_slots.len(),
2626                     },
2627                     &MemorySlots {
2628                         slots: self.hotplug_slots.len(),
2629                     },
2630                 ],
2631             )
2632             .to_aml_bytes(sink);
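            // A rough sketch of the register layout implied by the Field
            // declarations above (all four Fields map the same 0x18-byte MHPR
            // OpRegion from offset 0; offsets are relative to acpi_address):
            //   0x00  MHBL (read) / MSEL (write)  base low 32 bits / slot selector
            //   0x04  MHBH (read) / MOEV (write)  base high 32 bits / OST event
            //   0x08  MHLL (read) / MOSC (write)  length low 32 bits / OST status
            //   0x0C  MHLH                        length high 32 bits
            //   0x10  MHPX                        proximity domain (PXM)
            //   0x14  MEN_/MINS/MRMV/MEJ0         status bits 0..3
            // The read/write overlap at 0x00-0x0B is assumed to be resolved by the
            // MemoryManager BusDevice read and write handlers decoding those offsets
            // differently, in the style of the QEMU memory hotplug interface.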
2633         } else {
2634             aml::Device::new(
2635                 "_SB_.MHPC".into(),
2636                 vec![
2637                     &aml::Name::new("_HID".into(), &aml::EISAName::new("PNP0A06")),
2638                     &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
2639                     // Empty MSCN for GED
2640                     &aml::Method::new("MSCN".into(), 0, true, vec![]),
2641                 ],
2642             )
2643             .to_aml_bytes(sink);
2644         }
2645 
2646         #[cfg(target_arch = "x86_64")]
2647         {
2648             if let Some(sgx_epc_region) = &self.sgx_epc_region {
2649                 let min = sgx_epc_region.start().raw_value();
2650                 let max = min + sgx_epc_region.size() - 1;
2651                 // SGX EPC region
2652                 aml::Device::new(
2653                     "_SB_.EPC_".into(),
2654                     vec![
2655                         &aml::Name::new("_HID".into(), &aml::EISAName::new("INT0E0C")),
2656                         // QWORD describing the EPC region start and size
2657                         &aml::Name::new(
2658                             "_CRS".into(),
2659                             &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
2660                                 aml::AddressSpaceCacheable::NotCacheable,
2661                                 true,
2662                                 min,
2663                                 max,
2664                                 None,
2665                             )]),
2666                         ),
2667                         &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
2668                     ],
2669                 )
2670                 .to_aml_bytes(sink);
2671             }
2672         }
2673     }
2674 }
2675 
2676 impl Pausable for MemoryManager {}
2677 
2678 #[derive(Clone, Serialize, Deserialize)]
2679 pub struct MemoryManagerSnapshotData {
2680     memory_ranges: MemoryRangeTable,
2681     guest_ram_mappings: Vec<GuestRamMapping>,
2682     start_of_device_area: u64,
2683     boot_ram: u64,
2684     current_ram: u64,
2685     arch_mem_regions: Vec<ArchMemRegion>,
2686     hotplug_slots: Vec<HotPlugState>,
2687     next_memory_slot: u32,
2688     selected_slot: usize,
2689     next_hotplug_slot: usize,
2690 }
2691 
2692 impl Snapshottable for MemoryManager {
2693     fn id(&self) -> String {
2694         MEMORY_MANAGER_SNAPSHOT_ID.to_string()
2695     }
2696 
2697     fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
2698         let memory_ranges = self.memory_range_table(true)?;
2699 
2700         // Store this list of ranges locally, as it will be used by the
2701         // Transportable::send() implementation. The point is to avoid
2702         // duplicating the code that builds the path for each region.
2703         // The 'snapshot' step creates the list of memory regions,
2704         // including whether or not each region's content needs to be
2705         // copied. This spares the 'send' step from repeating that work,
2706         // so it can proceed directly to storing the memory range content
2707         // for the ranges that require it.
2708         self.snapshot_memory_ranges = memory_ranges;
2709 
2710         Ok(Snapshot::from_data(SnapshotData::new_from_state(
2711             &self.snapshot_data(),
2712         )?))
2713     }
2714 }
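// A minimal, hypothetical usage sketch (the real caller lives elsewhere in the VMM):
// the comment in snapshot() above implies that send() below consumes the ranges cached
// by snapshot(), so the two calls are expected to happen on the same MemoryManager
// instance, in this order:
//
//     let snapshot = memory_manager.snapshot()?;                // fills snapshot_memory_ranges
//     memory_manager.send(&snapshot, "file:///tmp/snapshot")?;  // writes the "memory-ranges" file
//
// where `memory_manager` and the destination URL are illustrative placeholders.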
2715 
2716 impl Transportable for MemoryManager {
2717     fn send(
2718         &self,
2719         _snapshot: &Snapshot,
2720         destination_url: &str,
2721     ) -> result::Result<(), MigratableError> {
2722         if self.snapshot_memory_ranges.is_empty() {
2723             return Ok(());
2724         }
2725 
2726         let mut memory_file_path = url_to_path(destination_url)?;
2727         memory_file_path.push(String::from(SNAPSHOT_FILENAME));
2728 
2729         // Create the snapshot file for the entire memory
2730         let mut memory_file = OpenOptions::new()
2731             .read(true)
2732             .write(true)
2733             .create_new(true)
2734             .open(memory_file_path)
2735             .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2736 
2737         let guest_memory = self.guest_memory.memory();
2738 
2739         for range in self.snapshot_memory_ranges.regions() {
2740             let mut offset: u64 = 0;
2741             // Here we manually handle the retry in case the whole region can't
2742             // be transferred at once, because we can't rely on vm-memory's
2743             // GuestMemory::write_all_to() implementation as it doesn't follow
2744             // the correct behavior. For more info about this issue,
2745             // see: https://github.com/rust-vmm/vm-memory/issues/174
2746             loop {
2747                 let bytes_written = guest_memory
2748                     .write_volatile_to(
2749                         GuestAddress(range.gpa + offset),
2750                         &mut memory_file,
2751                         (range.length - offset) as usize,
2752                     )
2753                     .map_err(|e| MigratableError::MigrateSend(e.into()))?;
2754                 offset += bytes_written as u64;
2755 
2756                 if offset == range.length {
2757                     break;
2758                 }
2759             }
2760         }
2761         Ok(())
2762     }
2763 }
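// Descriptive note on the retry loop in send() above: `offset` tracks how many bytes
// of the current range have already been written, each iteration requests the
// remaining `range.length - offset` bytes, and the loop only exits once the whole
// range has reached the file, covering the partial writes that write_all_to() did
// not handle correctly at the time.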
2764 
2765 impl Migratable for MemoryManager {
2766     // Start the dirty log in the hypervisor (kvm/mshv).
2767     // Also, reset the dirty bitmap logged by the vmm.
2768     // Just before we do a bulk copy we want to start/clear the dirty log so that
2769     // pages touched during our bulk copy are tracked.
2770     fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2771         self.vm.start_dirty_log().map_err(|e| {
2772             MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
2773         })?;
2774 
2775         for r in self.guest_memory.memory().iter() {
2776             (**r).bitmap().reset();
2777         }
2778 
2779         Ok(())
2780     }
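    // A hedged sketch of the expected call order during live migration, based on the
    // comments above (the actual driver is elsewhere in the VMM and is assumed here):
    //
    //     mm.start_dirty_log()?;            // enable hypervisor tracking, reset VMM bitmaps
    //     /* bulk copy of guest RAM */
    //     loop {
    //         let table = mm.dirty_log()?;  // ranges dirtied since the previous pass
    //         /* re-send those ranges; stop once the table is small enough */
    //     }
    //     mm.stop_dirty_log()?;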
2781 
2782     fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
2783         self.vm.stop_dirty_log().map_err(|e| {
2784             MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
2785         })?;
2786 
2787         Ok(())
2788     }
2789 
2790     // Generate a table for the pages that are dirty. The dirty pages are collapsed
2791     // together in the table if they are contiguous.
2792     fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
2793         let mut table = MemoryRangeTable::default();
2794         for r in &self.guest_ram_mappings {
2795             let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
2796                 MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
2797             })?;
2798             let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
2799             {
2800                 Some(region) => {
2801                     assert!(region.start_addr().raw_value() == r.gpa);
2802                     assert!(region.len() == r.size);
2803                     (**region).bitmap().get_and_reset()
2804                 }
2805                 None => {
2806                     return Err(MigratableError::MigrateSend(anyhow!(
2807                         "Error finding 'guest memory region' with address {:x}",
2808                         r.gpa
2809                     )))
2810                 }
2811             };
2812 
2813             let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
2814                 .iter()
2815                 .zip(vmm_dirty_bitmap.iter())
2816                 .map(|(x, y)| x | y)
2817                 .collect();
2818 
2819             let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);
2820 
2821             if sub_table.regions().is_empty() {
2822                 info!("Dirty Memory Range Table is empty");
2823             } else {
2824                 info!("Dirty Memory Range Table:");
2825                 for range in sub_table.regions() {
2826                     info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
2827                 }
2828             }
2829 
2830             table.extend(sub_table);
2831         }
2832         Ok(table)
2833     }
2834 }
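// A small worked example of the collapsing described in dirty_log() above (values are
// illustrative): with 4 KiB pages and a mapping at gpa 0x0, a merged bitmap word of
// 0b0110 marks pages 1 and 2 as dirty, so MemoryRangeTable::from_bitmap() is expected
// to yield a single range { gpa: 0x1000, length: 0x2000 } rather than two 4 KiB entries.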
2835