1 // Copyright © 2019 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 // 5 #[cfg(target_arch = "x86_64")] 6 use crate::config::SgxEpcConfig; 7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig}; 8 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 9 use crate::coredump::{ 10 CoredumpMemoryRegion, CoredumpMemoryRegions, DumpState, GuestDebuggableError, 11 }; 12 use crate::migration::url_to_path; 13 use crate::MEMORY_MANAGER_SNAPSHOT_ID; 14 use crate::{GuestMemoryMmap, GuestRegionMmap}; 15 use acpi_tables::{aml, aml::Aml}; 16 use anyhow::anyhow; 17 #[cfg(target_arch = "x86_64")] 18 use arch::x86_64::{SgxEpcRegion, SgxEpcSection}; 19 use arch::{layout, RegionType}; 20 #[cfg(target_arch = "x86_64")] 21 use devices::ioapic; 22 #[cfg(target_arch = "aarch64")] 23 use hypervisor::HypervisorVmError; 24 #[cfg(target_arch = "x86_64")] 25 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE}; 26 use serde::{Deserialize, Serialize}; 27 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 28 use std::collections::BTreeMap; 29 use std::collections::HashMap; 30 use std::convert::TryInto; 31 use std::ffi; 32 use std::fs::{File, OpenOptions}; 33 use std::io::{self, Read}; 34 use std::ops::Deref; 35 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; 36 use std::path::PathBuf; 37 use std::result; 38 use std::sync::{Arc, Barrier, Mutex}; 39 use tracer::trace_scoped; 40 use versionize::{VersionMap, Versionize, VersionizeResult}; 41 use versionize_derive::Versionize; 42 use virtio_devices::BlocksState; 43 #[cfg(target_arch = "x86_64")] 44 use vm_allocator::GsiApic; 45 use vm_allocator::{AddressAllocator, SystemAllocator}; 46 use vm_device::BusDevice; 47 use vm_memory::bitmap::AtomicBitmap; 48 use vm_memory::guest_memory::FileOffset; 49 use vm_memory::{ 50 mmap::MmapRegionError, Address, Bytes, Error as MmapError, GuestAddress, GuestAddressSpace, 51 GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, 52 }; 53 use vm_migration::{ 54 protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable, 55 Snapshot, SnapshotData, Snapshottable, Transportable, VersionMapped, 56 }; 57 58 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18; 59 60 const DEFAULT_MEMORY_ZONE: &str = "mem0"; 61 62 const SNAPSHOT_FILENAME: &str = "memory-ranges"; 63 64 #[cfg(target_arch = "x86_64")] 65 const X86_64_IRQ_BASE: u32 = 5; 66 67 #[cfg(target_arch = "x86_64")] 68 const SGX_PAGE_SIZE: u64 = 1 << 12; 69 70 const HOTPLUG_COUNT: usize = 8; 71 72 // Memory policy constants 73 const MPOL_BIND: u32 = 2; 74 const MPOL_MF_STRICT: u32 = 1; 75 const MPOL_MF_MOVE: u32 = 1 << 1; 76 77 // Reserve 1 MiB for platform MMIO devices (e.g. 
ACPI control devices) 78 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20; 79 80 #[derive(Clone, Default, Serialize, Deserialize, Versionize)] 81 struct HotPlugState { 82 base: u64, 83 length: u64, 84 active: bool, 85 inserting: bool, 86 removing: bool, 87 } 88 89 pub struct VirtioMemZone { 90 region: Arc<GuestRegionMmap>, 91 virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>, 92 hotplugged_size: u64, 93 hugepages: bool, 94 blocks_state: Arc<Mutex<BlocksState>>, 95 } 96 97 impl VirtioMemZone { 98 pub fn region(&self) -> &Arc<GuestRegionMmap> { 99 &self.region 100 } 101 pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) { 102 self.virtio_device = Some(virtio_device); 103 } 104 pub fn hotplugged_size(&self) -> u64 { 105 self.hotplugged_size 106 } 107 pub fn hugepages(&self) -> bool { 108 self.hugepages 109 } 110 pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> { 111 &self.blocks_state 112 } 113 pub fn plugged_ranges(&self) -> MemoryRangeTable { 114 self.blocks_state 115 .lock() 116 .unwrap() 117 .memory_ranges(self.region.start_addr().raw_value(), true) 118 } 119 } 120 121 #[derive(Default)] 122 pub struct MemoryZone { 123 regions: Vec<Arc<GuestRegionMmap>>, 124 virtio_mem_zone: Option<VirtioMemZone>, 125 } 126 127 impl MemoryZone { 128 pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> { 129 &self.regions 130 } 131 pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> { 132 &self.virtio_mem_zone 133 } 134 pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> { 135 self.virtio_mem_zone.as_mut() 136 } 137 } 138 139 pub type MemoryZones = HashMap<String, MemoryZone>; 140 141 #[derive(Clone, Serialize, Deserialize, Versionize)] 142 struct GuestRamMapping { 143 slot: u32, 144 gpa: u64, 145 size: u64, 146 zone_id: String, 147 virtio_mem: bool, 148 file_offset: u64, 149 } 150 151 #[derive(Clone, Serialize, Deserialize, Versionize)] 152 struct ArchMemRegion { 153 base: u64, 154 size: usize, 155 r_type: RegionType, 156 } 157 158 pub struct MemoryManager { 159 boot_guest_memory: GuestMemoryMmap, 160 guest_memory: GuestMemoryAtomic<GuestMemoryMmap>, 161 next_memory_slot: u32, 162 start_of_device_area: GuestAddress, 163 end_of_device_area: GuestAddress, 164 end_of_ram_area: GuestAddress, 165 pub vm: Arc<dyn hypervisor::Vm>, 166 hotplug_slots: Vec<HotPlugState>, 167 selected_slot: usize, 168 mergeable: bool, 169 allocator: Arc<Mutex<SystemAllocator>>, 170 hotplug_method: HotplugMethod, 171 boot_ram: u64, 172 current_ram: u64, 173 next_hotplug_slot: usize, 174 shared: bool, 175 hugepages: bool, 176 hugepage_size: Option<u64>, 177 prefault: bool, 178 thp: bool, 179 #[cfg(target_arch = "x86_64")] 180 sgx_epc_region: Option<SgxEpcRegion>, 181 user_provided_zones: bool, 182 snapshot_memory_ranges: MemoryRangeTable, 183 memory_zones: MemoryZones, 184 log_dirty: bool, // Enable dirty logging for created RAM regions 185 arch_mem_regions: Vec<ArchMemRegion>, 186 ram_allocator: AddressAllocator, 187 dynamic: bool, 188 189 // Keep track of calls to create_userspace_mapping() for guest RAM. 190 // This is useful for getting the dirty pages as we need to know the 191 // slots that the mapping is created in. 192 guest_ram_mappings: Vec<GuestRamMapping>, 193 194 pub acpi_address: Option<GuestAddress>, 195 #[cfg(target_arch = "aarch64")] 196 uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>, 197 } 198 199 #[derive(Debug)] 200 pub enum Error { 201 /// Failed to create shared file. 202 SharedFileCreate(io::Error), 203 204 /// Failed to set shared file length. 
    SharedFileSetLen(io::Error),

    /// Mmap backed guest memory error
    GuestMemory(MmapError),

    /// Failed to allocate a memory range.
    MemoryRangeAllocation,

    /// Error from region creation
    GuestMemoryRegion(MmapRegionError),

    /// No ACPI slot available
    NoSlotAvailable,

    /// Not enough space in the hotplug RAM region
    InsufficientHotplugRam,

    /// The requested hotplug memory addition is not a valid size
    InvalidSize,

    /// Failed to create the user memory region.
    CreateUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to remove the user memory region.
    RemoveUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to create EventFd.
    EventFdFail(io::Error),

    /// Eventfd write error
    EventfdError(io::Error),

    /// Failed to resize virtio-mem
    VirtioMemResizeFail(virtio_devices::mem::Error),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot restore VM because source URL is missing
    RestoreMissingSourceUrl,

    /// Cannot create the system allocator
    CreateSystemAllocator,

    /// Invalid SGX EPC section size
    #[cfg(target_arch = "x86_64")]
    EpcSectionSizeInvalid,

    /// Failed allocating SGX EPC region
    #[cfg(target_arch = "x86_64")]
    SgxEpcRangeAllocation,

    /// Failed opening SGX virtual EPC device
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcOpen(io::Error),

    /// Failed setting the SGX virtual EPC section size
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcFileSetLen(io::Error),

    /// Failed opening SGX provisioning device
    #[cfg(target_arch = "x86_64")]
    SgxProvisionOpen(io::Error),

    /// Failed enabling SGX provisioning
    #[cfg(target_arch = "x86_64")]
    SgxEnableProvisioning(hypervisor::HypervisorVmError),

    /// Failed creating a new MmapRegion instance.
    #[cfg(target_arch = "x86_64")]
    NewMmapRegion(vm_memory::mmap::MmapRegionError),

    /// No memory zones found.
    MissingMemoryZones,

    /// Memory configuration is not valid.
    InvalidMemoryParameters,

    /// Forbidden operation. Impossible to resize guest memory if it is
    /// backed by user defined memory regions.
    InvalidResizeWithMemoryZones,

    /// It's invalid to try applying a NUMA policy to a memory zone that is
    /// memory mapped with MAP_SHARED.
    InvalidSharedMemoryZoneWithHostNuma,

    /// Failed applying NUMA memory policy.
    ApplyNumaPolicy(io::Error),

    /// Memory zone identifier is not unique.
    DuplicateZoneId,

    /// No virtio-mem resizing handler found.
    MissingVirtioMemHandler,

    /// Unknown memory zone.
    UnknownMemoryZone,

    /// Invalid size for resizing. It can be any value except 0.
    InvalidHotplugSize,

    /// Invalid hotplug method associated with memory zones resizing capability.
    InvalidHotplugMethodWithMemoryZones,

    /// Could not find specified memory zone identifier from hash map.
    MissingZoneIdentifier,

    /// Resizing the memory zone failed.
    ResizeZone,

    /// Guest address overflow
    GuestAddressOverFlow,

    /// Error opening snapshot file
    SnapshotOpen(io::Error),

    /// Error copying snapshot into region
    SnapshotCopy(GuestMemoryError),

    /// Failed to allocate MMIO address
    AllocateMmioAddress,

    #[cfg(target_arch = "aarch64")]
    /// Failed to create UEFI flash
    CreateUefiFlash(HypervisorVmError),
}

const ENABLE_FLAG: usize = 0;
const INSERTING_FLAG: usize = 1;
const REMOVING_FLAG: usize = 2;
const EJECT_FLAG: usize = 3;

const BASE_OFFSET_LOW: u64 = 0;
const BASE_OFFSET_HIGH: u64 = 0x4;
const LENGTH_OFFSET_LOW: u64 = 0x8;
const LENGTH_OFFSET_HIGH: u64 = 0xC;
const STATUS_OFFSET: u64 = 0x14;
const SELECTION_OFFSET: u64 = 0;

// 64 KiB is subtracted from the MMIO address space size for the following
// reasons:
//  - Reduce the addressable space size by at least 4k to work around a Linux
//    bug when the VMM allocates devices at the end of the addressable space
//  - Windows requires the addressable space size to be 64k aligned
fn mmio_address_space_size(phys_bits: u8) -> u64 {
    (1 << phys_bits) - (1 << 16)
}

impl BusDevice for MemoryManager {
    fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) {
        if self.selected_slot < self.hotplug_slots.len() {
            let state = &self.hotplug_slots[self.selected_slot];
            match offset {
                BASE_OFFSET_LOW => {
                    data.copy_from_slice(&state.base.to_le_bytes()[..4]);
                }
                BASE_OFFSET_HIGH => {
                    data.copy_from_slice(&state.base.to_le_bytes()[4..]);
                }
                LENGTH_OFFSET_LOW => {
                    data.copy_from_slice(&state.length.to_le_bytes()[..4]);
                }
                LENGTH_OFFSET_HIGH => {
                    data.copy_from_slice(&state.length.to_le_bytes()[4..]);
                }
                STATUS_OFFSET => {
                    // The Linux kernel, quite reasonably, doesn't zero the memory it gives us.
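                    // Status byte layout, as read back by the guest (note added
                    // for clarity; the bit positions are the flag constants
                    // defined above):
                    //   bit 0 (ENABLE_FLAG)    - the slot is active
                    //   bit 1 (INSERTING_FLAG) - an insertion awaits acknowledgement
                    //   bit 2 (REMOVING_FLAG)  - a removal awaits acknowledgement
                    // For example, a freshly hotplugged DIMM reads back as 0b011
                    // (active + inserting) until the guest acknowledges the
                    // insertion by writing bit 1 back to STATUS_OFFSET.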
372 data.fill(0); 373 if state.active { 374 data[0] |= 1 << ENABLE_FLAG; 375 } 376 if state.inserting { 377 data[0] |= 1 << INSERTING_FLAG; 378 } 379 if state.removing { 380 data[0] |= 1 << REMOVING_FLAG; 381 } 382 } 383 _ => { 384 warn!( 385 "Unexpected offset for accessing memory manager device: {:#}", 386 offset 387 ); 388 } 389 } 390 } else { 391 warn!("Out of range memory slot: {}", self.selected_slot); 392 } 393 } 394 395 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 396 match offset { 397 SELECTION_OFFSET => { 398 self.selected_slot = usize::from(data[0]); 399 } 400 STATUS_OFFSET => { 401 if self.selected_slot < self.hotplug_slots.len() { 402 let state = &mut self.hotplug_slots[self.selected_slot]; 403 // The ACPI code writes back a 1 to acknowledge the insertion 404 if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting { 405 state.inserting = false; 406 } 407 // Ditto for removal 408 if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing { 409 state.removing = false; 410 } 411 // Trigger removal of "DIMM" 412 if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG { 413 warn!("Ejection of memory not currently supported"); 414 } 415 } else { 416 warn!("Out of range memory slot: {}", self.selected_slot); 417 } 418 } 419 _ => { 420 warn!( 421 "Unexpected offset for accessing memory manager device: {:#}", 422 offset 423 ); 424 } 425 }; 426 None 427 } 428 } 429 430 impl MemoryManager { 431 /// Creates all memory regions based on the available RAM ranges defined 432 /// by `ram_regions`, and based on the description of the memory zones. 433 /// In practice, this function can perform multiple memory mappings of the 434 /// same backing file if there's a hole in the address space between two 435 /// RAM ranges. 436 /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G) 437 /// and zones containing two zones (size 1G and size 4G). 438 /// This function will create 3 resulting memory regions: 439 /// - First one mapping entirely the first memory zone on 0-1G range 440 /// - Second one mapping partially the second memory zone on 1G-3G range 441 /// - Third one mapping partially the second memory zone on 4G-6G range 442 fn create_memory_regions_from_zones( 443 ram_regions: &[(GuestAddress, usize)], 444 zones: &[MemoryZoneConfig], 445 prefault: Option<bool>, 446 thp: bool, 447 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 448 let mut zones = zones.to_owned(); 449 let mut mem_regions = Vec::new(); 450 let mut zone = zones.remove(0); 451 let mut zone_offset = 0; 452 let mut memory_zones = HashMap::new(); 453 454 // Add zone id to the list of memory zones. 
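        // Note on the loop below: `zone_offset` tracks how much of the current
        // zone has already been mapped, while `ram_region_offset` tracks how
        // much of the current RAM range has been consumed. Whichever of the two
        // is exhausted first determines whether the next zone is pulled, the
        // next RAM range is selected, or both.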
455 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 456 457 for ram_region in ram_regions.iter() { 458 let mut ram_region_offset = 0; 459 let mut exit = false; 460 461 loop { 462 let mut ram_region_consumed = false; 463 let mut pull_next_zone = false; 464 465 let ram_region_sub_size = ram_region.1 - ram_region_offset; 466 let zone_sub_size = zone.size as usize - zone_offset; 467 468 let file_offset = zone_offset as u64; 469 let region_start = ram_region 470 .0 471 .checked_add(ram_region_offset as u64) 472 .ok_or(Error::GuestAddressOverFlow)?; 473 let region_size = if zone_sub_size <= ram_region_sub_size { 474 if zone_sub_size == ram_region_sub_size { 475 ram_region_consumed = true; 476 } 477 478 ram_region_offset += zone_sub_size; 479 pull_next_zone = true; 480 481 zone_sub_size 482 } else { 483 zone_offset += ram_region_sub_size; 484 ram_region_consumed = true; 485 486 ram_region_sub_size 487 }; 488 489 let region = MemoryManager::create_ram_region( 490 &zone.file, 491 file_offset, 492 region_start, 493 region_size, 494 match prefault { 495 Some(pf) => pf, 496 None => zone.prefault, 497 }, 498 zone.shared, 499 zone.hugepages, 500 zone.hugepage_size, 501 zone.host_numa_node, 502 None, 503 thp, 504 )?; 505 506 // Add region to the list of regions associated with the 507 // current memory zone. 508 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 509 memory_zone.regions.push(region.clone()); 510 } 511 512 mem_regions.push(region); 513 514 if pull_next_zone { 515 // Get the next zone and reset the offset. 516 zone_offset = 0; 517 if zones.is_empty() { 518 exit = true; 519 break; 520 } 521 zone = zones.remove(0); 522 523 // Check if zone id already exist. In case it does, throw 524 // an error as we need unique identifiers. Otherwise, add 525 // the new zone id to the list of memory zones. 526 if memory_zones.contains_key(&zone.id) { 527 error!( 528 "Memory zone identifier '{}' found more than once. \ 529 It must be unique", 530 zone.id, 531 ); 532 return Err(Error::DuplicateZoneId); 533 } 534 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 535 } 536 537 if ram_region_consumed { 538 break; 539 } 540 } 541 542 if exit { 543 break; 544 } 545 } 546 547 Ok((mem_regions, memory_zones)) 548 } 549 550 // Restore both GuestMemory regions along with MemoryZone zones. 
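    // For every recorded GuestRamMapping, the function below recreates a RAM
    // region at the same GPA, size and file offset as before. When a file
    // descriptor was handed over for that slot (e.g. during a local restore),
    // it is reused as the backing file, which is what preserves the previous
    // guest memory content.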
551 fn restore_memory_regions_and_zones( 552 guest_ram_mappings: &[GuestRamMapping], 553 zones_config: &[MemoryZoneConfig], 554 prefault: Option<bool>, 555 mut existing_memory_files: HashMap<u32, File>, 556 thp: bool, 557 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 558 let mut memory_regions = Vec::new(); 559 let mut memory_zones = HashMap::new(); 560 561 for zone_config in zones_config { 562 memory_zones.insert(zone_config.id.clone(), MemoryZone::default()); 563 } 564 565 for guest_ram_mapping in guest_ram_mappings { 566 for zone_config in zones_config { 567 if guest_ram_mapping.zone_id == zone_config.id { 568 let region = MemoryManager::create_ram_region( 569 &zone_config.file, 570 guest_ram_mapping.file_offset, 571 GuestAddress(guest_ram_mapping.gpa), 572 guest_ram_mapping.size as usize, 573 match prefault { 574 Some(pf) => pf, 575 None => zone_config.prefault, 576 }, 577 zone_config.shared, 578 zone_config.hugepages, 579 zone_config.hugepage_size, 580 zone_config.host_numa_node, 581 existing_memory_files.remove(&guest_ram_mapping.slot), 582 thp, 583 )?; 584 memory_regions.push(Arc::clone(®ion)); 585 if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) { 586 if guest_ram_mapping.virtio_mem { 587 let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0); 588 let region_size = region.len(); 589 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 590 region, 591 virtio_device: None, 592 hotplugged_size, 593 hugepages: zone_config.hugepages, 594 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 595 }); 596 } else { 597 memory_zone.regions.push(region); 598 } 599 } 600 } 601 } 602 } 603 604 memory_regions.sort_by_key(|x| x.start_addr()); 605 606 Ok((memory_regions, memory_zones)) 607 } 608 609 fn fill_saved_regions( 610 &mut self, 611 file_path: PathBuf, 612 saved_regions: MemoryRangeTable, 613 ) -> Result<(), Error> { 614 if saved_regions.is_empty() { 615 return Ok(()); 616 } 617 618 // Open (read only) the snapshot file. 619 let mut memory_file = OpenOptions::new() 620 .read(true) 621 .open(file_path) 622 .map_err(Error::SnapshotOpen)?; 623 624 let guest_memory = self.guest_memory.memory(); 625 for range in saved_regions.regions() { 626 let mut offset: u64 = 0; 627 // Here we are manually handling the retry in case we can't write 628 // the whole region at once because we can't use the implementation 629 // from vm-memory::GuestMemory of read_exact_from() as it is not 630 // following the correct behavior. 
For more info about this issue 631 // see: https://github.com/rust-vmm/vm-memory/issues/174 632 loop { 633 let bytes_read = guest_memory 634 .read_from( 635 GuestAddress(range.gpa + offset), 636 &mut memory_file, 637 (range.length - offset) as usize, 638 ) 639 .map_err(Error::SnapshotCopy)?; 640 offset += bytes_read as u64; 641 642 if offset == range.length { 643 break; 644 } 645 } 646 } 647 648 Ok(()) 649 } 650 651 fn validate_memory_config( 652 config: &MemoryConfig, 653 user_provided_zones: bool, 654 ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> { 655 let mut allow_mem_hotplug = false; 656 657 if !user_provided_zones { 658 if config.zones.is_some() { 659 error!( 660 "User defined memory regions can't be provided if the \ 661 memory size is not 0" 662 ); 663 return Err(Error::InvalidMemoryParameters); 664 } 665 666 if config.hotplug_size.is_some() { 667 allow_mem_hotplug = true; 668 } 669 670 if let Some(hotplugged_size) = config.hotplugged_size { 671 if let Some(hotplug_size) = config.hotplug_size { 672 if hotplugged_size > hotplug_size { 673 error!( 674 "'hotplugged_size' {} can't be bigger than \ 675 'hotplug_size' {}", 676 hotplugged_size, hotplug_size, 677 ); 678 return Err(Error::InvalidMemoryParameters); 679 } 680 } else { 681 error!( 682 "Invalid to define 'hotplugged_size' when there is\ 683 no 'hotplug_size'" 684 ); 685 return Err(Error::InvalidMemoryParameters); 686 } 687 if config.hotplug_method == HotplugMethod::Acpi { 688 error!( 689 "Invalid to define 'hotplugged_size' with hotplug \ 690 method 'acpi'" 691 ); 692 return Err(Error::InvalidMemoryParameters); 693 } 694 } 695 696 // Create a single zone from the global memory config. This lets 697 // us reuse the codepath for user defined memory zones. 698 let zones = vec![MemoryZoneConfig { 699 id: String::from(DEFAULT_MEMORY_ZONE), 700 size: config.size, 701 file: None, 702 shared: config.shared, 703 hugepages: config.hugepages, 704 hugepage_size: config.hugepage_size, 705 host_numa_node: None, 706 hotplug_size: config.hotplug_size, 707 hotplugged_size: config.hotplugged_size, 708 prefault: config.prefault, 709 }]; 710 711 Ok((config.size, zones, allow_mem_hotplug)) 712 } else { 713 if config.zones.is_none() { 714 error!( 715 "User defined memory regions must be provided if the \ 716 memory size is 0" 717 ); 718 return Err(Error::MissingMemoryZones); 719 } 720 721 // Safe to unwrap as we checked right above there were some 722 // regions. 
723 let zones = config.zones.clone().unwrap(); 724 if zones.is_empty() { 725 return Err(Error::MissingMemoryZones); 726 } 727 728 let mut total_ram_size: u64 = 0; 729 for zone in zones.iter() { 730 total_ram_size += zone.size; 731 732 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() { 733 error!( 734 "Invalid to set host NUMA policy for a memory zone \ 735 backed by a regular file and mapped as 'shared'" 736 ); 737 return Err(Error::InvalidSharedMemoryZoneWithHostNuma); 738 } 739 740 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi { 741 error!("Invalid to set ACPI hotplug method for memory zones"); 742 return Err(Error::InvalidHotplugMethodWithMemoryZones); 743 } 744 745 if let Some(hotplugged_size) = zone.hotplugged_size { 746 if let Some(hotplug_size) = zone.hotplug_size { 747 if hotplugged_size > hotplug_size { 748 error!( 749 "'hotplugged_size' {} can't be bigger than \ 750 'hotplug_size' {}", 751 hotplugged_size, hotplug_size, 752 ); 753 return Err(Error::InvalidMemoryParameters); 754 } 755 } else { 756 error!( 757 "Invalid to define 'hotplugged_size' when there is\ 758 no 'hotplug_size' for a memory zone" 759 ); 760 return Err(Error::InvalidMemoryParameters); 761 } 762 if config.hotplug_method == HotplugMethod::Acpi { 763 error!( 764 "Invalid to define 'hotplugged_size' with hotplug \ 765 method 'acpi'" 766 ); 767 return Err(Error::InvalidMemoryParameters); 768 } 769 } 770 } 771 772 Ok((total_ram_size, zones, allow_mem_hotplug)) 773 } 774 } 775 776 fn allocate_address_space(&mut self) -> Result<(), Error> { 777 let mut list = Vec::new(); 778 779 for (zone_id, memory_zone) in self.memory_zones.iter() { 780 let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> = 781 memory_zone 782 .regions() 783 .iter() 784 .map(|r| (r.clone(), false)) 785 .collect(); 786 787 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 788 regions.push((virtio_mem_zone.region().clone(), true)); 789 } 790 791 list.push((zone_id.clone(), regions)); 792 } 793 794 for (zone_id, regions) in list { 795 for (region, virtio_mem) in regions { 796 let slot = self.create_userspace_mapping( 797 region.start_addr().raw_value(), 798 region.len(), 799 region.as_ptr() as u64, 800 self.mergeable, 801 false, 802 self.log_dirty, 803 )?; 804 805 let file_offset = if let Some(file_offset) = region.file_offset() { 806 file_offset.start() 807 } else { 808 0 809 }; 810 811 self.guest_ram_mappings.push(GuestRamMapping { 812 gpa: region.start_addr().raw_value(), 813 size: region.len(), 814 slot, 815 zone_id: zone_id.clone(), 816 virtio_mem, 817 file_offset, 818 }); 819 self.ram_allocator 820 .allocate(Some(region.start_addr()), region.len(), None) 821 .ok_or(Error::MemoryRangeAllocation)?; 822 } 823 } 824 825 // Allocate SubRegion and Reserved address ranges. 826 for region in self.arch_mem_regions.iter() { 827 if region.r_type == RegionType::Ram { 828 // Ignore the RAM type since ranges have already been allocated 829 // based on the GuestMemory regions. 830 continue; 831 } 832 self.ram_allocator 833 .allocate( 834 Some(GuestAddress(region.base)), 835 region.size as GuestUsize, 836 None, 837 ) 838 .ok_or(Error::MemoryRangeAllocation)?; 839 } 840 841 Ok(()) 842 } 843 844 #[cfg(target_arch = "aarch64")] 845 fn add_uefi_flash(&mut self) -> Result<(), Error> { 846 // On AArch64, the UEFI binary requires a flash device at address 0. 847 // 4 MiB memory is mapped to simulate the flash. 
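        // In practice the flash below is an anonymous mapping of UEFI_SIZE bytes
        // registered with the hypervisor at UEFI_START. It is also kept in
        // `self.uefi_flash` (wrapped in a GuestMemoryAtomic) so it can later be
        // handed out through `uefi_flash()`.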
848 let uefi_mem_slot = self.allocate_memory_slot(); 849 let uefi_region = GuestRegionMmap::new( 850 MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(), 851 arch::layout::UEFI_START, 852 ) 853 .unwrap(); 854 let uefi_mem_region = self.vm.make_user_memory_region( 855 uefi_mem_slot, 856 uefi_region.start_addr().raw_value(), 857 uefi_region.len(), 858 uefi_region.as_ptr() as u64, 859 false, 860 false, 861 ); 862 self.vm 863 .create_user_memory_region(uefi_mem_region) 864 .map_err(Error::CreateUefiFlash)?; 865 866 let uefi_flash = 867 GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap()); 868 869 self.uefi_flash = Some(uefi_flash); 870 871 Ok(()) 872 } 873 874 #[allow(clippy::too_many_arguments)] 875 pub fn new( 876 vm: Arc<dyn hypervisor::Vm>, 877 config: &MemoryConfig, 878 prefault: Option<bool>, 879 phys_bits: u8, 880 #[cfg(feature = "tdx")] tdx_enabled: bool, 881 restore_data: Option<&MemoryManagerSnapshotData>, 882 existing_memory_files: Option<HashMap<u32, File>>, 883 #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>, 884 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 885 trace_scoped!("MemoryManager::new"); 886 887 let user_provided_zones = config.size == 0; 888 889 let mmio_address_space_size = mmio_address_space_size(phys_bits); 890 debug_assert_eq!( 891 (((mmio_address_space_size) >> 16) << 16), 892 mmio_address_space_size 893 ); 894 let start_of_platform_device_area = 895 GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE); 896 let end_of_device_area = start_of_platform_device_area.unchecked_sub(1); 897 898 let (ram_size, zones, allow_mem_hotplug) = 899 Self::validate_memory_config(config, user_provided_zones)?; 900 901 let ( 902 start_of_device_area, 903 boot_ram, 904 current_ram, 905 arch_mem_regions, 906 memory_zones, 907 guest_memory, 908 boot_guest_memory, 909 hotplug_slots, 910 next_memory_slot, 911 selected_slot, 912 next_hotplug_slot, 913 ) = if let Some(data) = restore_data { 914 let (regions, memory_zones) = Self::restore_memory_regions_and_zones( 915 &data.guest_ram_mappings, 916 &zones, 917 prefault, 918 existing_memory_files.unwrap_or_default(), 919 config.thp, 920 )?; 921 let guest_memory = 922 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?; 923 let boot_guest_memory = guest_memory.clone(); 924 ( 925 GuestAddress(data.start_of_device_area), 926 data.boot_ram, 927 data.current_ram, 928 data.arch_mem_regions.clone(), 929 memory_zones, 930 guest_memory, 931 boot_guest_memory, 932 data.hotplug_slots.clone(), 933 data.next_memory_slot, 934 data.selected_slot, 935 data.next_hotplug_slot, 936 ) 937 } else { 938 // Init guest memory 939 let arch_mem_regions = arch::arch_memory_regions(ram_size); 940 941 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions 942 .iter() 943 .filter(|r| r.2 == RegionType::Ram) 944 .map(|r| (r.0, r.1)) 945 .collect(); 946 947 let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions 948 .iter() 949 .map(|(a, b, c)| ArchMemRegion { 950 base: a.0, 951 size: *b, 952 r_type: *c, 953 }) 954 .collect(); 955 956 let (mem_regions, mut memory_zones) = 957 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?; 958 959 let mut guest_memory = 960 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?; 961 962 let boot_guest_memory = guest_memory.clone(); 963 964 let mut start_of_device_area = 965 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?; 966 967 // Update list of memory 
zones for resize. 968 for zone in zones.iter() { 969 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 970 if let Some(hotplug_size) = zone.hotplug_size { 971 if hotplug_size == 0 { 972 error!("'hotplug_size' can't be 0"); 973 return Err(Error::InvalidHotplugSize); 974 } 975 976 if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi { 977 start_of_device_area = start_of_device_area 978 .checked_add(hotplug_size) 979 .ok_or(Error::GuestAddressOverFlow)?; 980 } else { 981 // Alignment must be "natural" i.e. same as size of block 982 let start_addr = GuestAddress( 983 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE 984 - 1) 985 / virtio_devices::VIRTIO_MEM_ALIGN_SIZE 986 * virtio_devices::VIRTIO_MEM_ALIGN_SIZE, 987 ); 988 989 // When `prefault` is set by vm_restore, memory manager 990 // will create ram region with `prefault` option in 991 // restore config rather than same option in zone 992 let region = MemoryManager::create_ram_region( 993 &None, 994 0, 995 start_addr, 996 hotplug_size as usize, 997 match prefault { 998 Some(pf) => pf, 999 None => zone.prefault, 1000 }, 1001 zone.shared, 1002 zone.hugepages, 1003 zone.hugepage_size, 1004 zone.host_numa_node, 1005 None, 1006 config.thp, 1007 )?; 1008 1009 guest_memory = guest_memory 1010 .insert_region(Arc::clone(®ion)) 1011 .map_err(Error::GuestMemory)?; 1012 1013 let hotplugged_size = zone.hotplugged_size.unwrap_or(0); 1014 let region_size = region.len(); 1015 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 1016 region, 1017 virtio_device: None, 1018 hotplugged_size, 1019 hugepages: zone.hugepages, 1020 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 1021 }); 1022 1023 start_of_device_area = start_addr 1024 .checked_add(hotplug_size) 1025 .ok_or(Error::GuestAddressOverFlow)?; 1026 } 1027 } 1028 } else { 1029 return Err(Error::MissingZoneIdentifier); 1030 } 1031 } 1032 1033 let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT); 1034 hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default); 1035 1036 ( 1037 start_of_device_area, 1038 ram_size, 1039 ram_size, 1040 arch_mem_regions, 1041 memory_zones, 1042 guest_memory, 1043 boot_guest_memory, 1044 hotplug_slots, 1045 0, 1046 0, 1047 0, 1048 ) 1049 }; 1050 1051 let guest_memory = GuestMemoryAtomic::new(guest_memory); 1052 1053 // Both MMIO and PIO address spaces start at address 0. 
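        // As a rough sketch of what the allocator below is configured with (the
        // exact values come from the `layout` module and the constants above):
        //  - on x86_64, a PIO space starting at 0 with a size of 64 KiB,
        //  - a platform MMIO area of PLATFORM_DEVICE_AREA_SIZE at the top of the
        //    addressable range,
        //  - the 32-bit device hole (MEM_32BIT_DEVICES_START / _SIZE),
        //  - on x86_64, GSIs from X86_64_IRQ_BASE up to the number of IOAPIC pins.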
1054 let allocator = Arc::new(Mutex::new( 1055 SystemAllocator::new( 1056 #[cfg(target_arch = "x86_64")] 1057 { 1058 GuestAddress(0) 1059 }, 1060 #[cfg(target_arch = "x86_64")] 1061 { 1062 1 << 16 1063 }, 1064 start_of_platform_device_area, 1065 PLATFORM_DEVICE_AREA_SIZE, 1066 layout::MEM_32BIT_DEVICES_START, 1067 layout::MEM_32BIT_DEVICES_SIZE, 1068 #[cfg(target_arch = "x86_64")] 1069 vec![GsiApic::new( 1070 X86_64_IRQ_BASE, 1071 ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, 1072 )], 1073 ) 1074 .ok_or(Error::CreateSystemAllocator)?, 1075 )); 1076 1077 #[cfg(not(feature = "tdx"))] 1078 let dynamic = true; 1079 #[cfg(feature = "tdx")] 1080 let dynamic = !tdx_enabled; 1081 1082 let acpi_address = if dynamic 1083 && config.hotplug_method == HotplugMethod::Acpi 1084 && (config.hotplug_size.unwrap_or_default() > 0) 1085 { 1086 Some( 1087 allocator 1088 .lock() 1089 .unwrap() 1090 .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None) 1091 .ok_or(Error::AllocateMmioAddress)?, 1092 ) 1093 } else { 1094 None 1095 }; 1096 1097 // If running on SGX the start of device area and RAM area may diverge but 1098 // at this point they are next to each other. 1099 let end_of_ram_area = start_of_device_area.unchecked_sub(1); 1100 let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap(); 1101 1102 let mut memory_manager = MemoryManager { 1103 boot_guest_memory, 1104 guest_memory, 1105 next_memory_slot, 1106 start_of_device_area, 1107 end_of_device_area, 1108 end_of_ram_area, 1109 vm, 1110 hotplug_slots, 1111 selected_slot, 1112 mergeable: config.mergeable, 1113 allocator, 1114 hotplug_method: config.hotplug_method, 1115 boot_ram, 1116 current_ram, 1117 next_hotplug_slot, 1118 shared: config.shared, 1119 hugepages: config.hugepages, 1120 hugepage_size: config.hugepage_size, 1121 prefault: config.prefault, 1122 #[cfg(target_arch = "x86_64")] 1123 sgx_epc_region: None, 1124 user_provided_zones, 1125 snapshot_memory_ranges: MemoryRangeTable::default(), 1126 memory_zones, 1127 guest_ram_mappings: Vec::new(), 1128 acpi_address, 1129 log_dirty: dynamic, // Cannot log dirty pages on a TD 1130 arch_mem_regions, 1131 ram_allocator, 1132 dynamic, 1133 #[cfg(target_arch = "aarch64")] 1134 uefi_flash: None, 1135 thp: config.thp, 1136 }; 1137 1138 memory_manager.allocate_address_space()?; 1139 1140 #[cfg(target_arch = "aarch64")] 1141 memory_manager.add_uefi_flash()?; 1142 1143 #[cfg(target_arch = "x86_64")] 1144 if let Some(sgx_epc_config) = sgx_epc_config { 1145 memory_manager.setup_sgx(sgx_epc_config)?; 1146 } 1147 1148 Ok(Arc::new(Mutex::new(memory_manager))) 1149 } 1150 1151 pub fn new_from_snapshot( 1152 snapshot: &Snapshot, 1153 vm: Arc<dyn hypervisor::Vm>, 1154 config: &MemoryConfig, 1155 source_url: Option<&str>, 1156 prefault: bool, 1157 phys_bits: u8, 1158 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 1159 if let Some(source_url) = source_url { 1160 let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?; 1161 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 1162 1163 let mem_snapshot: MemoryManagerSnapshotData = 1164 snapshot.to_versioned_state().map_err(Error::Restore)?; 1165 1166 let mm = MemoryManager::new( 1167 vm, 1168 config, 1169 Some(prefault), 1170 phys_bits, 1171 #[cfg(feature = "tdx")] 1172 false, 1173 Some(&mem_snapshot), 1174 None, 1175 #[cfg(target_arch = "x86_64")] 1176 None, 1177 )?; 1178 1179 mm.lock() 1180 .unwrap() 1181 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?; 1182 1183 
Ok(mm) 1184 } else { 1185 Err(Error::RestoreMissingSourceUrl) 1186 } 1187 } 1188 1189 fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> { 1190 // SAFETY: FFI call with correct arguments 1191 let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) }; 1192 1193 if res < 0 { 1194 Err(io::Error::last_os_error()) 1195 } else { 1196 Ok(res as RawFd) 1197 } 1198 } 1199 1200 fn mbind( 1201 addr: *mut u8, 1202 len: u64, 1203 mode: u32, 1204 nodemask: Vec<u64>, 1205 maxnode: u64, 1206 flags: u32, 1207 ) -> Result<(), io::Error> { 1208 // SAFETY: FFI call with correct arguments 1209 let res = unsafe { 1210 libc::syscall( 1211 libc::SYS_mbind, 1212 addr as *mut libc::c_void, 1213 len, 1214 mode, 1215 nodemask.as_ptr(), 1216 maxnode, 1217 flags, 1218 ) 1219 }; 1220 1221 if res < 0 { 1222 Err(io::Error::last_os_error()) 1223 } else { 1224 Ok(()) 1225 } 1226 } 1227 1228 fn create_anonymous_file( 1229 size: usize, 1230 hugepages: bool, 1231 hugepage_size: Option<u64>, 1232 ) -> Result<FileOffset, Error> { 1233 let fd = Self::memfd_create( 1234 &ffi::CString::new("ch_ram").unwrap(), 1235 libc::MFD_CLOEXEC 1236 | if hugepages { 1237 libc::MFD_HUGETLB 1238 | if let Some(hugepage_size) = hugepage_size { 1239 /* 1240 * From the Linux kernel: 1241 * Several system calls take a flag to request "hugetlb" huge pages. 1242 * Without further specification, these system calls will use the 1243 * system's default huge page size. If a system supports multiple 1244 * huge page sizes, the desired huge page size can be specified in 1245 * bits [26:31] of the flag arguments. The value in these 6 bits 1246 * will encode the log2 of the huge page size. 1247 */ 1248 1249 hugepage_size.trailing_zeros() << 26 1250 } else { 1251 // Use the system default huge page size 1252 0 1253 } 1254 } else { 1255 0 1256 }, 1257 ) 1258 .map_err(Error::SharedFileCreate)?; 1259 1260 // SAFETY: fd is valid 1261 let f = unsafe { File::from_raw_fd(fd) }; 1262 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?; 1263 1264 Ok(FileOffset::new(f, 0)) 1265 } 1266 1267 fn open_backing_file( 1268 backing_file: &PathBuf, 1269 file_offset: u64, 1270 size: usize, 1271 ) -> Result<FileOffset, Error> { 1272 if backing_file.is_dir() { 1273 // Override file offset as it does not apply in this case. 1274 info!( 1275 "Ignoring file offset since the backing file is a \ 1276 temporary file created from the specified directory." 
            );
            let fs_str = format!("{}{}", backing_file.display(), "/tmpfile_XXXXXX");
            let fs = ffi::CString::new(fs_str).unwrap();
            let mut path = fs.as_bytes_with_nul().to_owned();
            let path_ptr = path.as_mut_ptr() as *mut _;
            // SAFETY: FFI call
            let fd = unsafe { libc::mkstemp(path_ptr) };
            if fd == -1 {
                return Err(Error::SharedFileCreate(std::io::Error::last_os_error()));
            }
            // SAFETY: FFI call
            unsafe { libc::unlink(path_ptr) };
            // SAFETY: fd is valid
            let f = unsafe { File::from_raw_fd(fd) };
            f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;

            Ok(FileOffset::new(f, 0))
        } else {
            let f = OpenOptions::new()
                .read(true)
                .write(true)
                .open(backing_file)
                .map_err(Error::SharedFileCreate)?;

            Ok(FileOffset::new(f, file_offset))
        }
    }

    #[allow(clippy::too_many_arguments)]
    pub fn create_ram_region(
        backing_file: &Option<PathBuf>,
        file_offset: u64,
        start_addr: GuestAddress,
        size: usize,
        prefault: bool,
        shared: bool,
        hugepages: bool,
        hugepage_size: Option<u64>,
        host_numa_node: Option<u32>,
        existing_memory_file: Option<File>,
        thp: bool,
    ) -> Result<Arc<GuestRegionMmap>, Error> {
        let mut mmap_flags = libc::MAP_NORESERVE;

        // The duplication of mmap_flags ORing here is unfortunate but it also makes
        // the complexity of the handling clear.
        let fo = if let Some(f) = existing_memory_file {
            // It must be MAP_SHARED as we wouldn't already have an FD
            mmap_flags |= libc::MAP_SHARED;
            Some(FileOffset::new(f, file_offset))
        } else if let Some(backing_file) = backing_file {
            if shared {
                mmap_flags |= libc::MAP_SHARED;
            } else {
                mmap_flags |= libc::MAP_PRIVATE;
            }
            Some(Self::open_backing_file(backing_file, file_offset, size)?)
        } else if shared || hugepages {
            // For hugepages we must also MAP_SHARED otherwise we will trigger #4805
            // because the MAP_PRIVATE will trigger CoW against the backing file with
            // the VFIO pinning
            mmap_flags |= libc::MAP_SHARED;
            Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
        } else {
            mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
            None
        };

        if prefault {
            mmap_flags |= libc::MAP_POPULATE;
        }

        let region = GuestRegionMmap::new(
            MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
                .map_err(Error::GuestMemoryRegion)?,
            start_addr,
        )
        .map_err(Error::GuestMemory)?;

        if region.file_offset().is_none() && thp {
            info!(
                "Anonymous mapping at 0x{:x} (size = 0x{:x})",
                region.as_ptr() as u64,
                size
            );
            // SAFETY: FFI call with correct arguments
            let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
            if ret != 0 {
                let e = io::Error::last_os_error();
                warn!("Failed to mark pages as THP eligible: {}", e);
            }
        }

        // Apply NUMA policy if needed.
        if let Some(node) = host_numa_node {
            let addr = region.deref().as_ptr();
            let len = region.deref().size() as u64;
            let mode = MPOL_BIND;
            let mut nodemask: Vec<u64> = Vec::new();
            let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;

            // Linux is kind of buggy in the way it interprets maxnode as it
            // will cut off the last node. That's why we have to add 1 to what
            // we would consider as the proper maxnode value.
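            // Worked example (illustrative, not from the original code): binding
            // to host node 3 gives maxnode = 3 + 1 + 1 = 5, the nodemask vector
            // holds a single u64 (3 / 64 + 1 = 1), and nodemask[0] = 1 << 3.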
1381 let maxnode = node as u64 + 1 + 1; 1382 1383 // Allocate the right size for the vector. 1384 nodemask.resize((node as usize / 64) + 1, 0); 1385 1386 // Fill the global bitmask through the nodemask vector. 1387 let idx = (node / 64) as usize; 1388 let shift = node % 64; 1389 nodemask[idx] |= 1u64 << shift; 1390 1391 // Policies are enforced by using MPOL_MF_MOVE flag as it will 1392 // force the kernel to move all pages that might have been already 1393 // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is 1394 // used to throw an error if MPOL_MF_MOVE didn't succeed. 1395 // MPOL_BIND is the selected mode as it specifies a strict policy 1396 // that restricts memory allocation to the nodes specified in the 1397 // nodemask. 1398 Self::mbind(addr, len, mode, nodemask, maxnode, flags) 1399 .map_err(Error::ApplyNumaPolicy)?; 1400 } 1401 1402 Ok(Arc::new(region)) 1403 } 1404 1405 // Update the GuestMemoryMmap with the new range 1406 fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> { 1407 let guest_memory = self 1408 .guest_memory 1409 .memory() 1410 .insert_region(region) 1411 .map_err(Error::GuestMemory)?; 1412 self.guest_memory.lock().unwrap().replace(guest_memory); 1413 1414 Ok(()) 1415 } 1416 1417 // 1418 // Calculate the start address of an area next to RAM. 1419 // 1420 // If memory hotplug is allowed, the start address needs to be aligned 1421 // (rounded-up) to 128MiB boundary. 1422 // If memory hotplug is not allowed, there is no alignment required. 1423 // And it must also start at the 64bit start. 1424 fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> { 1425 let mut start_addr = if allow_mem_hotplug { 1426 GuestAddress(mem_end.0 | ((128 << 20) - 1)) 1427 } else { 1428 mem_end 1429 }; 1430 1431 start_addr = start_addr 1432 .checked_add(1) 1433 .ok_or(Error::GuestAddressOverFlow)?; 1434 1435 if mem_end < arch::layout::MEM_32BIT_RESERVED_START { 1436 return Ok(arch::layout::RAM_64BIT_START); 1437 } 1438 1439 Ok(start_addr) 1440 } 1441 1442 pub fn add_ram_region( 1443 &mut self, 1444 start_addr: GuestAddress, 1445 size: usize, 1446 ) -> Result<Arc<GuestRegionMmap>, Error> { 1447 // Allocate memory for the region 1448 let region = MemoryManager::create_ram_region( 1449 &None, 1450 0, 1451 start_addr, 1452 size, 1453 self.prefault, 1454 self.shared, 1455 self.hugepages, 1456 self.hugepage_size, 1457 None, 1458 None, 1459 self.thp, 1460 )?; 1461 1462 // Map it into the guest 1463 let slot = self.create_userspace_mapping( 1464 region.start_addr().0, 1465 region.len(), 1466 region.as_ptr() as u64, 1467 self.mergeable, 1468 false, 1469 self.log_dirty, 1470 )?; 1471 self.guest_ram_mappings.push(GuestRamMapping { 1472 gpa: region.start_addr().raw_value(), 1473 size: region.len(), 1474 slot, 1475 zone_id: DEFAULT_MEMORY_ZONE.to_string(), 1476 virtio_mem: false, 1477 file_offset: 0, 1478 }); 1479 1480 self.add_region(Arc::clone(®ion))?; 1481 1482 Ok(region) 1483 } 1484 1485 fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> { 1486 info!("Hotplugging new RAM: {}", size); 1487 1488 // Check that there is a free slot 1489 if self.next_hotplug_slot >= HOTPLUG_COUNT { 1490 return Err(Error::NoSlotAvailable); 1491 } 1492 1493 // "Inserted" DIMM must have a size that is a multiple of 128MiB 1494 if size % (128 << 20) != 0 { 1495 return Err(Error::InvalidSize); 1496 } 1497 1498 let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?; 1499 1500 if 
start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area { 1501 return Err(Error::InsufficientHotplugRam); 1502 } 1503 1504 let region = self.add_ram_region(start_addr, size)?; 1505 1506 // Add region to the list of regions associated with the default 1507 // memory zone. 1508 if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) { 1509 memory_zone.regions.push(Arc::clone(®ion)); 1510 } 1511 1512 // Tell the allocator 1513 self.ram_allocator 1514 .allocate(Some(start_addr), size as GuestUsize, None) 1515 .ok_or(Error::MemoryRangeAllocation)?; 1516 1517 // Update the slot so that it can be queried via the I/O port 1518 let mut slot = &mut self.hotplug_slots[self.next_hotplug_slot]; 1519 slot.active = true; 1520 slot.inserting = true; 1521 slot.base = region.start_addr().0; 1522 slot.length = region.len(); 1523 1524 self.next_hotplug_slot += 1; 1525 1526 Ok(region) 1527 } 1528 1529 pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 1530 self.guest_memory.clone() 1531 } 1532 1533 pub fn boot_guest_memory(&self) -> GuestMemoryMmap { 1534 self.boot_guest_memory.clone() 1535 } 1536 1537 pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> { 1538 self.allocator.clone() 1539 } 1540 1541 pub fn start_of_device_area(&self) -> GuestAddress { 1542 self.start_of_device_area 1543 } 1544 1545 pub fn end_of_device_area(&self) -> GuestAddress { 1546 self.end_of_device_area 1547 } 1548 1549 pub fn allocate_memory_slot(&mut self) -> u32 { 1550 let slot_id = self.next_memory_slot; 1551 self.next_memory_slot += 1; 1552 slot_id 1553 } 1554 1555 pub fn create_userspace_mapping( 1556 &mut self, 1557 guest_phys_addr: u64, 1558 memory_size: u64, 1559 userspace_addr: u64, 1560 mergeable: bool, 1561 readonly: bool, 1562 log_dirty: bool, 1563 ) -> Result<u32, Error> { 1564 let slot = self.allocate_memory_slot(); 1565 let mem_region = self.vm.make_user_memory_region( 1566 slot, 1567 guest_phys_addr, 1568 memory_size, 1569 userspace_addr, 1570 readonly, 1571 log_dirty, 1572 ); 1573 1574 info!( 1575 "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}", 1576 guest_phys_addr, userspace_addr, memory_size, slot 1577 ); 1578 1579 self.vm 1580 .create_user_memory_region(mem_region) 1581 .map_err(Error::CreateUserMemoryRegion)?; 1582 1583 // Mark the pages as mergeable if explicitly asked for. 1584 if mergeable { 1585 // SAFETY: the address and size are valid since the 1586 // mmap succeeded. 1587 let ret = unsafe { 1588 libc::madvise( 1589 userspace_addr as *mut libc::c_void, 1590 memory_size as libc::size_t, 1591 libc::MADV_MERGEABLE, 1592 ) 1593 }; 1594 if ret != 0 { 1595 let err = io::Error::last_os_error(); 1596 // Safe to unwrap because the error is constructed with 1597 // last_os_error(), which ensures the output will be Some(). 
1598 let errno = err.raw_os_error().unwrap(); 1599 if errno == libc::EINVAL { 1600 warn!("kernel not configured with CONFIG_KSM"); 1601 } else { 1602 warn!("madvise error: {}", err); 1603 } 1604 warn!("failed to mark pages as mergeable"); 1605 } 1606 } 1607 1608 info!( 1609 "Created userspace mapping: {:x} -> {:x} {:x}", 1610 guest_phys_addr, userspace_addr, memory_size 1611 ); 1612 1613 Ok(slot) 1614 } 1615 1616 pub fn remove_userspace_mapping( 1617 &mut self, 1618 guest_phys_addr: u64, 1619 memory_size: u64, 1620 userspace_addr: u64, 1621 mergeable: bool, 1622 slot: u32, 1623 ) -> Result<(), Error> { 1624 let mem_region = self.vm.make_user_memory_region( 1625 slot, 1626 guest_phys_addr, 1627 memory_size, 1628 userspace_addr, 1629 false, /* readonly -- don't care */ 1630 false, /* log dirty */ 1631 ); 1632 1633 self.vm 1634 .remove_user_memory_region(mem_region) 1635 .map_err(Error::RemoveUserMemoryRegion)?; 1636 1637 // Mark the pages as unmergeable if there were previously marked as 1638 // mergeable. 1639 if mergeable { 1640 // SAFETY: the address and size are valid as the region was 1641 // previously advised. 1642 let ret = unsafe { 1643 libc::madvise( 1644 userspace_addr as *mut libc::c_void, 1645 memory_size as libc::size_t, 1646 libc::MADV_UNMERGEABLE, 1647 ) 1648 }; 1649 if ret != 0 { 1650 let err = io::Error::last_os_error(); 1651 // Safe to unwrap because the error is constructed with 1652 // last_os_error(), which ensures the output will be Some(). 1653 let errno = err.raw_os_error().unwrap(); 1654 if errno == libc::EINVAL { 1655 warn!("kernel not configured with CONFIG_KSM"); 1656 } else { 1657 warn!("madvise error: {}", err); 1658 } 1659 warn!("failed to mark pages as unmergeable"); 1660 } 1661 } 1662 1663 info!( 1664 "Removed userspace mapping: {:x} -> {:x} {:x}", 1665 guest_phys_addr, userspace_addr, memory_size 1666 ); 1667 1668 Ok(()) 1669 } 1670 1671 pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> { 1672 if let Some(memory_zone) = self.memory_zones.get_mut(id) { 1673 if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone { 1674 if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() { 1675 virtio_mem_device 1676 .lock() 1677 .unwrap() 1678 .resize(size) 1679 .map_err(Error::VirtioMemResizeFail)?; 1680 } 1681 1682 // Keep the hotplugged_size up to date. 1683 virtio_mem_zone.hotplugged_size = size; 1684 } else { 1685 error!("Failed resizing virtio-mem region: No virtio-mem handler"); 1686 return Err(Error::MissingVirtioMemHandler); 1687 } 1688 1689 return Ok(()); 1690 } 1691 1692 error!("Failed resizing virtio-mem region: Unknown memory zone"); 1693 Err(Error::UnknownMemoryZone) 1694 } 1695 1696 /// In case this function resulted in adding a new memory region to the 1697 /// guest memory, the new region is returned to the caller. The virtio-mem 1698 /// use case never adds a new region as the whole hotpluggable memory has 1699 /// already been allocated at boot time. 1700 pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> { 1701 if self.user_provided_zones { 1702 error!( 1703 "Not allowed to resize guest memory when backed with user \ 1704 defined memory zones." 
1705 ); 1706 return Err(Error::InvalidResizeWithMemoryZones); 1707 } 1708 1709 let mut region: Option<Arc<GuestRegionMmap>> = None; 1710 match self.hotplug_method { 1711 HotplugMethod::VirtioMem => { 1712 if desired_ram >= self.boot_ram { 1713 if !self.dynamic { 1714 return Ok(region); 1715 } 1716 1717 self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?; 1718 self.current_ram = desired_ram; 1719 } 1720 } 1721 HotplugMethod::Acpi => { 1722 if desired_ram > self.current_ram { 1723 if !self.dynamic { 1724 return Ok(region); 1725 } 1726 1727 region = 1728 Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?); 1729 self.current_ram = desired_ram; 1730 } 1731 } 1732 } 1733 Ok(region) 1734 } 1735 1736 pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> { 1737 if !self.user_provided_zones { 1738 error!( 1739 "Not allowed to resize guest memory zone when no zone is \ 1740 defined." 1741 ); 1742 return Err(Error::ResizeZone); 1743 } 1744 1745 self.virtio_mem_resize(id, virtio_mem_size) 1746 } 1747 1748 #[cfg(target_arch = "x86_64")] 1749 pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> { 1750 let file = OpenOptions::new() 1751 .read(true) 1752 .open("/dev/sgx_provision") 1753 .map_err(Error::SgxProvisionOpen)?; 1754 self.vm 1755 .enable_sgx_attribute(file) 1756 .map_err(Error::SgxEnableProvisioning)?; 1757 1758 // Go over each EPC section and verify its size is a 4k multiple. At 1759 // the same time, calculate the total size needed for the contiguous 1760 // EPC region. 1761 let mut epc_region_size = 0; 1762 for epc_section in sgx_epc_config.iter() { 1763 if epc_section.size == 0 { 1764 return Err(Error::EpcSectionSizeInvalid); 1765 } 1766 if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 { 1767 return Err(Error::EpcSectionSizeInvalid); 1768 } 1769 1770 epc_region_size += epc_section.size; 1771 } 1772 1773 // Place the SGX EPC region on a 4k boundary between the RAM and the device area 1774 let epc_region_start = GuestAddress( 1775 ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE, 1776 ); 1777 1778 self.start_of_device_area = epc_region_start 1779 .checked_add(epc_region_size) 1780 .ok_or(Error::GuestAddressOverFlow)?; 1781 1782 let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize); 1783 info!( 1784 "SGX EPC region: 0x{:x} (0x{:x})", 1785 epc_region_start.0, epc_region_size 1786 ); 1787 1788 // Each section can be memory mapped into the allocated region. 1789 let mut epc_section_start = epc_region_start.raw_value(); 1790 for epc_section in sgx_epc_config.iter() { 1791 let file = OpenOptions::new() 1792 .read(true) 1793 .write(true) 1794 .open("/dev/sgx_vepc") 1795 .map_err(Error::SgxVirtEpcOpen)?; 1796 1797 let prot = PROT_READ | PROT_WRITE; 1798 let mut flags = MAP_NORESERVE | MAP_SHARED; 1799 if epc_section.prefault { 1800 flags |= MAP_POPULATE; 1801 } 1802 1803 // We can't use the vm-memory crate to perform the memory mapping 1804 // here as it would try to ensure the size of the backing file is 1805 // matching the size of the expected mapping. The /dev/sgx_vepc 1806 // device does not work that way, it provides a file descriptor 1807 // which is not matching the mapping size, as it's a just a way to 1808 // let KVM know that an EPC section is being created for the guest. 
1809 // SAFETY: FFI call with correct arguments 1810 let host_addr = unsafe { 1811 libc::mmap( 1812 std::ptr::null_mut(), 1813 epc_section.size as usize, 1814 prot, 1815 flags, 1816 file.as_raw_fd(), 1817 0, 1818 ) 1819 } as u64; 1820 1821 info!( 1822 "Adding SGX EPC section: 0x{:x} (0x{:x})", 1823 epc_section_start, epc_section.size 1824 ); 1825 1826 let _mem_slot = self.create_userspace_mapping( 1827 epc_section_start, 1828 epc_section.size, 1829 host_addr, 1830 false, 1831 false, 1832 false, 1833 )?; 1834 1835 sgx_epc_region.insert( 1836 epc_section.id.clone(), 1837 SgxEpcSection::new( 1838 GuestAddress(epc_section_start), 1839 epc_section.size as GuestUsize, 1840 ), 1841 ); 1842 1843 epc_section_start += epc_section.size; 1844 } 1845 1846 self.sgx_epc_region = Some(sgx_epc_region); 1847 1848 Ok(()) 1849 } 1850 1851 #[cfg(target_arch = "x86_64")] 1852 pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> { 1853 &self.sgx_epc_region 1854 } 1855 1856 pub fn is_hardlink(f: &File) -> bool { 1857 let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit(); 1858 // SAFETY: FFI call with correct arguments 1859 let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) }; 1860 if ret != 0 { 1861 error!("Couldn't fstat the backing file"); 1862 return false; 1863 } 1864 1865 // SAFETY: stat is valid 1866 unsafe { (*stat.as_ptr()).st_nlink as usize > 0 } 1867 } 1868 1869 pub fn memory_zones(&self) -> &MemoryZones { 1870 &self.memory_zones 1871 } 1872 1873 pub fn memory_zones_mut(&mut self) -> &mut MemoryZones { 1874 &mut self.memory_zones 1875 } 1876 1877 pub fn memory_range_table( 1878 &self, 1879 snapshot: bool, 1880 ) -> std::result::Result<MemoryRangeTable, MigratableError> { 1881 let mut table = MemoryRangeTable::default(); 1882 1883 for memory_zone in self.memory_zones.values() { 1884 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 1885 table.extend(virtio_mem_zone.plugged_ranges()); 1886 } 1887 1888 for region in memory_zone.regions() { 1889 if snapshot { 1890 if let Some(file_offset) = region.file_offset() { 1891 if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED) 1892 && Self::is_hardlink(file_offset.file()) 1893 { 1894 // In this very specific case, we know the memory 1895 // region is backed by a file on the host filesystem 1896 // that can be accessed by the user, and additionally 1897 // the mapping is shared, which means that modifications 1898 // to the content are written to the actual file. 1899 // When meeting these conditions, we can skip the 1900 // copy of the memory content for this specific region, 1901 // as we can assume the user will have it saved through 1902 // the backing file already. 
1903 continue; 1904 } 1905 } 1906 } 1907 1908 table.push(MemoryRange { 1909 gpa: region.start_addr().raw_value(), 1910 length: region.len(), 1911 }); 1912 } 1913 } 1914 1915 Ok(table) 1916 } 1917 1918 pub fn snapshot_data(&self) -> MemoryManagerSnapshotData { 1919 MemoryManagerSnapshotData { 1920 memory_ranges: self.snapshot_memory_ranges.clone(), 1921 guest_ram_mappings: self.guest_ram_mappings.clone(), 1922 start_of_device_area: self.start_of_device_area.0, 1923 boot_ram: self.boot_ram, 1924 current_ram: self.current_ram, 1925 arch_mem_regions: self.arch_mem_regions.clone(), 1926 hotplug_slots: self.hotplug_slots.clone(), 1927 next_memory_slot: self.next_memory_slot, 1928 selected_slot: self.selected_slot, 1929 next_hotplug_slot: self.next_hotplug_slot, 1930 } 1931 } 1932 1933 pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> { 1934 let mut memory_slot_fds = HashMap::new(); 1935 for guest_ram_mapping in &self.guest_ram_mappings { 1936 let slot = guest_ram_mapping.slot; 1937 let guest_memory = self.guest_memory.memory(); 1938 let file = guest_memory 1939 .find_region(GuestAddress(guest_ram_mapping.gpa)) 1940 .unwrap() 1941 .file_offset() 1942 .unwrap() 1943 .file(); 1944 memory_slot_fds.insert(slot, file.as_raw_fd()); 1945 } 1946 memory_slot_fds 1947 } 1948 1949 pub fn acpi_address(&self) -> Option<GuestAddress> { 1950 self.acpi_address 1951 } 1952 1953 pub fn num_guest_ram_mappings(&self) -> u32 { 1954 self.guest_ram_mappings.len() as u32 1955 } 1956 1957 #[cfg(target_arch = "aarch64")] 1958 pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 1959 self.uefi_flash.as_ref().unwrap().clone() 1960 } 1961 1962 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1963 pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions { 1964 let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone(); 1965 mapping_sorted_by_gpa.sort_by_key(|m| m.gpa); 1966 1967 let mut mem_offset_in_elf = mem_offset; 1968 let mut ram_maps = BTreeMap::new(); 1969 for mapping in mapping_sorted_by_gpa.iter() { 1970 ram_maps.insert( 1971 mapping.gpa, 1972 CoredumpMemoryRegion { 1973 mem_offset_in_elf, 1974 mem_size: mapping.size, 1975 }, 1976 ); 1977 mem_offset_in_elf += mapping.size; 1978 } 1979 1980 CoredumpMemoryRegions { ram_maps } 1981 } 1982 1983 #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] 1984 pub fn coredump_iterate_save_mem( 1985 &mut self, 1986 dump_state: &DumpState, 1987 ) -> std::result::Result<(), GuestDebuggableError> { 1988 let snapshot_memory_ranges = self 1989 .memory_range_table(false) 1990 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 1991 1992 if snapshot_memory_ranges.is_empty() { 1993 return Ok(()); 1994 } 1995 1996 let mut coredump_file = dump_state.file.as_ref().unwrap(); 1997 1998 let guest_memory = self.guest_memory.memory(); 1999 let mut total_bytes: u64 = 0; 2000 2001 for range in snapshot_memory_ranges.regions() { 2002 let mut offset: u64 = 0; 2003 loop { 2004 let bytes_written = guest_memory 2005 .write_to( 2006 GuestAddress(range.gpa + offset), 2007 &mut coredump_file, 2008 (range.length - offset) as usize, 2009 ) 2010 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2011 offset += bytes_written as u64; 2012 total_bytes += bytes_written as u64; 2013 2014 if offset == range.length { 2015 break; 2016 } 2017 } 2018 } 2019 2020 debug!("coredump total bytes {}", total_bytes); 2021 Ok(()) 2022 } 2023 2024 pub fn receive_memory_regions<F>( 2025 &mut self, 2026 ranges: &MemoryRangeTable, 2027 fd: &mut F, 2028 ) -> 
    where
        F: Read,
    {
        let guest_memory = self.guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't read
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of read_exact_from() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = mem
                    .read_from(
                        GuestAddress(range.gpa + offset),
                        fd,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| {
                        MigratableError::MigrateReceive(anyhow!(
                            "Error receiving memory from socket: {}",
                            e
                        ))
                    })?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }
}

struct MemoryNotify {
    slot_id: usize,
}

impl Aml for MemoryNotify {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        let object = aml::Path::new(&format!("M{:03}", self.slot_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.slot_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .append_aml_bytes(bytes)
    }
}

struct MemorySlot {
    slot_id: usize,
}

impl Aml for MemorySlot {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        aml::Device::new(
            format!("M{:03}", self.slot_id).as_str().into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0C80")),
                &aml::Name::new("_UID".into(), &self.slot_id),
                /*
                _STA return value:
                Bit [0] – Set if the device is present.
                Bit [1] – Set if the device is enabled and decoding its resources.
                Bit [2] – Set if the device should be shown in the UI.
                Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                Bit [4] – Set if the battery is present.
                Bits [31:5] – Reserved (must be cleared).
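                A value of 0xf (bits [3:0] set) therefore means present, enabled,
                shown in the UI and functioning, which is what MSTA reports for an
                enabled slot.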
                */
                &aml::Method::new(
                    "_STA".into(),
                    0,
                    false,
                    // Call into MSTA method which will interrogate device
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MSTA".into(),
                        vec![&self.slot_id],
                    ))],
                ),
                // Get details of memory
                &aml::Method::new(
                    "_CRS".into(),
                    0,
                    false,
                    // Call into MCRS which provides actual memory details
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MCRS".into(),
                        vec![&self.slot_id],
                    ))],
                ),
            ],
        )
        .append_aml_bytes(bytes)
    }
}

struct MemorySlots {
    slots: usize,
}

impl Aml for MemorySlots {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        for slot_id in 0..self.slots {
            MemorySlot { slot_id }.append_aml_bytes(bytes);
        }
    }
}

struct MemoryMethods {
    slots: usize,
}

impl Aml for MemoryMethods {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        // Add "MTFY" notification method
        let mut memory_notifies = Vec::new();
        for slot_id in 0..self.slots {
            memory_notifies.push(MemoryNotify { slot_id });
        }

        let mut memory_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
        for memory_notifier in memory_notifies.iter() {
            memory_notifies_refs.push(memory_notifier);
        }

        aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).append_aml_bytes(bytes);

        // MSCN method
        aml::Method::new(
            "MSCN".into(),
            0,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                &aml::While::new(
                    &aml::LessThan::new(&aml::Local(0), &self.slots),
                    vec![
                        // Write slot number (in first argument) to I/O port via field
                        &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
                        // Check if MINS bit is set (inserting)
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            // Notify device if it is
                            vec![
                                &aml::MethodCall::new(
                                    "MTFY".into(),
                                    vec![&aml::Local(0), &aml::ONE],
                                ),
                                // Reset MINS bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            ],
                        ),
                        // Check if MRMV bit is set
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            // Notify device if it is (with the eject constant 0x3)
                            vec![
                                &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
                                // Reset MRMV bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            ],
                        ),
                        &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                    ],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
            ],
        )
        .append_aml_bytes(bytes);

        // Memory status method
        aml::Method::new(
            "MSTA".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                &aml::If::new(
                    &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
                    vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
                // Return 0 or 0xf
                &aml::Return::new(&aml::Local(0)),
            ],
        )
        .append_aml_bytes(bytes);
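        // Note on the MCRS method below: MR64 is a QWORD Address Space
        // Descriptor, so within the resource buffer byte offset 14 holds the
        // range minimum, 22 the range maximum and 38 the range length, with the
        // *H fields created below overlaying the upper 32 bits of each value.
        // MCRS fills these in from the MHPC base/length registers of the slot
        // selected through MSEL and returns the patched template.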
        // Memory range method
        aml::Method::new(
            "MCRS".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Name::new(
                    "MR64".into(),
                    &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                        aml::AddressSpaceCachable::Cacheable,
                        true,
                        0x0000_0000_0000_0000u64,
                        0xFFFF_FFFF_FFFF_FFFEu64,
                    )]),
                ),
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &14usize, "MINL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &18usize, "MINH".into()),
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &22usize, "MAXL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &26usize, "MAXH".into()),
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &38usize, "LENL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &42usize, "LENH".into()),
                &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
                &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
                &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
                &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
                &aml::Add::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MINL"),
                    &aml::Path::new("LENL"),
                ),
                &aml::Add::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MINH"),
                    &aml::Path::new("LENH"),
                ),
                &aml::If::new(
                    &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
                    vec![&aml::Add::new(
                        &aml::Path::new("MAXH"),
                        &aml::ONE,
                        &aml::Path::new("MAXH"),
                    )],
                ),
                &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
                // Release lock
                &aml::Release::new("MLCK".into()),
                &aml::Return::new(&aml::Path::new("MR64")),
            ],
        )
        .append_aml_bytes(bytes)
    }
}

impl Aml for MemoryManager {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        if let Some(acpi_address) = self.acpi_address {
            // Memory Hotplug Controller
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose slot and then read back status
                    &aml::Mutex::new("MLCK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCachable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "MHPR".into(),
                        aml::OpRegionSpace::SystemMemory,
                        acpi_address.0 as usize,
                        MEMORY_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
                            aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Reserved(128),
                            aml::FieldEntry::Named(*b"MHPX", 32), // PXM
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(160),
                            aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
                            aml::FieldEntry::Named(*b"MINS", 1), // Inserting
                            aml::FieldEntry::Named(*b"MRMV", 1), // Removing
                            aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MSEL", 32), // Selector
                            aml::FieldEntry::Named(*b"MOEV", 32), // Event
                            aml::FieldEntry::Named(*b"MOSC", 32), // OSC
                        ],
                    ),
                    &MemoryMethods {
                        slots: self.hotplug_slots.len(),
                    },
                    &MemorySlots {
                        slots: self.hotplug_slots.len(),
                    },
                ],
            )
            .append_aml_bytes(bytes);
        } else {
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Empty MSCN for GED
                    &aml::Method::new("MSCN".into(), 0, true, vec![]),
                ],
            )
            .append_aml_bytes(bytes);
        }

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_region) = &self.sgx_epc_region {
                let min = sgx_epc_region.start().raw_value();
                let max = min + sgx_epc_region.size() - 1;
                // SGX EPC region
                aml::Device::new(
                    "_SB_.EPC_".into(),
                    vec![
                        &aml::Name::new("_HID".into(), &aml::EisaName::new("INT0E0C")),
                        // QWORD describing the EPC region start and size
                        &aml::Name::new(
                            "_CRS".into(),
                            &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                                aml::AddressSpaceCachable::NotCacheable,
                                true,
                                min,
                                max,
                            )]),
                        ),
                        &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
                    ],
                )
                .append_aml_bytes(bytes);
            }
        }
    }
}

impl Pausable for MemoryManager {}

#[derive(Clone, Serialize, Deserialize, Versionize)]
pub struct MemoryManagerSnapshotData {
    memory_ranges: MemoryRangeTable,
    guest_ram_mappings: Vec<GuestRamMapping>,
    start_of_device_area: u64,
    boot_ram: u64,
    current_ram: u64,
    arch_mem_regions: Vec<ArchMemRegion>,
    hotplug_slots: Vec<HotPlugState>,
    next_memory_slot: u32,
    selected_slot: usize,
    next_hotplug_slot: usize,
}

impl VersionMapped for MemoryManagerSnapshotData {}

impl Snapshottable for MemoryManager {
    fn id(&self) -> String {
        MEMORY_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
        let memory_ranges = self.memory_range_table(true)?;

        // Store locally this list of ranges as it will be used through the
        // Transportable::send() implementation. The point is to avoid the
        // duplication of code regarding the creation of the path for each
        // region. The 'snapshot' step creates the list of memory regions,
        // including information about the need to copy a memory region or
        // not. This saves the 'send' step having to go through the same
        // process, and instead it can directly proceed with storing the
        // memory range content for the ranges requiring it.
        self.snapshot_memory_ranges = memory_ranges;

        Ok(Snapshot::from_data(SnapshotData::new_from_versioned_state(
            &self.snapshot_data(),
        )?))
    }
}

impl Transportable for MemoryManager {
    fn send(
        &self,
        _snapshot: &Snapshot,
        destination_url: &str,
    ) -> result::Result<(), MigratableError> {
        if self.snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let mut memory_file_path = url_to_path(destination_url)?;
        memory_file_path.push(String::from(SNAPSHOT_FILENAME));

        // Create the snapshot file for the entire memory
        let mut memory_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(memory_file_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let guest_memory = self.guest_memory.memory();

        for range in self.snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't write
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of write_all_to() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = guest_memory
                    .write_to(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }
        Ok(())
    }
}

impl Migratable for MemoryManager {
    // Start the dirty log in the hypervisor (kvm/mshv).
    // Also, reset the dirty bitmap logged by the vmm.
    // Just before we do a bulk copy we want to start/clear the dirty log so that
    // pages touched during our bulk copy are tracked.
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.start_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
        })?;

        for r in self.guest_memory.memory().iter() {
            r.bitmap().reset();
        }

        Ok(())
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.stop_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
        })?;

        Ok(())
    }

    // Generate a table for the pages that are dirty. The dirty pages are collapsed
    // together in the table if they are contiguous.
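    // For example, if the 4 KiB pages at GPA 0x1000, 0x2000 and 0x3000 are all
    // dirty, they are reported as a single MemoryRange { gpa: 0x1000, length: 0x3000 }.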
    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();
        for r in &self.guest_ram_mappings {
            let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
                MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
            })?;
            let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
            {
                Some(region) => {
                    assert!(region.start_addr().raw_value() == r.gpa);
                    assert!(region.len() == r.size);
                    region.bitmap().get_and_reset()
                }
                None => {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Error finding 'guest memory region' with address {:x}",
                        r.gpa
                    )))
                }
            };

            let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
                .iter()
                .zip(vmm_dirty_bitmap.iter())
                .map(|(x, y)| x | y)
                .collect();

            let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);

            if sub_table.regions().is_empty() {
                info!("Dirty Memory Range Table is empty");
            } else {
                info!("Dirty Memory Range Table:");
                for range in sub_table.regions() {
                    info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
                }
            }

            table.extend(sub_table);
        }
        Ok(table)
    }
}
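
// A minimal sketch of the collapsing behavior relied upon by dirty_log() above.
// It assumes MemoryRangeTable::from_bitmap() treats bit 0 of the first word as
// the first 4 KiB page at the given GPA and merges adjacent dirty pages into a
// single range, which is how it is used in this file.
#[cfg(test)]
mod memory_range_table_example {
    use super::*;

    #[test]
    fn contiguous_dirty_pages_collapse_into_one_range() {
        // Pages 0, 1 and 2 (bits 0-2) are dirty, starting at GPA 0x1000.
        let dirty_bitmap: Vec<u64> = vec![0b111];
        let table = MemoryRangeTable::from_bitmap(dirty_bitmap, 0x1000, 4096);

        assert_eq!(table.regions().len(), 1);
        assert_eq!(table.regions()[0].gpa, 0x1000);
        assert_eq!(table.regions()[0].length, 3 * 4096);
    }
}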