1 // Copyright © 2019 Intel Corporation 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 // 5 #[cfg(target_arch = "x86_64")] 6 use crate::config::SgxEpcConfig; 7 use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig}; 8 #[cfg(feature = "guest_debug")] 9 use crate::coredump::{CoredumpMemoryRegion, CoredumpMemoryRegions}; 10 #[cfg(feature = "guest_debug")] 11 use crate::coredump::{DumpState, GuestDebuggableError}; 12 use crate::migration::url_to_path; 13 use crate::MEMORY_MANAGER_SNAPSHOT_ID; 14 use crate::{GuestMemoryMmap, GuestRegionMmap}; 15 use acpi_tables::{aml, aml::Aml}; 16 use anyhow::anyhow; 17 #[cfg(target_arch = "x86_64")] 18 use arch::x86_64::{SgxEpcRegion, SgxEpcSection}; 19 use arch::{layout, RegionType}; 20 #[cfg(target_arch = "x86_64")] 21 use devices::ioapic; 22 #[cfg(target_arch = "aarch64")] 23 use hypervisor::HypervisorVmError; 24 #[cfg(target_arch = "x86_64")] 25 use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE}; 26 use serde::{Deserialize, Serialize}; 27 #[cfg(feature = "guest_debug")] 28 use std::collections::BTreeMap; 29 use std::collections::HashMap; 30 use std::convert::TryInto; 31 use std::ffi; 32 use std::fs::{File, OpenOptions}; 33 use std::io::{self, Read}; 34 use std::ops::Deref; 35 use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; 36 use std::path::PathBuf; 37 use std::result; 38 use std::sync::{Arc, Barrier, Mutex}; 39 use tracer::trace_scoped; 40 use versionize::{VersionMap, Versionize, VersionizeResult}; 41 use versionize_derive::Versionize; 42 use virtio_devices::BlocksState; 43 #[cfg(target_arch = "x86_64")] 44 use vm_allocator::GsiApic; 45 use vm_allocator::{AddressAllocator, SystemAllocator}; 46 use vm_device::BusDevice; 47 use vm_memory::bitmap::AtomicBitmap; 48 use vm_memory::guest_memory::FileOffset; 49 use vm_memory::{ 50 mmap::MmapRegionError, Address, Bytes, Error as MmapError, GuestAddress, GuestAddressSpace, 51 GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion, 52 }; 53 use vm_migration::{ 54 protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable, 55 Snapshot, SnapshotDataSection, Snapshottable, Transportable, VersionMapped, 56 }; 57 58 pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18; 59 60 const DEFAULT_MEMORY_ZONE: &str = "mem0"; 61 62 const SNAPSHOT_FILENAME: &str = "memory-ranges"; 63 64 #[cfg(target_arch = "x86_64")] 65 const X86_64_IRQ_BASE: u32 = 5; 66 67 #[cfg(target_arch = "x86_64")] 68 const SGX_PAGE_SIZE: u64 = 1 << 12; 69 70 const HOTPLUG_COUNT: usize = 8; 71 72 // Memory policy constants 73 const MPOL_BIND: u32 = 2; 74 const MPOL_MF_STRICT: u32 = 1; 75 const MPOL_MF_MOVE: u32 = 1 << 1; 76 77 // Reserve 1 MiB for platform MMIO devices (e.g. 
ACPI control devices) 78 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20; 79 80 #[derive(Clone, Default, Serialize, Deserialize, Versionize)] 81 struct HotPlugState { 82 base: u64, 83 length: u64, 84 active: bool, 85 inserting: bool, 86 removing: bool, 87 } 88 89 pub struct VirtioMemZone { 90 region: Arc<GuestRegionMmap>, 91 virtio_device: Option<Arc<Mutex<virtio_devices::Mem>>>, 92 hotplugged_size: u64, 93 hugepages: bool, 94 blocks_state: Arc<Mutex<BlocksState>>, 95 } 96 97 impl VirtioMemZone { 98 pub fn region(&self) -> &Arc<GuestRegionMmap> { 99 &self.region 100 } 101 pub fn set_virtio_device(&mut self, virtio_device: Arc<Mutex<virtio_devices::Mem>>) { 102 self.virtio_device = Some(virtio_device); 103 } 104 pub fn hotplugged_size(&self) -> u64 { 105 self.hotplugged_size 106 } 107 pub fn hugepages(&self) -> bool { 108 self.hugepages 109 } 110 pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> { 111 &self.blocks_state 112 } 113 pub fn plugged_ranges(&self) -> MemoryRangeTable { 114 self.blocks_state 115 .lock() 116 .unwrap() 117 .memory_ranges(self.region.start_addr().raw_value(), true) 118 } 119 } 120 121 #[derive(Default)] 122 pub struct MemoryZone { 123 regions: Vec<Arc<GuestRegionMmap>>, 124 virtio_mem_zone: Option<VirtioMemZone>, 125 } 126 127 impl MemoryZone { 128 pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> { 129 &self.regions 130 } 131 pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> { 132 &self.virtio_mem_zone 133 } 134 pub fn virtio_mem_zone_mut(&mut self) -> Option<&mut VirtioMemZone> { 135 self.virtio_mem_zone.as_mut() 136 } 137 } 138 139 pub type MemoryZones = HashMap<String, MemoryZone>; 140 141 #[derive(Clone, Serialize, Deserialize, Versionize)] 142 struct GuestRamMapping { 143 slot: u32, 144 gpa: u64, 145 size: u64, 146 zone_id: String, 147 virtio_mem: bool, 148 file_offset: u64, 149 } 150 151 #[derive(Clone, Serialize, Deserialize, Versionize)] 152 struct ArchMemRegion { 153 base: u64, 154 size: usize, 155 r_type: RegionType, 156 } 157 158 pub struct MemoryManager { 159 boot_guest_memory: GuestMemoryMmap, 160 guest_memory: GuestMemoryAtomic<GuestMemoryMmap>, 161 next_memory_slot: u32, 162 start_of_device_area: GuestAddress, 163 end_of_device_area: GuestAddress, 164 end_of_ram_area: GuestAddress, 165 pub vm: Arc<dyn hypervisor::Vm>, 166 hotplug_slots: Vec<HotPlugState>, 167 selected_slot: usize, 168 mergeable: bool, 169 allocator: Arc<Mutex<SystemAllocator>>, 170 hotplug_method: HotplugMethod, 171 boot_ram: u64, 172 current_ram: u64, 173 next_hotplug_slot: usize, 174 shared: bool, 175 hugepages: bool, 176 hugepage_size: Option<u64>, 177 prefault: bool, 178 thp: bool, 179 #[cfg(target_arch = "x86_64")] 180 sgx_epc_region: Option<SgxEpcRegion>, 181 user_provided_zones: bool, 182 snapshot_memory_ranges: MemoryRangeTable, 183 memory_zones: MemoryZones, 184 log_dirty: bool, // Enable dirty logging for created RAM regions 185 arch_mem_regions: Vec<ArchMemRegion>, 186 ram_allocator: AddressAllocator, 187 dynamic: bool, 188 189 // Keep track of calls to create_userspace_mapping() for guest RAM. 190 // This is useful for getting the dirty pages as we need to know the 191 // slots that the mapping is created in. 192 guest_ram_mappings: Vec<GuestRamMapping>, 193 194 pub acpi_address: Option<GuestAddress>, 195 #[cfg(target_arch = "aarch64")] 196 uefi_flash: Option<GuestMemoryAtomic<GuestMemoryMmap>>, 197 } 198 199 #[derive(Debug)] 200 pub enum Error { 201 /// Failed to create shared file. 202 SharedFileCreate(io::Error), 203 204 /// Failed to set shared file length. 
    SharedFileSetLen(io::Error),

    /// Mmap backed guest memory error
    GuestMemory(MmapError),

    /// Failed to allocate a memory range.
    MemoryRangeAllocation,

    /// Error from region creation
    GuestMemoryRegion(MmapRegionError),

    /// No ACPI slot available
    NoSlotAvailable,

    /// Not enough space in the hotplug RAM region
    InsufficientHotplugRam,

    /// The requested hotplug memory addition is not a valid size
    InvalidSize,

    /// Failed to create the user memory region.
    CreateUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to remove the user memory region.
    RemoveUserMemoryRegion(hypervisor::HypervisorVmError),

    /// Failed to create EventFd.
    EventFdFail(io::Error),

    /// Eventfd write error
    EventfdError(io::Error),

    /// Failed to resize virtio-mem
    VirtioMemResizeFail(virtio_devices::mem::Error),

    /// Cannot restore VM
    Restore(MigratableError),

    /// Cannot restore VM because source URL is missing
    RestoreMissingSourceUrl,

    /// Cannot create the system allocator
    CreateSystemAllocator,

    /// Invalid SGX EPC section size
    #[cfg(target_arch = "x86_64")]
    EpcSectionSizeInvalid,

    /// Failed allocating SGX EPC region
    #[cfg(target_arch = "x86_64")]
    SgxEpcRangeAllocation,

    /// Failed opening SGX virtual EPC device
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcOpen(io::Error),

    /// Failed setting the SGX virtual EPC section size
    #[cfg(target_arch = "x86_64")]
    SgxVirtEpcFileSetLen(io::Error),

    /// Failed opening SGX provisioning device
    #[cfg(target_arch = "x86_64")]
    SgxProvisionOpen(io::Error),

    /// Failed enabling SGX provisioning
    #[cfg(target_arch = "x86_64")]
    SgxEnableProvisioning(hypervisor::HypervisorVmError),

    /// Failed creating a new MmapRegion instance.
    #[cfg(target_arch = "x86_64")]
    NewMmapRegion(vm_memory::mmap::MmapRegionError),

    /// No memory zones found.
    MissingMemoryZones,

    /// Memory configuration is not valid.
    InvalidMemoryParameters,

    /// Forbidden operation. Impossible to resize guest memory if it is
    /// backed by user defined memory regions.
    InvalidResizeWithMemoryZones,

    /// It's invalid to try applying a NUMA policy to a memory zone that is
    /// memory mapped with MAP_SHARED.
    InvalidSharedMemoryZoneWithHostNuma,

    /// Failed applying NUMA memory policy.
    ApplyNumaPolicy(io::Error),

    /// Memory zone identifier is not unique.
    DuplicateZoneId,

    /// No virtio-mem resizing handler found.
    MissingVirtioMemHandler,

    /// Unknown memory zone.
    UnknownMemoryZone,

    /// Invalid size for resizing. Can be any value except 0.
    InvalidHotplugSize,

    /// Invalid hotplug method associated with memory zones resizing capability.
    InvalidHotplugMethodWithMemoryZones,

    /// Could not find the specified memory zone identifier in the hash map.
    MissingZoneIdentifier,

    /// Resizing the memory zone failed.
313 ResizeZone, 314 315 /// Guest address overflow 316 GuestAddressOverFlow, 317 318 /// Error opening snapshot file 319 SnapshotOpen(io::Error), 320 321 // Error copying snapshot into region 322 SnapshotCopy(GuestMemoryError), 323 324 /// Failed to allocate MMIO address 325 AllocateMmioAddress, 326 327 #[cfg(target_arch = "aarch64")] 328 /// Failed to create UEFI flash 329 CreateUefiFlash(HypervisorVmError), 330 } 331 332 const ENABLE_FLAG: usize = 0; 333 const INSERTING_FLAG: usize = 1; 334 const REMOVING_FLAG: usize = 2; 335 const EJECT_FLAG: usize = 3; 336 337 const BASE_OFFSET_LOW: u64 = 0; 338 const BASE_OFFSET_HIGH: u64 = 0x4; 339 const LENGTH_OFFSET_LOW: u64 = 0x8; 340 const LENGTH_OFFSET_HIGH: u64 = 0xC; 341 const STATUS_OFFSET: u64 = 0x14; 342 const SELECTION_OFFSET: u64 = 0; 343 344 // The MMIO address space size is subtracted with 64k. This is done for the 345 // following reasons: 346 // - Reduce the addressable space size by at least 4k to workaround a Linux 347 // bug when the VMM allocates devices at the end of the addressable space 348 // - Windows requires the addressable space size to be 64k aligned 349 fn mmio_address_space_size(phys_bits: u8) -> u64 { 350 (1 << phys_bits) - (1 << 16) 351 } 352 353 impl BusDevice for MemoryManager { 354 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 355 if self.selected_slot < self.hotplug_slots.len() { 356 let state = &self.hotplug_slots[self.selected_slot]; 357 match offset { 358 BASE_OFFSET_LOW => { 359 data.copy_from_slice(&state.base.to_le_bytes()[..4]); 360 } 361 BASE_OFFSET_HIGH => { 362 data.copy_from_slice(&state.base.to_le_bytes()[4..]); 363 } 364 LENGTH_OFFSET_LOW => { 365 data.copy_from_slice(&state.length.to_le_bytes()[..4]); 366 } 367 LENGTH_OFFSET_HIGH => { 368 data.copy_from_slice(&state.length.to_le_bytes()[4..]); 369 } 370 STATUS_OFFSET => { 371 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 
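                    // Descriptive note: the buffer is cleared first, then this slot's
                    // state is reported using the bit positions defined above:
                    // bit 0 (ENABLE_FLAG) when the slot is active, bit 1 (INSERTING_FLAG)
                    // while an insertion is pending and bit 2 (REMOVING_FLAG) while a
                    // removal is pending.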
372 data.fill(0); 373 if state.active { 374 data[0] |= 1 << ENABLE_FLAG; 375 } 376 if state.inserting { 377 data[0] |= 1 << INSERTING_FLAG; 378 } 379 if state.removing { 380 data[0] |= 1 << REMOVING_FLAG; 381 } 382 } 383 _ => { 384 warn!( 385 "Unexpected offset for accessing memory manager device: {:#}", 386 offset 387 ); 388 } 389 } 390 } else { 391 warn!("Out of range memory slot: {}", self.selected_slot); 392 } 393 } 394 395 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 396 match offset { 397 SELECTION_OFFSET => { 398 self.selected_slot = usize::from(data[0]); 399 } 400 STATUS_OFFSET => { 401 if self.selected_slot < self.hotplug_slots.len() { 402 let state = &mut self.hotplug_slots[self.selected_slot]; 403 // The ACPI code writes back a 1 to acknowledge the insertion 404 if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting { 405 state.inserting = false; 406 } 407 // Ditto for removal 408 if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing { 409 state.removing = false; 410 } 411 // Trigger removal of "DIMM" 412 if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG { 413 warn!("Ejection of memory not currently supported"); 414 } 415 } else { 416 warn!("Out of range memory slot: {}", self.selected_slot); 417 } 418 } 419 _ => { 420 warn!( 421 "Unexpected offset for accessing memory manager device: {:#}", 422 offset 423 ); 424 } 425 }; 426 None 427 } 428 } 429 430 impl MemoryManager { 431 /// Creates all memory regions based on the available RAM ranges defined 432 /// by `ram_regions`, and based on the description of the memory zones. 433 /// In practice, this function can perform multiple memory mappings of the 434 /// same backing file if there's a hole in the address space between two 435 /// RAM ranges. 436 /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G) 437 /// and zones containing two zones (size 1G and size 4G). 438 /// This function will create 3 resulting memory regions: 439 /// - First one mapping entirely the first memory zone on 0-1G range 440 /// - Second one mapping partially the second memory zone on 1G-3G range 441 /// - Third one mapping partially the second memory zone on 4G-6G range 442 fn create_memory_regions_from_zones( 443 ram_regions: &[(GuestAddress, usize)], 444 zones: &[MemoryZoneConfig], 445 prefault: Option<bool>, 446 thp: bool, 447 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 448 let mut zones = zones.to_owned(); 449 let mut mem_regions = Vec::new(); 450 let mut zone = zones.remove(0); 451 let mut zone_offset = 0; 452 let mut memory_zones = HashMap::new(); 453 454 // Add zone id to the list of memory zones. 
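        // The first zone was already popped off `zones` above; subsequent zones
        // are popped inside the loop below, each time the current zone has been
        // fully mapped onto the available RAM ranges.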
        memory_zones.insert(zone.id.clone(), MemoryZone::default());

        for ram_region in ram_regions.iter() {
            let mut ram_region_offset = 0;
            let mut exit = false;

            loop {
                let mut ram_region_consumed = false;
                let mut pull_next_zone = false;

                let ram_region_sub_size = ram_region.1 - ram_region_offset;
                let zone_sub_size = zone.size as usize - zone_offset;

                let file_offset = zone_offset as u64;
                let region_start = ram_region
                    .0
                    .checked_add(ram_region_offset as u64)
                    .ok_or(Error::GuestAddressOverFlow)?;
                let region_size = if zone_sub_size <= ram_region_sub_size {
                    if zone_sub_size == ram_region_sub_size {
                        ram_region_consumed = true;
                    }

                    ram_region_offset += zone_sub_size;
                    pull_next_zone = true;

                    zone_sub_size
                } else {
                    zone_offset += ram_region_sub_size;
                    ram_region_consumed = true;

                    ram_region_sub_size
                };

                let region = MemoryManager::create_ram_region(
                    &zone.file,
                    file_offset,
                    region_start,
                    region_size,
                    match prefault {
                        Some(pf) => pf,
                        None => zone.prefault,
                    },
                    zone.shared,
                    zone.hugepages,
                    zone.hugepage_size,
                    zone.host_numa_node,
                    None,
                    thp,
                )?;

                // Add region to the list of regions associated with the
                // current memory zone.
                if let Some(memory_zone) = memory_zones.get_mut(&zone.id) {
                    memory_zone.regions.push(region.clone());
                }

                mem_regions.push(region);

                if pull_next_zone {
                    // Get the next zone and reset the offset.
                    zone_offset = 0;
                    if zones.is_empty() {
                        exit = true;
                        break;
                    }
                    zone = zones.remove(0);

                    // Check if the zone id already exists. If it does, return
                    // an error as identifiers must be unique. Otherwise, add
                    // the new zone id to the list of memory zones.
                    if memory_zones.contains_key(&zone.id) {
                        error!(
                            "Memory zone identifier '{}' found more than once. \
                            It must be unique",
                            zone.id,
                        );
                        return Err(Error::DuplicateZoneId);
                    }
                    memory_zones.insert(zone.id.clone(), MemoryZone::default());
                }

                if ram_region_consumed {
                    break;
                }
            }

            if exit {
                break;
            }
        }

        Ok((mem_regions, memory_zones))
    }

    // Restore both GuestMemory regions along with MemoryZone zones.
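    // Each GuestRamMapping recorded at creation time carries the zone id, the
    // GPA, the size and the offset into the zone's backing file, which is
    // enough to recreate the same region layout (including virtio-mem regions)
    // when restoring from a snapshot.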
551 fn restore_memory_regions_and_zones( 552 guest_ram_mappings: &[GuestRamMapping], 553 zones_config: &[MemoryZoneConfig], 554 prefault: Option<bool>, 555 mut existing_memory_files: HashMap<u32, File>, 556 thp: bool, 557 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 558 let mut memory_regions = Vec::new(); 559 let mut memory_zones = HashMap::new(); 560 561 for zone_config in zones_config { 562 memory_zones.insert(zone_config.id.clone(), MemoryZone::default()); 563 } 564 565 for guest_ram_mapping in guest_ram_mappings { 566 for zone_config in zones_config { 567 if guest_ram_mapping.zone_id == zone_config.id { 568 let region = MemoryManager::create_ram_region( 569 &zone_config.file, 570 guest_ram_mapping.file_offset, 571 GuestAddress(guest_ram_mapping.gpa), 572 guest_ram_mapping.size as usize, 573 match prefault { 574 Some(pf) => pf, 575 None => zone_config.prefault, 576 }, 577 zone_config.shared, 578 zone_config.hugepages, 579 zone_config.hugepage_size, 580 zone_config.host_numa_node, 581 existing_memory_files.remove(&guest_ram_mapping.slot), 582 thp, 583 )?; 584 memory_regions.push(Arc::clone(®ion)); 585 if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) { 586 if guest_ram_mapping.virtio_mem { 587 let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0); 588 let region_size = region.len(); 589 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 590 region, 591 virtio_device: None, 592 hotplugged_size, 593 hugepages: zone_config.hugepages, 594 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 595 }); 596 } else { 597 memory_zone.regions.push(region); 598 } 599 } 600 } 601 } 602 } 603 604 memory_regions.sort_by_key(|x| x.start_addr()); 605 606 Ok((memory_regions, memory_zones)) 607 } 608 609 fn fill_saved_regions( 610 &mut self, 611 file_path: PathBuf, 612 saved_regions: MemoryRangeTable, 613 ) -> Result<(), Error> { 614 if saved_regions.is_empty() { 615 return Ok(()); 616 } 617 618 // Open (read only) the snapshot file. 619 let mut memory_file = OpenOptions::new() 620 .read(true) 621 .open(file_path) 622 .map_err(Error::SnapshotOpen)?; 623 624 let guest_memory = self.guest_memory.memory(); 625 for range in saved_regions.regions() { 626 let mut offset: u64 = 0; 627 // Here we are manually handling the retry in case we can't write 628 // the whole region at once because we can't use the implementation 629 // from vm-memory::GuestMemory of read_exact_from() as it is not 630 // following the correct behavior. 
            // For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = guest_memory
                    .read_from(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(Error::SnapshotCopy)?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }

    fn validate_memory_config(
        config: &MemoryConfig,
        user_provided_zones: bool,
    ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> {
        let mut allow_mem_hotplug = false;

        if !user_provided_zones {
            if config.zones.is_some() {
                error!(
                    "User defined memory regions can't be provided if the \
                    memory size is not 0"
                );
                return Err(Error::InvalidMemoryParameters);
            }

            if config.hotplug_size.is_some() {
                allow_mem_hotplug = true;
            }

            if let Some(hotplugged_size) = config.hotplugged_size {
                if let Some(hotplug_size) = config.hotplug_size {
                    if hotplugged_size > hotplug_size {
                        error!(
                            "'hotplugged_size' {} can't be bigger than \
                            'hotplug_size' {}",
                            hotplugged_size, hotplug_size,
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                } else {
                    error!(
                        "Invalid to define 'hotplugged_size' when there is \
                        no 'hotplug_size'"
                    );
                    return Err(Error::InvalidMemoryParameters);
                }
                if config.hotplug_method == HotplugMethod::Acpi {
                    error!(
                        "Invalid to define 'hotplugged_size' with hotplug \
                        method 'acpi'"
                    );
                    return Err(Error::InvalidMemoryParameters);
                }
            }

            // Create a single zone from the global memory config. This lets
            // us reuse the codepath for user defined memory zones.
            let zones = vec![MemoryZoneConfig {
                id: String::from(DEFAULT_MEMORY_ZONE),
                size: config.size,
                file: None,
                shared: config.shared,
                hugepages: config.hugepages,
                hugepage_size: config.hugepage_size,
                host_numa_node: None,
                hotplug_size: config.hotplug_size,
                hotplugged_size: config.hotplugged_size,
                prefault: config.prefault,
            }];

            Ok((config.size, zones, allow_mem_hotplug))
        } else {
            if config.zones.is_none() {
                error!(
                    "User defined memory regions must be provided if the \
                    memory size is 0"
                );
                return Err(Error::MissingMemoryZones);
            }

            // Safe to unwrap as we checked right above there were some
            // regions.
            let zones = config.zones.clone().unwrap();
            if zones.is_empty() {
                return Err(Error::MissingMemoryZones);
            }

            let mut total_ram_size: u64 = 0;
            for zone in zones.iter() {
                total_ram_size += zone.size;

                if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() {
                    error!(
                        "Invalid to set host NUMA policy for a memory zone \
                        backed by a regular file and mapped as 'shared'"
                    );
                    return Err(Error::InvalidSharedMemoryZoneWithHostNuma);
                }

                if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi {
                    error!("Invalid to set ACPI hotplug method for memory zones");
                    return Err(Error::InvalidHotplugMethodWithMemoryZones);
                }

                if let Some(hotplugged_size) = zone.hotplugged_size {
                    if let Some(hotplug_size) = zone.hotplug_size {
                        if hotplugged_size > hotplug_size {
                            error!(
                                "'hotplugged_size' {} can't be bigger than \
                                'hotplug_size' {}",
                                hotplugged_size, hotplug_size,
                            );
                            return Err(Error::InvalidMemoryParameters);
                        }
                    } else {
                        error!(
                            "Invalid to define 'hotplugged_size' when there is \
                            no 'hotplug_size' for a memory zone"
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                    if config.hotplug_method == HotplugMethod::Acpi {
                        error!(
                            "Invalid to define 'hotplugged_size' with hotplug \
                            method 'acpi'"
                        );
                        return Err(Error::InvalidMemoryParameters);
                    }
                }
            }

            Ok((total_ram_size, zones, allow_mem_hotplug))
        }
    }

    fn allocate_address_space(&mut self) -> Result<(), Error> {
        let mut list = Vec::new();

        for (zone_id, memory_zone) in self.memory_zones.iter() {
            let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> =
                memory_zone
                    .regions()
                    .iter()
                    .map(|r| (r.clone(), false))
                    .collect();

            if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() {
                regions.push((virtio_mem_zone.region().clone(), true));
            }

            list.push((zone_id.clone(), regions));
        }

        for (zone_id, regions) in list {
            for (region, virtio_mem) in regions {
                let slot = self.create_userspace_mapping(
                    region.start_addr().raw_value(),
                    region.len(),
                    region.as_ptr() as u64,
                    self.mergeable,
                    false,
                    self.log_dirty,
                )?;

                let file_offset = if let Some(file_offset) = region.file_offset() {
                    file_offset.start()
                } else {
                    0
                };

                self.guest_ram_mappings.push(GuestRamMapping {
                    gpa: region.start_addr().raw_value(),
                    size: region.len(),
                    slot,
                    zone_id: zone_id.clone(),
                    virtio_mem,
                    file_offset,
                });
                self.ram_allocator
                    .allocate(Some(region.start_addr()), region.len(), None)
                    .ok_or(Error::MemoryRangeAllocation)?;
            }
        }

        // Allocate SubRegion and Reserved address ranges.
        for region in self.arch_mem_regions.iter() {
            if region.r_type == RegionType::Ram {
                // Ignore the RAM type since ranges have already been allocated
                // based on the GuestMemory regions.
                continue;
            }
            self.ram_allocator
                .allocate(
                    Some(GuestAddress(region.base)),
                    region.size as GuestUsize,
                    None,
                )
                .ok_or(Error::MemoryRangeAllocation)?;
        }

        Ok(())
    }

    #[cfg(target_arch = "aarch64")]
    fn add_uefi_flash(&mut self) -> Result<(), Error> {
        // On AArch64, the UEFI binary requires a flash device at address 0.
        // 4 MiB memory is mapped to simulate the flash.
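        // The flash is backed by an anonymous mapping of arch::layout::UEFI_SIZE
        // bytes which is registered with the hypervisor as a regular read-write
        // user memory region at arch::layout::UEFI_START.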
848 let uefi_mem_slot = self.allocate_memory_slot(); 849 let uefi_region = GuestRegionMmap::new( 850 MmapRegion::new(arch::layout::UEFI_SIZE as usize).unwrap(), 851 arch::layout::UEFI_START, 852 ) 853 .unwrap(); 854 let uefi_mem_region = self.vm.make_user_memory_region( 855 uefi_mem_slot, 856 uefi_region.start_addr().raw_value(), 857 uefi_region.len() as u64, 858 uefi_region.as_ptr() as u64, 859 false, 860 false, 861 ); 862 self.vm 863 .create_user_memory_region(uefi_mem_region) 864 .map_err(Error::CreateUefiFlash)?; 865 866 let uefi_flash = 867 GuestMemoryAtomic::new(GuestMemoryMmap::from_regions(vec![uefi_region]).unwrap()); 868 869 self.uefi_flash = Some(uefi_flash); 870 871 Ok(()) 872 } 873 874 #[allow(clippy::too_many_arguments)] 875 pub fn new( 876 vm: Arc<dyn hypervisor::Vm>, 877 config: &MemoryConfig, 878 prefault: Option<bool>, 879 phys_bits: u8, 880 #[cfg(feature = "tdx")] tdx_enabled: bool, 881 restore_data: Option<&MemoryManagerSnapshotData>, 882 existing_memory_files: Option<HashMap<u32, File>>, 883 #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>, 884 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 885 trace_scoped!("MemoryManager::new"); 886 887 let user_provided_zones = config.size == 0; 888 889 let mmio_address_space_size = mmio_address_space_size(phys_bits); 890 debug_assert_eq!( 891 (((mmio_address_space_size) >> 16) << 16), 892 mmio_address_space_size 893 ); 894 let start_of_platform_device_area = 895 GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE); 896 let end_of_device_area = start_of_platform_device_area.unchecked_sub(1); 897 898 let (ram_size, zones, allow_mem_hotplug) = 899 Self::validate_memory_config(config, user_provided_zones)?; 900 901 let ( 902 start_of_device_area, 903 boot_ram, 904 current_ram, 905 arch_mem_regions, 906 memory_zones, 907 guest_memory, 908 boot_guest_memory, 909 hotplug_slots, 910 next_memory_slot, 911 selected_slot, 912 next_hotplug_slot, 913 ) = if let Some(data) = restore_data { 914 let (regions, memory_zones) = Self::restore_memory_regions_and_zones( 915 &data.guest_ram_mappings, 916 &zones, 917 prefault, 918 existing_memory_files.unwrap_or_default(), 919 config.thp, 920 )?; 921 let guest_memory = 922 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?; 923 let boot_guest_memory = guest_memory.clone(); 924 ( 925 GuestAddress(data.start_of_device_area), 926 data.boot_ram, 927 data.current_ram, 928 data.arch_mem_regions.clone(), 929 memory_zones, 930 guest_memory, 931 boot_guest_memory, 932 data.hotplug_slots.clone(), 933 data.next_memory_slot, 934 data.selected_slot, 935 data.next_hotplug_slot, 936 ) 937 } else { 938 // Init guest memory 939 let arch_mem_regions = arch::arch_memory_regions(ram_size); 940 941 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions 942 .iter() 943 .filter(|r| r.2 == RegionType::Ram) 944 .map(|r| (r.0, r.1)) 945 .collect(); 946 947 let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions 948 .iter() 949 .map(|(a, b, c)| ArchMemRegion { 950 base: a.0, 951 size: *b, 952 r_type: *c, 953 }) 954 .collect(); 955 956 let (mem_regions, mut memory_zones) = 957 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault, config.thp)?; 958 959 let mut guest_memory = 960 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?; 961 962 let boot_guest_memory = guest_memory.clone(); 963 964 let mut start_of_device_area = 965 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?; 966 967 // Update list of 
memory zones for resize. 968 for zone in zones.iter() { 969 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 970 if let Some(hotplug_size) = zone.hotplug_size { 971 if hotplug_size == 0 { 972 error!("'hotplug_size' can't be 0"); 973 return Err(Error::InvalidHotplugSize); 974 } 975 976 if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi { 977 start_of_device_area = start_of_device_area 978 .checked_add(hotplug_size) 979 .ok_or(Error::GuestAddressOverFlow)?; 980 } else { 981 // Alignment must be "natural" i.e. same as size of block 982 let start_addr = GuestAddress( 983 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE 984 - 1) 985 / virtio_devices::VIRTIO_MEM_ALIGN_SIZE 986 * virtio_devices::VIRTIO_MEM_ALIGN_SIZE, 987 ); 988 989 // When `prefault` is set by vm_restore, memory manager 990 // will create ram region with `prefault` option in 991 // restore config rather than same option in zone 992 let region = MemoryManager::create_ram_region( 993 &None, 994 0, 995 start_addr, 996 hotplug_size as usize, 997 match prefault { 998 Some(pf) => pf, 999 None => zone.prefault, 1000 }, 1001 zone.shared, 1002 zone.hugepages, 1003 zone.hugepage_size, 1004 zone.host_numa_node, 1005 None, 1006 config.thp, 1007 )?; 1008 1009 guest_memory = guest_memory 1010 .insert_region(Arc::clone(®ion)) 1011 .map_err(Error::GuestMemory)?; 1012 1013 let hotplugged_size = zone.hotplugged_size.unwrap_or(0); 1014 let region_size = region.len(); 1015 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 1016 region, 1017 virtio_device: None, 1018 hotplugged_size, 1019 hugepages: zone.hugepages, 1020 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 1021 }); 1022 1023 start_of_device_area = start_addr 1024 .checked_add(hotplug_size) 1025 .ok_or(Error::GuestAddressOverFlow)?; 1026 } 1027 } 1028 } else { 1029 return Err(Error::MissingZoneIdentifier); 1030 } 1031 } 1032 1033 let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT); 1034 hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default); 1035 1036 ( 1037 start_of_device_area, 1038 ram_size, 1039 ram_size, 1040 arch_mem_regions, 1041 memory_zones, 1042 guest_memory, 1043 boot_guest_memory, 1044 hotplug_slots, 1045 0, 1046 0, 1047 0, 1048 ) 1049 }; 1050 1051 let guest_memory = GuestMemoryAtomic::new(guest_memory); 1052 1053 // Both MMIO and PIO address spaces start at address 0. 
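        // On x86_64 the PIO space covers ports 0x0000-0xffff, hence the
        // GuestAddress(0) base and the `1 << 16` size passed below; the platform
        // device area is the top PLATFORM_DEVICE_AREA_SIZE bytes of the space
        // computed by mmio_address_space_size(phys_bits).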
1054 let allocator = Arc::new(Mutex::new( 1055 SystemAllocator::new( 1056 #[cfg(target_arch = "x86_64")] 1057 { 1058 GuestAddress(0) 1059 }, 1060 #[cfg(target_arch = "x86_64")] 1061 { 1062 1 << 16 1063 }, 1064 start_of_platform_device_area, 1065 PLATFORM_DEVICE_AREA_SIZE, 1066 layout::MEM_32BIT_DEVICES_START, 1067 layout::MEM_32BIT_DEVICES_SIZE, 1068 #[cfg(target_arch = "x86_64")] 1069 vec![GsiApic::new( 1070 X86_64_IRQ_BASE, 1071 ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, 1072 )], 1073 ) 1074 .ok_or(Error::CreateSystemAllocator)?, 1075 )); 1076 1077 #[cfg(not(feature = "tdx"))] 1078 let dynamic = true; 1079 #[cfg(feature = "tdx")] 1080 let dynamic = !tdx_enabled; 1081 1082 let acpi_address = if dynamic 1083 && config.hotplug_method == HotplugMethod::Acpi 1084 && (config.hotplug_size.unwrap_or_default() > 0) 1085 { 1086 Some( 1087 allocator 1088 .lock() 1089 .unwrap() 1090 .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None) 1091 .ok_or(Error::AllocateMmioAddress)?, 1092 ) 1093 } else { 1094 None 1095 }; 1096 1097 // If running on SGX the start of device area and RAM area may diverge but 1098 // at this point they are next to each other. 1099 let end_of_ram_area = start_of_device_area.unchecked_sub(1); 1100 let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap(); 1101 1102 let mut memory_manager = MemoryManager { 1103 boot_guest_memory, 1104 guest_memory, 1105 next_memory_slot, 1106 start_of_device_area, 1107 end_of_device_area, 1108 end_of_ram_area, 1109 vm, 1110 hotplug_slots, 1111 selected_slot, 1112 mergeable: config.mergeable, 1113 allocator, 1114 hotplug_method: config.hotplug_method, 1115 boot_ram, 1116 current_ram, 1117 next_hotplug_slot, 1118 shared: config.shared, 1119 hugepages: config.hugepages, 1120 hugepage_size: config.hugepage_size, 1121 prefault: config.prefault, 1122 #[cfg(target_arch = "x86_64")] 1123 sgx_epc_region: None, 1124 user_provided_zones, 1125 snapshot_memory_ranges: MemoryRangeTable::default(), 1126 memory_zones, 1127 guest_ram_mappings: Vec::new(), 1128 acpi_address, 1129 log_dirty: dynamic, // Cannot log dirty pages on a TD 1130 arch_mem_regions, 1131 ram_allocator, 1132 dynamic, 1133 #[cfg(target_arch = "aarch64")] 1134 uefi_flash: None, 1135 thp: config.thp, 1136 }; 1137 1138 memory_manager.allocate_address_space()?; 1139 1140 #[cfg(target_arch = "aarch64")] 1141 memory_manager.add_uefi_flash()?; 1142 1143 #[cfg(target_arch = "x86_64")] 1144 if let Some(sgx_epc_config) = sgx_epc_config { 1145 memory_manager.setup_sgx(sgx_epc_config)?; 1146 } 1147 1148 Ok(Arc::new(Mutex::new(memory_manager))) 1149 } 1150 1151 pub fn new_from_snapshot( 1152 snapshot: &Snapshot, 1153 vm: Arc<dyn hypervisor::Vm>, 1154 config: &MemoryConfig, 1155 source_url: Option<&str>, 1156 prefault: bool, 1157 phys_bits: u8, 1158 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 1159 if let Some(source_url) = source_url { 1160 let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?; 1161 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 1162 1163 let mem_snapshot: MemoryManagerSnapshotData = snapshot 1164 .to_versioned_state(MEMORY_MANAGER_SNAPSHOT_ID) 1165 .map_err(Error::Restore)?; 1166 1167 let mm = MemoryManager::new( 1168 vm, 1169 config, 1170 Some(prefault), 1171 phys_bits, 1172 #[cfg(feature = "tdx")] 1173 false, 1174 Some(&mem_snapshot), 1175 None, 1176 #[cfg(target_arch = "x86_64")] 1177 None, 1178 )?; 1179 1180 mm.lock() 1181 .unwrap() 1182 .fill_saved_regions(memory_file_path, 
mem_snapshot.memory_ranges)?; 1183 1184 Ok(mm) 1185 } else { 1186 Err(Error::RestoreMissingSourceUrl) 1187 } 1188 } 1189 1190 fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> { 1191 // SAFETY: FFI call with correct arguments 1192 let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) }; 1193 1194 if res < 0 { 1195 Err(io::Error::last_os_error()) 1196 } else { 1197 Ok(res as RawFd) 1198 } 1199 } 1200 1201 fn mbind( 1202 addr: *mut u8, 1203 len: u64, 1204 mode: u32, 1205 nodemask: Vec<u64>, 1206 maxnode: u64, 1207 flags: u32, 1208 ) -> Result<(), io::Error> { 1209 // SAFETY: FFI call with correct arguments 1210 let res = unsafe { 1211 libc::syscall( 1212 libc::SYS_mbind, 1213 addr as *mut libc::c_void, 1214 len, 1215 mode, 1216 nodemask.as_ptr(), 1217 maxnode, 1218 flags, 1219 ) 1220 }; 1221 1222 if res < 0 { 1223 Err(io::Error::last_os_error()) 1224 } else { 1225 Ok(()) 1226 } 1227 } 1228 1229 fn create_anonymous_file( 1230 size: usize, 1231 hugepages: bool, 1232 hugepage_size: Option<u64>, 1233 ) -> Result<FileOffset, Error> { 1234 let fd = Self::memfd_create( 1235 &ffi::CString::new("ch_ram").unwrap(), 1236 libc::MFD_CLOEXEC 1237 | if hugepages { 1238 libc::MFD_HUGETLB 1239 | if let Some(hugepage_size) = hugepage_size { 1240 /* 1241 * From the Linux kernel: 1242 * Several system calls take a flag to request "hugetlb" huge pages. 1243 * Without further specification, these system calls will use the 1244 * system's default huge page size. If a system supports multiple 1245 * huge page sizes, the desired huge page size can be specified in 1246 * bits [26:31] of the flag arguments. The value in these 6 bits 1247 * will encode the log2 of the huge page size. 1248 */ 1249 1250 hugepage_size.trailing_zeros() << 26 1251 } else { 1252 // Use the system default huge page size 1253 0 1254 } 1255 } else { 1256 0 1257 }, 1258 ) 1259 .map_err(Error::SharedFileCreate)?; 1260 1261 // SAFETY: fd is valid 1262 let f = unsafe { File::from_raw_fd(fd) }; 1263 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?; 1264 1265 Ok(FileOffset::new(f, 0)) 1266 } 1267 1268 fn open_backing_file( 1269 backing_file: &PathBuf, 1270 file_offset: u64, 1271 size: usize, 1272 ) -> Result<FileOffset, Error> { 1273 if backing_file.is_dir() { 1274 // Override file offset as it does not apply in this case. 1275 info!( 1276 "Ignoring file offset since the backing file is a \ 1277 temporary file created from the specified directory." 
        );
        let fs_str = format!("{}{}", backing_file.display(), "/tmpfile_XXXXXX");
        let fs = ffi::CString::new(fs_str).unwrap();
        let mut path = fs.as_bytes_with_nul().to_owned();
        let path_ptr = path.as_mut_ptr() as *mut _;
        // SAFETY: FFI call
        let fd = unsafe { libc::mkstemp(path_ptr) };
        if fd == -1 {
            return Err(Error::SharedFileCreate(std::io::Error::last_os_error()));
        }
        // SAFETY: FFI call
        unsafe { libc::unlink(path_ptr) };
        // SAFETY: fd is valid
        let f = unsafe { File::from_raw_fd(fd) };
        f.set_len(size as u64).map_err(Error::SharedFileSetLen)?;

        Ok(FileOffset::new(f, 0))
    } else {
        let f = OpenOptions::new()
            .read(true)
            .write(true)
            .open(backing_file)
            .map_err(Error::SharedFileCreate)?;

        Ok(FileOffset::new(f, file_offset))
    }
}

    #[allow(clippy::too_many_arguments)]
    pub fn create_ram_region(
        backing_file: &Option<PathBuf>,
        file_offset: u64,
        start_addr: GuestAddress,
        size: usize,
        prefault: bool,
        shared: bool,
        hugepages: bool,
        hugepage_size: Option<u64>,
        host_numa_node: Option<u32>,
        existing_memory_file: Option<File>,
        thp: bool,
    ) -> Result<Arc<GuestRegionMmap>, Error> {
        let mut mmap_flags = libc::MAP_NORESERVE;

        // The duplication of mmap_flags ORing here is unfortunate but it also makes
        // the complexity of the handling clear.
        let fo = if let Some(f) = existing_memory_file {
            // It must be MAP_SHARED as we wouldn't already have an FD
            mmap_flags |= libc::MAP_SHARED;
            Some(FileOffset::new(f, file_offset))
        } else if let Some(backing_file) = backing_file {
            if shared {
                mmap_flags |= libc::MAP_SHARED;
            } else {
                mmap_flags |= libc::MAP_PRIVATE;
            }
            Some(Self::open_backing_file(backing_file, file_offset, size)?)
        } else if shared || hugepages {
            // For hugepages we must also MAP_SHARED otherwise we will trigger #4805
            // because the MAP_PRIVATE will trigger CoW against the backing file with
            // the VFIO pinning
            mmap_flags |= libc::MAP_SHARED;
            Some(Self::create_anonymous_file(size, hugepages, hugepage_size)?)
        } else {
            mmap_flags |= libc::MAP_PRIVATE | libc::MAP_ANONYMOUS;
            None
        };

        if prefault {
            mmap_flags |= libc::MAP_POPULATE;
        }

        let region = GuestRegionMmap::new(
            MmapRegion::build(fo, size, libc::PROT_READ | libc::PROT_WRITE, mmap_flags)
                .map_err(Error::GuestMemoryRegion)?,
            start_addr,
        )
        .map_err(Error::GuestMemory)?;

        if region.file_offset().is_none() && thp {
            info!(
                "Anonymous mapping at 0x{:x} (size = 0x{:x})",
                region.as_ptr() as u64,
                size
            );
            // SAFETY: FFI call with correct arguments
            let ret = unsafe { libc::madvise(region.as_ptr() as _, size, libc::MADV_HUGEPAGE) };
            if ret != 0 {
                let e = io::Error::last_os_error();
                warn!("Failed to mark pages as THP eligible: {}", e);
            }
        }

        // Apply NUMA policy if needed.
        if let Some(node) = host_numa_node {
            let addr = region.deref().as_ptr();
            let len = region.deref().size() as u64;
            let mode = MPOL_BIND;
            let mut nodemask: Vec<u64> = Vec::new();
            let flags = MPOL_MF_STRICT | MPOL_MF_MOVE;

            // Linux is kind of buggy in the way it interprets maxnode as it
            // will cut off the last node. That's why we have to add 1 to what
            // we would consider as the proper maxnode value.
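            // Worked example (illustrative only): for host_numa_node == 3,
            // maxnode is 3 + 1 + 1 = 5, the nodemask vector holds a single
            // u64 word ((3 / 64) + 1 == 1) and bit 3 of that word is set,
            // i.e. nodemask[0] == 0b1000.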
1382 let maxnode = node as u64 + 1 + 1; 1383 1384 // Allocate the right size for the vector. 1385 nodemask.resize((node as usize / 64) + 1, 0); 1386 1387 // Fill the global bitmask through the nodemask vector. 1388 let idx = (node / 64) as usize; 1389 let shift = node % 64; 1390 nodemask[idx] |= 1u64 << shift; 1391 1392 // Policies are enforced by using MPOL_MF_MOVE flag as it will 1393 // force the kernel to move all pages that might have been already 1394 // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is 1395 // used to throw an error if MPOL_MF_MOVE didn't succeed. 1396 // MPOL_BIND is the selected mode as it specifies a strict policy 1397 // that restricts memory allocation to the nodes specified in the 1398 // nodemask. 1399 Self::mbind(addr, len, mode, nodemask, maxnode, flags) 1400 .map_err(Error::ApplyNumaPolicy)?; 1401 } 1402 1403 Ok(Arc::new(region)) 1404 } 1405 1406 // Update the GuestMemoryMmap with the new range 1407 fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> { 1408 let guest_memory = self 1409 .guest_memory 1410 .memory() 1411 .insert_region(region) 1412 .map_err(Error::GuestMemory)?; 1413 self.guest_memory.lock().unwrap().replace(guest_memory); 1414 1415 Ok(()) 1416 } 1417 1418 // 1419 // Calculate the start address of an area next to RAM. 1420 // 1421 // If memory hotplug is allowed, the start address needs to be aligned 1422 // (rounded-up) to 128MiB boundary. 1423 // If memory hotplug is not allowed, there is no alignment required. 1424 // And it must also start at the 64bit start. 1425 fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> { 1426 let mut start_addr = if allow_mem_hotplug { 1427 GuestAddress(mem_end.0 | ((128 << 20) - 1)) 1428 } else { 1429 mem_end 1430 }; 1431 1432 start_addr = start_addr 1433 .checked_add(1) 1434 .ok_or(Error::GuestAddressOverFlow)?; 1435 1436 if mem_end < arch::layout::MEM_32BIT_RESERVED_START { 1437 return Ok(arch::layout::RAM_64BIT_START); 1438 } 1439 1440 Ok(start_addr) 1441 } 1442 1443 pub fn add_ram_region( 1444 &mut self, 1445 start_addr: GuestAddress, 1446 size: usize, 1447 ) -> Result<Arc<GuestRegionMmap>, Error> { 1448 // Allocate memory for the region 1449 let region = MemoryManager::create_ram_region( 1450 &None, 1451 0, 1452 start_addr, 1453 size, 1454 self.prefault, 1455 self.shared, 1456 self.hugepages, 1457 self.hugepage_size, 1458 None, 1459 None, 1460 self.thp, 1461 )?; 1462 1463 // Map it into the guest 1464 let slot = self.create_userspace_mapping( 1465 region.start_addr().0, 1466 region.len(), 1467 region.as_ptr() as u64, 1468 self.mergeable, 1469 false, 1470 self.log_dirty, 1471 )?; 1472 self.guest_ram_mappings.push(GuestRamMapping { 1473 gpa: region.start_addr().raw_value(), 1474 size: region.len(), 1475 slot, 1476 zone_id: DEFAULT_MEMORY_ZONE.to_string(), 1477 virtio_mem: false, 1478 file_offset: 0, 1479 }); 1480 1481 self.add_region(Arc::clone(®ion))?; 1482 1483 Ok(region) 1484 } 1485 1486 fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> { 1487 info!("Hotplugging new RAM: {}", size); 1488 1489 // Check that there is a free slot 1490 if self.next_hotplug_slot >= HOTPLUG_COUNT { 1491 return Err(Error::NoSlotAvailable); 1492 } 1493 1494 // "Inserted" DIMM must have a size that is a multiple of 128MiB 1495 if size % (128 << 20) != 0 { 1496 return Err(Error::InvalidSize); 1497 } 1498 1499 let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?; 1500 1501 if 
start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area { 1502 return Err(Error::InsufficientHotplugRam); 1503 } 1504 1505 let region = self.add_ram_region(start_addr, size)?; 1506 1507 // Add region to the list of regions associated with the default 1508 // memory zone. 1509 if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) { 1510 memory_zone.regions.push(Arc::clone(®ion)); 1511 } 1512 1513 // Tell the allocator 1514 self.ram_allocator 1515 .allocate(Some(start_addr), size as GuestUsize, None) 1516 .ok_or(Error::MemoryRangeAllocation)?; 1517 1518 // Update the slot so that it can be queried via the I/O port 1519 let mut slot = &mut self.hotplug_slots[self.next_hotplug_slot]; 1520 slot.active = true; 1521 slot.inserting = true; 1522 slot.base = region.start_addr().0; 1523 slot.length = region.len(); 1524 1525 self.next_hotplug_slot += 1; 1526 1527 Ok(region) 1528 } 1529 1530 pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 1531 self.guest_memory.clone() 1532 } 1533 1534 pub fn boot_guest_memory(&self) -> GuestMemoryMmap { 1535 self.boot_guest_memory.clone() 1536 } 1537 1538 pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> { 1539 self.allocator.clone() 1540 } 1541 1542 pub fn start_of_device_area(&self) -> GuestAddress { 1543 self.start_of_device_area 1544 } 1545 1546 pub fn end_of_device_area(&self) -> GuestAddress { 1547 self.end_of_device_area 1548 } 1549 1550 pub fn allocate_memory_slot(&mut self) -> u32 { 1551 let slot_id = self.next_memory_slot; 1552 self.next_memory_slot += 1; 1553 slot_id 1554 } 1555 1556 pub fn create_userspace_mapping( 1557 &mut self, 1558 guest_phys_addr: u64, 1559 memory_size: u64, 1560 userspace_addr: u64, 1561 mergeable: bool, 1562 readonly: bool, 1563 log_dirty: bool, 1564 ) -> Result<u32, Error> { 1565 let slot = self.allocate_memory_slot(); 1566 let mem_region = self.vm.make_user_memory_region( 1567 slot, 1568 guest_phys_addr, 1569 memory_size, 1570 userspace_addr, 1571 readonly, 1572 log_dirty, 1573 ); 1574 1575 info!( 1576 "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}", 1577 guest_phys_addr, userspace_addr, memory_size, slot 1578 ); 1579 1580 self.vm 1581 .create_user_memory_region(mem_region) 1582 .map_err(Error::CreateUserMemoryRegion)?; 1583 1584 // Mark the pages as mergeable if explicitly asked for. 1585 if mergeable { 1586 // SAFETY: the address and size are valid since the 1587 // mmap succeeded. 1588 let ret = unsafe { 1589 libc::madvise( 1590 userspace_addr as *mut libc::c_void, 1591 memory_size as libc::size_t, 1592 libc::MADV_MERGEABLE, 1593 ) 1594 }; 1595 if ret != 0 { 1596 let err = io::Error::last_os_error(); 1597 // Safe to unwrap because the error is constructed with 1598 // last_os_error(), which ensures the output will be Some(). 
                let errno = err.raw_os_error().unwrap();
                if errno == libc::EINVAL {
                    warn!("kernel not configured with CONFIG_KSM");
                } else {
                    warn!("madvise error: {}", err);
                }
                warn!("failed to mark pages as mergeable");
            }
        }

        info!(
            "Created userspace mapping: {:x} -> {:x} {:x}",
            guest_phys_addr, userspace_addr, memory_size
        );

        Ok(slot)
    }

    pub fn remove_userspace_mapping(
        &mut self,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        mergeable: bool,
        slot: u32,
    ) -> Result<(), Error> {
        let mem_region = self.vm.make_user_memory_region(
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            false, /* readonly -- don't care */
            false, /* log dirty */
        );

        self.vm
            .remove_user_memory_region(mem_region)
            .map_err(Error::RemoveUserMemoryRegion)?;

        // Mark the pages as unmergeable if they were previously marked as
        // mergeable.
        if mergeable {
            // SAFETY: the address and size are valid as the region was
            // previously advised.
            let ret = unsafe {
                libc::madvise(
                    userspace_addr as *mut libc::c_void,
                    memory_size as libc::size_t,
                    libc::MADV_UNMERGEABLE,
                )
            };
            if ret != 0 {
                let err = io::Error::last_os_error();
                // Safe to unwrap because the error is constructed with
                // last_os_error(), which ensures the output will be Some().
                let errno = err.raw_os_error().unwrap();
                if errno == libc::EINVAL {
                    warn!("kernel not configured with CONFIG_KSM");
                } else {
                    warn!("madvise error: {}", err);
                }
                warn!("failed to mark pages as unmergeable");
            }
        }

        info!(
            "Removed userspace mapping: {:x} -> {:x} {:x}",
            guest_phys_addr, userspace_addr, memory_size
        );

        Ok(())
    }

    pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> {
        if let Some(memory_zone) = self.memory_zones.get_mut(id) {
            if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone {
                if let Some(virtio_mem_device) = virtio_mem_zone.virtio_device.as_ref() {
                    virtio_mem_device
                        .lock()
                        .unwrap()
                        .resize(size)
                        .map_err(Error::VirtioMemResizeFail)?;
                }

                // Keep the hotplugged_size up to date.
                virtio_mem_zone.hotplugged_size = size;
            } else {
                error!("Failed resizing virtio-mem region: No virtio-mem handler");
                return Err(Error::MissingVirtioMemHandler);
            }

            return Ok(());
        }

        error!("Failed resizing virtio-mem region: Unknown memory zone");
        Err(Error::UnknownMemoryZone)
    }

    /// In case this function resulted in adding a new memory region to the
    /// guest memory, the new region is returned to the caller. The virtio-mem
    /// use case never adds a new region as the whole hotpluggable memory has
    /// already been allocated at boot time.
    pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> {
        if self.user_provided_zones {
            error!(
                "Not allowed to resize guest memory when backed with user \
                defined memory zones."
1706 ); 1707 return Err(Error::InvalidResizeWithMemoryZones); 1708 } 1709 1710 let mut region: Option<Arc<GuestRegionMmap>> = None; 1711 match self.hotplug_method { 1712 HotplugMethod::VirtioMem => { 1713 if desired_ram >= self.boot_ram { 1714 if !self.dynamic { 1715 return Ok(region); 1716 } 1717 1718 self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?; 1719 self.current_ram = desired_ram; 1720 } 1721 } 1722 HotplugMethod::Acpi => { 1723 if desired_ram > self.current_ram { 1724 if !self.dynamic { 1725 return Ok(region); 1726 } 1727 1728 region = 1729 Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?); 1730 self.current_ram = desired_ram; 1731 } 1732 } 1733 } 1734 Ok(region) 1735 } 1736 1737 pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> { 1738 if !self.user_provided_zones { 1739 error!( 1740 "Not allowed to resize guest memory zone when no zone is \ 1741 defined." 1742 ); 1743 return Err(Error::ResizeZone); 1744 } 1745 1746 self.virtio_mem_resize(id, virtio_mem_size) 1747 } 1748 1749 #[cfg(target_arch = "x86_64")] 1750 pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> { 1751 let file = OpenOptions::new() 1752 .read(true) 1753 .open("/dev/sgx_provision") 1754 .map_err(Error::SgxProvisionOpen)?; 1755 self.vm 1756 .enable_sgx_attribute(file) 1757 .map_err(Error::SgxEnableProvisioning)?; 1758 1759 // Go over each EPC section and verify its size is a 4k multiple. At 1760 // the same time, calculate the total size needed for the contiguous 1761 // EPC region. 1762 let mut epc_region_size = 0; 1763 for epc_section in sgx_epc_config.iter() { 1764 if epc_section.size == 0 { 1765 return Err(Error::EpcSectionSizeInvalid); 1766 } 1767 if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 { 1768 return Err(Error::EpcSectionSizeInvalid); 1769 } 1770 1771 epc_region_size += epc_section.size; 1772 } 1773 1774 // Place the SGX EPC region on a 4k boundary between the RAM and the device area 1775 let epc_region_start = GuestAddress( 1776 ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE, 1777 ); 1778 1779 self.start_of_device_area = epc_region_start 1780 .checked_add(epc_region_size) 1781 .ok_or(Error::GuestAddressOverFlow)?; 1782 1783 let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize); 1784 info!( 1785 "SGX EPC region: 0x{:x} (0x{:x})", 1786 epc_region_start.0, epc_region_size 1787 ); 1788 1789 // Each section can be memory mapped into the allocated region. 1790 let mut epc_section_start = epc_region_start.raw_value(); 1791 for epc_section in sgx_epc_config.iter() { 1792 let file = OpenOptions::new() 1793 .read(true) 1794 .write(true) 1795 .open("/dev/sgx_vepc") 1796 .map_err(Error::SgxVirtEpcOpen)?; 1797 1798 let prot = PROT_READ | PROT_WRITE; 1799 let mut flags = MAP_NORESERVE | MAP_SHARED; 1800 if epc_section.prefault { 1801 flags |= MAP_POPULATE; 1802 } 1803 1804 // We can't use the vm-memory crate to perform the memory mapping 1805 // here as it would try to ensure the size of the backing file is 1806 // matching the size of the expected mapping. The /dev/sgx_vepc 1807 // device does not work that way, it provides a file descriptor 1808 // which is not matching the mapping size, as it's a just a way to 1809 // let KVM know that an EPC section is being created for the guest. 
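            // The raw mmap() below therefore maps `epc_section.size` bytes of the
            // /dev/sgx_vepc file descriptor at a kernel-chosen address, and that
            // host address is then registered as a hypervisor memory slot at
            // `epc_section_start` via create_userspace_mapping().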
1810 // SAFETY: FFI call with correct arguments 1811 let host_addr = unsafe { 1812 libc::mmap( 1813 std::ptr::null_mut(), 1814 epc_section.size as usize, 1815 prot, 1816 flags, 1817 file.as_raw_fd(), 1818 0, 1819 ) 1820 } as u64; 1821 1822 info!( 1823 "Adding SGX EPC section: 0x{:x} (0x{:x})", 1824 epc_section_start, epc_section.size 1825 ); 1826 1827 let _mem_slot = self.create_userspace_mapping( 1828 epc_section_start, 1829 epc_section.size, 1830 host_addr, 1831 false, 1832 false, 1833 false, 1834 )?; 1835 1836 sgx_epc_region.insert( 1837 epc_section.id.clone(), 1838 SgxEpcSection::new( 1839 GuestAddress(epc_section_start), 1840 epc_section.size as GuestUsize, 1841 ), 1842 ); 1843 1844 epc_section_start += epc_section.size; 1845 } 1846 1847 self.sgx_epc_region = Some(sgx_epc_region); 1848 1849 Ok(()) 1850 } 1851 1852 #[cfg(target_arch = "x86_64")] 1853 pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> { 1854 &self.sgx_epc_region 1855 } 1856 1857 pub fn is_hardlink(f: &File) -> bool { 1858 let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit(); 1859 // SAFETY: FFI call with correct arguments 1860 let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) }; 1861 if ret != 0 { 1862 error!("Couldn't fstat the backing file"); 1863 return false; 1864 } 1865 1866 // SAFETY: stat is valid 1867 unsafe { (*stat.as_ptr()).st_nlink as usize > 0 } 1868 } 1869 1870 pub fn memory_zones(&self) -> &MemoryZones { 1871 &self.memory_zones 1872 } 1873 1874 pub fn memory_zones_mut(&mut self) -> &mut MemoryZones { 1875 &mut self.memory_zones 1876 } 1877 1878 pub fn memory_range_table( 1879 &self, 1880 snapshot: bool, 1881 ) -> std::result::Result<MemoryRangeTable, MigratableError> { 1882 let mut table = MemoryRangeTable::default(); 1883 1884 for memory_zone in self.memory_zones.values() { 1885 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 1886 table.extend(virtio_mem_zone.plugged_ranges()); 1887 } 1888 1889 for region in memory_zone.regions() { 1890 if snapshot { 1891 if let Some(file_offset) = region.file_offset() { 1892 if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED) 1893 && Self::is_hardlink(file_offset.file()) 1894 { 1895 // In this very specific case, we know the memory 1896 // region is backed by a file on the host filesystem 1897 // that can be accessed by the user, and additionally 1898 // the mapping is shared, which means that modifications 1899 // to the content are written to the actual file. 1900 // When meeting these conditions, we can skip the 1901 // copy of the memory content for this specific region, 1902 // as we can assume the user will have it saved through 1903 // the backing file already. 
1904 continue; 1905 } 1906 } 1907 } 1908 1909 table.push(MemoryRange { 1910 gpa: region.start_addr().raw_value(), 1911 length: region.len(), 1912 }); 1913 } 1914 } 1915 1916 Ok(table) 1917 } 1918 1919 pub fn snapshot_data(&self) -> MemoryManagerSnapshotData { 1920 MemoryManagerSnapshotData { 1921 memory_ranges: self.snapshot_memory_ranges.clone(), 1922 guest_ram_mappings: self.guest_ram_mappings.clone(), 1923 start_of_device_area: self.start_of_device_area.0, 1924 boot_ram: self.boot_ram, 1925 current_ram: self.current_ram, 1926 arch_mem_regions: self.arch_mem_regions.clone(), 1927 hotplug_slots: self.hotplug_slots.clone(), 1928 next_memory_slot: self.next_memory_slot, 1929 selected_slot: self.selected_slot, 1930 next_hotplug_slot: self.next_hotplug_slot, 1931 } 1932 } 1933 1934 pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> { 1935 let mut memory_slot_fds = HashMap::new(); 1936 for guest_ram_mapping in &self.guest_ram_mappings { 1937 let slot = guest_ram_mapping.slot; 1938 let guest_memory = self.guest_memory.memory(); 1939 let file = guest_memory 1940 .find_region(GuestAddress(guest_ram_mapping.gpa)) 1941 .unwrap() 1942 .file_offset() 1943 .unwrap() 1944 .file(); 1945 memory_slot_fds.insert(slot, file.as_raw_fd()); 1946 } 1947 memory_slot_fds 1948 } 1949 1950 pub fn acpi_address(&self) -> Option<GuestAddress> { 1951 self.acpi_address 1952 } 1953 1954 pub fn num_guest_ram_mappings(&self) -> u32 { 1955 self.guest_ram_mappings.len() as u32 1956 } 1957 1958 #[cfg(target_arch = "aarch64")] 1959 pub fn uefi_flash(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 1960 self.uefi_flash.as_ref().unwrap().clone() 1961 } 1962 1963 #[cfg(feature = "guest_debug")] 1964 pub fn coredump_memory_regions(&self, mem_offset: u64) -> CoredumpMemoryRegions { 1965 let mut mapping_sorted_by_gpa = self.guest_ram_mappings.clone(); 1966 mapping_sorted_by_gpa.sort_by_key(|m| m.gpa); 1967 1968 let mut mem_offset_in_elf = mem_offset; 1969 let mut ram_maps = BTreeMap::new(); 1970 for mapping in mapping_sorted_by_gpa.iter() { 1971 ram_maps.insert( 1972 mapping.gpa, 1973 CoredumpMemoryRegion { 1974 mem_offset_in_elf, 1975 mem_size: mapping.size, 1976 }, 1977 ); 1978 mem_offset_in_elf += mapping.size; 1979 } 1980 1981 CoredumpMemoryRegions { ram_maps } 1982 } 1983 1984 #[cfg(feature = "guest_debug")] 1985 pub fn coredump_iterate_save_mem( 1986 &mut self, 1987 dump_state: &DumpState, 1988 ) -> std::result::Result<(), GuestDebuggableError> { 1989 let snapshot_memory_ranges = self 1990 .memory_range_table(false) 1991 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 1992 1993 if snapshot_memory_ranges.is_empty() { 1994 return Ok(()); 1995 } 1996 1997 let mut coredump_file = dump_state.file.as_ref().unwrap(); 1998 1999 let guest_memory = self.guest_memory.memory(); 2000 let mut total_bytes: u64 = 0; 2001 2002 for range in snapshot_memory_ranges.regions() { 2003 let mut offset: u64 = 0; 2004 loop { 2005 let bytes_written = guest_memory 2006 .write_to( 2007 GuestAddress(range.gpa + offset), 2008 &mut coredump_file, 2009 (range.length - offset) as usize, 2010 ) 2011 .map_err(|e| GuestDebuggableError::Coredump(e.into()))?; 2012 offset += bytes_written as u64; 2013 total_bytes += bytes_written as u64; 2014 2015 if offset == range.length { 2016 break; 2017 } 2018 } 2019 } 2020 2021 debug!("coredump total bytes {}", total_bytes); 2022 Ok(()) 2023 } 2024 2025 pub fn receive_memory_regions<F>( 2026 &mut self, 2027 ranges: &MemoryRangeTable, 2028 fd: &mut F, 2029 ) -> std::result::Result<(), MigratableError> 2030 where 2031 F: 
    pub fn receive_memory_regions<F>(
        &mut self,
        ranges: &MemoryRangeTable,
        fd: &mut F,
    ) -> std::result::Result<(), MigratableError>
    where
        F: Read,
    {
        let guest_memory = self.guest_memory();
        let mem = guest_memory.memory();

        for range in ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't read
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of read_exact_from() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_read = mem
                    .read_from(
                        GuestAddress(range.gpa + offset),
                        fd,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| {
                        MigratableError::MigrateReceive(anyhow!(
                            "Error receiving memory from socket: {}",
                            e
                        ))
                    })?;
                offset += bytes_read as u64;

                if offset == range.length {
                    break;
                }
            }
        }

        Ok(())
    }
}

struct MemoryNotify {
    slot_id: usize,
}

impl Aml for MemoryNotify {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        let object = aml::Path::new(&format!("M{:03}", self.slot_id));
        aml::If::new(
            &aml::Equal::new(&aml::Arg(0), &self.slot_id),
            vec![&aml::Notify::new(&object, &aml::Arg(1))],
        )
        .append_aml_bytes(bytes)
    }
}

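// Each hotplug slot is exposed to the guest as a PNP0C80 "memory device" ACPI
// object named M000, M001, ... Its _STA and _CRS methods defer to the MSTA and
// MCRS helpers defined in MemoryMethods below, passing the slot id as argument.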
struct MemorySlot {
    slot_id: usize,
}

impl Aml for MemorySlot {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        aml::Device::new(
            format!("M{:03}", self.slot_id).as_str().into(),
            vec![
                &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0C80")),
                &aml::Name::new("_UID".into(), &self.slot_id),
                /*
                _STA return value:
                Bit [0] – Set if the device is present.
                Bit [1] – Set if the device is enabled and decoding its resources.
                Bit [2] – Set if the device should be shown in the UI.
                Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics).
                Bit [4] – Set if the battery is present.
                Bits [31:5] – Reserved (must be cleared).
                */
                &aml::Method::new(
                    "_STA".into(),
                    0,
                    false,
                    // Call into MSTA method which will interrogate device
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MSTA".into(),
                        vec![&self.slot_id],
                    ))],
                ),
                // Get details of memory
                &aml::Method::new(
                    "_CRS".into(),
                    0,
                    false,
                    // Call into MCRS which provides actual memory details
                    vec![&aml::Return::new(&aml::MethodCall::new(
                        "MCRS".into(),
                        vec![&self.slot_id],
                    ))],
                ),
            ],
        )
        .append_aml_bytes(bytes)
    }
}

struct MemorySlots {
    slots: usize,
}

impl Aml for MemorySlots {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        for slot_id in 0..self.slots {
            MemorySlot { slot_id }.append_aml_bytes(bytes);
        }
    }
}

struct MemoryMethods {
    slots: usize,
}

impl Aml for MemoryMethods {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        // Add "MTFY" notification method
        let mut memory_notifies = Vec::new();
        for slot_id in 0..self.slots {
            memory_notifies.push(MemoryNotify { slot_id });
        }

        let mut memory_notifies_refs: Vec<&dyn aml::Aml> = Vec::new();
        for memory_notifier in memory_notifies.iter() {
            memory_notifies_refs.push(memory_notifier);
        }

        aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).append_aml_bytes(bytes);

        // MSCN method
        aml::Method::new(
            "MSCN".into(),
            0,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                &aml::While::new(
                    &aml::LessThan::new(&aml::Local(0), &self.slots),
                    vec![
                        // Write slot number (in first argument) to I/O port via field
                        &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)),
                        // Check if MINS bit is set (inserting)
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            // Notify device if it is
                            vec![
                                &aml::MethodCall::new(
                                    "MTFY".into(),
                                    vec![&aml::Local(0), &aml::ONE],
                                ),
                                // Reset MINS bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE),
                            ],
                        ),
                        // Check if MRMV bit is set
                        &aml::If::new(
                            &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            // Notify device if it is (with the eject constant 0x3)
                            vec![
                                &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]),
                                // Reset MRMV bit
                                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE),
                            ],
                        ),
                        &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE),
                    ],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
            ],
        )
        .append_aml_bytes(bytes);

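        // Note: MSCN above is run on a hotplug event (e.g. via the GED device).
        // For every slot it selects the slot through MSEL, checks the MINS/MRMV
        // status bits and issues a Notify so the OSPM re-evaluates the matching
        // M*** device, whose _STA/_CRS land in the MSTA and MCRS methods below.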
        // Memory status method
        aml::Method::new(
            "MSTA".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Store::new(&aml::Local(0), &aml::ZERO),
                // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning)
                &aml::If::new(
                    &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE),
                    vec![&aml::Store::new(&aml::Local(0), &0xfu8)],
                ),
                // Release lock
                &aml::Release::new("MLCK".into()),
                // Return 0 or 0xf
                &aml::Return::new(&aml::Local(0)),
            ],
        )
        .append_aml_bytes(bytes);

        // Memory range method
        aml::Method::new(
            "MCRS".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to I/O port via field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Name::new(
                    "MR64".into(),
                    &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                        aml::AddressSpaceCachable::Cacheable,
                        true,
                        0x0000_0000_0000_0000u64,
                        0xFFFF_FFFF_FFFF_FFFEu64,
                    )]),
                ),
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &14usize, "MINL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &18usize, "MINH".into()),
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &22usize, "MAXL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &26usize, "MAXH".into()),
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &38usize, "LENL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &42usize, "LENH".into()),
                &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
                &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
                &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
                &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
                &aml::Add::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MINL"),
                    &aml::Path::new("LENL"),
                ),
                &aml::Add::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MINH"),
                    &aml::Path::new("LENH"),
                ),
                &aml::If::new(
                    &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
                    vec![&aml::Add::new(
                        &aml::Path::new("MAXH"),
                        &aml::ONE,
                        &aml::Path::new("MAXH"),
                    )],
                ),
                &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
                // Release lock
                &aml::Release::new("MLCK".into()),
                &aml::Return::new(&aml::Path::new("MR64")),
            ],
        )
        .append_aml_bytes(bytes)
    }
}

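// The MemoryManager ACPI device (\_SB_.MHPC): when an ACPI address has been
// allocated, the hotplug register block backed by this MemoryManager (see
// MEMORY_MANAGER_ACPI_SIZE) is exposed through an OpRegion plus named fields,
// together with the methods and per-slot devices defined above. Without an
// ACPI address only a stub MSCN method is emitted.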
impl Aml for MemoryManager {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        if let Some(acpi_address) = self.acpi_address {
            // Memory Hotplug Controller
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose slot and then read back status
                    &aml::Mutex::new("MLCK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCachable::NotCacheable,
                            true,
                            acpi_address.0,
                            acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "MHPR".into(),
                        aml::OpRegionSpace::SystemMemory,
                        acpi_address.0 as usize,
                        MEMORY_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
                            aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Reserved(128),
                            aml::FieldEntry::Named(*b"MHPX", 32), // PXM
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(160),
                            aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
                            aml::FieldEntry::Named(*b"MINS", 1), // Inserting
                            aml::FieldEntry::Named(*b"MRMV", 1), // Removing
                            aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MSEL", 32), // Selector
                            aml::FieldEntry::Named(*b"MOEV", 32), // Event
                            aml::FieldEntry::Named(*b"MOSC", 32), // OSC
                        ],
                    ),
                    &MemoryMethods {
                        slots: self.hotplug_slots.len(),
                    },
                    &MemorySlots {
                        slots: self.hotplug_slots.len(),
                    },
                ],
            )
            .append_aml_bytes(bytes);
        } else {
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Empty MSCN for GED
                    &aml::Method::new("MSCN".into(), 0, true, vec![]),
                ],
            )
            .append_aml_bytes(bytes);
        }

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_region) = &self.sgx_epc_region {
                let min = sgx_epc_region.start().raw_value();
                let max = min + sgx_epc_region.size() - 1;
                // SGX EPC region
                aml::Device::new(
                    "_SB_.EPC_".into(),
                    vec![
                        &aml::Name::new("_HID".into(), &aml::EisaName::new("INT0E0C")),
                        // QWORD describing the EPC region start and size
                        &aml::Name::new(
                            "_CRS".into(),
                            &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                                aml::AddressSpaceCachable::NotCacheable,
                                true,
                                min,
                                max,
                            )]),
                        ),
                        &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
                    ],
                )
                .append_aml_bytes(bytes);
            }
        }
    }
}

impl Pausable for MemoryManager {}

#[derive(Clone, Serialize, Deserialize, Versionize)]
pub struct MemoryManagerSnapshotData {
    memory_ranges: MemoryRangeTable,
    guest_ram_mappings: Vec<GuestRamMapping>,
    start_of_device_area: u64,
    boot_ram: u64,
    current_ram: u64,
    arch_mem_regions: Vec<ArchMemRegion>,
    hotplug_slots: Vec<HotPlugState>,
    next_memory_slot: u32,
    selected_slot: usize,
    next_hotplug_slot: usize,
}

impl VersionMapped for MemoryManagerSnapshotData {}

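// Snapshot/restore support: snapshot() only records the memory ranges and the
// manager's metadata; the actual RAM content is written out later by
// Transportable::send() into the "memory-ranges" file under the destination URL.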
impl Snapshottable for MemoryManager {
    fn id(&self) -> String {
        MEMORY_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
        let mut memory_manager_snapshot = Snapshot::new(MEMORY_MANAGER_SNAPSHOT_ID);

        let memory_ranges = self.memory_range_table(true)?;

        // Store this list of ranges locally as it will be used through the
        // Transportable::send() implementation. The point is to avoid
        // duplicating the work of building the path for each region.
        // The 'snapshot' step creates the list of memory regions, including
        // the information about whether a region needs to be copied or not.
        // This saves the 'send' step from going through the same process,
        // so it can directly proceed with storing the memory range content
        // for the ranges that require it.
        self.snapshot_memory_ranges = memory_ranges;

        memory_manager_snapshot.add_data_section(SnapshotDataSection::new_from_versioned_state(
            MEMORY_MANAGER_SNAPSHOT_ID,
            &self.snapshot_data(),
        )?);

        Ok(memory_manager_snapshot)
    }
}

impl Transportable for MemoryManager {
    fn send(
        &self,
        _snapshot: &Snapshot,
        destination_url: &str,
    ) -> result::Result<(), MigratableError> {
        if self.snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let mut memory_file_path = url_to_path(destination_url)?;
        memory_file_path.push(String::from(SNAPSHOT_FILENAME));

        // Create the snapshot file for the entire memory
        let mut memory_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(memory_file_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let guest_memory = self.guest_memory.memory();

        for range in self.snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            // Here we are manually handling the retry in case we can't write
            // the whole region at once because we can't use the implementation
            // from vm-memory::GuestMemory of write_all_to() as it is not
            // following the correct behavior. For more info about this issue
            // see: https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = guest_memory
                    .write_to(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }
        Ok(())
    }
}

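// Live migration support: the dirty-log methods below are driven by the
// migration loop. A simplified, hypothetical iteration on the source side
// looks roughly like this, assuming `mm` is an Arc<Mutex<MemoryManager>>:
//
//     mm.lock().unwrap().start_dirty_log()?;
//     // ... bulk copy of guest RAM ...
//     let table = mm.lock().unwrap().dirty_log()?;
//     // ... send the ranges listed in `table`, then repeat or stop ...
//     mm.lock().unwrap().stop_dirty_log()?;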
impl Migratable for MemoryManager {
    // Start the dirty log in the hypervisor (kvm/mshv).
    // Also, reset the dirty bitmap logged by the vmm.
    // Just before we do a bulk copy we want to start/clear the dirty log so that
    // pages touched during our bulk copy are tracked.
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.start_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
        })?;

        for r in self.guest_memory.memory().iter() {
            r.bitmap().reset();
        }

        Ok(())
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.stop_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
        })?;

        Ok(())
    }

    // Generate a table for the pages that are dirty. The dirty pages are collapsed
    // together in the table if they are contiguous.
    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();
        for r in &self.guest_ram_mappings {
            let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
                MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
            })?;
            let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
            {
                Some(region) => {
                    assert!(region.start_addr().raw_value() == r.gpa);
                    assert!(region.len() == r.size);
                    region.bitmap().get_and_reset()
                }
                None => {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Error finding 'guest memory region' with address {:x}",
                        r.gpa
                    )))
                }
            };

            let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
                .iter()
                .zip(vmm_dirty_bitmap.iter())
                .map(|(x, y)| x | y)
                .collect();

            let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);

            if sub_table.regions().is_empty() {
                info!("Dirty Memory Range Table is empty");
            } else {
                info!("Dirty Memory Range Table:");
                for range in sub_table.regions() {
                    info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
                }
            }

            table.extend(sub_table);
        }
        Ok(table)
    }
}