// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0
//
#[cfg(target_arch = "x86_64")]
use crate::config::SgxEpcConfig;
use crate::config::{HotplugMethod, MemoryConfig, MemoryZoneConfig};
use crate::migration::url_to_path;
use crate::MEMORY_MANAGER_SNAPSHOT_ID;
use crate::{GuestMemoryMmap, GuestRegionMmap};
use acpi_tables::{aml, aml::Aml};
use anyhow::anyhow;
#[cfg(target_arch = "x86_64")]
use arch::x86_64::{SgxEpcRegion, SgxEpcSection};
use arch::{layout, RegionType};
#[cfg(target_arch = "x86_64")]
use devices::ioapic;
#[cfg(target_arch = "x86_64")]
use libc::{MAP_NORESERVE, MAP_POPULATE, MAP_SHARED, PROT_READ, PROT_WRITE};
use std::collections::HashMap;
use std::convert::TryInto;
use std::ffi;
use std::fs::{File, OpenOptions};
use std::io;
use std::ops::Deref;
use std::os::unix::io::{AsRawFd, FromRawFd, RawFd};
use std::path::PathBuf;
use std::result;
use std::sync::{Arc, Barrier, Mutex};
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use virtio_devices::BlocksState;
#[cfg(target_arch = "x86_64")]
use vm_allocator::GsiApic;
use vm_allocator::{AddressAllocator, SystemAllocator};
use vm_device::BusDevice;
use vm_memory::bitmap::AtomicBitmap;
use vm_memory::guest_memory::FileOffset;
use vm_memory::{
    mmap::MmapRegionError, Address, Bytes, Error as MmapError, GuestAddress, GuestAddressSpace,
    GuestMemory, GuestMemoryAtomic, GuestMemoryError, GuestMemoryRegion, GuestUsize, MmapRegion,
};
use vm_migration::{
    protocol::MemoryRange, protocol::MemoryRangeTable, Migratable, MigratableError, Pausable,
    Snapshot, SnapshotDataSection, Snapshottable, Transportable, VersionMapped,
};

pub const MEMORY_MANAGER_ACPI_SIZE: usize = 0x18;

const DEFAULT_MEMORY_ZONE: &str = "mem0";

const SNAPSHOT_FILENAME: &str = "memory-ranges";

#[cfg(target_arch = "x86_64")]
const X86_64_IRQ_BASE: u32 = 5;

#[cfg(target_arch = "x86_64")]
const SGX_PAGE_SIZE: u64 = 1 << 12;

const HOTPLUG_COUNT: usize = 8;

// Memory policy constants
const MPOL_BIND: u32 = 2;
const MPOL_MF_STRICT: u32 = 1;
const MPOL_MF_MOVE: u32 = 1 << 1;

// Reserve 1 MiB for platform MMIO devices (e.g.
ACPI control devices) 68 const PLATFORM_DEVICE_AREA_SIZE: u64 = 1 << 20; 69 70 #[derive(Clone, Default, Serialize, Deserialize, Versionize)] 71 struct HotPlugState { 72 base: u64, 73 length: u64, 74 active: bool, 75 inserting: bool, 76 removing: bool, 77 } 78 79 pub struct VirtioMemZone { 80 region: Arc<GuestRegionMmap>, 81 resize_handler: virtio_devices::Resize, 82 hotplugged_size: u64, 83 hugepages: bool, 84 blocks_state: Arc<Mutex<BlocksState>>, 85 } 86 87 impl VirtioMemZone { 88 pub fn region(&self) -> &Arc<GuestRegionMmap> { 89 &self.region 90 } 91 pub fn resize_handler(&self) -> &virtio_devices::Resize { 92 &self.resize_handler 93 } 94 pub fn hotplugged_size(&self) -> u64 { 95 self.hotplugged_size 96 } 97 pub fn hugepages(&self) -> bool { 98 self.hugepages 99 } 100 pub fn blocks_state(&self) -> &Arc<Mutex<BlocksState>> { 101 &self.blocks_state 102 } 103 pub fn plugged_ranges(&self) -> MemoryRangeTable { 104 self.blocks_state 105 .lock() 106 .unwrap() 107 .memory_ranges(self.region.start_addr().raw_value(), true) 108 } 109 } 110 111 #[derive(Default)] 112 pub struct MemoryZone { 113 regions: Vec<Arc<GuestRegionMmap>>, 114 virtio_mem_zone: Option<VirtioMemZone>, 115 } 116 117 impl MemoryZone { 118 pub fn regions(&self) -> &Vec<Arc<GuestRegionMmap>> { 119 &self.regions 120 } 121 pub fn virtio_mem_zone(&self) -> &Option<VirtioMemZone> { 122 &self.virtio_mem_zone 123 } 124 } 125 126 pub type MemoryZones = HashMap<String, MemoryZone>; 127 128 #[derive(Clone, Serialize, Deserialize, Versionize)] 129 struct GuestRamMapping { 130 slot: u32, 131 gpa: u64, 132 size: u64, 133 zone_id: String, 134 virtio_mem: bool, 135 file_offset: u64, 136 } 137 138 #[derive(Clone, Serialize, Deserialize, Versionize)] 139 struct ArchMemRegion { 140 base: u64, 141 size: usize, 142 r_type: RegionType, 143 } 144 145 pub struct MemoryManager { 146 boot_guest_memory: GuestMemoryMmap, 147 guest_memory: GuestMemoryAtomic<GuestMemoryMmap>, 148 next_memory_slot: u32, 149 start_of_device_area: GuestAddress, 150 end_of_device_area: GuestAddress, 151 end_of_ram_area: GuestAddress, 152 pub vm: Arc<dyn hypervisor::Vm>, 153 hotplug_slots: Vec<HotPlugState>, 154 selected_slot: usize, 155 mergeable: bool, 156 allocator: Arc<Mutex<SystemAllocator>>, 157 hotplug_method: HotplugMethod, 158 boot_ram: u64, 159 current_ram: u64, 160 next_hotplug_slot: usize, 161 shared: bool, 162 hugepages: bool, 163 hugepage_size: Option<u64>, 164 prefault: bool, 165 #[cfg(target_arch = "x86_64")] 166 sgx_epc_region: Option<SgxEpcRegion>, 167 user_provided_zones: bool, 168 snapshot_memory_ranges: MemoryRangeTable, 169 memory_zones: MemoryZones, 170 log_dirty: bool, // Enable dirty logging for created RAM regions 171 arch_mem_regions: Vec<ArchMemRegion>, 172 ram_allocator: AddressAllocator, 173 dynamic: bool, 174 175 // Keep track of calls to create_userspace_mapping() for guest RAM. 176 // This is useful for getting the dirty pages as we need to know the 177 // slots that the mapping is created in. 178 guest_ram_mappings: Vec<GuestRamMapping>, 179 180 pub acpi_address: Option<GuestAddress>, 181 } 182 183 #[derive(Debug)] 184 pub enum Error { 185 /// Failed to create shared file. 186 SharedFileCreate(io::Error), 187 188 /// Failed to set shared file length. 189 SharedFileSetLen(io::Error), 190 191 /// Mmap backed guest memory error 192 GuestMemory(MmapError), 193 194 /// Failed to allocate a memory range. 
195 MemoryRangeAllocation, 196 197 /// Error from region creation 198 GuestMemoryRegion(MmapRegionError), 199 200 /// No ACPI slot available 201 NoSlotAvailable, 202 203 /// Not enough space in the hotplug RAM region 204 InsufficientHotplugRam, 205 206 /// The requested hotplug memory addition is not a valid size 207 InvalidSize, 208 209 /// Failed to create the user memory region. 210 CreateUserMemoryRegion(hypervisor::HypervisorVmError), 211 212 /// Failed to remove the user memory region. 213 RemoveUserMemoryRegion(hypervisor::HypervisorVmError), 214 215 /// Failed to EventFd. 216 EventFdFail(io::Error), 217 218 /// Eventfd write error 219 EventfdError(io::Error), 220 221 /// Failed to virtio-mem resize 222 VirtioMemResizeFail(virtio_devices::mem::Error), 223 224 /// Cannot restore VM 225 Restore(MigratableError), 226 227 /// Cannot restore VM because source URL is missing 228 RestoreMissingSourceUrl, 229 230 /// Cannot create the system allocator 231 CreateSystemAllocator, 232 233 /// Invalid SGX EPC section size 234 #[cfg(target_arch = "x86_64")] 235 EpcSectionSizeInvalid, 236 237 /// Failed allocating SGX EPC region 238 #[cfg(target_arch = "x86_64")] 239 SgxEpcRangeAllocation, 240 241 /// Failed opening SGX virtual EPC device 242 #[cfg(target_arch = "x86_64")] 243 SgxVirtEpcOpen(io::Error), 244 245 /// Failed setting the SGX virtual EPC section size 246 #[cfg(target_arch = "x86_64")] 247 SgxVirtEpcFileSetLen(io::Error), 248 249 /// Failed opening SGX provisioning device 250 #[cfg(target_arch = "x86_64")] 251 SgxProvisionOpen(io::Error), 252 253 /// Failed enabling SGX provisioning 254 #[cfg(target_arch = "x86_64")] 255 SgxEnableProvisioning(hypervisor::HypervisorVmError), 256 257 /// Failed creating a new MmapRegion instance. 258 #[cfg(target_arch = "x86_64")] 259 NewMmapRegion(vm_memory::mmap::MmapRegionError), 260 261 /// No memory zones found. 262 MissingMemoryZones, 263 264 /// Memory configuration is not valid. 265 InvalidMemoryParameters, 266 267 /// Forbidden operation. Impossible to resize guest memory if it is 268 /// backed by user defined memory regions. 269 InvalidResizeWithMemoryZones, 270 271 /// It's invalid to try applying a NUMA policy to a memory zone that is 272 /// memory mapped with MAP_SHARED. 273 InvalidSharedMemoryZoneWithHostNuma, 274 275 /// Failed applying NUMA memory policy. 276 ApplyNumaPolicy(io::Error), 277 278 /// Memory zone identifier is not unique. 279 DuplicateZoneId, 280 281 /// No virtio-mem resizing handler found. 282 MissingVirtioMemHandler, 283 284 /// Unknown memory zone. 285 UnknownMemoryZone, 286 287 /// Invalid size for resizing. Can be anything except 0. 288 InvalidHotplugSize, 289 290 /// Invalid hotplug method associated with memory zones resizing capability. 291 InvalidHotplugMethodWithMemoryZones, 292 293 /// Could not find specified memory zone identifier from hash map. 294 MissingZoneIdentifier, 295 296 /// Resizing the memory zone failed. 
297 ResizeZone, 298 299 /// Guest address overflow 300 GuestAddressOverFlow, 301 302 /// Error opening snapshot file 303 SnapshotOpen(io::Error), 304 305 // Error copying snapshot into region 306 SnapshotCopy(GuestMemoryError), 307 308 /// Failed to allocate MMIO address 309 AllocateMmioAddress, 310 } 311 312 const ENABLE_FLAG: usize = 0; 313 const INSERTING_FLAG: usize = 1; 314 const REMOVING_FLAG: usize = 2; 315 const EJECT_FLAG: usize = 3; 316 317 const BASE_OFFSET_LOW: u64 = 0; 318 const BASE_OFFSET_HIGH: u64 = 0x4; 319 const LENGTH_OFFSET_LOW: u64 = 0x8; 320 const LENGTH_OFFSET_HIGH: u64 = 0xC; 321 const STATUS_OFFSET: u64 = 0x14; 322 const SELECTION_OFFSET: u64 = 0; 323 324 // The MMIO address space size is subtracted with 64k. This is done for the 325 // following reasons: 326 // - Reduce the addressable space size by at least 4k to workaround a Linux 327 // bug when the VMM allocates devices at the end of the addressable space 328 // - Windows requires the addressable space size to be 64k aligned 329 fn mmio_address_space_size(phys_bits: u8) -> u64 { 330 (1 << phys_bits) - (1 << 16) 331 } 332 333 impl BusDevice for MemoryManager { 334 fn read(&mut self, _base: u64, offset: u64, data: &mut [u8]) { 335 if self.selected_slot < self.hotplug_slots.len() { 336 let state = &self.hotplug_slots[self.selected_slot]; 337 match offset { 338 BASE_OFFSET_LOW => { 339 data.copy_from_slice(&state.base.to_le_bytes()[..4]); 340 } 341 BASE_OFFSET_HIGH => { 342 data.copy_from_slice(&state.base.to_le_bytes()[4..]); 343 } 344 LENGTH_OFFSET_LOW => { 345 data.copy_from_slice(&state.length.to_le_bytes()[..4]); 346 } 347 LENGTH_OFFSET_HIGH => { 348 data.copy_from_slice(&state.length.to_le_bytes()[4..]); 349 } 350 STATUS_OFFSET => { 351 // The Linux kernel, quite reasonably, doesn't zero the memory it gives us. 352 data.fill(0); 353 if state.active { 354 data[0] |= 1 << ENABLE_FLAG; 355 } 356 if state.inserting { 357 data[0] |= 1 << INSERTING_FLAG; 358 } 359 if state.removing { 360 data[0] |= 1 << REMOVING_FLAG; 361 } 362 } 363 _ => { 364 warn!( 365 "Unexpected offset for accessing memory manager device: {:#}", 366 offset 367 ); 368 } 369 } 370 } else { 371 warn!("Out of range memory slot: {}", self.selected_slot); 372 } 373 } 374 375 fn write(&mut self, _base: u64, offset: u64, data: &[u8]) -> Option<Arc<Barrier>> { 376 match offset { 377 SELECTION_OFFSET => { 378 self.selected_slot = usize::from(data[0]); 379 } 380 STATUS_OFFSET => { 381 if self.selected_slot < self.hotplug_slots.len() { 382 let state = &mut self.hotplug_slots[self.selected_slot]; 383 // The ACPI code writes back a 1 to acknowledge the insertion 384 if (data[0] & (1 << INSERTING_FLAG) == 1 << INSERTING_FLAG) && state.inserting { 385 state.inserting = false; 386 } 387 // Ditto for removal 388 if (data[0] & (1 << REMOVING_FLAG) == 1 << REMOVING_FLAG) && state.removing { 389 state.removing = false; 390 } 391 // Trigger removal of "DIMM" 392 if data[0] & (1 << EJECT_FLAG) == 1 << EJECT_FLAG { 393 warn!("Ejection of memory not currently supported"); 394 } 395 } else { 396 warn!("Out of range memory slot: {}", self.selected_slot); 397 } 398 } 399 _ => { 400 warn!( 401 "Unexpected offset for accessing memory manager device: {:#}", 402 offset 403 ); 404 } 405 }; 406 None 407 } 408 } 409 410 impl MemoryManager { 411 /// Creates all memory regions based on the available RAM ranges defined 412 /// by `ram_regions`, and based on the description of the memory zones. 
413 /// In practice, this function can perform multiple memory mappings of the 414 /// same backing file if there's a hole in the address space between two 415 /// RAM ranges. 416 /// One example might be ram_regions containing 2 regions (0-3G and 4G-6G) 417 /// and zones containing two zones (size 1G and size 4G). 418 /// This function will create 3 resulting memory regions: 419 /// - First one mapping entirely the first memory zone on 0-1G range 420 /// - Second one mapping partially the second memory zone on 1G-3G range 421 /// - Third one mapping partially the second memory zone on 4G-6G range 422 fn create_memory_regions_from_zones( 423 ram_regions: &[(GuestAddress, usize)], 424 zones: &[MemoryZoneConfig], 425 prefault: Option<bool>, 426 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 427 let mut zones = zones.to_owned(); 428 let mut mem_regions = Vec::new(); 429 let mut zone = zones.remove(0); 430 let mut zone_offset = 0; 431 let mut memory_zones = HashMap::new(); 432 433 // Add zone id to the list of memory zones. 434 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 435 436 for ram_region in ram_regions.iter() { 437 let mut ram_region_offset = 0; 438 let mut exit = false; 439 440 loop { 441 let mut ram_region_consumed = false; 442 let mut pull_next_zone = false; 443 444 let ram_region_sub_size = ram_region.1 - ram_region_offset; 445 let zone_sub_size = zone.size as usize - zone_offset; 446 447 let file_offset = zone_offset as u64; 448 let region_start = ram_region 449 .0 450 .checked_add(ram_region_offset as u64) 451 .ok_or(Error::GuestAddressOverFlow)?; 452 let region_size = if zone_sub_size <= ram_region_sub_size { 453 if zone_sub_size == ram_region_sub_size { 454 ram_region_consumed = true; 455 } 456 457 ram_region_offset += zone_sub_size; 458 pull_next_zone = true; 459 460 zone_sub_size 461 } else { 462 zone_offset += ram_region_sub_size; 463 ram_region_consumed = true; 464 465 ram_region_sub_size 466 }; 467 468 let region = MemoryManager::create_ram_region( 469 &zone.file, 470 file_offset, 471 region_start, 472 region_size, 473 match prefault { 474 Some(pf) => pf, 475 None => zone.prefault, 476 }, 477 zone.shared, 478 zone.hugepages, 479 zone.hugepage_size, 480 zone.host_numa_node, 481 None, 482 )?; 483 484 // Add region to the list of regions associated with the 485 // current memory zone. 486 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 487 memory_zone.regions.push(region.clone()); 488 } 489 490 mem_regions.push(region); 491 492 if pull_next_zone { 493 // Get the next zone and reset the offset. 494 zone_offset = 0; 495 if zones.is_empty() { 496 exit = true; 497 break; 498 } 499 zone = zones.remove(0); 500 501 // Check if zone id already exist. In case it does, throw 502 // an error as we need unique identifiers. Otherwise, add 503 // the new zone id to the list of memory zones. 504 if memory_zones.contains_key(&zone.id) { 505 error!( 506 "Memory zone identifier '{}' found more than once. \ 507 It must be unique", 508 zone.id, 509 ); 510 return Err(Error::DuplicateZoneId); 511 } 512 memory_zones.insert(zone.id.clone(), MemoryZone::default()); 513 } 514 515 if ram_region_consumed { 516 break; 517 } 518 } 519 520 if exit { 521 break; 522 } 523 } 524 525 Ok((mem_regions, memory_zones)) 526 } 527 528 // Restore both GuestMemory regions along with MemoryZone zones. 
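// Each GuestRamMapping saved in the snapshot records the zone id, GPA, size,
// memory slot and backing-file offset of the original mapping, so every region
// (including virtio-mem regions) can be re-created exactly where it was.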
529 fn restore_memory_regions_and_zones( 530 guest_ram_mappings: &[GuestRamMapping], 531 zones_config: &[MemoryZoneConfig], 532 prefault: Option<bool>, 533 mut existing_memory_files: HashMap<u32, File>, 534 ) -> Result<(Vec<Arc<GuestRegionMmap>>, MemoryZones), Error> { 535 let mut memory_regions = Vec::new(); 536 let mut memory_zones = HashMap::new(); 537 538 for zone_config in zones_config { 539 memory_zones.insert(zone_config.id.clone(), MemoryZone::default()); 540 } 541 542 for guest_ram_mapping in guest_ram_mappings { 543 for zone_config in zones_config { 544 if guest_ram_mapping.zone_id == zone_config.id { 545 let region = MemoryManager::create_ram_region( 546 &zone_config.file, 547 guest_ram_mapping.file_offset, 548 GuestAddress(guest_ram_mapping.gpa), 549 guest_ram_mapping.size as usize, 550 match prefault { 551 Some(pf) => pf, 552 None => zone_config.prefault, 553 }, 554 zone_config.shared, 555 zone_config.hugepages, 556 zone_config.hugepage_size, 557 zone_config.host_numa_node, 558 existing_memory_files.remove(&guest_ram_mapping.slot), 559 )?; 560 memory_regions.push(Arc::clone(®ion)); 561 if let Some(memory_zone) = memory_zones.get_mut(&guest_ram_mapping.zone_id) { 562 if guest_ram_mapping.virtio_mem { 563 let hotplugged_size = zone_config.hotplugged_size.unwrap_or(0); 564 let region_size = region.len(); 565 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 566 region, 567 resize_handler: virtio_devices::Resize::new(hotplugged_size) 568 .map_err(Error::EventFdFail)?, 569 hotplugged_size, 570 hugepages: zone_config.hugepages, 571 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 572 }); 573 } else { 574 memory_zone.regions.push(region); 575 } 576 } 577 } 578 } 579 } 580 581 memory_regions.sort_by_key(|x| x.start_addr()); 582 583 Ok((memory_regions, memory_zones)) 584 } 585 586 fn fill_saved_regions( 587 &mut self, 588 file_path: PathBuf, 589 saved_regions: MemoryRangeTable, 590 ) -> Result<(), Error> { 591 if saved_regions.is_empty() { 592 return Ok(()); 593 } 594 595 // Open (read only) the snapshot file. 596 let mut memory_file = OpenOptions::new() 597 .read(true) 598 .open(file_path) 599 .map_err(Error::SnapshotOpen)?; 600 601 let guest_memory = self.guest_memory.memory(); 602 for range in saved_regions.regions() { 603 let mut offset: u64 = 0; 604 // Here we are manually handling the retry in case we can't write 605 // the whole region at once because we can't use the implementation 606 // from vm-memory::GuestMemory of read_exact_from() as it is not 607 // following the correct behavior. 
For more info about this issue 608 // see: https://github.com/rust-vmm/vm-memory/issues/174 609 loop { 610 let bytes_read = guest_memory 611 .read_from( 612 GuestAddress(range.gpa + offset), 613 &mut memory_file, 614 (range.length - offset) as usize, 615 ) 616 .map_err(Error::SnapshotCopy)?; 617 offset += bytes_read as u64; 618 619 if offset == range.length { 620 break; 621 } 622 } 623 } 624 625 Ok(()) 626 } 627 628 fn validate_memory_config( 629 config: &MemoryConfig, 630 user_provided_zones: bool, 631 ) -> Result<(u64, Vec<MemoryZoneConfig>, bool), Error> { 632 let mut allow_mem_hotplug = false; 633 634 if !user_provided_zones { 635 if config.zones.is_some() { 636 error!( 637 "User defined memory regions can't be provided if the \ 638 memory size is not 0" 639 ); 640 return Err(Error::InvalidMemoryParameters); 641 } 642 643 if config.hotplug_size.is_some() { 644 allow_mem_hotplug = true; 645 } 646 647 if let Some(hotplugged_size) = config.hotplugged_size { 648 if let Some(hotplug_size) = config.hotplug_size { 649 if hotplugged_size > hotplug_size { 650 error!( 651 "'hotplugged_size' {} can't be bigger than \ 652 'hotplug_size' {}", 653 hotplugged_size, hotplug_size, 654 ); 655 return Err(Error::InvalidMemoryParameters); 656 } 657 } else { 658 error!( 659 "Invalid to define 'hotplugged_size' when there is\ 660 no 'hotplug_size'" 661 ); 662 return Err(Error::InvalidMemoryParameters); 663 } 664 if config.hotplug_method == HotplugMethod::Acpi { 665 error!( 666 "Invalid to define 'hotplugged_size' with hotplug \ 667 method 'acpi'" 668 ); 669 return Err(Error::InvalidMemoryParameters); 670 } 671 } 672 673 // Create a single zone from the global memory config. This lets 674 // us reuse the codepath for user defined memory zones. 675 let zones = vec![MemoryZoneConfig { 676 id: String::from(DEFAULT_MEMORY_ZONE), 677 size: config.size, 678 file: None, 679 shared: config.shared, 680 hugepages: config.hugepages, 681 hugepage_size: config.hugepage_size, 682 host_numa_node: None, 683 hotplug_size: config.hotplug_size, 684 hotplugged_size: config.hotplugged_size, 685 prefault: config.prefault, 686 }]; 687 688 Ok((config.size, zones, allow_mem_hotplug)) 689 } else { 690 if config.zones.is_none() { 691 error!( 692 "User defined memory regions must be provided if the \ 693 memory size is 0" 694 ); 695 return Err(Error::MissingMemoryZones); 696 } 697 698 // Safe to unwrap as we checked right above there were some 699 // regions. 
700 let zones = config.zones.clone().unwrap(); 701 if zones.is_empty() { 702 return Err(Error::MissingMemoryZones); 703 } 704 705 let mut total_ram_size: u64 = 0; 706 for zone in zones.iter() { 707 total_ram_size += zone.size; 708 709 if zone.shared && zone.file.is_some() && zone.host_numa_node.is_some() { 710 error!( 711 "Invalid to set host NUMA policy for a memory zone \ 712 backed by a regular file and mapped as 'shared'" 713 ); 714 return Err(Error::InvalidSharedMemoryZoneWithHostNuma); 715 } 716 717 if zone.hotplug_size.is_some() && config.hotplug_method == HotplugMethod::Acpi { 718 error!("Invalid to set ACPI hotplug method for memory zones"); 719 return Err(Error::InvalidHotplugMethodWithMemoryZones); 720 } 721 722 if let Some(hotplugged_size) = zone.hotplugged_size { 723 if let Some(hotplug_size) = zone.hotplug_size { 724 if hotplugged_size > hotplug_size { 725 error!( 726 "'hotplugged_size' {} can't be bigger than \ 727 'hotplug_size' {}", 728 hotplugged_size, hotplug_size, 729 ); 730 return Err(Error::InvalidMemoryParameters); 731 } 732 } else { 733 error!( 734 "Invalid to define 'hotplugged_size' when there is\ 735 no 'hotplug_size' for a memory zone" 736 ); 737 return Err(Error::InvalidMemoryParameters); 738 } 739 if config.hotplug_method == HotplugMethod::Acpi { 740 error!( 741 "Invalid to define 'hotplugged_size' with hotplug \ 742 method 'acpi'" 743 ); 744 return Err(Error::InvalidMemoryParameters); 745 } 746 } 747 } 748 749 Ok((total_ram_size, zones, allow_mem_hotplug)) 750 } 751 } 752 753 fn allocate_address_space(&mut self) -> Result<(), Error> { 754 let mut list = Vec::new(); 755 756 for (zone_id, memory_zone) in self.memory_zones.iter() { 757 let mut regions: Vec<(Arc<vm_memory::GuestRegionMmap<AtomicBitmap>>, bool)> = 758 memory_zone 759 .regions() 760 .iter() 761 .map(|r| (r.clone(), false)) 762 .collect(); 763 764 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 765 regions.push((virtio_mem_zone.region().clone(), true)); 766 } 767 768 list.push((zone_id.clone(), regions)); 769 } 770 771 for (zone_id, regions) in list { 772 for (region, virtio_mem) in regions { 773 let slot = self.create_userspace_mapping( 774 region.start_addr().raw_value(), 775 region.len() as u64, 776 region.as_ptr() as u64, 777 self.mergeable, 778 false, 779 self.log_dirty, 780 )?; 781 782 let file_offset = if let Some(file_offset) = region.file_offset() { 783 file_offset.start() 784 } else { 785 0 786 }; 787 788 self.guest_ram_mappings.push(GuestRamMapping { 789 gpa: region.start_addr().raw_value(), 790 size: region.len(), 791 slot, 792 zone_id: zone_id.clone(), 793 virtio_mem, 794 file_offset, 795 }); 796 self.ram_allocator 797 .allocate(Some(region.start_addr()), region.len(), None) 798 .ok_or(Error::MemoryRangeAllocation)?; 799 } 800 } 801 802 // Allocate SubRegion and Reserved address ranges. 803 for region in self.arch_mem_regions.iter() { 804 if region.r_type == RegionType::Ram { 805 // Ignore the RAM type since ranges have already been allocated 806 // based on the GuestMemory regions. 
807 continue; 808 } 809 self.ram_allocator 810 .allocate( 811 Some(GuestAddress(region.base)), 812 region.size as GuestUsize, 813 None, 814 ) 815 .ok_or(Error::MemoryRangeAllocation)?; 816 } 817 818 Ok(()) 819 } 820 821 #[allow(clippy::too_many_arguments)] 822 pub fn new( 823 vm: Arc<dyn hypervisor::Vm>, 824 config: &MemoryConfig, 825 prefault: Option<bool>, 826 phys_bits: u8, 827 #[cfg(feature = "tdx")] tdx_enabled: bool, 828 restore_data: Option<&MemoryManagerSnapshotData>, 829 existing_memory_files: Option<HashMap<u32, File>>, 830 #[cfg(target_arch = "x86_64")] sgx_epc_config: Option<Vec<SgxEpcConfig>>, 831 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 832 let user_provided_zones = config.size == 0; 833 834 let mmio_address_space_size = mmio_address_space_size(phys_bits); 835 debug_assert_eq!( 836 (((mmio_address_space_size) >> 16) << 16), 837 mmio_address_space_size 838 ); 839 let start_of_platform_device_area = 840 GuestAddress(mmio_address_space_size - PLATFORM_DEVICE_AREA_SIZE); 841 let end_of_device_area = start_of_platform_device_area.unchecked_sub(1); 842 843 let (ram_size, zones, allow_mem_hotplug) = 844 Self::validate_memory_config(config, user_provided_zones)?; 845 846 let ( 847 start_of_device_area, 848 boot_ram, 849 current_ram, 850 arch_mem_regions, 851 memory_zones, 852 guest_memory, 853 boot_guest_memory, 854 hotplug_slots, 855 next_memory_slot, 856 selected_slot, 857 next_hotplug_slot, 858 ) = if let Some(data) = restore_data { 859 let (regions, memory_zones) = Self::restore_memory_regions_and_zones( 860 &data.guest_ram_mappings, 861 &zones, 862 prefault, 863 existing_memory_files.unwrap_or_default(), 864 )?; 865 let guest_memory = 866 GuestMemoryMmap::from_arc_regions(regions).map_err(Error::GuestMemory)?; 867 let boot_guest_memory = guest_memory.clone(); 868 ( 869 GuestAddress(data.start_of_device_area), 870 data.boot_ram, 871 data.current_ram, 872 data.arch_mem_regions.clone(), 873 memory_zones, 874 guest_memory, 875 boot_guest_memory, 876 data.hotplug_slots.clone(), 877 data.next_memory_slot, 878 data.selected_slot, 879 data.next_hotplug_slot, 880 ) 881 } else { 882 // Init guest memory 883 let arch_mem_regions = arch::arch_memory_regions(ram_size); 884 885 let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions 886 .iter() 887 .filter(|r| r.2 == RegionType::Ram) 888 .map(|r| (r.0, r.1)) 889 .collect(); 890 891 let arch_mem_regions: Vec<ArchMemRegion> = arch_mem_regions 892 .iter() 893 .map(|(a, b, c)| ArchMemRegion { 894 base: a.0, 895 size: *b, 896 r_type: *c, 897 }) 898 .collect(); 899 900 let (mem_regions, mut memory_zones) = 901 Self::create_memory_regions_from_zones(&ram_regions, &zones, prefault)?; 902 903 let mut guest_memory = 904 GuestMemoryMmap::from_arc_regions(mem_regions).map_err(Error::GuestMemory)?; 905 906 let boot_guest_memory = guest_memory.clone(); 907 908 let mut start_of_device_area = 909 MemoryManager::start_addr(guest_memory.last_addr(), allow_mem_hotplug)?; 910 911 // Update list of memory zones for resize. 912 for zone in zones.iter() { 913 if let Some(memory_zone) = memory_zones.get_mut(&zone.id) { 914 if let Some(hotplug_size) = zone.hotplug_size { 915 if hotplug_size == 0 { 916 error!("'hotplug_size' can't be 0"); 917 return Err(Error::InvalidHotplugSize); 918 } 919 920 if !user_provided_zones && config.hotplug_method == HotplugMethod::Acpi { 921 start_of_device_area = start_of_device_area 922 .checked_add(hotplug_size) 923 .ok_or(Error::GuestAddressOverFlow)?; 924 } else { 925 // Alignment must be "natural" i.e. 
same as size of block 926 let start_addr = GuestAddress( 927 (start_of_device_area.0 + virtio_devices::VIRTIO_MEM_ALIGN_SIZE 928 - 1) 929 / virtio_devices::VIRTIO_MEM_ALIGN_SIZE 930 * virtio_devices::VIRTIO_MEM_ALIGN_SIZE, 931 ); 932 933 // When `prefault` is set by vm_restore, memory manager 934 // will create ram region with `prefault` option in 935 // restore config rather than same option in zone 936 let region = MemoryManager::create_ram_region( 937 &None, 938 0, 939 start_addr, 940 hotplug_size as usize, 941 match prefault { 942 Some(pf) => pf, 943 None => zone.prefault, 944 }, 945 zone.shared, 946 zone.hugepages, 947 zone.hugepage_size, 948 zone.host_numa_node, 949 None, 950 )?; 951 952 guest_memory = guest_memory 953 .insert_region(Arc::clone(®ion)) 954 .map_err(Error::GuestMemory)?; 955 956 let hotplugged_size = zone.hotplugged_size.unwrap_or(0); 957 let region_size = region.len(); 958 memory_zone.virtio_mem_zone = Some(VirtioMemZone { 959 region, 960 resize_handler: virtio_devices::Resize::new(hotplugged_size) 961 .map_err(Error::EventFdFail)?, 962 hotplugged_size, 963 hugepages: zone.hugepages, 964 blocks_state: Arc::new(Mutex::new(BlocksState::new(region_size))), 965 }); 966 967 start_of_device_area = start_addr 968 .checked_add(hotplug_size) 969 .ok_or(Error::GuestAddressOverFlow)?; 970 } 971 } 972 } else { 973 return Err(Error::MissingZoneIdentifier); 974 } 975 } 976 977 let mut hotplug_slots = Vec::with_capacity(HOTPLUG_COUNT); 978 hotplug_slots.resize_with(HOTPLUG_COUNT, HotPlugState::default); 979 980 ( 981 start_of_device_area, 982 ram_size, 983 ram_size, 984 arch_mem_regions, 985 memory_zones, 986 guest_memory, 987 boot_guest_memory, 988 hotplug_slots, 989 0, 990 0, 991 0, 992 ) 993 }; 994 995 let guest_memory = GuestMemoryAtomic::new(guest_memory); 996 997 // Both MMIO and PIO address spaces start at address 0. 998 let allocator = Arc::new(Mutex::new( 999 SystemAllocator::new( 1000 #[cfg(target_arch = "x86_64")] 1001 { 1002 GuestAddress(0) 1003 }, 1004 #[cfg(target_arch = "x86_64")] 1005 { 1006 1 << 16 1007 }, 1008 start_of_platform_device_area, 1009 PLATFORM_DEVICE_AREA_SIZE, 1010 layout::MEM_32BIT_DEVICES_START, 1011 layout::MEM_32BIT_DEVICES_SIZE, 1012 #[cfg(target_arch = "x86_64")] 1013 vec![GsiApic::new( 1014 X86_64_IRQ_BASE, 1015 ioapic::NUM_IOAPIC_PINS as u32 - X86_64_IRQ_BASE, 1016 )], 1017 ) 1018 .ok_or(Error::CreateSystemAllocator)?, 1019 )); 1020 1021 #[cfg(not(feature = "tdx"))] 1022 let dynamic = true; 1023 #[cfg(feature = "tdx")] 1024 let dynamic = !tdx_enabled; 1025 1026 let hotplug_method = config.hotplug_method.clone(); 1027 1028 let acpi_address = if dynamic && hotplug_method == HotplugMethod::Acpi { 1029 Some( 1030 allocator 1031 .lock() 1032 .unwrap() 1033 .allocate_platform_mmio_addresses(None, MEMORY_MANAGER_ACPI_SIZE as u64, None) 1034 .ok_or(Error::AllocateMmioAddress)?, 1035 ) 1036 } else { 1037 None 1038 }; 1039 1040 // If running on SGX the start of device area and RAM area may diverge but 1041 // at this point they are next to each other. 
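// (setup_sgx() is what later moves start_of_device_area past the EPC region,
// while end_of_ram_area keeps marking the upper bound used for RAM hotplug.)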
1042 let end_of_ram_area = start_of_device_area.unchecked_sub(1); 1043 let ram_allocator = AddressAllocator::new(GuestAddress(0), start_of_device_area.0).unwrap(); 1044 1045 let mut memory_manager = MemoryManager { 1046 boot_guest_memory, 1047 guest_memory, 1048 next_memory_slot, 1049 start_of_device_area, 1050 end_of_device_area, 1051 end_of_ram_area, 1052 vm, 1053 hotplug_slots, 1054 selected_slot, 1055 mergeable: config.mergeable, 1056 allocator, 1057 hotplug_method, 1058 boot_ram, 1059 current_ram, 1060 next_hotplug_slot, 1061 shared: config.shared, 1062 hugepages: config.hugepages, 1063 hugepage_size: config.hugepage_size, 1064 prefault: config.prefault, 1065 #[cfg(target_arch = "x86_64")] 1066 sgx_epc_region: None, 1067 user_provided_zones, 1068 snapshot_memory_ranges: MemoryRangeTable::default(), 1069 memory_zones, 1070 guest_ram_mappings: Vec::new(), 1071 1072 acpi_address, 1073 log_dirty: dynamic, // Cannot log dirty pages on a TD 1074 arch_mem_regions, 1075 ram_allocator, 1076 dynamic, 1077 }; 1078 1079 memory_manager.allocate_address_space()?; 1080 #[cfg(target_arch = "x86_64")] 1081 if let Some(sgx_epc_config) = sgx_epc_config { 1082 memory_manager.setup_sgx(sgx_epc_config)?; 1083 } 1084 1085 Ok(Arc::new(Mutex::new(memory_manager))) 1086 } 1087 1088 pub fn new_from_snapshot( 1089 snapshot: &Snapshot, 1090 vm: Arc<dyn hypervisor::Vm>, 1091 config: &MemoryConfig, 1092 source_url: Option<&str>, 1093 prefault: bool, 1094 phys_bits: u8, 1095 ) -> Result<Arc<Mutex<MemoryManager>>, Error> { 1096 if let Some(source_url) = source_url { 1097 let mut memory_file_path = url_to_path(source_url).map_err(Error::Restore)?; 1098 memory_file_path.push(String::from(SNAPSHOT_FILENAME)); 1099 1100 let mem_snapshot: MemoryManagerSnapshotData = snapshot 1101 .to_versioned_state(MEMORY_MANAGER_SNAPSHOT_ID) 1102 .map_err(Error::Restore)?; 1103 1104 let mm = MemoryManager::new( 1105 vm, 1106 config, 1107 Some(prefault), 1108 phys_bits, 1109 #[cfg(feature = "tdx")] 1110 false, 1111 Some(&mem_snapshot), 1112 None, 1113 #[cfg(target_arch = "x86_64")] 1114 None, 1115 )?; 1116 1117 mm.lock() 1118 .unwrap() 1119 .fill_saved_regions(memory_file_path, mem_snapshot.memory_ranges)?; 1120 1121 Ok(mm) 1122 } else { 1123 Err(Error::RestoreMissingSourceUrl) 1124 } 1125 } 1126 1127 fn memfd_create(name: &ffi::CStr, flags: u32) -> Result<RawFd, io::Error> { 1128 let res = unsafe { libc::syscall(libc::SYS_memfd_create, name.as_ptr(), flags) }; 1129 1130 if res < 0 { 1131 Err(io::Error::last_os_error()) 1132 } else { 1133 Ok(res as RawFd) 1134 } 1135 } 1136 1137 fn mbind( 1138 addr: *mut u8, 1139 len: u64, 1140 mode: u32, 1141 nodemask: Vec<u64>, 1142 maxnode: u64, 1143 flags: u32, 1144 ) -> Result<(), io::Error> { 1145 let res = unsafe { 1146 libc::syscall( 1147 libc::SYS_mbind, 1148 addr as *mut libc::c_void, 1149 len, 1150 mode, 1151 nodemask.as_ptr(), 1152 maxnode, 1153 flags, 1154 ) 1155 }; 1156 1157 if res < 0 { 1158 Err(io::Error::last_os_error()) 1159 } else { 1160 Ok(()) 1161 } 1162 } 1163 1164 fn open_memory_file( 1165 backing_file: &Option<PathBuf>, 1166 file_offset: u64, 1167 size: usize, 1168 hugepages: bool, 1169 hugepage_size: Option<u64>, 1170 ) -> Result<(File, u64), Error> { 1171 let (f, f_off) = match backing_file { 1172 Some(ref file) => { 1173 if file.is_dir() { 1174 // Override file offset as it does not apply in this case. 1175 info!( 1176 "Ignoring file offset since the backing file is a \ 1177 temporary file created from the specified directory." 
1178 ); 1179 let fs_str = format!("{}{}", file.display(), "/tmpfile_XXXXXX"); 1180 let fs = ffi::CString::new(fs_str).unwrap(); 1181 let mut path = fs.as_bytes_with_nul().to_owned(); 1182 let path_ptr = path.as_mut_ptr() as *mut _; 1183 let fd = unsafe { libc::mkstemp(path_ptr) }; 1184 unsafe { libc::unlink(path_ptr) }; 1185 let f = unsafe { File::from_raw_fd(fd) }; 1186 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?; 1187 1188 (f, 0) 1189 } else { 1190 let f = OpenOptions::new() 1191 .read(true) 1192 .write(true) 1193 .open(file) 1194 .map_err(Error::SharedFileCreate)?; 1195 1196 (f, file_offset) 1197 } 1198 } 1199 None => { 1200 let fd = Self::memfd_create( 1201 &ffi::CString::new("ch_ram").unwrap(), 1202 if hugepages { 1203 libc::MFD_HUGETLB 1204 | if let Some(hugepage_size) = hugepage_size { 1205 /* 1206 * From the Linux kernel: 1207 * Several system calls take a flag to request "hugetlb" huge pages. 1208 * Without further specification, these system calls will use the 1209 * system's default huge page size. If a system supports multiple 1210 * huge page sizes, the desired huge page size can be specified in 1211 * bits [26:31] of the flag arguments. The value in these 6 bits 1212 * will encode the log2 of the huge page size. 1213 */ 1214 1215 hugepage_size.trailing_zeros() << 26 1216 } else { 1217 // Use the system default huge page size 1218 0 1219 } 1220 } else { 1221 0 1222 }, 1223 ) 1224 .map_err(Error::SharedFileCreate)?; 1225 1226 let f = unsafe { File::from_raw_fd(fd) }; 1227 f.set_len(size as u64).map_err(Error::SharedFileSetLen)?; 1228 1229 (f, 0) 1230 } 1231 }; 1232 1233 Ok((f, f_off)) 1234 } 1235 1236 #[allow(clippy::too_many_arguments)] 1237 fn create_ram_region( 1238 backing_file: &Option<PathBuf>, 1239 file_offset: u64, 1240 start_addr: GuestAddress, 1241 size: usize, 1242 prefault: bool, 1243 shared: bool, 1244 hugepages: bool, 1245 hugepage_size: Option<u64>, 1246 host_numa_node: Option<u32>, 1247 existing_memory_file: Option<File>, 1248 ) -> Result<Arc<GuestRegionMmap>, Error> { 1249 let (f, f_off) = if let Some(f) = existing_memory_file { 1250 (f, file_offset) 1251 } else { 1252 Self::open_memory_file(backing_file, file_offset, size, hugepages, hugepage_size)? 1253 }; 1254 1255 let mut mmap_flags = libc::MAP_NORESERVE 1256 | if shared { 1257 libc::MAP_SHARED 1258 } else { 1259 libc::MAP_PRIVATE 1260 }; 1261 if prefault { 1262 mmap_flags |= libc::MAP_POPULATE; 1263 } 1264 1265 let region = GuestRegionMmap::new( 1266 MmapRegion::build( 1267 Some(FileOffset::new(f, f_off)), 1268 size, 1269 libc::PROT_READ | libc::PROT_WRITE, 1270 mmap_flags, 1271 ) 1272 .map_err(Error::GuestMemoryRegion)?, 1273 start_addr, 1274 ) 1275 .map_err(Error::GuestMemory)?; 1276 1277 // Apply NUMA policy if needed. 1278 if let Some(node) = host_numa_node { 1279 let addr = region.deref().as_ptr(); 1280 let len = region.deref().size() as u64; 1281 let mode = MPOL_BIND; 1282 let mut nodemask: Vec<u64> = Vec::new(); 1283 let flags = MPOL_MF_STRICT | MPOL_MF_MOVE; 1284 1285 // Linux is kind of buggy in the way it interprets maxnode as it 1286 // will cut off the last node. That's why we have to add 1 to what 1287 // we would consider as the proper maxnode value. 1288 let maxnode = node as u64 + 1 + 1; 1289 1290 // Allocate the right size for the vector. 1291 nodemask.resize((node as usize / 64) + 1, 0); 1292 1293 // Fill the global bitmask through the nodemask vector. 
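// For example, host_numa_node = 68 gives idx = 1 and shift = 4, so bit 4 of
// the second u64 in the mask is set, and maxnode ends up being 70.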
1294 let idx = (node / 64) as usize; 1295 let shift = node % 64; 1296 nodemask[idx] |= 1u64 << shift; 1297 1298 // Policies are enforced by using MPOL_MF_MOVE flag as it will 1299 // force the kernel to move all pages that might have been already 1300 // allocated to the proper set of NUMA nodes. MPOL_MF_STRICT is 1301 // used to throw an error if MPOL_MF_MOVE didn't succeed. 1302 // MPOL_BIND is the selected mode as it specifies a strict policy 1303 // that restricts memory allocation to the nodes specified in the 1304 // nodemask. 1305 Self::mbind(addr, len, mode, nodemask, maxnode, flags) 1306 .map_err(Error::ApplyNumaPolicy)?; 1307 } 1308 1309 Ok(Arc::new(region)) 1310 } 1311 1312 // Update the GuestMemoryMmap with the new range 1313 fn add_region(&mut self, region: Arc<GuestRegionMmap>) -> Result<(), Error> { 1314 let guest_memory = self 1315 .guest_memory 1316 .memory() 1317 .insert_region(region) 1318 .map_err(Error::GuestMemory)?; 1319 self.guest_memory.lock().unwrap().replace(guest_memory); 1320 1321 Ok(()) 1322 } 1323 1324 // 1325 // Calculate the start address of an area next to RAM. 1326 // 1327 // If memory hotplug is allowed, the start address needs to be aligned 1328 // (rounded-up) to 128MiB boundary. 1329 // If memory hotplug is not allowed, there is no alignment required. 1330 // And it must also start at the 64bit start. 1331 fn start_addr(mem_end: GuestAddress, allow_mem_hotplug: bool) -> Result<GuestAddress, Error> { 1332 let mut start_addr = if allow_mem_hotplug { 1333 GuestAddress(mem_end.0 | ((128 << 20) - 1)) 1334 } else { 1335 mem_end 1336 }; 1337 1338 start_addr = start_addr 1339 .checked_add(1) 1340 .ok_or(Error::GuestAddressOverFlow)?; 1341 1342 if mem_end < arch::layout::MEM_32BIT_RESERVED_START { 1343 return Ok(arch::layout::RAM_64BIT_START); 1344 } 1345 1346 Ok(start_addr) 1347 } 1348 1349 pub fn add_ram_region( 1350 &mut self, 1351 start_addr: GuestAddress, 1352 size: usize, 1353 ) -> Result<Arc<GuestRegionMmap>, Error> { 1354 // Allocate memory for the region 1355 let region = MemoryManager::create_ram_region( 1356 &None, 1357 0, 1358 start_addr, 1359 size, 1360 self.prefault, 1361 self.shared, 1362 self.hugepages, 1363 self.hugepage_size, 1364 None, 1365 None, 1366 )?; 1367 1368 // Map it into the guest 1369 let slot = self.create_userspace_mapping( 1370 region.start_addr().0, 1371 region.len() as u64, 1372 region.as_ptr() as u64, 1373 self.mergeable, 1374 false, 1375 self.log_dirty, 1376 )?; 1377 self.guest_ram_mappings.push(GuestRamMapping { 1378 gpa: region.start_addr().raw_value(), 1379 size: region.len(), 1380 slot, 1381 zone_id: DEFAULT_MEMORY_ZONE.to_string(), 1382 virtio_mem: false, 1383 file_offset: 0, 1384 }); 1385 1386 self.add_region(Arc::clone(®ion))?; 1387 1388 Ok(region) 1389 } 1390 1391 fn hotplug_ram_region(&mut self, size: usize) -> Result<Arc<GuestRegionMmap>, Error> { 1392 info!("Hotplugging new RAM: {}", size); 1393 1394 // Check that there is a free slot 1395 if self.next_hotplug_slot >= HOTPLUG_COUNT { 1396 return Err(Error::NoSlotAvailable); 1397 } 1398 1399 // "Inserted" DIMM must have a size that is a multiple of 128MiB 1400 if size % (128 << 20) != 0 { 1401 return Err(Error::InvalidSize); 1402 } 1403 1404 let start_addr = MemoryManager::start_addr(self.guest_memory.memory().last_addr(), true)?; 1405 1406 if start_addr.checked_add(size.try_into().unwrap()).unwrap() >= self.end_of_ram_area { 1407 return Err(Error::InsufficientHotplugRam); 1408 } 1409 1410 let region = self.add_ram_region(start_addr, size)?; 1411 1412 // Add region 
to the list of regions associated with the default 1413 // memory zone. 1414 if let Some(memory_zone) = self.memory_zones.get_mut(DEFAULT_MEMORY_ZONE) { 1415 memory_zone.regions.push(Arc::clone(®ion)); 1416 } 1417 1418 // Tell the allocator 1419 self.ram_allocator 1420 .allocate(Some(start_addr), size as GuestUsize, None) 1421 .ok_or(Error::MemoryRangeAllocation)?; 1422 1423 // Update the slot so that it can be queried via the I/O port 1424 let mut slot = &mut self.hotplug_slots[self.next_hotplug_slot]; 1425 slot.active = true; 1426 slot.inserting = true; 1427 slot.base = region.start_addr().0; 1428 slot.length = region.len() as u64; 1429 1430 self.next_hotplug_slot += 1; 1431 1432 Ok(region) 1433 } 1434 1435 pub fn guest_memory(&self) -> GuestMemoryAtomic<GuestMemoryMmap> { 1436 self.guest_memory.clone() 1437 } 1438 1439 pub fn boot_guest_memory(&self) -> GuestMemoryMmap { 1440 self.boot_guest_memory.clone() 1441 } 1442 1443 pub fn allocator(&self) -> Arc<Mutex<SystemAllocator>> { 1444 self.allocator.clone() 1445 } 1446 1447 pub fn start_of_device_area(&self) -> GuestAddress { 1448 self.start_of_device_area 1449 } 1450 1451 pub fn end_of_device_area(&self) -> GuestAddress { 1452 self.end_of_device_area 1453 } 1454 1455 pub fn allocate_memory_slot(&mut self) -> u32 { 1456 let slot_id = self.next_memory_slot; 1457 self.next_memory_slot += 1; 1458 slot_id 1459 } 1460 1461 pub fn create_userspace_mapping( 1462 &mut self, 1463 guest_phys_addr: u64, 1464 memory_size: u64, 1465 userspace_addr: u64, 1466 mergeable: bool, 1467 readonly: bool, 1468 log_dirty: bool, 1469 ) -> Result<u32, Error> { 1470 let slot = self.allocate_memory_slot(); 1471 let mem_region = self.vm.make_user_memory_region( 1472 slot, 1473 guest_phys_addr, 1474 memory_size, 1475 userspace_addr, 1476 readonly, 1477 log_dirty, 1478 ); 1479 1480 info!( 1481 "Creating userspace mapping: {:x} -> {:x} {:x}, slot {}", 1482 guest_phys_addr, userspace_addr, memory_size, slot 1483 ); 1484 1485 self.vm 1486 .create_user_memory_region(mem_region) 1487 .map_err(Error::CreateUserMemoryRegion)?; 1488 1489 // Mark the pages as mergeable if explicitly asked for. 1490 if mergeable { 1491 // Safe because the address and size are valid since the 1492 // mmap succeeded. 1493 let ret = unsafe { 1494 libc::madvise( 1495 userspace_addr as *mut libc::c_void, 1496 memory_size as libc::size_t, 1497 libc::MADV_MERGEABLE, 1498 ) 1499 }; 1500 if ret != 0 { 1501 let err = io::Error::last_os_error(); 1502 // Safe to unwrap because the error is constructed with 1503 // last_os_error(), which ensures the output will be Some(). 
1504 let errno = err.raw_os_error().unwrap(); 1505 if errno == libc::EINVAL { 1506 warn!("kernel not configured with CONFIG_KSM"); 1507 } else { 1508 warn!("madvise error: {}", err); 1509 } 1510 warn!("failed to mark pages as mergeable"); 1511 } 1512 } 1513 1514 info!( 1515 "Created userspace mapping: {:x} -> {:x} {:x}", 1516 guest_phys_addr, userspace_addr, memory_size 1517 ); 1518 1519 Ok(slot) 1520 } 1521 1522 pub fn remove_userspace_mapping( 1523 &mut self, 1524 guest_phys_addr: u64, 1525 memory_size: u64, 1526 userspace_addr: u64, 1527 mergeable: bool, 1528 slot: u32, 1529 ) -> Result<(), Error> { 1530 let mem_region = self.vm.make_user_memory_region( 1531 slot, 1532 guest_phys_addr, 1533 memory_size, 1534 userspace_addr, 1535 false, /* readonly -- don't care */ 1536 false, /* log dirty */ 1537 ); 1538 1539 self.vm 1540 .remove_user_memory_region(mem_region) 1541 .map_err(Error::RemoveUserMemoryRegion)?; 1542 1543 // Mark the pages as unmergeable if there were previously marked as 1544 // mergeable. 1545 if mergeable { 1546 // Safe because the address and size are valid as the region was 1547 // previously advised. 1548 let ret = unsafe { 1549 libc::madvise( 1550 userspace_addr as *mut libc::c_void, 1551 memory_size as libc::size_t, 1552 libc::MADV_UNMERGEABLE, 1553 ) 1554 }; 1555 if ret != 0 { 1556 let err = io::Error::last_os_error(); 1557 // Safe to unwrap because the error is constructed with 1558 // last_os_error(), which ensures the output will be Some(). 1559 let errno = err.raw_os_error().unwrap(); 1560 if errno == libc::EINVAL { 1561 warn!("kernel not configured with CONFIG_KSM"); 1562 } else { 1563 warn!("madvise error: {}", err); 1564 } 1565 warn!("failed to mark pages as unmergeable"); 1566 } 1567 } 1568 1569 info!( 1570 "Removed userspace mapping: {:x} -> {:x} {:x}", 1571 guest_phys_addr, userspace_addr, memory_size 1572 ); 1573 1574 Ok(()) 1575 } 1576 1577 pub fn virtio_mem_resize(&mut self, id: &str, size: u64) -> Result<(), Error> { 1578 if let Some(memory_zone) = self.memory_zones.get_mut(id) { 1579 if let Some(virtio_mem_zone) = &mut memory_zone.virtio_mem_zone { 1580 virtio_mem_zone 1581 .resize_handler() 1582 .work(size) 1583 .map_err(Error::VirtioMemResizeFail)?; 1584 1585 // Keep the hotplugged_size up to date. 1586 virtio_mem_zone.hotplugged_size = size; 1587 } else { 1588 error!("Failed resizing virtio-mem region: No virtio-mem handler"); 1589 return Err(Error::MissingVirtioMemHandler); 1590 } 1591 1592 return Ok(()); 1593 } 1594 1595 error!("Failed resizing virtio-mem region: Unknown memory zone"); 1596 Err(Error::UnknownMemoryZone) 1597 } 1598 1599 /// In case this function resulted in adding a new memory region to the 1600 /// guest memory, the new region is returned to the caller. The virtio-mem 1601 /// use case never adds a new region as the whole hotpluggable memory has 1602 /// already been allocated at boot time. 1603 pub fn resize(&mut self, desired_ram: u64) -> Result<Option<Arc<GuestRegionMmap>>, Error> { 1604 if self.user_provided_zones { 1605 error!( 1606 "Not allowed to resize guest memory when backed with user \ 1607 defined memory zones." 
1608 ); 1609 return Err(Error::InvalidResizeWithMemoryZones); 1610 } 1611 1612 let mut region: Option<Arc<GuestRegionMmap>> = None; 1613 match self.hotplug_method { 1614 HotplugMethod::VirtioMem => { 1615 if desired_ram >= self.boot_ram { 1616 if !self.dynamic { 1617 return Ok(region); 1618 } 1619 1620 self.virtio_mem_resize(DEFAULT_MEMORY_ZONE, desired_ram - self.boot_ram)?; 1621 self.current_ram = desired_ram; 1622 } 1623 } 1624 HotplugMethod::Acpi => { 1625 if desired_ram > self.current_ram { 1626 if !self.dynamic { 1627 return Ok(region); 1628 } 1629 1630 region = 1631 Some(self.hotplug_ram_region((desired_ram - self.current_ram) as usize)?); 1632 self.current_ram = desired_ram; 1633 } 1634 } 1635 } 1636 Ok(region) 1637 } 1638 1639 pub fn resize_zone(&mut self, id: &str, virtio_mem_size: u64) -> Result<(), Error> { 1640 if !self.user_provided_zones { 1641 error!( 1642 "Not allowed to resize guest memory zone when no zone is \ 1643 defined." 1644 ); 1645 return Err(Error::ResizeZone); 1646 } 1647 1648 self.virtio_mem_resize(id, virtio_mem_size) 1649 } 1650 1651 #[cfg(target_arch = "x86_64")] 1652 pub fn setup_sgx(&mut self, sgx_epc_config: Vec<SgxEpcConfig>) -> Result<(), Error> { 1653 let file = OpenOptions::new() 1654 .read(true) 1655 .open("/dev/sgx_provision") 1656 .map_err(Error::SgxProvisionOpen)?; 1657 self.vm 1658 .enable_sgx_attribute(file) 1659 .map_err(Error::SgxEnableProvisioning)?; 1660 1661 // Go over each EPC section and verify its size is a 4k multiple. At 1662 // the same time, calculate the total size needed for the contiguous 1663 // EPC region. 1664 let mut epc_region_size = 0; 1665 for epc_section in sgx_epc_config.iter() { 1666 if epc_section.size == 0 { 1667 return Err(Error::EpcSectionSizeInvalid); 1668 } 1669 if epc_section.size & (SGX_PAGE_SIZE - 1) != 0 { 1670 return Err(Error::EpcSectionSizeInvalid); 1671 } 1672 1673 epc_region_size += epc_section.size; 1674 } 1675 1676 // Place the SGX EPC region on a 4k boundary between the RAM and the device area 1677 let epc_region_start = GuestAddress( 1678 ((self.start_of_device_area.0 + SGX_PAGE_SIZE - 1) / SGX_PAGE_SIZE) * SGX_PAGE_SIZE, 1679 ); 1680 1681 self.start_of_device_area = epc_region_start 1682 .checked_add(epc_region_size) 1683 .ok_or(Error::GuestAddressOverFlow)?; 1684 1685 let mut sgx_epc_region = SgxEpcRegion::new(epc_region_start, epc_region_size as GuestUsize); 1686 info!( 1687 "SGX EPC region: 0x{:x} (0x{:x})", 1688 epc_region_start.0, epc_region_size 1689 ); 1690 1691 // Each section can be memory mapped into the allocated region. 1692 let mut epc_section_start = epc_region_start.raw_value(); 1693 for epc_section in sgx_epc_config.iter() { 1694 let file = OpenOptions::new() 1695 .read(true) 1696 .write(true) 1697 .open("/dev/sgx_vepc") 1698 .map_err(Error::SgxVirtEpcOpen)?; 1699 1700 let prot = PROT_READ | PROT_WRITE; 1701 let mut flags = MAP_NORESERVE | MAP_SHARED; 1702 if epc_section.prefault { 1703 flags |= MAP_POPULATE; 1704 } 1705 1706 // We can't use the vm-memory crate to perform the memory mapping 1707 // here as it would try to ensure the size of the backing file is 1708 // matching the size of the expected mapping. The /dev/sgx_vepc 1709 // device does not work that way, it provides a file descriptor 1710 // which is not matching the mapping size, as it's a just a way to 1711 // let KVM know that an EPC section is being created for the guest. 
1712 let host_addr = unsafe { 1713 libc::mmap( 1714 std::ptr::null_mut(), 1715 epc_section.size as usize, 1716 prot, 1717 flags, 1718 file.as_raw_fd(), 1719 0, 1720 ) 1721 } as u64; 1722 1723 info!( 1724 "Adding SGX EPC section: 0x{:x} (0x{:x})", 1725 epc_section_start, epc_section.size 1726 ); 1727 1728 let _mem_slot = self.create_userspace_mapping( 1729 epc_section_start, 1730 epc_section.size, 1731 host_addr, 1732 false, 1733 false, 1734 false, 1735 )?; 1736 1737 sgx_epc_region.insert( 1738 epc_section.id.clone(), 1739 SgxEpcSection::new( 1740 GuestAddress(epc_section_start), 1741 epc_section.size as GuestUsize, 1742 ), 1743 ); 1744 1745 epc_section_start += epc_section.size; 1746 } 1747 1748 self.sgx_epc_region = Some(sgx_epc_region); 1749 1750 Ok(()) 1751 } 1752 1753 #[cfg(target_arch = "x86_64")] 1754 pub fn sgx_epc_region(&self) -> &Option<SgxEpcRegion> { 1755 &self.sgx_epc_region 1756 } 1757 1758 pub fn is_hardlink(f: &File) -> bool { 1759 let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit(); 1760 let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) }; 1761 if ret != 0 { 1762 error!("Couldn't fstat the backing file"); 1763 return false; 1764 } 1765 1766 unsafe { (*stat.as_ptr()).st_nlink as usize > 0 } 1767 } 1768 1769 pub fn memory_zones(&self) -> &MemoryZones { 1770 &self.memory_zones 1771 } 1772 1773 pub fn memory_range_table( 1774 &self, 1775 snapshot: bool, 1776 ) -> std::result::Result<MemoryRangeTable, MigratableError> { 1777 let mut table = MemoryRangeTable::default(); 1778 1779 for memory_zone in self.memory_zones.values() { 1780 if let Some(virtio_mem_zone) = memory_zone.virtio_mem_zone() { 1781 table.extend(virtio_mem_zone.plugged_ranges()); 1782 } 1783 1784 for region in memory_zone.regions() { 1785 if snapshot { 1786 if let Some(file_offset) = region.file_offset() { 1787 if (region.flags() & libc::MAP_SHARED == libc::MAP_SHARED) 1788 && Self::is_hardlink(file_offset.file()) 1789 { 1790 // In this very specific case, we know the memory 1791 // region is backed by a file on the host filesystem 1792 // that can be accessed by the user, and additionally 1793 // the mapping is shared, which means that modifications 1794 // to the content are written to the actual file. 1795 // When meeting these conditions, we can skip the 1796 // copy of the memory content for this specific region, 1797 // as we can assume the user will have it saved through 1798 // the backing file already. 
1799 continue; 1800 } 1801 } 1802 } 1803 1804 table.push(MemoryRange { 1805 gpa: region.start_addr().raw_value(), 1806 length: region.len() as u64, 1807 }); 1808 } 1809 } 1810 1811 Ok(table) 1812 } 1813 1814 pub fn snapshot_data(&self) -> MemoryManagerSnapshotData { 1815 MemoryManagerSnapshotData { 1816 memory_ranges: self.snapshot_memory_ranges.clone(), 1817 guest_ram_mappings: self.guest_ram_mappings.clone(), 1818 start_of_device_area: self.start_of_device_area.0, 1819 boot_ram: self.boot_ram, 1820 current_ram: self.current_ram, 1821 arch_mem_regions: self.arch_mem_regions.clone(), 1822 hotplug_slots: self.hotplug_slots.clone(), 1823 next_memory_slot: self.next_memory_slot, 1824 selected_slot: self.selected_slot, 1825 next_hotplug_slot: self.next_hotplug_slot, 1826 } 1827 } 1828 1829 pub fn memory_slot_fds(&self) -> HashMap<u32, RawFd> { 1830 let mut memory_slot_fds = HashMap::new(); 1831 for guest_ram_mapping in &self.guest_ram_mappings { 1832 let slot = guest_ram_mapping.slot; 1833 let guest_memory = self.guest_memory.memory(); 1834 let file = guest_memory 1835 .find_region(GuestAddress(guest_ram_mapping.gpa)) 1836 .unwrap() 1837 .file_offset() 1838 .unwrap() 1839 .file(); 1840 memory_slot_fds.insert(slot, file.as_raw_fd()); 1841 } 1842 memory_slot_fds 1843 } 1844 1845 pub fn acpi_address(&self) -> Option<GuestAddress> { 1846 self.acpi_address 1847 } 1848 } 1849 1850 struct MemoryNotify { 1851 slot_id: usize, 1852 } 1853 1854 impl Aml for MemoryNotify { 1855 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1856 let object = aml::Path::new(&format!("M{:03}", self.slot_id)); 1857 aml::If::new( 1858 &aml::Equal::new(&aml::Arg(0), &self.slot_id), 1859 vec![&aml::Notify::new(&object, &aml::Arg(1))], 1860 ) 1861 .append_aml_bytes(bytes) 1862 } 1863 } 1864 1865 struct MemorySlot { 1866 slot_id: usize, 1867 } 1868 1869 impl Aml for MemorySlot { 1870 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1871 aml::Device::new( 1872 format!("M{:03}", self.slot_id).as_str().into(), 1873 vec![ 1874 &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0C80")), 1875 &aml::Name::new("_UID".into(), &self.slot_id), 1876 /* 1877 _STA return value: 1878 Bit [0] – Set if the device is present. 1879 Bit [1] – Set if the device is enabled and decoding its resources. 1880 Bit [2] – Set if the device should be shown in the UI. 1881 Bit [3] – Set if the device is functioning properly (cleared if device failed its diagnostics). 1882 Bit [4] – Set if the battery is present. 1883 Bits [31:5] – Reserved (must be cleared). 
1884 */ 1885 &aml::Method::new( 1886 "_STA".into(), 1887 0, 1888 false, 1889 // Call into MSTA method which will interrogate device 1890 vec![&aml::Return::new(&aml::MethodCall::new( 1891 "MSTA".into(), 1892 vec![&self.slot_id], 1893 ))], 1894 ), 1895 // Get details of memory 1896 &aml::Method::new( 1897 "_CRS".into(), 1898 0, 1899 false, 1900 // Call into MCRS which provides actual memory details 1901 vec![&aml::Return::new(&aml::MethodCall::new( 1902 "MCRS".into(), 1903 vec![&self.slot_id], 1904 ))], 1905 ), 1906 ], 1907 ) 1908 .append_aml_bytes(bytes) 1909 } 1910 } 1911 1912 struct MemorySlots { 1913 slots: usize, 1914 } 1915 1916 impl Aml for MemorySlots { 1917 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1918 for slot_id in 0..self.slots { 1919 MemorySlot { slot_id }.append_aml_bytes(bytes); 1920 } 1921 } 1922 } 1923 1924 struct MemoryMethods { 1925 slots: usize, 1926 } 1927 1928 impl Aml for MemoryMethods { 1929 fn append_aml_bytes(&self, bytes: &mut Vec<u8>) { 1930 // Add "MTFY" notification method 1931 let mut memory_notifies = Vec::new(); 1932 for slot_id in 0..self.slots { 1933 memory_notifies.push(MemoryNotify { slot_id }); 1934 } 1935 1936 let mut memory_notifies_refs: Vec<&dyn aml::Aml> = Vec::new(); 1937 for memory_notifier in memory_notifies.iter() { 1938 memory_notifies_refs.push(memory_notifier); 1939 } 1940 1941 aml::Method::new("MTFY".into(), 2, true, memory_notifies_refs).append_aml_bytes(bytes); 1942 1943 // MSCN method 1944 aml::Method::new( 1945 "MSCN".into(), 1946 0, 1947 true, 1948 vec![ 1949 // Take lock defined above 1950 &aml::Acquire::new("MLCK".into(), 0xffff), 1951 &aml::Store::new(&aml::Local(0), &aml::ZERO), 1952 &aml::While::new( 1953 &aml::LessThan::new(&aml::Local(0), &self.slots), 1954 vec![ 1955 // Write slot number (in first argument) to I/O port via field 1956 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Local(0)), 1957 // Check if MINS bit is set (inserting) 1958 &aml::If::new( 1959 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE), 1960 // Notify device if it is 1961 vec![ 1962 &aml::MethodCall::new( 1963 "MTFY".into(), 1964 vec![&aml::Local(0), &aml::ONE], 1965 ), 1966 // Reset MINS bit 1967 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MINS"), &aml::ONE), 1968 ], 1969 ), 1970 // Check if MRMV bit is set 1971 &aml::If::new( 1972 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE), 1973 // Notify device if it is (with the eject constant 0x3) 1974 vec![ 1975 &aml::MethodCall::new("MTFY".into(), vec![&aml::Local(0), &3u8]), 1976 // Reset MRMV bit 1977 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MRMV"), &aml::ONE), 1978 ], 1979 ), 1980 &aml::Add::new(&aml::Local(0), &aml::Local(0), &aml::ONE), 1981 ], 1982 ), 1983 // Release lock 1984 &aml::Release::new("MLCK".into()), 1985 ], 1986 ) 1987 .append_aml_bytes(bytes); 1988 1989 // Memory status method 1990 aml::Method::new( 1991 "MSTA".into(), 1992 1, 1993 true, 1994 vec![ 1995 // Take lock defined above 1996 &aml::Acquire::new("MLCK".into(), 0xffff), 1997 // Write slot number (in first argument) to I/O port via field 1998 &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)), 1999 &aml::Store::new(&aml::Local(0), &aml::ZERO), 2000 // Check if MEN_ bit is set, if so make the local variable 0xf (see _STA for details of meaning) 2001 &aml::If::new( 2002 &aml::Equal::new(&aml::Path::new("\\_SB_.MHPC.MEN_"), &aml::ONE), 2003 vec![&aml::Store::new(&aml::Local(0), &0xfu8)], 2004 ), 2005 // Release lock 2006 &aml::Release::new("MLCK".into()), 2007 

        // Memory range method
        aml::Method::new(
            "MCRS".into(),
            1,
            true,
            vec![
                // Take lock defined above
                &aml::Acquire::new("MLCK".into(), 0xffff),
                // Write slot number (in first argument) to the memory-mapped selector field
                &aml::Store::new(&aml::Path::new("\\_SB_.MHPC.MSEL"), &aml::Arg(0)),
                &aml::Name::new(
                    "MR64".into(),
                    &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                        aml::AddressSpaceCachable::Cacheable,
                        true,
                        0x0000_0000_0000_0000u64,
                        0xFFFF_FFFF_FFFF_FFFEu64,
                    )]),
                ),
                // The offsets below index into the QWORD address space descriptor
                // held in MR64: _MIN starts at byte 14, _MAX at byte 22 and _LEN
                // at byte 38, each written as separate low/high dword halves.
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &14usize, "MINL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &18usize, "MINH".into()),
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &22usize, "MAXL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &26usize, "MAXH".into()),
                &aml::CreateField::<u64>::new(&aml::Path::new("MR64"), &38usize, "LENL".into()),
                &aml::CreateField::<u32>::new(&aml::Path::new("MR64"), &42usize, "LENH".into()),
                &aml::Store::new(&aml::Path::new("MINL"), &aml::Path::new("\\_SB_.MHPC.MHBL")),
                &aml::Store::new(&aml::Path::new("MINH"), &aml::Path::new("\\_SB_.MHPC.MHBH")),
                &aml::Store::new(&aml::Path::new("LENL"), &aml::Path::new("\\_SB_.MHPC.MHLL")),
                &aml::Store::new(&aml::Path::new("LENH"), &aml::Path::new("\\_SB_.MHPC.MHLH")),
                // Compute _MAX = _MIN + _LEN - 1, carrying between the low and
                // high dwords by hand.
                &aml::Add::new(
                    &aml::Path::new("MAXL"),
                    &aml::Path::new("MINL"),
                    &aml::Path::new("LENL"),
                ),
                &aml::Add::new(
                    &aml::Path::new("MAXH"),
                    &aml::Path::new("MINH"),
                    &aml::Path::new("LENH"),
                ),
                &aml::If::new(
                    &aml::LessThan::new(&aml::Path::new("MAXL"), &aml::Path::new("MINL")),
                    vec![&aml::Add::new(
                        &aml::Path::new("MAXH"),
                        &aml::ONE,
                        &aml::Path::new("MAXH"),
                    )],
                ),
                &aml::Subtract::new(&aml::Path::new("MAXL"), &aml::Path::new("MAXL"), &aml::ONE),
                // Release lock
                &aml::Release::new("MLCK".into()),
                &aml::Return::new(&aml::Path::new("MR64")),
            ],
        )
        .append_aml_bytes(bytes)
    }
}
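
// How the pieces above fit together: when RAM is hotplugged, the VMM marks the
// matching slot as "inserting" and notifies the guest through the GED device.
// The guest's GED handler evaluates MSCN, which walks every slot through the
// MSEL/MINS/MRMV fields of the MHPR region and issues a Notify() on the
// corresponding M0xx device via MTFY. The guest OS then re-evaluates that
// device's _STA and _CRS, which call back into MSTA and MCRS to read the slot
// state and its address range from the device.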

impl Aml for MemoryManager {
    fn append_aml_bytes(&self, bytes: &mut Vec<u8>) {
        if let Some(acpi_address) = self.acpi_address {
            // Memory Hotplug Controller
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Mutex to protect concurrent access as we write to choose slot and then read back status
                    &aml::Mutex::new("MLCK".into(), 0),
                    &aml::Name::new(
                        "_CRS".into(),
                        &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                            aml::AddressSpaceCachable::NotCacheable,
                            true,
                            acpi_address.0 as u64,
                            acpi_address.0 + MEMORY_MANAGER_ACPI_SIZE as u64 - 1,
                        )]),
                    ),
                    // OpRegion and Fields map MMIO range into individual field values
                    &aml::OpRegion::new(
                        "MHPR".into(),
                        aml::OpRegionSpace::SystemMemory,
                        acpi_address.0 as usize,
                        MEMORY_MANAGER_ACPI_SIZE,
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MHBL", 32), // Base (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHBH", 32), // Base (high 4 bytes)
                            aml::FieldEntry::Named(*b"MHLL", 32), // Length (low 4 bytes)
                            aml::FieldEntry::Named(*b"MHLH", 32), // Length (high 4 bytes)
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Reserved(128),
                            aml::FieldEntry::Named(*b"MHPX", 32), // PXM
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::Byte,
                        aml::FieldUpdateRule::WriteAsZeroes,
                        vec![
                            aml::FieldEntry::Reserved(160),
                            aml::FieldEntry::Named(*b"MEN_", 1), // Enabled
                            aml::FieldEntry::Named(*b"MINS", 1), // Inserting
                            aml::FieldEntry::Named(*b"MRMV", 1), // Removing
                            aml::FieldEntry::Named(*b"MEJ0", 1), // Ejecting
                        ],
                    ),
                    &aml::Field::new(
                        "MHPR".into(),
                        aml::FieldAccessType::DWord,
                        aml::FieldUpdateRule::Preserve,
                        vec![
                            aml::FieldEntry::Named(*b"MSEL", 32), // Selector
                            aml::FieldEntry::Named(*b"MOEV", 32), // Event
                            aml::FieldEntry::Named(*b"MOSC", 32), // OSC
                        ],
                    ),
                    &MemoryMethods {
                        slots: self.hotplug_slots.len(),
                    },
                    &MemorySlots {
                        slots: self.hotplug_slots.len(),
                    },
                ],
            )
            .append_aml_bytes(bytes);
        } else {
            aml::Device::new(
                "_SB_.MHPC".into(),
                vec![
                    &aml::Name::new("_HID".into(), &aml::EisaName::new("PNP0A06")),
                    &aml::Name::new("_UID".into(), &"Memory Hotplug Controller"),
                    // Empty MSCN for GED
                    &aml::Method::new("MSCN".into(), 0, true, vec![]),
                ],
            )
            .append_aml_bytes(bytes);
        }

        #[cfg(target_arch = "x86_64")]
        {
            if let Some(sgx_epc_region) = &self.sgx_epc_region {
                let min = sgx_epc_region.start().raw_value() as u64;
                let max = min + sgx_epc_region.size() as u64 - 1;
                // SGX EPC region
                aml::Device::new(
                    "_SB_.EPC_".into(),
                    vec![
                        &aml::Name::new("_HID".into(), &aml::EisaName::new("INT0E0C")),
                        // QWORD describing the EPC region start and size
                        &aml::Name::new(
                            "_CRS".into(),
                            &aml::ResourceTemplate::new(vec![&aml::AddressSpace::new_memory(
                                aml::AddressSpaceCachable::NotCacheable,
                                true,
                                min,
                                max,
                            )]),
                        ),
                        &aml::Method::new("_STA".into(), 0, false, vec![&aml::Return::new(&0xfu8)]),
                    ],
                )
                .append_aml_bytes(bytes);
            }
        }
    }
}

impl Pausable for MemoryManager {}

#[derive(Clone, Serialize, Deserialize, Versionize)]
pub struct MemoryManagerSnapshotData {
    memory_ranges: MemoryRangeTable,
    guest_ram_mappings: Vec<GuestRamMapping>,
    start_of_device_area: u64,
    boot_ram: u64,
    current_ram: u64,
    arch_mem_regions: Vec<ArchMemRegion>,
    hotplug_slots: Vec<HotPlugState>,
    next_memory_slot: u32,
    selected_slot: usize,
    next_hotplug_slot: usize,
}

impl VersionMapped for MemoryManagerSnapshotData {}
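
// Snapshotting is split across two traits: Snapshottable::snapshot() records
// the metadata (mappings, ranges, slots) through MemoryManagerSnapshotData,
// while Transportable::send() streams the actual content of the selected
// ranges into the SNAPSHOT_FILENAME ("memory-ranges") file under the
// destination URL.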

impl Snapshottable for MemoryManager {
    fn id(&self) -> String {
        MEMORY_MANAGER_SNAPSHOT_ID.to_string()
    }

    fn snapshot(&mut self) -> result::Result<Snapshot, MigratableError> {
        let mut memory_manager_snapshot = Snapshot::new(MEMORY_MANAGER_SNAPSHOT_ID);

        let memory_ranges = self.memory_range_table(true)?;

        // Store this list of ranges locally as it will be used through the
        // Transportable::send() implementation. The point is to avoid
        // duplicating the code that builds the path for each region. The
        // 'snapshot' step creates the list of memory regions, including
        // information about whether each region's content needs to be copied
        // or not. This saves the 'send' step from going through the same
        // process again, so it can directly proceed with storing the memory
        // range content for the ranges that require it.
        self.snapshot_memory_ranges = memory_ranges;

        memory_manager_snapshot.add_data_section(SnapshotDataSection::new_from_versioned_state(
            MEMORY_MANAGER_SNAPSHOT_ID,
            &self.snapshot_data(),
        )?);

        Ok(memory_manager_snapshot)
    }
}

impl Transportable for MemoryManager {
    fn send(
        &self,
        _snapshot: &Snapshot,
        destination_url: &str,
    ) -> result::Result<(), MigratableError> {
        if self.snapshot_memory_ranges.is_empty() {
            return Ok(());
        }

        let mut memory_file_path = url_to_path(destination_url)?;
        memory_file_path.push(String::from(SNAPSHOT_FILENAME));

        // Create the snapshot file for the entire memory
        let mut memory_file = OpenOptions::new()
            .read(true)
            .write(true)
            .create_new(true)
            .open(memory_file_path)
            .map_err(|e| MigratableError::MigrateSend(e.into()))?;

        let guest_memory = self.guest_memory.memory();

        for range in self.snapshot_memory_ranges.regions() {
            let mut offset: u64 = 0;
            // Manually handle the retry in case the whole region cannot be
            // written at once, as the write_all_to() implementation from
            // vm_memory::GuestMemory does not follow the expected behavior.
            // For more details about this issue see:
            // https://github.com/rust-vmm/vm-memory/issues/174
            loop {
                let bytes_written = guest_memory
                    .write_to(
                        GuestAddress(range.gpa + offset),
                        &mut memory_file,
                        (range.length - offset) as usize,
                    )
                    .map_err(|e| MigratableError::MigrateSend(e.into()))?;
                offset += bytes_written as u64;

                if offset == range.length {
                    break;
                }
            }
        }
        Ok(())
    }
}

impl Migratable for MemoryManager {
    // Start the dirty log in the hypervisor (kvm/mshv).
    // Also, reset the dirty bitmap logged by the vmm.
    // Just before we do a bulk copy we want to start/clear the dirty log so that
    // pages touched during our bulk copy are tracked.
    fn start_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.start_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error starting VM dirty log {}", e))
        })?;

        for r in self.guest_memory.memory().iter() {
            r.bitmap().reset();
        }

        Ok(())
    }

    fn stop_dirty_log(&mut self) -> std::result::Result<(), MigratableError> {
        self.vm.stop_dirty_log().map_err(|e| {
            MigratableError::MigrateSend(anyhow!("Error stopping VM dirty log {}", e))
        })?;

        Ok(())
    }
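
    // Two dirty-page sources get merged below: the per-slot dirty log retrieved
    // from the hypervisor, which tracks pages written by the guest, and the
    // vm-memory AtomicBitmap attached to each region, which tracks pages the
    // VMM itself wrote through its userspace mapping and which the hypervisor
    // log would not report.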
    // Generate a table for the pages that are dirty. The dirty pages are collapsed
    // together in the table if they are contiguous.
    fn dirty_log(&mut self) -> std::result::Result<MemoryRangeTable, MigratableError> {
        let mut table = MemoryRangeTable::default();
        for r in &self.guest_ram_mappings {
            let vm_dirty_bitmap = self.vm.get_dirty_log(r.slot, r.gpa, r.size).map_err(|e| {
                MigratableError::MigrateSend(anyhow!("Error getting VM dirty log {}", e))
            })?;
            let vmm_dirty_bitmap = match self.guest_memory.memory().find_region(GuestAddress(r.gpa))
            {
                Some(region) => {
                    assert!(region.start_addr().raw_value() == r.gpa);
                    assert!(region.len() == r.size);
                    region.bitmap().get_and_reset()
                }
                None => {
                    return Err(MigratableError::MigrateSend(anyhow!(
                        "Error finding 'guest memory region' with address {:x}",
                        r.gpa
                    )))
                }
            };

            let dirty_bitmap: Vec<u64> = vm_dirty_bitmap
                .iter()
                .zip(vmm_dirty_bitmap.iter())
                .map(|(x, y)| x | y)
                .collect();

            let sub_table = MemoryRangeTable::from_bitmap(dirty_bitmap, r.gpa, 4096);

            if sub_table.regions().is_empty() {
                info!("Dirty Memory Range Table is empty");
            } else {
                info!("Dirty Memory Range Table:");
                for range in sub_table.regions() {
                    info!("GPA: {:x} size: {} (KiB)", range.gpa, range.length / 1024);
                }
            }

            table.extend(sub_table);
        }
        Ok(table)
    }
}