// Copyright © 2020, Oracle and/or its affiliates.
//
// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
use std::sync::Arc;
pub mod interrupts;
pub mod layout;
mod mpspec;
mod mptable;
pub mod regs;
use std::collections::BTreeMap;
use std::mem;

use hypervisor::arch::x86::{CpuIdEntry, CPUID_FLAG_VALID_INDEX};
use hypervisor::{CpuVendor, HypervisorCpuError, HypervisorError};
use linux_loader::loader::bootparam::{boot_params, setup_header};
use linux_loader::loader::elf::start_info::{
    hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info,
};
use thiserror::Error;
use vm_memory::{
    Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
    GuestMemoryRegion, GuestUsize,
};

use crate::{GuestMemoryMmap, InitramfsConfig, RegionType};
mod smbios;
use std::arch::x86_64;
#[cfg(feature = "tdx")]
pub mod tdx;

// CPUID feature bits
#[cfg(feature = "kvm")]
const TSC_DEADLINE_TIMER_ECX_BIT: u8 = 24; // TSC deadline timer, CPUID leaf 0x1 ECX.
const HYPERVISOR_ECX_BIT: u8 = 31; // "Running under hypervisor", CPUID leaf 0x1 ECX.
const MTRR_EDX_BIT: u8 = 12; // MTRR support, CPUID leaf 0x1 EDX.
const INVARIANT_TSC_EDX_BIT: u8 = 8; // Invariant TSC bit on 0x8000_0007 EDX
const AMX_BF16: u8 = 22; // AMX tile computation on bfloat16 numbers (leaf 0x7 subleaf 0 EDX)
const AMX_TILE: u8 = 24; // AMX tile load/store instructions (leaf 0x7 subleaf 0 EDX)
const AMX_INT8: u8 = 25; // AMX tile computation on 8-bit integers (leaf 0x7 subleaf 0 EDX)

// KVM feature bits (CPUID leaf 0x4000_0001 EAX); only referenced when masking
// out paravirt features that TDX guests cannot use.
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE_BIT: u8 = 0;
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE2_BIT: u8 = 3;
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE_STABLE_BIT: u8 = 24;
#[cfg(feature = "tdx")]
const KVM_FEATURE_ASYNC_PF_BIT: u8 = 4;
#[cfg(feature = "tdx")]
const KVM_FEATURE_ASYNC_PF_VMEXIT_BIT: u8 = 10;
#[cfg(feature = "tdx")]
const KVM_FEATURE_STEAL_TIME_BIT: u8 = 5;

// NOTE(review): presumably the Linux signal count exposed for signal-handling
// code elsewhere in the crate — confirm against its users.
pub const _NSIG: i32 = 65;

#[derive(Debug, Copy, Clone)]
/// Specifies the entry point address where the guest must start
/// executing code, as well as which of the supported boot protocols
/// is to be used to configure the guest initial state.
66 pub struct EntryPoint { 67 /// Address in guest memory where the guest must start execution 68 pub entry_addr: GuestAddress, 69 /// This field is used for bzImage to fill the zero page 70 pub setup_header: Option<setup_header>, 71 } 72 73 const E820_RAM: u32 = 1; 74 const E820_RESERVED: u32 = 2; 75 76 #[derive(Clone)] 77 pub struct SgxEpcSection { 78 start: GuestAddress, 79 size: GuestUsize, 80 } 81 82 impl SgxEpcSection { 83 pub fn new(start: GuestAddress, size: GuestUsize) -> Self { 84 SgxEpcSection { start, size } 85 } 86 pub fn start(&self) -> GuestAddress { 87 self.start 88 } 89 pub fn size(&self) -> GuestUsize { 90 self.size 91 } 92 } 93 94 #[derive(Clone)] 95 pub struct SgxEpcRegion { 96 start: GuestAddress, 97 size: GuestUsize, 98 epc_sections: BTreeMap<String, SgxEpcSection>, 99 } 100 101 impl SgxEpcRegion { 102 pub fn new(start: GuestAddress, size: GuestUsize) -> Self { 103 SgxEpcRegion { 104 start, 105 size, 106 epc_sections: BTreeMap::new(), 107 } 108 } 109 pub fn start(&self) -> GuestAddress { 110 self.start 111 } 112 pub fn size(&self) -> GuestUsize { 113 self.size 114 } 115 pub fn epc_sections(&self) -> &BTreeMap<String, SgxEpcSection> { 116 &self.epc_sections 117 } 118 pub fn insert(&mut self, id: String, epc_section: SgxEpcSection) { 119 self.epc_sections.insert(id, epc_section); 120 } 121 } 122 123 pub struct CpuidConfig { 124 pub sgx_epc_sections: Option<Vec<SgxEpcSection>>, 125 pub phys_bits: u8, 126 pub kvm_hyperv: bool, 127 #[cfg(feature = "tdx")] 128 pub tdx: bool, 129 pub amx: bool, 130 } 131 132 #[derive(Debug, Error)] 133 pub enum Error { 134 /// Error writing MP table to memory. 
135 #[error("Error writing MP table to memory: {0}")] 136 MpTableSetup(mptable::Error), 137 138 /// Error configuring the general purpose registers 139 #[error("Error configuring the general purpose registers: {0}")] 140 RegsConfiguration(regs::Error), 141 142 /// Error configuring the special registers 143 #[error("Error configuring the special registers: {0}")] 144 SregsConfiguration(regs::Error), 145 146 /// Error configuring the floating point related registers 147 #[error("Error configuring the floating point related registers: {0}")] 148 FpuConfiguration(regs::Error), 149 150 /// Error configuring the MSR registers 151 #[error("Error configuring the MSR registers: {0}")] 152 MsrsConfiguration(regs::Error), 153 154 /// Failed to set supported CPUs. 155 #[error("Failed to set supported CPUs: {0}")] 156 SetSupportedCpusFailed(anyhow::Error), 157 158 /// Cannot set the local interruption due to bad configuration. 159 #[error("Cannot set the local interruption due to bad configuration: {0}")] 160 LocalIntConfiguration(anyhow::Error), 161 162 /// Error setting up SMBIOS table 163 #[error("Error setting up SMBIOS table: {0}")] 164 SmbiosSetup(smbios::Error), 165 166 /// Could not find any SGX EPC section 167 #[error("Could not find any SGX EPC section")] 168 NoSgxEpcSection, 169 170 /// Missing SGX CPU feature 171 #[error("Missing SGX CPU feature")] 172 MissingSgxFeature, 173 174 /// Missing SGX_LC CPU feature 175 #[error("Missing SGX_LC CPU feature")] 176 MissingSgxLaunchControlFeature, 177 178 /// Error getting supported CPUID through the hypervisor (kvm/mshv) API 179 #[error("Error getting supported CPUID through the hypervisor API: {0}")] 180 CpuidGetSupported(HypervisorError), 181 182 /// Error populating CPUID with KVM HyperV emulation details 183 #[error("Error populating CPUID with KVM HyperV emulation details: {0}")] 184 CpuidKvmHyperV(vmm_sys_util::fam::Error), 185 186 /// Error populating CPUID with CPU identification 187 #[error("Error populating CPUID 
with CPU identification: {0}")] 188 CpuidIdentification(vmm_sys_util::fam::Error), 189 190 /// Error checking CPUID compatibility 191 #[error("Error checking CPUID compatibility")] 192 CpuidCheckCompatibility, 193 194 // Error writing EBDA address 195 #[error("Error writing EBDA address: {0}")] 196 EbdaSetup(vm_memory::GuestMemoryError), 197 198 // Error getting CPU TSC frequency 199 #[error("Error getting CPU TSC frequency: {0}")] 200 GetTscFrequency(HypervisorCpuError), 201 202 /// Error retrieving TDX capabilities through the hypervisor (kvm/mshv) API 203 #[cfg(feature = "tdx")] 204 #[error("Error retrieving TDX capabilities through the hypervisor API: {0}")] 205 TdxCapabilities(HypervisorError), 206 207 /// Failed to configure E820 map for bzImage 208 #[error("Failed to configure E820 map for bzImage")] 209 E820Configuration, 210 } 211 212 impl From<Error> for super::Error { 213 fn from(e: Error) -> super::Error { 214 super::Error::PlatformSpecific(e) 215 } 216 } 217 218 pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u8, u8, u8)>) -> u32 { 219 if let Some(t) = topology { 220 let thread_mask_width = u8::BITS - (t.0 - 1).leading_zeros(); 221 let core_mask_width = u8::BITS - (t.1 - 1).leading_zeros(); 222 let die_mask_width = u8::BITS - (t.2 - 1).leading_zeros(); 223 224 let thread_id = cpu_id % (t.0 as u32); 225 let core_id = cpu_id / (t.0 as u32) % (t.1 as u32); 226 let die_id = cpu_id / ((t.0 * t.1) as u32) % (t.2 as u32); 227 let socket_id = cpu_id / ((t.0 * t.1 * t.2) as u32); 228 229 return thread_id 230 | (core_id << thread_mask_width) 231 | (die_id << (thread_mask_width + core_mask_width)) 232 | (socket_id << (thread_mask_width + core_mask_width + die_mask_width)); 233 } 234 235 cpu_id 236 } 237 238 #[derive(Copy, Clone, Debug)] 239 pub enum CpuidReg { 240 EAX, 241 EBX, 242 ECX, 243 EDX, 244 } 245 246 pub struct CpuidPatch { 247 pub function: u32, 248 pub index: u32, 249 pub flags_bit: Option<u8>, 250 pub eax_bit: Option<u8>, 251 pub ebx_bit: 
Option<u8>, 252 pub ecx_bit: Option<u8>, 253 pub edx_bit: Option<u8>, 254 } 255 256 impl CpuidPatch { 257 pub fn get_cpuid_reg( 258 cpuid: &[CpuIdEntry], 259 function: u32, 260 index: Option<u32>, 261 reg: CpuidReg, 262 ) -> Option<u32> { 263 for entry in cpuid.iter() { 264 if entry.function == function && (index.is_none() || index.unwrap() == entry.index) { 265 return match reg { 266 CpuidReg::EAX => Some(entry.eax), 267 CpuidReg::EBX => Some(entry.ebx), 268 CpuidReg::ECX => Some(entry.ecx), 269 CpuidReg::EDX => Some(entry.edx), 270 }; 271 } 272 } 273 274 None 275 } 276 277 pub fn set_cpuid_reg( 278 cpuid: &mut Vec<CpuIdEntry>, 279 function: u32, 280 index: Option<u32>, 281 reg: CpuidReg, 282 value: u32, 283 ) { 284 let mut entry_found = false; 285 for entry in cpuid.iter_mut() { 286 if entry.function == function && (index.is_none() || index.unwrap() == entry.index) { 287 entry_found = true; 288 match reg { 289 CpuidReg::EAX => { 290 entry.eax = value; 291 } 292 CpuidReg::EBX => { 293 entry.ebx = value; 294 } 295 CpuidReg::ECX => { 296 entry.ecx = value; 297 } 298 CpuidReg::EDX => { 299 entry.edx = value; 300 } 301 } 302 } 303 } 304 305 if entry_found { 306 return; 307 } 308 309 // Entry not found, so let's add it. 
310 if let Some(index) = index { 311 let mut entry = CpuIdEntry { 312 function, 313 index, 314 flags: CPUID_FLAG_VALID_INDEX, 315 ..Default::default() 316 }; 317 match reg { 318 CpuidReg::EAX => { 319 entry.eax = value; 320 } 321 CpuidReg::EBX => { 322 entry.ebx = value; 323 } 324 CpuidReg::ECX => { 325 entry.ecx = value; 326 } 327 CpuidReg::EDX => { 328 entry.edx = value; 329 } 330 } 331 332 cpuid.push(entry); 333 } 334 } 335 336 pub fn patch_cpuid(cpuid: &mut [CpuIdEntry], patches: Vec<CpuidPatch>) { 337 for entry in cpuid { 338 for patch in patches.iter() { 339 if entry.function == patch.function && entry.index == patch.index { 340 if let Some(flags_bit) = patch.flags_bit { 341 entry.flags |= 1 << flags_bit; 342 } 343 if let Some(eax_bit) = patch.eax_bit { 344 entry.eax |= 1 << eax_bit; 345 } 346 if let Some(ebx_bit) = patch.ebx_bit { 347 entry.ebx |= 1 << ebx_bit; 348 } 349 if let Some(ecx_bit) = patch.ecx_bit { 350 entry.ecx |= 1 << ecx_bit; 351 } 352 if let Some(edx_bit) = patch.edx_bit { 353 entry.edx |= 1 << edx_bit; 354 } 355 } 356 } 357 } 358 } 359 360 pub fn is_feature_enabled( 361 cpuid: &[CpuIdEntry], 362 function: u32, 363 index: u32, 364 reg: CpuidReg, 365 feature_bit: usize, 366 ) -> bool { 367 let mask = 1 << feature_bit; 368 369 for entry in cpuid { 370 if entry.function == function && entry.index == index { 371 let reg_val = match reg { 372 CpuidReg::EAX => entry.eax, 373 CpuidReg::EBX => entry.ebx, 374 CpuidReg::ECX => entry.ecx, 375 CpuidReg::EDX => entry.edx, 376 }; 377 378 return (reg_val & mask) == mask; 379 } 380 } 381 382 false 383 } 384 } 385 386 #[derive(Debug)] 387 enum CpuidCompatibleCheck { 388 BitwiseSubset, // bitwise subset 389 Equal, // equal in value 390 NumNotGreater, // smaller or equal as a number 391 } 392 393 pub struct CpuidFeatureEntry { 394 function: u32, 395 index: u32, 396 feature_reg: CpuidReg, 397 compatible_check: CpuidCompatibleCheck, 398 } 399 400 impl CpuidFeatureEntry { 401 fn checked_feature_entry_list() -> 
Vec<CpuidFeatureEntry> { 402 vec![ 403 // The following list includes all hardware features bits from 404 // the CPUID Wiki Page: https://en.wikipedia.org/wiki/CPUID 405 // Leaf 0x1, ECX/EDX, feature bits 406 CpuidFeatureEntry { 407 function: 1, 408 index: 0, 409 feature_reg: CpuidReg::ECX, 410 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 411 }, 412 CpuidFeatureEntry { 413 function: 1, 414 index: 0, 415 feature_reg: CpuidReg::EDX, 416 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 417 }, 418 // Leaf 0x7, EAX/EBX/ECX/EDX, extended features 419 CpuidFeatureEntry { 420 function: 7, 421 index: 0, 422 feature_reg: CpuidReg::EAX, 423 compatible_check: CpuidCompatibleCheck::NumNotGreater, 424 }, 425 CpuidFeatureEntry { 426 function: 7, 427 index: 0, 428 feature_reg: CpuidReg::EBX, 429 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 430 }, 431 CpuidFeatureEntry { 432 function: 7, 433 index: 0, 434 feature_reg: CpuidReg::ECX, 435 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 436 }, 437 CpuidFeatureEntry { 438 function: 7, 439 index: 0, 440 feature_reg: CpuidReg::EDX, 441 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 442 }, 443 // Leaf 0x7 subleaf 0x1, EAX, extended features 444 CpuidFeatureEntry { 445 function: 7, 446 index: 1, 447 feature_reg: CpuidReg::EAX, 448 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 449 }, 450 // Leaf 0x8000_0001, ECX/EDX, CPUID features bits 451 CpuidFeatureEntry { 452 function: 0x8000_0001, 453 index: 0, 454 feature_reg: CpuidReg::ECX, 455 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 456 }, 457 CpuidFeatureEntry { 458 function: 0x8000_0001, 459 index: 0, 460 feature_reg: CpuidReg::EDX, 461 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 462 }, 463 // KVM CPUID bits: https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html 464 // Leaf 0x4000_0000, EAX/EBX/ECX/EDX, KVM CPUID SIGNATURE 465 CpuidFeatureEntry { 466 function: 0x4000_0000, 467 index: 0, 468 feature_reg: 
CpuidReg::EAX, 469 compatible_check: CpuidCompatibleCheck::NumNotGreater, 470 }, 471 CpuidFeatureEntry { 472 function: 0x4000_0000, 473 index: 0, 474 feature_reg: CpuidReg::EBX, 475 compatible_check: CpuidCompatibleCheck::Equal, 476 }, 477 CpuidFeatureEntry { 478 function: 0x4000_0000, 479 index: 0, 480 feature_reg: CpuidReg::ECX, 481 compatible_check: CpuidCompatibleCheck::Equal, 482 }, 483 CpuidFeatureEntry { 484 function: 0x4000_0000, 485 index: 0, 486 feature_reg: CpuidReg::EDX, 487 compatible_check: CpuidCompatibleCheck::Equal, 488 }, 489 // Leaf 0x4000_0001, EAX/EBX/ECX/EDX, KVM CPUID features 490 CpuidFeatureEntry { 491 function: 0x4000_0001, 492 index: 0, 493 feature_reg: CpuidReg::EAX, 494 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 495 }, 496 CpuidFeatureEntry { 497 function: 0x4000_0001, 498 index: 0, 499 feature_reg: CpuidReg::EBX, 500 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 501 }, 502 CpuidFeatureEntry { 503 function: 0x4000_0001, 504 index: 0, 505 feature_reg: CpuidReg::ECX, 506 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 507 }, 508 CpuidFeatureEntry { 509 function: 0x4000_0001, 510 index: 0, 511 feature_reg: CpuidReg::EDX, 512 compatible_check: CpuidCompatibleCheck::BitwiseSubset, 513 }, 514 ] 515 } 516 517 fn get_features_from_cpuid( 518 cpuid: &[CpuIdEntry], 519 feature_entry_list: &[CpuidFeatureEntry], 520 ) -> Vec<u32> { 521 let mut features = vec![0; feature_entry_list.len()]; 522 for (i, feature_entry) in feature_entry_list.iter().enumerate() { 523 for cpuid_entry in cpuid { 524 if cpuid_entry.function == feature_entry.function 525 && cpuid_entry.index == feature_entry.index 526 { 527 match feature_entry.feature_reg { 528 CpuidReg::EAX => { 529 features[i] = cpuid_entry.eax; 530 } 531 CpuidReg::EBX => { 532 features[i] = cpuid_entry.ebx; 533 } 534 CpuidReg::ECX => { 535 features[i] = cpuid_entry.ecx; 536 } 537 CpuidReg::EDX => { 538 features[i] = cpuid_entry.edx; 539 } 540 } 541 542 break; 543 } 544 } 545 } 
546 547 features 548 } 549 550 // The function returns `Error` (a.k.a. "incompatible"), when the CPUID features from `src_vm_cpuid` 551 // is not a subset of those of the `dest_vm_cpuid`. 552 pub fn check_cpuid_compatibility( 553 src_vm_cpuid: &[CpuIdEntry], 554 dest_vm_cpuid: &[CpuIdEntry], 555 ) -> Result<(), Error> { 556 let feature_entry_list = &Self::checked_feature_entry_list(); 557 let src_vm_features = Self::get_features_from_cpuid(src_vm_cpuid, feature_entry_list); 558 let dest_vm_features = Self::get_features_from_cpuid(dest_vm_cpuid, feature_entry_list); 559 560 // Loop on feature bit and check if the 'source vm' feature is a subset 561 // of those of the 'destination vm' feature 562 let mut compatible = true; 563 for (i, (src_vm_feature, dest_vm_feature)) in src_vm_features 564 .iter() 565 .zip(dest_vm_features.iter()) 566 .enumerate() 567 { 568 let entry = &feature_entry_list[i]; 569 let entry_compatible = match entry.compatible_check { 570 CpuidCompatibleCheck::BitwiseSubset => { 571 let different_feature_bits = src_vm_feature ^ dest_vm_feature; 572 let src_vm_feature_bits_only = different_feature_bits & src_vm_feature; 573 src_vm_feature_bits_only == 0 574 } 575 CpuidCompatibleCheck::Equal => src_vm_feature == dest_vm_feature, 576 CpuidCompatibleCheck::NumNotGreater => src_vm_feature <= dest_vm_feature, 577 }; 578 if !entry_compatible { 579 error!( 580 "Detected incompatible CPUID entry: leaf={:#02x} (subleaf={:#02x}), register='{:?}', \ 581 compatible_check='{:?}', source VM feature='{:#04x}', destination VM feature'{:#04x}'.", 582 entry.function, entry.index, entry.feature_reg, 583 entry.compatible_check, src_vm_feature, dest_vm_feature 584 ); 585 586 compatible = false; 587 } 588 } 589 590 if compatible { 591 info!("No CPU incompatibility detected."); 592 Ok(()) 593 } else { 594 Err(Error::CpuidCheckCompatibility) 595 } 596 } 597 } 598 599 pub fn generate_common_cpuid( 600 hypervisor: &Arc<dyn hypervisor::Hypervisor>, 601 config: &CpuidConfig, 602 
) -> super::Result<Vec<CpuIdEntry>> { 603 // SAFETY: cpuid called with valid leaves 604 if unsafe { x86_64::__cpuid(1) }.ecx & (1 << HYPERVISOR_ECX_BIT) == 1 << HYPERVISOR_ECX_BIT { 605 // SAFETY: cpuid called with valid leaves 606 let hypervisor_cpuid = unsafe { x86_64::__cpuid(0x4000_0000) }; 607 608 let mut identifier: [u8; 12] = [0; 12]; 609 identifier[0..4].copy_from_slice(&hypervisor_cpuid.ebx.to_le_bytes()[..]); 610 identifier[4..8].copy_from_slice(&hypervisor_cpuid.ecx.to_le_bytes()[..]); 611 identifier[8..12].copy_from_slice(&hypervisor_cpuid.edx.to_le_bytes()[..]); 612 613 info!( 614 "Running under nested virtualisation. Hypervisor string: {}", 615 String::from_utf8_lossy(&identifier) 616 ); 617 } 618 619 info!( 620 "Generating guest CPUID for with physical address size: {}", 621 config.phys_bits 622 ); 623 #[allow(unused_mut)] 624 let mut cpuid_patches = vec![ 625 // Patch hypervisor bit 626 CpuidPatch { 627 function: 1, 628 index: 0, 629 flags_bit: None, 630 eax_bit: None, 631 ebx_bit: None, 632 ecx_bit: Some(HYPERVISOR_ECX_BIT), 633 edx_bit: None, 634 }, 635 // Enable MTRR feature 636 CpuidPatch { 637 function: 1, 638 index: 0, 639 flags_bit: None, 640 eax_bit: None, 641 ebx_bit: None, 642 ecx_bit: None, 643 edx_bit: Some(MTRR_EDX_BIT), 644 }, 645 ]; 646 647 #[cfg(feature = "kvm")] 648 if matches!( 649 hypervisor.hypervisor_type(), 650 hypervisor::HypervisorType::Kvm 651 ) { 652 // Patch tsc deadline timer bit 653 cpuid_patches.push(CpuidPatch { 654 function: 1, 655 index: 0, 656 flags_bit: None, 657 eax_bit: None, 658 ebx_bit: None, 659 ecx_bit: Some(TSC_DEADLINE_TIMER_ECX_BIT), 660 edx_bit: None, 661 }); 662 } 663 664 // Supported CPUID 665 let mut cpuid = hypervisor 666 .get_supported_cpuid() 667 .map_err(Error::CpuidGetSupported)?; 668 669 CpuidPatch::patch_cpuid(&mut cpuid, cpuid_patches); 670 671 if let Some(sgx_epc_sections) = &config.sgx_epc_sections { 672 update_cpuid_sgx(&mut cpuid, sgx_epc_sections)?; 673 } 674 675 #[cfg(feature = "tdx")] 
676 let tdx_capabilities = if config.tdx { 677 let caps = hypervisor 678 .tdx_capabilities() 679 .map_err(Error::TdxCapabilities)?; 680 info!("TDX capabilities {:#?}", caps); 681 Some(caps) 682 } else { 683 None 684 }; 685 686 // Update some existing CPUID 687 for entry in cpuid.as_mut_slice().iter_mut() { 688 match entry.function { 689 // Clear AMX related bits if the AMX feature is not enabled 690 0x7 => { 691 if !config.amx && entry.index == 0 { 692 entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8)) 693 } 694 } 695 0xd => 696 { 697 #[cfg(feature = "tdx")] 698 if let Some(caps) = &tdx_capabilities { 699 let xcr0_mask: u64 = 0x82ff; 700 let xss_mask: u64 = !xcr0_mask; 701 if entry.index == 0 { 702 entry.eax &= (caps.xfam_fixed0 as u32) & (xcr0_mask as u32); 703 entry.eax |= (caps.xfam_fixed1 as u32) & (xcr0_mask as u32); 704 entry.edx &= ((caps.xfam_fixed0 & xcr0_mask) >> 32) as u32; 705 entry.edx |= ((caps.xfam_fixed1 & xcr0_mask) >> 32) as u32; 706 } else if entry.index == 1 { 707 entry.ecx &= (caps.xfam_fixed0 as u32) & (xss_mask as u32); 708 entry.ecx |= (caps.xfam_fixed1 as u32) & (xss_mask as u32); 709 entry.edx &= ((caps.xfam_fixed0 & xss_mask) >> 32) as u32; 710 entry.edx |= ((caps.xfam_fixed1 & xss_mask) >> 32) as u32; 711 } 712 } 713 } 714 // Copy host L1 cache details if not populated by KVM 715 0x8000_0005 => { 716 if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { 717 // SAFETY: cpuid called with valid leaves 718 if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 { 719 // SAFETY: cpuid called with valid leaves 720 let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) }; 721 entry.eax = leaf.eax; 722 entry.ebx = leaf.ebx; 723 entry.ecx = leaf.ecx; 724 entry.edx = leaf.edx; 725 } 726 } 727 } 728 // Copy host L2 cache details if not populated by KVM 729 0x8000_0006 => { 730 if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 { 731 // SAFETY: cpuid called with valid 
leaves 732 if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 { 733 // SAFETY: cpuid called with valid leaves 734 let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) }; 735 entry.eax = leaf.eax; 736 entry.ebx = leaf.ebx; 737 entry.ecx = leaf.ecx; 738 entry.edx = leaf.edx; 739 } 740 } 741 } 742 // Set CPU physical bits 743 0x8000_0008 => { 744 entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff); 745 } 746 0x4000_0001 => { 747 // These features are not supported by TDX 748 #[cfg(feature = "tdx")] 749 if config.tdx { 750 entry.eax &= !(1 << KVM_FEATURE_CLOCKSOURCE_BIT 751 | 1 << KVM_FEATURE_CLOCKSOURCE2_BIT 752 | 1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 753 | 1 << KVM_FEATURE_ASYNC_PF_BIT 754 | 1 << KVM_FEATURE_ASYNC_PF_VMEXIT_BIT 755 | 1 << KVM_FEATURE_STEAL_TIME_BIT) 756 } 757 } 758 _ => {} 759 } 760 } 761 762 // Copy CPU identification string 763 for i in 0x8000_0002..=0x8000_0004 { 764 cpuid.retain(|c| c.function != i); 765 // SAFETY: call cpuid with valid leaves 766 let leaf = unsafe { std::arch::x86_64::__cpuid(i) }; 767 cpuid.push(CpuIdEntry { 768 function: i, 769 eax: leaf.eax, 770 ebx: leaf.ebx, 771 ecx: leaf.ecx, 772 edx: leaf.edx, 773 ..Default::default() 774 }); 775 } 776 777 if config.kvm_hyperv { 778 // Remove conflicting entries 779 cpuid.retain(|c| c.function != 0x4000_0000); 780 cpuid.retain(|c| c.function != 0x4000_0001); 781 // See "Hypervisor Top Level Functional Specification" for details 782 // Compliance with "Hv#1" requires leaves up to 0x4000_000a 783 cpuid.push(CpuIdEntry { 784 function: 0x40000000, 785 eax: 0x4000000a, // Maximum cpuid leaf 786 ebx: 0x756e694c, // "Linu" 787 ecx: 0x564b2078, // "x KV" 788 edx: 0x7648204d, // "M Hv" 789 ..Default::default() 790 }); 791 cpuid.push(CpuIdEntry { 792 function: 0x40000001, 793 eax: 0x31237648, // "Hv#1" 794 ..Default::default() 795 }); 796 cpuid.push(CpuIdEntry { 797 function: 0x40000002, 798 eax: 0x3839, // "Build number" 799 ebx: 0xa0000, // 
"Version" 800 ..Default::default() 801 }); 802 cpuid.push(CpuIdEntry { 803 function: 0x4000_0003, 804 eax: (1 << 1) // AccessPartitionReferenceCounter 805 | (1 << 2) // AccessSynicRegs 806 | (1 << 3) // AccessSyntheticTimerRegs 807 | (1 << 9), // AccessPartitionReferenceTsc 808 edx: 1 << 3, // CPU dynamic partitioning 809 ..Default::default() 810 }); 811 cpuid.push(CpuIdEntry { 812 function: 0x4000_0004, 813 eax: 1 << 5, // Recommend relaxed timing 814 ..Default::default() 815 }); 816 for i in 0x4000_0005..=0x4000_000a { 817 cpuid.push(CpuIdEntry { 818 function: i, 819 ..Default::default() 820 }); 821 } 822 } 823 824 Ok(cpuid) 825 } 826 827 pub fn configure_vcpu( 828 vcpu: &Arc<dyn hypervisor::Vcpu>, 829 id: u8, 830 boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>, 831 cpuid: Vec<CpuIdEntry>, 832 kvm_hyperv: bool, 833 cpu_vendor: CpuVendor, 834 topology: Option<(u8, u8, u8)>, 835 ) -> super::Result<()> { 836 let x2apic_id = get_x2apic_id(id as u32, topology); 837 838 // Per vCPU CPUID changes; common are handled via generate_common_cpuid() 839 let mut cpuid = cpuid; 840 CpuidPatch::set_cpuid_reg(&mut cpuid, 0xb, None, CpuidReg::EDX, x2apic_id); 841 CpuidPatch::set_cpuid_reg(&mut cpuid, 0x1f, None, CpuidReg::EDX, x2apic_id); 842 if matches!(cpu_vendor, CpuVendor::AMD) { 843 CpuidPatch::set_cpuid_reg(&mut cpuid, 0x8000_001e, Some(0), CpuidReg::EAX, x2apic_id); 844 } 845 846 // Set ApicId in cpuid for each vcpu - found in cpuid ebx when eax = 1 847 let mut apic_id_patched = false; 848 for entry in &mut cpuid { 849 if entry.function == 1 { 850 entry.ebx &= 0xffffff; 851 entry.ebx |= x2apic_id << 24; 852 apic_id_patched = true; 853 break; 854 } 855 } 856 assert!(apic_id_patched); 857 858 if let Some(t) = topology { 859 update_cpuid_topology(&mut cpuid, t.0, t.1, t.2, cpu_vendor, id); 860 } 861 862 // The TSC frequency CPUID leaf should not be included when running with HyperV emulation 863 if !kvm_hyperv { 864 if let Some(tsc_khz) = 
vcpu.tsc_khz().map_err(Error::GetTscFrequency)? { 865 // Need to check that the TSC doesn't vary with dynamic frequency 866 // SAFETY: cpuid called with valid leaves 867 if unsafe { std::arch::x86_64::__cpuid(0x8000_0007) }.edx 868 & (1u32 << INVARIANT_TSC_EDX_BIT) 869 > 0 870 { 871 CpuidPatch::set_cpuid_reg( 872 &mut cpuid, 873 0x4000_0000, 874 None, 875 CpuidReg::EAX, 876 0x4000_0010, 877 ); 878 cpuid.retain(|c| c.function != 0x4000_0010); 879 cpuid.push(CpuIdEntry { 880 function: 0x4000_0010, 881 eax: tsc_khz, 882 ebx: 1000000, /* LAPIC resolution of 1ns (freq: 1GHz) is hardcoded in KVM's 883 * APIC_BUS_CYCLE_NS */ 884 ..Default::default() 885 }); 886 }; 887 } 888 } 889 890 for c in &cpuid { 891 info!("{}", c); 892 } 893 894 vcpu.set_cpuid2(&cpuid) 895 .map_err(|e| Error::SetSupportedCpusFailed(e.into()))?; 896 897 if kvm_hyperv { 898 vcpu.enable_hyperv_synic().unwrap(); 899 } 900 901 regs::setup_msrs(vcpu).map_err(Error::MsrsConfiguration)?; 902 if let Some((kernel_entry_point, guest_memory)) = boot_setup { 903 regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?; 904 regs::setup_fpu(vcpu).map_err(Error::FpuConfiguration)?; 905 regs::setup_sregs(&guest_memory.memory(), vcpu).map_err(Error::SregsConfiguration)?; 906 } 907 interrupts::set_lint(vcpu).map_err(|e| Error::LocalIntConfiguration(e.into()))?; 908 Ok(()) 909 } 910 911 /// Returns a Vec of the valid memory addresses. 912 /// 913 /// These should be used to configure the GuestMemory structure for the platform. 914 /// For x86_64 all addresses are valid from the start of the kernel except a 915 /// carve out at the end of 32bit address space. 
916 pub fn arch_memory_regions() -> Vec<(GuestAddress, usize, RegionType)> { 917 vec![ 918 // 0 GiB ~ 3GiB: memory before the gap 919 ( 920 GuestAddress(0), 921 layout::MEM_32BIT_RESERVED_START.raw_value() as usize, 922 RegionType::Ram, 923 ), 924 // 4 GiB ~ inf: memory after the gap 925 (layout::RAM_64BIT_START, usize::MAX, RegionType::Ram), 926 // 3 GiB ~ 3712 MiB: 32-bit device memory hole 927 ( 928 layout::MEM_32BIT_RESERVED_START, 929 layout::MEM_32BIT_DEVICES_SIZE as usize, 930 RegionType::SubRegion, 931 ), 932 // 3712 MiB ~ 3968 MiB: 32-bit reserved memory hole 933 ( 934 layout::MEM_32BIT_RESERVED_START.unchecked_add(layout::MEM_32BIT_DEVICES_SIZE), 935 (layout::MEM_32BIT_RESERVED_SIZE - layout::MEM_32BIT_DEVICES_SIZE) as usize, 936 RegionType::Reserved, 937 ), 938 ] 939 } 940 941 /// Configures the system and should be called once per vm before starting vcpu threads. 942 /// 943 /// # Arguments 944 /// 945 /// * `guest_mem` - The memory to be used by the guest. 946 /// * `cmdline_addr` - Address in `guest_mem` where the kernel command line was loaded. 947 /// * `cmdline_size` - Size of the kernel command line in bytes including the null terminator. 948 /// * `num_cpus` - Number of virtual CPUs the guest will have. 
949 #[allow(clippy::too_many_arguments)] 950 pub fn configure_system( 951 guest_mem: &GuestMemoryMmap, 952 cmdline_addr: GuestAddress, 953 cmdline_size: usize, 954 initramfs: &Option<InitramfsConfig>, 955 _num_cpus: u8, 956 setup_header: Option<setup_header>, 957 rsdp_addr: Option<GuestAddress>, 958 sgx_epc_region: Option<SgxEpcRegion>, 959 serial_number: Option<&str>, 960 uuid: Option<&str>, 961 oem_strings: Option<&[&str]>, 962 topology: Option<(u8, u8, u8)>, 963 ) -> super::Result<()> { 964 // Write EBDA address to location where ACPICA expects to find it 965 guest_mem 966 .write_obj((layout::EBDA_START.0 >> 4) as u16, layout::EBDA_POINTER) 967 .map_err(Error::EbdaSetup)?; 968 969 let size = smbios::setup_smbios(guest_mem, serial_number, uuid, oem_strings) 970 .map_err(Error::SmbiosSetup)?; 971 972 // Place the MP table after the SMIOS table aligned to 16 bytes 973 let offset = GuestAddress(layout::SMBIOS_START).unchecked_add(size); 974 let offset = GuestAddress((offset.0 + 16) & !0xf); 975 mptable::setup_mptable(offset, guest_mem, _num_cpus, topology).map_err(Error::MpTableSetup)?; 976 977 // Check that the RAM is not smaller than the RSDP start address 978 if let Some(rsdp_addr) = rsdp_addr { 979 if rsdp_addr.0 > guest_mem.last_addr().0 { 980 return Err(super::Error::RsdpPastRamEnd); 981 } 982 } 983 984 match setup_header { 985 Some(hdr) => configure_32bit_entry( 986 guest_mem, 987 cmdline_addr, 988 cmdline_size, 989 initramfs, 990 hdr, 991 rsdp_addr, 992 sgx_epc_region, 993 ), 994 None => configure_pvh( 995 guest_mem, 996 cmdline_addr, 997 initramfs, 998 rsdp_addr, 999 sgx_epc_region, 1000 ), 1001 } 1002 } 1003 1004 type RamRange = (u64, u64); 1005 1006 /// Returns usable physical memory ranges for the guest 1007 /// These should be used to create e820_RAM memory maps 1008 pub fn generate_ram_ranges(guest_mem: &GuestMemoryMmap) -> super::Result<Vec<RamRange>> { 1009 // Merge continuous memory regions into one region. 
1010 // Note: memory regions from "GuestMemory" are sorted and non-zero sized. 1011 let ram_regions = { 1012 let mut ram_regions = Vec::new(); 1013 let mut current_start = guest_mem 1014 .iter() 1015 .next() 1016 .map(GuestMemoryRegion::start_addr) 1017 .expect("GuestMemory must have one memory region at least") 1018 .raw_value(); 1019 let mut current_end = current_start; 1020 1021 for (start, size) in guest_mem 1022 .iter() 1023 .map(|m| (m.start_addr().raw_value(), m.len())) 1024 { 1025 if current_end == start { 1026 // This zone is continuous with the previous one. 1027 current_end += size; 1028 } else { 1029 ram_regions.push((current_start, current_end)); 1030 1031 current_start = start; 1032 current_end = start + size; 1033 } 1034 } 1035 1036 ram_regions.push((current_start, current_end)); 1037 1038 ram_regions 1039 }; 1040 1041 // Create the memory map entry for memory region before the gap 1042 let mut ram_ranges = vec![]; 1043 1044 // Generate the first usable physical memory range before the gap. The e820 map 1045 // should only report memory above 1MiB. 1046 let first_ram_range = { 1047 let (first_region_start, first_region_end) = 1048 ram_regions.first().ok_or(super::Error::MemmapTableSetup)?; 1049 let high_ram_start = layout::HIGH_RAM_START.raw_value(); 1050 let mem_32bit_reserved_start = layout::MEM_32BIT_RESERVED_START.raw_value(); 1051 1052 if !((first_region_start <= &high_ram_start) 1053 && (first_region_end > &high_ram_start) 1054 && (first_region_end <= &mem_32bit_reserved_start)) 1055 { 1056 error!( 1057 "Unexpected first memory region layout: (start: 0x{:08x}, end: 0x{:08x}). 
1058 high_ram_start: 0x{:08x}, mem_32bit_reserved_start: 0x{:08x}", 1059 first_region_start, first_region_end, high_ram_start, mem_32bit_reserved_start 1060 ); 1061 1062 return Err(super::Error::MemmapTableSetup); 1063 } 1064 1065 info!( 1066 "first usable physical memory range, start: 0x{:08x}, end: 0x{:08x}", 1067 high_ram_start, first_region_end 1068 ); 1069 1070 (high_ram_start, *first_region_end) 1071 }; 1072 ram_ranges.push(first_ram_range); 1073 1074 // Generate additional usable physical memory range after the gap if any. 1075 for ram_region in ram_regions.iter().skip(1) { 1076 info!( 1077 "found usable physical memory range, start: 0x{:08x}, end: 0x{:08x}", 1078 ram_region.0, ram_region.1 1079 ); 1080 1081 ram_ranges.push(*ram_region); 1082 } 1083 1084 Ok(ram_ranges) 1085 } 1086 1087 fn configure_pvh( 1088 guest_mem: &GuestMemoryMmap, 1089 cmdline_addr: GuestAddress, 1090 initramfs: &Option<InitramfsConfig>, 1091 rsdp_addr: Option<GuestAddress>, 1092 sgx_epc_region: Option<SgxEpcRegion>, 1093 ) -> super::Result<()> { 1094 const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336ec578; 1095 1096 let mut start_info = hvm_start_info { 1097 magic: XEN_HVM_START_MAGIC_VALUE, 1098 version: 1, // pvh has version 1 1099 nr_modules: 0, 1100 cmdline_paddr: cmdline_addr.raw_value(), 1101 memmap_paddr: layout::MEMMAP_START.raw_value(), 1102 ..Default::default() 1103 }; 1104 1105 if let Some(rsdp_addr) = rsdp_addr { 1106 start_info.rsdp_paddr = rsdp_addr.0; 1107 } 1108 1109 if let Some(initramfs_config) = initramfs { 1110 // The initramfs has been written to guest memory already, here we just need to 1111 // create the module structure that describes it. 1112 let ramdisk_mod = hvm_modlist_entry { 1113 paddr: initramfs_config.address.raw_value(), 1114 size: initramfs_config.size as u64, 1115 ..Default::default() 1116 }; 1117 1118 start_info.nr_modules += 1; 1119 start_info.modlist_paddr = layout::MODLIST_START.raw_value(); 1120 1121 // Write the modlist struct to guest memory. 
        guest_mem
            .write_obj(ramdisk_mod, layout::MODLIST_START)
            .map_err(super::Error::ModlistSetup)?;
    }

    // Vector to hold the memory maps which needs to be written to guest memory
    // at MEMMAP_START after all of the mappings are recorded.
    let mut memmap: Vec<hvm_memmap_table_entry> = Vec::new();

    // Create the memory map entries.
    // First entry: conventional memory below the EBDA.
    add_memmap_entry(&mut memmap, 0, layout::EBDA_START.raw_value(), E820_RAM);

    // Get usable physical memory ranges
    let ram_ranges = generate_ram_ranges(guest_mem)?;

    // Create e820 memory map entries
    for ram_range in ram_ranges {
        info!(
            "create_memmap_entry, start: 0x{:08x}, end: 0x{:08x}",
            ram_range.0, ram_range.1
        );
        // Ranges are (start, end); the table entry wants (start, length).
        add_memmap_entry(
            &mut memmap,
            ram_range.0,
            ram_range.1 - ram_range.0,
            E820_RAM,
        );
    }

    // Reserve the PCI MMCONFIG window so the guest never treats it as RAM.
    add_memmap_entry(
        &mut memmap,
        layout::PCI_MMCONFIG_START.0,
        layout::PCI_MMCONFIG_SIZE,
        E820_RESERVED,
    );

    // SGX EPC memory, when configured, is likewise reported as reserved.
    if let Some(sgx_epc_region) = sgx_epc_region {
        add_memmap_entry(
            &mut memmap,
            sgx_epc_region.start().raw_value(),
            sgx_epc_region.size(),
            E820_RESERVED,
        );
    }

    start_info.memmap_entries = memmap.len() as u32;

    // Copy the vector with the memmap table to the MEMMAP_START address
    // which is already saved in the memmap_paddr field of hvm_start_info struct.
    let mut memmap_start_addr = layout::MEMMAP_START;

    // Validate up front that the whole table fits inside guest RAM.
    guest_mem
        .checked_offset(
            memmap_start_addr,
            mem::size_of::<hvm_memmap_table_entry>() * start_info.memmap_entries as usize,
        )
        .ok_or(super::Error::MemmapTablePastRamEnd)?;

    // For every entry in the memmap vector, write it to guest memory.
    for memmap_entry in memmap {
        guest_mem
            .write_obj(memmap_entry, memmap_start_addr)
            .map_err(|_| super::Error::MemmapTableSetup)?;
        // Advance by one entry; in-bounds because of the checked_offset above.
        memmap_start_addr =
            memmap_start_addr.unchecked_add(mem::size_of::<hvm_memmap_table_entry>() as u64);
    }

    // The hvm_start_info struct itself must be stored at PVH_START_INFO
    // address, and %rbx will be initialized to contain PVH_INFO_START prior to
    // starting the guest, as required by the PVH ABI.
    let start_info_addr = layout::PVH_INFO_START;

    guest_mem
        .checked_offset(start_info_addr, mem::size_of::<hvm_start_info>())
        .ok_or(super::Error::StartInfoPastRamEnd)?;

    // Write the start_info struct to guest memory.
    guest_mem
        .write_obj(start_info, start_info_addr)
        .map_err(|_| super::Error::StartInfoSetup)?;

    Ok(())
}

/// Configure the guest for a 32-bit boot entry (Linux boot protocol).
///
/// Populates `boot_params` from the provided `setup_header`, fills in the
/// command line pointer/size, the optional initramfs location, the e820 map,
/// and the ACPI RSDP address, then writes the resulting zero page at
/// `layout::ZERO_PAGE_START`.
fn configure_32bit_entry(
    guest_mem: &GuestMemoryMmap,
    cmdline_addr: GuestAddress,
    cmdline_size: usize,
    initramfs: &Option<InitramfsConfig>,
    setup_hdr: setup_header,
    rsdp_addr: Option<GuestAddress>,
    sgx_epc_region: Option<SgxEpcRegion>,
) -> super::Result<()> {
    const KERNEL_LOADER_OTHER: u8 = 0xff;

    // Use the provided setup header
    let mut params = boot_params {
        hdr: setup_hdr,
        ..Default::default()
    };

    // Common bootparams settings
    // Only override the loader type when the header didn't set one.
    if params.hdr.type_of_loader == 0 {
        params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
    }
    params.hdr.cmd_line_ptr = cmdline_addr.raw_value() as u32;
    params.hdr.cmdline_size = cmdline_size as u32;

    if let Some(initramfs_config) = initramfs {
        params.hdr.ramdisk_image = initramfs_config.address.raw_value() as u32;
        params.hdr.ramdisk_size = initramfs_config.size as u32;
    }

    // Conventional memory below the EBDA.
    add_e820_entry(&mut params, 0, layout::EBDA_START.raw_value(), E820_RAM)?;

    let mem_end = guest_mem.last_addr();
    if mem_end < layout::MEM_32BIT_RESERVED_START {
        // All RAM fits below the 32-bit reserved area: one entry from
        // HIGH_RAM_START up to (and including) the last address.
        add_e820_entry(
            &mut params,
            layout::HIGH_RAM_START.raw_value(),
            mem_end.unchecked_offset_from(layout::HIGH_RAM_START) + 1,
            E820_RAM,
        )?;
    } else {
        // RAM extends past the start of the 32-bit hole: report the low part
        // up to the hole, and (if present) the part above 4GiB separately.
        add_e820_entry(
            &mut params,
            layout::HIGH_RAM_START.raw_value(),
            layout::MEM_32BIT_RESERVED_START.unchecked_offset_from(layout::HIGH_RAM_START),
            E820_RAM,
        )?;
        if mem_end > layout::RAM_64BIT_START {
            add_e820_entry(
                &mut params,
                layout::RAM_64BIT_START.raw_value(),
                mem_end.unchecked_offset_from(layout::RAM_64BIT_START) + 1,
                E820_RAM,
            )?;
        }
    }

    // Reserve the PCI MMCONFIG window so the guest never treats it as RAM.
    add_e820_entry(
        &mut params,
        layout::PCI_MMCONFIG_START.0,
        layout::PCI_MMCONFIG_SIZE,
        E820_RESERVED,
    )?;

    // SGX EPC memory, when configured, is likewise reported as reserved.
    if let Some(sgx_epc_region) = sgx_epc_region {
        add_e820_entry(
            &mut params,
            sgx_epc_region.start().raw_value(),
            sgx_epc_region.size(),
            E820_RESERVED,
        )?;
    }

    // Let the kernel find the ACPI RSDP without scanning legacy areas.
    if let Some(rsdp_addr) = rsdp_addr {
        params.acpi_rsdp_addr = rsdp_addr.0;
    }

    // Check the zero page fits in RAM before writing boot_params there.
    let zero_page_addr = layout::ZERO_PAGE_START;
    guest_mem
        .checked_offset(zero_page_addr, mem::size_of::<boot_params>())
        .ok_or(super::Error::ZeroPagePastRamEnd)?;
    guest_mem
        .write_obj(params, zero_page_addr)
        .map_err(super::Error::ZeroPageSetup)?;

    Ok(())
}

/// Add an e820 region to the e820 map.
/// Returns Ok(()) if successful, or an error if there is no space left in the map.
fn add_e820_entry(
    params: &mut boot_params,
    addr: u64,
    size: u64,
    mem_type: u32,
) -> Result<(), Error> {
    // The table is a fixed-size array; refuse to write past its end.
    let slot = params.e820_entries as usize;
    if slot >= params.e820_table.len() {
        return Err(Error::E820Configuration);
    }

    // Fill the next free slot and bump the entry count.
    let entry = &mut params.e820_table[slot];
    entry.addr = addr;
    entry.size = size;
    entry.type_ = mem_type;
    params.e820_entries += 1;

    Ok(())
}

/// Append a memory map entry describing `[addr, addr + size)` with the given
/// e820 type to the PVH memory map being built.
fn add_memmap_entry(memmap: &mut Vec<hvm_memmap_table_entry>, addr: u64, size: u64, mem_type: u32) {
    let entry = hvm_memmap_table_entry {
        addr,
        size,
        type_: mem_type,
        reserved: 0,
    };
    memmap.push(entry);
}

/// Returns the memory address where the initramfs could be loaded.
pub fn initramfs_load_addr(
    guest_mem: &GuestMemoryMmap,
    initramfs_size: usize,
) -> super::Result<u64> {
    // The initramfs goes at the top of the first (low) memory region,
    // aligned down to the page size.
    // It's safe to cast to usize because the size of a region can't be greater than usize.
    let lowmem_size = guest_mem
        .find_region(GuestAddress::new(0))
        .ok_or(super::Error::InitramfsAddress)?
        .len() as usize;

    if lowmem_size < initramfs_size {
        return Err(super::Error::InitramfsAddress);
    }

    Ok(((lowmem_size - initramfs_size) & !(crate::pagesize() - 1)) as u64)
}

pub fn get_host_cpu_phys_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>) -> u8 {
    // SAFETY: call cpuid with valid leaves
    unsafe {
        let leaf = x86_64::__cpuid(0x8000_0000);

        // Detect and handle AMD SME (Secure Memory Encryption) properly.
        // Some physical address bits may become reserved when the feature is enabled.
        // See AMD64 Architecture Programmer's Manual Volume 2, Section 7.10.1
        let reduced = if leaf.eax >= 0x8000_001f
            && matches!(hypervisor.get_cpu_vendor(), CpuVendor::AMD)
            && x86_64::__cpuid(0x8000_001f).eax & 0x1 != 0
        {
            // CPUID 0x8000_001f EBX[11:6]: number of physical address bits
            // consumed by memory encryption.
            (x86_64::__cpuid(0x8000_001f).ebx >> 6) & 0x3f
        } else {
            0
        };

        if leaf.eax >= 0x8000_0008 {
            // CPUID 0x8000_0008 EAX[7:0] reports the physical address width.
            let leaf = x86_64::__cpuid(0x8000_0008);
            ((leaf.eax & 0xff) - reduced) as u8
        } else {
            // Extended leaf unavailable: fall back to 36 physical address bits.
            36
        }
    }
}

/// Patch topology-related CPUID leaves (0x1, 0xb, 0x1f and, for AMD guests,
/// 0x8000_0001 / 0x8000_001e / 0x8000_0008) so the guest sees the requested
/// threads/cores/dies layout for the vCPU with the given `id`.
fn update_cpuid_topology(
    cpuid: &mut Vec<CpuIdEntry>,
    threads_per_core: u8,
    cores_per_die: u8,
    dies_per_package: u8,
    cpu_vendor: CpuVendor,
    id: u8,
) {
    let x2apic_id = get_x2apic_id(
        id as u32,
        Some((threads_per_core, cores_per_die, dies_per_package)),
    );

    // Bit width of each topology level inside the x2APIC ID:
    // 8 - (n - 1).leading_zeros() == ceil(log2(n)) for u8 inputs.
    // NOTE(review): a count of 0 would underflow here — presumably callers
    // validate the topology first; confirm.
    let thread_width = 8 - (threads_per_core - 1).leading_zeros();
    let core_width = (8 - (cores_per_die - 1).leading_zeros()) + thread_width;
    let die_width = (8 - (dies_per_package - 1).leading_zeros()) + core_width;

    // Leaf 0x1 EBX[23:16]: maximum number of addressable logical processors.
    let mut cpu_ebx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX).unwrap_or(0);
    cpu_ebx |= ((dies_per_package as u32) * (cores_per_die as u32) * (threads_per_core as u32))
        & (0xff << 16);
    CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX, cpu_ebx);

    // Leaf 0x1 EDX bit 28: HTT flag.
    let mut cpu_edx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX).unwrap_or(0);
    cpu_edx |= 1 << 28;
    CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX, cpu_edx);

    // CPU Topology leaf 0xb
    // Sub-leaf 0: SMT level (ECX[15:8] = 1); sub-leaf 1: top level
    // (ECX[15:8] = 2) covering the whole package.
    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::EAX, thread_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0xb,
        Some(0),
        CpuidReg::EBX,
        u32::from(threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::ECX, 1 << 8);

    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::EAX, die_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0xb,
        Some(1),
        CpuidReg::EBX,
        u32::from(dies_per_package * cores_per_die * threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::ECX, 2 << 8);

    // CPU Topology leaf 0x1f
    // Three levels: sub-leaf 0 = SMT (type 1), sub-leaf 1 = Core (type 2),
    // sub-leaf 2 = Die (type 5), each reporting the cumulative shift width
    // in EAX and the logical processor count at that level in EBX.
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::EAX, thread_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0x1f,
        Some(0),
        CpuidReg::EBX,
        u32::from(threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::ECX, 1 << 8);

    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::EAX, core_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0x1f,
        Some(1),
        CpuidReg::EBX,
        u32::from(cores_per_die * threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::ECX, 2 << 8);

    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::EAX, die_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0x1f,
        Some(2),
        CpuidReg::EBX,
        u32::from(dies_per_package * cores_per_die * threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::ECX, 5 << 8);

    // AMD guests additionally expose topology through the extended leaves.
    if matches!(cpu_vendor, CpuVendor::AMD) {
        // Leaf 0x8000_001e: core/node identifiers for this vCPU.
        CpuidPatch::set_cpuid_reg(
            cpuid,
            0x8000_001e,
            Some(0),
            CpuidReg::EBX,
            ((threads_per_core as u32 - 1) << 8) | (x2apic_id & 0xff),
        );
        CpuidPatch::set_cpuid_reg(
            cpuid,
            0x8000_001e,
            Some(0),
            CpuidReg::ECX,
            ((dies_per_package as u32 - 1) << 8) | (thread_width + die_width) & 0xff,
        );
        CpuidPatch::set_cpuid_reg(cpuid, 0x8000_001e, Some(0), CpuidReg::EDX, 0);
        if cores_per_die * threads_per_core > 1 {
            // Multi-core package: advertise CmpLegacy/TopologyExtensions in
            // leaf 0x8000_0001 ECX (bits 1 and 22).
            let ecx =
                CpuidPatch::get_cpuid_reg(cpuid, 0x8000_0001, Some(0), CpuidReg::ECX).unwrap_or(0);
            CpuidPatch::set_cpuid_reg(
                cpuid,
                0x8000_0001,
                Some(0),
                CpuidReg::ECX,
                ecx | (1u32 << 1) | (1u32 << 22),
            );
            // Leaf 0x1 EBX: initial APIC ID [31:24], logical processor
            // count [23:16], CLFLUSH line size [15:8] = 8 (x 8 bytes).
            CpuidPatch::set_cpuid_reg(
                cpuid,
                0x0000_0001,
                Some(0),
                CpuidReg::EBX,
                (x2apic_id << 24) | (8 << 8) | (((cores_per_die * threads_per_core) as u32) << 16),
            );
            let cpuid_patches = vec![
                // Patch tsc deadline timer bit
                CpuidPatch {
                    function: 1,
                    index: 0,
                    flags_bit: None,
                    eax_bit: None,
                    ebx_bit: None,
                    ecx_bit: None,
                    edx_bit: Some(28),
                },
            ];
            CpuidPatch::patch_cpuid(cpuid, cpuid_patches);
            // Leaf 0x8000_0008 ECX: APIC ID size [15:12] and number of
            // threads per package minus one [7:0].
            CpuidPatch::set_cpuid_reg(
                cpuid,
                0x8000_0008,
                Some(0),
                CpuidReg::ECX,
                ((thread_width + core_width + die_width) << 12)
                    | ((cores_per_die * threads_per_core) - 1) as u32,
            );
        } else {
            // Single logical processor per die: no extended topology to report.
            CpuidPatch::set_cpuid_reg(cpuid, 0x8000_0008, Some(0), CpuidReg::ECX, 0u32);
        }
    }
}

// The goal is to update the CPUID sub-leaves to reflect the number of EPC
// sections exposed to the guest.
fn update_cpuid_sgx(
    cpuid: &mut Vec<CpuIdEntry>,
    epc_sections: &[SgxEpcSection],
) -> Result<(), Error> {
    // Something's wrong if there's no EPC section.
    if epc_sections.is_empty() {
        return Err(Error::NoSgxEpcSection);
    }
    // We can't go further if the hypervisor does not support SGX feature.
    if !CpuidPatch::is_feature_enabled(cpuid, 0x7, 0, CpuidReg::EBX, 2) {
        return Err(Error::MissingSgxFeature);
    }
    // We can't go further if the hypervisor does not support SGX_LC feature.
    if !CpuidPatch::is_feature_enabled(cpuid, 0x7, 0, CpuidReg::ECX, 30) {
        return Err(Error::MissingSgxLaunchControlFeature);
    }

    // Get host CPUID for leaf 0x12, subleaf 0x2. This is to retrieve EPC
    // properties such as confidentiality and integrity.
    // SAFETY: call cpuid with valid leaves
    let leaf = unsafe { std::arch::x86_64::__cpuid_count(0x12, 0x2) };

    // Dynamic EPC sub-leaves start at index 2 (0 and 1 describe SGX
    // capabilities/attributes). One sub-leaf is emitted per EPC section.
    for (i, epc_section) in epc_sections.iter().enumerate() {
        let subleaf_idx = i + 2;
        let start = epc_section.start().raw_value();
        let size = epc_section.size();
        // EAX/EBX: section base (bits 31:12 in EAX, bits 63:32 in EBX),
        // with EAX[3:0] = 1 marking a valid EPC sub-leaf.
        let eax = (start & 0xffff_f000) as u32 | 0x1;
        let ebx = (start >> 32) as u32;
        // ECX/EDX: section size, keeping the host's EPC property bits (ECX[3:0]).
        let ecx = (size & 0xffff_f000) as u32 | (leaf.ecx & 0xf);
        let edx = (size >> 32) as u32;
        // CPU Topology leaf 0x12
        CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EAX, eax);
        CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EBX, ebx);
        CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::ECX, ecx);
        CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EDX, edx);
    }

    // Add one NULL entry to terminate the dynamic list
    let subleaf_idx = epc_sections.len() + 2;
    // CPU Topology leaf 0x12
    CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EAX, 0);
    CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EBX, 0);
    CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::ECX, 0);
    CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EDX, 0);

    Ok(())
}

#[cfg(test)]
mod tests {
    use linux_loader::loader::bootparam::boot_e820_entry;

    use super::*;

    // Sanity-check the static layout produced by arch_memory_regions().
    #[test]
    fn regions_base_addr() {
        let regions = arch_memory_regions();
        assert_eq!(4, regions.len());
        assert_eq!(GuestAddress(0), regions[0].0);
        assert_eq!(GuestAddress(1 << 32), regions[1].0);
    }

    #[test]
    fn test_system_configuration() {
        let no_vcpus = 4;
        // Too little memory (64KiB): configuration must fail.
        let gm = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap();
        let config_err = configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            1,
            None,
            Some(layout::RSDP_POINTER),
            None,
            None,
            None,
            None,
        );
        config_err.unwrap_err();

        // Now assigning some memory that falls before the 32bit memory hole.
        let arch_mem_regions = arch_memory_regions();
        let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
            .iter()
            .filter(|r| r.2 == RegionType::Ram && r.1 != usize::MAX)
            .map(|r| (r.0, r.1))
            .collect();
        let gm = GuestMemoryMmap::from_ranges(&ram_regions).unwrap();

        configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            no_vcpus,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();

        // Now assigning some memory that falls after the 32bit memory hole.
        // The unbounded (usize::MAX) region is capped at 128MiB for the test.
        let arch_mem_regions = arch_memory_regions();
        let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
            .iter()
            .filter(|r| r.2 == RegionType::Ram)
            .map(|r| {
                if r.1 == usize::MAX {
                    (r.0, 128 << 20)
                } else {
                    (r.0, r.1)
                }
            })
            .collect();
        let gm = GuestMemoryMmap::from_ranges(&ram_regions).unwrap();
        configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            no_vcpus,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();

        // NOTE(review): this call is identical to the previous one —
        // presumably checking that configuration is repeatable; confirm intent.
        configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            no_vcpus,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();
    }

    #[test]
    fn test_add_e820_entry() {
        let e820_table = [(boot_e820_entry {
            addr: 0x1,
            size: 4,
            type_: 1,
        }); 128];

        let expected_params = boot_params {
            e820_table,
            e820_entries: 1,
            ..Default::default()
        };

        let mut params: boot_params = Default::default();
        add_e820_entry(
            &mut params,
            e820_table[0].addr,
            e820_table[0].size,
            e820_table[0].type_,
        )
        .unwrap();
        assert_eq!(
            format!("{:?}", params.e820_table[0]),
            format!("{:?}", expected_params.e820_table[0])
        );
        assert_eq!(params.e820_entries, expected_params.e820_entries);

        // Exercise the scenario where the field storing the length of the e820 entry table is
        // is bigger than the allocated memory.
        params.e820_entries = params.e820_table.len() as u8 + 1;
        add_e820_entry(
            &mut params,
            e820_table[0].addr,
            e820_table[0].size,
            e820_table[0].type_,
        )
        .unwrap_err();
    }

    #[test]
    fn test_add_memmap_entry() {
        let mut memmap: Vec<hvm_memmap_table_entry> = Vec::new();

        let expected_memmap = vec![
            hvm_memmap_table_entry {
                addr: 0x0,
                size: 0x1000,
                type_: E820_RAM,
                ..Default::default()
            },
            hvm_memmap_table_entry {
                addr: 0x10000,
                size: 0xa000,
                type_: E820_RESERVED,
                ..Default::default()
            },
        ];

        add_memmap_entry(&mut memmap, 0, 0x1000, E820_RAM);
        add_memmap_entry(&mut memmap, 0x10000, 0xa000, E820_RESERVED);

        assert_eq!(format!("{memmap:?}"), format!("{expected_memmap:?}"));
    }

    // Topology (2 threads, 3 cores, 1 die): x2APIC IDs skip the unused
    // encoding space between cores.
    #[test]
    fn test_get_x2apic_id() {
        let x2apic_id = get_x2apic_id(0, Some((2, 3, 1)));
        assert_eq!(x2apic_id, 0);

        let x2apic_id = get_x2apic_id(1, Some((2, 3, 1)));
        assert_eq!(x2apic_id, 1);

        let x2apic_id = get_x2apic_id(2, Some((2, 3, 1)));
        assert_eq!(x2apic_id, 2);

        let x2apic_id = get_x2apic_id(6, Some((2, 3, 1)));
        assert_eq!(x2apic_id, 8);

        let x2apic_id = get_x2apic_id(7, Some((2, 3, 1)));
        assert_eq!(x2apic_id, 9);

        let x2apic_id = get_x2apic_id(8, Some((2, 3, 1)));
        assert_eq!(x2apic_id, 10);
    }
}