xref: /cloud-hypervisor/arch/src/x86_64/mod.rs (revision 3f3489e38e32a652241e889a9a1f6c67823d584b)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 // SPDX-License-Identifier: Apache-2.0
5 //
6 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE-BSD-3-Clause file.
9 use std::sync::Arc;
10 pub mod interrupts;
11 pub mod layout;
12 mod mpspec;
13 mod mptable;
14 pub mod regs;
15 use std::collections::BTreeMap;
16 use std::mem;
17 
18 use hypervisor::arch::x86::{CpuIdEntry, CPUID_FLAG_VALID_INDEX};
19 use hypervisor::{CpuVendor, HypervisorCpuError, HypervisorError};
20 use linux_loader::loader::bootparam::{boot_params, setup_header};
21 use linux_loader::loader::elf::start_info::{
22     hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info,
23 };
24 use thiserror::Error;
25 use vm_memory::{
26     Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
27     GuestMemoryRegion, GuestUsize,
28 };
29 
30 use crate::{GuestMemoryMmap, InitramfsConfig, RegionType};
31 mod smbios;
32 use std::arch::x86_64;
33 #[cfg(feature = "tdx")]
34 pub mod tdx;
35 
// CPUID feature bits
#[cfg(feature = "kvm")]
const TSC_DEADLINE_TIMER_ECX_BIT: u8 = 24; // TSC deadline timer ecx bit (leaf 0x1).
const HYPERVISOR_ECX_BIT: u8 = 31; // Hypervisor ecx bit (leaf 0x1).
const MTRR_EDX_BIT: u8 = 12; // MTRR edx bit (leaf 0x1).
const INVARIANT_TSC_EDX_BIT: u8 = 8; // Invariant TSC bit on 0x8000_0007 EDX
const AMX_BF16: u8 = 22; // AMX tile computation on bfloat16 numbers
const AMX_TILE: u8 = 24; // AMX tile load/store instructions
const AMX_INT8: u8 = 25; // AMX tile computation on 8-bit integers

// KVM feature bits (leaf 0x4000_0001 EAX); these are the features that get
// masked out when running a TD, see generate_common_cpuid().
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE_BIT: u8 = 0;
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE2_BIT: u8 = 3;
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE_STABLE_BIT: u8 = 24;
#[cfg(feature = "tdx")]
const KVM_FEATURE_ASYNC_PF_BIT: u8 = 4;
#[cfg(feature = "tdx")]
const KVM_FEATURE_ASYNC_PF_VMEXIT_BIT: u8 = 10;
#[cfg(feature = "tdx")]
const KVM_FEATURE_STEAL_TIME_BIT: u8 = 5;

// Number of signals (largest signal number + 1); mirrors Linux's _NSIG.
pub const _NSIG: i32 = 65;
61 
#[derive(Debug, Copy, Clone)]
/// Specifies the entry point address where the guest must start
/// executing code, as well as which of the supported boot protocols
/// is to be used to configure the guest initial state.
pub struct EntryPoint {
    /// Address in guest memory where the guest must start execution
    pub entry_addr: GuestAddress,
    /// This field is used for bzImage to fill the zero page;
    /// `None` when the boot image carries no Linux `setup_header`.
    pub setup_header: Option<setup_header>,
}
72 
73 const E820_RAM: u32 = 1;
74 const E820_RESERVED: u32 = 2;
75 
76 #[derive(Clone)]
77 pub struct SgxEpcSection {
78     start: GuestAddress,
79     size: GuestUsize,
80 }
81 
82 impl SgxEpcSection {
83     pub fn new(start: GuestAddress, size: GuestUsize) -> Self {
84         SgxEpcSection { start, size }
85     }
86     pub fn start(&self) -> GuestAddress {
87         self.start
88     }
89     pub fn size(&self) -> GuestUsize {
90         self.size
91     }
92 }
93 
/// A contiguous guest-physical region hosting one or more SGX EPC sections.
#[derive(Clone)]
pub struct SgxEpcRegion {
    // Guest physical start address of the whole region.
    start: GuestAddress,
    // Total size of the region in bytes.
    size: GuestUsize,
    // EPC sections within the region, keyed by section id.
    epc_sections: BTreeMap<String, SgxEpcSection>,
}
100 
101 impl SgxEpcRegion {
102     pub fn new(start: GuestAddress, size: GuestUsize) -> Self {
103         SgxEpcRegion {
104             start,
105             size,
106             epc_sections: BTreeMap::new(),
107         }
108     }
109     pub fn start(&self) -> GuestAddress {
110         self.start
111     }
112     pub fn size(&self) -> GuestUsize {
113         self.size
114     }
115     pub fn epc_sections(&self) -> &BTreeMap<String, SgxEpcSection> {
116         &self.epc_sections
117     }
118     pub fn insert(&mut self, id: String, epc_section: SgxEpcSection) {
119         self.epc_sections.insert(id, epc_section);
120     }
121 }
122 
/// Knobs controlling how the common guest CPUID is generated.
pub struct CpuidConfig {
    /// SGX EPC sections to expose to the guest, if any.
    pub sgx_epc_sections: Option<Vec<SgxEpcSection>>,
    /// Number of guest physical address bits (written into leaf 0x8000_0008).
    pub phys_bits: u8,
    /// Whether the KVM HyperV emulation leaves (0x4000_0000..=0x4000_000a)
    /// should replace the plain KVM ones.
    pub kvm_hyperv: bool,
    /// Whether the guest is a TD (Intel TDX).
    #[cfg(feature = "tdx")]
    pub tdx: bool,
    /// Whether AMX feature bits are left enabled in leaf 0x7 EDX.
    pub amx: bool,
}
131 
#[derive(Debug, Error)]
pub enum Error {
    /// Error writing MP table to memory.
    #[error("Error writing MP table to memory: {0}")]
    MpTableSetup(#[source] mptable::Error),

    /// Error configuring the general purpose registers
    #[error("Error configuring the general purpose registers: {0}")]
    RegsConfiguration(#[source] regs::Error),

    /// Error configuring the special registers
    #[error("Error configuring the special registers: {0}")]
    SregsConfiguration(#[source] regs::Error),

    /// Error configuring the floating point related registers
    #[error("Error configuring the floating point related registers: {0}")]
    FpuConfiguration(#[source] regs::Error),

    /// Error configuring the MSR registers
    #[error("Error configuring the MSR registers: {0}")]
    MsrsConfiguration(#[source] regs::Error),

    /// Failed to set supported CPUs.
    #[error("Failed to set supported CPUs: {0}")]
    SetSupportedCpusFailed(#[source] anyhow::Error),

    /// Cannot set the local interruption due to bad configuration.
    #[error("Cannot set the local interruption due to bad configuration: {0}")]
    LocalIntConfiguration(#[source] anyhow::Error),

    /// Error setting up SMBIOS table
    #[error("Error setting up SMBIOS table: {0}")]
    SmbiosSetup(#[source] smbios::Error),

    /// Could not find any SGX EPC section
    #[error("Could not find any SGX EPC section")]
    NoSgxEpcSection,

    /// Missing SGX CPU feature
    #[error("Missing SGX CPU feature")]
    MissingSgxFeature,

    /// Missing SGX_LC CPU feature
    #[error("Missing SGX_LC CPU feature")]
    MissingSgxLaunchControlFeature,

    /// Error getting supported CPUID through the hypervisor (kvm/mshv) API
    #[error("Error getting supported CPUID through the hypervisor API: {0}")]
    CpuidGetSupported(#[source] HypervisorError),

    /// Error populating CPUID with KVM HyperV emulation details
    #[error("Error populating CPUID with KVM HyperV emulation details: {0}")]
    CpuidKvmHyperV(#[source] vmm_sys_util::fam::Error),

    /// Error populating CPUID with CPU identification
    #[error("Error populating CPUID with CPU identification: {0}")]
    CpuidIdentification(#[source] vmm_sys_util::fam::Error),

    /// Error checking CPUID compatibility
    #[error("Error checking CPUID compatibility")]
    CpuidCheckCompatibility,

    /// Error writing EBDA address
    #[error("Error writing EBDA address: {0}")]
    EbdaSetup(#[source] vm_memory::GuestMemoryError),

    /// Error getting CPU TSC frequency
    #[error("Error getting CPU TSC frequency: {0}")]
    GetTscFrequency(#[source] HypervisorCpuError),

    /// Error retrieving TDX capabilities through the hypervisor (kvm/mshv) API
    #[cfg(feature = "tdx")]
    #[error("Error retrieving TDX capabilities through the hypervisor API: {0}")]
    TdxCapabilities(#[source] HypervisorError),

    /// Failed to configure E820 map for bzImage
    #[error("Failed to configure E820 map for bzImage")]
    E820Configuration,
}
211 
/// Compute the x2APIC ID of a vCPU from its linear `cpu_id`.
///
/// `topology` is `(threads_per_core, cores_per_die, dies_per_socket)`.
/// When present, the ID is assembled from thread/core/die/socket sub-IDs,
/// each packed into a bit field just wide enough to hold `count - 1`
/// (per the CPUID extended topology enumeration scheme). When absent,
/// the linear `cpu_id` is used directly.
pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u8, u8, u8)>) -> u32 {
    if let Some((threads, cores, dies)) = topology {
        // Bits needed to represent values 0..count (i.e. width of count - 1).
        let thread_mask_width = u8::BITS - (threads - 1).leading_zeros();
        let core_mask_width = u8::BITS - (cores - 1).leading_zeros();
        let die_mask_width = u8::BITS - (dies - 1).leading_zeros();

        // Promote to u32 before multiplying: the products can exceed u8
        // for legal topologies (e.g. 2 threads * 128 cores = 256), which
        // previously overflowed in the `u8` domain.
        let (threads, cores, dies) = (threads as u32, cores as u32, dies as u32);

        let thread_id = cpu_id % threads;
        let core_id = cpu_id / threads % cores;
        let die_id = cpu_id / (threads * cores) % dies;
        let socket_id = cpu_id / (threads * cores * dies);

        return thread_id
            | (core_id << thread_mask_width)
            | (die_id << (thread_mask_width + core_mask_width))
            | (socket_id << (thread_mask_width + core_mask_width + die_mask_width));
    }

    cpu_id
}
231 
/// One of the four general-purpose registers carrying CPUID output values.
#[derive(Copy, Clone, Debug)]
pub enum CpuidReg {
    EAX,
    EBX,
    ECX,
    EDX,
}
239 
/// Describes bits to OR into the flags/registers of the CPUID entry
/// selected by `function`/`index`.
pub struct CpuidPatch {
    /// CPUID leaf (EAX input) the patch applies to.
    pub function: u32,
    /// CPUID subleaf (ECX input) the patch applies to.
    pub index: u32,
    /// Bit position to set in the entry's `flags` field, if any.
    pub flags_bit: Option<u8>,
    /// Bit position to set in EAX, if any.
    pub eax_bit: Option<u8>,
    /// Bit position to set in EBX, if any.
    pub ebx_bit: Option<u8>,
    /// Bit position to set in ECX, if any.
    pub ecx_bit: Option<u8>,
    /// Bit position to set in EDX, if any.
    pub edx_bit: Option<u8>,
}
249 
250 impl CpuidPatch {
251     pub fn get_cpuid_reg(
252         cpuid: &[CpuIdEntry],
253         function: u32,
254         index: Option<u32>,
255         reg: CpuidReg,
256     ) -> Option<u32> {
257         for entry in cpuid.iter() {
258             if entry.function == function && (index.is_none() || index.unwrap() == entry.index) {
259                 return match reg {
260                     CpuidReg::EAX => Some(entry.eax),
261                     CpuidReg::EBX => Some(entry.ebx),
262                     CpuidReg::ECX => Some(entry.ecx),
263                     CpuidReg::EDX => Some(entry.edx),
264                 };
265             }
266         }
267 
268         None
269     }
270 
271     pub fn set_cpuid_reg(
272         cpuid: &mut Vec<CpuIdEntry>,
273         function: u32,
274         index: Option<u32>,
275         reg: CpuidReg,
276         value: u32,
277     ) {
278         let mut entry_found = false;
279         for entry in cpuid.iter_mut() {
280             if entry.function == function && (index.is_none() || index.unwrap() == entry.index) {
281                 entry_found = true;
282                 match reg {
283                     CpuidReg::EAX => {
284                         entry.eax = value;
285                     }
286                     CpuidReg::EBX => {
287                         entry.ebx = value;
288                     }
289                     CpuidReg::ECX => {
290                         entry.ecx = value;
291                     }
292                     CpuidReg::EDX => {
293                         entry.edx = value;
294                     }
295                 }
296             }
297         }
298 
299         if entry_found {
300             return;
301         }
302 
303         // Entry not found, so let's add it.
304         if let Some(index) = index {
305             let mut entry = CpuIdEntry {
306                 function,
307                 index,
308                 flags: CPUID_FLAG_VALID_INDEX,
309                 ..Default::default()
310             };
311             match reg {
312                 CpuidReg::EAX => {
313                     entry.eax = value;
314                 }
315                 CpuidReg::EBX => {
316                     entry.ebx = value;
317                 }
318                 CpuidReg::ECX => {
319                     entry.ecx = value;
320                 }
321                 CpuidReg::EDX => {
322                     entry.edx = value;
323                 }
324             }
325 
326             cpuid.push(entry);
327         }
328     }
329 
330     pub fn patch_cpuid(cpuid: &mut [CpuIdEntry], patches: Vec<CpuidPatch>) {
331         for entry in cpuid {
332             for patch in patches.iter() {
333                 if entry.function == patch.function && entry.index == patch.index {
334                     if let Some(flags_bit) = patch.flags_bit {
335                         entry.flags |= 1 << flags_bit;
336                     }
337                     if let Some(eax_bit) = patch.eax_bit {
338                         entry.eax |= 1 << eax_bit;
339                     }
340                     if let Some(ebx_bit) = patch.ebx_bit {
341                         entry.ebx |= 1 << ebx_bit;
342                     }
343                     if let Some(ecx_bit) = patch.ecx_bit {
344                         entry.ecx |= 1 << ecx_bit;
345                     }
346                     if let Some(edx_bit) = patch.edx_bit {
347                         entry.edx |= 1 << edx_bit;
348                     }
349                 }
350             }
351         }
352     }
353 
354     pub fn is_feature_enabled(
355         cpuid: &[CpuIdEntry],
356         function: u32,
357         index: u32,
358         reg: CpuidReg,
359         feature_bit: usize,
360     ) -> bool {
361         let mask = 1 << feature_bit;
362 
363         for entry in cpuid {
364             if entry.function == function && entry.index == index {
365                 let reg_val = match reg {
366                     CpuidReg::EAX => entry.eax,
367                     CpuidReg::EBX => entry.ebx,
368                     CpuidReg::ECX => entry.ecx,
369                     CpuidReg::EDX => entry.edx,
370                 };
371 
372                 return (reg_val & mask) == mask;
373             }
374         }
375 
376         false
377     }
378 }
379 
/// How a feature register of the source VM is compared against the
/// destination VM when checking CPUID (migration) compatibility.
#[derive(Debug)]
enum CpuidCompatibleCheck {
    BitwiseSubset, // bitwise subset
    Equal,         // equal in value
    NumNotGreater, // smaller or equal as a number
}
386 
/// Identifies one CPUID feature register and the rule used to compare it
/// between a source and a destination VM.
pub struct CpuidFeatureEntry {
    // CPUID leaf (EAX input).
    function: u32,
    // CPUID subleaf (ECX input).
    index: u32,
    // Register holding the feature bits for this leaf/subleaf.
    feature_reg: CpuidReg,
    // Comparison rule applied to this register.
    compatible_check: CpuidCompatibleCheck,
}
393 
394 impl CpuidFeatureEntry {
395     fn checked_feature_entry_list() -> Vec<CpuidFeatureEntry> {
396         vec![
397             // The following list includes all hardware features bits from
398             // the CPUID Wiki Page: https://en.wikipedia.org/wiki/CPUID
399             // Leaf 0x1, ECX/EDX, feature bits
400             CpuidFeatureEntry {
401                 function: 1,
402                 index: 0,
403                 feature_reg: CpuidReg::ECX,
404                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
405             },
406             CpuidFeatureEntry {
407                 function: 1,
408                 index: 0,
409                 feature_reg: CpuidReg::EDX,
410                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
411             },
412             // Leaf 0x7, EAX/EBX/ECX/EDX, extended features
413             CpuidFeatureEntry {
414                 function: 7,
415                 index: 0,
416                 feature_reg: CpuidReg::EAX,
417                 compatible_check: CpuidCompatibleCheck::NumNotGreater,
418             },
419             CpuidFeatureEntry {
420                 function: 7,
421                 index: 0,
422                 feature_reg: CpuidReg::EBX,
423                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
424             },
425             CpuidFeatureEntry {
426                 function: 7,
427                 index: 0,
428                 feature_reg: CpuidReg::ECX,
429                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
430             },
431             CpuidFeatureEntry {
432                 function: 7,
433                 index: 0,
434                 feature_reg: CpuidReg::EDX,
435                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
436             },
437             // Leaf 0x7 subleaf 0x1, EAX, extended features
438             CpuidFeatureEntry {
439                 function: 7,
440                 index: 1,
441                 feature_reg: CpuidReg::EAX,
442                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
443             },
444             // Leaf 0x8000_0001, ECX/EDX, CPUID features bits
445             CpuidFeatureEntry {
446                 function: 0x8000_0001,
447                 index: 0,
448                 feature_reg: CpuidReg::ECX,
449                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
450             },
451             CpuidFeatureEntry {
452                 function: 0x8000_0001,
453                 index: 0,
454                 feature_reg: CpuidReg::EDX,
455                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
456             },
457             // KVM CPUID bits: https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html
458             // Leaf 0x4000_0000, EAX/EBX/ECX/EDX, KVM CPUID SIGNATURE
459             CpuidFeatureEntry {
460                 function: 0x4000_0000,
461                 index: 0,
462                 feature_reg: CpuidReg::EAX,
463                 compatible_check: CpuidCompatibleCheck::NumNotGreater,
464             },
465             CpuidFeatureEntry {
466                 function: 0x4000_0000,
467                 index: 0,
468                 feature_reg: CpuidReg::EBX,
469                 compatible_check: CpuidCompatibleCheck::Equal,
470             },
471             CpuidFeatureEntry {
472                 function: 0x4000_0000,
473                 index: 0,
474                 feature_reg: CpuidReg::ECX,
475                 compatible_check: CpuidCompatibleCheck::Equal,
476             },
477             CpuidFeatureEntry {
478                 function: 0x4000_0000,
479                 index: 0,
480                 feature_reg: CpuidReg::EDX,
481                 compatible_check: CpuidCompatibleCheck::Equal,
482             },
483             // Leaf 0x4000_0001, EAX/EBX/ECX/EDX, KVM CPUID features
484             CpuidFeatureEntry {
485                 function: 0x4000_0001,
486                 index: 0,
487                 feature_reg: CpuidReg::EAX,
488                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
489             },
490             CpuidFeatureEntry {
491                 function: 0x4000_0001,
492                 index: 0,
493                 feature_reg: CpuidReg::EBX,
494                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
495             },
496             CpuidFeatureEntry {
497                 function: 0x4000_0001,
498                 index: 0,
499                 feature_reg: CpuidReg::ECX,
500                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
501             },
502             CpuidFeatureEntry {
503                 function: 0x4000_0001,
504                 index: 0,
505                 feature_reg: CpuidReg::EDX,
506                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
507             },
508         ]
509     }
510 
511     fn get_features_from_cpuid(
512         cpuid: &[CpuIdEntry],
513         feature_entry_list: &[CpuidFeatureEntry],
514     ) -> Vec<u32> {
515         let mut features = vec![0; feature_entry_list.len()];
516         for (i, feature_entry) in feature_entry_list.iter().enumerate() {
517             for cpuid_entry in cpuid {
518                 if cpuid_entry.function == feature_entry.function
519                     && cpuid_entry.index == feature_entry.index
520                 {
521                     match feature_entry.feature_reg {
522                         CpuidReg::EAX => {
523                             features[i] = cpuid_entry.eax;
524                         }
525                         CpuidReg::EBX => {
526                             features[i] = cpuid_entry.ebx;
527                         }
528                         CpuidReg::ECX => {
529                             features[i] = cpuid_entry.ecx;
530                         }
531                         CpuidReg::EDX => {
532                             features[i] = cpuid_entry.edx;
533                         }
534                     }
535 
536                     break;
537                 }
538             }
539         }
540 
541         features
542     }
543 
544     // The function returns `Error` (a.k.a. "incompatible"), when the CPUID features from `src_vm_cpuid`
545     // is not a subset of those of the `dest_vm_cpuid`.
546     pub fn check_cpuid_compatibility(
547         src_vm_cpuid: &[CpuIdEntry],
548         dest_vm_cpuid: &[CpuIdEntry],
549     ) -> Result<(), Error> {
550         let feature_entry_list = &Self::checked_feature_entry_list();
551         let src_vm_features = Self::get_features_from_cpuid(src_vm_cpuid, feature_entry_list);
552         let dest_vm_features = Self::get_features_from_cpuid(dest_vm_cpuid, feature_entry_list);
553 
554         // Loop on feature bit and check if the 'source vm' feature is a subset
555         // of those of the 'destination vm' feature
556         let mut compatible = true;
557         for (i, (src_vm_feature, dest_vm_feature)) in src_vm_features
558             .iter()
559             .zip(dest_vm_features.iter())
560             .enumerate()
561         {
562             let entry = &feature_entry_list[i];
563             let entry_compatible = match entry.compatible_check {
564                 CpuidCompatibleCheck::BitwiseSubset => {
565                     let different_feature_bits = src_vm_feature ^ dest_vm_feature;
566                     let src_vm_feature_bits_only = different_feature_bits & src_vm_feature;
567                     src_vm_feature_bits_only == 0
568                 }
569                 CpuidCompatibleCheck::Equal => src_vm_feature == dest_vm_feature,
570                 CpuidCompatibleCheck::NumNotGreater => src_vm_feature <= dest_vm_feature,
571             };
572             if !entry_compatible {
573                 error!(
574                     "Detected incompatible CPUID entry: leaf={:#02x} (subleaf={:#02x}), register='{:?}', \
575                     compatible_check='{:?}', source VM feature='{:#04x}', destination VM feature'{:#04x}'.",
576                     entry.function, entry.index, entry.feature_reg,
577                     entry.compatible_check, src_vm_feature, dest_vm_feature
578                     );
579 
580                 compatible = false;
581             }
582         }
583 
584         if compatible {
585             info!("No CPU incompatibility detected.");
586             Ok(())
587         } else {
588             Err(Error::CpuidCheckCompatibility)
589         }
590     }
591 }
592 
593 pub fn generate_common_cpuid(
594     hypervisor: &Arc<dyn hypervisor::Hypervisor>,
595     config: &CpuidConfig,
596 ) -> super::Result<Vec<CpuIdEntry>> {
597     // SAFETY: cpuid called with valid leaves
598     if unsafe { x86_64::__cpuid(1) }.ecx & (1 << HYPERVISOR_ECX_BIT) == 1 << HYPERVISOR_ECX_BIT {
599         // SAFETY: cpuid called with valid leaves
600         let hypervisor_cpuid = unsafe { x86_64::__cpuid(0x4000_0000) };
601 
602         let mut identifier: [u8; 12] = [0; 12];
603         identifier[0..4].copy_from_slice(&hypervisor_cpuid.ebx.to_le_bytes()[..]);
604         identifier[4..8].copy_from_slice(&hypervisor_cpuid.ecx.to_le_bytes()[..]);
605         identifier[8..12].copy_from_slice(&hypervisor_cpuid.edx.to_le_bytes()[..]);
606 
607         info!(
608             "Running under nested virtualisation. Hypervisor string: {}",
609             String::from_utf8_lossy(&identifier)
610         );
611     }
612 
613     info!(
614         "Generating guest CPUID for with physical address size: {}",
615         config.phys_bits
616     );
617     #[allow(unused_mut)]
618     let mut cpuid_patches = vec![
619         // Patch hypervisor bit
620         CpuidPatch {
621             function: 1,
622             index: 0,
623             flags_bit: None,
624             eax_bit: None,
625             ebx_bit: None,
626             ecx_bit: Some(HYPERVISOR_ECX_BIT),
627             edx_bit: None,
628         },
629         // Enable MTRR feature
630         CpuidPatch {
631             function: 1,
632             index: 0,
633             flags_bit: None,
634             eax_bit: None,
635             ebx_bit: None,
636             ecx_bit: None,
637             edx_bit: Some(MTRR_EDX_BIT),
638         },
639     ];
640 
641     #[cfg(feature = "kvm")]
642     if matches!(
643         hypervisor.hypervisor_type(),
644         hypervisor::HypervisorType::Kvm
645     ) {
646         // Patch tsc deadline timer bit
647         cpuid_patches.push(CpuidPatch {
648             function: 1,
649             index: 0,
650             flags_bit: None,
651             eax_bit: None,
652             ebx_bit: None,
653             ecx_bit: Some(TSC_DEADLINE_TIMER_ECX_BIT),
654             edx_bit: None,
655         });
656     }
657 
658     // Supported CPUID
659     let mut cpuid = hypervisor
660         .get_supported_cpuid()
661         .map_err(Error::CpuidGetSupported)?;
662 
663     CpuidPatch::patch_cpuid(&mut cpuid, cpuid_patches);
664 
665     if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
666         update_cpuid_sgx(&mut cpuid, sgx_epc_sections)?;
667     }
668 
669     #[cfg(feature = "tdx")]
670     let tdx_capabilities = if config.tdx {
671         let caps = hypervisor
672             .tdx_capabilities()
673             .map_err(Error::TdxCapabilities)?;
674         info!("TDX capabilities {:#?}", caps);
675         Some(caps)
676     } else {
677         None
678     };
679 
680     // Update some existing CPUID
681     for entry in cpuid.as_mut_slice().iter_mut() {
682         match entry.function {
683             // Clear AMX related bits if the AMX feature is not enabled
684             0x7 => {
685                 if !config.amx && entry.index == 0 {
686                     entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8))
687                 }
688             }
689             0xd =>
690             {
691                 #[cfg(feature = "tdx")]
692                 if let Some(caps) = &tdx_capabilities {
693                     let xcr0_mask: u64 = 0x82ff;
694                     let xss_mask: u64 = !xcr0_mask;
695                     if entry.index == 0 {
696                         entry.eax &= (caps.xfam_fixed0 as u32) & (xcr0_mask as u32);
697                         entry.eax |= (caps.xfam_fixed1 as u32) & (xcr0_mask as u32);
698                         entry.edx &= ((caps.xfam_fixed0 & xcr0_mask) >> 32) as u32;
699                         entry.edx |= ((caps.xfam_fixed1 & xcr0_mask) >> 32) as u32;
700                     } else if entry.index == 1 {
701                         entry.ecx &= (caps.xfam_fixed0 as u32) & (xss_mask as u32);
702                         entry.ecx |= (caps.xfam_fixed1 as u32) & (xss_mask as u32);
703                         entry.edx &= ((caps.xfam_fixed0 & xss_mask) >> 32) as u32;
704                         entry.edx |= ((caps.xfam_fixed1 & xss_mask) >> 32) as u32;
705                     }
706                 }
707             }
708             // Copy host L1 cache details if not populated by KVM
709             0x8000_0005 => {
710                 if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 {
711                     // SAFETY: cpuid called with valid leaves
712                     if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 {
713                         // SAFETY: cpuid called with valid leaves
714                         let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) };
715                         entry.eax = leaf.eax;
716                         entry.ebx = leaf.ebx;
717                         entry.ecx = leaf.ecx;
718                         entry.edx = leaf.edx;
719                     }
720                 }
721             }
722             // Copy host L2 cache details if not populated by KVM
723             0x8000_0006 => {
724                 if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 {
725                     // SAFETY: cpuid called with valid leaves
726                     if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 {
727                         // SAFETY: cpuid called with valid leaves
728                         let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) };
729                         entry.eax = leaf.eax;
730                         entry.ebx = leaf.ebx;
731                         entry.ecx = leaf.ecx;
732                         entry.edx = leaf.edx;
733                     }
734                 }
735             }
736             // Set CPU physical bits
737             0x8000_0008 => {
738                 entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff);
739             }
740             0x4000_0001 => {
741                 // These features are not supported by TDX
742                 #[cfg(feature = "tdx")]
743                 if config.tdx {
744                     entry.eax &= !((1 << KVM_FEATURE_CLOCKSOURCE_BIT)
745                         | (1 << KVM_FEATURE_CLOCKSOURCE2_BIT)
746                         | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)
747                         | (1 << KVM_FEATURE_ASYNC_PF_BIT)
748                         | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT_BIT)
749                         | (1 << KVM_FEATURE_STEAL_TIME_BIT))
750                 }
751             }
752             _ => {}
753         }
754     }
755 
756     // Copy CPU identification string
757     for i in 0x8000_0002..=0x8000_0004 {
758         cpuid.retain(|c| c.function != i);
759         // SAFETY: call cpuid with valid leaves
760         let leaf = unsafe { std::arch::x86_64::__cpuid(i) };
761         cpuid.push(CpuIdEntry {
762             function: i,
763             eax: leaf.eax,
764             ebx: leaf.ebx,
765             ecx: leaf.ecx,
766             edx: leaf.edx,
767             ..Default::default()
768         });
769     }
770 
771     if config.kvm_hyperv {
772         // Remove conflicting entries
773         cpuid.retain(|c| c.function != 0x4000_0000);
774         cpuid.retain(|c| c.function != 0x4000_0001);
775         // See "Hypervisor Top Level Functional Specification" for details
776         // Compliance with "Hv#1" requires leaves up to 0x4000_000a
777         cpuid.push(CpuIdEntry {
778             function: 0x40000000,
779             eax: 0x4000000a, // Maximum cpuid leaf
780             ebx: 0x756e694c, // "Linu"
781             ecx: 0x564b2078, // "x KV"
782             edx: 0x7648204d, // "M Hv"
783             ..Default::default()
784         });
785         cpuid.push(CpuIdEntry {
786             function: 0x40000001,
787             eax: 0x31237648, // "Hv#1"
788             ..Default::default()
789         });
790         cpuid.push(CpuIdEntry {
791             function: 0x40000002,
792             eax: 0x3839,  // "Build number"
793             ebx: 0xa0000, // "Version"
794             ..Default::default()
795         });
796         cpuid.push(CpuIdEntry {
797             function: 0x4000_0003,
798             eax: (1 << 1) // AccessPartitionReferenceCounter
799                    | (1 << 2) // AccessSynicRegs
800                    | (1 << 3) // AccessSyntheticTimerRegs
801                    | (1 << 9), // AccessPartitionReferenceTsc
802             edx: 1 << 3, // CPU dynamic partitioning
803             ..Default::default()
804         });
805         cpuid.push(CpuIdEntry {
806             function: 0x4000_0004,
807             eax: 1 << 5, // Recommend relaxed timing
808             ..Default::default()
809         });
810         for i in 0x4000_0005..=0x4000_000a {
811             cpuid.push(CpuIdEntry {
812                 function: i,
813                 ..Default::default()
814             });
815         }
816     }
817 
818     Ok(cpuid)
819 }
820 
/// Configures a single vCPU before it starts running.
///
/// Applies the per-vCPU CPUID adjustments (x2APIC ID, topology, optional TSC
/// frequency leaf) on top of the common CPUID set, installs them with
/// `set_cpuid2()`, then sets up MSRs, LINTs and — when `boot_setup` is
/// provided — the initial register, FPU and segment state.
///
/// # Arguments
///
/// * `vcpu` - The hypervisor vCPU handle to configure.
/// * `id` - Logical vCPU index, used to derive the x2APIC ID.
/// * `boot_setup` - Kernel entry point and guest memory; when `None`, the
///   initial register/FPU/segment setup is skipped.
/// * `cpuid` - Common CPUID entries produced by `generate_common_cpuid()`.
/// * `kvm_hyperv` - Whether KVM's Hyper-V emulation is enabled.
/// * `cpu_vendor` - Host CPU vendor, selects vendor-specific CPUID leaves.
/// * `topology` - Optional (threads per core, cores per die, dies per package).
pub fn configure_vcpu(
    vcpu: &Arc<dyn hypervisor::Vcpu>,
    id: u8,
    boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
    cpuid: Vec<CpuIdEntry>,
    kvm_hyperv: bool,
    cpu_vendor: CpuVendor,
    topology: Option<(u8, u8, u8)>,
) -> super::Result<()> {
    let x2apic_id = get_x2apic_id(id as u32, topology);

    // Per vCPU CPUID changes; common are handled via generate_common_cpuid()
    let mut cpuid = cpuid;
    // Topology leaves 0xb and 0x1f report the x2APIC ID in EDX.
    CpuidPatch::set_cpuid_reg(&mut cpuid, 0xb, None, CpuidReg::EDX, x2apic_id);
    CpuidPatch::set_cpuid_reg(&mut cpuid, 0x1f, None, CpuidReg::EDX, x2apic_id);
    if matches!(cpu_vendor, CpuVendor::AMD) {
        // AMD extended topology leaf carries the APIC ID in EAX.
        CpuidPatch::set_cpuid_reg(&mut cpuid, 0x8000_001e, Some(0), CpuidReg::EAX, x2apic_id);
    }

    // Set ApicId in cpuid for each vcpu - found in cpuid ebx when eax = 1
    let mut apic_id_patched = false;
    for entry in &mut cpuid {
        if entry.function == 1 {
            // Leaf 0x1 EBX[31:24] is the initial APIC ID; keep the low 24 bits.
            entry.ebx &= 0xffffff;
            entry.ebx |= x2apic_id << 24;
            apic_id_patched = true;
            break;
        }
    }
    // Leaf 0x1 must exist in the common CPUID set; a miss is a bug upstream.
    assert!(apic_id_patched);

    if let Some(t) = topology {
        update_cpuid_topology(&mut cpuid, t.0, t.1, t.2, cpu_vendor, id);
    }

    // The TSC frequency CPUID leaf should not be included when running with HyperV emulation
    if !kvm_hyperv {
        if let Some(tsc_khz) = vcpu.tsc_khz().map_err(Error::GetTscFrequency)? {
            // Need to check that the TSC doesn't vary with dynamic frequency
            // SAFETY: cpuid called with valid leaves
            if unsafe { std::arch::x86_64::__cpuid(0x8000_0007) }.edx
                & (1u32 << INVARIANT_TSC_EDX_BIT)
                > 0
            {
                // Raise the maximum KVM leaf so the guest knows 0x4000_0010
                // (TSC/APIC frequency) is present.
                CpuidPatch::set_cpuid_reg(
                    &mut cpuid,
                    0x4000_0000,
                    None,
                    CpuidReg::EAX,
                    0x4000_0010,
                );
                // Drop any stale frequency leaf before pushing the fresh one.
                cpuid.retain(|c| c.function != 0x4000_0010);
                cpuid.push(CpuIdEntry {
                    function: 0x4000_0010,
                    eax: tsc_khz,
                    ebx: 1000000, /* LAPIC resolution of 1ns (freq: 1GHz) is hardcoded in KVM's
                                   * APIC_BUS_CYCLE_NS */
                    ..Default::default()
                });
            };
        }
    }

    for c in &cpuid {
        debug!("{}", c);
    }

    vcpu.set_cpuid2(&cpuid)
        .map_err(|e| Error::SetSupportedCpusFailed(e.into()))?;

    if kvm_hyperv {
        vcpu.enable_hyperv_synic().unwrap();
    }

    regs::setup_msrs(vcpu).map_err(Error::MsrsConfiguration)?;
    if let Some((kernel_entry_point, guest_memory)) = boot_setup {
        regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?;
        regs::setup_fpu(vcpu).map_err(Error::FpuConfiguration)?;
        regs::setup_sregs(&guest_memory.memory(), vcpu).map_err(Error::SregsConfiguration)?;
    }
    interrupts::set_lint(vcpu).map_err(|e| Error::LocalIntConfiguration(e.into()))?;
    Ok(())
}
904 
905 /// Returns a Vec of the valid memory addresses.
906 ///
907 /// These should be used to configure the GuestMemory structure for the platform.
908 /// For x86_64 all addresses are valid from the start of the kernel except a
909 /// carve out at the end of 32bit address space.
910 pub fn arch_memory_regions() -> Vec<(GuestAddress, usize, RegionType)> {
911     vec![
912         // 0 GiB ~ 3GiB: memory before the gap
913         (
914             GuestAddress(0),
915             layout::MEM_32BIT_RESERVED_START.raw_value() as usize,
916             RegionType::Ram,
917         ),
918         // 4 GiB ~ inf: memory after the gap
919         (layout::RAM_64BIT_START, usize::MAX, RegionType::Ram),
920         // 3 GiB ~ 3712 MiB: 32-bit device memory hole
921         (
922             layout::MEM_32BIT_RESERVED_START,
923             layout::MEM_32BIT_DEVICES_SIZE as usize,
924             RegionType::SubRegion,
925         ),
926         // 3712 MiB ~ 3968 MiB: 32-bit reserved memory hole
927         (
928             layout::MEM_32BIT_RESERVED_START.unchecked_add(layout::MEM_32BIT_DEVICES_SIZE),
929             (layout::MEM_32BIT_RESERVED_SIZE - layout::MEM_32BIT_DEVICES_SIZE) as usize,
930             RegionType::Reserved,
931         ),
932     ]
933 }
934 
935 /// Configures the system and should be called once per vm before starting vcpu threads.
936 ///
937 /// # Arguments
938 ///
939 /// * `guest_mem` - The memory to be used by the guest.
940 /// * `cmdline_addr` - Address in `guest_mem` where the kernel command line was loaded.
941 /// * `cmdline_size` - Size of the kernel command line in bytes including the null terminator.
942 /// * `num_cpus` - Number of virtual CPUs the guest will have.
943 #[allow(clippy::too_many_arguments)]
944 pub fn configure_system(
945     guest_mem: &GuestMemoryMmap,
946     cmdline_addr: GuestAddress,
947     cmdline_size: usize,
948     initramfs: &Option<InitramfsConfig>,
949     _num_cpus: u8,
950     setup_header: Option<setup_header>,
951     rsdp_addr: Option<GuestAddress>,
952     sgx_epc_region: Option<SgxEpcRegion>,
953     serial_number: Option<&str>,
954     uuid: Option<&str>,
955     oem_strings: Option<&[&str]>,
956     topology: Option<(u8, u8, u8)>,
957 ) -> super::Result<()> {
958     // Write EBDA address to location where ACPICA expects to find it
959     guest_mem
960         .write_obj((layout::EBDA_START.0 >> 4) as u16, layout::EBDA_POINTER)
961         .map_err(Error::EbdaSetup)?;
962 
963     let size = smbios::setup_smbios(guest_mem, serial_number, uuid, oem_strings)
964         .map_err(Error::SmbiosSetup)?;
965 
966     // Place the MP table after the SMIOS table aligned to 16 bytes
967     let offset = GuestAddress(layout::SMBIOS_START).unchecked_add(size);
968     let offset = GuestAddress((offset.0 + 16) & !0xf);
969     mptable::setup_mptable(offset, guest_mem, _num_cpus, topology).map_err(Error::MpTableSetup)?;
970 
971     // Check that the RAM is not smaller than the RSDP start address
972     if let Some(rsdp_addr) = rsdp_addr {
973         if rsdp_addr.0 > guest_mem.last_addr().0 {
974             return Err(super::Error::RsdpPastRamEnd);
975         }
976     }
977 
978     match setup_header {
979         Some(hdr) => configure_32bit_entry(
980             guest_mem,
981             cmdline_addr,
982             cmdline_size,
983             initramfs,
984             hdr,
985             rsdp_addr,
986             sgx_epc_region,
987         ),
988         None => configure_pvh(
989             guest_mem,
990             cmdline_addr,
991             initramfs,
992             rsdp_addr,
993             sgx_epc_region,
994         ),
995     }
996 }
997 
// A half-open usable guest physical range: (start, end).
type RamRange = (u64, u64);

/// Returns usable physical memory ranges for the guest
/// These should be used to create e820_RAM memory maps
pub fn generate_ram_ranges(guest_mem: &GuestMemoryMmap) -> super::Result<Vec<RamRange>> {
    // Merge continuous memory regions into one region.
    // Note: memory regions from "GuestMemory" are sorted and non-zero sized.
    let ram_regions = {
        let mut ram_regions = Vec::new();
        let mut current_start = guest_mem
            .iter()
            .next()
            .map(GuestMemoryRegion::start_addr)
            .expect("GuestMemory must have one memory region at least")
            .raw_value();
        let mut current_end = current_start;

        for (start, size) in guest_mem
            .iter()
            .map(|m| (m.start_addr().raw_value(), m.len()))
        {
            // On the first iteration current_end == first region start, so the
            // first region is absorbed into the open range here.
            if current_end == start {
                // This zone is continuous with the previous one.
                current_end += size;
            } else {
                // Gap found: close the current range and start a new one.
                ram_regions.push((current_start, current_end));

                current_start = start;
                current_end = start + size;
            }
        }

        // Close the last (still open) range.
        ram_regions.push((current_start, current_end));

        ram_regions
    };

    // Create the memory map entry for memory region before the gap
    let mut ram_ranges = vec![];

    // Generate the first usable physical memory range before the gap. The e820 map
    // should only report memory above 1MiB.
    let first_ram_range = {
        let (first_region_start, first_region_end) =
            ram_regions.first().ok_or(super::Error::MemmapTableSetup)?;
        let high_ram_start = layout::HIGH_RAM_START.raw_value();
        let mem_32bit_reserved_start = layout::MEM_32BIT_RESERVED_START.raw_value();

        // The first merged region must straddle HIGH_RAM_START and end no
        // later than the 32-bit reserved hole; anything else is unsupported.
        if !((first_region_start <= &high_ram_start)
            && (first_region_end > &high_ram_start)
            && (first_region_end <= &mem_32bit_reserved_start))
        {
            error!(
                "Unexpected first memory region layout: (start: 0x{:08x}, end: 0x{:08x}).
                high_ram_start: 0x{:08x}, mem_32bit_reserved_start: 0x{:08x}",
                first_region_start, first_region_end, high_ram_start, mem_32bit_reserved_start
            );

            return Err(super::Error::MemmapTableSetup);
        }

        info!(
            "first usable physical memory range, start: 0x{:08x}, end: 0x{:08x}",
            high_ram_start, first_region_end
        );

        // Clamp the start to HIGH_RAM_START so memory below 1 MiB is excluded.
        (high_ram_start, *first_region_end)
    };
    ram_ranges.push(first_ram_range);

    // Generate additional usable physical memory range after the gap if any.
    for ram_region in ram_regions.iter().skip(1) {
        info!(
            "found usable physical memory range, start: 0x{:08x}, end: 0x{:08x}",
            ram_region.0, ram_region.1
        );

        ram_ranges.push(*ram_region);
    }

    Ok(ram_ranges)
}
1080 
/// Writes the PVH boot data into guest memory: the `hvm_start_info` struct,
/// an optional initramfs module list entry, and the e820-style memory map.
///
/// The guest's %rbx is expected to point at `PVH_INFO_START` on entry, per
/// the PVH boot ABI (see the comment near the end of this function).
fn configure_pvh(
    guest_mem: &GuestMemoryMmap,
    cmdline_addr: GuestAddress,
    initramfs: &Option<InitramfsConfig>,
    rsdp_addr: Option<GuestAddress>,
    sgx_epc_region: Option<SgxEpcRegion>,
) -> super::Result<()> {
    // Magic value ("xEn3" with the 0x80 bit of the "E" set) identifying the
    // hvm_start_info struct to the guest.
    const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336ec578;

    let mut start_info = hvm_start_info {
        magic: XEN_HVM_START_MAGIC_VALUE,
        version: 1, // pvh has version 1
        nr_modules: 0,
        cmdline_paddr: cmdline_addr.raw_value(),
        memmap_paddr: layout::MEMMAP_START.raw_value(),
        ..Default::default()
    };

    if let Some(rsdp_addr) = rsdp_addr {
        start_info.rsdp_paddr = rsdp_addr.0;
    }

    if let Some(initramfs_config) = initramfs {
        // The initramfs has been written to guest memory already, here we just need to
        // create the module structure that describes it.
        let ramdisk_mod = hvm_modlist_entry {
            paddr: initramfs_config.address.raw_value(),
            size: initramfs_config.size as u64,
            ..Default::default()
        };

        start_info.nr_modules += 1;
        start_info.modlist_paddr = layout::MODLIST_START.raw_value();

        // Write the modlist struct to guest memory.
        guest_mem
            .write_obj(ramdisk_mod, layout::MODLIST_START)
            .map_err(super::Error::ModlistSetup)?;
    }

    // Vector to hold the memory maps which needs to be written to guest memory
    // at MEMMAP_START after all of the mappings are recorded.
    let mut memmap: Vec<hvm_memmap_table_entry> = Vec::new();

    // Create the memory map entries.
    add_memmap_entry(&mut memmap, 0, layout::EBDA_START.raw_value(), E820_RAM);

    // Get usable physical memory ranges
    let ram_ranges = generate_ram_ranges(guest_mem)?;

    // Create e820 memory map entries
    for ram_range in ram_ranges {
        info!(
            "create_memmap_entry, start: 0x{:08x}, end: 0x{:08x}",
            ram_range.0, ram_range.1
        );
        add_memmap_entry(
            &mut memmap,
            ram_range.0,
            ram_range.1 - ram_range.0,
            E820_RAM,
        );
    }

    // Reserve the PCI MMCONFIG (ECAM) window.
    add_memmap_entry(
        &mut memmap,
        layout::PCI_MMCONFIG_START.0,
        layout::PCI_MMCONFIG_SIZE,
        E820_RESERVED,
    );

    if let Some(sgx_epc_region) = sgx_epc_region {
        add_memmap_entry(
            &mut memmap,
            sgx_epc_region.start().raw_value(),
            sgx_epc_region.size(),
            E820_RESERVED,
        );
    }

    start_info.memmap_entries = memmap.len() as u32;

    // Copy the vector with the memmap table to the MEMMAP_START address
    // which is already saved in the memmap_paddr field of hvm_start_info struct.
    let mut memmap_start_addr = layout::MEMMAP_START;

    // Verify the whole table fits in guest memory before writing any entry.
    guest_mem
        .checked_offset(
            memmap_start_addr,
            mem::size_of::<hvm_memmap_table_entry>() * start_info.memmap_entries as usize,
        )
        .ok_or(super::Error::MemmapTablePastRamEnd)?;

    // For every entry in the memmap vector, write it to guest memory.
    for memmap_entry in memmap {
        guest_mem
            .write_obj(memmap_entry, memmap_start_addr)
            .map_err(|_| super::Error::MemmapTableSetup)?;
        memmap_start_addr =
            memmap_start_addr.unchecked_add(mem::size_of::<hvm_memmap_table_entry>() as u64);
    }

    // The hvm_start_info struct itself must be stored at PVH_START_INFO
    // address, and %rbx will be initialized to contain PVH_INFO_START prior to
    // starting the guest, as required by the PVH ABI.
    let start_info_addr = layout::PVH_INFO_START;

    guest_mem
        .checked_offset(start_info_addr, mem::size_of::<hvm_start_info>())
        .ok_or(super::Error::StartInfoPastRamEnd)?;

    // Write the start_info struct to guest memory.
    guest_mem
        .write_obj(start_info, start_info_addr)
        .map_err(|_| super::Error::StartInfoSetup)?;

    Ok(())
}
1199 
/// Fills out and writes the `boot_params` "zero page" for the 32-bit Linux
/// boot protocol, reusing the setup header read from the kernel image.
///
/// Builds the e820 map around the 32-bit memory hole and stores the result at
/// `layout::ZERO_PAGE_START`.
fn configure_32bit_entry(
    guest_mem: &GuestMemoryMmap,
    cmdline_addr: GuestAddress,
    cmdline_size: usize,
    initramfs: &Option<InitramfsConfig>,
    setup_hdr: setup_header,
    rsdp_addr: Option<GuestAddress>,
    sgx_epc_region: Option<SgxEpcRegion>,
) -> super::Result<()> {
    // "Undefined" bootloader ID, per the Linux boot protocol.
    const KERNEL_LOADER_OTHER: u8 = 0xff;

    // Use the provided setup header
    let mut params = boot_params {
        hdr: setup_hdr,
        ..Default::default()
    };

    // Common bootparams settings
    if params.hdr.type_of_loader == 0 {
        params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
    }
    params.hdr.cmd_line_ptr = cmdline_addr.raw_value() as u32;
    params.hdr.cmdline_size = cmdline_size as u32;

    if let Some(initramfs_config) = initramfs {
        params.hdr.ramdisk_image = initramfs_config.address.raw_value() as u32;
        params.hdr.ramdisk_size = initramfs_config.size as u32;
    }

    // RAM below the EBDA.
    add_e820_entry(&mut params, 0, layout::EBDA_START.raw_value(), E820_RAM)?;

    let mem_end = guest_mem.last_addr();
    if mem_end < layout::MEM_32BIT_RESERVED_START {
        // All RAM fits below the 32-bit hole.
        add_e820_entry(
            &mut params,
            layout::HIGH_RAM_START.raw_value(),
            mem_end.unchecked_offset_from(layout::HIGH_RAM_START) + 1,
            E820_RAM,
        )?;
    } else {
        // RAM up to the start of the 32-bit hole ...
        add_e820_entry(
            &mut params,
            layout::HIGH_RAM_START.raw_value(),
            layout::MEM_32BIT_RESERVED_START.unchecked_offset_from(layout::HIGH_RAM_START),
            E820_RAM,
        )?;
        // ... plus any RAM above 4 GiB.
        if mem_end > layout::RAM_64BIT_START {
            add_e820_entry(
                &mut params,
                layout::RAM_64BIT_START.raw_value(),
                mem_end.unchecked_offset_from(layout::RAM_64BIT_START) + 1,
                E820_RAM,
            )?;
        }
    }

    // Reserve the PCI MMCONFIG (ECAM) window.
    add_e820_entry(
        &mut params,
        layout::PCI_MMCONFIG_START.0,
        layout::PCI_MMCONFIG_SIZE,
        E820_RESERVED,
    )?;

    if let Some(sgx_epc_region) = sgx_epc_region {
        add_e820_entry(
            &mut params,
            sgx_epc_region.start().raw_value(),
            sgx_epc_region.size(),
            E820_RESERVED,
        )?;
    }

    if let Some(rsdp_addr) = rsdp_addr {
        params.acpi_rsdp_addr = rsdp_addr.0;
    }

    // Verify the zero page fits in guest memory, then write it.
    let zero_page_addr = layout::ZERO_PAGE_START;
    guest_mem
        .checked_offset(zero_page_addr, mem::size_of::<boot_params>())
        .ok_or(super::Error::ZeroPagePastRamEnd)?;
    guest_mem
        .write_obj(params, zero_page_addr)
        .map_err(super::Error::ZeroPageSetup)?;

    Ok(())
}
1286 
1287 /// Add an e820 region to the e820 map.
1288 /// Returns Ok(()) if successful, or an error if there is no space left in the map.
1289 fn add_e820_entry(
1290     params: &mut boot_params,
1291     addr: u64,
1292     size: u64,
1293     mem_type: u32,
1294 ) -> Result<(), Error> {
1295     if params.e820_entries >= params.e820_table.len() as u8 {
1296         return Err(Error::E820Configuration);
1297     }
1298 
1299     params.e820_table[params.e820_entries as usize].addr = addr;
1300     params.e820_table[params.e820_entries as usize].size = size;
1301     params.e820_table[params.e820_entries as usize].type_ = mem_type;
1302     params.e820_entries += 1;
1303 
1304     Ok(())
1305 }
1306 
1307 fn add_memmap_entry(memmap: &mut Vec<hvm_memmap_table_entry>, addr: u64, size: u64, mem_type: u32) {
1308     // Add the table entry to the vector
1309     memmap.push(hvm_memmap_table_entry {
1310         addr,
1311         size,
1312         type_: mem_type,
1313         reserved: 0,
1314     });
1315 }
1316 
1317 /// Returns the memory address where the initramfs could be loaded.
1318 pub fn initramfs_load_addr(
1319     guest_mem: &GuestMemoryMmap,
1320     initramfs_size: usize,
1321 ) -> super::Result<u64> {
1322     let first_region = guest_mem
1323         .find_region(GuestAddress::new(0))
1324         .ok_or(super::Error::InitramfsAddress)?;
1325     // It's safe to cast to usize because the size of a region can't be greater than usize.
1326     let lowmem_size = first_region.len() as usize;
1327 
1328     if lowmem_size < initramfs_size {
1329         return Err(super::Error::InitramfsAddress);
1330     }
1331 
1332     let aligned_addr: u64 = ((lowmem_size - initramfs_size) & !(crate::pagesize() - 1)) as u64;
1333     Ok(aligned_addr)
1334 }
1335 
/// Returns the number of physical address bits supported by the host CPU,
/// minus any bits made reserved by AMD SME when that feature is enabled.
pub fn get_host_cpu_phys_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>) -> u8 {
    // SAFETY: call cpuid with valid leaves
    unsafe {
        // Leaf 0x8000_0000 EAX reports the maximum extended leaf available.
        let leaf = x86_64::__cpuid(0x8000_0000);

        // Detect and handle AMD SME (Secure Memory Encryption) properly.
        // Some physical address bits may become reserved when the feature is enabled.
        // See AMD64 Architecture Programmer's Manual Volume 2, Section 7.10.1
        let reduced = if leaf.eax >= 0x8000_001f
            && matches!(hypervisor.get_cpu_vendor(), CpuVendor::AMD)
            && x86_64::__cpuid(0x8000_001f).eax & 0x1 != 0
        {
            // Leaf 0x8000_001f EBX[11:6] = physical address bit reduction.
            (x86_64::__cpuid(0x8000_001f).ebx >> 6) & 0x3f
        } else {
            0
        };

        if leaf.eax >= 0x8000_0008 {
            // Leaf 0x8000_0008 EAX[7:0] = physical address bits.
            let leaf = x86_64::__cpuid(0x8000_0008);
            ((leaf.eax & 0xff) - reduced) as u8
        } else {
            // Leaf unavailable on very old CPUs: use a conservative default.
            36
        }
    }
}
1361 
/// Patches the topology-related CPUID leaves (0x1, 0xb, 0x1f and, on AMD,
/// 0x8000_0001 / 0x8000_0008 / 0x8000_001e) for one vCPU so the guest sees
/// the requested threads/cores/dies layout.
fn update_cpuid_topology(
    cpuid: &mut Vec<CpuIdEntry>,
    threads_per_core: u8,
    cores_per_die: u8,
    dies_per_package: u8,
    cpu_vendor: CpuVendor,
    id: u8,
) {
    let x2apic_id = get_x2apic_id(
        id as u32,
        Some((threads_per_core, cores_per_die, dies_per_package)),
    );

    // Number of APIC-ID bits consumed by each topology level
    // (ceil(log2(count)) via leading_zeros on a u8), cumulative.
    let thread_width = 8 - (threads_per_core - 1).leading_zeros();
    let core_width = (8 - (cores_per_die - 1).leading_zeros()) + thread_width;
    let die_width = (8 - (dies_per_package - 1).leading_zeros()) + core_width;

    let mut cpu_ebx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX).unwrap_or(0);
    // NOTE(review): the logical-processor count is masked with (0xff << 16)
    // but never shifted into EBX[23:16], so for counts <= 255 this OR is a
    // no-op. Presumably the intent was `(count << 16) & (0xff << 16)` (cf. the
    // shifted form used in the AMD branch below) — verify before changing, as
    // this alters guest-visible CPUID.
    cpu_ebx |= ((dies_per_package as u32) * (cores_per_die as u32) * (threads_per_core as u32))
        & (0xff << 16);
    CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX, cpu_ebx);

    // Leaf 0x1 EDX bit 28: HTT (multiple logical processors per package).
    let mut cpu_edx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX).unwrap_or(0);
    cpu_edx |= 1 << 28;
    CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX, cpu_edx);

    // CPU Topology leaf 0xb
    // Sub-leaf 0: SMT level — EAX = APIC-ID shift, EBX = logical CPUs at this
    // level, ECX[15:8] = level type (1 = SMT).
    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::EAX, thread_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0xb,
        Some(0),
        CpuidReg::EBX,
        u32::from(threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::ECX, 1 << 8);

    // Sub-leaf 1: core level (level type 2), covering the whole package.
    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::EAX, die_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0xb,
        Some(1),
        CpuidReg::EBX,
        u32::from(dies_per_package * cores_per_die * threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::ECX, 2 << 8);
    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::EDX, x2apic_id);

    // CPU Topology leaf 0x1f
    // Sub-leaf 0: SMT level (level type 1).
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::EAX, thread_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0x1f,
        Some(0),
        CpuidReg::EBX,
        u32::from(threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::ECX, 1 << 8);

    // Sub-leaf 1: core level (level type 2).
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::EAX, core_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0x1f,
        Some(1),
        CpuidReg::EBX,
        u32::from(cores_per_die * threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::ECX, 2 << 8);

    // Sub-leaf 2: die level (level type 5).
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::EAX, die_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0x1f,
        Some(2),
        CpuidReg::EBX,
        u32::from(dies_per_package * cores_per_die * threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::ECX, 5 << 8);

    if matches!(cpu_vendor, CpuVendor::AMD) {
        // Leaf 0x8000_001e: EBX[15:8] = threads per core - 1,
        // EBX[7:0] = core (compute unit) ID derived from the APIC ID.
        CpuidPatch::set_cpuid_reg(
            cpuid,
            0x8000_001e,
            Some(0),
            CpuidReg::EBX,
            ((threads_per_core as u32 - 1) << 8) | (x2apic_id & 0xff),
        );
        CpuidPatch::set_cpuid_reg(
            cpuid,
            0x8000_001e,
            Some(0),
            CpuidReg::ECX,
            ((dies_per_package as u32 - 1) << 8) | (thread_width + die_width) & 0xff,
        );
        CpuidPatch::set_cpuid_reg(cpuid, 0x8000_001e, Some(0), CpuidReg::EDX, 0);
        if cores_per_die * threads_per_core > 1 {
            // Multi-core package: advertise CmpLegacy (bit 1) and TopoExt
            // (bit 22) in leaf 0x8000_0001 ECX.
            let ecx =
                CpuidPatch::get_cpuid_reg(cpuid, 0x8000_0001, Some(0), CpuidReg::ECX).unwrap_or(0);
            CpuidPatch::set_cpuid_reg(
                cpuid,
                0x8000_0001,
                Some(0),
                CpuidReg::ECX,
                ecx | (1u32 << 1) | (1u32 << 22),
            );
            // Leaf 0x1 EBX: APIC ID [31:24], CLFLUSH size [15:8] (8 qwords),
            // logical processor count [23:16].
            CpuidPatch::set_cpuid_reg(
                cpuid,
                0x0000_0001,
                Some(0),
                CpuidReg::EBX,
                (x2apic_id << 24) | (8 << 8) | (((cores_per_die * threads_per_core) as u32) << 16),
            );
            let cpuid_patches = vec![
                // Patch tsc deadline timer bit
                CpuidPatch {
                    function: 1,
                    index: 0,
                    flags_bit: None,
                    eax_bit: None,
                    ebx_bit: None,
                    ecx_bit: None,
                    edx_bit: Some(28),
                },
            ];
            CpuidPatch::patch_cpuid(cpuid, cpuid_patches);
            // Leaf 0x8000_0008 ECX: ApicIdSize [15:12], threads per package - 1
            // in the low bits.
            CpuidPatch::set_cpuid_reg(
                cpuid,
                0x8000_0008,
                Some(0),
                CpuidReg::ECX,
                ((thread_width + core_width + die_width) << 12)
                    | ((cores_per_die * threads_per_core) - 1) as u32,
            );
        } else {
            CpuidPatch::set_cpuid_reg(cpuid, 0x8000_0008, Some(0), CpuidReg::ECX, 0u32);
        }
    }
}
1500 
/// Updates the CPUID 0x12 sub-leaves to reflect the number of EPC sections
/// exposed to the guest.
///
/// Sub-leaves 2.. describe one EPC section each; a zeroed sub-leaf terminates
/// the list.
///
/// # Errors
/// Fails when `epc_sections` is empty or when the hypervisor CPUID lacks the
/// SGX (leaf 0x7 EBX bit 2) or SGX_LC (leaf 0x7 ECX bit 30) feature bits.
fn update_cpuid_sgx(
    cpuid: &mut Vec<CpuIdEntry>,
    epc_sections: &[SgxEpcSection],
) -> Result<(), Error> {
    // Something's wrong if there's no EPC section.
    if epc_sections.is_empty() {
        return Err(Error::NoSgxEpcSection);
    }
    // We can't go further if the hypervisor does not support SGX feature.
    if !CpuidPatch::is_feature_enabled(cpuid, 0x7, 0, CpuidReg::EBX, 2) {
        return Err(Error::MissingSgxFeature);
    }
    // We can't go further if the hypervisor does not support SGX_LC feature.
    if !CpuidPatch::is_feature_enabled(cpuid, 0x7, 0, CpuidReg::ECX, 30) {
        return Err(Error::MissingSgxLaunchControlFeature);
    }

    // Get host CPUID for leaf 0x12, subleaf 0x2. This is to retrieve EPC
    // properties such as confidentiality and integrity.
    // SAFETY: call cpuid with valid leaves
    let leaf = unsafe { std::arch::x86_64::__cpuid_count(0x12, 0x2) };

    for (i, epc_section) in epc_sections.iter().enumerate() {
        // EPC sections start at sub-leaf 2 (0 and 1 describe SGX capabilities).
        let subleaf_idx = i + 2;
        let start = epc_section.start().raw_value();
        let size = epc_section.size();
        // EAX/EBX: section base (low bits page-aligned, type 1 = EPC in EAX).
        let eax = (start & 0xffff_f000) as u32 | 0x1;
        let ebx = (start >> 32) as u32;
        // ECX/EDX: section size, with the host's EPC property bits preserved.
        let ecx = (size & 0xffff_f000) as u32 | (leaf.ecx & 0xf);
        let edx = (size >> 32) as u32;
        // CPU Topology leaf 0x12
        CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EAX, eax);
        CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EBX, ebx);
        CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::ECX, ecx);
        CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EDX, edx);
    }

    // Add one NULL entry to terminate the dynamic list
    let subleaf_idx = epc_sections.len() + 2;
    // CPU Topology leaf 0x12
    CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EAX, 0);
    CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EBX, 0);
    CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::ECX, 0);
    CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EDX, 0);

    Ok(())
}
1550 
1551 #[cfg(test)]
1552 mod tests {
1553     use linux_loader::loader::bootparam::boot_e820_entry;
1554 
1555     use super::*;
1556 
    #[test]
    fn regions_base_addr() {
        // arch_memory_regions() returns exactly four regions, with RAM at 0
        // first and the above-4GiB RAM region second.
        let regions = arch_memory_regions();
        assert_eq!(4, regions.len());
        assert_eq!(GuestAddress(0), regions[0].0);
        assert_eq!(GuestAddress(1 << 32), regions[1].0);
    }
1564 
    #[test]
    fn test_system_configuration() {
        let no_vcpus = 4;
        // 64 KiB of RAM is too small for the boot structures, so this first
        // configuration attempt must fail.
        let gm = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap();
        let config_err = configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            1,
            None,
            Some(layout::RSDP_POINTER),
            None,
            None,
            None,
            None,
            None,
        );
        config_err.unwrap_err();

        // Now assigning some memory that falls before the 32bit memory hole.
        // Keep only finite RAM regions (drops the open-ended 64-bit region).
        let arch_mem_regions = arch_memory_regions();
        let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
            .iter()
            .filter(|r| r.2 == RegionType::Ram && r.1 != usize::MAX)
            .map(|r| (r.0, r.1))
            .collect();
        let gm = GuestMemoryMmap::from_ranges(&ram_regions).unwrap();

        configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            no_vcpus,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();

        // Now assigning some memory that falls after the 32bit memory hole.
        let arch_mem_regions = arch_memory_regions();
        let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
            .iter()
            .filter(|r| r.2 == RegionType::Ram)
            .map(|r| {
                if r.1 == usize::MAX {
                    // Cap the open-ended 64-bit region to 128 MiB for the test.
                    (r.0, 128 << 20)
                } else {
                    (r.0, r.1)
                }
            })
            .collect();
        let gm = GuestMemoryMmap::from_ranges(&ram_regions).unwrap();
        configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            no_vcpus,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();

        // NOTE(review): this call is identical to the previous one and looks
        // redundant — confirm whether a different configuration was intended.
        configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            no_vcpus,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();
    }
1656 
    #[test]
    fn test_add_e820_entry() {
        let e820_table = [(boot_e820_entry {
            addr: 0x1,
            size: 4,
            type_: 1,
        }); 128];

        let expected_params = boot_params {
            e820_table,
            e820_entries: 1,
            ..Default::default()
        };

        // Adding one entry to an empty table must store it at index 0 and
        // bump the entry count to 1.
        let mut params: boot_params = Default::default();
        add_e820_entry(
            &mut params,
            e820_table[0].addr,
            e820_table[0].size,
            e820_table[0].type_,
        )
        .unwrap();
        assert_eq!(
            format!("{:?}", params.e820_table[0]),
            format!("{:?}", expected_params.e820_table[0])
        );
        assert_eq!(params.e820_entries, expected_params.e820_entries);

        // Exercise the scenario where the field storing the length of the e820 entry table
        // is bigger than the allocated memory.
        params.e820_entries = params.e820_table.len() as u8 + 1;
        add_e820_entry(
            &mut params,
            e820_table[0].addr,
            e820_table[0].size,
            e820_table[0].type_,
        )
        .unwrap_err();
    }
1696 
1697     #[test]
1698     fn test_add_memmap_entry() {
1699         let mut memmap: Vec<hvm_memmap_table_entry> = Vec::new();
1700 
1701         let expected_memmap = vec![
1702             hvm_memmap_table_entry {
1703                 addr: 0x0,
1704                 size: 0x1000,
1705                 type_: E820_RAM,
1706                 ..Default::default()
1707             },
1708             hvm_memmap_table_entry {
1709                 addr: 0x10000,
1710                 size: 0xa000,
1711                 type_: E820_RESERVED,
1712                 ..Default::default()
1713             },
1714         ];
1715 
1716         add_memmap_entry(&mut memmap, 0, 0x1000, E820_RAM);
1717         add_memmap_entry(&mut memmap, 0x10000, 0xa000, E820_RESERVED);
1718 
1719         assert_eq!(format!("{memmap:?}"), format!("{expected_memmap:?}"));
1720     }
1721 
1722     #[test]
1723     fn test_get_x2apic_id() {
1724         let x2apic_id = get_x2apic_id(0, Some((2, 3, 1)));
1725         assert_eq!(x2apic_id, 0);
1726 
1727         let x2apic_id = get_x2apic_id(1, Some((2, 3, 1)));
1728         assert_eq!(x2apic_id, 1);
1729 
1730         let x2apic_id = get_x2apic_id(2, Some((2, 3, 1)));
1731         assert_eq!(x2apic_id, 2);
1732 
1733         let x2apic_id = get_x2apic_id(6, Some((2, 3, 1)));
1734         assert_eq!(x2apic_id, 8);
1735 
1736         let x2apic_id = get_x2apic_id(7, Some((2, 3, 1)));
1737         assert_eq!(x2apic_id, 9);
1738 
1739         let x2apic_id = get_x2apic_id(8, Some((2, 3, 1)));
1740         assert_eq!(x2apic_id, 10);
1741     }
1742 }
1743