xref: /cloud-hypervisor/arch/src/x86_64/mod.rs (revision 61e57e1cb149de03ae1e0b799b9e5ba9a4a63ace)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 // SPDX-License-Identifier: Apache-2.0
5 //
6 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE-BSD-3-Clause file.
9 use std::sync::Arc;
10 pub mod interrupts;
11 pub mod layout;
12 mod mpspec;
13 mod mptable;
14 pub mod regs;
15 use std::collections::BTreeMap;
16 use std::mem;
17 
18 use hypervisor::arch::x86::{CpuIdEntry, CPUID_FLAG_VALID_INDEX};
19 use hypervisor::{CpuVendor, HypervisorCpuError, HypervisorError};
20 use linux_loader::loader::bootparam::{boot_params, setup_header};
21 use linux_loader::loader::elf::start_info::{
22     hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info,
23 };
24 use thiserror::Error;
25 use vm_memory::{
26     Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
27     GuestMemoryRegion, GuestUsize,
28 };
29 
30 use crate::{GuestMemoryMmap, InitramfsConfig, RegionType};
31 mod smbios;
32 use std::arch::x86_64;
33 #[cfg(feature = "tdx")]
34 pub mod tdx;
35 
// CPUID feature bits (bit positions within the named register of the
// indicated leaf).
#[cfg(feature = "kvm")]
const TSC_DEADLINE_TIMER_ECX_BIT: u8 = 24; // TSC deadline timer, leaf 0x1 ECX.
const HYPERVISOR_ECX_BIT: u8 = 31; // Hypervisor present, leaf 0x1 ECX.
const MTRR_EDX_BIT: u8 = 12; // Memory Type Range Registers, leaf 0x1 EDX.
const INVARIANT_TSC_EDX_BIT: u8 = 8; // Invariant TSC bit on 0x8000_0007 EDX
const AMX_BF16: u8 = 22; // AMX tile computation on bfloat16 numbers
const AMX_TILE: u8 = 24; // AMX tile load/store instructions
const AMX_INT8: u8 = 25; // AMX tile computation on 8-bit integers

// KVM feature bits (leaf 0x4000_0001 EAX). Only needed here to mask out
// features that are not supported under TDX.
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE_BIT: u8 = 0;
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE2_BIT: u8 = 3;
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE_STABLE_BIT: u8 = 24;
#[cfg(feature = "tdx")]
const KVM_FEATURE_ASYNC_PF_BIT: u8 = 4;
#[cfg(feature = "tdx")]
const KVM_FEATURE_ASYNC_PF_VMEXIT_BIT: u8 = 10;
#[cfg(feature = "tdx")]
const KVM_FEATURE_STEAL_TIME_BIT: u8 = 5;

// Number of supported signals; presumably mirrors Linux's _NSIG —
// TODO(review): confirm against consumers of this constant.
pub const _NSIG: i32 = 65;
61 
/// Specifies the entry point address where the guest must start
/// executing code, as well as which of the supported boot protocols
/// is to be used to configure the guest initial state.
#[derive(Debug, Copy, Clone)]
pub struct EntryPoint {
    /// Address in guest memory where the guest must start execution
    pub entry_addr: GuestAddress,
    /// This field is used for bzImage to fill the zero page;
    /// `None` for boot protocols that do not provide a setup header.
    pub setup_header: Option<setup_header>,
}
72 
// E820 memory map entry types.
const E820_RAM: u32 = 1; // Usable RAM.
const E820_RESERVED: u32 = 2; // Reserved, not available to the guest OS.
75 
/// A single SGX Enclave Page Cache (EPC) section in guest memory.
#[derive(Clone)]
pub struct SgxEpcSection {
    // Guest physical address where the section starts.
    start: GuestAddress,
    // Section length in bytes.
    size: GuestUsize,
}
81 
82 impl SgxEpcSection {
83     pub fn new(start: GuestAddress, size: GuestUsize) -> Self {
84         SgxEpcSection { start, size }
85     }
86     pub fn start(&self) -> GuestAddress {
87         self.start
88     }
89     pub fn size(&self) -> GuestUsize {
90         self.size
91     }
92 }
93 
/// The guest memory region holding all SGX EPC sections.
#[derive(Clone)]
pub struct SgxEpcRegion {
    // Guest physical address where the region starts.
    start: GuestAddress,
    // Total region length in bytes.
    size: GuestUsize,
    // Sections contained in this region, keyed by identifier.
    epc_sections: BTreeMap<String, SgxEpcSection>,
}
100 
101 impl SgxEpcRegion {
102     pub fn new(start: GuestAddress, size: GuestUsize) -> Self {
103         SgxEpcRegion {
104             start,
105             size,
106             epc_sections: BTreeMap::new(),
107         }
108     }
109     pub fn start(&self) -> GuestAddress {
110         self.start
111     }
112     pub fn size(&self) -> GuestUsize {
113         self.size
114     }
115     pub fn epc_sections(&self) -> &BTreeMap<String, SgxEpcSection> {
116         &self.epc_sections
117     }
118     pub fn insert(&mut self, id: String, epc_section: SgxEpcSection) {
119         self.epc_sections.insert(id, epc_section);
120     }
121 }
122 
/// Configuration inputs for `generate_common_cpuid`.
pub struct CpuidConfig {
    /// SGX EPC sections to expose to the guest, if any.
    pub sgx_epc_sections: Option<Vec<SgxEpcSection>>,
    /// Number of physical address bits advertised via leaf 0x8000_0008.
    pub phys_bits: u8,
    /// Whether to expose the KVM HyperV emulation leaves
    /// (0x4000_0000..=0x4000_000a).
    pub kvm_hyperv: bool,
    /// Whether the guest is a TDX guest.
    #[cfg(feature = "tdx")]
    pub tdx: bool,
    /// Whether to leave the AMX bits (leaf 0x7 EDX) enabled.
    pub amx: bool,
}
131 
/// x86_64-specific platform errors.
#[derive(Debug, Error)]
pub enum Error {
    /// Error writing MP table to memory.
    #[error("Error writing MP table to memory: {0}")]
    MpTableSetup(mptable::Error),

    /// Error configuring the general purpose registers
    #[error("Error configuring the general purpose registers: {0}")]
    RegsConfiguration(regs::Error),

    /// Error configuring the special registers
    #[error("Error configuring the special registers: {0}")]
    SregsConfiguration(regs::Error),

    /// Error configuring the floating point related registers
    #[error("Error configuring the floating point related registers: {0}")]
    FpuConfiguration(regs::Error),

    /// Error configuring the MSR registers
    #[error("Error configuring the MSR registers: {0}")]
    MsrsConfiguration(regs::Error),

    /// Failed to set supported CPUs.
    #[error("Failed to set supported CPUs: {0}")]
    SetSupportedCpusFailed(anyhow::Error),

    /// Cannot set the local interruption due to bad configuration.
    #[error("Cannot set the local interruption due to bad configuration: {0}")]
    LocalIntConfiguration(anyhow::Error),

    /// Error setting up SMBIOS table
    #[error("Error setting up SMBIOS table: {0}")]
    SmbiosSetup(smbios::Error),

    /// Could not find any SGX EPC section
    #[error("Could not find any SGX EPC section")]
    NoSgxEpcSection,

    /// Missing SGX CPU feature
    #[error("Missing SGX CPU feature")]
    MissingSgxFeature,

    /// Missing SGX_LC CPU feature
    #[error("Missing SGX_LC CPU feature")]
    MissingSgxLaunchControlFeature,

    /// Error getting supported CPUID through the hypervisor (kvm/mshv) API
    #[error("Error getting supported CPUID through the hypervisor API: {0}")]
    CpuidGetSupported(HypervisorError),

    /// Error populating CPUID with KVM HyperV emulation details
    #[error("Error populating CPUID with KVM HyperV emulation details: {0}")]
    CpuidKvmHyperV(vmm_sys_util::fam::Error),

    /// Error populating CPUID with CPU identification
    #[error("Error populating CPUID with CPU identification: {0}")]
    CpuidIdentification(vmm_sys_util::fam::Error),

    /// Error checking CPUID compatibility
    #[error("Error checking CPUID compatibility")]
    CpuidCheckCompatibility,

    /// Error writing EBDA address
    #[error("Error writing EBDA address: {0}")]
    EbdaSetup(vm_memory::GuestMemoryError),

    /// Error getting CPU TSC frequency
    #[error("Error getting CPU TSC frequency: {0}")]
    GetTscFrequency(HypervisorCpuError),

    /// Error retrieving TDX capabilities through the hypervisor (kvm/mshv) API
    #[cfg(feature = "tdx")]
    #[error("Error retrieving TDX capabilities through the hypervisor API: {0}")]
    TdxCapabilities(HypervisorError),

    /// Failed to configure E820 map for bzImage
    #[error("Failed to configure E820 map for bzImage")]
    E820Configuration,
}
211 
212 impl From<Error> for super::Error {
213     fn from(e: Error) -> super::Error {
214         super::Error::PlatformSpecific(e)
215     }
216 }
217 
/// Computes the x2APIC id for vCPU `cpu_id`.
///
/// When `topology` is `Some((threads_per_core, cores_per_die, dies_per_package))`,
/// the id is assembled by packing the thread, core, die and socket indices
/// into contiguous bit fields, each just wide enough to hold the maximum
/// index at that level. All topology components must be non-zero.
/// Without a topology hint the vCPU id is returned unchanged.
pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u8, u8, u8)>) -> u32 {
    if let Some(t) = topology {
        // Bits needed to encode the maximum index at each topology level.
        let thread_mask_width = u8::BITS - (t.0 - 1).leading_zeros();
        let core_mask_width = u8::BITS - (t.1 - 1).leading_zeros();
        let die_mask_width = u8::BITS - (t.2 - 1).leading_zeros();

        // Widen before multiplying: the products below can exceed u8::MAX
        // (e.g. 2 threads x 128 cores), which would overflow — and panic
        // in debug builds — if computed in u8 first.
        let threads_per_core = t.0 as u32;
        let cores_per_die = t.1 as u32;
        let dies_per_package = t.2 as u32;

        let thread_id = cpu_id % threads_per_core;
        let core_id = cpu_id / threads_per_core % cores_per_die;
        let die_id = cpu_id / (threads_per_core * cores_per_die) % dies_per_package;
        let socket_id = cpu_id / (threads_per_core * cores_per_die * dies_per_package);

        return thread_id
            | (core_id << thread_mask_width)
            | (die_id << (thread_mask_width + core_mask_width))
            | (socket_id << (thread_mask_width + core_mask_width + die_mask_width));
    }

    cpu_id
}
237 
/// Identifies one of the four CPUID output registers.
#[derive(Copy, Clone, Debug)]
pub enum CpuidReg {
    EAX,
    EBX,
    ECX,
    EDX,
}
245 
/// Describes feature bits to turn on in CPUID entries matching
/// (`function`, `index`). A `None` field leaves that register untouched.
pub struct CpuidPatch {
    /// CPUID leaf (EAX input) to patch.
    pub function: u32,
    /// CPUID subleaf (ECX input) to patch.
    pub index: u32,
    /// Bit to set in the entry's flags field, if any.
    pub flags_bit: Option<u8>,
    /// Bit to set in EAX, if any.
    pub eax_bit: Option<u8>,
    /// Bit to set in EBX, if any.
    pub ebx_bit: Option<u8>,
    /// Bit to set in ECX, if any.
    pub ecx_bit: Option<u8>,
    /// Bit to set in EDX, if any.
    pub edx_bit: Option<u8>,
}
255 
256 impl CpuidPatch {
257     pub fn get_cpuid_reg(
258         cpuid: &[CpuIdEntry],
259         function: u32,
260         index: Option<u32>,
261         reg: CpuidReg,
262     ) -> Option<u32> {
263         for entry in cpuid.iter() {
264             if entry.function == function && (index.is_none() || index.unwrap() == entry.index) {
265                 return match reg {
266                     CpuidReg::EAX => Some(entry.eax),
267                     CpuidReg::EBX => Some(entry.ebx),
268                     CpuidReg::ECX => Some(entry.ecx),
269                     CpuidReg::EDX => Some(entry.edx),
270                 };
271             }
272         }
273 
274         None
275     }
276 
277     pub fn set_cpuid_reg(
278         cpuid: &mut Vec<CpuIdEntry>,
279         function: u32,
280         index: Option<u32>,
281         reg: CpuidReg,
282         value: u32,
283     ) {
284         let mut entry_found = false;
285         for entry in cpuid.iter_mut() {
286             if entry.function == function && (index.is_none() || index.unwrap() == entry.index) {
287                 entry_found = true;
288                 match reg {
289                     CpuidReg::EAX => {
290                         entry.eax = value;
291                     }
292                     CpuidReg::EBX => {
293                         entry.ebx = value;
294                     }
295                     CpuidReg::ECX => {
296                         entry.ecx = value;
297                     }
298                     CpuidReg::EDX => {
299                         entry.edx = value;
300                     }
301                 }
302             }
303         }
304 
305         if entry_found {
306             return;
307         }
308 
309         // Entry not found, so let's add it.
310         if let Some(index) = index {
311             let mut entry = CpuIdEntry {
312                 function,
313                 index,
314                 flags: CPUID_FLAG_VALID_INDEX,
315                 ..Default::default()
316             };
317             match reg {
318                 CpuidReg::EAX => {
319                     entry.eax = value;
320                 }
321                 CpuidReg::EBX => {
322                     entry.ebx = value;
323                 }
324                 CpuidReg::ECX => {
325                     entry.ecx = value;
326                 }
327                 CpuidReg::EDX => {
328                     entry.edx = value;
329                 }
330             }
331 
332             cpuid.push(entry);
333         }
334     }
335 
336     pub fn patch_cpuid(cpuid: &mut [CpuIdEntry], patches: Vec<CpuidPatch>) {
337         for entry in cpuid {
338             for patch in patches.iter() {
339                 if entry.function == patch.function && entry.index == patch.index {
340                     if let Some(flags_bit) = patch.flags_bit {
341                         entry.flags |= 1 << flags_bit;
342                     }
343                     if let Some(eax_bit) = patch.eax_bit {
344                         entry.eax |= 1 << eax_bit;
345                     }
346                     if let Some(ebx_bit) = patch.ebx_bit {
347                         entry.ebx |= 1 << ebx_bit;
348                     }
349                     if let Some(ecx_bit) = patch.ecx_bit {
350                         entry.ecx |= 1 << ecx_bit;
351                     }
352                     if let Some(edx_bit) = patch.edx_bit {
353                         entry.edx |= 1 << edx_bit;
354                     }
355                 }
356             }
357         }
358     }
359 
360     pub fn is_feature_enabled(
361         cpuid: &[CpuIdEntry],
362         function: u32,
363         index: u32,
364         reg: CpuidReg,
365         feature_bit: usize,
366     ) -> bool {
367         let mask = 1 << feature_bit;
368 
369         for entry in cpuid {
370             if entry.function == function && entry.index == index {
371                 let reg_val = match reg {
372                     CpuidReg::EAX => entry.eax,
373                     CpuidReg::EBX => entry.ebx,
374                     CpuidReg::ECX => entry.ecx,
375                     CpuidReg::EDX => entry.edx,
376                 };
377 
378                 return (reg_val & mask) == mask;
379             }
380         }
381 
382         false
383     }
384 }
385 
/// Rule used to compare one CPUID feature value between a source and a
/// destination VM (see `CpuidFeatureEntry::check_cpuid_compatibility`).
#[derive(Debug)]
enum CpuidCompatibleCheck {
    BitwiseSubset, // bitwise subset
    Equal,         // equal in value
    NumNotGreater, // smaller or equal as a number
}
392 
/// Selects one register of one CPUID leaf/subleaf, together with the
/// rule used to compare its value across VMs.
pub struct CpuidFeatureEntry {
    // CPUID leaf (EAX input).
    function: u32,
    // CPUID subleaf (ECX input).
    index: u32,
    // Register holding the feature bits to compare.
    feature_reg: CpuidReg,
    // Comparison rule applied to this register's value.
    compatible_check: CpuidCompatibleCheck,
}
399 
400 impl CpuidFeatureEntry {
401     fn checked_feature_entry_list() -> Vec<CpuidFeatureEntry> {
402         vec![
403             // The following list includes all hardware features bits from
404             // the CPUID Wiki Page: https://en.wikipedia.org/wiki/CPUID
405             // Leaf 0x1, ECX/EDX, feature bits
406             CpuidFeatureEntry {
407                 function: 1,
408                 index: 0,
409                 feature_reg: CpuidReg::ECX,
410                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
411             },
412             CpuidFeatureEntry {
413                 function: 1,
414                 index: 0,
415                 feature_reg: CpuidReg::EDX,
416                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
417             },
418             // Leaf 0x7, EAX/EBX/ECX/EDX, extended features
419             CpuidFeatureEntry {
420                 function: 7,
421                 index: 0,
422                 feature_reg: CpuidReg::EAX,
423                 compatible_check: CpuidCompatibleCheck::NumNotGreater,
424             },
425             CpuidFeatureEntry {
426                 function: 7,
427                 index: 0,
428                 feature_reg: CpuidReg::EBX,
429                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
430             },
431             CpuidFeatureEntry {
432                 function: 7,
433                 index: 0,
434                 feature_reg: CpuidReg::ECX,
435                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
436             },
437             CpuidFeatureEntry {
438                 function: 7,
439                 index: 0,
440                 feature_reg: CpuidReg::EDX,
441                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
442             },
443             // Leaf 0x7 subleaf 0x1, EAX, extended features
444             CpuidFeatureEntry {
445                 function: 7,
446                 index: 1,
447                 feature_reg: CpuidReg::EAX,
448                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
449             },
450             // Leaf 0x8000_0001, ECX/EDX, CPUID features bits
451             CpuidFeatureEntry {
452                 function: 0x8000_0001,
453                 index: 0,
454                 feature_reg: CpuidReg::ECX,
455                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
456             },
457             CpuidFeatureEntry {
458                 function: 0x8000_0001,
459                 index: 0,
460                 feature_reg: CpuidReg::EDX,
461                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
462             },
463             // KVM CPUID bits: https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html
464             // Leaf 0x4000_0000, EAX/EBX/ECX/EDX, KVM CPUID SIGNATURE
465             CpuidFeatureEntry {
466                 function: 0x4000_0000,
467                 index: 0,
468                 feature_reg: CpuidReg::EAX,
469                 compatible_check: CpuidCompatibleCheck::NumNotGreater,
470             },
471             CpuidFeatureEntry {
472                 function: 0x4000_0000,
473                 index: 0,
474                 feature_reg: CpuidReg::EBX,
475                 compatible_check: CpuidCompatibleCheck::Equal,
476             },
477             CpuidFeatureEntry {
478                 function: 0x4000_0000,
479                 index: 0,
480                 feature_reg: CpuidReg::ECX,
481                 compatible_check: CpuidCompatibleCheck::Equal,
482             },
483             CpuidFeatureEntry {
484                 function: 0x4000_0000,
485                 index: 0,
486                 feature_reg: CpuidReg::EDX,
487                 compatible_check: CpuidCompatibleCheck::Equal,
488             },
489             // Leaf 0x4000_0001, EAX/EBX/ECX/EDX, KVM CPUID features
490             CpuidFeatureEntry {
491                 function: 0x4000_0001,
492                 index: 0,
493                 feature_reg: CpuidReg::EAX,
494                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
495             },
496             CpuidFeatureEntry {
497                 function: 0x4000_0001,
498                 index: 0,
499                 feature_reg: CpuidReg::EBX,
500                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
501             },
502             CpuidFeatureEntry {
503                 function: 0x4000_0001,
504                 index: 0,
505                 feature_reg: CpuidReg::ECX,
506                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
507             },
508             CpuidFeatureEntry {
509                 function: 0x4000_0001,
510                 index: 0,
511                 feature_reg: CpuidReg::EDX,
512                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
513             },
514         ]
515     }
516 
517     fn get_features_from_cpuid(
518         cpuid: &[CpuIdEntry],
519         feature_entry_list: &[CpuidFeatureEntry],
520     ) -> Vec<u32> {
521         let mut features = vec![0; feature_entry_list.len()];
522         for (i, feature_entry) in feature_entry_list.iter().enumerate() {
523             for cpuid_entry in cpuid {
524                 if cpuid_entry.function == feature_entry.function
525                     && cpuid_entry.index == feature_entry.index
526                 {
527                     match feature_entry.feature_reg {
528                         CpuidReg::EAX => {
529                             features[i] = cpuid_entry.eax;
530                         }
531                         CpuidReg::EBX => {
532                             features[i] = cpuid_entry.ebx;
533                         }
534                         CpuidReg::ECX => {
535                             features[i] = cpuid_entry.ecx;
536                         }
537                         CpuidReg::EDX => {
538                             features[i] = cpuid_entry.edx;
539                         }
540                     }
541 
542                     break;
543                 }
544             }
545         }
546 
547         features
548     }
549 
550     // The function returns `Error` (a.k.a. "incompatible"), when the CPUID features from `src_vm_cpuid`
551     // is not a subset of those of the `dest_vm_cpuid`.
552     pub fn check_cpuid_compatibility(
553         src_vm_cpuid: &[CpuIdEntry],
554         dest_vm_cpuid: &[CpuIdEntry],
555     ) -> Result<(), Error> {
556         let feature_entry_list = &Self::checked_feature_entry_list();
557         let src_vm_features = Self::get_features_from_cpuid(src_vm_cpuid, feature_entry_list);
558         let dest_vm_features = Self::get_features_from_cpuid(dest_vm_cpuid, feature_entry_list);
559 
560         // Loop on feature bit and check if the 'source vm' feature is a subset
561         // of those of the 'destination vm' feature
562         let mut compatible = true;
563         for (i, (src_vm_feature, dest_vm_feature)) in src_vm_features
564             .iter()
565             .zip(dest_vm_features.iter())
566             .enumerate()
567         {
568             let entry = &feature_entry_list[i];
569             let entry_compatible = match entry.compatible_check {
570                 CpuidCompatibleCheck::BitwiseSubset => {
571                     let different_feature_bits = src_vm_feature ^ dest_vm_feature;
572                     let src_vm_feature_bits_only = different_feature_bits & src_vm_feature;
573                     src_vm_feature_bits_only == 0
574                 }
575                 CpuidCompatibleCheck::Equal => src_vm_feature == dest_vm_feature,
576                 CpuidCompatibleCheck::NumNotGreater => src_vm_feature <= dest_vm_feature,
577             };
578             if !entry_compatible {
579                 error!(
580                     "Detected incompatible CPUID entry: leaf={:#02x} (subleaf={:#02x}), register='{:?}', \
581                     compatible_check='{:?}', source VM feature='{:#04x}', destination VM feature'{:#04x}'.",
582                     entry.function, entry.index, entry.feature_reg,
583                     entry.compatible_check, src_vm_feature, dest_vm_feature
584                     );
585 
586                 compatible = false;
587             }
588         }
589 
590         if compatible {
591             info!("No CPU incompatibility detected.");
592             Ok(())
593         } else {
594             Err(Error::CpuidCheckCompatibility)
595         }
596     }
597 }
598 
599 pub fn generate_common_cpuid(
600     hypervisor: &Arc<dyn hypervisor::Hypervisor>,
601     config: &CpuidConfig,
602 ) -> super::Result<Vec<CpuIdEntry>> {
603     // SAFETY: cpuid called with valid leaves
604     if unsafe { x86_64::__cpuid(1) }.ecx & 1 << HYPERVISOR_ECX_BIT == 1 << HYPERVISOR_ECX_BIT {
605         // SAFETY: cpuid called with valid leaves
606         let hypervisor_cpuid = unsafe { x86_64::__cpuid(0x4000_0000) };
607 
608         let mut identifier: [u8; 12] = [0; 12];
609         identifier[0..4].copy_from_slice(&hypervisor_cpuid.ebx.to_le_bytes()[..]);
610         identifier[4..8].copy_from_slice(&hypervisor_cpuid.ecx.to_le_bytes()[..]);
611         identifier[8..12].copy_from_slice(&hypervisor_cpuid.edx.to_le_bytes()[..]);
612 
613         info!(
614             "Running under nested virtualisation. Hypervisor string: {}",
615             String::from_utf8_lossy(&identifier)
616         );
617     }
618 
619     info!(
620         "Generating guest CPUID for with physical address size: {}",
621         config.phys_bits
622     );
623     #[allow(unused_mut)]
624     let mut cpuid_patches = vec![
625         // Patch hypervisor bit
626         CpuidPatch {
627             function: 1,
628             index: 0,
629             flags_bit: None,
630             eax_bit: None,
631             ebx_bit: None,
632             ecx_bit: Some(HYPERVISOR_ECX_BIT),
633             edx_bit: None,
634         },
635         // Enable MTRR feature
636         CpuidPatch {
637             function: 1,
638             index: 0,
639             flags_bit: None,
640             eax_bit: None,
641             ebx_bit: None,
642             ecx_bit: None,
643             edx_bit: Some(MTRR_EDX_BIT),
644         },
645     ];
646 
647     #[cfg(feature = "kvm")]
648     if matches!(
649         hypervisor.hypervisor_type(),
650         hypervisor::HypervisorType::Kvm
651     ) {
652         // Patch tsc deadline timer bit
653         cpuid_patches.push(CpuidPatch {
654             function: 1,
655             index: 0,
656             flags_bit: None,
657             eax_bit: None,
658             ebx_bit: None,
659             ecx_bit: Some(TSC_DEADLINE_TIMER_ECX_BIT),
660             edx_bit: None,
661         });
662     }
663 
664     // Supported CPUID
665     let mut cpuid = hypervisor
666         .get_supported_cpuid()
667         .map_err(Error::CpuidGetSupported)?;
668 
669     CpuidPatch::patch_cpuid(&mut cpuid, cpuid_patches);
670 
671     if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
672         update_cpuid_sgx(&mut cpuid, sgx_epc_sections)?;
673     }
674 
675     #[cfg(feature = "tdx")]
676     let tdx_capabilities = if config.tdx {
677         let caps = hypervisor
678             .tdx_capabilities()
679             .map_err(Error::TdxCapabilities)?;
680         info!("TDX capabilities {:#?}", caps);
681         Some(caps)
682     } else {
683         None
684     };
685 
686     // Update some existing CPUID
687     for entry in cpuid.as_mut_slice().iter_mut() {
688         match entry.function {
689             // Clear AMX related bits if the AMX feature is not enabled
690             0x7 => {
691                 if !config.amx && entry.index == 0 {
692                     entry.edx &= !(1 << AMX_BF16 | 1 << AMX_TILE | 1 << AMX_INT8)
693                 }
694             }
695             0xd =>
696             {
697                 #[cfg(feature = "tdx")]
698                 if let Some(caps) = &tdx_capabilities {
699                     let xcr0_mask: u64 = 0x82ff;
700                     let xss_mask: u64 = !xcr0_mask;
701                     if entry.index == 0 {
702                         entry.eax &= (caps.xfam_fixed0 as u32) & (xcr0_mask as u32);
703                         entry.eax |= (caps.xfam_fixed1 as u32) & (xcr0_mask as u32);
704                         entry.edx &= ((caps.xfam_fixed0 & xcr0_mask) >> 32) as u32;
705                         entry.edx |= ((caps.xfam_fixed1 & xcr0_mask) >> 32) as u32;
706                     } else if entry.index == 1 {
707                         entry.ecx &= (caps.xfam_fixed0 as u32) & (xss_mask as u32);
708                         entry.ecx |= (caps.xfam_fixed1 as u32) & (xss_mask as u32);
709                         entry.edx &= ((caps.xfam_fixed0 & xss_mask) >> 32) as u32;
710                         entry.edx |= ((caps.xfam_fixed1 & xss_mask) >> 32) as u32;
711                     }
712                 }
713             }
714             // Copy host L1 cache details if not populated by KVM
715             0x8000_0005 => {
716                 if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 {
717                     // SAFETY: cpuid called with valid leaves
718                     if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 {
719                         // SAFETY: cpuid called with valid leaves
720                         let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) };
721                         entry.eax = leaf.eax;
722                         entry.ebx = leaf.ebx;
723                         entry.ecx = leaf.ecx;
724                         entry.edx = leaf.edx;
725                     }
726                 }
727             }
728             // Copy host L2 cache details if not populated by KVM
729             0x8000_0006 => {
730                 if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 {
731                     // SAFETY: cpuid called with valid leaves
732                     if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 {
733                         // SAFETY: cpuid called with valid leaves
734                         let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) };
735                         entry.eax = leaf.eax;
736                         entry.ebx = leaf.ebx;
737                         entry.ecx = leaf.ecx;
738                         entry.edx = leaf.edx;
739                     }
740                 }
741             }
742             // Set CPU physical bits
743             0x8000_0008 => {
744                 entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff);
745             }
746             0x4000_0001 => {
747                 // These features are not supported by TDX
748                 #[cfg(feature = "tdx")]
749                 if config.tdx {
750                     entry.eax &= !(1 << KVM_FEATURE_CLOCKSOURCE_BIT
751                         | 1 << KVM_FEATURE_CLOCKSOURCE2_BIT
752                         | 1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT
753                         | 1 << KVM_FEATURE_ASYNC_PF_BIT
754                         | 1 << KVM_FEATURE_ASYNC_PF_VMEXIT_BIT
755                         | 1 << KVM_FEATURE_STEAL_TIME_BIT)
756                 }
757             }
758             _ => {}
759         }
760     }
761 
762     // Copy CPU identification string
763     for i in 0x8000_0002..=0x8000_0004 {
764         cpuid.retain(|c| c.function != i);
765         // SAFETY: call cpuid with valid leaves
766         let leaf = unsafe { std::arch::x86_64::__cpuid(i) };
767         cpuid.push(CpuIdEntry {
768             function: i,
769             eax: leaf.eax,
770             ebx: leaf.ebx,
771             ecx: leaf.ecx,
772             edx: leaf.edx,
773             ..Default::default()
774         });
775     }
776 
777     if config.kvm_hyperv {
778         // Remove conflicting entries
779         cpuid.retain(|c| c.function != 0x4000_0000);
780         cpuid.retain(|c| c.function != 0x4000_0001);
781         // See "Hypervisor Top Level Functional Specification" for details
782         // Compliance with "Hv#1" requires leaves up to 0x4000_000a
783         cpuid.push(CpuIdEntry {
784             function: 0x40000000,
785             eax: 0x4000000a, // Maximum cpuid leaf
786             ebx: 0x756e694c, // "Linu"
787             ecx: 0x564b2078, // "x KV"
788             edx: 0x7648204d, // "M Hv"
789             ..Default::default()
790         });
791         cpuid.push(CpuIdEntry {
792             function: 0x40000001,
793             eax: 0x31237648, // "Hv#1"
794             ..Default::default()
795         });
796         cpuid.push(CpuIdEntry {
797             function: 0x40000002,
798             eax: 0x3839,  // "Build number"
799             ebx: 0xa0000, // "Version"
800             ..Default::default()
801         });
802         cpuid.push(CpuIdEntry {
803             function: 0x4000_0003,
804             eax: 1 << 1 // AccessPartitionReferenceCounter
805                    | 1 << 2 // AccessSynicRegs
806                    | 1 << 3 // AccessSyntheticTimerRegs
807                    | 1 << 9, // AccessPartitionReferenceTsc
808             edx: 1 << 3, // CPU dynamic partitioning
809             ..Default::default()
810         });
811         cpuid.push(CpuIdEntry {
812             function: 0x4000_0004,
813             eax: 1 << 5, // Recommend relaxed timing
814             ..Default::default()
815         });
816         for i in 0x4000_0005..=0x4000_000a {
817             cpuid.push(CpuIdEntry {
818                 function: i,
819                 ..Default::default()
820             });
821         }
822     }
823 
824     Ok(cpuid)
825 }
826 
/// Configures a single vCPU before it is first run.
///
/// Applies the per-vCPU CPUID fixups (x2APIC id, topology, TSC frequency
/// leaf), pushes the resulting CPUID table to the hypervisor and, when
/// `boot_setup` is provided, initializes registers/FPU/sregs for a cold boot.
///
/// # Arguments
///
/// * `vcpu` - Hypervisor vCPU handle to configure.
/// * `id` - Logical vCPU index; the x2APIC id is derived from it.
/// * `boot_setup` - Kernel entry point and guest memory. `None` skips the
///   register/FPU/sregs setup (e.g. when restoring saved vCPU state).
/// * `cpuid` - Common CPUID entries shared by all vCPUs.
/// * `kvm_hyperv` - Whether KVM Hyper-V emulation is enabled.
/// * `cpu_vendor` - Host CPU vendor (AMD needs extra leaves patched).
/// * `topology` - Optional (threads per core, cores per die, dies per package).
pub fn configure_vcpu(
    vcpu: &Arc<dyn hypervisor::Vcpu>,
    id: u8,
    boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
    cpuid: Vec<CpuIdEntry>,
    kvm_hyperv: bool,
    cpu_vendor: CpuVendor,
    topology: Option<(u8, u8, u8)>,
) -> super::Result<()> {
    let x2apic_id = get_x2apic_id(id as u32, topology);

    // Per vCPU CPUID changes; common are handled via generate_common_cpuid()
    let mut cpuid = cpuid;
    // Topology leaves 0xb/0x1f report the current logical processor's
    // x2APIC id in EDX.
    CpuidPatch::set_cpuid_reg(&mut cpuid, 0xb, None, CpuidReg::EDX, x2apic_id);
    CpuidPatch::set_cpuid_reg(&mut cpuid, 0x1f, None, CpuidReg::EDX, x2apic_id);
    if matches!(cpu_vendor, CpuVendor::AMD) {
        // AMD extended topology leaf carries the APIC id in EAX.
        CpuidPatch::set_cpuid_reg(&mut cpuid, 0x8000_001e, Some(0), CpuidReg::EAX, x2apic_id);
    }

    // Set ApicId in cpuid for each vcpu - found in cpuid ebx when eax = 1
    let mut apic_id_patched = false;
    for entry in &mut cpuid {
        if entry.function == 1 {
            // EBX bits 31:24 hold the initial APIC id; clear and replace them.
            entry.ebx &= 0xffffff;
            entry.ebx |= x2apic_id << 24;
            apic_id_patched = true;
            break;
        }
    }
    // Leaf 0x1 must always be present in the common CPUID table; a miss here
    // would be a bug in CPUID generation, hence the hard assert.
    assert!(apic_id_patched);

    if let Some(t) = topology {
        update_cpuid_topology(&mut cpuid, t.0, t.1, t.2, cpu_vendor, id);
    }

    // The TSC frequency CPUID leaf should not be included when running with HyperV emulation
    if !kvm_hyperv {
        if let Some(tsc_khz) = vcpu.tsc_khz().map_err(Error::GetTscFrequency)? {
            // Need to check that the TSC doesn't vary with dynamic frequency
            // SAFETY: cpuid called with valid leaves
            if unsafe { std::arch::x86_64::__cpuid(0x8000_0007) }.edx
                & (1u32 << INVARIANT_TSC_EDX_BIT)
                > 0
            {
                // Raise the hypervisor maximum leaf so the guest will probe
                // the TSC frequency leaf 0x4000_0010.
                CpuidPatch::set_cpuid_reg(
                    &mut cpuid,
                    0x4000_0000,
                    None,
                    CpuidReg::EAX,
                    0x4000_0010,
                );
                // Drop any stale entry before pushing the fresh one.
                cpuid.retain(|c| c.function != 0x4000_0010);
                cpuid.push(CpuIdEntry {
                    function: 0x4000_0010,
                    eax: tsc_khz,
                    ebx: 1000000, /* LAPIC resolution of 1ns (freq: 1GHz) is hardcoded in KVM's
                                   * APIC_BUS_CYCLE_NS */
                    ..Default::default()
                });
            };
        }
    }

    vcpu.set_cpuid2(&cpuid)
        .map_err(|e| Error::SetSupportedCpusFailed(e.into()))?;

    if kvm_hyperv {
        // NOTE(review): a failure here aborts via unwrap rather than
        // returning an Error — confirm this is intentional.
        vcpu.enable_hyperv_synic().unwrap();
    }

    regs::setup_msrs(vcpu).map_err(Error::MsrsConfiguration)?;
    if let Some((kernel_entry_point, guest_memory)) = boot_setup {
        regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?;
        regs::setup_fpu(vcpu).map_err(Error::FpuConfiguration)?;
        regs::setup_sregs(&guest_memory.memory(), vcpu).map_err(Error::SregsConfiguration)?;
    }
    interrupts::set_lint(vcpu).map_err(|e| Error::LocalIntConfiguration(e.into()))?;
    Ok(())
}
906 
907 /// Returns a Vec of the valid memory addresses.
908 ///
909 /// These should be used to configure the GuestMemory structure for the platform.
910 /// For x86_64 all addresses are valid from the start of the kernel except a
911 /// carve out at the end of 32bit address space.
912 pub fn arch_memory_regions() -> Vec<(GuestAddress, usize, RegionType)> {
913     vec![
914         // 0 GiB ~ 3GiB: memory before the gap
915         (
916             GuestAddress(0),
917             layout::MEM_32BIT_RESERVED_START.raw_value() as usize,
918             RegionType::Ram,
919         ),
920         // 4 GiB ~ inf: memory after the gap
921         (layout::RAM_64BIT_START, usize::MAX, RegionType::Ram),
922         // 3 GiB ~ 3712 MiB: 32-bit device memory hole
923         (
924             layout::MEM_32BIT_RESERVED_START,
925             layout::MEM_32BIT_DEVICES_SIZE as usize,
926             RegionType::SubRegion,
927         ),
928         // 3712 MiB ~ 3968 MiB: 32-bit reserved memory hole
929         (
930             layout::MEM_32BIT_RESERVED_START.unchecked_add(layout::MEM_32BIT_DEVICES_SIZE),
931             (layout::MEM_32BIT_RESERVED_SIZE - layout::MEM_32BIT_DEVICES_SIZE) as usize,
932             RegionType::Reserved,
933         ),
934     ]
935 }
936 
/// Configures the system and should be called once per vm before starting vcpu threads.
///
/// Writes the EBDA pointer, SMBIOS and MP tables, then hands off to either
/// the legacy 32-bit boot path (when `setup_header` is present) or the PVH
/// boot path.
///
/// # Arguments
///
/// * `guest_mem` - The memory to be used by the guest.
/// * `cmdline_addr` - Address in `guest_mem` where the kernel command line was loaded.
/// * `cmdline_size` - Size of the kernel command line in bytes including the null terminator.
/// * `initramfs` - Optional address/size of an initramfs already loaded in guest memory.
/// * `_num_cpus` - Number of virtual CPUs the guest will have.
/// * `setup_header` - When present, boot via the legacy 32-bit entry point;
///   otherwise use the PVH boot protocol.
/// * `rsdp_addr` - Optional guest address of the ACPI RSDP table.
/// * `sgx_epc_region` - Optional SGX EPC region to reserve in the memory map.
/// * `serial_number` / `uuid` / `oem_strings` - Optional SMBIOS data.
/// * `topology` - Optional (threads per core, cores per die, dies per package).
#[allow(clippy::too_many_arguments)]
pub fn configure_system(
    guest_mem: &GuestMemoryMmap,
    cmdline_addr: GuestAddress,
    cmdline_size: usize,
    initramfs: &Option<InitramfsConfig>,
    _num_cpus: u8,
    setup_header: Option<setup_header>,
    rsdp_addr: Option<GuestAddress>,
    sgx_epc_region: Option<SgxEpcRegion>,
    serial_number: Option<&str>,
    uuid: Option<&str>,
    oem_strings: Option<&[&str]>,
    topology: Option<(u8, u8, u8)>,
) -> super::Result<()> {
    // Write EBDA address to location where ACPICA expects to find it
    guest_mem
        .write_obj((layout::EBDA_START.0 >> 4) as u16, layout::EBDA_POINTER)
        .map_err(Error::EbdaSetup)?;

    let size = smbios::setup_smbios(guest_mem, serial_number, uuid, oem_strings)
        .map_err(Error::SmbiosSetup)?;

    // Place the MP table after the SMBIOS table aligned to 16 bytes
    let offset = GuestAddress(layout::SMBIOS_START).unchecked_add(size);
    let offset = GuestAddress((offset.0 + 16) & !0xf);
    mptable::setup_mptable(offset, guest_mem, _num_cpus, topology).map_err(Error::MpTableSetup)?;

    // Check that the RAM is not smaller than the RSDP start address
    if let Some(rsdp_addr) = rsdp_addr {
        if rsdp_addr.0 > guest_mem.last_addr().0 {
            return Err(super::Error::RsdpPastRamEnd);
        }
    }

    // A provided setup_header selects the legacy bzImage boot protocol;
    // otherwise fall back to PVH.
    match setup_header {
        Some(hdr) => configure_32bit_entry(
            guest_mem,
            cmdline_addr,
            cmdline_size,
            initramfs,
            hdr,
            rsdp_addr,
            sgx_epc_region,
        ),
        None => configure_pvh(
            guest_mem,
            cmdline_addr,
            initramfs,
            rsdp_addr,
            sgx_epc_region,
        ),
    }
}
999 
/// Half-open (start, end) range of guest physical addresses, in bytes.
type RamRange = (u64, u64);

/// Returns usable physical memory ranges for the guest
/// These should be used to create e820_RAM memory maps
pub fn generate_ram_ranges(guest_mem: &GuestMemoryMmap) -> super::Result<Vec<RamRange>> {
    // Merge continuous memory regions into one region.
    // Note: memory regions from "GuestMemory" are sorted and non-zero sized.
    let ram_regions = {
        let mut ram_regions = Vec::new();
        let mut current_start = guest_mem
            .iter()
            .next()
            .map(GuestMemoryRegion::start_addr)
            .expect("GuestMemory must have one memory region at least")
            .raw_value();
        let mut current_end = current_start;

        for (start, size) in guest_mem
            .iter()
            .map(|m| (m.start_addr().raw_value(), m.len()))
        {
            if current_end == start {
                // This zone is continuous with the previous one.
                // (The first region trivially satisfies this since
                // current_end was initialized to its start address.)
                current_end += size;
            } else {
                // Gap found: close the current merged region, start a new one.
                ram_regions.push((current_start, current_end));

                current_start = start;
                current_end = start + size;
            }
        }

        // Flush the last merged region.
        ram_regions.push((current_start, current_end));

        ram_regions
    };

    // Create the memory map entry for memory region before the gap
    let mut ram_ranges = vec![];

    // Generate the first usable physical memory range before the gap. The e820 map
    // should only report memory above 1MiB.
    let first_ram_range = {
        let (first_region_start, first_region_end) =
            ram_regions.first().ok_or(super::Error::MemmapTableSetup)?;
        let high_ram_start = layout::HIGH_RAM_START.raw_value();
        let mem_32bit_reserved_start = layout::MEM_32BIT_RESERVED_START.raw_value();

        // The first region must straddle HIGH_RAM_START and end at or below
        // the 32-bit reserved gap; anything else means a broken layout.
        if !((first_region_start <= &high_ram_start)
            && (first_region_end > &high_ram_start)
            && (first_region_end <= &mem_32bit_reserved_start))
        {
            error!(
                "Unexpected first memory region layout: (start: 0x{:08x}, end: 0x{:08x}).
                high_ram_start: 0x{:08x}, mem_32bit_reserved_start: 0x{:08x}",
                first_region_start, first_region_end, high_ram_start, mem_32bit_reserved_start
            );

            return Err(super::Error::MemmapTableSetup);
        }

        info!(
            "first usable physical memory range, start: 0x{:08x}, end: 0x{:08x}",
            high_ram_start, first_region_end
        );

        // Start at HIGH_RAM_START (1 MiB), not at 0, per the comment above.
        (high_ram_start, *first_region_end)
    };
    ram_ranges.push(first_ram_range);

    // Generate additional usable physical memory range after the gap if any.
    for ram_region in ram_regions.iter().skip(1) {
        info!(
            "found usable physical memory range, start: 0x{:08x}, end: 0x{:08x}",
            ram_region.0, ram_region.1
        );

        ram_ranges.push(*ram_region);
    }

    Ok(ram_ranges)
}
1082 
/// Boot-time setup for the PVH entry point.
///
/// Builds the `hvm_start_info` structure plus its module list and memory
/// map, and writes them to guest memory at the layout-defined addresses
/// (`PVH_INFO_START`, `MODLIST_START`, `MEMMAP_START`).
fn configure_pvh(
    guest_mem: &GuestMemoryMmap,
    cmdline_addr: GuestAddress,
    initramfs: &Option<InitramfsConfig>,
    rsdp_addr: Option<GuestAddress>,
    sgx_epc_region: Option<SgxEpcRegion>,
) -> super::Result<()> {
    // Magic value the PVH ABI requires in hvm_start_info ("xEn3" with the
    // 0x80 bit of the "E" set).
    const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336ec578;

    let mut start_info = hvm_start_info {
        magic: XEN_HVM_START_MAGIC_VALUE,
        version: 1, // pvh has version 1
        nr_modules: 0,
        cmdline_paddr: cmdline_addr.raw_value(),
        memmap_paddr: layout::MEMMAP_START.raw_value(),
        ..Default::default()
    };

    if let Some(rsdp_addr) = rsdp_addr {
        start_info.rsdp_paddr = rsdp_addr.0;
    }

    if let Some(initramfs_config) = initramfs {
        // The initramfs has been written to guest memory already, here we just need to
        // create the module structure that describes it.
        let ramdisk_mod = hvm_modlist_entry {
            paddr: initramfs_config.address.raw_value(),
            size: initramfs_config.size as u64,
            ..Default::default()
        };

        start_info.nr_modules += 1;
        start_info.modlist_paddr = layout::MODLIST_START.raw_value();

        // Write the modlist struct to guest memory.
        guest_mem
            .write_obj(ramdisk_mod, layout::MODLIST_START)
            .map_err(super::Error::ModlistSetup)?;
    }

    // Vector to hold the memory maps which needs to be written to guest memory
    // at MEMMAP_START after all of the mappings are recorded.
    let mut memmap: Vec<hvm_memmap_table_entry> = Vec::new();

    // Create the memory map entries.
    // First entry: RAM below the EBDA (the low 1 MiB conventional memory).
    add_memmap_entry(&mut memmap, 0, layout::EBDA_START.raw_value(), E820_RAM);

    // Get usable physical memory ranges
    let ram_ranges = generate_ram_ranges(guest_mem)?;

    // Create e820 memory map entries
    for ram_range in ram_ranges {
        info!(
            "create_memmap_entry, start: 0x{:08x}, end: 0x{:08x}",
            ram_range.0, ram_range.1
        );
        add_memmap_entry(
            &mut memmap,
            ram_range.0,
            ram_range.1 - ram_range.0,
            E820_RAM,
        );
    }

    // PCI MMCONFIG space must be reported as reserved.
    add_memmap_entry(
        &mut memmap,
        layout::PCI_MMCONFIG_START.0,
        layout::PCI_MMCONFIG_SIZE,
        E820_RESERVED,
    );

    if let Some(sgx_epc_region) = sgx_epc_region {
        add_memmap_entry(
            &mut memmap,
            sgx_epc_region.start().raw_value(),
            sgx_epc_region.size(),
            E820_RESERVED,
        );
    }

    start_info.memmap_entries = memmap.len() as u32;

    // Copy the vector with the memmap table to the MEMMAP_START address
    // which is already saved in the memmap_paddr field of hvm_start_info struct.
    let mut memmap_start_addr = layout::MEMMAP_START;

    // Bounds check: ensure the whole table fits in guest memory before
    // writing any entries.
    guest_mem
        .checked_offset(
            memmap_start_addr,
            mem::size_of::<hvm_memmap_table_entry>() * start_info.memmap_entries as usize,
        )
        .ok_or(super::Error::MemmapTablePastRamEnd)?;

    // For every entry in the memmap vector, write it to guest memory.
    for memmap_entry in memmap {
        guest_mem
            .write_obj(memmap_entry, memmap_start_addr)
            .map_err(|_| super::Error::MemmapTableSetup)?;
        memmap_start_addr =
            memmap_start_addr.unchecked_add(mem::size_of::<hvm_memmap_table_entry>() as u64);
    }

    // The hvm_start_info struct itself must be stored at PVH_START_INFO
    // address, and %rbx will be initialized to contain PVH_INFO_START prior to
    // starting the guest, as required by the PVH ABI.
    let start_info_addr = layout::PVH_INFO_START;

    guest_mem
        .checked_offset(start_info_addr, mem::size_of::<hvm_start_info>())
        .ok_or(super::Error::StartInfoPastRamEnd)?;

    // Write the start_info struct to guest memory.
    guest_mem
        .write_obj(start_info, start_info_addr)
        .map_err(|_| super::Error::StartInfoSetup)?;

    Ok(())
}
1201 
/// Boot-time setup for the legacy 32-bit (bzImage) entry point.
///
/// Populates a `boot_params` ("zero page") structure from the provided
/// `setup_header`, fills in the command line, initramfs and e820 map, and
/// writes it to guest memory at `ZERO_PAGE_START`.
fn configure_32bit_entry(
    guest_mem: &GuestMemoryMmap,
    cmdline_addr: GuestAddress,
    cmdline_size: usize,
    initramfs: &Option<InitramfsConfig>,
    setup_hdr: setup_header,
    rsdp_addr: Option<GuestAddress>,
    sgx_epc_region: Option<SgxEpcRegion>,
) -> super::Result<()> {
    // Boot-protocol value meaning "undefined/other bootloader".
    const KERNEL_LOADER_OTHER: u8 = 0xff;

    // Use the provided setup header
    let mut params = boot_params {
        hdr: setup_hdr,
        ..Default::default()
    };

    // Common bootparams settings
    if params.hdr.type_of_loader == 0 {
        params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
    }
    params.hdr.cmd_line_ptr = cmdline_addr.raw_value() as u32;
    params.hdr.cmdline_size = cmdline_size as u32;

    if let Some(initramfs_config) = initramfs {
        params.hdr.ramdisk_image = initramfs_config.address.raw_value() as u32;
        params.hdr.ramdisk_size = initramfs_config.size as u32;
    }

    // Low conventional memory below the EBDA.
    add_e820_entry(&mut params, 0, layout::EBDA_START.raw_value(), E820_RAM)?;

    // NOTE(review): the branches below assume mem_end >= HIGH_RAM_START;
    // unchecked_offset_from would underflow otherwise — confirm callers
    // guarantee a minimum guest memory size.
    let mem_end = guest_mem.last_addr();
    if mem_end < layout::MEM_32BIT_RESERVED_START {
        // All RAM fits below the 32-bit reserved gap: single entry.
        add_e820_entry(
            &mut params,
            layout::HIGH_RAM_START.raw_value(),
            mem_end.unchecked_offset_from(layout::HIGH_RAM_START) + 1,
            E820_RAM,
        )?;
    } else {
        // RAM below the gap, up to the start of the reserved area.
        add_e820_entry(
            &mut params,
            layout::HIGH_RAM_START.raw_value(),
            layout::MEM_32BIT_RESERVED_START.unchecked_offset_from(layout::HIGH_RAM_START),
            E820_RAM,
        )?;
        if mem_end > layout::RAM_64BIT_START {
            // Remaining RAM above 4 GiB.
            add_e820_entry(
                &mut params,
                layout::RAM_64BIT_START.raw_value(),
                mem_end.unchecked_offset_from(layout::RAM_64BIT_START) + 1,
                E820_RAM,
            )?;
        }
    }

    // PCI MMCONFIG space must be reported as reserved.
    add_e820_entry(
        &mut params,
        layout::PCI_MMCONFIG_START.0,
        layout::PCI_MMCONFIG_SIZE,
        E820_RESERVED,
    )?;

    if let Some(sgx_epc_region) = sgx_epc_region {
        add_e820_entry(
            &mut params,
            sgx_epc_region.start().raw_value(),
            sgx_epc_region.size(),
            E820_RESERVED,
        )?;
    }

    if let Some(rsdp_addr) = rsdp_addr {
        params.acpi_rsdp_addr = rsdp_addr.0;
    }

    // Bounds-check, then write the zero page into guest memory.
    let zero_page_addr = layout::ZERO_PAGE_START;
    guest_mem
        .checked_offset(zero_page_addr, mem::size_of::<boot_params>())
        .ok_or(super::Error::ZeroPagePastRamEnd)?;
    guest_mem
        .write_obj(params, zero_page_addr)
        .map_err(super::Error::ZeroPageSetup)?;

    Ok(())
}
1288 
1289 /// Add an e820 region to the e820 map.
1290 /// Returns Ok(()) if successful, or an error if there is no space left in the map.
1291 fn add_e820_entry(
1292     params: &mut boot_params,
1293     addr: u64,
1294     size: u64,
1295     mem_type: u32,
1296 ) -> Result<(), Error> {
1297     if params.e820_entries >= params.e820_table.len() as u8 {
1298         return Err(Error::E820Configuration);
1299     }
1300 
1301     params.e820_table[params.e820_entries as usize].addr = addr;
1302     params.e820_table[params.e820_entries as usize].size = size;
1303     params.e820_table[params.e820_entries as usize].type_ = mem_type;
1304     params.e820_entries += 1;
1305 
1306     Ok(())
1307 }
1308 
1309 fn add_memmap_entry(memmap: &mut Vec<hvm_memmap_table_entry>, addr: u64, size: u64, mem_type: u32) {
1310     // Add the table entry to the vector
1311     memmap.push(hvm_memmap_table_entry {
1312         addr,
1313         size,
1314         type_: mem_type,
1315         reserved: 0,
1316     });
1317 }
1318 
1319 /// Returns the memory address where the initramfs could be loaded.
1320 pub fn initramfs_load_addr(
1321     guest_mem: &GuestMemoryMmap,
1322     initramfs_size: usize,
1323 ) -> super::Result<u64> {
1324     let first_region = guest_mem
1325         .find_region(GuestAddress::new(0))
1326         .ok_or(super::Error::InitramfsAddress)?;
1327     // It's safe to cast to usize because the size of a region can't be greater than usize.
1328     let lowmem_size = first_region.len() as usize;
1329 
1330     if lowmem_size < initramfs_size {
1331         return Err(super::Error::InitramfsAddress);
1332     }
1333 
1334     let aligned_addr: u64 = ((lowmem_size - initramfs_size) & !(crate::pagesize() - 1)) as u64;
1335     Ok(aligned_addr)
1336 }
1337 
/// Returns the number of physical address bits usable on this host.
///
/// Probes host CPUID directly. On AMD parts with SME enabled, the number of
/// address bits consumed by memory encryption (CPUID 0x8000_001f EBX[11:6])
/// is subtracted. Falls back to 36 bits when leaf 0x8000_0008 is missing.
pub fn get_host_cpu_phys_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>) -> u8 {
    // SAFETY: call cpuid with valid leaves
    unsafe {
        // Highest supported extended CPUID leaf.
        let leaf = x86_64::__cpuid(0x8000_0000);

        // Detect and handle AMD SME (Secure Memory Encryption) properly.
        // Some physical address bits may become reserved when the feature is enabled.
        // See AMD64 Architecture Programmer's Manual Volume 2, Section 7.10.1
        let reduced = if leaf.eax >= 0x8000_001f
            && matches!(hypervisor.get_cpu_vendor(), CpuVendor::AMD)
            // EAX bit 0 of leaf 0x8000_001f indicates SME support.
            && x86_64::__cpuid(0x8000_001f).eax & 0x1 != 0
        {
            // EBX[11:6]: physical address bit reduction when SME is active.
            (x86_64::__cpuid(0x8000_001f).ebx >> 6) & 0x3f
        } else {
            0
        };

        if leaf.eax >= 0x8000_0008 {
            // EAX[7:0] of leaf 0x8000_0008: host physical address width.
            let leaf = x86_64::__cpuid(0x8000_0008);
            ((leaf.eax & 0xff) - reduced) as u8
        } else {
            // Leaf unavailable: assume 36 physical address bits.
            36
        }
    }
}
1363 
1364 fn update_cpuid_topology(
1365     cpuid: &mut Vec<CpuIdEntry>,
1366     threads_per_core: u8,
1367     cores_per_die: u8,
1368     dies_per_package: u8,
1369     cpu_vendor: CpuVendor,
1370     id: u8,
1371 ) {
1372     let x2apic_id = get_x2apic_id(
1373         id as u32,
1374         Some((threads_per_core, cores_per_die, dies_per_package)),
1375     );
1376 
1377     let thread_width = 8 - (threads_per_core - 1).leading_zeros();
1378     let core_width = (8 - (cores_per_die - 1).leading_zeros()) + thread_width;
1379     let die_width = (8 - (dies_per_package - 1).leading_zeros()) + core_width;
1380 
1381     let mut cpu_ebx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX).unwrap_or(0);
1382     cpu_ebx |= ((dies_per_package as u32) * (cores_per_die as u32) * (threads_per_core as u32))
1383         & 0xff << 16;
1384     CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX, cpu_ebx);
1385 
1386     let mut cpu_edx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX).unwrap_or(0);
1387     cpu_edx |= 1 << 28;
1388     CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX, cpu_edx);
1389 
1390     // CPU Topology leaf 0xb
1391     CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::EAX, thread_width);
1392     CpuidPatch::set_cpuid_reg(
1393         cpuid,
1394         0xb,
1395         Some(0),
1396         CpuidReg::EBX,
1397         u32::from(threads_per_core),
1398     );
1399     CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::ECX, 1 << 8);
1400 
1401     CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::EAX, die_width);
1402     CpuidPatch::set_cpuid_reg(
1403         cpuid,
1404         0xb,
1405         Some(1),
1406         CpuidReg::EBX,
1407         u32::from(dies_per_package * cores_per_die * threads_per_core),
1408     );
1409     CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::ECX, 2 << 8);
1410 
1411     // CPU Topology leaf 0x1f
1412     CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::EAX, thread_width);
1413     CpuidPatch::set_cpuid_reg(
1414         cpuid,
1415         0x1f,
1416         Some(0),
1417         CpuidReg::EBX,
1418         u32::from(threads_per_core),
1419     );
1420     CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::ECX, 1 << 8);
1421 
1422     CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::EAX, core_width);
1423     CpuidPatch::set_cpuid_reg(
1424         cpuid,
1425         0x1f,
1426         Some(1),
1427         CpuidReg::EBX,
1428         u32::from(cores_per_die * threads_per_core),
1429     );
1430     CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::ECX, 2 << 8);
1431 
1432     CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::EAX, die_width);
1433     CpuidPatch::set_cpuid_reg(
1434         cpuid,
1435         0x1f,
1436         Some(2),
1437         CpuidReg::EBX,
1438         u32::from(dies_per_package * cores_per_die * threads_per_core),
1439     );
1440     CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::ECX, 5 << 8);
1441 
1442     if matches!(cpu_vendor, CpuVendor::AMD) {
1443         CpuidPatch::set_cpuid_reg(
1444             cpuid,
1445             0x8000_001e,
1446             Some(0),
1447             CpuidReg::EBX,
1448             ((threads_per_core as u32 - 1) << 8) | (x2apic_id & 0xff),
1449         );
1450         CpuidPatch::set_cpuid_reg(
1451             cpuid,
1452             0x8000_001e,
1453             Some(0),
1454             CpuidReg::ECX,
1455             ((dies_per_package as u32 - 1) << 8) | (thread_width + die_width) & 0xff,
1456         );
1457         CpuidPatch::set_cpuid_reg(cpuid, 0x8000_001e, Some(0), CpuidReg::EDX, 0);
1458         if cores_per_die * threads_per_core > 1 {
1459             let ecx =
1460                 CpuidPatch::get_cpuid_reg(cpuid, 0x8000_0001, Some(0), CpuidReg::ECX).unwrap_or(0);
1461             CpuidPatch::set_cpuid_reg(
1462                 cpuid,
1463                 0x8000_0001,
1464                 Some(0),
1465                 CpuidReg::ECX,
1466                 ecx | (1u32 << 1) | (1u32 << 22),
1467             );
1468             CpuidPatch::set_cpuid_reg(
1469                 cpuid,
1470                 0x0000_0001,
1471                 Some(0),
1472                 CpuidReg::EBX,
1473                 (x2apic_id << 24) | (8 << 8) | (((cores_per_die * threads_per_core) as u32) << 16),
1474             );
1475             let cpuid_patches = vec![
1476                 // Patch tsc deadline timer bit
1477                 CpuidPatch {
1478                     function: 1,
1479                     index: 0,
1480                     flags_bit: None,
1481                     eax_bit: None,
1482                     ebx_bit: None,
1483                     ecx_bit: None,
1484                     edx_bit: Some(28),
1485                 },
1486             ];
1487             CpuidPatch::patch_cpuid(cpuid, cpuid_patches);
1488             CpuidPatch::set_cpuid_reg(
1489                 cpuid,
1490                 0x8000_0008,
1491                 Some(0),
1492                 CpuidReg::ECX,
1493                 ((thread_width + core_width + die_width) << 12)
1494                     | ((cores_per_die * threads_per_core) - 1) as u32,
1495             );
1496         } else {
1497             CpuidPatch::set_cpuid_reg(cpuid, 0x8000_0008, Some(0), CpuidReg::ECX, 0u32);
1498         }
1499     }
1500 }
1501 
// The goal is to update the CPUID sub-leaves to reflect the number of EPC
// sections exposed to the guest.
fn update_cpuid_sgx(
    cpuid: &mut Vec<CpuIdEntry>,
    epc_sections: &[SgxEpcSection],
) -> Result<(), Error> {
    // Something's wrong if there's no EPC section.
    if epc_sections.is_empty() {
        return Err(Error::NoSgxEpcSection);
    }
    // We can't go further if the hypervisor does not support SGX feature
    // (leaf 0x7 EBX bit 2).
    if !CpuidPatch::is_feature_enabled(cpuid, 0x7, 0, CpuidReg::EBX, 2) {
        return Err(Error::MissingSgxFeature);
    }
    // We can't go further if the hypervisor does not support SGX_LC feature
    // (leaf 0x7 ECX bit 30).
    if !CpuidPatch::is_feature_enabled(cpuid, 0x7, 0, CpuidReg::ECX, 30) {
        return Err(Error::MissingSgxLaunchControlFeature);
    }

    // Get host CPUID for leaf 0x12, subleaf 0x2. This is to retrieve EPC
    // properties such as confidentiality and integrity.
    // SAFETY: call cpuid with valid leaves
    let leaf = unsafe { std::arch::x86_64::__cpuid_count(0x12, 0x2) };

    // Populate one leaf-0x12 sub-leaf (starting at index 2) per EPC section:
    // EAX/EBX carry the section base, ECX/EDX the section size, with the low
    // bits reused for the section-valid flag and the host's EPC properties.
    for (i, epc_section) in epc_sections.iter().enumerate() {
        let subleaf_idx = i + 2;
        let start = epc_section.start().raw_value();
        let size = epc_section.size();
        // Low 32 bits of the base (page-aligned) with bit 0 flagging a
        // valid section.
        let eax = (start & 0xffff_f000) as u32 | 0x1;
        let ebx = (start >> 32) as u32;
        // Low 32 bits of the size plus the host EPC property nibble.
        let ecx = (size & 0xffff_f000) as u32 | (leaf.ecx & 0xf);
        let edx = (size >> 32) as u32;
        // SGX EPC enumeration leaf 0x12
        CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EAX, eax);
        CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EBX, ebx);
        CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::ECX, ecx);
        CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EDX, edx);
    }

    // Add one NULL entry to terminate the dynamic list
    let subleaf_idx = epc_sections.len() + 2;
    // SGX EPC enumeration leaf 0x12
    CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EAX, 0);
    CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EBX, 0);
    CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::ECX, 0);
    CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EDX, 0);

    Ok(())
}
1551 
1552 #[cfg(test)]
1553 mod tests {
1554     use linux_loader::loader::bootparam::boot_e820_entry;
1555 
1556     use super::*;
1557 
1558     #[test]
1559     fn regions_base_addr() {
1560         let regions = arch_memory_regions();
1561         assert_eq!(4, regions.len());
1562         assert_eq!(GuestAddress(0), regions[0].0);
1563         assert_eq!(GuestAddress(1 << 32), regions[1].0);
1564     }
1565 
    #[test]
    fn test_system_configuration() {
        let no_vcpus = 4;
        // 64 KiB of guest memory is too small to host the boot tables:
        // configuration must fail.
        let gm = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap();
        let config_err = configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            1,
            None,
            Some(layout::RSDP_POINTER),
            None,
            None,
            None,
            None,
            None,
        );
        assert!(config_err.is_err());

        // Now assigning some memory that falls before the 32bit memory hole.
        // Keep only the finite RAM regions (drop the unbounded >4 GiB one).
        let arch_mem_regions = arch_memory_regions();
        let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
            .iter()
            .filter(|r| r.2 == RegionType::Ram && r.1 != usize::MAX)
            .map(|r| (r.0, r.1))
            .collect();
        let gm = GuestMemoryMmap::from_ranges(&ram_regions).unwrap();

        configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            no_vcpus,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();

        // Now assigning some memory that falls after the 32bit memory hole.
        // Cap the unbounded >4 GiB region at 128 MiB so it can be mapped.
        let arch_mem_regions = arch_memory_regions();
        let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
            .iter()
            .filter(|r| r.2 == RegionType::Ram)
            .map(|r| {
                if r.1 == usize::MAX {
                    (r.0, 128 << 20)
                } else {
                    (r.0, r.1)
                }
            })
            .collect();
        let gm = GuestMemoryMmap::from_ranges(&ram_regions).unwrap();
        configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            no_vcpus,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();

        // NOTE(review): this call is identical to the previous one — also
        // verifies that configuring the same guest memory twice succeeds,
        // but may simply be a leftover duplicate; confirm intent.
        configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            no_vcpus,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();
    }
1657 
    #[test]
    fn test_add_e820_entry() {
        // A full table where every slot holds the same known entry; only
        // slot 0 and the entry count are actually compared below.
        let e820_table = [(boot_e820_entry {
            addr: 0x1,
            size: 4,
            type_: 1,
        }); 128];

        let expected_params = boot_params {
            e820_table,
            e820_entries: 1,
            ..Default::default()
        };

        let mut params: boot_params = Default::default();
        add_e820_entry(
            &mut params,
            e820_table[0].addr,
            e820_table[0].size,
            e820_table[0].type_,
        )
        .unwrap();
        // boot_e820_entry does not derive PartialEq; compare Debug output.
        assert_eq!(
            format!("{:?}", params.e820_table[0]),
            format!("{:?}", expected_params.e820_table[0])
        );
        assert_eq!(params.e820_entries, expected_params.e820_entries);

        // Exercise the scenario where the field storing the length of the e820 entry table
        // is bigger than the allocated memory.
        params.e820_entries = params.e820_table.len() as u8 + 1;
        assert!(add_e820_entry(
            &mut params,
            e820_table[0].addr,
            e820_table[0].size,
            e820_table[0].type_
        )
        .is_err());
    }
1697 
1698     #[test]
1699     fn test_add_memmap_entry() {
1700         let mut memmap: Vec<hvm_memmap_table_entry> = Vec::new();
1701 
1702         let expected_memmap = vec![
1703             hvm_memmap_table_entry {
1704                 addr: 0x0,
1705                 size: 0x1000,
1706                 type_: E820_RAM,
1707                 ..Default::default()
1708             },
1709             hvm_memmap_table_entry {
1710                 addr: 0x10000,
1711                 size: 0xa000,
1712                 type_: E820_RESERVED,
1713                 ..Default::default()
1714             },
1715         ];
1716 
1717         add_memmap_entry(&mut memmap, 0, 0x1000, E820_RAM);
1718         add_memmap_entry(&mut memmap, 0x10000, 0xa000, E820_RESERVED);
1719 
1720         assert_eq!(format!("{memmap:?}"), format!("{expected_memmap:?}"));
1721     }
1722 
1723     #[test]
1724     fn test_get_x2apic_id() {
1725         let x2apic_id = get_x2apic_id(0, Some((2, 3, 1)));
1726         assert_eq!(x2apic_id, 0);
1727 
1728         let x2apic_id = get_x2apic_id(1, Some((2, 3, 1)));
1729         assert_eq!(x2apic_id, 1);
1730 
1731         let x2apic_id = get_x2apic_id(2, Some((2, 3, 1)));
1732         assert_eq!(x2apic_id, 2);
1733 
1734         let x2apic_id = get_x2apic_id(6, Some((2, 3, 1)));
1735         assert_eq!(x2apic_id, 8);
1736 
1737         let x2apic_id = get_x2apic_id(7, Some((2, 3, 1)));
1738         assert_eq!(x2apic_id, 9);
1739 
1740         let x2apic_id = get_x2apic_id(8, Some((2, 3, 1)));
1741         assert_eq!(x2apic_id, 10);
1742     }
1743 }
1744