xref: /cloud-hypervisor/arch/src/x86_64/mod.rs (revision b686a5bb24f949e3b201308d69b01e85c14f1ad6)
1 // Copyright © 2020, Oracle and/or its affiliates.
2 //
3 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
4 // SPDX-License-Identifier: Apache-2.0
5 //
6 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE-BSD-3-Clause file.
9 use std::sync::Arc;
10 pub mod interrupts;
11 pub mod layout;
12 mod mpspec;
13 mod mptable;
14 pub mod regs;
15 use std::collections::BTreeMap;
16 use std::mem;
17 
18 use hypervisor::arch::x86::{CpuIdEntry, CPUID_FLAG_VALID_INDEX};
19 use hypervisor::{CpuVendor, HypervisorCpuError, HypervisorError};
20 use linux_loader::loader::bootparam::{boot_params, setup_header};
21 use linux_loader::loader::elf::start_info::{
22     hvm_memmap_table_entry, hvm_modlist_entry, hvm_start_info,
23 };
24 use thiserror::Error;
25 use vm_memory::{
26     Address, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
27     GuestMemoryRegion, GuestUsize,
28 };
29 
30 use crate::{GuestMemoryMmap, InitramfsConfig, RegionType};
31 mod smbios;
32 use std::arch::x86_64;
33 #[cfg(feature = "tdx")]
34 pub mod tdx;
35 
// CPUID feature bits (bit positions within the named output register).
#[cfg(feature = "kvm")]
const TSC_DEADLINE_TIMER_ECX_BIT: u8 = 24; // tsc deadline timer ecx bit.
const HYPERVISOR_ECX_BIT: u8 = 31; // Hypervisor present, leaf 0x1 ECX.
const MTRR_EDX_BIT: u8 = 12; // MTRR support, leaf 0x1 EDX.
const INVARIANT_TSC_EDX_BIT: u8 = 8; // Invariant TSC bit on 0x8000_0007 EDX
const AMX_BF16: u8 = 22; // AMX tile computation on bfloat16 numbers
const AMX_TILE: u8 = 24; // AMX tile load/store instructions
const AMX_INT8: u8 = 25; // AMX tile computation on 8-bit integers

// KVM feature bits (leaf 0x4000_0001 EAX); see
// https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE_BIT: u8 = 0;
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE2_BIT: u8 = 3;
#[cfg(feature = "tdx")]
const KVM_FEATURE_CLOCKSOURCE_STABLE_BIT: u8 = 24;
#[cfg(feature = "tdx")]
const KVM_FEATURE_ASYNC_PF_BIT: u8 = 4;
#[cfg(feature = "tdx")]
const KVM_FEATURE_ASYNC_PF_VMEXIT_BIT: u8 = 10;
#[cfg(feature = "tdx")]
const KVM_FEATURE_STEAL_TIME_BIT: u8 = 5;

// Linux's _NSIG (number of supported signals + 1) — presumably consumed by
// signal-handling code elsewhere in the crate; confirm against callers.
pub const _NSIG: i32 = 65;
61 
/// Specifies the entry point address where the guest must start
/// executing code, as well as which of the supported boot protocols
/// is to be used to configure the guest initial state.
#[derive(Debug, Copy, Clone)]
pub struct EntryPoint {
    /// Address in guest memory where the guest must start execution.
    pub entry_addr: GuestAddress,
    /// Parsed bzImage setup header, used to fill the zero page; `None`
    /// for boot protocols that do not provide one (e.g. PVH).
    pub setup_header: Option<setup_header>,
}
72 
// E820 memory-map entry types (Linux boot protocol).
const E820_RAM: u32 = 1; // Usable RAM.
const E820_RESERVED: u32 = 2; // Reserved, not available to the guest OS.
75 
76 #[derive(Clone)]
77 pub struct SgxEpcSection {
78     start: GuestAddress,
79     size: GuestUsize,
80 }
81 
82 impl SgxEpcSection {
83     pub fn new(start: GuestAddress, size: GuestUsize) -> Self {
84         SgxEpcSection { start, size }
85     }
86     pub fn start(&self) -> GuestAddress {
87         self.start
88     }
89     pub fn size(&self) -> GuestUsize {
90         self.size
91     }
92 }
93 
94 #[derive(Clone)]
95 pub struct SgxEpcRegion {
96     start: GuestAddress,
97     size: GuestUsize,
98     epc_sections: BTreeMap<String, SgxEpcSection>,
99 }
100 
101 impl SgxEpcRegion {
102     pub fn new(start: GuestAddress, size: GuestUsize) -> Self {
103         SgxEpcRegion {
104             start,
105             size,
106             epc_sections: BTreeMap::new(),
107         }
108     }
109     pub fn start(&self) -> GuestAddress {
110         self.start
111     }
112     pub fn size(&self) -> GuestUsize {
113         self.size
114     }
115     pub fn epc_sections(&self) -> &BTreeMap<String, SgxEpcSection> {
116         &self.epc_sections
117     }
118     pub fn insert(&mut self, id: String, epc_section: SgxEpcSection) {
119         self.epc_sections.insert(id, epc_section);
120     }
121 }
122 
/// Tunables consumed by `generate_common_cpuid()` when building the
/// guest-visible CPUID table.
pub struct CpuidConfig {
    /// SGX EPC sections to advertise to the guest, if any.
    pub sgx_epc_sections: Option<Vec<SgxEpcSection>>,
    /// Guest physical address width, reported through leaf 0x8000_0008.
    pub phys_bits: u8,
    /// Whether to expose KVM's HyperV emulation leaves (0x4000_00xx).
    pub kvm_hyperv: bool,
    /// Whether the guest is a TDX guest.
    #[cfg(feature = "tdx")]
    pub tdx: bool,
    /// Whether the AMX feature bits in leaf 0x7 are left visible.
    pub amx: bool,
}
131 
/// x86_64 platform-specific errors.
#[derive(Debug, Error)]
pub enum Error {
    /// Error writing MP table to memory.
    #[error("Error writing MP table to memory: {0}")]
    MpTableSetup(mptable::Error),

    /// Error configuring the general purpose registers
    #[error("Error configuring the general purpose registers: {0}")]
    RegsConfiguration(regs::Error),

    /// Error configuring the special registers
    #[error("Error configuring the special registers: {0}")]
    SregsConfiguration(regs::Error),

    /// Error configuring the floating point related registers
    #[error("Error configuring the floating point related registers: {0}")]
    FpuConfiguration(regs::Error),

    /// Error configuring the MSR registers
    #[error("Error configuring the MSR registers: {0}")]
    MsrsConfiguration(regs::Error),

    /// Failed to set supported CPUs.
    #[error("Failed to set supported CPUs: {0}")]
    SetSupportedCpusFailed(anyhow::Error),

    /// Cannot set the local interruption due to bad configuration.
    #[error("Cannot set the local interruption due to bad configuration: {0}")]
    LocalIntConfiguration(anyhow::Error),

    /// Error setting up SMBIOS table
    #[error("Error setting up SMBIOS table: {0}")]
    SmbiosSetup(smbios::Error),

    /// Could not find any SGX EPC section
    #[error("Could not find any SGX EPC section")]
    NoSgxEpcSection,

    /// Missing SGX CPU feature
    #[error("Missing SGX CPU feature")]
    MissingSgxFeature,

    /// Missing SGX_LC CPU feature
    #[error("Missing SGX_LC CPU feature")]
    MissingSgxLaunchControlFeature,

    /// Error getting supported CPUID through the hypervisor (kvm/mshv) API
    #[error("Error getting supported CPUID through the hypervisor API: {0}")]
    CpuidGetSupported(HypervisorError),

    /// Error populating CPUID with KVM HyperV emulation details
    #[error("Error populating CPUID with KVM HyperV emulation details: {0}")]
    CpuidKvmHyperV(vmm_sys_util::fam::Error),

    /// Error populating CPUID with CPU identification
    #[error("Error populating CPUID with CPU identification: {0}")]
    CpuidIdentification(vmm_sys_util::fam::Error),

    /// Error checking CPUID compatibility
    #[error("Error checking CPUID compatibility")]
    CpuidCheckCompatibility,

    /// Error writing EBDA address
    #[error("Error writing EBDA address: {0}")]
    EbdaSetup(vm_memory::GuestMemoryError),

    /// Error getting CPU TSC frequency
    #[error("Error getting CPU TSC frequency: {0}")]
    GetTscFrequency(HypervisorCpuError),

    /// Error retrieving TDX capabilities through the hypervisor (kvm/mshv) API
    #[cfg(feature = "tdx")]
    #[error("Error retrieving TDX capabilities through the hypervisor API: {0}")]
    TdxCapabilities(HypervisorError),

    /// Failed to configure E820 map for bzImage
    #[error("Failed to configure E820 map for bzImage")]
    E820Configuration,
}
211 
212 impl From<Error> for super::Error {
213     fn from(e: Error) -> super::Error {
214         super::Error::PlatformSpecific(e)
215     }
216 }
217 
/// Computes the x2APIC ID for vCPU `cpu_id`.
///
/// Without a topology the vCPU index is used verbatim. Otherwise the ID is
/// assembled from (thread, core, die, socket) coordinates, each packed into
/// a field just wide enough to encode the respective count.
///
/// `topology` is `(threads_per_core, cores_per_die, dies_per_package)`; all
/// three are assumed non-zero (a zero count would underflow the mask-width
/// computation below).
pub fn get_x2apic_id(cpu_id: u32, topology: Option<(u8, u8, u8)>) -> u32 {
    if let Some((threads_per_core, cores_per_die, dies_per_package)) = topology {
        // Number of bits needed to encode values 0..count-1.
        let thread_mask_width = u8::BITS - (threads_per_core - 1).leading_zeros();
        let core_mask_width = u8::BITS - (cores_per_die - 1).leading_zeros();
        let die_mask_width = u8::BITS - (dies_per_package - 1).leading_zeros();

        // Promote to u32 before multiplying: the former u8 products
        // (e.g. 32 threads * 32 cores = 1024) overflowed and panicked in
        // debug builds / wrapped in release builds.
        let threads = threads_per_core as u32;
        let cores = cores_per_die as u32;
        let dies = dies_per_package as u32;

        let thread_id = cpu_id % threads;
        let core_id = (cpu_id / threads) % cores;
        let die_id = (cpu_id / (threads * cores)) % dies;
        let socket_id = cpu_id / (threads * cores * dies);

        return thread_id
            | (core_id << thread_mask_width)
            | (die_id << (thread_mask_width + core_mask_width))
            | (socket_id << (thread_mask_width + core_mask_width + die_mask_width));
    }

    cpu_id
}
237 
/// Identifies which of the four general-purpose output registers of a
/// CPUID leaf a patch or lookup targets.
#[derive(Copy, Clone, Debug)]
pub enum CpuidReg {
    EAX,
    EBX,
    ECX,
    EDX,
}
245 
/// A set of bits to turn on in one CPUID leaf/subleaf entry.
pub struct CpuidPatch {
    /// CPUID leaf (EAX input) to patch.
    pub function: u32,
    /// CPUID subleaf (ECX input) to patch.
    pub index: u32,
    /// Bit to set in the entry's flags field, if any.
    pub flags_bit: Option<u8>,
    /// Bit to set in EAX, if any.
    pub eax_bit: Option<u8>,
    /// Bit to set in EBX, if any.
    pub ebx_bit: Option<u8>,
    /// Bit to set in ECX, if any.
    pub ecx_bit: Option<u8>,
    /// Bit to set in EDX, if any.
    pub edx_bit: Option<u8>,
}
255 
256 impl CpuidPatch {
257     pub fn get_cpuid_reg(
258         cpuid: &[CpuIdEntry],
259         function: u32,
260         index: Option<u32>,
261         reg: CpuidReg,
262     ) -> Option<u32> {
263         for entry in cpuid.iter() {
264             if entry.function == function && (index.is_none() || index.unwrap() == entry.index) {
265                 return match reg {
266                     CpuidReg::EAX => Some(entry.eax),
267                     CpuidReg::EBX => Some(entry.ebx),
268                     CpuidReg::ECX => Some(entry.ecx),
269                     CpuidReg::EDX => Some(entry.edx),
270                 };
271             }
272         }
273 
274         None
275     }
276 
277     pub fn set_cpuid_reg(
278         cpuid: &mut Vec<CpuIdEntry>,
279         function: u32,
280         index: Option<u32>,
281         reg: CpuidReg,
282         value: u32,
283     ) {
284         let mut entry_found = false;
285         for entry in cpuid.iter_mut() {
286             if entry.function == function && (index.is_none() || index.unwrap() == entry.index) {
287                 entry_found = true;
288                 match reg {
289                     CpuidReg::EAX => {
290                         entry.eax = value;
291                     }
292                     CpuidReg::EBX => {
293                         entry.ebx = value;
294                     }
295                     CpuidReg::ECX => {
296                         entry.ecx = value;
297                     }
298                     CpuidReg::EDX => {
299                         entry.edx = value;
300                     }
301                 }
302             }
303         }
304 
305         if entry_found {
306             return;
307         }
308 
309         // Entry not found, so let's add it.
310         if let Some(index) = index {
311             let mut entry = CpuIdEntry {
312                 function,
313                 index,
314                 flags: CPUID_FLAG_VALID_INDEX,
315                 ..Default::default()
316             };
317             match reg {
318                 CpuidReg::EAX => {
319                     entry.eax = value;
320                 }
321                 CpuidReg::EBX => {
322                     entry.ebx = value;
323                 }
324                 CpuidReg::ECX => {
325                     entry.ecx = value;
326                 }
327                 CpuidReg::EDX => {
328                     entry.edx = value;
329                 }
330             }
331 
332             cpuid.push(entry);
333         }
334     }
335 
336     pub fn patch_cpuid(cpuid: &mut [CpuIdEntry], patches: Vec<CpuidPatch>) {
337         for entry in cpuid {
338             for patch in patches.iter() {
339                 if entry.function == patch.function && entry.index == patch.index {
340                     if let Some(flags_bit) = patch.flags_bit {
341                         entry.flags |= 1 << flags_bit;
342                     }
343                     if let Some(eax_bit) = patch.eax_bit {
344                         entry.eax |= 1 << eax_bit;
345                     }
346                     if let Some(ebx_bit) = patch.ebx_bit {
347                         entry.ebx |= 1 << ebx_bit;
348                     }
349                     if let Some(ecx_bit) = patch.ecx_bit {
350                         entry.ecx |= 1 << ecx_bit;
351                     }
352                     if let Some(edx_bit) = patch.edx_bit {
353                         entry.edx |= 1 << edx_bit;
354                     }
355                 }
356             }
357         }
358     }
359 
360     pub fn is_feature_enabled(
361         cpuid: &[CpuIdEntry],
362         function: u32,
363         index: u32,
364         reg: CpuidReg,
365         feature_bit: usize,
366     ) -> bool {
367         let mask = 1 << feature_bit;
368 
369         for entry in cpuid {
370             if entry.function == function && entry.index == index {
371                 let reg_val = match reg {
372                     CpuidReg::EAX => entry.eax,
373                     CpuidReg::EBX => entry.ebx,
374                     CpuidReg::ECX => entry.ecx,
375                     CpuidReg::EDX => entry.edx,
376                 };
377 
378                 return (reg_val & mask) == mask;
379             }
380         }
381 
382         false
383     }
384 }
385 
/// How a source-VM CPUID feature value is compared against the
/// destination-VM value when checking migration compatibility.
#[derive(Debug)]
enum CpuidCompatibleCheck {
    BitwiseSubset, // bitwise subset
    Equal,         // equal in value
    NumNotGreater, // smaller or equal as a number
}
392 
/// One CPUID register to compare during a compatibility check, together
/// with the comparison rule to apply.
pub struct CpuidFeatureEntry {
    // CPUID leaf (EAX input).
    function: u32,
    // CPUID subleaf (ECX input).
    index: u32,
    // Which output register of the leaf carries the feature bits.
    feature_reg: CpuidReg,
    // Comparison rule applied between source and destination values.
    compatible_check: CpuidCompatibleCheck,
}
399 
400 impl CpuidFeatureEntry {
401     fn checked_feature_entry_list() -> Vec<CpuidFeatureEntry> {
402         vec![
403             // The following list includes all hardware features bits from
404             // the CPUID Wiki Page: https://en.wikipedia.org/wiki/CPUID
405             // Leaf 0x1, ECX/EDX, feature bits
406             CpuidFeatureEntry {
407                 function: 1,
408                 index: 0,
409                 feature_reg: CpuidReg::ECX,
410                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
411             },
412             CpuidFeatureEntry {
413                 function: 1,
414                 index: 0,
415                 feature_reg: CpuidReg::EDX,
416                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
417             },
418             // Leaf 0x7, EAX/EBX/ECX/EDX, extended features
419             CpuidFeatureEntry {
420                 function: 7,
421                 index: 0,
422                 feature_reg: CpuidReg::EAX,
423                 compatible_check: CpuidCompatibleCheck::NumNotGreater,
424             },
425             CpuidFeatureEntry {
426                 function: 7,
427                 index: 0,
428                 feature_reg: CpuidReg::EBX,
429                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
430             },
431             CpuidFeatureEntry {
432                 function: 7,
433                 index: 0,
434                 feature_reg: CpuidReg::ECX,
435                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
436             },
437             CpuidFeatureEntry {
438                 function: 7,
439                 index: 0,
440                 feature_reg: CpuidReg::EDX,
441                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
442             },
443             // Leaf 0x7 subleaf 0x1, EAX, extended features
444             CpuidFeatureEntry {
445                 function: 7,
446                 index: 1,
447                 feature_reg: CpuidReg::EAX,
448                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
449             },
450             // Leaf 0x8000_0001, ECX/EDX, CPUID features bits
451             CpuidFeatureEntry {
452                 function: 0x8000_0001,
453                 index: 0,
454                 feature_reg: CpuidReg::ECX,
455                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
456             },
457             CpuidFeatureEntry {
458                 function: 0x8000_0001,
459                 index: 0,
460                 feature_reg: CpuidReg::EDX,
461                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
462             },
463             // KVM CPUID bits: https://www.kernel.org/doc/html/latest/virt/kvm/cpuid.html
464             // Leaf 0x4000_0000, EAX/EBX/ECX/EDX, KVM CPUID SIGNATURE
465             CpuidFeatureEntry {
466                 function: 0x4000_0000,
467                 index: 0,
468                 feature_reg: CpuidReg::EAX,
469                 compatible_check: CpuidCompatibleCheck::NumNotGreater,
470             },
471             CpuidFeatureEntry {
472                 function: 0x4000_0000,
473                 index: 0,
474                 feature_reg: CpuidReg::EBX,
475                 compatible_check: CpuidCompatibleCheck::Equal,
476             },
477             CpuidFeatureEntry {
478                 function: 0x4000_0000,
479                 index: 0,
480                 feature_reg: CpuidReg::ECX,
481                 compatible_check: CpuidCompatibleCheck::Equal,
482             },
483             CpuidFeatureEntry {
484                 function: 0x4000_0000,
485                 index: 0,
486                 feature_reg: CpuidReg::EDX,
487                 compatible_check: CpuidCompatibleCheck::Equal,
488             },
489             // Leaf 0x4000_0001, EAX/EBX/ECX/EDX, KVM CPUID features
490             CpuidFeatureEntry {
491                 function: 0x4000_0001,
492                 index: 0,
493                 feature_reg: CpuidReg::EAX,
494                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
495             },
496             CpuidFeatureEntry {
497                 function: 0x4000_0001,
498                 index: 0,
499                 feature_reg: CpuidReg::EBX,
500                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
501             },
502             CpuidFeatureEntry {
503                 function: 0x4000_0001,
504                 index: 0,
505                 feature_reg: CpuidReg::ECX,
506                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
507             },
508             CpuidFeatureEntry {
509                 function: 0x4000_0001,
510                 index: 0,
511                 feature_reg: CpuidReg::EDX,
512                 compatible_check: CpuidCompatibleCheck::BitwiseSubset,
513             },
514         ]
515     }
516 
517     fn get_features_from_cpuid(
518         cpuid: &[CpuIdEntry],
519         feature_entry_list: &[CpuidFeatureEntry],
520     ) -> Vec<u32> {
521         let mut features = vec![0; feature_entry_list.len()];
522         for (i, feature_entry) in feature_entry_list.iter().enumerate() {
523             for cpuid_entry in cpuid {
524                 if cpuid_entry.function == feature_entry.function
525                     && cpuid_entry.index == feature_entry.index
526                 {
527                     match feature_entry.feature_reg {
528                         CpuidReg::EAX => {
529                             features[i] = cpuid_entry.eax;
530                         }
531                         CpuidReg::EBX => {
532                             features[i] = cpuid_entry.ebx;
533                         }
534                         CpuidReg::ECX => {
535                             features[i] = cpuid_entry.ecx;
536                         }
537                         CpuidReg::EDX => {
538                             features[i] = cpuid_entry.edx;
539                         }
540                     }
541 
542                     break;
543                 }
544             }
545         }
546 
547         features
548     }
549 
550     // The function returns `Error` (a.k.a. "incompatible"), when the CPUID features from `src_vm_cpuid`
551     // is not a subset of those of the `dest_vm_cpuid`.
552     pub fn check_cpuid_compatibility(
553         src_vm_cpuid: &[CpuIdEntry],
554         dest_vm_cpuid: &[CpuIdEntry],
555     ) -> Result<(), Error> {
556         let feature_entry_list = &Self::checked_feature_entry_list();
557         let src_vm_features = Self::get_features_from_cpuid(src_vm_cpuid, feature_entry_list);
558         let dest_vm_features = Self::get_features_from_cpuid(dest_vm_cpuid, feature_entry_list);
559 
560         // Loop on feature bit and check if the 'source vm' feature is a subset
561         // of those of the 'destination vm' feature
562         let mut compatible = true;
563         for (i, (src_vm_feature, dest_vm_feature)) in src_vm_features
564             .iter()
565             .zip(dest_vm_features.iter())
566             .enumerate()
567         {
568             let entry = &feature_entry_list[i];
569             let entry_compatible = match entry.compatible_check {
570                 CpuidCompatibleCheck::BitwiseSubset => {
571                     let different_feature_bits = src_vm_feature ^ dest_vm_feature;
572                     let src_vm_feature_bits_only = different_feature_bits & src_vm_feature;
573                     src_vm_feature_bits_only == 0
574                 }
575                 CpuidCompatibleCheck::Equal => src_vm_feature == dest_vm_feature,
576                 CpuidCompatibleCheck::NumNotGreater => src_vm_feature <= dest_vm_feature,
577             };
578             if !entry_compatible {
579                 error!(
580                     "Detected incompatible CPUID entry: leaf={:#02x} (subleaf={:#02x}), register='{:?}', \
581                     compatible_check='{:?}', source VM feature='{:#04x}', destination VM feature'{:#04x}'.",
582                     entry.function, entry.index, entry.feature_reg,
583                     entry.compatible_check, src_vm_feature, dest_vm_feature
584                     );
585 
586                 compatible = false;
587             }
588         }
589 
590         if compatible {
591             info!("No CPU incompatibility detected.");
592             Ok(())
593         } else {
594             Err(Error::CpuidCheckCompatibility)
595         }
596     }
597 }
598 
599 pub fn generate_common_cpuid(
600     hypervisor: &Arc<dyn hypervisor::Hypervisor>,
601     config: &CpuidConfig,
602 ) -> super::Result<Vec<CpuIdEntry>> {
603     // SAFETY: cpuid called with valid leaves
604     if unsafe { x86_64::__cpuid(1) }.ecx & (1 << HYPERVISOR_ECX_BIT) == 1 << HYPERVISOR_ECX_BIT {
605         // SAFETY: cpuid called with valid leaves
606         let hypervisor_cpuid = unsafe { x86_64::__cpuid(0x4000_0000) };
607 
608         let mut identifier: [u8; 12] = [0; 12];
609         identifier[0..4].copy_from_slice(&hypervisor_cpuid.ebx.to_le_bytes()[..]);
610         identifier[4..8].copy_from_slice(&hypervisor_cpuid.ecx.to_le_bytes()[..]);
611         identifier[8..12].copy_from_slice(&hypervisor_cpuid.edx.to_le_bytes()[..]);
612 
613         info!(
614             "Running under nested virtualisation. Hypervisor string: {}",
615             String::from_utf8_lossy(&identifier)
616         );
617     }
618 
619     info!(
620         "Generating guest CPUID for with physical address size: {}",
621         config.phys_bits
622     );
623     #[allow(unused_mut)]
624     let mut cpuid_patches = vec![
625         // Patch hypervisor bit
626         CpuidPatch {
627             function: 1,
628             index: 0,
629             flags_bit: None,
630             eax_bit: None,
631             ebx_bit: None,
632             ecx_bit: Some(HYPERVISOR_ECX_BIT),
633             edx_bit: None,
634         },
635         // Enable MTRR feature
636         CpuidPatch {
637             function: 1,
638             index: 0,
639             flags_bit: None,
640             eax_bit: None,
641             ebx_bit: None,
642             ecx_bit: None,
643             edx_bit: Some(MTRR_EDX_BIT),
644         },
645     ];
646 
647     #[cfg(feature = "kvm")]
648     if matches!(
649         hypervisor.hypervisor_type(),
650         hypervisor::HypervisorType::Kvm
651     ) {
652         // Patch tsc deadline timer bit
653         cpuid_patches.push(CpuidPatch {
654             function: 1,
655             index: 0,
656             flags_bit: None,
657             eax_bit: None,
658             ebx_bit: None,
659             ecx_bit: Some(TSC_DEADLINE_TIMER_ECX_BIT),
660             edx_bit: None,
661         });
662     }
663 
664     // Supported CPUID
665     let mut cpuid = hypervisor
666         .get_supported_cpuid()
667         .map_err(Error::CpuidGetSupported)?;
668 
669     CpuidPatch::patch_cpuid(&mut cpuid, cpuid_patches);
670 
671     if let Some(sgx_epc_sections) = &config.sgx_epc_sections {
672         update_cpuid_sgx(&mut cpuid, sgx_epc_sections)?;
673     }
674 
675     #[cfg(feature = "tdx")]
676     let tdx_capabilities = if config.tdx {
677         let caps = hypervisor
678             .tdx_capabilities()
679             .map_err(Error::TdxCapabilities)?;
680         info!("TDX capabilities {:#?}", caps);
681         Some(caps)
682     } else {
683         None
684     };
685 
686     // Update some existing CPUID
687     for entry in cpuid.as_mut_slice().iter_mut() {
688         match entry.function {
689             // Clear AMX related bits if the AMX feature is not enabled
690             0x7 => {
691                 if !config.amx && entry.index == 0 {
692                     entry.edx &= !((1 << AMX_BF16) | (1 << AMX_TILE) | (1 << AMX_INT8))
693                 }
694             }
695             0xd =>
696             {
697                 #[cfg(feature = "tdx")]
698                 if let Some(caps) = &tdx_capabilities {
699                     let xcr0_mask: u64 = 0x82ff;
700                     let xss_mask: u64 = !xcr0_mask;
701                     if entry.index == 0 {
702                         entry.eax &= (caps.xfam_fixed0 as u32) & (xcr0_mask as u32);
703                         entry.eax |= (caps.xfam_fixed1 as u32) & (xcr0_mask as u32);
704                         entry.edx &= ((caps.xfam_fixed0 & xcr0_mask) >> 32) as u32;
705                         entry.edx |= ((caps.xfam_fixed1 & xcr0_mask) >> 32) as u32;
706                     } else if entry.index == 1 {
707                         entry.ecx &= (caps.xfam_fixed0 as u32) & (xss_mask as u32);
708                         entry.ecx |= (caps.xfam_fixed1 as u32) & (xss_mask as u32);
709                         entry.edx &= ((caps.xfam_fixed0 & xss_mask) >> 32) as u32;
710                         entry.edx |= ((caps.xfam_fixed1 & xss_mask) >> 32) as u32;
711                     }
712                 }
713             }
714             // Copy host L1 cache details if not populated by KVM
715             0x8000_0005 => {
716                 if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 {
717                     // SAFETY: cpuid called with valid leaves
718                     if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0005 {
719                         // SAFETY: cpuid called with valid leaves
720                         let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0005) };
721                         entry.eax = leaf.eax;
722                         entry.ebx = leaf.ebx;
723                         entry.ecx = leaf.ecx;
724                         entry.edx = leaf.edx;
725                     }
726                 }
727             }
728             // Copy host L2 cache details if not populated by KVM
729             0x8000_0006 => {
730                 if entry.eax == 0 && entry.ebx == 0 && entry.ecx == 0 && entry.edx == 0 {
731                     // SAFETY: cpuid called with valid leaves
732                     if unsafe { std::arch::x86_64::__cpuid(0x8000_0000).eax } >= 0x8000_0006 {
733                         // SAFETY: cpuid called with valid leaves
734                         let leaf = unsafe { std::arch::x86_64::__cpuid(0x8000_0006) };
735                         entry.eax = leaf.eax;
736                         entry.ebx = leaf.ebx;
737                         entry.ecx = leaf.ecx;
738                         entry.edx = leaf.edx;
739                     }
740                 }
741             }
742             // Set CPU physical bits
743             0x8000_0008 => {
744                 entry.eax = (entry.eax & 0xffff_ff00) | (config.phys_bits as u32 & 0xff);
745             }
746             0x4000_0001 => {
747                 // These features are not supported by TDX
748                 #[cfg(feature = "tdx")]
749                 if config.tdx {
750                     entry.eax &= !((1 << KVM_FEATURE_CLOCKSOURCE_BIT)
751                         | (1 << KVM_FEATURE_CLOCKSOURCE2_BIT)
752                         | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)
753                         | (1 << KVM_FEATURE_ASYNC_PF_BIT)
754                         | (1 << KVM_FEATURE_ASYNC_PF_VMEXIT_BIT)
755                         | (1 << KVM_FEATURE_STEAL_TIME_BIT))
756                 }
757             }
758             _ => {}
759         }
760     }
761 
762     // Copy CPU identification string
763     for i in 0x8000_0002..=0x8000_0004 {
764         cpuid.retain(|c| c.function != i);
765         // SAFETY: call cpuid with valid leaves
766         let leaf = unsafe { std::arch::x86_64::__cpuid(i) };
767         cpuid.push(CpuIdEntry {
768             function: i,
769             eax: leaf.eax,
770             ebx: leaf.ebx,
771             ecx: leaf.ecx,
772             edx: leaf.edx,
773             ..Default::default()
774         });
775     }
776 
777     if config.kvm_hyperv {
778         // Remove conflicting entries
779         cpuid.retain(|c| c.function != 0x4000_0000);
780         cpuid.retain(|c| c.function != 0x4000_0001);
781         // See "Hypervisor Top Level Functional Specification" for details
782         // Compliance with "Hv#1" requires leaves up to 0x4000_000a
783         cpuid.push(CpuIdEntry {
784             function: 0x40000000,
785             eax: 0x4000000a, // Maximum cpuid leaf
786             ebx: 0x756e694c, // "Linu"
787             ecx: 0x564b2078, // "x KV"
788             edx: 0x7648204d, // "M Hv"
789             ..Default::default()
790         });
791         cpuid.push(CpuIdEntry {
792             function: 0x40000001,
793             eax: 0x31237648, // "Hv#1"
794             ..Default::default()
795         });
796         cpuid.push(CpuIdEntry {
797             function: 0x40000002,
798             eax: 0x3839,  // "Build number"
799             ebx: 0xa0000, // "Version"
800             ..Default::default()
801         });
802         cpuid.push(CpuIdEntry {
803             function: 0x4000_0003,
804             eax: (1 << 1) // AccessPartitionReferenceCounter
805                    | (1 << 2) // AccessSynicRegs
806                    | (1 << 3) // AccessSyntheticTimerRegs
807                    | (1 << 9), // AccessPartitionReferenceTsc
808             edx: 1 << 3, // CPU dynamic partitioning
809             ..Default::default()
810         });
811         cpuid.push(CpuIdEntry {
812             function: 0x4000_0004,
813             eax: 1 << 5, // Recommend relaxed timing
814             ..Default::default()
815         });
816         for i in 0x4000_0005..=0x4000_000a {
817             cpuid.push(CpuIdEntry {
818                 function: i,
819                 ..Default::default()
820             });
821         }
822     }
823 
824     Ok(cpuid)
825 }
826 
/// Configures a single vCPU before it is run.
///
/// Applies the per-vCPU CPUID specializations on top of the common set
/// (x2APIC ID in leaves 0xb/0x1f/0x1 and, on AMD, 0x8000_001e), optional
/// topology patching, an optional TSC-frequency leaf (0x4000_0010), then
/// programs MSRs, boot registers/FPU/sregs (when booting) and LINTs.
///
/// # Arguments
///
/// * `vcpu` - Hypervisor vCPU handle to configure.
/// * `id` - vCPU index, used to derive the x2APIC ID.
/// * `boot_setup` - When `Some`, the kernel entry point and guest memory used
///   to initialize registers/sregs (i.e. a fresh boot rather than a restore).
/// * `cpuid` - Common CPUID entries to be specialized for this vCPU.
/// * `kvm_hyperv` - Whether KVM's Hyper-V emulation is enabled.
/// * `cpu_vendor` - Host CPU vendor, used for vendor-specific leaves.
/// * `topology` - Optional (threads per core, cores per die, dies per package).
pub fn configure_vcpu(
    vcpu: &Arc<dyn hypervisor::Vcpu>,
    id: u8,
    boot_setup: Option<(EntryPoint, &GuestMemoryAtomic<GuestMemoryMmap>)>,
    cpuid: Vec<CpuIdEntry>,
    kvm_hyperv: bool,
    cpu_vendor: CpuVendor,
    topology: Option<(u8, u8, u8)>,
) -> super::Result<()> {
    let x2apic_id = get_x2apic_id(id as u32, topology);

    // Per vCPU CPUID changes; common are handled via generate_common_cpuid()
    let mut cpuid = cpuid;
    // Advertise this vCPU's x2APIC ID in the extended topology leaves.
    CpuidPatch::set_cpuid_reg(&mut cpuid, 0xb, None, CpuidReg::EDX, x2apic_id);
    CpuidPatch::set_cpuid_reg(&mut cpuid, 0x1f, None, CpuidReg::EDX, x2apic_id);
    if matches!(cpu_vendor, CpuVendor::AMD) {
        // AMD additionally reports the APIC ID in leaf 0x8000_001e EAX.
        CpuidPatch::set_cpuid_reg(&mut cpuid, 0x8000_001e, Some(0), CpuidReg::EAX, x2apic_id);
    }

    // Set ApicId in cpuid for each vcpu - found in cpuid ebx when eax = 1
    let mut apic_id_patched = false;
    for entry in &mut cpuid {
        if entry.function == 1 {
            // Leaf 0x1 EBX[31:24] carries the initial APIC ID; preserve the
            // low 24 bits and overwrite only the ID byte.
            entry.ebx &= 0xffffff;
            entry.ebx |= x2apic_id << 24;
            apic_id_patched = true;
            break;
        }
    }
    // Every valid CPUID set must contain a leaf-0x1 entry; its absence is a bug.
    assert!(apic_id_patched);

    if let Some(t) = topology {
        update_cpuid_topology(&mut cpuid, t.0, t.1, t.2, cpu_vendor, id);
    }

    // The TSC frequency CPUID leaf should not be included when running with HyperV emulation
    if !kvm_hyperv {
        if let Some(tsc_khz) = vcpu.tsc_khz().map_err(Error::GetTscFrequency)? {
            // Need to check that the TSC doesn't vary with dynamic frequency
            // SAFETY: cpuid called with valid leaves
            if unsafe { std::arch::x86_64::__cpuid(0x8000_0007) }.edx
                & (1u32 << INVARIANT_TSC_EDX_BIT)
                > 0
            {
                // Bump the maximum KVM paravirt leaf so the guest will probe
                // leaf 0x4000_0010, then (re)install that leaf with the
                // measured TSC frequency.
                CpuidPatch::set_cpuid_reg(
                    &mut cpuid,
                    0x4000_0000,
                    None,
                    CpuidReg::EAX,
                    0x4000_0010,
                );
                cpuid.retain(|c| c.function != 0x4000_0010);
                cpuid.push(CpuIdEntry {
                    function: 0x4000_0010,
                    eax: tsc_khz,
                    ebx: 1000000, /* LAPIC resolution of 1ns (freq: 1GHz) is hardcoded in KVM's
                                   * APIC_BUS_CYCLE_NS */
                    ..Default::default()
                });
            };
        }
    }

    // Log the final per-vCPU CPUID table for debugging purposes.
    for c in &cpuid {
        info!("{}", c);
    }

    vcpu.set_cpuid2(&cpuid)
        .map_err(|e| Error::SetSupportedCpusFailed(e.into()))?;

    if kvm_hyperv {
        // NOTE(review): this unwrap panics if the hypervisor rejects SynIC
        // enablement; consider propagating a dedicated error instead — confirm
        // whether a suitable Error variant exists.
        vcpu.enable_hyperv_synic().unwrap();
    }

    regs::setup_msrs(vcpu).map_err(Error::MsrsConfiguration)?;
    if let Some((kernel_entry_point, guest_memory)) = boot_setup {
        regs::setup_regs(vcpu, kernel_entry_point).map_err(Error::RegsConfiguration)?;
        regs::setup_fpu(vcpu).map_err(Error::FpuConfiguration)?;
        regs::setup_sregs(&guest_memory.memory(), vcpu).map_err(Error::SregsConfiguration)?;
    }
    interrupts::set_lint(vcpu).map_err(|e| Error::LocalIntConfiguration(e.into()))?;
    Ok(())
}
910 
911 /// Returns a Vec of the valid memory addresses.
912 ///
913 /// These should be used to configure the GuestMemory structure for the platform.
914 /// For x86_64 all addresses are valid from the start of the kernel except a
915 /// carve out at the end of 32bit address space.
916 pub fn arch_memory_regions() -> Vec<(GuestAddress, usize, RegionType)> {
917     vec![
918         // 0 GiB ~ 3GiB: memory before the gap
919         (
920             GuestAddress(0),
921             layout::MEM_32BIT_RESERVED_START.raw_value() as usize,
922             RegionType::Ram,
923         ),
924         // 4 GiB ~ inf: memory after the gap
925         (layout::RAM_64BIT_START, usize::MAX, RegionType::Ram),
926         // 3 GiB ~ 3712 MiB: 32-bit device memory hole
927         (
928             layout::MEM_32BIT_RESERVED_START,
929             layout::MEM_32BIT_DEVICES_SIZE as usize,
930             RegionType::SubRegion,
931         ),
932         // 3712 MiB ~ 3968 MiB: 32-bit reserved memory hole
933         (
934             layout::MEM_32BIT_RESERVED_START.unchecked_add(layout::MEM_32BIT_DEVICES_SIZE),
935             (layout::MEM_32BIT_RESERVED_SIZE - layout::MEM_32BIT_DEVICES_SIZE) as usize,
936             RegionType::Reserved,
937         ),
938     ]
939 }
940 
941 /// Configures the system and should be called once per vm before starting vcpu threads.
942 ///
943 /// # Arguments
944 ///
945 /// * `guest_mem` - The memory to be used by the guest.
946 /// * `cmdline_addr` - Address in `guest_mem` where the kernel command line was loaded.
947 /// * `cmdline_size` - Size of the kernel command line in bytes including the null terminator.
948 /// * `num_cpus` - Number of virtual CPUs the guest will have.
949 #[allow(clippy::too_many_arguments)]
950 pub fn configure_system(
951     guest_mem: &GuestMemoryMmap,
952     cmdline_addr: GuestAddress,
953     cmdline_size: usize,
954     initramfs: &Option<InitramfsConfig>,
955     _num_cpus: u8,
956     setup_header: Option<setup_header>,
957     rsdp_addr: Option<GuestAddress>,
958     sgx_epc_region: Option<SgxEpcRegion>,
959     serial_number: Option<&str>,
960     uuid: Option<&str>,
961     oem_strings: Option<&[&str]>,
962     topology: Option<(u8, u8, u8)>,
963 ) -> super::Result<()> {
964     // Write EBDA address to location where ACPICA expects to find it
965     guest_mem
966         .write_obj((layout::EBDA_START.0 >> 4) as u16, layout::EBDA_POINTER)
967         .map_err(Error::EbdaSetup)?;
968 
969     let size = smbios::setup_smbios(guest_mem, serial_number, uuid, oem_strings)
970         .map_err(Error::SmbiosSetup)?;
971 
972     // Place the MP table after the SMIOS table aligned to 16 bytes
973     let offset = GuestAddress(layout::SMBIOS_START).unchecked_add(size);
974     let offset = GuestAddress((offset.0 + 16) & !0xf);
975     mptable::setup_mptable(offset, guest_mem, _num_cpus, topology).map_err(Error::MpTableSetup)?;
976 
977     // Check that the RAM is not smaller than the RSDP start address
978     if let Some(rsdp_addr) = rsdp_addr {
979         if rsdp_addr.0 > guest_mem.last_addr().0 {
980             return Err(super::Error::RsdpPastRamEnd);
981         }
982     }
983 
984     match setup_header {
985         Some(hdr) => configure_32bit_entry(
986             guest_mem,
987             cmdline_addr,
988             cmdline_size,
989             initramfs,
990             hdr,
991             rsdp_addr,
992             sgx_epc_region,
993         ),
994         None => configure_pvh(
995             guest_mem,
996             cmdline_addr,
997             initramfs,
998             rsdp_addr,
999             sgx_epc_region,
1000         ),
1001     }
1002 }
1003 
/// A usable guest physical RAM range as a (start, end) pair, `end` exclusive.
type RamRange = (u64, u64);

/// Returns usable physical memory ranges for the guest
/// These should be used to create e820_RAM memory maps
pub fn generate_ram_ranges(guest_mem: &GuestMemoryMmap) -> super::Result<Vec<RamRange>> {
    // Merge continuous memory regions into one region.
    // Note: memory regions from "GuestMemory" are sorted and non-zero sized.
    let ram_regions = {
        let mut ram_regions = Vec::new();
        let mut current_start = guest_mem
            .iter()
            .next()
            .map(GuestMemoryRegion::start_addr)
            .expect("GuestMemory must have one memory region at least")
            .raw_value();
        let mut current_end = current_start;

        for (start, size) in guest_mem
            .iter()
            .map(|m| (m.start_addr().raw_value(), m.len()))
        {
            if current_end == start {
                // This zone is continuous with the previous one.
                current_end += size;
            } else {
                // Gap found: close the current merged range and start a new one.
                ram_regions.push((current_start, current_end));

                current_start = start;
                current_end = start + size;
            }
        }

        // Flush the last merged range (the loop only pushes on a gap).
        ram_regions.push((current_start, current_end));

        ram_regions
    };

    // Create the memory map entry for memory region before the gap
    let mut ram_ranges = vec![];

    // Generate the first usable physical memory range before the gap. The e820 map
    // should only report memory above 1MiB.
    let first_ram_range = {
        let (first_region_start, first_region_end) =
            ram_regions.first().ok_or(super::Error::MemmapTableSetup)?;
        let high_ram_start = layout::HIGH_RAM_START.raw_value();
        let mem_32bit_reserved_start = layout::MEM_32BIT_RESERVED_START.raw_value();

        // The first region must span HIGH_RAM_START and stay below the
        // 32-bit reserved hole. (Comparisons are &u64 vs &u64, which Ord
        // dereferences transparently.)
        if !((first_region_start <= &high_ram_start)
            && (first_region_end > &high_ram_start)
            && (first_region_end <= &mem_32bit_reserved_start))
        {
            error!(
                "Unexpected first memory region layout: (start: 0x{:08x}, end: 0x{:08x}).
                high_ram_start: 0x{:08x}, mem_32bit_reserved_start: 0x{:08x}",
                first_region_start, first_region_end, high_ram_start, mem_32bit_reserved_start
            );

            return Err(super::Error::MemmapTableSetup);
        }

        info!(
            "first usable physical memory range, start: 0x{:08x}, end: 0x{:08x}",
            high_ram_start, first_region_end
        );

        // Clamp the start to HIGH_RAM_START so memory below 1 MiB is excluded.
        (high_ram_start, *first_region_end)
    };
    ram_ranges.push(first_ram_range);

    // Generate additional usable physical memory range after the gap if any.
    for ram_region in ram_regions.iter().skip(1) {
        info!(
            "found usable physical memory range, start: 0x{:08x}, end: 0x{:08x}",
            ram_region.0, ram_region.1
        );

        ram_ranges.push(*ram_region);
    }

    Ok(ram_ranges)
}
1086 
/// Sets up the PVH boot path: builds the `hvm_start_info` structure, the
/// module list (initramfs, if any) and the guest memory map, and writes them
/// into guest memory at their layout-defined addresses.
fn configure_pvh(
    guest_mem: &GuestMemoryMmap,
    cmdline_addr: GuestAddress,
    initramfs: &Option<InitramfsConfig>,
    rsdp_addr: Option<GuestAddress>,
    sgx_epc_region: Option<SgxEpcRegion>,
) -> super::Result<()> {
    // Magic value ("xEn3" with the 0x80 bit of the "E" set) identifying the
    // structure as a valid hvm_start_info to the guest.
    const XEN_HVM_START_MAGIC_VALUE: u32 = 0x336ec578;

    let mut start_info = hvm_start_info {
        magic: XEN_HVM_START_MAGIC_VALUE,
        version: 1, // pvh has version 1
        nr_modules: 0,
        cmdline_paddr: cmdline_addr.raw_value(),
        memmap_paddr: layout::MEMMAP_START.raw_value(),
        ..Default::default()
    };

    if let Some(rsdp_addr) = rsdp_addr {
        start_info.rsdp_paddr = rsdp_addr.0;
    }

    if let Some(initramfs_config) = initramfs {
        // The initramfs has been written to guest memory already, here we just need to
        // create the module structure that describes it.
        let ramdisk_mod = hvm_modlist_entry {
            paddr: initramfs_config.address.raw_value(),
            size: initramfs_config.size as u64,
            ..Default::default()
        };

        start_info.nr_modules += 1;
        start_info.modlist_paddr = layout::MODLIST_START.raw_value();

        // Write the modlist struct to guest memory.
        guest_mem
            .write_obj(ramdisk_mod, layout::MODLIST_START)
            .map_err(super::Error::ModlistSetup)?;
    }

    // Vector to hold the memory maps which needs to be written to guest memory
    // at MEMMAP_START after all of the mappings are recorded.
    let mut memmap: Vec<hvm_memmap_table_entry> = Vec::new();

    // Create the memory map entries.
    // First entry: RAM below the EBDA (the region the firmware data lives above).
    add_memmap_entry(&mut memmap, 0, layout::EBDA_START.raw_value(), E820_RAM);

    // Get usable physical memory ranges
    let ram_ranges = generate_ram_ranges(guest_mem)?;

    // Create e820 memory map entries
    for ram_range in ram_ranges {
        info!(
            "create_memmap_entry, start: 0x{:08x}, end: 0x{:08x}",
            ram_range.0, ram_range.1
        );
        add_memmap_entry(
            &mut memmap,
            ram_range.0,
            ram_range.1 - ram_range.0,
            E820_RAM,
        );
    }

    // PCI MMCONFIG (ECAM) space must be reported as reserved.
    add_memmap_entry(
        &mut memmap,
        layout::PCI_MMCONFIG_START.0,
        layout::PCI_MMCONFIG_SIZE,
        E820_RESERVED,
    );

    if let Some(sgx_epc_region) = sgx_epc_region {
        add_memmap_entry(
            &mut memmap,
            sgx_epc_region.start().raw_value(),
            sgx_epc_region.size(),
            E820_RESERVED,
        );
    }

    start_info.memmap_entries = memmap.len() as u32;

    // Copy the vector with the memmap table to the MEMMAP_START address
    // which is already saved in the memmap_paddr field of hvm_start_info struct.
    let mut memmap_start_addr = layout::MEMMAP_START;

    // Verify the entire table fits within guest memory before writing any entry.
    guest_mem
        .checked_offset(
            memmap_start_addr,
            mem::size_of::<hvm_memmap_table_entry>() * start_info.memmap_entries as usize,
        )
        .ok_or(super::Error::MemmapTablePastRamEnd)?;

    // For every entry in the memmap vector, write it to guest memory.
    for memmap_entry in memmap {
        guest_mem
            .write_obj(memmap_entry, memmap_start_addr)
            .map_err(|_| super::Error::MemmapTableSetup)?;
        memmap_start_addr =
            memmap_start_addr.unchecked_add(mem::size_of::<hvm_memmap_table_entry>() as u64);
    }

    // The hvm_start_info struct itself must be stored at PVH_START_INFO
    // address, and %rbx will be initialized to contain PVH_INFO_START prior to
    // starting the guest, as required by the PVH ABI.
    let start_info_addr = layout::PVH_INFO_START;

    guest_mem
        .checked_offset(start_info_addr, mem::size_of::<hvm_start_info>())
        .ok_or(super::Error::StartInfoPastRamEnd)?;

    // Write the start_info struct to guest memory.
    guest_mem
        .write_obj(start_info, start_info_addr)
        .map_err(|_| super::Error::StartInfoSetup)?;

    Ok(())
}
1205 
/// Sets up the Linux 32-bit boot protocol path: fills a `boot_params`
/// ("zero page") structure from the provided setup header, builds the e820
/// map, and writes the zero page into guest memory.
fn configure_32bit_entry(
    guest_mem: &GuestMemoryMmap,
    cmdline_addr: GuestAddress,
    cmdline_size: usize,
    initramfs: &Option<InitramfsConfig>,
    setup_hdr: setup_header,
    rsdp_addr: Option<GuestAddress>,
    sgx_epc_region: Option<SgxEpcRegion>,
) -> super::Result<()> {
    // Boot-protocol loader type "undefined/other" — used when the image did
    // not already record a loader id.
    const KERNEL_LOADER_OTHER: u8 = 0xff;

    // Use the provided setup header
    let mut params = boot_params {
        hdr: setup_hdr,
        ..Default::default()
    };

    // Common bootparams settings
    if params.hdr.type_of_loader == 0 {
        params.hdr.type_of_loader = KERNEL_LOADER_OTHER;
    }
    params.hdr.cmd_line_ptr = cmdline_addr.raw_value() as u32;
    params.hdr.cmdline_size = cmdline_size as u32;

    if let Some(initramfs_config) = initramfs {
        params.hdr.ramdisk_image = initramfs_config.address.raw_value() as u32;
        params.hdr.ramdisk_size = initramfs_config.size as u32;
    }

    // RAM below the EBDA.
    add_e820_entry(&mut params, 0, layout::EBDA_START.raw_value(), E820_RAM)?;

    let mem_end = guest_mem.last_addr();
    if mem_end < layout::MEM_32BIT_RESERVED_START {
        // All RAM fits below the 32-bit hole: one entry from HIGH_RAM_START
        // to the last byte of memory (+1 because last_addr is inclusive).
        add_e820_entry(
            &mut params,
            layout::HIGH_RAM_START.raw_value(),
            mem_end.unchecked_offset_from(layout::HIGH_RAM_START) + 1,
            E820_RAM,
        )?;
    } else {
        // RAM straddles the hole: report the low chunk up to the hole start...
        add_e820_entry(
            &mut params,
            layout::HIGH_RAM_START.raw_value(),
            layout::MEM_32BIT_RESERVED_START.unchecked_offset_from(layout::HIGH_RAM_START),
            E820_RAM,
        )?;
        // ...and, if present, the chunk above 4 GiB.
        if mem_end > layout::RAM_64BIT_START {
            add_e820_entry(
                &mut params,
                layout::RAM_64BIT_START.raw_value(),
                mem_end.unchecked_offset_from(layout::RAM_64BIT_START) + 1,
                E820_RAM,
            )?;
        }
    }

    // PCI MMCONFIG (ECAM) space must be reported as reserved.
    add_e820_entry(
        &mut params,
        layout::PCI_MMCONFIG_START.0,
        layout::PCI_MMCONFIG_SIZE,
        E820_RESERVED,
    )?;

    if let Some(sgx_epc_region) = sgx_epc_region {
        add_e820_entry(
            &mut params,
            sgx_epc_region.start().raw_value(),
            sgx_epc_region.size(),
            E820_RESERVED,
        )?;
    }

    if let Some(rsdp_addr) = rsdp_addr {
        params.acpi_rsdp_addr = rsdp_addr.0;
    }

    // Verify the zero page fits in guest memory, then write it.
    let zero_page_addr = layout::ZERO_PAGE_START;
    guest_mem
        .checked_offset(zero_page_addr, mem::size_of::<boot_params>())
        .ok_or(super::Error::ZeroPagePastRamEnd)?;
    guest_mem
        .write_obj(params, zero_page_addr)
        .map_err(super::Error::ZeroPageSetup)?;

    Ok(())
}
1292 
1293 /// Add an e820 region to the e820 map.
1294 /// Returns Ok(()) if successful, or an error if there is no space left in the map.
1295 fn add_e820_entry(
1296     params: &mut boot_params,
1297     addr: u64,
1298     size: u64,
1299     mem_type: u32,
1300 ) -> Result<(), Error> {
1301     if params.e820_entries >= params.e820_table.len() as u8 {
1302         return Err(Error::E820Configuration);
1303     }
1304 
1305     params.e820_table[params.e820_entries as usize].addr = addr;
1306     params.e820_table[params.e820_entries as usize].size = size;
1307     params.e820_table[params.e820_entries as usize].type_ = mem_type;
1308     params.e820_entries += 1;
1309 
1310     Ok(())
1311 }
1312 
1313 fn add_memmap_entry(memmap: &mut Vec<hvm_memmap_table_entry>, addr: u64, size: u64, mem_type: u32) {
1314     // Add the table entry to the vector
1315     memmap.push(hvm_memmap_table_entry {
1316         addr,
1317         size,
1318         type_: mem_type,
1319         reserved: 0,
1320     });
1321 }
1322 
1323 /// Returns the memory address where the initramfs could be loaded.
1324 pub fn initramfs_load_addr(
1325     guest_mem: &GuestMemoryMmap,
1326     initramfs_size: usize,
1327 ) -> super::Result<u64> {
1328     let first_region = guest_mem
1329         .find_region(GuestAddress::new(0))
1330         .ok_or(super::Error::InitramfsAddress)?;
1331     // It's safe to cast to usize because the size of a region can't be greater than usize.
1332     let lowmem_size = first_region.len() as usize;
1333 
1334     if lowmem_size < initramfs_size {
1335         return Err(super::Error::InitramfsAddress);
1336     }
1337 
1338     let aligned_addr: u64 = ((lowmem_size - initramfs_size) & !(crate::pagesize() - 1)) as u64;
1339     Ok(aligned_addr)
1340 }
1341 
1342 pub fn get_host_cpu_phys_bits(hypervisor: &Arc<dyn hypervisor::Hypervisor>) -> u8 {
1343     // SAFETY: call cpuid with valid leaves
1344     unsafe {
1345         let leaf = x86_64::__cpuid(0x8000_0000);
1346 
1347         // Detect and handle AMD SME (Secure Memory Encryption) properly.
1348         // Some physical address bits may become reserved when the feature is enabled.
1349         // See AMD64 Architecture Programmer's Manual Volume 2, Section 7.10.1
1350         let reduced = if leaf.eax >= 0x8000_001f
1351             && matches!(hypervisor.get_cpu_vendor(), CpuVendor::AMD)
1352             && x86_64::__cpuid(0x8000_001f).eax & 0x1 != 0
1353         {
1354             (x86_64::__cpuid(0x8000_001f).ebx >> 6) & 0x3f
1355         } else {
1356             0
1357         };
1358 
1359         if leaf.eax >= 0x8000_0008 {
1360             let leaf = x86_64::__cpuid(0x8000_0008);
1361             ((leaf.eax & 0xff) - reduced) as u8
1362         } else {
1363             36
1364         }
1365     }
1366 }
1367 
/// Patches the CPUID topology leaves (0x1, 0xb, 0x1f and, on AMD,
/// 0x8000_001e/0x8000_0008) so the guest sees the requested
/// threads/cores/dies layout for the vCPU with index `id`.
fn update_cpuid_topology(
    cpuid: &mut Vec<CpuIdEntry>,
    threads_per_core: u8,
    cores_per_die: u8,
    dies_per_package: u8,
    cpu_vendor: CpuVendor,
    id: u8,
) {
    let x2apic_id = get_x2apic_id(
        id as u32,
        Some((threads_per_core, cores_per_die, dies_per_package)),
    );

    // Bit widths of each topology level's ID field: ceil(log2(count)),
    // computed as 8 - leading_zeros(count - 1) on the u8 counts. Each width
    // is cumulative (includes the levels below it).
    let thread_width = 8 - (threads_per_core - 1).leading_zeros();
    let core_width = (8 - (cores_per_die - 1).leading_zeros()) + thread_width;
    let die_width = (8 - (dies_per_package - 1).leading_zeros()) + core_width;

    // NOTE(review): this masks the logical-CPU product with (0xff << 16)
    // *without shifting the product first*, so for any realistic count
    // (< 65536) the OR is a no-op and leaf 0x1 EBX[23:16] stays unchanged.
    // The likely intent is ((product & 0xff) << 16) — confirm upstream.
    let mut cpu_ebx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX).unwrap_or(0);
    cpu_ebx |= ((dies_per_package as u32) * (cores_per_die as u32) * (threads_per_core as u32))
        & (0xff << 16);
    CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EBX, cpu_ebx);

    // Set leaf 0x1 EDX bit 28 (HTT), indicating the EBX[23:16] count is valid.
    let mut cpu_edx = CpuidPatch::get_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX).unwrap_or(0);
    cpu_edx |= 1 << 28;
    CpuidPatch::set_cpuid_reg(cpuid, 0x1, None, CpuidReg::EDX, cpu_edx);

    // CPU Topology leaf 0xb
    // Subleaf 0 (SMT level): shift width, logical processors per core,
    // level type 1 (SMT) in ECX[15:8].
    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::EAX, thread_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0xb,
        Some(0),
        CpuidReg::EBX,
        u32::from(threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(0), CpuidReg::ECX, 1 << 8);

    // Subleaf 1 (core level): full width, total logical processors,
    // level type 2 (core) in ECX[15:8].
    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::EAX, die_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0xb,
        Some(1),
        CpuidReg::EBX,
        u32::from(dies_per_package * cores_per_die * threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0xb, Some(1), CpuidReg::ECX, 2 << 8);

    // CPU Topology leaf 0x1f
    // Same as 0xb but with an extra die level (level type 5 in ECX[15:8]).
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::EAX, thread_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0x1f,
        Some(0),
        CpuidReg::EBX,
        u32::from(threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(0), CpuidReg::ECX, 1 << 8);

    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::EAX, core_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0x1f,
        Some(1),
        CpuidReg::EBX,
        u32::from(cores_per_die * threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(1), CpuidReg::ECX, 2 << 8);

    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::EAX, die_width);
    CpuidPatch::set_cpuid_reg(
        cpuid,
        0x1f,
        Some(2),
        CpuidReg::EBX,
        u32::from(dies_per_package * cores_per_die * threads_per_core),
    );
    CpuidPatch::set_cpuid_reg(cpuid, 0x1f, Some(2), CpuidReg::ECX, 5 << 8);

    if matches!(cpu_vendor, CpuVendor::AMD) {
        // Leaf 0x8000_001e: EBX = threads-per-core-1 in [15:8], core id in [7:0];
        // ECX = nodes-per-processor-1 in [10:8], node id in [7:0].
        CpuidPatch::set_cpuid_reg(
            cpuid,
            0x8000_001e,
            Some(0),
            CpuidReg::EBX,
            ((threads_per_core as u32 - 1) << 8) | (x2apic_id & 0xff),
        );
        CpuidPatch::set_cpuid_reg(
            cpuid,
            0x8000_001e,
            Some(0),
            CpuidReg::ECX,
            ((dies_per_package as u32 - 1) << 8) | (thread_width + die_width) & 0xff,
        );
        CpuidPatch::set_cpuid_reg(cpuid, 0x8000_001e, Some(0), CpuidReg::EDX, 0);
        if cores_per_die * threads_per_core > 1 {
            // Multi-core/threaded package: advertise CmpLegacy (bit 1) and
            // TopologyExtensions (bit 22) in leaf 0x8000_0001 ECX.
            let ecx =
                CpuidPatch::get_cpuid_reg(cpuid, 0x8000_0001, Some(0), CpuidReg::ECX).unwrap_or(0);
            CpuidPatch::set_cpuid_reg(
                cpuid,
                0x8000_0001,
                Some(0),
                CpuidReg::ECX,
                ecx | (1u32 << 1) | (1u32 << 22),
            );
            // Leaf 0x1 EBX: APIC id [31:24], CLFLUSH line size 8 [15:8],
            // logical processor count [23:16].
            CpuidPatch::set_cpuid_reg(
                cpuid,
                0x0000_0001,
                Some(0),
                CpuidReg::EBX,
                (x2apic_id << 24) | (8 << 8) | (((cores_per_die * threads_per_core) as u32) << 16),
            );
            // Set EDX bit 28 of leaf 0x1: this is the HTT (multi-threading)
            // flag, NOT the TSC-deadline-timer bit (that one is ECX bit 24) —
            // the previous comment here was a copy-paste error.
            let cpuid_patches = vec![
                // Patch the HTT flag (leaf 0x1 EDX bit 28)
                CpuidPatch {
                    function: 1,
                    index: 0,
                    flags_bit: None,
                    eax_bit: None,
                    ebx_bit: None,
                    ecx_bit: None,
                    edx_bit: Some(28),
                },
            ];
            CpuidPatch::patch_cpuid(cpuid, cpuid_patches);
            // Leaf 0x8000_0008 ECX: ApicIdSize [15:12], threads-per-package-1 [7:0].
            CpuidPatch::set_cpuid_reg(
                cpuid,
                0x8000_0008,
                Some(0),
                CpuidReg::ECX,
                ((thread_width + core_width + die_width) << 12)
                    | ((cores_per_die * threads_per_core) - 1) as u32,
            );
        } else {
            CpuidPatch::set_cpuid_reg(cpuid, 0x8000_0008, Some(0), CpuidReg::ECX, 0u32);
        }
    }
}
1505 
1506 // The goal is to update the CPUID sub-leaves to reflect the number of EPC
1507 // sections exposed to the guest.
1508 fn update_cpuid_sgx(
1509     cpuid: &mut Vec<CpuIdEntry>,
1510     epc_sections: &[SgxEpcSection],
1511 ) -> Result<(), Error> {
1512     // Something's wrong if there's no EPC section.
1513     if epc_sections.is_empty() {
1514         return Err(Error::NoSgxEpcSection);
1515     }
1516     // We can't go further if the hypervisor does not support SGX feature.
1517     if !CpuidPatch::is_feature_enabled(cpuid, 0x7, 0, CpuidReg::EBX, 2) {
1518         return Err(Error::MissingSgxFeature);
1519     }
1520     // We can't go further if the hypervisor does not support SGX_LC feature.
1521     if !CpuidPatch::is_feature_enabled(cpuid, 0x7, 0, CpuidReg::ECX, 30) {
1522         return Err(Error::MissingSgxLaunchControlFeature);
1523     }
1524 
1525     // Get host CPUID for leaf 0x12, subleaf 0x2. This is to retrieve EPC
1526     // properties such as confidentiality and integrity.
1527     // SAFETY: call cpuid with valid leaves
1528     let leaf = unsafe { std::arch::x86_64::__cpuid_count(0x12, 0x2) };
1529 
1530     for (i, epc_section) in epc_sections.iter().enumerate() {
1531         let subleaf_idx = i + 2;
1532         let start = epc_section.start().raw_value();
1533         let size = epc_section.size();
1534         let eax = (start & 0xffff_f000) as u32 | 0x1;
1535         let ebx = (start >> 32) as u32;
1536         let ecx = (size & 0xffff_f000) as u32 | (leaf.ecx & 0xf);
1537         let edx = (size >> 32) as u32;
1538         // CPU Topology leaf 0x12
1539         CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EAX, eax);
1540         CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EBX, ebx);
1541         CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::ECX, ecx);
1542         CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EDX, edx);
1543     }
1544 
1545     // Add one NULL entry to terminate the dynamic list
1546     let subleaf_idx = epc_sections.len() + 2;
1547     // CPU Topology leaf 0x12
1548     CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EAX, 0);
1549     CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EBX, 0);
1550     CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::ECX, 0);
1551     CpuidPatch::set_cpuid_reg(cpuid, 0x12, Some(subleaf_idx as u32), CpuidReg::EDX, 0);
1552 
1553     Ok(())
1554 }
1555 
1556 #[cfg(test)]
1557 mod tests {
1558     use linux_loader::loader::bootparam::boot_e820_entry;
1559 
1560     use super::*;
1561 
    // Verify arch_memory_regions() returns the four expected regions and that
    // low RAM starts at 0 while high RAM starts at 4 GiB.
    #[test]
    fn regions_base_addr() {
        let regions = arch_memory_regions();
        assert_eq!(4, regions.len());
        assert_eq!(GuestAddress(0), regions[0].0);
        assert_eq!(GuestAddress(1 << 32), regions[1].0);
    }
1569 
    // Exercise configure_system() with: (a) too little RAM (must fail),
    // (b) RAM only below the 32-bit hole, and (c) RAM both below and above
    // the hole.
    #[test]
    fn test_system_configuration() {
        let no_vcpus = 4;
        // 64 KiB of RAM is far too small to hold all boot structures.
        let gm = GuestMemoryMmap::from_ranges(&[(GuestAddress(0), 0x10000)]).unwrap();
        let config_err = configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            1,
            None,
            Some(layout::RSDP_POINTER),
            None,
            None,
            None,
            None,
            None,
        );
        config_err.unwrap_err();

        // Now assigning some memory that falls before the 32bit memory hole.
        // Drop the unbounded above-4GiB region (size usize::MAX).
        let arch_mem_regions = arch_memory_regions();
        let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
            .iter()
            .filter(|r| r.2 == RegionType::Ram && r.1 != usize::MAX)
            .map(|r| (r.0, r.1))
            .collect();
        let gm = GuestMemoryMmap::from_ranges(&ram_regions).unwrap();

        configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            no_vcpus,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();

        // Now assigning some memory that falls after the 32bit memory hole.
        // Replace the unbounded region's size with a concrete 128 MiB.
        let arch_mem_regions = arch_memory_regions();
        let ram_regions: Vec<(GuestAddress, usize)> = arch_mem_regions
            .iter()
            .filter(|r| r.2 == RegionType::Ram)
            .map(|r| {
                if r.1 == usize::MAX {
                    (r.0, 128 << 20)
                } else {
                    (r.0, r.1)
                }
            })
            .collect();
        let gm = GuestMemoryMmap::from_ranges(&ram_regions).unwrap();
        configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            no_vcpus,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();

        // NOTE(review): this call is an exact duplicate of the previous one —
        // it may have been intended to use a different memory layout; confirm
        // and either differentiate or remove it.
        configure_system(
            &gm,
            GuestAddress(0),
            0,
            &None,
            no_vcpus,
            None,
            None,
            None,
            None,
            None,
            None,
            None,
        )
        .unwrap();
    }
1661 
    // Verify add_e820_entry() writes the entry and bumps the counter, and
    // that it errors out once the table is full.
    #[test]
    fn test_add_e820_entry() {
        let e820_table = [(boot_e820_entry {
            addr: 0x1,
            size: 4,
            type_: 1,
        }); 128];

        let expected_params = boot_params {
            e820_table,
            e820_entries: 1,
            ..Default::default()
        };

        let mut params: boot_params = Default::default();
        add_e820_entry(
            &mut params,
            e820_table[0].addr,
            e820_table[0].size,
            e820_table[0].type_,
        )
        .unwrap();
        // boot_e820_entry doesn't implement PartialEq, so compare Debug output.
        assert_eq!(
            format!("{:?}", params.e820_table[0]),
            format!("{:?}", expected_params.e820_table[0])
        );
        assert_eq!(params.e820_entries, expected_params.e820_entries);

        // Exercise the scenario where the field storing the length of the e820
        // entry table is bigger than the allocated memory.
        params.e820_entries = params.e820_table.len() as u8 + 1;
        add_e820_entry(
            &mut params,
            e820_table[0].addr,
            e820_table[0].size,
            e820_table[0].type_,
        )
        .unwrap_err();
    }
1701 
1702     #[test]
1703     fn test_add_memmap_entry() {
1704         let mut memmap: Vec<hvm_memmap_table_entry> = Vec::new();
1705 
1706         let expected_memmap = vec![
1707             hvm_memmap_table_entry {
1708                 addr: 0x0,
1709                 size: 0x1000,
1710                 type_: E820_RAM,
1711                 ..Default::default()
1712             },
1713             hvm_memmap_table_entry {
1714                 addr: 0x10000,
1715                 size: 0xa000,
1716                 type_: E820_RESERVED,
1717                 ..Default::default()
1718             },
1719         ];
1720 
1721         add_memmap_entry(&mut memmap, 0, 0x1000, E820_RAM);
1722         add_memmap_entry(&mut memmap, 0x10000, 0xa000, E820_RESERVED);
1723 
1724         assert_eq!(format!("{memmap:?}"), format!("{expected_memmap:?}"));
1725     }
1726 
1727     #[test]
1728     fn test_get_x2apic_id() {
1729         let x2apic_id = get_x2apic_id(0, Some((2, 3, 1)));
1730         assert_eq!(x2apic_id, 0);
1731 
1732         let x2apic_id = get_x2apic_id(1, Some((2, 3, 1)));
1733         assert_eq!(x2apic_id, 1);
1734 
1735         let x2apic_id = get_x2apic_id(2, Some((2, 3, 1)));
1736         assert_eq!(x2apic_id, 2);
1737 
1738         let x2apic_id = get_x2apic_id(6, Some((2, 3, 1)));
1739         assert_eq!(x2apic_id, 8);
1740 
1741         let x2apic_id = get_x2apic_id(7, Some((2, 3, 1)));
1742         assert_eq!(x2apic_id, 9);
1743 
1744         let x2apic_id = get_x2apic_id(8, Some((2, 3, 1)));
1745         assert_eq!(x2apic_id, 10);
1746     }
1747 }
1748