// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
//
// Copyright © 2020, Microsoft Corporation
//
// Copyright 2018-2019 CrowdStrike, Inc.
//
//

#[cfg(target_arch = "aarch64")]
use crate::aarch64::gic::KvmGicV3Its;
#[cfg(target_arch = "aarch64")]
pub use crate::aarch64::{
    check_required_kvm_extensions, gic::Gicv3ItsState as GicState, is_system_register, VcpuInit,
    VcpuKvmState,
};
#[cfg(target_arch = "aarch64")]
use crate::arch::aarch64::gic::{Vgic, VgicConfig};
use crate::cpu;
use crate::hypervisor;
use crate::vec_with_array_field;
use crate::vm::{self, InterruptSourceConfig, VmOps};
use crate::HypervisorType;
#[cfg(target_arch = "aarch64")]
use crate::{arm64_core_reg_id, offset_of};
use kvm_ioctls::{NoDatamatch, VcpuFd, VmFd};
use std::any::Any;
use std::collections::HashMap;
#[cfg(target_arch = "aarch64")]
use std::convert::TryInto;
#[cfg(target_arch = "x86_64")]
use std::fs::File;
#[cfg(target_arch = "x86_64")]
use std::os::unix::io::AsRawFd;
#[cfg(feature = "tdx")]
use std::os::unix::io::RawFd;
use std::result;
#[cfg(target_arch = "x86_64")]
use std::sync::atomic::{AtomicBool, Ordering};
#[cfg(target_arch = "aarch64")]
use std::sync::Mutex;
use std::sync::{Arc, RwLock};
use vmm_sys_util::eventfd::EventFd;
// x86_64 dependencies
#[cfg(target_arch = "x86_64")]
pub mod x86_64;
#[cfg(target_arch = "x86_64")]
use crate::arch::x86::{
    CpuIdEntry, FpuState, LapicState, MsrEntry, SpecialRegisters, StandardRegisters,
    NUM_IOAPIC_PINS,
};
#[cfg(target_arch = "x86_64")]
use crate::ClockData;
use crate::{
    CpuState, IoEventAddress, IrqRoutingEntry, MpState, UserMemoryRegion,
    USER_MEMORY_REGION_LOG_DIRTY, USER_MEMORY_REGION_READ, USER_MEMORY_REGION_WRITE,
};
#[cfg(target_arch = "aarch64")]
use aarch64::{RegList, Register, StandardRegisters};
#[cfg(target_arch = "x86_64")]
use kvm_bindings::{
    kvm_enable_cap, kvm_msr_entry, MsrList, KVM_CAP_HYPERV_SYNIC, KVM_CAP_SPLIT_IRQCHIP,
    KVM_GUESTDBG_USE_HW_BP,
};
#[cfg(target_arch = "x86_64")]
use x86_64::check_required_kvm_extensions;
#[cfg(target_arch = "x86_64")]
pub use x86_64::{CpuId, ExtendedControlRegisters, MsrEntries, VcpuKvmState, Xsave};
// aarch64 dependencies
#[cfg(target_arch = "aarch64")]
pub mod aarch64;
pub use kvm_bindings;
#[cfg(feature = "tdx")]
use kvm_bindings::KVMIO;
pub use kvm_bindings::{
    kvm_clock_data, kvm_create_device, kvm_device_type_KVM_DEV_TYPE_VFIO, kvm_guest_debug,
    kvm_irq_routing, kvm_irq_routing_entry, kvm_mp_state, kvm_userspace_memory_region,
    KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_IRQ_ROUTING_IRQCHIP, KVM_IRQ_ROUTING_MSI,
    KVM_MEM_LOG_DIRTY_PAGES, KVM_MEM_READONLY, KVM_MSI_VALID_DEVID,
};
#[cfg(target_arch = "aarch64")]
use kvm_bindings::{
    kvm_regs, user_fpsimd_state, user_pt_regs, KVM_GUESTDBG_USE_HW, KVM_NR_SPSR, KVM_REG_ARM64,
    KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, KVM_REG_ARM64_SYSREG_CRN_MASK,
    KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP1_MASK, KVM_REG_ARM64_SYSREG_OP2_MASK,
    KVM_REG_ARM_CORE, KVM_REG_SIZE_U128, KVM_REG_SIZE_U32, KVM_REG_SIZE_U64,
};
pub use kvm_ioctls;
pub use kvm_ioctls::{Cap, Kvm};
#[cfg(target_arch = "aarch64")]
use std::mem;
use thiserror::Error;
use vfio_ioctls::VfioDeviceFd;
#[cfg(feature = "tdx")]
use vmm_sys_util::{ioctl::ioctl_with_val, ioctl_ioc_nr, ioctl_iowr_nr};
///
/// Export generically-named wrappers of kvm-bindings for Unix-based platforms
///
pub use {
    kvm_bindings::kvm_create_device as CreateDevice, kvm_bindings::kvm_device_attr as DeviceAttr,
    kvm_bindings::kvm_run, kvm_bindings::kvm_vcpu_events as VcpuEvents, kvm_ioctls::VcpuExit,
};

#[cfg(target_arch = "x86_64")]
const KVM_CAP_SGX_ATTRIBUTE: u32 = 196;

#[cfg(feature = "tdx")]
const KVM_EXIT_TDX: u32 = 50;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_GET_QUOTE: u64 = 0x10002;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT: u64 = 0x10004;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_SUCCESS: u64 = 0;
#[cfg(feature = "tdx")]
const TDG_VP_VMCALL_INVALID_OPERAND: u64 = 0x8000000000000000;

#[cfg(feature = "tdx")]
ioctl_iowr_nr!(KVM_MEMORY_ENCRYPT_OP, KVMIO, 0xba, std::os::raw::c_ulong);

#[cfg(feature = "tdx")]
#[repr(u32)]
enum TdxCommand {
    Capabilities = 0,
    InitVm,
    InitVcpu,
    InitMemRegion,
    Finalize,
}

#[cfg(feature = "tdx")]
pub enum TdxExitDetails {
    GetQuote,
    SetupEventNotifyInterrupt,
}

#[cfg(feature = "tdx")]
pub enum TdxExitStatus {
    Success,
    InvalidOperand,
}

#[cfg(feature = "tdx")]
const TDX_MAX_NR_CPUID_CONFIGS: usize = 6;

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCpuidConfig {
    pub leaf: u32,
    pub sub_leaf: u32,
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

#[cfg(feature = "tdx")]
#[repr(C)]
#[derive(Debug, Default)]
pub struct TdxCapabilities {
    pub attrs_fixed0: u64,
    pub attrs_fixed1: u64,
    pub xfam_fixed0: u64,
    pub xfam_fixed1: u64,
    pub nr_cpuid_configs: u32,
    pub padding: u32,
    pub cpuid_configs: [TdxCpuidConfig; TDX_MAX_NR_CPUID_CONFIGS],
}

impl From<kvm_userspace_memory_region> for UserMemoryRegion {
    fn from(region: kvm_userspace_memory_region) -> Self {
        let mut flags = USER_MEMORY_REGION_READ;
        if region.flags & KVM_MEM_READONLY == 0 {
            flags |= USER_MEMORY_REGION_WRITE;
        }
        if region.flags & KVM_MEM_LOG_DIRTY_PAGES != 0 {
            flags |= USER_MEMORY_REGION_LOG_DIRTY;
        }

        UserMemoryRegion {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<UserMemoryRegion> for kvm_userspace_memory_region {
    fn from(region: UserMemoryRegion) -> Self {
        assert!(
            region.flags & USER_MEMORY_REGION_READ != 0,
            "KVM mapped memory is always readable"
        );

        let mut flags = 0;
        if region.flags & USER_MEMORY_REGION_WRITE == 0 {
            flags |= KVM_MEM_READONLY;
        }
        if region.flags & USER_MEMORY_REGION_LOG_DIRTY != 0 {
            flags |= KVM_MEM_LOG_DIRTY_PAGES;
        }

        kvm_userspace_memory_region {
            slot: region.slot,
            guest_phys_addr: region.guest_phys_addr,
            memory_size: region.memory_size,
            userspace_addr: region.userspace_addr,
            flags,
        }
    }
}

impl From<kvm_mp_state> for MpState {
    fn from(s: kvm_mp_state) -> Self {
        MpState::Kvm(s)
    }
}

impl From<MpState> for kvm_mp_state {
    fn from(ms: MpState) -> Self {
        match ms {
            MpState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("MpState is not valid"),
        }
    }
}

impl From<kvm_ioctls::IoEventAddress> for IoEventAddress {
    fn from(a: kvm_ioctls::IoEventAddress) -> Self {
        match a {
            kvm_ioctls::IoEventAddress::Pio(x) => Self::Pio(x),
            kvm_ioctls::IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<IoEventAddress> for kvm_ioctls::IoEventAddress {
    fn from(a: IoEventAddress) -> Self {
        match a {
            IoEventAddress::Pio(x) => Self::Pio(x),
            IoEventAddress::Mmio(x) => Self::Mmio(x),
        }
    }
}

impl From<VcpuKvmState> for CpuState {
    fn from(s: VcpuKvmState) -> Self {
        CpuState::Kvm(s)
    }
}

impl From<CpuState> for VcpuKvmState {
    fn from(s: CpuState) -> Self {
        match s {
            CpuState::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("CpuState is not valid"),
        }
    }
}

#[cfg(target_arch = "x86_64")]
impl From<kvm_clock_data> for ClockData {
    fn from(d: kvm_clock_data) -> Self {
        ClockData::Kvm(d)
    }
}

#[cfg(target_arch = "x86_64")]
impl From<ClockData> for kvm_clock_data {
    fn from(ms: ClockData) -> Self {
        match ms {
            ClockData::Kvm(s) => s,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("ClockData is not valid"),
        }
    }
}

impl From<kvm_irq_routing_entry> for IrqRoutingEntry {
    fn from(s: kvm_irq_routing_entry) -> Self {
        IrqRoutingEntry::Kvm(s)
    }
}

impl From<IrqRoutingEntry> for kvm_irq_routing_entry {
    fn from(e: IrqRoutingEntry) -> Self {
        match e {
            IrqRoutingEntry::Kvm(e) => e,
            /* Needed in case other hypervisors are enabled */
            #[allow(unreachable_patterns)]
            _ => panic!("IrqRoutingEntry is not valid"),
        }
    }
}

struct KvmDirtyLogSlot {
    slot: u32,
    guest_phys_addr: u64,
    memory_size: u64,
    userspace_addr: u64,
}

/// Wrapper over KVM VM ioctls.
pub struct KvmVm {
    fd: Arc<VmFd>,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    dirty_log_slots: Arc<RwLock<HashMap<u32, KvmDirtyLogSlot>>>,
}

impl KvmVm {
    ///
    /// Creates an emulated device in the kernel.
    ///
    /// See the documentation for `KVM_CREATE_DEVICE`.
    fn create_device(&self, device: &mut CreateDevice) -> vm::Result<vfio_ioctls::VfioDeviceFd> {
        let device_fd = self
            .fd
            .create_device(device)
            .map_err(|e| vm::HypervisorVmError::CreateDevice(e.into()))?;
        Ok(VfioDeviceFd::new_from_kvm(device_fd))
    }
    /// Checks if a particular `Cap` is available.
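    ///
    /// # Examples
    ///
    /// A minimal sketch; the `Irqfd` capability queried here is only
    /// illustrative, and the downcast assumes the VM was created by KVM:
    ///
    /// ```no_run
    /// # use hypervisor::kvm::{Cap, KvmHypervisor, KvmVm};
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// // Downcast from the `Vm` trait object to reach this KVM-only helper.
    /// let kvm_vm = vm.as_any().downcast_ref::<KvmVm>().unwrap();
    /// let has_irqfd = kvm_vm.check_extension(Cap::Irqfd);
    /// ```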
    pub fn check_extension(&self, c: Cap) -> bool {
        self.fd.check_extension(c)
    }
}

/// Implementation of Vm trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// ```
impl vm::Vm for KvmVm {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the one-page region in the VM's address space.
    ///
    fn set_identity_map_address(&self, address: u64) -> vm::Result<()> {
        self.fd
            .set_identity_map_address(address)
            .map_err(|e| vm::HypervisorVmError::SetIdentityMapAddress(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the address of the three-page region in the VM's address space.
    ///
    fn set_tss_address(&self, offset: usize) -> vm::Result<()> {
        self.fd
            .set_tss_address(offset)
            .map_err(|e| vm::HypervisorVmError::SetTssAddress(e.into()))
    }
    ///
    /// Creates an in-kernel interrupt controller.
    ///
    fn create_irq_chip(&self) -> vm::Result<()> {
        self.fd
            .create_irq_chip()
            .map_err(|e| vm::HypervisorVmError::CreateIrq(e.into()))
    }
    ///
    /// Registers an event that will, when signaled, trigger the `gsi` IRQ.
    ///
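    /// # Examples
    ///
    /// A minimal sketch, assuming the host supports an in-kernel irqchip
    /// (the GSI number is illustrative):
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use vmm_sys_util::eventfd::EventFd;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// // An interrupt controller must exist before wiring up irqfds.
    /// vm.create_irq_chip().unwrap();
    /// let evt = EventFd::new(libc::EFD_NONBLOCK).unwrap();
    /// vm.register_irqfd(&evt, 4).unwrap();
    /// ```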
    fn register_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .register_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::RegisterIrqFd(e.into()))
    }
    ///
    /// Unregisters an event that will, when signaled, trigger the `gsi` IRQ.
    ///
    fn unregister_irqfd(&self, fd: &EventFd, gsi: u32) -> vm::Result<()> {
        self.fd
            .unregister_irqfd(fd, gsi)
            .map_err(|e| vm::HypervisorVmError::UnregisterIrqFd(e.into()))
    }
    ///
    /// Creates a VcpuFd object from a vcpu RawFd.
    ///
    fn create_vcpu(
        &self,
        id: u8,
        vm_ops: Option<Arc<dyn VmOps>>,
    ) -> vm::Result<Arc<dyn cpu::Vcpu>> {
        let vc = self
            .fd
            .create_vcpu(id as u64)
            .map_err(|e| vm::HypervisorVmError::CreateVcpu(e.into()))?;
        let vcpu = KvmVcpu {
            fd: vc,
            #[cfg(target_arch = "x86_64")]
            msrs: self.msrs.clone(),
            vm_ops,
            #[cfg(target_arch = "x86_64")]
            hyperv_synic: AtomicBool::new(false),
        };
        Ok(Arc::new(vcpu))
    }
    #[cfg(target_arch = "aarch64")]
    ///
    /// Creates a virtual GIC device.
    ///
    fn create_vgic(&self, config: VgicConfig) -> vm::Result<Arc<Mutex<dyn Vgic>>> {
        let gic_device = KvmGicV3Its::new(self, config)
            .map_err(|e| vm::HypervisorVmError::CreateVgic(anyhow!("Vgic error {:?}", e)))?;
        Ok(Arc::new(Mutex::new(gic_device)))
    }
    ///
    /// Registers an event to be signaled whenever a certain address is written to.
    ///
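    /// # Examples
    ///
    /// A minimal sketch registering an MMIO ioeventfd without a datamatch
    /// (the guest address is illustrative):
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use hypervisor::IoEventAddress;
    /// # use vmm_sys_util::eventfd::EventFd;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// let evt = EventFd::new(libc::EFD_NONBLOCK).unwrap();
    /// vm.register_ioevent(&evt, &IoEventAddress::Mmio(0xd000_0000), None)
    ///     .unwrap();
    /// ```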
    fn register_ioevent(
        &self,
        fd: &EventFd,
        addr: &IoEventAddress,
        datamatch: Option<vm::DataMatch>,
    ) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        if let Some(dm) = datamatch {
            match dm {
                vm::DataMatch::DataMatch32(kvm_dm32) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm32)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
                vm::DataMatch::DataMatch64(kvm_dm64) => self
                    .fd
                    .register_ioevent(fd, addr, kvm_dm64)
                    .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into())),
            }
        } else {
            self.fd
                .register_ioevent(fd, addr, NoDatamatch)
                .map_err(|e| vm::HypervisorVmError::RegisterIoEvent(e.into()))
        }
    }
    ///
    /// Unregisters an event from a certain address it has been previously registered to.
    ///
    fn unregister_ioevent(&self, fd: &EventFd, addr: &IoEventAddress) -> vm::Result<()> {
        let addr = &kvm_ioctls::IoEventAddress::from(*addr);
        self.fd
            .unregister_ioevent(fd, addr, NoDatamatch)
            .map_err(|e| vm::HypervisorVmError::UnregisterIoEvent(e.into()))
    }

    ///
    /// Constructs a routing entry
    ///
    fn make_routing_entry(&self, gsi: u32, config: &InterruptSourceConfig) -> IrqRoutingEntry {
        match &config {
            InterruptSourceConfig::MsiIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_MSI,
                    ..Default::default()
                };

                kvm_route.u.msi.address_lo = cfg.low_addr;
                kvm_route.u.msi.address_hi = cfg.high_addr;
                kvm_route.u.msi.data = cfg.data;

                if self.check_extension(crate::kvm::Cap::MsiDevid) {
                    // On AArch64, the range of the 'devid' is limited: it must
                    // fit in 16 bits, i.e. it cannot exceed 65535 (the maximum
                    // of a u16).
                    //
                    // The BDF cannot be used directly, because the 'segment'
                    // sits in the high 16 bits. The layout of the u32 BDF is:
                    // |---- 16 bits ----|-- 8 bits --|-- 5 bits --|-- 3 bits --|
                    // |      segment    |     bus    |   device   |  function  |
                    //
                    // Since we support only one bus per segment, we can build a
                    // 'devid' by replacing the 'bus' bits with the low 8 bits
                    // of the 'segment' data. This resolves the range-checking
                    // problem while still giving every device a distinct
                    // 'devid'. The limitation is that at most 256 segments can
                    // be supported.
                    //
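                    // For example, devid 0x0002_0010 (segment 2, bus 0,
                    // device 2, function 0) becomes 0x0210.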
                    let modified_devid = (cfg.devid & 0x00ff_0000) >> 8 | cfg.devid & 0xff;

                    kvm_route.flags = KVM_MSI_VALID_DEVID;
                    kvm_route.u.msi.__bindgen_anon_1.devid = modified_devid;
                }
                kvm_route.into()
            }
            InterruptSourceConfig::LegacyIrq(cfg) => {
                let mut kvm_route = kvm_irq_routing_entry {
                    gsi,
                    type_: KVM_IRQ_ROUTING_IRQCHIP,
                    ..Default::default()
                };
                kvm_route.u.irqchip.irqchip = cfg.irqchip;
                kvm_route.u.irqchip.pin = cfg.pin;

                kvm_route.into()
            }
        }
    }

    ///
    /// Sets the GSI routing table entries, overwriting any previously set
    /// entries, as per the `KVM_SET_GSI_ROUTING` ioctl.
    ///
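    /// # Examples
    ///
    /// A minimal sketch; an empty table is passed here just to show the call
    /// shape, and real callers build entries with `make_routing_entry`:
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// vm.create_irq_chip().unwrap();
    /// vm.set_gsi_routing(&[]).unwrap();
    /// ```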
    fn set_gsi_routing(&self, entries: &[IrqRoutingEntry]) -> vm::Result<()> {
        let mut irq_routing =
            vec_with_array_field::<kvm_irq_routing, kvm_irq_routing_entry>(entries.len());
        irq_routing[0].nr = entries.len() as u32;
        irq_routing[0].flags = 0;
        let entries: Vec<kvm_irq_routing_entry> = entries
            .iter()
            .map(|entry| match entry {
                IrqRoutingEntry::Kvm(e) => *e,
                #[allow(unreachable_patterns)]
                _ => panic!("IrqRoutingEntry type is wrong"),
            })
            .collect();

        // SAFETY: irq_routing initialized with entries.len() and now it is being turned into
        // entries_slice with entries.len() again. It is guaranteed to be large enough to hold
        // everything from entries.
        unsafe {
            let entries_slice: &mut [kvm_irq_routing_entry] =
                irq_routing[0].entries.as_mut_slice(entries.len());
            entries_slice.copy_from_slice(&entries);
        }

        self.fd
            .set_gsi_routing(&irq_routing[0])
            .map_err(|e| vm::HypervisorVmError::SetGsiRouting(e.into()))
    }
    ///
    /// Creates a memory region structure that can be used with {create/remove}_user_memory_region
    ///
    fn make_user_memory_region(
        &self,
        slot: u32,
        guest_phys_addr: u64,
        memory_size: u64,
        userspace_addr: u64,
        readonly: bool,
        log_dirty_pages: bool,
    ) -> UserMemoryRegion {
        kvm_userspace_memory_region {
            slot,
            guest_phys_addr,
            memory_size,
            userspace_addr,
            flags: if readonly { KVM_MEM_READONLY } else { 0 }
                | if log_dirty_pages {
                    KVM_MEM_LOG_DIRTY_PAGES
                } else {
                    0
                },
        }
        .into()
    }
    ///
    /// Creates a guest physical memory region.
    ///
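    /// # Examples
    ///
    /// A minimal sketch backing one page of guest memory with an anonymous
    /// host mapping (the slot number and addresses are illustrative):
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// // SAFETY: anonymous, shared mapping with no backing file.
    /// let host_addr = unsafe {
    ///     libc::mmap(
    ///         std::ptr::null_mut(),
    ///         0x1000,
    ///         libc::PROT_READ | libc::PROT_WRITE,
    ///         libc::MAP_ANONYMOUS | libc::MAP_SHARED | libc::MAP_NORESERVE,
    ///         -1,
    ///         0,
    ///     )
    /// };
    /// let region = vm.make_user_memory_region(0, 0x10000, 0x1000, host_addr as u64, false, false);
    /// vm.create_user_memory_region(region).unwrap();
    /// ```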
    fn create_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        if (region.flags & KVM_MEM_LOG_DIRTY_PAGES) != 0 {
            if (region.flags & KVM_MEM_READONLY) != 0 {
                return Err(vm::HypervisorVmError::CreateUserMemory(anyhow!(
                    "Error creating regions with both 'dirty-pages-log' and 'read-only'."
                )));
            }

            // Keep track of the regions that need dirty pages log
            self.dirty_log_slots.write().unwrap().insert(
                region.slot,
                KvmDirtyLogSlot {
                    slot: region.slot,
                    guest_phys_addr: region.guest_phys_addr,
                    memory_size: region.memory_size,
                    userspace_addr: region.userspace_addr,
                },
            );

            // Always create guest physical memory region without `KVM_MEM_LOG_DIRTY_PAGES`.
            // For regions that need this flag, dirty pages log will be turned on in `start_dirty_log`.
            region.flags = 0;
        }

        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::CreateUserMemory(e.into()))
        }
    }
    ///
    /// Removes a guest physical memory region.
    ///
    fn remove_user_memory_region(&self, user_memory_region: UserMemoryRegion) -> vm::Result<()> {
        let mut region: kvm_userspace_memory_region = user_memory_region.into();

        // Remove the corresponding entry from "self.dirty_log_slots" if needed
        self.dirty_log_slots.write().unwrap().remove(&region.slot);

        // Setting the size to 0 means "remove"
        region.memory_size = 0;
        // SAFETY: Safe because guest regions are guaranteed not to overlap.
        unsafe {
            self.fd
                .set_user_memory_region(region)
                .map_err(|e| vm::HypervisorVmError::RemoveUserMemory(e.into()))
        }
    }
    ///
    /// Returns the preferred CPU target type which can be emulated by KVM on the underlying host.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_preferred_target(&self, kvi: &mut VcpuInit) -> vm::Result<()> {
        self.fd
            .get_preferred_target(kvi)
            .map_err(|e| vm::HypervisorVmError::GetPreferredTarget(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_split_irq(&self) -> vm::Result<()> {
        // Create a split irqchip: only the local APIC is emulated in-kernel;
        // the PICs and the IOAPIC are not.
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SPLIT_IRQCHIP,
            ..Default::default()
        };
        cap.args[0] = NUM_IOAPIC_PINS as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSplitIrq(e.into()))?;
        Ok(())
    }
    #[cfg(target_arch = "x86_64")]
    fn enable_sgx_attribute(&self, file: File) -> vm::Result<()> {
        let mut cap = kvm_enable_cap {
            cap: KVM_CAP_SGX_ATTRIBUTE,
            ..Default::default()
        };
        cap.args[0] = file.as_raw_fd() as u64;
        self.fd
            .enable_cap(&cap)
            .map_err(|e| vm::HypervisorVmError::EnableSgxAttribute(e.into()))?;
        Ok(())
    }
    /// Retrieve guest clock.
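    ///
    /// # Examples
    ///
    /// A minimal sketch that reads the KVM clock back and restores it, as a
    /// pause/resume path might:
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// let clock = vm.get_clock().unwrap();
    /// vm.set_clock(&clock).unwrap();
    /// ```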
    #[cfg(target_arch = "x86_64")]
    fn get_clock(&self) -> vm::Result<ClockData> {
        Ok(self
            .fd
            .get_clock()
            .map_err(|e| vm::HypervisorVmError::GetClock(e.into()))?
            .into())
    }
    /// Set guest clock.
    #[cfg(target_arch = "x86_64")]
    fn set_clock(&self, data: &ClockData) -> vm::Result<()> {
        let data = (*data).into();
        self.fd
            .set_clock(&data)
            .map_err(|e| vm::HypervisorVmError::SetClock(e.into()))
    }
    /// Create a device that is used for passthrough
    fn create_passthrough_device(&self) -> vm::Result<VfioDeviceFd> {
        let mut vfio_dev = kvm_create_device {
            type_: kvm_device_type_KVM_DEV_TYPE_VFIO,
            fd: 0,
            flags: 0,
        };

        self.create_device(&mut vfio_dev)
            .map_err(|e| vm::HypervisorVmError::CreatePassthroughDevice(e.into()))
    }
    ///
    /// Start logging dirty pages
    ///
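    /// # Examples
    ///
    /// A minimal sketch of one dirty-page tracking cycle; it assumes slot 0
    /// was created with `log_dirty_pages` enabled:
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// // ... create_user_memory_region(...) with log_dirty_pages = true ...
    /// vm.start_dirty_log().unwrap();
    /// let _bitmap = vm.get_dirty_log(0, 0x10000, 0x1000).unwrap();
    /// vm.stop_dirty_log().unwrap();
    /// ```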
    fn start_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: KVM_MEM_LOG_DIRTY_PAGES,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StartDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Stop logging dirty pages
    ///
    fn stop_dirty_log(&self) -> vm::Result<()> {
        let dirty_log_slots = self.dirty_log_slots.read().unwrap();
        for (_, s) in dirty_log_slots.iter() {
            let region = kvm_userspace_memory_region {
                slot: s.slot,
                guest_phys_addr: s.guest_phys_addr,
                memory_size: s.memory_size,
                userspace_addr: s.userspace_addr,
                flags: 0,
            };
            // SAFETY: Safe because guest regions are guaranteed not to overlap.
            unsafe {
                self.fd
                    .set_user_memory_region(region)
                    .map_err(|e| vm::HypervisorVmError::StopDirtyLog(e.into()))?;
            }
        }

        Ok(())
    }

    ///
    /// Get dirty pages bitmap (one bit per page)
    ///
    fn get_dirty_log(&self, slot: u32, _base_gpa: u64, memory_size: u64) -> vm::Result<Vec<u64>> {
        self.fd
            .get_dirty_log(slot, memory_size as usize)
            .map_err(|e| vm::HypervisorVmError::GetDirtyLog(e.into()))
    }

    ///
    /// Initialize TDX for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init(&self, cpuid: &[CpuIdEntry], max_vcpus: u32) -> vm::Result<()> {
        const TDX_ATTR_SEPT_VE_DISABLE: usize = 28;

        let mut cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        cpuid.resize(256, kvm_bindings::kvm_cpuid_entry2::default());

        #[repr(C)]
        struct TdxInitVm {
            attributes: u64,
            max_vcpus: u32,
            padding: u32,
            mrconfigid: [u64; 6],
            mrowner: [u64; 6],
            mrownerconfig: [u64; 6],
            cpuid_nent: u32,
            cpuid_padding: u32,
            cpuid_entries: [kvm_bindings::kvm_cpuid_entry2; 256],
        }
        let data = TdxInitVm {
            attributes: 1 << TDX_ATTR_SEPT_VE_DISABLE,
            max_vcpus,
            padding: 0,
            mrconfigid: [0; 6],
            mrowner: [0; 6],
            mrownerconfig: [0; 6],
            cpuid_nent: cpuid.len() as u32,
            cpuid_padding: 0,
            cpuid_entries: cpuid.as_slice().try_into().unwrap(),
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitVm,
            0,
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitializeTdx)
    }

    ///
    /// Finalize the TDX setup for this VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_finalize(&self) -> vm::Result<()> {
        tdx_command(&self.fd.as_raw_fd(), TdxCommand::Finalize, 0, 0)
            .map_err(vm::HypervisorVmError::FinalizeTdx)
    }

    ///
    /// Initialize memory regions for the TDX VM
    ///
    #[cfg(feature = "tdx")]
    fn tdx_init_memory_region(
        &self,
        host_address: u64,
        guest_address: u64,
        size: u64,
        measure: bool,
    ) -> vm::Result<()> {
        #[repr(C)]
        struct TdxInitMemRegion {
            host_address: u64,
            guest_address: u64,
            pages: u64,
        }
        let data = TdxInitMemRegion {
            host_address,
            guest_address,
            pages: size / 4096,
        };

        tdx_command(
            &self.fd.as_raw_fd(),
            TdxCommand::InitMemRegion,
            u32::from(measure),
            &data as *const _ as u64,
        )
        .map_err(vm::HypervisorVmError::InitMemRegionTdx)
    }
    /// Downcast to the underlying KvmVm type
    fn as_any(&self) -> &dyn Any {
        self
    }
}

#[cfg(feature = "tdx")]
fn tdx_command(
    fd: &RawFd,
    command: TdxCommand,
    flags: u32,
    data: u64,
) -> std::result::Result<(), std::io::Error> {
    #[repr(C)]
    struct TdxIoctlCmd {
        command: TdxCommand,
        flags: u32,
        data: u64,
        error: u64,
        unused: u64,
    }
    let cmd = TdxIoctlCmd {
        command,
        flags,
        data,
        error: 0,
        unused: 0,
    };
    // SAFETY: FFI call. All input parameters are valid.
    let ret = unsafe {
        ioctl_with_val(
            fd,
            KVM_MEMORY_ENCRYPT_OP(),
            &cmd as *const TdxIoctlCmd as std::os::raw::c_ulong,
        )
    };

    if ret < 0 {
        return Err(std::io::Error::last_os_error());
    }
    Ok(())
}

/// Wrapper over KVM system ioctls.
pub struct KvmHypervisor {
    kvm: Kvm,
}

impl KvmHypervisor {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Retrieve the list of MSRs supported by the hypervisor.
    ///
    fn get_msr_list(&self) -> hypervisor::Result<MsrList> {
        self.kvm
            .get_msr_index_list()
            .map_err(|e| hypervisor::HypervisorError::GetMsrList(e.into()))
    }
}

/// Enum for KVM related error
#[derive(Debug, Error)]
pub enum KvmError {
    #[error("Capability missing: {0:?}")]
    CapabilityMissing(Cap),
}
pub type KvmResult<T> = result::Result<T, KvmError>;
impl KvmHypervisor {
    /// Create a hypervisor based on Kvm
    #[allow(clippy::new_ret_no_self)]
    pub fn new() -> hypervisor::Result<Arc<dyn hypervisor::Hypervisor>> {
        let kvm_obj = Kvm::new().map_err(|e| hypervisor::HypervisorError::VmCreate(e.into()))?;
        let api_version = kvm_obj.get_api_version();

        if api_version != kvm_bindings::KVM_API_VERSION as i32 {
            return Err(hypervisor::HypervisorError::IncompatibleApiVersion);
        }

        Ok(Arc::new(KvmHypervisor { kvm: kvm_obj }))
    }
    /// Check if the hypervisor is available
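    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// // This only probes for /dev/kvm; it never creates a VM.
    /// let available = KvmHypervisor::is_available().unwrap();
    /// ```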
    pub fn is_available() -> hypervisor::Result<bool> {
        match std::fs::metadata("/dev/kvm") {
            Ok(_) => Ok(true),
            Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
            Err(err) => Err(hypervisor::HypervisorError::HypervisorAvailableCheck(
                err.into(),
            )),
        }
    }
}
/// Implementation of Hypervisor trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// ```
impl hypervisor::Hypervisor for KvmHypervisor {
    ///
    /// Returns the type of the hypervisor
    ///
    fn hypervisor_type(&self) -> HypervisorType {
        HypervisorType::Kvm
    }
    /// Create a KVM vm object of a specific VM type and return the object as Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm_with_type(0).unwrap();
    /// ```
    fn create_vm_with_type(&self, vm_type: u64) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        let fd: VmFd;
        loop {
            match self.kvm.create_vm_with_type(vm_type) {
                Ok(res) => fd = res,
                Err(e) => {
                    if e.errno() == libc::EINTR {
                        // The ioctl was interrupted (EINTR), so retry;
                        // this cannot be treated as a regular error.
                        continue;
                    } else {
                        return Err(hypervisor::HypervisorError::VmCreate(e.into()));
                    }
                }
            }
            break;
        }

        let vm_fd = Arc::new(fd);

        #[cfg(target_arch = "x86_64")]
        {
            let msr_list = self.get_msr_list()?;
            let num_msrs = msr_list.as_fam_struct_ref().nmsrs as usize;
            let mut msrs: Vec<MsrEntry> = vec![
                MsrEntry {
                    ..Default::default()
                };
                num_msrs
            ];
            let indices = msr_list.as_slice();
            for (pos, index) in indices.iter().enumerate() {
                msrs[pos].index = *index;
            }

            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                msrs,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }

        #[cfg(target_arch = "aarch64")]
        {
            Ok(Arc::new(KvmVm {
                fd: vm_fd,
                dirty_log_slots: Arc::new(RwLock::new(HashMap::new())),
            }))
        }
    }

    /// Create a KVM vm object and return the object as Vm trait object
    ///
    /// # Examples
    ///
    /// ```
    /// # use hypervisor::kvm::KvmHypervisor;
    /// use hypervisor::kvm::KvmVm;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let vm = hypervisor.create_vm().unwrap();
    /// ```
    fn create_vm(&self) -> hypervisor::Result<Arc<dyn vm::Vm>> {
        #[allow(unused_mut)]
        let mut vm_type: u64 = 0; // Create with default platform type

        // When KVM supports Cap::ArmVmIPASize, it is better to get the IPA
        // size from the host and use that when creating the VM, which may
        // avoid unnecessary VM creation failures.
        #[cfg(target_arch = "aarch64")]
        if self.kvm.check_extension(Cap::ArmVmIPASize) {
            vm_type = self.kvm.get_host_ipa_limit().try_into().unwrap();
        }

        self.create_vm_with_type(vm_type)
    }

    fn check_required_extensions(&self) -> hypervisor::Result<()> {
        check_required_kvm_extensions(&self.kvm)
            .map_err(|e| hypervisor::HypervisorError::CheckExtensions(e.into()))
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to get the system supported CPUID values.
    ///
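    /// # Examples
    ///
    /// A minimal sketch listing how many CPUID leaves the host supports:
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// let hypervisor = KvmHypervisor::new().unwrap();
    /// let cpuid = hypervisor.get_supported_cpuid().unwrap();
    /// println!("{} CPUID entries supported", cpuid.len());
    /// ```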
    fn get_supported_cpuid(&self) -> hypervisor::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .kvm
            .get_supported_cpuid(kvm_bindings::KVM_MAX_CPUID_ENTRIES)
            .map_err(|e| hypervisor::HypervisorError::GetCpuId(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }

    #[cfg(target_arch = "aarch64")]
    ///
    /// Retrieve AArch64 host maximum IPA size supported by KVM.
    ///
    fn get_host_ipa_limit(&self) -> i32 {
        self.kvm.get_host_ipa_limit()
    }

    ///
    /// Retrieve TDX capabilities
    ///
    #[cfg(feature = "tdx")]
    fn tdx_capabilities(&self) -> hypervisor::Result<TdxCapabilities> {
        let data = TdxCapabilities {
            nr_cpuid_configs: TDX_MAX_NR_CPUID_CONFIGS as u32,
            ..Default::default()
        };

        tdx_command(
            &self.kvm.as_raw_fd(),
            TdxCommand::Capabilities,
            0,
            &data as *const _ as u64,
        )
        .map_err(|e| hypervisor::HypervisorError::TdxCapabilities(e.into()))?;

        Ok(data)
    }

    ///
    /// Get the number of supported hardware breakpoints
    ///
    fn get_guest_debug_hw_bps(&self) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            4
        }
        #[cfg(target_arch = "aarch64")]
        {
            self.kvm.get_guest_debug_hw_bps() as usize
        }
    }

    /// Get maximum number of vCPUs
    fn get_max_vcpus(&self) -> u32 {
        self.kvm.get_max_vcpus().min(u32::MAX as usize) as u32
    }
}
/// Vcpu struct for KVM
pub struct KvmVcpu {
    fd: VcpuFd,
    #[cfg(target_arch = "x86_64")]
    msrs: Vec<MsrEntry>,
    vm_ops: Option<Arc<dyn vm::VmOps>>,
    #[cfg(target_arch = "x86_64")]
    hyperv_synic: AtomicBool,
}
/// Implementation of Vcpu trait for KVM
///
/// # Examples
///
/// ```
/// # use hypervisor::kvm::KvmHypervisor;
/// # use std::sync::Arc;
/// let kvm = KvmHypervisor::new().unwrap();
/// let hypervisor = Arc::new(kvm);
/// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
/// let vcpu = vm.create_vcpu(0, None).unwrap();
/// ```
impl cpu::Vcpu for KvmVcpu {
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU general purpose registers.
    ///
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        Ok(self
            .fd
            .get_regs()
            .map_err(|e| cpu::HypervisorCpuError::GetStandardRegs(e.into()))?
            .into())
    }
    ///
    /// Returns the vCPU general purpose registers.
    /// The `KVM_GET_REGS` ioctl is not available on AArch64, `KVM_GET_ONE_REG`
    /// is used to get registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn get_regs(&self) -> cpu::Result<StandardRegisters> {
        let mut state: StandardRegisters = kvm_regs::default();
        let mut off = offset_of!(user_pt_regs, regs);
        // There are 31 user_pt_regs:
        // https://elixir.free-electrons.com/linux/v4.14.174/source/arch/arm64/include/uapi/asm/ptrace.h#L72
        // These are the general-purpose registers of the Armv8-A architecture
        // (i.e. x0-x30 when used as 64-bit registers, or w0-w30 when used as 32-bit registers).
        for i in 0..31 {
            state.regs.regs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .try_into()
                .unwrap();
            off += std::mem::size_of::<u64>();
        }

        // We are now entering the "Other register" section of the ARMv8-a architecture.
        // First one, stack pointer.
        let off = offset_of!(user_pt_regs, sp);
        state.regs.sp = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Second one, the program counter.
        let off = offset_of!(user_pt_regs, pc);
        state.regs.pc = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Next is the processor state.
        let off = offset_of!(user_pt_regs, pstate);
        state.regs.pstate = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // The stack pointer associated with EL1
        let off = offset_of!(kvm_regs, sp_el1);
        state.sp_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Exception Link Register for EL1, when taking an exception to EL1, this register
        // holds the address to which to return afterwards.
        let off = offset_of!(kvm_regs, elr_el1);
        state.elr_el1 = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Saved Program Status Registers, there are 5 of them used in the kernel.
        let mut off = offset_of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            state.spsr[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
                .try_into()
                .unwrap();
            off += std::mem::size_of::<u64>();
        }

        // Now moving on to floating point registers which are stored in the user_fpsimd_state in the kernel:
        // https://elixir.free-electrons.com/linux/v4.9.62/source/arch/arm64/include/uapi/asm/kvm.h#L53
        let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            state.fp_regs.vregs[i] = self
                .fd
                .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U128, off))
                .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        // Floating-point Status Register
        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
        state.fp_regs.fpsr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();

        // Floating-point Control Register
        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
        state.fp_regs.fpcr = self
            .fd
            .get_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U32, off))
            .map_err(|e| cpu::HypervisorCpuError::GetCoreRegister(e.into()))?
            .try_into()
            .unwrap();
        Ok(state)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU general purpose registers using the `KVM_SET_REGS` ioctl.
    ///
    fn set_regs(&self, regs: &StandardRegisters) -> cpu::Result<()> {
        let regs = (*regs).into();
        self.fd
            .set_regs(&regs)
            .map_err(|e| cpu::HypervisorCpuError::SetStandardRegs(e.into()))
    }

    ///
    /// Sets the vCPU general purpose registers.
    /// The `KVM_SET_REGS` ioctl is not available on AArch64, `KVM_SET_ONE_REG`
    /// is used to set registers one by one.
    ///
    #[cfg(target_arch = "aarch64")]
    fn set_regs(&self, state: &StandardRegisters) -> cpu::Result<()> {
        // The function follows the exact identical order from `state`. Look there
        // for some additional info on registers.
        let mut off = offset_of!(user_pt_regs, regs);
        for i in 0..31 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.regs.regs[i].into(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let off = offset_of!(user_pt_regs, sp);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.regs.sp.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset_of!(user_pt_regs, pc);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.regs.pc.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset_of!(user_pt_regs, pstate);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.regs.pstate.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset_of!(kvm_regs, sp_el1);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.sp_el1.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset_of!(kvm_regs, elr_el1);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                state.elr_el1.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let mut off = offset_of!(kvm_regs, spsr);
        for i in 0..KVM_NR_SPSR as usize {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U64, off),
                    state.spsr[i].into(),
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += std::mem::size_of::<u64>();
        }

        let mut off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, vregs);
        for i in 0..32 {
            self.fd
                .set_one_reg(
                    arm64_core_reg_id!(KVM_REG_SIZE_U128, off),
                    state.fp_regs.vregs[i],
                )
                .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
            off += mem::size_of::<u128>();
        }

        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpsr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpsr.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;

        let off = offset_of!(kvm_regs, fp_regs) + offset_of!(user_fpsimd_state, fpcr);
        self.fd
            .set_one_reg(
                arm64_core_reg_id!(KVM_REG_SIZE_U32, off),
                state.fp_regs.fpcr.into(),
            )
            .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
        Ok(())
    }

    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the vCPU special registers.
    ///
    fn get_sregs(&self) -> cpu::Result<SpecialRegisters> {
        Ok(self
            .fd
            .get_sregs()
            .map_err(|e| cpu::HypervisorCpuError::GetSpecialRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the vCPU special registers using the `KVM_SET_SREGS` ioctl.
    ///
    fn set_sregs(&self, sregs: &SpecialRegisters) -> cpu::Result<()> {
        let sregs = (*sregs).into();
        self.fd
            .set_sregs(&sregs)
            .map_err(|e| cpu::HypervisorCpuError::SetSpecialRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the floating point state (FPU) from the vCPU.
    ///
    fn get_fpu(&self) -> cpu::Result<FpuState> {
        Ok(self
            .fd
            .get_fpu()
            .map_err(|e| cpu::HypervisorCpuError::GetFloatingPointRegs(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Set the floating point state (FPU) of a vCPU using the `KVM_SET_FPU` ioctl.
    ///
    fn set_fpu(&self, fpu: &FpuState) -> cpu::Result<()> {
        let fpu: kvm_bindings::kvm_fpu = (*fpu).clone().into();
        self.fd
            .set_fpu(&fpu)
            .map_err(|e| cpu::HypervisorCpuError::SetFloatingPointRegs(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to setup the CPUID registers.
    ///
    fn set_cpuid2(&self, cpuid: &[CpuIdEntry]) -> cpu::Result<()> {
        let cpuid: Vec<kvm_bindings::kvm_cpuid_entry2> =
            cpuid.iter().map(|e| (*e).into()).collect();
        let kvm_cpuid = <CpuId>::from_entries(&cpuid)
            .map_err(|_| cpu::HypervisorCpuError::SetCpuid(anyhow!("failed to create CpuId")))?;

        self.fd
            .set_cpuid2(&kvm_cpuid)
            .map_err(|e| cpu::HypervisorCpuError::SetCpuid(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// X86 specific call to enable HyperV SynIC
    ///
    fn enable_hyperv_synic(&self) -> cpu::Result<()> {
        // Update the information about Hyper-V SynIC being enabled and
        // emulated as it will influence later which MSRs should be saved.
        self.hyperv_synic.store(true, Ordering::Release);

        let cap = kvm_enable_cap {
            cap: KVM_CAP_HYPERV_SYNIC,
            ..Default::default()
        };
        self.fd
            .enable_cap(&cap)
            .map_err(|e| cpu::HypervisorCpuError::EnableHyperVSyncIc(e.into()))
    }
    ///
    /// X86 specific call to retrieve the CPUID registers.
    ///
    #[cfg(target_arch = "x86_64")]
    fn get_cpuid2(&self, num_entries: usize) -> cpu::Result<Vec<CpuIdEntry>> {
        let kvm_cpuid = self
            .fd
            .get_cpuid2(num_entries)
            .map_err(|e| cpu::HypervisorCpuError::GetCpuid(e.into()))?;

        let v = kvm_cpuid.as_slice().iter().map(|e| (*e).into()).collect();

        Ok(v)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn get_lapic(&self) -> cpu::Result<LapicState> {
        Ok(self
            .fd
            .get_lapic()
            .map_err(|e| cpu::HypervisorCpuError::GetlapicState(e.into()))?
            .into())
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Sets the state of the LAPIC (Local Advanced Programmable Interrupt Controller).
    ///
    fn set_lapic(&self, klapic: &LapicState) -> cpu::Result<()> {
        let klapic: kvm_bindings::kvm_lapic_state = (*klapic).clone().into();
        self.fd
            .set_lapic(&klapic)
            .map_err(|e| cpu::HypervisorCpuError::SetLapicState(e.into()))
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Returns the model-specific registers (MSR) for this vCPU.
    ///
    fn get_msrs(&self, msrs: &mut Vec<MsrEntry>) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let mut kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        let succ = self
            .fd
            .get_msrs(&mut kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::GetMsrEntries(e.into()))?;

        msrs[..succ].copy_from_slice(
            &kvm_msrs.as_slice()[..succ]
                .iter()
                .map(|e| (*e).into())
                .collect::<Vec<MsrEntry>>(),
        );

        Ok(succ)
    }
    #[cfg(target_arch = "x86_64")]
    ///
    /// Setup the model-specific registers (MSR) for this vCPU.
    /// Returns the number of MSR entries actually written.
    ///
    fn set_msrs(&self, msrs: &[MsrEntry]) -> cpu::Result<usize> {
        let kvm_msrs: Vec<kvm_msr_entry> = msrs.iter().map(|e| (*e).into()).collect();
        let kvm_msrs = MsrEntries::from_entries(&kvm_msrs).unwrap();
        self.fd
            .set_msrs(&kvm_msrs)
            .map_err(|e| cpu::HypervisorCpuError::SetMsrEntries(e.into()))
    }
    ///
    /// Returns the vcpu's current "multiprocessing state".
    ///
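    /// # Examples
    ///
    /// A minimal sketch reading the state of a freshly created vCPU:
    ///
    /// ```no_run
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hypervisor = Arc::new(kvm);
    /// let vm = hypervisor.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let mp_state = vcpu.get_mp_state().unwrap();
    /// ```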
1493     fn get_mp_state(&self) -> cpu::Result<MpState> {
1494         Ok(self
1495             .fd
1496             .get_mp_state()
1497             .map_err(|e| cpu::HypervisorCpuError::GetMpState(e.into()))?
1498             .into())
1499     }
1500     ///
1501     /// Sets the vcpu's current "multiprocessing state".
1502     ///
1503     fn set_mp_state(&self, mp_state: MpState) -> cpu::Result<()> {
1504         self.fd
1505             .set_mp_state(mp_state.into())
1506             .map_err(|e| cpu::HypervisorCpuError::SetMpState(e.into()))
1507     }
1508     #[cfg(target_arch = "x86_64")]
1509     ///
1510     /// Translates guest virtual address to guest physical address using the `KVM_TRANSLATE` ioctl.
1511     ///
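    /// A usage sketch (the address is illustrative; on a freshly created vCPU
    /// with paging disabled, KVM reports an identity mapping):
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// vm.enable_split_irq().unwrap();
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// let (gpa, _) = vcpu.translate_gva(0x1000, 0).unwrap();
    /// ```
    ///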
1512     fn translate_gva(&self, gva: u64, _flags: u64) -> cpu::Result<(u64, u32)> {
1513         let tr = self
1514             .fd
1515             .translate_gva(gva)
1516             .map_err(|e| cpu::HypervisorCpuError::TranslateVirtualAddress(e.into()))?;
1517         // tr.valid is set if the GVA is mapped to a valid GPA.
1518         match tr.valid {
1519             0 => Err(cpu::HypervisorCpuError::TranslateVirtualAddress(anyhow!(
1520                 "Invalid GVA: {:#x}",
1521                 gva
1522             ))),
1523             _ => Ok((tr.physical_address, 0)),
1524         }
1525     }
1526     ///
1527     /// Runs the current virtual CPU and returns an exit reason.
1528     ///
1529     fn run(&self) -> std::result::Result<cpu::VmExit, cpu::HypervisorCpuError> {
1530         match self.fd.run() {
1531             Ok(run) => match run {
1532                 #[cfg(target_arch = "x86_64")]
1533                 VcpuExit::IoIn(addr, data) => {
1534                     if let Some(vm_ops) = &self.vm_ops {
1535                         return vm_ops
1536                             .pio_read(addr.into(), data)
1537                             .map(|_| cpu::VmExit::Ignore)
1538                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1539                     }
1540 
1541                     Ok(cpu::VmExit::IoIn(addr, data))
1542                 }
1543                 #[cfg(target_arch = "x86_64")]
1544                 VcpuExit::IoOut(addr, data) => {
1545                     if let Some(vm_ops) = &self.vm_ops {
1546                         return vm_ops
1547                             .pio_write(addr.into(), data)
1548                             .map(|_| cpu::VmExit::Ignore)
1549                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1550                     }
1551 
1552                     Ok(cpu::VmExit::IoOut(addr, data))
1553                 }
1554                 #[cfg(target_arch = "x86_64")]
1555                 VcpuExit::IoapicEoi(vector) => Ok(cpu::VmExit::IoapicEoi(vector)),
1556                 #[cfg(target_arch = "x86_64")]
1557                 VcpuExit::Shutdown | VcpuExit::Hlt => Ok(cpu::VmExit::Reset),
1558 
1559                 #[cfg(target_arch = "aarch64")]
1560                 VcpuExit::SystemEvent(event_type, flags) => {
1561                     use kvm_bindings::{KVM_SYSTEM_EVENT_RESET, KVM_SYSTEM_EVENT_SHUTDOWN};
1562                     // On AArch64, when the VM is shut down, run() returns
1563                     // VcpuExit::SystemEvent with reason KVM_SYSTEM_EVENT_SHUTDOWN.
1564                     if event_type == KVM_SYSTEM_EVENT_RESET {
1565                         Ok(cpu::VmExit::Reset)
1566                     } else if event_type == KVM_SYSTEM_EVENT_SHUTDOWN {
1567                         Ok(cpu::VmExit::Shutdown)
1568                     } else {
1569                         Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1570                             "Unexpected system event with type 0x{:x}, flags 0x{:x}",
1571                             event_type,
1572                             flags
1573                         )))
1574                     }
1575                 }
1576 
1577                 VcpuExit::MmioRead(addr, data) => {
1578                     if let Some(vm_ops) = &self.vm_ops {
1579                         return vm_ops
1580                             .mmio_read(addr, data)
1581                             .map(|_| cpu::VmExit::Ignore)
1582                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1583                     }
1584 
1585                     Ok(cpu::VmExit::MmioRead(addr, data))
1586                 }
1587                 VcpuExit::MmioWrite(addr, data) => {
1588                     if let Some(vm_ops) = &self.vm_ops {
1589                         return vm_ops
1590                             .mmio_write(addr, data)
1591                             .map(|_| cpu::VmExit::Ignore)
1592                             .map_err(|e| cpu::HypervisorCpuError::RunVcpu(e.into()));
1593                     }
1594 
1595                     Ok(cpu::VmExit::MmioWrite(addr, data))
1596                 }
1597                 VcpuExit::Hyperv => Ok(cpu::VmExit::Hyperv),
1598                 #[cfg(feature = "tdx")]
1599                 VcpuExit::Unsupported(KVM_EXIT_TDX) => Ok(cpu::VmExit::Tdx),
1600                 VcpuExit::Debug(_) => Ok(cpu::VmExit::Debug),
1601 
1602                 r => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1603                     "Unexpected exit reason on vcpu run: {:?}",
1604                     r
1605                 ))),
1606             },
1607 
1608             Err(ref e) => match e.errno() {
1609                 libc::EAGAIN | libc::EINTR => Ok(cpu::VmExit::Ignore),
1610                 _ => Err(cpu::HypervisorCpuError::RunVcpu(anyhow!(
1611                     "VCPU error {:?}",
1612                     e
1613                 ))),
1614             },
1615         }
1616     }
1617     #[cfg(target_arch = "x86_64")]
1618     ///
1619     /// Let the guest know that it has been paused, which prevents potential
1620     /// soft lockups when it is resumed.
1621     ///
1622     fn notify_guest_clock_paused(&self) -> cpu::Result<()> {
1623         if let Err(e) = self.fd.kvmclock_ctrl() {
1624             // Linux kernel returns -EINVAL if the PV clock isn't yet initialised
1625             // which could be because we're still in firmware or the guest doesn't
1626             // use KVM clock.
1627             if e.errno() != libc::EINVAL {
1628                 return Err(cpu::HypervisorCpuError::NotifyGuestClockPaused(e.into()));
1629             }
1630         }
1631 
1632         Ok(())
1633     }
1634     ///
1635     /// Sets the debug registers to install hardware breakpoints and/or enable single-stepping.
1636     ///
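    /// A usage sketch (the breakpoint address is arbitrary; the setup mirrors
    /// the `state()` doctest below, and the result is ignored here since a
    /// bare vCPU may not accept debug ioctls on every architecture):
    ///
    /// ```rust
    /// # use hypervisor::kvm::KvmHypervisor;
    /// # use std::sync::Arc;
    /// # use vm_memory::GuestAddress;
    /// let kvm = KvmHypervisor::new().unwrap();
    /// let hv = Arc::new(kvm);
    /// let vm = hv.create_vm().expect("new VM fd creation failed");
    /// let vcpu = vm.create_vcpu(0, None).unwrap();
    /// // One hardware breakpoint, single-stepping disabled.
    /// let _ = vcpu.set_guest_debug(&[GuestAddress(0x1000)], false);
    /// ```
    ///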
1637     fn set_guest_debug(
1638         &self,
1639         addrs: &[vm_memory::GuestAddress],
1640         singlestep: bool,
1641     ) -> cpu::Result<()> {
1642         let mut dbg = kvm_guest_debug {
1643             #[cfg(target_arch = "x86_64")]
1644             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP,
1645             #[cfg(target_arch = "aarch64")]
1646             control: KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW,
1647             ..Default::default()
1648         };
1649         if singlestep {
1650             dbg.control |= KVM_GUESTDBG_SINGLESTEP;
1651         }
1652 
1653         // Set the debug registers.
1654         // Here we assume that the number of addresses does not exceed what
1655         // `Hypervisor::get_guest_debug_hw_bps()` specifies.
1656         #[cfg(target_arch = "x86_64")]
1657         {
1658             // Set bits 9 and 10.
1659             // bit 9: GE (global exact breakpoint enable) flag.
1660             // bit 10: always 1.
1661             dbg.arch.debugreg[7] = 0x0600;
1662 
1663             for (i, addr) in addrs.iter().enumerate() {
1664                 dbg.arch.debugreg[i] = addr.0;
1665                 // Set global breakpoint enable flag
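                // (Gn for breakpoint n is bit 2*n+1 of DR7, i.e. DR0 -> bit 1,
                // DR1 -> bit 3, DR2 -> bit 5, DR3 -> bit 7.)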
1666                 dbg.arch.debugreg[7] |= 2 << (i * 2);
1667             }
1668         }
1669         #[cfg(target_arch = "aarch64")]
1670         {
1671             for (i, addr) in addrs.iter().enumerate() {
1672                 // DBGBCR_EL1 (Debug Breakpoint Control Registers, D13.3.2):
1673                 // bit 0: 1 (Enabled)
1674                 // bit 1~2: 0b11 (PMC = EL1/EL0)
1675                 // bit 5~8: 0b1111 (BAS = AArch64)
1676                 // others: 0
1677                 dbg.arch.dbg_bcr[i] = 0b1u64 | 0b110u64 | 0b1_1110_0000u64;
1678                 // DBGBVR_EL1 (Debug Breakpoint Value Registers, D13.3.3):
1679                 // bit 2~52: VA[2:52]
1680                 dbg.arch.dbg_bvr[i] = (!0u64 >> 11) & addr.0;
1681             }
1682         }
1683         self.fd
1684             .set_guest_debug(&dbg)
1685             .map_err(|e| cpu::HypervisorCpuError::SetDebugRegs(e.into()))
1686     }
1687     #[cfg(target_arch = "aarch64")]
1688     fn vcpu_init(&self, kvi: &VcpuInit) -> cpu::Result<()> {
1689         self.fd
1690             .vcpu_init(kvi)
1691             .map_err(|e| cpu::HypervisorCpuError::VcpuInit(e.into()))
1692     }
1693     ///
1694     /// Gets a list of the guest registers that are supported for the
1695     /// KVM_GET_ONE_REG/KVM_SET_ONE_REG calls.
1696     ///
1697     #[cfg(target_arch = "aarch64")]
1698     fn get_reg_list(&self, reg_list: &mut RegList) -> cpu::Result<()> {
1699         self.fd
1700             .get_reg_list(reg_list)
1701             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))
1702     }
1703     ///
1704     /// Gets the value of a system register
1705     ///
1706     #[cfg(target_arch = "aarch64")]
1707     fn get_sys_reg(&self, sys_reg: u32) -> cpu::Result<u64> {
1708         //
1709         // The Arm Architecture Reference Manual defines the encoding of
1710         // AArch64 system registers, see
1711         // https://developer.arm.com/documentation/ddi0487 (chapter D12).
1712         // KVM defines its own ID for each AArch64 system register, which is
1713         // passed to `KVM_GET_ONE_REG`/`KVM_SET_ONE_REG` to access a system
1714         // register of a guest.
1715         // A mapping exists between the Arm standard encoding and the KVM ID.
1716         // This function takes the standard u32 ID as input parameter, converts
1717         // it to the corresponding KVM ID, and calls the `KVM_GET_ONE_REG` API
1718         // to get the value of the system register.
1719         //
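        // As an illustration of the conversion below: the standard encoding
        // packs op0/op1/CRn/CRm/op2 into bits [20:5] of the u32 ID (mirroring
        // the MRS/MSR instruction encoding), so `sys_reg >> 5` aligns op2
        // with bit 0, exactly where KVM's OP0/OP1/CRN/CRM/OP2 masks expect
        // the fields to be.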
1720         let id: u64 = KVM_REG_ARM64
1721             | KVM_REG_SIZE_U64
1722             | KVM_REG_ARM64_SYSREG as u64
1723             | ((((sys_reg) >> 5)
1724                 & (KVM_REG_ARM64_SYSREG_OP0_MASK
1725                     | KVM_REG_ARM64_SYSREG_OP1_MASK
1726                     | KVM_REG_ARM64_SYSREG_CRN_MASK
1727                     | KVM_REG_ARM64_SYSREG_CRM_MASK
1728                     | KVM_REG_ARM64_SYSREG_OP2_MASK)) as u64);
1729         Ok(self
1730             .fd
1731             .get_one_reg(id)
1732             .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?
1733             .try_into()
1734             .unwrap())
1735     }
1736     ///
1737     /// Configure core registers for a given CPU.
1738     ///
1739     #[cfg(target_arch = "aarch64")]
1740     fn setup_regs(&self, cpu_id: u8, boot_ip: u64, fdt_start: u64) -> cpu::Result<()> {
1741         #[allow(non_upper_case_globals)]
1742         // PSR (Processor State Register) bits.
1743         // Taken from arch/arm64/include/uapi/asm/ptrace.h.
1744         const PSR_MODE_EL1h: u64 = 0x0000_0005;
1745         const PSR_F_BIT: u64 = 0x0000_0040;
1746         const PSR_I_BIT: u64 = 0x0000_0080;
1747         const PSR_A_BIT: u64 = 0x0000_0100;
1748         const PSR_D_BIT: u64 = 0x0000_0200;
1749         // Taken from arch/arm64/kvm/inject_fault.c.
1750         const PSTATE_FAULT_BITS_64: u64 =
1751             PSR_MODE_EL1h | PSR_A_BIT | PSR_F_BIT | PSR_I_BIT | PSR_D_BIT;
1752 
1753         let kreg_off = offset_of!(kvm_regs, regs);
1754 
1755         // Compute the offset of the PSTATE (Processor State) register.
1756         let pstate = offset_of!(user_pt_regs, pstate) + kreg_off;
1757         self.fd
1758             .set_one_reg(
1759                 arm64_core_reg_id!(KVM_REG_SIZE_U64, pstate),
1760                 PSTATE_FAULT_BITS_64.into(),
1761             )
1762             .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1763 
1764         // Other vCPUs are powered off initially awaiting PSCI wakeup.
1765         if cpu_id == 0 {
1766             // Set the PC (Program Counter) to the boot address (kernel entry point).
1767             let pc = offset_of!(user_pt_regs, pc) + kreg_off;
1768             self.fd
1769                 .set_one_reg(arm64_core_reg_id!(KVM_REG_SIZE_U64, pc), boot_ip.into())
1770                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1771 
1772             // Last mandatory thing to set -> the address pointing to the FDT (also called DTB).
1773             // "The device tree blob (dtb) must be placed on an 8-byte boundary and must
1774             // not exceed 2 megabytes in size." -> https://www.kernel.org/doc/Documentation/arm64/booting.txt.
1775             // We are choosing to place it at the end of DRAM. See `get_fdt_addr`.
1776             let regs0 = offset_of!(user_pt_regs, regs) + kreg_off;
1777             self.fd
1778                 .set_one_reg(
1779                     arm64_core_reg_id!(KVM_REG_SIZE_U64, regs0),
1780                     fdt_start.into(),
1781                 )
1782                 .map_err(|e| cpu::HypervisorCpuError::SetCoreRegister(e.into()))?;
1783         }
1784         Ok(())
1785     }
1786 
1787     #[cfg(target_arch = "x86_64")]
1788     ///
1789     /// Get the current CPU state
1790     ///
1791     /// Ordering requirements:
1792     ///
1793     /// KVM_GET_MP_STATE calls kvm_apic_accept_events(), which might modify
1794     /// vCPU/LAPIC state. As such, it must be done before almost everything
1795     /// else, otherwise we cannot restore everything and expect it to work.
1796     ///
1797     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1798     /// still running.
1799     ///
1800     /// KVM_GET_LAPIC may change state of LAPIC before returning it.
1801     ///
1802     /// GET_VCPU_EVENTS should probably be last to save, since it looks like
1803     /// it could be affected by internal state modifications of the other
1804     /// GET ioctls.
1805     ///
1806     /// SREGS saves/restores a pending interrupt, similar to what
1807     /// VCPU_EVENTS also does.
1808     ///
1809     /// GET_MSRS requires a pre-populated data structure to do something
1810     /// meaningful. For SET_MSRS it will then contain good data.
1811     ///
1812     /// # Example
1813     ///
1814     /// ```rust
1815     /// # use hypervisor::kvm::KvmHypervisor;
1816     /// # use std::sync::Arc;
1817     /// let kvm = KvmHypervisor::new().unwrap();
1818     /// let hv = Arc::new(kvm);
1819     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1820     /// vm.enable_split_irq().unwrap();
1821     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1822     /// let state = vcpu.state().unwrap();
1823     /// ```
1824     fn state(&self) -> cpu::Result<CpuState> {
1825         let cpuid = self.get_cpuid2(kvm_bindings::KVM_MAX_CPUID_ENTRIES)?;
1826         let mp_state = self.get_mp_state()?.into();
1827         let regs = self.get_regs()?;
1828         let sregs = self.get_sregs()?;
1829         let xsave = self.get_xsave()?;
1830         let xcrs = self.get_xcrs()?;
1831         let lapic_state = self.get_lapic()?;
1832         let fpu = self.get_fpu()?;
1833 
1834         // Try to get all MSRs based on the list previously retrieved from KVM.
1835         // If the number of MSRs obtained from GET_MSRS is different from the
1836         // expected amount, we fall back to a slower method, getting MSRs
1837         // in chunks. This is the only way to make sure we try to get as many
1838         // MSRs as possible, even if some MSRs are not supported.
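        // For example (indices illustrative): with 10 requested MSRs where
        // entry 3 is unreadable, GET_MSRS returns 3; we keep entries 0..3,
        // skip the faulty index 3, and retry from index 4, repeating until
        // the remaining tail is fully read.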
1839         let mut msr_entries = self.msrs.clone();
1840 
1841         // Save extra MSRs if the Hyper-V synthetic interrupt controller is
1842         // emulated.
1843         if self.hyperv_synic.load(Ordering::Acquire) {
1844             let hyperv_synic_msrs = vec![
1845                 0x40000020, 0x40000021, 0x40000080, 0x40000081, 0x40000082, 0x40000083, 0x40000084,
1846                 0x40000090, 0x40000091, 0x40000092, 0x40000093, 0x40000094, 0x40000095, 0x40000096,
1847                 0x40000097, 0x40000098, 0x40000099, 0x4000009a, 0x4000009b, 0x4000009c, 0x4000009d,
1848                 0x4000009e, 0x4000009f, 0x400000b0, 0x400000b1, 0x400000b2, 0x400000b3, 0x400000b4,
1849                 0x400000b5, 0x400000b6, 0x400000b7,
1850             ];
1851             for index in hyperv_synic_msrs {
1852                 let msr = kvm_msr_entry {
1853                     index,
1854                     ..Default::default()
1855                 };
1856                 msr_entries.push(msr.into());
1857             }
1858         }
1859 
1860         let expected_num_msrs = msr_entries.len();
1861         let num_msrs = self.get_msrs(&mut msr_entries)?;
1862         let msrs = if num_msrs != expected_num_msrs {
1863             let mut faulty_msr_index = num_msrs;
1864             let mut msr_entries_tmp = msr_entries[..faulty_msr_index].to_vec();
1865 
1866             loop {
1867                 warn!(
1868                     "Detected faulty MSR 0x{:x} while getting MSRs",
1869                     msr_entries[faulty_msr_index].index
1870                 );
1871 
1872                 // Skip the first bad MSR
1873                 let start_pos = faulty_msr_index + 1;
1874 
1875                 let mut sub_msr_entries = msr_entries[start_pos..].to_vec();
1876                 let num_msrs = self.get_msrs(&mut sub_msr_entries)?;
1877 
1878                 msr_entries_tmp.extend(&sub_msr_entries[..num_msrs]);
1879 
1880                 if num_msrs == sub_msr_entries.len() {
1881                     break;
1882                 }
1883 
1884                 faulty_msr_index = start_pos + num_msrs;
1885             }
1886 
1887             msr_entries_tmp
1888         } else {
1889             msr_entries
1890         };
1891 
1892         let vcpu_events = self.get_vcpu_events()?;
1893         let tsc_khz = self.tsc_khz()?;
1894 
1895         Ok(VcpuKvmState {
1896             cpuid,
1897             msrs,
1898             vcpu_events,
1899             regs: regs.into(),
1900             sregs: sregs.into(),
1901             fpu,
1902             lapic_state,
1903             xsave,
1904             xcrs,
1905             mp_state,
1906             tsc_khz,
1907         }
1908         .into())
1909     }
1910     ///
1911     /// Get the current AArch64 CPU state
1912     ///
1913     #[cfg(target_arch = "aarch64")]
1914     fn state(&self) -> cpu::Result<CpuState> {
1915         let mut state = VcpuKvmState {
1916             mp_state: self.get_mp_state()?.into(),
1917             ..Default::default()
1918         };
1919         // Get core registers
1920         state.core_regs = self.get_regs()?;
1921 
1922         // Get system registers.
1923         // Call KVM_GET_REG_LIST to get all registers available to the guest.
1924         // For Armv8 there are around 500 registers.
1925         let mut sys_regs: Vec<Register> = Vec::new();
1926         let mut reg_list = RegList::new(500).unwrap();
1927         self.fd
1928             .get_reg_list(&mut reg_list)
1929             .map_err(|e| cpu::HypervisorCpuError::GetRegList(e.into()))?;
1930 
1931         // At this point reg_list should contain: core registers and system
1932         // registers.
1933         // The register list contains the number of registers and their IDs. We
1934         // will need to call KVM_GET_ONE_REG on each ID in order to save all of
1935         // them. We carve out from the list the core registers, which are
1936         // represented in the kernel by the kvm_regs structure and for which we
1937         // can calculate the ID based on the offset in the structure.
1938         reg_list.retain(|regid| is_system_register(*regid));
1939 
1940         // Now, for the rest of the registers left in the previously fetched
1941         // register list, we are simply calling KVM_GET_ONE_REG.
1942         let indices = reg_list.as_slice();
1943         for index in indices.iter() {
1944             sys_regs.push(kvm_bindings::kvm_one_reg {
1945                 id: *index,
1946                 addr: self
1947                     .fd
1948                     .get_one_reg(*index)
1949                     .map_err(|e| cpu::HypervisorCpuError::GetSysRegister(e.into()))?
1950                     .try_into()
1951                     .unwrap(),
1952             });
1953         }
1954 
1955         state.sys_regs = sys_regs;
1956 
1957         Ok(state.into())
1958     }
1959     #[cfg(target_arch = "x86_64")]
1960     ///
1961     /// Restore the previously saved CPU state
1962     ///
1963     /// Ordering requirements:
1964     ///
1965     /// KVM_GET_VCPU_EVENTS/KVM_SET_VCPU_EVENTS is unsafe if other vCPUs are
1966     /// still running.
1967     ///
1968     /// Some SET ioctls (like set_mp_state) depend on kvm_vcpu_is_bsp(), so
1969     /// if we ever change the BSP, we have to do that before restoring anything.
1970     /// The same seems to be true for CPUID stuff.
1971     ///
1972     /// SREGS saves/restores a pending interrupt, similar to what
1973     /// VCPU_EVENTS also does.
1974     ///
1975     /// SET_REGS clears pending exceptions unconditionally, thus, it must be
1976     /// done before SET_VCPU_EVENTS, which restores it.
1977     ///
1978     /// SET_LAPIC must come after SET_SREGS, because the latter restores
1979     /// the apic base msr.
1980     ///
1981     /// SET_LAPIC must come before SET_MSRS, because the TSC deadline MSR
1982     /// only restores successfully when the LAPIC is correctly configured.
1983     ///
1984     /// Arguments: CpuState
1985     /// # Example
1986     ///
1987     /// ```rust
1988     /// # use hypervisor::kvm::KvmHypervisor;
1989     /// # use std::sync::Arc;
1990     /// let kvm = KvmHypervisor::new().unwrap();
1991     /// let hv = Arc::new(kvm);
1992     /// let vm = hv.create_vm().expect("new VM fd creation failed");
1993     /// vm.enable_split_irq().unwrap();
1994     /// let vcpu = vm.create_vcpu(0, None).unwrap();
1995     /// let state = vcpu.state().unwrap();
1996     /// vcpu.set_state(&state).unwrap();
1997     /// ```
1998     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
1999         let state: VcpuKvmState = state.clone().into();
2000         self.set_cpuid2(&state.cpuid)?;
2001         self.set_mp_state(state.mp_state.into())?;
2002         self.set_regs(&state.regs.into())?;
2003         self.set_sregs(&state.sregs.into())?;
2004         self.set_xsave(&state.xsave)?;
2005         self.set_xcrs(&state.xcrs)?;
2006         self.set_lapic(&state.lapic_state)?;
2007         self.set_fpu(&state.fpu)?;
2008 
2009         if let Some(freq) = state.tsc_khz {
2010             self.set_tsc_khz(freq)?;
2011         }
2012 
2013         // Try to set all MSRs previously stored.
2014         // If the number of MSRs set from SET_MSRS is different from the
2015         // expected amount, we fall back to a slower method, setting MSRs
2016         // in chunks. This is the only way to make sure we try to set as many
2017         // MSRs as possible, even if some MSRs are not supported.
2018         let expected_num_msrs = state.msrs.len();
2019         let num_msrs = self.set_msrs(&state.msrs)?;
2020         if num_msrs != expected_num_msrs {
2021             let mut faulty_msr_index = num_msrs;
2022 
2023             loop {
2024                 warn!(
2025                     "Detected faulty MSR 0x{:x} while setting MSRs",
2026                     state.msrs[faulty_msr_index].index
2027                 );
2028 
2029                 // Skip the first bad MSR
2030                 let start_pos = faulty_msr_index + 1;
2031 
2032                 let sub_msr_entries = state.msrs[start_pos..].to_vec();
2033 
2034                 let num_msrs = self.set_msrs(&sub_msr_entries)?;
2035 
2036                 if num_msrs == sub_msr_entries.len() {
2037                     break;
2038                 }
2039 
2040                 faulty_msr_index = start_pos + num_msrs;
2041             }
2042         }
2043 
2044         self.set_vcpu_events(&state.vcpu_events)?;
2045 
2046         Ok(())
2047     }
2048     ///
2049     /// Restore the previously saved AArch64 CPU state
2050     ///
2051     #[cfg(target_arch = "aarch64")]
2052     fn set_state(&self, state: &CpuState) -> cpu::Result<()> {
2053         let state: VcpuKvmState = state.clone().into();
2054         // Set core registers
2055         self.set_regs(&state.core_regs)?;
2056         // Set system registers
2057         for reg in &state.sys_regs {
2058             self.fd
2059                 .set_one_reg(reg.id, reg.addr.into())
2060                 .map_err(|e| cpu::HypervisorCpuError::SetSysRegister(e.into()))?;
2061         }
2062 
2063         self.set_mp_state(state.mp_state.into())?;
2064 
2065         Ok(())
2066     }
2067 
2068     ///
2069     /// Initialize TDX for this CPU
2070     ///
2071     #[cfg(feature = "tdx")]
2072     fn tdx_init(&self, hob_address: u64) -> cpu::Result<()> {
2073         tdx_command(&self.fd.as_raw_fd(), TdxCommand::InitVcpu, 0, hob_address)
2074             .map_err(cpu::HypervisorCpuError::InitializeTdx)
2075     }
2076 
2077     ///
2078     /// Set the "immediate_exit" state
2079     ///
2080     fn set_immediate_exit(&self, exit: bool) {
2081         self.fd.set_kvm_immediate_exit(exit.into());
2082     }
2083 
2084     ///
2085     /// Returns the details about TDX exit reason
2086     ///
2087     #[cfg(feature = "tdx")]
2088     fn get_tdx_exit_details(&mut self) -> cpu::Result<TdxExitDetails> {
2089         let kvm_run = self.fd.get_kvm_run();
2090         // SAFETY: accessing a union field in a valid structure
2091         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2092 
2093         tdx_vmcall.status_code = TDG_VP_VMCALL_INVALID_OPERAND;
2094 
2095         if tdx_vmcall.type_ != 0 {
2096             return Err(cpu::HypervisorCpuError::UnknownTdxVmCall);
2097         }
2098 
2099         match tdx_vmcall.subfunction {
2100             TDG_VP_VMCALL_GET_QUOTE => Ok(TdxExitDetails::GetQuote),
2101             TDG_VP_VMCALL_SETUP_EVENT_NOTIFY_INTERRUPT => {
2102                 Ok(TdxExitDetails::SetupEventNotifyInterrupt)
2103             }
2104             _ => Err(cpu::HypervisorCpuError::UnknownTdxVmCall),
2105         }
2106     }
2107 
2108     ///
2109     /// Set the status code for TDX exit
2110     ///
2111     #[cfg(feature = "tdx")]
2112     fn set_tdx_status(&mut self, status: TdxExitStatus) {
2113         let kvm_run = self.fd.get_kvm_run();
2114         // SAFETY: accessing a union field in a valid structure
2115         let tdx_vmcall = unsafe { &mut kvm_run.__bindgen_anon_1.tdx.u.vmcall };
2116 
2117         tdx_vmcall.status_code = match status {
2118             TdxExitStatus::Success => TDG_VP_VMCALL_SUCCESS,
2119             TdxExitStatus::InvalidOperand => TDG_VP_VMCALL_INVALID_OPERAND,
2120         };
2121     }
2122     #[cfg(target_arch = "x86_64")]
2123     ///
2124     /// Return the list of initial MSR entries for a vCPU
2125     ///
2126     fn boot_msr_entries(&self) -> Vec<MsrEntry> {
2127         use crate::arch::x86::{msr_index, MTRR_ENABLE, MTRR_MEM_TYPE_WB};
2128 
2129         [
2130             msr!(msr_index::MSR_IA32_SYSENTER_CS),
2131             msr!(msr_index::MSR_IA32_SYSENTER_ESP),
2132             msr!(msr_index::MSR_IA32_SYSENTER_EIP),
2133             msr!(msr_index::MSR_STAR),
2134             msr!(msr_index::MSR_CSTAR),
2135             msr!(msr_index::MSR_LSTAR),
2136             msr!(msr_index::MSR_KERNEL_GS_BASE),
2137             msr!(msr_index::MSR_SYSCALL_MASK),
2138             msr!(msr_index::MSR_IA32_TSC),
2139             msr_data!(
2140                 msr_index::MSR_IA32_MISC_ENABLE,
2141                 msr_index::MSR_IA32_MISC_ENABLE_FAST_STRING as u64
2142             ),
2143             msr_data!(msr_index::MSR_MTRRdefType, MTRR_ENABLE | MTRR_MEM_TYPE_WB),
2144         ]
2145         .to_vec()
2146     }
2147     #[cfg(target_arch = "aarch64")]
2148     fn has_pmu_support(&self) -> bool {
2149         let cpu_attr = kvm_bindings::kvm_device_attr {
2150             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2151             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2152             addr: 0x0,
2153             flags: 0,
2154         };
2155         self.fd.has_device_attr(&cpu_attr).is_ok()
2156     }
2157     #[cfg(target_arch = "aarch64")]
2158     fn init_pmu(&self, irq: u32) -> cpu::Result<()> {
2159         let cpu_attr = kvm_bindings::kvm_device_attr {
2160             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2161             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_INIT),
2162             addr: 0x0,
2163             flags: 0,
2164         };
2165         let cpu_attr_irq = kvm_bindings::kvm_device_attr {
2166             group: kvm_bindings::KVM_ARM_VCPU_PMU_V3_CTRL,
2167             attr: u64::from(kvm_bindings::KVM_ARM_VCPU_PMU_V3_IRQ),
2168             addr: &irq as *const u32 as u64,
2169             flags: 0,
2170         };
2171         self.fd
2172             .set_device_attr(&cpu_attr_irq)
2173             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)?;
2174         self.fd
2175             .set_device_attr(&cpu_attr)
2176             .map_err(|_| cpu::HypervisorCpuError::InitializePmu)
2177     }
2178 
2179     #[cfg(target_arch = "x86_64")]
2180     ///
2181     /// Get the frequency of the TSC if available
2182     ///
2183     fn tsc_khz(&self) -> cpu::Result<Option<u32>> {
2184         match self.fd.get_tsc_khz() {
2185             Err(e) => {
2186                 if e.errno() == libc::EIO {
2187                     Ok(None)
2188                 } else {
2189                     Err(cpu::HypervisorCpuError::GetTscKhz(e.into()))
2190                 }
2191             }
2192             Ok(v) => Ok(Some(v)),
2193         }
2194     }
2195 
2196     #[cfg(target_arch = "x86_64")]
2197     ///
2198     /// Set the frequency of the TSC if available
2199     ///
2200     fn set_tsc_khz(&self, freq: u32) -> cpu::Result<()> {
2201         match self.fd.set_tsc_khz(freq) {
2202             Err(e) => {
2203                 if e.errno() == libc::EIO {
2204                     Ok(())
2205                 } else {
2206                     Err(cpu::HypervisorCpuError::SetTscKhz(e.into()))
2207                 }
2208             }
2209             Ok(_) => Ok(()),
2210         }
2211     }
2212 }
2213 
2214 impl KvmVcpu {
2215     #[cfg(target_arch = "x86_64")]
2216     ///
2217     /// X86 specific call that returns the vcpu's current "xsave struct".
2218     ///
2219     fn get_xsave(&self) -> cpu::Result<Xsave> {
2220         self.fd
2221             .get_xsave()
2222             .map_err(|e| cpu::HypervisorCpuError::GetXsaveState(e.into()))
2223     }
2224     #[cfg(target_arch = "x86_64")]
2225     ///
2226     /// X86 specific call that sets the vcpu's current "xsave struct".
2227     ///
2228     fn set_xsave(&self, xsave: &Xsave) -> cpu::Result<()> {
2229         self.fd
2230             .set_xsave(xsave)
2231             .map_err(|e| cpu::HypervisorCpuError::SetXsaveState(e.into()))
2232     }
2233     #[cfg(target_arch = "x86_64")]
2234     ///
2235     /// X86 specific call that returns the vcpu's current "xcrs".
2236     ///
2237     fn get_xcrs(&self) -> cpu::Result<ExtendedControlRegisters> {
2238         self.fd
2239             .get_xcrs()
2240             .map_err(|e| cpu::HypervisorCpuError::GetXcsr(e.into()))
2241     }
2242     #[cfg(target_arch = "x86_64")]
2243     ///
2244     /// X86 specific call that sets the vcpu's current "xcrs".
2245     ///
2246     fn set_xcrs(&self, xcrs: &ExtendedControlRegisters) -> cpu::Result<()> {
2247         self.fd
2248             .set_xcrs(xcrs)
2249             .map_err(|e| cpu::HypervisorCpuError::SetXcsr(e.into()))
2250     }
2251     #[cfg(target_arch = "x86_64")]
2252     ///
2253     /// Returns currently pending exceptions, interrupts, and NMIs as well as related
2254     /// states of the vcpu.
2255     ///
2256     fn get_vcpu_events(&self) -> cpu::Result<VcpuEvents> {
2257         self.fd
2258             .get_vcpu_events()
2259             .map_err(|e| cpu::HypervisorCpuError::GetVcpuEvents(e.into()))
2260     }
2261     #[cfg(target_arch = "x86_64")]
2262     ///
2263     /// Sets pending exceptions, interrupts, and NMIs as well as related states
2264     /// of the vcpu.
2265     ///
2266     fn set_vcpu_events(&self, events: &VcpuEvents) -> cpu::Result<()> {
2267         self.fd
2268             .set_vcpu_events(events)
2269             .map_err(|e| cpu::HypervisorCpuError::SetVcpuEvents(e.into()))
2270     }
2271 }
2272